Only in b/arch/arm/boot/compressed: head-comcerto.S.
diff -ur a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile
--- a/arch/arm/boot/compressed/Makefile	2013-08-03 09:59:49.000000000 +0200
+++ b/arch/arm/boot/compressed/Makefile	2014-01-21 09:36:45.000000000 +0100
@@ -33,6 +33,11 @@
 #
 # Architecture dependencies
 #
+
+ifeq ($(CONFIG_ARCH_COMCERTO),y)
+OBJS		+= head-comcerto.o
+endif
+
 ifeq ($(CONFIG_ARCH_ACORN),y)
 OBJS		+= ll_char_wr.o font.o
 endif
diff -ur a/arch/arm/common/gic.c b/arch/arm/common/gic.c
--- a/arch/arm/common/gic.c	2013-08-24 11:36:17.000000000 +0200
+++ b/arch/arm/common/gic.c	2014-02-17 11:56:17.000000000 +0100
@@ -43,6 +43,10 @@
 #include <asm/mach/irq.h>
 #include <asm/hardware/gic.h>
 
+#if defined(CONFIG_SYNO_COMCERTO)
+#include <mach/sema.h>
+#endif
+
 static DEFINE_RAW_SPINLOCK(irq_controller_lock);
 
 /* Address of GIC 0 CPU interface */
@@ -90,30 +94,70 @@
 static void gic_mask_irq(struct irq_data *d)
 {
 	u32 mask = 1 << (gic_irq(d) % 32);
+#if defined(CONFIG_SYNO_COMCERTO)
+	unsigned long flags;
+#endif
+
+#if defined(CONFIG_SYNO_COMCERTO)
+	if ((gic_irq(d) == 87) || (gic_irq(d) == 66) || (gic_irq(d) == 33)) {
+		return;
+	}
+#endif
 
 	raw_spin_lock(&irq_controller_lock);
+#if defined(CONFIG_SYNO_COMCERTO)
+	flags = msp_lock_frqsave();
+#endif
 	writel_relaxed(mask, gic_dist_base(d) + GIC_DIST_ENABLE_CLEAR + (gic_irq(d) / 32) * 4);
 	if (gic_arch_extn.irq_mask)
 		gic_arch_extn.irq_mask(d);
+#if defined(CONFIG_SYNO_COMCERTO)
+	msp_unlock_frqrestore(flags);
+#endif
 	raw_spin_unlock(&irq_controller_lock);
 }
 
 static void gic_unmask_irq(struct irq_data *d)
 {
 	u32 mask = 1 << (gic_irq(d) % 32);
+#if defined(CONFIG_SYNO_COMCERTO)
+	unsigned long flags;
+#endif
+
+#if defined(CONFIG_SYNO_COMCERTO)
+	if ((gic_irq(d) == 87) || (gic_irq(d) == 66) || (gic_irq(d) == 33)) {
+		return;
+	}
+#endif
 
 	raw_spin_lock(&irq_controller_lock);
+#if defined(CONFIG_SYNO_COMCERTO)
+	flags = msp_lock_frqsave();
+#endif
 	if (gic_arch_extn.irq_unmask)
 		gic_arch_extn.irq_unmask(d);
 	writel_relaxed(mask, gic_dist_base(d) + GIC_DIST_ENABLE_SET + (gic_irq(d) / 32) * 4);
+#if defined(CONFIG_SYNO_COMCERTO)
+	msp_unlock_frqrestore(flags);
+#endif
 	raw_spin_unlock(&irq_controller_lock);
 }
 
 static void gic_eoi_irq(struct irq_data *d)
 {
 	if (gic_arch_extn.irq_eoi) {
+#if defined(CONFIG_SYNO_COMCERTO)
+		unsigned long flags;
+#endif
+
 		raw_spin_lock(&irq_controller_lock);
+#if defined(CONFIG_SYNO_COMCERTO)
+		flags = msp_lock_frqsave();
+#endif
 		gic_arch_extn.irq_eoi(d);
+#if defined(CONFIG_SYNO_COMCERTO)
+		msp_unlock_frqrestore(flags);
+#endif
 		raw_spin_unlock(&irq_controller_lock);
 	}
 
@@ -308,6 +352,14 @@
 	for (i = 32; i < gic_irqs; i += 32)
 		writel_relaxed(0xffffffff, base + GIC_DIST_ENABLE_CLEAR + i * 4 / 32);
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_MSP)
+	/*
+	 * Mark SPI interrupts as NonSecure
+	 */
+	for (i = 32; i < gic_irqs; i += 32)
+		writel_relaxed(0xffffffff, base + GIC_DIST_SECURITY_BIT + i * 4 / 32);
+#endif  /* CONFIG_SYNO_COMCERTO && CONFIG_COMCERTO_MSP */
+
 	/*
 	 * Setup the Linux IRQ subsystem.
 	 */
@@ -325,7 +377,14 @@
 		irq_set_chip_data(irq, gic);
 	}
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_MSP)
+	/*
+	 * Enable NonSecure interrupts in Distributor
+	 */
+	writel_relaxed(3, base + GIC_DIST_CTRL);
+#else  /* !CONFIG_SYNO_COMCERTO || !CONFIG_COMCERTO_MSP */
 	writel_relaxed(1, base + GIC_DIST_CTRL);
+#endif /* CONFIG_SYNO_COMCERTO && CONFIG_COMCERTO_MSP */
 }
 
 static void __cpuinit gic_cpu_init(struct gic_chip_data *gic)
@@ -348,9 +407,62 @@
 		writel_relaxed(0xa0a0a0a0, dist_base + GIC_DIST_PRI + i * 4 / 4);
 
 	writel_relaxed(0xf0, base + GIC_CPU_PRIMASK);
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_MSP)
+	/*
+	 * Mark PPI and SGI interrupts as NonSecure
+	 */
+	writel_relaxed(0xffffffff, dist_base + GIC_DIST_SECURITY_BIT);
+
+	/*
+	 * Enable NonSecure interrupts in CPU interface,
+	 * Secure interrupts go to FIQ line,
+	 * Secure read returns valid NonSecure interrupt ID
+	 */
+	writel_relaxed(0xf, base + GIC_CPU_CTRL);
+#else  /* !CONFIG_SYNO_COMCERTO || !CONFIG_COMCERTO_MSP */
 	writel_relaxed(1, base + GIC_CPU_CTRL);
+#endif /* CONFIG_SYNO_COMCERTO && CONFIG_COMCERTO_MSP */
 }
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_MSP)
+
+static void __cpuinit gic_cpu_init_irq_only(struct gic_chip_data *gic)
+{
+	void __iomem *dist_base = gic->dist_base;
+	void __iomem *base = gic->cpu_base;
+	int i;
+
+	/*
+	 * Deal with the banked PPI and SGI interrupts - disable all
+	 * PPI interrupts, ensure all SGI interrupts are enabled.
+	 */
+	writel_relaxed(0xffff0000, dist_base + GIC_DIST_ENABLE_CLEAR);
+	writel_relaxed(0x0000ffff, dist_base + GIC_DIST_ENABLE_SET);
+
+	/*
+	 * Set priority on PPI and SGI interrupts
+	 */
+	for (i = 0; i < 32; i += 4)
+		writel_relaxed(0xa0a0a0a0, dist_base + GIC_DIST_PRI + i * 4 / 4);
+
+	writel_relaxed(0xf0, base + GIC_CPU_PRIMASK);
+
+	/*
+	 * Mark PPI and SGI interrupts as NonSecure
+	 */
+	writel_relaxed(0xffffffff, dist_base + GIC_DIST_SECURITY_BIT);
+
+	/*
+	 * Enable NonSecure interrupts in CPU interface,
+	 * Secure interrupts go to IRQ line,
+	 * Secure read returns valid NonSecure interrupt ID
+	 */
+	writel_relaxed(0x7, base + GIC_CPU_CTRL);
+}
+
+#endif /* CONFIG_SYNO_COMCERTO && CONFIG_COMCERTO_MSP */
+
 #ifdef CONFIG_CPU_PM
 /*
  * Saves the GIC distributor registers during suspend or idle.  Must be called
@@ -625,7 +737,12 @@
 {
 	BUG_ON(gic_nr >= MAX_GIC_NR);
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_MSP)
+	/* run alternative secondary_boot gic init */
+	gic_cpu_init_irq_only(&gic_data[gic_nr]);
+#else  /* !CONFIG_SYNO_COMCERTO || !CONFIG_COMCERTO_MSP */
 	gic_cpu_init(&gic_data[gic_nr]);
+#endif  /* CONFIG_SYNO_COMCERTO && CONFIG_COMCERTO_MSP */
 }
 
 #ifdef CONFIG_SMP
@@ -645,7 +762,16 @@
 	dsb();
 
 	/* this always happens on GIC0 */
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_MSP)
+#define GIC_SGI_SATT (1 << 15)
+	/*
+	 * Send SGI from Secure write to NonSecure target
+	 */
+	writel_relaxed(map << 16 | GIC_SGI_SATT | irq, gic_data[0].dist_base + GIC_DIST_SOFTINT);
+#else  /* !CONFIG_SYNO_COMCERTO || !CONFIG_COMCERTO_MSP */
 	writel_relaxed(map << 16 | irq, gic_data[0].dist_base + GIC_DIST_SOFTINT);
+#endif /* CONFIG_SYNO_COMCERTO && CONFIG_COMCERTO_MSP */
 }
 #endif
 
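
For reference, a minimal C sketch of the control bits behind the values written in the hunks above, assuming a GICv1/v2 with the Security Extensions; the bit names follow the ARM GIC architecture specification and are not taken from this patch (GIC_DIST_SECURITY_BIT at offset 0x080 corresponds to the interrupt group registers, GICD_IGROUPRn):

/*
 * Sketch only: decodes the values used in the Comcerto MSP path.
 */
#define GICD_CTLR_ENABLE_GRP0	(1 << 0)	/* Secure (Group 0) interrupts */
#define GICD_CTLR_ENABLE_GRP1	(1 << 1)	/* NonSecure (Group 1) interrupts */

#define GICC_CTLR_ENABLE_GRP0	(1 << 0)
#define GICC_CTLR_ENABLE_GRP1	(1 << 1)
#define GICC_CTLR_ACKCTL	(1 << 2)	/* Secure reads may ack Group 1 IDs */
#define GICC_CTLR_FIQEN		(1 << 3)	/* Group 0 signalled as FIQ */

#define GICD_SGIR_NSATT		(1 << 15)	/* forward SGI to Group 1 targets (GIC_SGI_SATT above) */

/* So the writes above decompose as:
 *   writel_relaxed(3, base + GIC_DIST_CTRL)  -> ENABLE_GRP0 | ENABLE_GRP1
 *   writel_relaxed(0xf, base + GIC_CPU_CTRL) -> ENABLE_GRP0 | ENABLE_GRP1 | ACKCTL | FIQEN
 *   writel_relaxed(0x7, base + GIC_CPU_CTRL) -> same, minus FIQEN (Group 0 stays on IRQ)
 */
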
Only in b/arch/arm/configs: c2kasic_defconfig.
Only in b/arch/arm/configs: c2kevm_defconfig.
Only in b/arch/arm/configs: c2klv_defconfig.
Only in b/arch/arm/configs: c2krtsm_defconfig.
diff -ur a/arch/arm/configs/sam9_l9260_defconfig b/arch/arm/configs/sam9_l9260_defconfig
--- a/arch/arm/configs/sam9_l9260_defconfig	2013-08-03 09:59:49.000000000 +0200
+++ b/arch/arm/configs/sam9_l9260_defconfig	2014-01-21 09:36:45.000000000 +0100
@@ -39,7 +39,7 @@
 CONFIG_MTD_NAND_ATMEL=y
 CONFIG_MTD_NAND_PLATFORM=y
 CONFIG_MTD_UBI=y
-CONFIG_MTD_UBI_BEB_RESERVE=3
+CONFIG_MTD_UBI_BEB_LIMIT=25
 CONFIG_MTD_UBI_GLUEBI=y
 CONFIG_BLK_DEV_LOOP=y
 CONFIG_BLK_DEV_RAM=y
diff -ur a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h
--- a/arch/arm/include/asm/elf.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/elf.h	2014-02-17 11:56:14.000000000 +0100
@@ -109,7 +109,7 @@
 #define ELF_CORE_COPY_TASK_REGS dump_task_regs
 
 #define CORE_DUMP_USE_REGSET
-#if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_MV_SUPPORT_64KB_PAGE_SIZE)
+#if defined(CONFIG_SYNO_COMCERTO) || (defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_MV_SUPPORT_64KB_PAGE_SIZE))
 #define ELF_EXEC_PAGESIZE       PAGE_SIZE
 #else
 #define ELF_EXEC_PAGESIZE	4096
@@ -134,8 +134,4 @@
 extern unsigned long arch_randomize_brk(struct mm_struct *mm);
 #define arch_randomize_brk arch_randomize_brk
 
-extern int vectors_user_mapping(void);
-#define arch_setup_additional_pages(bprm, uses_interp) vectors_user_mapping()
-#define ARCH_HAS_SETUP_ADDITIONAL_PAGES
-
 #endif
diff -ur a/arch/arm/include/asm/hardware/entry-macro-gic.S b/arch/arm/include/asm/hardware/entry-macro-gic.S
--- a/arch/arm/include/asm/hardware/entry-macro-gic.S	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/hardware/entry-macro-gic.S	2014-02-17 11:56:14.000000000 +0100
@@ -34,6 +34,22 @@
 
 	.macro  get_irqnr_and_base, irqnr, irqstat, base, tmp
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_MSP)
+	ldr     \irqstat, [\base, #GIC_CPU_HIGHPRI]
+	bic     \irqnr, \irqstat, #0x1c00
+	cmp     \irqnr, #33
+	cmpne   \irqnr, #66
+	cmpne   \irqnr, #87
+	cmpeq   \irqnr, \irqnr
+	bne 1001f
+
+	mov \irqnr, \irqnr /* breakpoint here */
+
+	beq 1002f
+
+1001:
+#endif /* CONFIG_SYNO_COMCERTO && CONFIG_COMCERTO_MSP */
+
 	ldr     \irqstat, [\base, #GIC_CPU_INTACK]
 	/* bits 12-10 = src CPU, 9-0 = int # */
 
@@ -43,6 +59,11 @@
 	cmpcc	\irqnr, \irqnr
 	cmpne	\irqnr, \tmp
 	cmpcs	\irqnr, \irqnr
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_MSP)
+1002:
+#endif /* CONFIG_SYNO_COMCERTO && CONFIG_COMCERTO_MSP */
+
 	.endm
 
 /* We assume that irqstat (the raw value of the IRQ acknowledge
diff -ur a/arch/arm/include/asm/hardware/gic.h b/arch/arm/include/asm/hardware/gic.h
--- a/arch/arm/include/asm/hardware/gic.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/hardware/gic.h	2014-02-17 11:56:14.000000000 +0100
@@ -22,6 +22,9 @@
 
 #define GIC_DIST_CTRL			0x000
 #define GIC_DIST_CTR			0x004
+#if defined(CONFIG_SYNO_COMCERTO)
+#define GIC_DIST_SECURITY_BIT		0x080
+#endif
 #define GIC_DIST_ENABLE_SET		0x100
 #define GIC_DIST_ENABLE_CLEAR		0x180
 #define GIC_DIST_PENDING_SET		0x200
diff -ur a/arch/arm/include/asm/kexec.h b/arch/arm/include/asm/kexec.h
--- a/arch/arm/include/asm/kexec.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/kexec.h	2014-02-17 11:56:14.000000000 +0100
@@ -10,7 +10,11 @@
 /* Maximum address we can use for the control code buffer */
 #define KEXEC_CONTROL_MEMORY_LIMIT (-1UL)
 
+#if defined(CONFIG_SYNO_COMCERTO)
+#define KEXEC_CONTROL_PAGE_SIZE	(PAGE_SIZE)
+#else
 #define KEXEC_CONTROL_PAGE_SIZE	4096
+#endif
 
 #define KEXEC_ARCH KEXEC_ARCH_ARM
 
diff -ur a/arch/arm/include/asm/mach/map.h b/arch/arm/include/asm/mach/map.h
--- a/arch/arm/include/asm/mach/map.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/mach/map.h	2014-02-17 11:56:14.000000000 +0100
@@ -30,6 +30,10 @@
 #define MT_MEMORY_DTCM		12
 #define MT_MEMORY_ITCM		13
 #define MT_MEMORY_SO		14
+#if defined(CONFIG_SYNO_COMCERTO)
+#define MT_MSP				15
+#define MT_MSP_NCNB			16
+#endif
 
 #ifdef CONFIG_MMU
 extern void iotable_init(struct map_desc *, int);
diff -ur a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
--- a/arch/arm/include/asm/memory.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/memory.h	2014-02-17 11:56:14.000000000 +0100
@@ -35,30 +35,44 @@
  * TASK_SIZE - the maximum size of a user space task.
  * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area
  */
+#if defined(CONFIG_SYNO_COMCERTO)
+#include <asm/pgtable-2level.h>
+#endif
+#if defined(CONFIG_MV_SUPPORT_64KB_PAGE_SIZE) && defined(CONFIG_SYNO_ARMADA_ARCH)
+#define PAGE_OFFSET		UL(CONFIG_PAGE_OFFSET)
+#define TASK_SIZE		(UL(CONFIG_PAGE_OFFSET) - UL(0x01C00000))
+#define TASK_UNMAPPED_BASE	(UL(CONFIG_PAGE_OFFSET) / 3)
+#elif defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_64K_PAGES)
+#define PAGE_OFFSET		UL(CONFIG_PAGE_OFFSET)
+#define TASK_SIZE		((UL(CONFIG_PAGE_OFFSET) - UL(0x01000000)) & ~(UL((1 << PMD_SHIFT)-1)))  // Must be aligned on PMD size (kernel/user space can share same PMD)
+#define TASK_UNMAPPED_BASE	(UL(CONFIG_PAGE_OFFSET) / 3)
+#else
 #define PAGE_OFFSET		UL(CONFIG_PAGE_OFFSET)
 #define TASK_SIZE		(UL(CONFIG_PAGE_OFFSET) - UL(0x01000000))
 #define TASK_UNMAPPED_BASE	(UL(CONFIG_PAGE_OFFSET) / 3)
-
+#endif
 /*
  * The maximum size of a 26-bit user space task.
  */
 #define TASK_SIZE_26		UL(0x04000000)
 
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_ZONE_DMA_NCNB)
 /*
  * The module space lives between the addresses given by TASK_SIZE
  * and PAGE_OFFSET - it must be within 32MB of the kernel text.
  */
 #ifndef CONFIG_THUMB2_KERNEL
+/* 28MB is a heuristic setting. */
+#if defined(CONFIG_MV_SUPPORT_64KB_PAGE_SIZE) && defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_ARCH_ARMADA370)
+#define MODULES_VADDR		(PAGE_OFFSET - 28*1024*1024)
+#else
 #define MODULES_VADDR		(PAGE_OFFSET - 16*1024*1024)
+#endif
 #else
 /* smaller range for Thumb-2 symbols relocation (2^24)*/
 #define MODULES_VADDR		(PAGE_OFFSET - 8*1024*1024)
 #endif
 
-#if TASK_SIZE > MODULES_VADDR
-#error Top of user space clashes with start of module space
-#endif
-
 /*
  * The highmem pkmap virtual space shares the end of the module area.
  */
@@ -68,6 +82,22 @@
 #define MODULES_END		(PAGE_OFFSET)
 #endif
 
+#else
+/* Move module space into the hole reserved for MSP/PFE so we can have a bigger DMA zone */
+#define MODULES_END		((COMCERTO_DDR_SHARED_BASE + COMCERTO_DDR_SHARED_SIZE - PLAT_PHYS_OFFSET + PAGE_OFFSET) & PMD_MASK) // convert SHARED_END to virt and align on lower PMD boundary
+
+#ifndef CONFIG_THUMB2_KERNEL
+#define MODULES_VADDR	(MODULES_END - 16*1024*1024)
+#else
+#define MODULES_VADDR	(MODULES_END - 10*1024*1024) // Relocations will be guaranteed to work as long as kernel size is less than 6MB
+#endif
+
+#endif
+
+#if TASK_SIZE > MODULES_VADDR
+#error Top of user space clashes with start of module space
+#endif
+
 /*
  * The XIP kernel gets mapped at the bottom of the module vm area.
  * Since we use sections to map it, this macro replaces the physical address
diff -ur a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h
--- a/arch/arm/include/asm/mmu_context.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/mmu_context.h	2014-02-17 11:56:14.000000000 +0100
@@ -18,6 +18,7 @@
 #include <asm/cacheflush.h>
 #include <asm/cachetype.h>
 #include <asm/proc-fns.h>
+#include <asm-generic/mm_hooks.h>
 
 void __check_kvm_seq(struct mm_struct *mm);
 
@@ -133,32 +134,4 @@
 #define deactivate_mm(tsk,mm)	do { } while (0)
 #define activate_mm(prev,next)	switch_mm(prev, next, NULL)
 
-/*
- * We are inserting a "fake" vma for the user-accessible vector page so
- * gdb and friends can get to it through ptrace and /proc/<pid>/mem.
- * But we also want to remove it before the generic code gets to see it
- * during process exit or the unmapping of it would  cause total havoc.
- * (the macro is used as remove_vma() is static to mm/mmap.c)
- */
-#define arch_exit_mmap(mm) \
-do { \
-	struct vm_area_struct *high_vma = find_vma(mm, 0xffff0000); \
-	if (high_vma) { \
-		BUG_ON(high_vma->vm_next);  /* it should be last */ \
-		if (high_vma->vm_prev) \
-			high_vma->vm_prev->vm_next = NULL; \
-		else \
-			mm->mmap = NULL; \
-		rb_erase(&high_vma->vm_rb, &mm->mm_rb); \
-		mm->mmap_cache = NULL; \
-		mm->map_count--; \
-		remove_vma(high_vma); \
-	} \
-} while (0)
-
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-				 struct mm_struct *mm)
-{
-}
-
 #endif
diff -ur a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
--- a/arch/arm/include/asm/page.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/page.h	2014-02-17 11:56:14.000000000 +0100
@@ -11,7 +11,7 @@
 #define _ASMARM_PAGE_H
 
 /* PAGE_SHIFT determines the page size */
-#if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_MV_SUPPORT_64KB_PAGE_SIZE)
+#if (defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_64K_PAGES)) || (defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_MV_SUPPORT_64KB_PAGE_SIZE))
 #define PAGE_SHIFT		16
 #else
 #define PAGE_SHIFT		12
@@ -155,6 +155,8 @@
 #define clear_page(page)	memset((void *)(page), 0, PAGE_SIZE)
 extern void copy_page(void *to, const void *from);
 
+#define __HAVE_ARCH_GATE_AREA 1
+
 #if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_ARM_LPAE)
 #include <asm/pgtable-3level-types.h>
 #else
diff -ur a/arch/arm/include/asm/param.h b/arch/arm/include/asm/param.h
--- a/arch/arm/include/asm/param.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/param.h	2014-02-17 11:56:14.000000000 +0100
@@ -18,7 +18,11 @@
 # define HZ		100
 #endif
 
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 #define EXEC_PAGESIZE	4096
+#else
+#define EXEC_PAGESIZE	65536
+#endif
 
 #ifndef NOGROUP
 #define NOGROUP         (-1)
diff -ur a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h
--- a/arch/arm/include/asm/pgalloc.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/pgalloc.h	2014-02-17 11:56:14.000000000 +0100
@@ -134,11 +134,20 @@
 				  pmdval_t prot)
 {
 	pmdval_t pmdval = (pte + PTE_HWTABLE_OFF) | prot;
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 	pmdp[0] = __pmd(pmdval);
 #if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_ARM_LPAE)
 #else
 	pmdp[1] = __pmd(pmdval + 256 * sizeof(pte_t));
 #endif
+#else
+	int i, off = 0;
+	for (i = 0; i < LINKED_PMDS; i++) {
+		pmdp[i] = __pmd(pmdval + off);
+		off += 1024; // Each PMD points to a 1kB 2nd-level table
+	}
+
+#endif
 	flush_pmd_entry(pmdp);
 }
 
diff -ur a/arch/arm/include/asm/pgtable-2level.h b/arch/arm/include/asm/pgtable-2level.h
--- a/arch/arm/include/asm/pgtable-2level.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/pgtable-2level.h	2014-02-17 11:56:14.000000000 +0100
@@ -28,6 +28,17 @@
  * which contain the state information Linux needs.  We, therefore, end up
  * with 512 entries in the "PTE" level.
  *
+#if defined(CONFIG_SYNO_COMCERTO)
+ * 64k pages support (Mindspeed COMCERTO):
+ * We cheat even more and tell Linux that we have 256 entries in the first
+ * level, each of which is 64 bytes (16 hardware pointers). The 2nd level
+ * contains 16 hardware PTE tables, or 4096 hardware entries. However,
+ * since 64kB pages are done by duplicating 4kB entries, there will only be
+ * 256 entries in the Linux "PTE" level (and the PTE entry will be larger).
+ * All defines are now also derived from the LINKED_PMDS_SHIFT macro, which
+ * determines how many PMDs point into a single 2nd-level table.
+ *
+#endif
  * This leads to the page tables having the following layout:
  *
  *    pgd             pte
@@ -70,6 +81,7 @@
  */
 
 
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 #ifdef CONFIG_SYNO_ARMADA_ARCH
 #ifdef CONFIG_MV_SUPPORT_64KB_PAGE_SIZE
 #define PTRS_PER_PTE           32      /* (512 / (64K / 4K)) */
@@ -100,6 +112,21 @@
  */
 #define PMD_SHIFT		21
 #define PGDIR_SHIFT		21
+#else
+#define LINKED_PMDS_SHIFT	4
+#define LINKED_PMDS			(1 << LINKED_PMDS_SHIFT)	/* number of PMDs pointing to the same 2nd-level page */
+#define PTRS_PER_PGD		(4096 / LINKED_PMDS)		/* one pgdir table contains 4096 entries */
+#define PGDIR_SHIFT			(20 + LINKED_PMDS_SHIFT)	/* one pgdir entry can map 1MB (2^20) */
+#define PMD_SHIFT			(PGDIR_SHIFT)
+#define PTET_SIZE_SHIFT		6							/* a HW PTE entry is 16*4bytes */
+#define PTE_HWTABLE_PTRS	(1 << (10 + LINKED_PMDS_SHIFT - PTET_SIZE_SHIFT)) /* one HW PTE table is 1kB (2^10) */
+
+#define PTRS_PER_PTE		(PTE_HWTABLE_PTRS)
+#define PTRS_PER_PMD		1
+
+#define PTE_HWTABLE_OFF		32768 //(PTRS_PER_PTE * sizeof(pte_t))
+#define PTE_HWTABLE_SIZE	(1 << (10 + LINKED_PMDS_SHIFT))
+#endif
 
 #define PMD_SIZE		(1UL << PMD_SHIFT)
 #define PMD_MASK		(~(PMD_SIZE-1))
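
Spelled out, the 64K-page defines above give 16 first-level words per pgd entry (LINKED_PMDS), 256 pgd entries each covering 16 MB (PGDIR_SHIFT = 24), and 256 Linux PTEs per second-level block. A minimal standalone C sketch of the resulting address split; the index helpers are illustrative, not the kernel's:

#include <stdio.h>

#define LINKED_PMDS_SHIFT	4
#define LINKED_PMDS		(1 << LINKED_PMDS_SHIFT)		/* 16 first-level words per pgd_t */
#define PTRS_PER_PGD		(4096 / LINKED_PMDS)			/* 256 */
#define PGDIR_SHIFT		(20 + LINKED_PMDS_SHIFT)		/* each pgd entry maps 16 MB */
#define PAGE_SHIFT		16					/* 64 kB pages */
#define PTRS_PER_PTE		(1 << (PGDIR_SHIFT - PAGE_SHIFT))	/* 256, matches PTE_HWTABLE_PTRS */

static unsigned int pgd_index(unsigned long addr) { return (unsigned int)(addr >> PGDIR_SHIFT); }
static unsigned int pte_index(unsigned long addr)
{
	return (unsigned int)((addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1));
}

int main(void)
{
	unsigned long addr = 0xc0123456UL;

	printf("pgd index %u of %u, pte index %u of %u\n",
	       pgd_index(addr), (unsigned int)PTRS_PER_PGD,
	       pte_index(addr), (unsigned int)PTRS_PER_PTE);
	return 0;
}
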
diff -ur a/arch/arm/include/asm/pgtable-2level-hwdef.h b/arch/arm/include/asm/pgtable-2level-hwdef.h
--- a/arch/arm/include/asm/pgtable-2level-hwdef.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/pgtable-2level-hwdef.h	2014-02-17 11:56:14.000000000 +0100
@@ -65,7 +65,7 @@
 /*
  *   - extended small page/tiny page
  */
-#if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_MV_SUPPORT_64KB_PAGE_SIZE)
+#if (defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_64K_PAGES)) || (defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_MV_SUPPORT_64KB_PAGE_SIZE))
 #define PTE_EXT_XN		(_AT(pteval_t, 1) << 15)        /* v6 */
 #else
 #define PTE_EXT_XN		(_AT(pteval_t, 1) << 0)		/* v6 */
@@ -77,7 +77,7 @@
 #define PTE_EXT_AP_UNO_SRW	(PTE_EXT_AP0)
 #define PTE_EXT_AP_URO_SRW	(PTE_EXT_AP1)
 #define PTE_EXT_AP_URW_SRW	(PTE_EXT_AP1|PTE_EXT_AP0)
-#if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_MV_SUPPORT_64KB_PAGE_SIZE)
+#if (defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_64K_PAGES)) || (defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_MV_SUPPORT_64KB_PAGE_SIZE))
 #define PTE_EXT_TEX(x)         (_AT(pteval_t, (x)) << 12)     /* Large Page */
 #else
 #define PTE_EXT_TEX(x)		(_AT(pteval_t, (x)) << 6)	/* v5 */
diff -ur a/arch/arm/include/asm/pgtable-2level-types.h b/arch/arm/include/asm/pgtable-2level-types.h
--- a/arch/arm/include/asm/pgtable-2level-types.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/pgtable-2level-types.h	2014-02-17 11:56:14.000000000 +0100
@@ -24,12 +24,17 @@
 typedef u32 pteval_t;
 typedef u32 pmdval_t;
 
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 #undef STRICT_MM_TYPECHECKS
+#else
+#define STRICT_MM_TYPECHECKS	1
+#endif
 
 #ifdef STRICT_MM_TYPECHECKS
 /*
  * These are used to make use of C type-checking..
  */
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 typedef struct { pteval_t pte; } pte_t;
 typedef struct { pmdval_t pmd; } pmd_t;
 typedef struct { pmdval_t pgd[2]; } pgd_t;
@@ -45,6 +50,23 @@
 #define __pgprot(x)     ((pgprot_t) { (x) } )
 
 #else
+#include <asm/pgtable-2level.h>
+typedef struct { pteval_t pte[16]; } pte_t;
+typedef struct { pmdval_t pmd; } pmd_t;
+typedef struct { pmdval_t pgd[LINKED_PMDS]; } pgd_t;
+typedef struct { pteval_t pgprot; } pgprot_t;
+
+#define pte_val(x)      ((x).pte[0])
+#define pmd_val(x)      ((x).pmd)
+#define pgd_val(x)	((x).pgd[0])
+#define pgprot_val(x)   ((x).pgprot)
+
+#define __pte(x)        ((pte_t) { {(x)} } )
+#define __pmd(x)        ((pmd_t) { (x) } )
+#define __pgprot(x)     ((pgprot_t) { (x) } )
+#endif
+
+#else
 /*
  * .. while these make it easier on the compiler
  */
diff -ur a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
--- a/arch/arm/include/asm/pgtable.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/pgtable.h	2014-02-17 11:56:14.000000000 +0100
@@ -270,6 +270,7 @@
 
 #define pmd_bad(pmd)		(pmd_val(pmd) & 2)
 
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 #define copy_pmd(pmdpd,pmdps)		\
 	do {				\
 		pmdpd[0] = pmdps[0];	\
@@ -284,11 +285,36 @@
 		clean_pmd_entry(pmdp);	\
 	} while (0)
 
-#endif	/* CONFIG_ARM_LPAE */
+#else
+#define copy_pmd(pmdpd,pmdps)	\
+	do {	\
+		int i;	\
+		for(i = 0; i < LINKED_PMDS; i++)	\
+			pmdpd[i] = pmdps[i];	\
+		flush_pmd_entry(pmdpd);	\
+	} while (0)
+
+#define pmd_clear(pmdp)	\
+	do {	\
+		int i;	\
+		for(i = 0; i < LINKED_PMDS; i++)	\
+			pmdp[i] = __pmd(0);	\
+		clean_pmd_entry(pmdp);	\
+	} while (0)
 
+#endif
+#endif	/* CONFIG_ARM_LPAE */
+ 
+#if defined(CONFIG_SYNO_COMCERTO)
+#define PMD_PAGE_ADDR_MASK		(~((1 << 10) - 1))
+#endif
 static inline pte_t *pmd_page_vaddr(pmd_t pmd)
 {
+#if defined(CONFIG_SYNO_COMCERTO)
+	return __va((pmd_val(pmd) & PHYS_MASK & (s32)PMD_PAGE_ADDR_MASK) - PTE_HWTABLE_OFF);
+#else
 	return __va(pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK);
+#endif
 }
 
 #define pmd_page(pmd)		pfn_to_page(__phys_to_pfn(pmd_val(pmd) & PHYS_MASK))
@@ -322,7 +348,11 @@
 #define mk_pte(page,prot)	pfn_pte(page_to_pfn(page), prot)
 
 
+#if defined(CONFIG_SYNO_COMCERTO)
+#define pte_clear(mm,addr,ptep)	do {__sync_outer_cache(ptep, __pte(0)); set_pte_ext(ptep, __pte(0), 0); } while (0)
+#else
 #define pte_clear(mm,addr,ptep)	set_pte_ext(ptep, __pte(0), 0)
+#endif
 
 #define pte_none(pte)		(!pte_val(pte))
 #define pte_present(pte)	(pte_val(pte) & L_PTE_PRESENT)
@@ -338,9 +368,21 @@
 
 #if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_ARM_LPAE)
 #define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,__pte(pte_val(pte)|(ext)))
+#elif defined(CONFIG_SYNO_COMCERTO)
+#define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte_val(pte),ext)
 #else
 #define set_pte_ext(ptep,pte,ext) cpu_set_pte_ext(ptep,pte,ext)
 #endif
+
+#if defined(CONFIG_SYNO_COMCERTO)
+#if !defined(CONFIG_L2X0_INSTRUCTION_ONLY)
+static inline void __sync_outer_cache(pte_t *ptep, pte_t pteval)
+{
+}
+#else
+extern void __sync_outer_cache(pte_t *ptep, pte_t pteval);
+#endif
+#endif
 #if __LINUX_ARM_ARCH__ < 6
 static inline void __sync_icache_dcache(pte_t pteval)
 {
@@ -354,6 +396,10 @@
 {
 	unsigned long ext = 0;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	__sync_outer_cache(ptep, pteval);
+#endif
+
 	if (addr < TASK_SIZE && pte_present_user(pteval)) {
 		__sync_icache_dcache(pteval);
 		ext |= PTE_EXT_NG;
@@ -402,7 +448,11 @@
 #define __swp_entry(type,offset) ((swp_entry_t) { ((type) << __SWP_TYPE_SHIFT) | ((offset) << __SWP_OFFSET_SHIFT) })
 
 #define __pte_to_swp_entry(pte)	((swp_entry_t) { pte_val(pte) })
+#if defined(CONFIG_SYNO_COMCERTO)
+#define __swp_entry_to_pte(swp)	((pte_t) { { (swp).val } })
+#else
 #define __swp_entry_to_pte(swp)	((pte_t) { (swp).val })
+#endif
 
 /*
  * It is an error for the kernel to have more swap files than we can
diff -ur a/arch/arm/include/asm/proc-fns.h b/arch/arm/include/asm/proc-fns.h
--- a/arch/arm/include/asm/proc-fns.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/proc-fns.h	2014-02-17 11:56:14.000000000 +0100
@@ -67,6 +67,8 @@
 	 */
 #if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_ARM_LPAE)
 	void (*set_pte_ext)(pte_t *ptep, pte_t pte);
+#elif defined(CONFIG_SYNO_COMCERTO)
+	void (*set_pte_ext)(pte_t *ptep, pteval_t pte, unsigned int ext);
 #else
 	void (*set_pte_ext)(pte_t *ptep, pte_t pte, unsigned int ext);
 #endif
@@ -85,6 +87,8 @@
 extern void cpu_do_switch_mm(unsigned long pgd_phys, struct mm_struct *mm);
 #if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_ARM_LPAE)
 extern void cpu_set_pte_ext(pte_t *ptep, pte_t pte);
+#elif defined(CONFIG_SYNO_COMCERTO)
+extern void cpu_set_pte_ext(pte_t *ptep, pteval_t pte, unsigned int ext);
 #else
 extern void cpu_set_pte_ext(pte_t *ptep, pte_t pte, unsigned int ext);
 #endif
diff -ur a/arch/arm/include/asm/shmparam.h b/arch/arm/include/asm/shmparam.h
--- a/arch/arm/include/asm/shmparam.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/shmparam.h	2014-02-17 11:56:14.000000000 +0100
@@ -6,11 +6,15 @@
  * or page size, whichever is greater since the cache aliases
  * every size/ways bytes.
  */
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 #if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_MV_SUPPORT_64KB_PAGE_SIZE)
 #define	SHMLBA	(16 << 10)		 /* attach addr a multiple of this */
 #else
 #define	SHMLBA	(4 * PAGE_SIZE)		 /* attach addr a multiple of this */
 #endif
+#else
+#define	SHMLBA	(PAGE_SIZE)		 /* attach addr a multiple of this */
+#endif
 
 /*
  * Enforce SHMLBA in shmat
diff -ur a/arch/arm/include/asm/smp_twd.h b/arch/arm/include/asm/smp_twd.h
--- a/arch/arm/include/asm/smp_twd.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/smp_twd.h	2014-02-17 11:56:14.000000000 +0100
@@ -24,5 +24,8 @@
 
 void twd_timer_setup(struct clock_event_device *);
 void twd_timer_stop(struct clock_event_device *);
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_LOCAL_TIMERS)
+int twd_timer_ack(void);
+#endif
 
 #endif
diff -ur a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
--- a/arch/arm/include/asm/thread_info.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/thread_info.h	2014-02-17 11:56:14.000000000 +0100
@@ -15,8 +15,13 @@
 #include <linux/compiler.h>
 #include <asm/fpstate.h>
 
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 #define THREAD_SIZE_ORDER	1
 #define THREAD_SIZE		8192
+#else
+#define THREAD_SIZE_ORDER	0
+#define THREAD_SIZE		65536
+#endif
 #define THREAD_START_SP		(THREAD_SIZE - 8)
 
 #ifndef __ASSEMBLY__
diff -ur a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h
--- a/arch/arm/include/asm/tlbflush.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/tlbflush.h	2014-02-17 11:56:14.000000000 +0100
@@ -479,6 +479,7 @@
  *	these operations.  This is typically used when we are removing
  *	PMD entries.
  */
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 static inline void flush_pmd_entry(void *pmd)
 {
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
@@ -547,6 +548,40 @@
 			: : "r" (pmd) : "cc");
 #endif
 }
+#else
+static inline void flush_pmd_entry(void *pmd)
+{
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+	char *p = (char *)pmd;
+
+	if (tlb_flag(TLB_DCLEAN)) {
+		while (p < ((char *)pmd + (LINKED_PMDS * sizeof(u32)))) { // A PMD contains LINKED_PMDS pointers to the 2nd-level table
+			asm("mcr	p15, 0, %0, c7, c10, 1	@ flush_pmd"
+					: : "r" (p) : "cc");
+			p += 32; //Next cache line
+		}
+	}
+
+	if (tlb_flag(TLB_WB))
+		dsb();
+}
+
+static inline void clean_pmd_entry(void *pmd)
+{
+	const unsigned int __tlb_flag = __cpu_tlb_flags;
+	char *p = (char *)pmd;
+
+	if (tlb_flag(TLB_DCLEAN)) {
+		while (p < ((char *)pmd + (LINKED_PMDS * sizeof(u32)))) { // A PMD contains LINKED_PMDS pointers to the 2nd-level table
+			asm("mcr	p15, 0, %0, c7, c10, 1	@ flush_pmd"
+					: : "r" (p) : "cc");
+			p += 32; //Next cache line
+		}
+	}
+}
+
+#endif
+
 
 #undef tlb_flag
 #undef always_tlb_flags
diff -ur a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
--- a/arch/arm/include/asm/unistd.h	2013-08-24 11:36:16.000000000 +0200
+++ b/arch/arm/include/asm/unistd.h	2014-02-17 11:56:14.000000000 +0100
@@ -410,11 +410,6 @@
 #define __NR_process_vm_writev		(__NR_SYSCALL_BASE+377)
 
 #ifdef MY_ABC_HERE
-#define __NR_SYNOmmap				(__NR_SYSCALL_BASE+400)
-#define SYNOmmap(x)					syscall(__NR_SYNOmmap, x)
-#endif
-
-#ifdef MY_ABC_HERE
 #define __NR_SYNOUtime                          (__NR_SYSCALL_BASE+402)
 #define SYNOUtime(arg1, arg2)                   syscall(__NR_SYNOUtime, arg1, arg2)
 #endif
diff -ur a/arch/arm/Kconfig b/arch/arm/Kconfig
--- a/arch/arm/Kconfig	2013-08-03 09:59:49.000000000 +0200
+++ b/arch/arm/Kconfig	2014-01-21 09:36:45.000000000 +0100
@@ -266,6 +266,16 @@
 	help
 	  Support for ARM's Integrator platform.
 
+config ARCH_COMCERTO
+	bool "Mindspeed Comcerto"
+	select ARCH_SUPPORTS_MSI
+	select NEED_MACH_MEMORY_H
+	select ARCH_REQUIRE_GPIOLIB
+	help
+	  This enables support for Mindspeed's Comcerto development boards.
+	  If you would like to build your kernel to run on one of these boards
+	  then you must say 'Y' here. Otherwise say 'N'.
+
 config ARCH_REALVIEW
 	bool "ARM Ltd. RealView family"
 	select ARM_AMBA
@@ -1141,6 +1151,8 @@
 
 source "arch/arm/mach-w90x900/Kconfig"
 
+source "arch/arm/mach-comcerto/Kconfig"
+
 # Definitions to make life easier
 config PLAT_ARMADA
 	bool
@@ -1427,6 +1439,16 @@
 	  on systems with an outer cache, the store buffer is drained
 	  explicitly.
 
+config ARM_ERRATA_775420
+       bool "ARM errata: A data cache maintenance operation which aborts, might lead to deadlock"
+       depends on CPU_V7 && SYNO_COMCERTO
+       help
+	 This option enables the workaround for the 775420 Cortex-A9 (r2p2,
+	 r2p6,r2p8,r2p10,r3p0) erratum. In case a date cache maintenance
+	 operation aborts with MMU exception, it might cause the processor
+	 to deadlock. This workaround puts DSB before executing ISB if
+	 an abort may occur on cache maintenance.
+
 endmenu
 
 source "arch/arm/common/Kconfig"
@@ -1504,7 +1526,7 @@
 	depends on REALVIEW_EB_ARM11MP || REALVIEW_EB_A9MP || \
 		 MACH_REALVIEW_PB11MP || MACH_REALVIEW_PBX || ARCH_OMAP4 || \
 		 ARCH_EXYNOS4 || ARCH_TEGRA || ARCH_U8500 || ARCH_VEXPRESS_CA9X4 || \
-		 ARCH_MSM_SCORPIONMP || ARCH_SHMOBILE || ARCH_HIGHBANK || SOC_IMX6Q || ARCH_ARMADA_XP
+		 ARCH_MSM_SCORPIONMP || ARCH_SHMOBILE || ARCH_HIGHBANK || SOC_IMX6Q || ARCH_ARMADA_XP || ARCH_COMCERTO
 	depends on MMU
 	select USE_GENERIC_SMP_HELPERS
 	select HAVE_ARM_SCU if !ARCH_MSM_SCORPIONMP
@@ -1568,6 +1590,11 @@
 	help
 	  This option enables support for the ARM system coherency unit
 
+config SCU_SPECULATIVE_LINE_FILLS
+	bool "SCU speculative line fills"
+	depends on HAVE_ARM_SCU && CACHE_PL310
+	default n
+
 config HAVE_ARM_TWD
 	bool
 	depends on SMP
diff -ur a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
--- a/arch/arm/kernel/calls.S	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/kernel/calls.S	2014-02-17 11:56:29.000000000 +0100
@@ -411,11 +411,7 @@
 		CALL(sys_ni_syscall)
 		CALL(sys_ni_syscall)
 		CALL(sys_ni_syscall)
-#ifdef MY_ABC_HERE
-		CALL(sys_SYNOmmap)  		/* 400 */
-#else
 		CALL(sys_ni_syscall)
-#endif
 		CALL(sys_ni_syscall)
 #ifdef MY_ABC_HERE
 		CALL(sys_SYNOUtime)	/* 402 */
diff -ur a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S
--- a/arch/arm/kernel/entry-common.S	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/kernel/entry-common.S	2014-02-17 11:56:29.000000000 +0100
@@ -14,7 +14,9 @@
 #include <asm/unwind.h>
 
 #include "entry-header.S"
-
+#if defined(CONFIG_SYNO_COMCERTO)
+#include <asm/page.h>
+#endif
 
 	.align	5
 /*
@@ -583,6 +585,9 @@
  */
 sys_mmap2:
 #if PAGE_SHIFT > 12
+#if defined(CONFIG_SYNO_COMCERTO)
+#define PGOFF_MASK ((1 << (PAGE_SHIFT - 12)) - 1)
+#endif
 		tst	r5, #PGOFF_MASK
 		moveq	r5, r5, lsr #PAGE_SHIFT - 12
 		streq	r5, [sp, #4]
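
sys_mmap2 always receives its file offset in 4096-byte units, so with CONFIG_COMCERTO_64K_PAGES (PAGE_SHIFT = 16) the PGOFF_MASK definition above lets the existing assembly reject offsets that are not a whole number of pages and rescale the unit. A minimal C sketch of the equivalent check; the helper name is illustrative:

#define PAGE_SHIFT	16
#define PGOFF_MASK	((1UL << (PAGE_SHIFT - 12)) - 1)	/* low bits that must be zero */

/* Mirrors the tst/moveq/streq sequence above: fail with -EINVAL if the
 * 4 kB-unit offset is not 64 kB aligned, otherwise convert it to pages. */
static long mmap2_rescale_pgoff(unsigned long pgoff_in_4k, unsigned long *pgoff_in_pages)
{
	if (pgoff_in_4k & PGOFF_MASK)
		return -22;				/* -EINVAL */
	*pgoff_in_pages = pgoff_in_4k >> (PAGE_SHIFT - 12);
	return 0;
}
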
diff -ur a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S
--- a/arch/arm/kernel/entry-header.S	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/kernel/entry-header.S	2014-02-17 11:56:29.000000000 +0100
@@ -108,11 +108,17 @@
 	movs	pc, lr				@ return & move spsr_svc into cpsr
 	.endm
 
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 	.macro	get_thread_info, rd
 	mov	\rd, sp, lsr #13
 	mov	\rd, \rd, lsl #13
 	.endm
-
+#else
+	.macro	get_thread_info, rd
+	mov	\rd, sp, lsr #16
+	mov	\rd, \rd, lsl #16
+	.endm
+#endif
 	@
 	@ 32-bit wide "mov pc, reg"
 	@
@@ -148,11 +154,18 @@
 	movs	pc, lr				@ return & move spsr_svc into cpsr
 	.endm
 
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 	.macro	get_thread_info, rd
 	mov	\rd, sp
 	lsr	\rd, \rd, #13
 	mov	\rd, \rd, lsl #13
 	.endm
+#else
+	.macro	get_thread_info, rd
+	mov	\rd, sp, lsr #16
+	mov	\rd, \rd, lsl #16
+	.endm
+#endif
 
 	@
 	@ 32-bit wide "mov pc, reg"
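
Both reworked get_thread_info macros simply round the stack pointer down to a THREAD_SIZE boundary; the shift changes from 13 (8 kB stacks) to 16 because THREAD_SIZE becomes 65536 in the thread_info.h hunk above. A minimal C sketch of the same computation; the helper name is illustrative:

#define THREAD_SIZE	65536

struct thread_info;	/* layout does not matter for this sketch */

/* struct thread_info sits at the base of the THREAD_SIZE-aligned kernel
 * stack, so clearing the low 16 bits of SP recovers it. */
static inline struct thread_info *thread_info_from_sp(unsigned long sp)
{
	return (struct thread_info *)(sp & ~((unsigned long)THREAD_SIZE - 1));
}
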
diff -ur a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
--- a/arch/arm/kernel/head.S	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/kernel/head.S	2014-02-17 11:56:29.000000000 +0100
@@ -50,9 +50,23 @@
 
 	.globl	swapper_pg_dir
 	.equ	swapper_pg_dir, KERNEL_RAM_VADDR - PG_DIR_SIZE
-
+#if defined(CONFIG_SYNO_COMCERTO)
+/*
+*	Mindspeed:
+*   Need to break the function in case text offset is too big
+*   this is the case when using zone_dma
+*	There is probably a more elegant way to to that
+*	original code:
+*	add	\rd, \phys, #TEXT_OFFSET - PG_DIR_SIZE
+*/
+#endif
 	.macro	pgtbl, rd, phys
+#if defined(CONFIG_SYNO_COMCERTO)
+	ldr	\rd, =TEXT_OFFSET - PG_DIR_SIZE
+	add	\rd, \phys, \rd
+#else
 	add	\rd, \phys, #TEXT_OFFSET - PG_DIR_SIZE
+#endif
 	.endm
 
 #ifdef CONFIG_XIP_KERNEL
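
The pgtbl macro change works around an encoding limit: an ARM A32 data-processing immediate must be an 8-bit value rotated right by an even amount, and with the ZONE_DMA layout TEXT_OFFSET - PG_DIR_SIZE can fall outside that set, so the constant is loaded from a literal pool with ldr instead. A small standalone C sketch of the encodability rule; the example constant is illustrative only:

/* Returns 1 if 'val' can be encoded as an ARM A32 data-processing
 * immediate, i.e. an 8-bit value rotated right by an even amount. */
static int arm_imm_encodable(unsigned int val)
{
	int rot;

	for (rot = 0; rot < 32; rot += 2) {
		/* rotate left by 'rot' to undo a rotate-right of 'rot' */
		unsigned int undone = rot ? (val << rot) | (val >> (32 - rot)) : val;

		if ((undone & ~0xffu) == 0)
			return 1;
	}
	return 0;
}

/* arm_imm_encodable(0x8000) is 1, so the original "add rd, phys, #imm"
 * assembles for small offsets; a wider constant such as 0x02c04000
 * (illustrative) is not encodable and needs the "ldr rd, =..." form. */
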
diff -ur a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile
--- a/arch/arm/kernel/Makefile	2013-08-03 09:59:49.000000000 +0200
+++ b/arch/arm/kernel/Makefile	2014-01-21 09:36:45.000000000 +0100
@@ -12,10 +12,11 @@
 CFLAGS_REMOVE_return_address.o = -pg
 
 # Object file lists.
-
+ifeq ($(CONFIG_SYNO_ARMADA_ARCH),y)
 ifneq ($(MACHINE),)
 include $(srctree)/$(MACHINE)/config/mvRules.mk
 endif
+endif
 
 ifeq ($(CONFIG_SYNO_ARMADA_ARCH),y)
 obj-y		:= elf.o entry-armv-armada.o entry-common-armada.o irq.o \
diff -ur a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
--- a/arch/arm/kernel/module.c	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/kernel/module.c	2014-02-17 11:56:29.000000000 +0100
@@ -94,6 +94,11 @@
 			       rel->r_offset, dstsec->sh_size);
 			return -ENOEXEC;
 		}
+#if defined(CONFIG_SYNO_COMCERTO)
+		if ((IS_ERR_VALUE(sym->st_value) || !sym->st_value) &&
+		    ELF_ST_BIND(sym->st_info) == STB_WEAK)
+			continue;
+#endif
 
 		loc = dstsec->sh_addr + rel->r_offset;
 
diff -ur a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
--- a/arch/arm/kernel/process.c	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/kernel/process.c	2014-02-17 11:56:29.000000000 +0100
@@ -495,22 +495,40 @@
 #ifdef CONFIG_MMU
 /*
  * The vectors page is always readable from user space for the
- * atomic helpers and the signal restart code.  Let's declare a mapping
- * for it so it is visible through ptrace and /proc/<pid>/mem.
+ * atomic helpers and the signal restart code. Insert it into the
+ * gate_vma so that it is visible through ptrace and /proc/<pid>/mem.
  */
+static struct vm_area_struct gate_vma;
 
-int vectors_user_mapping(void)
+static int __init gate_vma_init(void)
 {
-	struct mm_struct *mm = current->mm;
-	return install_special_mapping(mm, 0xffff0000, PAGE_SIZE,
-				       VM_READ | VM_EXEC |
-				       VM_MAYREAD | VM_MAYEXEC |
-				       VM_ALWAYSDUMP | VM_RESERVED,
-				       NULL);
+	gate_vma.vm_start	= 0xffff0000;
+	gate_vma.vm_end		= 0xffff0000 + PAGE_SIZE;
+	gate_vma.vm_page_prot	= PAGE_READONLY_EXEC;
+	gate_vma.vm_flags	= VM_READ | VM_EXEC |
+				  VM_MAYREAD | VM_MAYEXEC |
+				  VM_ALWAYSDUMP;
+	return 0;
+}
+arch_initcall(gate_vma_init);
+
+struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
+{
+	return &gate_vma;
+}
+
+int in_gate_area(struct mm_struct *mm, unsigned long addr)
+{
+	return (addr >= gate_vma.vm_start) && (addr < gate_vma.vm_end);
+}
+
+int in_gate_area_no_mm(unsigned long addr)
+{
+	return in_gate_area(NULL, addr);
 }
 
 const char *arch_vma_name(struct vm_area_struct *vma)
 {
-	return (vma->vm_start == 0xffff0000) ? "[vectors]" : NULL;
+	return (vma == &gate_vma) ? "[vectors]" : NULL;
 }
 #endif
diff -ur a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
--- a/arch/arm/kernel/setup.c	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/kernel/setup.c	2014-02-17 11:56:29.000000000 +0100
@@ -122,10 +122,6 @@
 extern long gSynoFlashMemorySize;
 #endif
 
-#ifdef CONFIG_SYNO_ARMADA
-extern int gSynoUSBStation;
-#endif
-
 #ifdef MY_ABC_HERE
 extern int gSynoFactoryUSBFastReset;
 #endif
@@ -438,28 +434,6 @@
 __setup("flash_size=", early_flash_memory_size);
 #endif
 
-#ifdef CONFIG_SYNO_ARMADA
-static int __init early_is_usbstation(char *p)
-{
-	int iLen = 0;
-
-	gSynoUSBStation = 0;
-
-	if ((NULL == p) || (0 == (iLen = strlen(p)))) {
-		goto END;
-	}
-
-	if ( 0 == strcmp (p, "y")) {
-		gSynoUSBStation = 1;
-		printk("Synology USB Station.\n");
-	}
-
-END:
-	return 1;
-}
-__setup("syno_usbstation=", early_is_usbstation);
-#endif
-
 #ifdef MY_ABC_HERE
 static int __init early_factory_usb_fast_reset(char *p)
 {
diff -ur a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
--- a/arch/arm/kernel/smp.c	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/kernel/smp.c	2014-02-17 11:56:29.000000000 +0100
@@ -367,7 +367,10 @@
 	 * now.
 	 */
 	local_irq_enable();
+
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_MSP)
 	local_fiq_enable();
+#endif  /* !CONFIG_SYNO_COMCERTO || !CONFIG_COMCERTO_MSP */
 
 	/*
 	 * OK, it's off to the idle thread for us
diff -ur a/arch/arm/kernel/smp_scu.c b/arch/arm/kernel/smp_scu.c
--- a/arch/arm/kernel/smp_scu.c	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/kernel/smp_scu.c	2014-02-17 11:56:29.000000000 +0100
@@ -52,6 +52,10 @@
 	if (scu_ctrl & 1)
 		return;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_SCU_SPECULATIVE_LINE_FILLS)
+	scu_ctrl |= (1 << 3);
+#endif
+
 	scu_ctrl |= 1;
 	__raw_writel(scu_ctrl, scu_base + SCU_CTRL);
 
diff -ur a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c
--- a/arch/arm/kernel/sys_arm.c	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/kernel/sys_arm.c	2014-02-17 11:56:29.000000000 +0100
@@ -29,21 +29,6 @@
 
 #include <linux/slab.h> 
 
-#ifdef MY_ABC_HERE
-asmlinkage int sys_SYNOmmap(SYNO_MMAP_ARG __user *arg)
-{
-	int error = -EFAULT;
-	SYNO_MMAP_ARG a;
-
-	if (copy_from_user(&a, arg, sizeof(a)))
-		goto out;;
-	
-	error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.pgoff);
-out:
-	return error;
-}
-#endif
-
 /* Fork a new task - this creates a new program thread.
  * This is called indirectly via a small wrapper
  */
diff -ur a/arch/arm/lib/copy_page.S b/arch/arm/lib/copy_page.S
--- a/arch/arm/lib/copy_page.S	2013-08-24 11:36:21.000000000 +0200
+++ b/arch/arm/lib/copy_page.S	2014-02-17 11:56:21.000000000 +0100
@@ -30,6 +30,8 @@
 	PLD(	pld	[r1, #L1_CACHE_BYTES]		)
 #if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_MV_SUPPORT_64KB_PAGE_SIZE)
 		ldr	r2, =COPY_COUNT
+#elif defined(CONFIG_SYNO_COMCERTO)
+		ldr	r2, =COPY_COUNT			@	1
 #else
 		mov	r2, #COPY_COUNT			@	1
 #endif
diff -ur a/arch/arm/mach-armada370/armada_370_family/boardEnv/mvBoardEnvLib.c b/arch/arm/mach-armada370/armada_370_family/boardEnv/mvBoardEnvLib.c
--- a/arch/arm/mach-armada370/armada_370_family/boardEnv/mvBoardEnvLib.c	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/mach-armada370/armada_370_family/boardEnv/mvBoardEnvLib.c	2014-02-17 11:56:30.000000000 +0100
@@ -2626,6 +2626,7 @@
 	case SYNO_US3_ID:
 	case SYNO_RS214_ID:
 	case SYNO_DS214se_ID:
+	case SYNO_DS414slim_ID:
 #endif
 		return &BOARD_INFO(boardId)->boardPexInfo;
 		break;
diff -ur a/arch/arm/mach-armada370/armada_370_family/boardEnv/mvBoardEnvSpec.c b/arch/arm/mach-armada370/armada_370_family/boardEnv/mvBoardEnvSpec.c
--- a/arch/arm/mach-armada370/armada_370_family/boardEnv/mvBoardEnvSpec.c	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/mach-armada370/armada_370_family/boardEnv/mvBoardEnvSpec.c	2014-02-17 11:56:30.000000000 +0100
@@ -428,7 +428,7 @@
 
 MV_BOARD_MAC_INFO synods213jInfoBoardMacInfo[] = {
 	/* {{MV_BOARD_MAC_SPEED	boardMacSpeed, MV_U8 boardEthSmiAddr}} */
-	{BOARD_MAC_SPEED_AUTO, 0x1},
+	{BOARD_MAC_SPEED_AUTO, 0x1, 0, 0},
 };
 
 MV_BOARD_MODULE_TYPE_INFO synods213jInfoBoardModTypeInfo[] = {
@@ -525,7 +525,7 @@
 
 MV_BOARD_MAC_INFO synods214seInfoBoardMacInfo[] = {
 	/* {{MV_BOARD_MAC_SPEED	boardMacSpeed, MV_U8 boardEthSmiAddr}} */
-	{BOARD_MAC_SPEED_AUTO, 0x1},
+	{BOARD_MAC_SPEED_AUTO, 0x1, 0, 0},
 };
 
 MV_BOARD_MODULE_TYPE_INFO synods214seInfoBoardModTypeInfo[] = {
@@ -621,12 +621,13 @@
 /***********************/
 MV_BOARD_MAC_INFO synous3InfoBoardMacInfo[] = {
 	/* {{MV_BOARD_MAC_SPEED	boardMacSpeed, MV_U8 boardEthSmiAddr}} */
-	{BOARD_MAC_SPEED_AUTO, 0x1},
+	{BOARD_MAC_SPEED_AUTO, 0x1, 0 ,0},
+	{BOARD_MAC_SPEED_AUTO, 0x0, 0 ,0},
 };
 
 MV_BOARD_MODULE_TYPE_INFO synous3InfoBoardModTypeInfo[] = {
 	{
-		.boardMppGrp1Mod	= MV_BOARD_AUTO,
+		.boardMppGrp1Mod	= MV_BOARD_RGMII1 | MV_BOARD_RGMII0,
 		.boardMppGrp2Mod	= MV_BOARD_AUTO
 	}
 };
@@ -717,8 +718,8 @@
 /**********************/
 MV_BOARD_MAC_INFO synors214InfoBoardMacInfo[] = {
 	/* {{MV_BOARD_MAC_SPEED	boardMacSpeed, MV_U8 boardEthSmiAddr}} */
-	{BOARD_MAC_SPEED_AUTO, 0x1},
-	{BOARD_MAC_SPEED_AUTO, 0x0},
+	{BOARD_MAC_SPEED_AUTO, 0x1, 0, 0},
+	{BOARD_MAC_SPEED_AUTO, 0x0, 0, 0},
 };
 
 MV_BOARD_MODULE_TYPE_INFO synors214InfoBoardModTypeInfo[] = {
@@ -807,6 +808,104 @@
 	.norFlashReadParams		= 0,
 	.norFlashWriteParams	= 0
 };
+
+/***********************/
+/* SYNO DS414slim BOARD   */
+/***********************/
+
+MV_BOARD_MAC_INFO synods414slimInfoBoardMacInfo[] = {
+	/* {{MV_BOARD_MAC_SPEED	boardMacSpeed, MV_U8 boardEthSmiAddr}} */
+	{BOARD_MAC_SPEED_AUTO, 0x1, 0, 0},
+	{BOARD_MAC_SPEED_AUTO, 0x0, 0, 0},
+};
+
+MV_BOARD_MODULE_TYPE_INFO synods414slimInfoBoardModTypeInfo[] = {
+	{
+		.boardMppGrp1Mod	= MV_BOARD_RGMII0|MV_BOARD_RGMII1,
+		.boardMppGrp2Mod	= MV_BOARD_AUTO
+	}
+};
+
+MV_DEV_CS_INFO synods414slimInfoBoardDeCsInfo[] = {
+	/*{deviceCS, params, devType, devWidth, busWidth }*/
+#if defined(MV_INCLUDE_SPI)
+	{SPI_CS0, N_A, BOARD_DEV_SPI_FLASH, 8, 8}, /* SPI DEV */
+#endif
+#if defined(MV_INCLUDE_NOR)
+	{DEV_BOOCS, N_A, BOARD_DEV_NOR_FLASH, 16, 16} /* NOR DEV */
+#endif
+};
+
+MV_BOARD_MPP_INFO synods414slimInfoBoardMppConfigValue[] = {
+	{ {
+		SYNO_DS414slim_MPP0_7,
+		SYNO_DS414slim_MPP8_15,
+		SYNO_DS414slim_MPP16_23,
+		SYNO_DS414slim_MPP24_31,
+		SYNO_DS414slim_MPP32_39,
+		SYNO_DS414slim_MPP40_47,
+		SYNO_DS414slim_MPP48_55,
+		SYNO_DS414slim_MPP56_63,
+		SYNO_DS414slim_MPP64_67,
+	} }
+};
+
+MV_BOARD_TDM_INFO	synods414slimTdm880[]	= { {0} };
+
+MV_BOARD_TDM_SPI_INFO synods414slimTdmSpiInfo[] = { {1} };
+
+MV_BOARD_INFO synods414slimInfo = {
+	.boardName				= "SYNO-DS414slim",
+	.enableModuleScan 			= MV_FALSE,
+	.numBoardMppTypeValue		= ARRSZ(synods414slimInfoBoardModTypeInfo),
+	.pBoardModTypeValue			= synods414slimInfoBoardModTypeInfo,
+	.numBoardMppConfigValue		= ARRSZ(synods414slimInfoBoardMppConfigValue),
+	.pBoardMppConfigValue		= synods414slimInfoBoardMppConfigValue,
+	.intsGppMaskLow				= 0,
+	.intsGppMaskMid				= 0,
+	.intsGppMaskHigh			= 0,
+	.numBoardDeviceIf			= ARRSZ(synods414slimInfoBoardDeCsInfo),
+	.pDevCsInfo					= synods414slimInfoBoardDeCsInfo,
+	.numBoardTwsiDev			= 0,
+	.pBoardTwsiDev				= NULL,
+	.numBoardMacInfo			= ARRSZ(synods414slimInfoBoardMacInfo),
+	.pBoardMacInfo				= synods414slimInfoBoardMacInfo,
+	.numBoardGppInfo			= 0,
+	.pBoardGppInfo				= NULL,
+	.activeLedsNumber			= 0,
+	.pLedGppPin					= NULL,
+	.ledsPolarity				= 0,
+
+	/* PMU Power */
+	.pmuPwrUpPolarity			= 0,
+	.pmuPwrUpDelay				= 16000,
+
+	/* GPP values */
+	.gppOutEnValLow			= SYNO_DS414slim_GPP_OUT_ENA_LOW,
+	.gppOutEnValMid			= SYNO_DS414slim_GPP_OUT_ENA_MID,
+	.gppOutEnValHigh		= SYNO_DS414slim_GPP_OUT_ENA_HIGH,
+	.gppOutValLow			= SYNO_DS414slim_GPP_OUT_VAL_LOW,
+	.gppOutValMid			= SYNO_DS414slim_GPP_OUT_VAL_MID,
+	.gppOutValHigh			= SYNO_DS414slim_GPP_OUT_VAL_HIGH,
+	.gppPolarityValLow		= SYNO_DS414slim_GPP_POL_LOW,
+	.gppPolarityValMid		= SYNO_DS414slim_GPP_POL_MID,
+	.gppPolarityValHigh		= SYNO_DS414slim_GPP_POL_HIGH,
+
+	/* External Switch Configuration */
+	.pSwitchInfo = NULL,
+	.switchInfoNum = 0,
+
+	/* TDM configuration */
+	.numBoardTdmInfo		= {1},
+	.pBoardTdmInt2CsInfo		= {synods414slimTdm880},
+	.boardTdmInfoIndex		= 0,
+	.pBoardTdmSpiInfo 		= synods414slimTdmSpiInfo,
+
+	/* NOR init params */
+	.norFlashReadParams		= 0,
+	.norFlashWriteParams	= 0
+};
+
 #endif /* CONFIG_SYNO_ARMADA_ARCH */
 
 MV_BOARD_INFO *boardInfoTbl[] = {
@@ -831,5 +930,6 @@
 	,&synous3Info
 	,&synors214Info
 	,&synods214seInfo
+	,&synods414slimInfo
 #endif
 };
diff -ur a/arch/arm/mach-armada370/armada_370_family/boardEnv/mvBoardEnvSpec.h b/arch/arm/mach-armada370/armada_370_family/boardEnv/mvBoardEnvSpec.h
--- a/arch/arm/mach-armada370/armada_370_family/boardEnv/mvBoardEnvSpec.h	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/mach-armada370/armada_370_family/boardEnv/mvBoardEnvSpec.h	2014-02-17 11:56:30.000000000 +0100
@@ -104,7 +104,8 @@
 #define SYNO_US3_ID			(SYNO_DS213j_ID + 1)
 #define SYNO_RS214_ID			(SYNO_US3_ID + 1)
 #define SYNO_DS214se_ID			(SYNO_RS214_ID + 1)
-#define MV_MAX_BOARD_ID			(SYNO_DS214se_ID + 1)
+#define SYNO_DS414slim_ID			(SYNO_DS214se_ID + 1)
+#define MV_MAX_BOARD_ID			(SYNO_DS414slim_ID + 1)
 
 #else /* CONFIG_SYNO_ARMADA_ARCH */
 #define MV_MAX_BOARD_ID			(RD_88F6710_ID + 1)
@@ -176,30 +177,23 @@
 /*******************/
 /*    SYNO US3 BP         */
 /*******************/
-#define SYNO_US3_MPP0_7		0x00010011
-#define SYNO_US3_MPP8_15		0x00000000
-#define SYNO_US3_MPP16_23		0x00000110
-#define SYNO_US3_MPP24_31		0x00000000
-#define SYNO_US3_MPP32_39		0x00222220
-#define SYNO_US3_MPP40_47		0x00000000
+#define SYNO_US3_MPP0_7		0x11110011
+#define SYNO_US3_MPP8_15		0x11111111
+#define SYNO_US3_MPP16_23		0x22222111
+#define SYNO_US3_MPP24_31		0x02222222
+#define SYNO_US3_MPP32_39		0x00000001
+#define SYNO_US3_MPP40_47		0x30000000
+#define SYNO_US3_MPP48_55		0x00033333
+#define SYNO_US3_MPP56_63		0x10000000
+#define SYNO_US3_MPP64_67		0x00000011
 
-#ifdef MV_INCLUDE_NOR
-#define SYNO_US3_MPP48_55		0x00000004
-#define SYNO_US3_MPP56_63		0x00000000
-#else
-#define SYNO_US3_MPP48_55		0x00000004
-#define SYNO_US3_MPP56_63		0x00000000
-#endif
-
-#define SYNO_US3_MPP64_67		0x00000000
 
-
-#define SYNO_US3_GPP_OUT_ENA_LOW		(~(BIT11|BIT12))
-#define SYNO_US3_GPP_OUT_ENA_MID		0x0
-#define SYNO_US3_GPP_OUT_ENA_HIGH		0x0
+#define SYNO_US3_GPP_OUT_ENA_LOW		(~(0x0))
+#define SYNO_US3_GPP_OUT_ENA_MID		(~(BIT8|BIT10|BIT11|BIT13|BIT28))
+#define SYNO_US3_GPP_OUT_ENA_HIGH		(~(0x0))
 
 #define SYNO_US3_GPP_OUT_VAL_LOW		0x0
-#define SYNO_US3_GPP_OUT_VAL_MID		0x0
+#define SYNO_US3_GPP_OUT_VAL_MID		(BIT11|BIT28)
 #define SYNO_US3_GPP_OUT_VAL_HIGH		0x0
 
 #define SYNO_US3_GPP_POL_LOW			0x0
@@ -231,6 +225,31 @@
 #define SYNO_RS214_GPP_POL_MID                  0x0
 #define SYNO_RS214_GPP_POL_HIGH                 0x0
 
+/*********************/
+/*    SYNO DS414slim */
+/*********************/
+#define SYNO_DS414slim_MPP0_7          0x11111111
+#define SYNO_DS414slim_MPP8_15         0x11111111
+#define SYNO_DS414slim_MPP16_23        0x22222111
+#define SYNO_DS414slim_MPP24_31        0x02222222
+#define SYNO_DS414slim_MPP32_39        0x00022220
+#define SYNO_DS414slim_MPP40_47        0x00000220
+#define SYNO_DS414slim_MPP48_55        0x00000004
+#define SYNO_DS414slim_MPP56_63        0x00000000
+#define SYNO_DS414slim_MPP64_67        0x00000000
+
+
+#define SYNO_DS414slim_GPP_OUT_ENA_LOW        (~(0x0))
+#define SYNO_DS414slim_GPP_OUT_ENA_MID        (~(BIT31))
+#define SYNO_DS414slim_GPP_OUT_ENA_HIGH       (~(BIT0|BIT1))
+
+#define SYNO_DS414slim_GPP_OUT_VAL_LOW        0x0
+#define SYNO_DS414slim_GPP_OUT_VAL_MID        0x0
+#define SYNO_DS414slim_GPP_OUT_VAL_HIGH       0x0
+
+#define SYNO_DS414slim_GPP_POL_LOW            0x0
+#define SYNO_DS414slim_GPP_POL_MID            0x0
+#define SYNO_DS414slim_GPP_POL_HIGH           0x0
 
 #define GPIO_UNDEF 0xFF
 
diff -ur a/arch/arm/mach-armada370/armada_370_family/ctrlEnv/mvCtrlEnvLib.c b/arch/arm/mach-armada370/armada_370_family/ctrlEnv/mvCtrlEnvLib.c
--- a/arch/arm/mach-armada370/armada_370_family/ctrlEnv/mvCtrlEnvLib.c	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/mach-armada370/armada_370_family/ctrlEnv/mvCtrlEnvLib.c	2014-02-17 11:56:30.000000000 +0100
@@ -275,6 +275,8 @@
 		return 2;
 	case SYNO_DS214se_ID:
 		return 1;
+	case SYNO_DS414slim_ID:
+		return 2;
 	default:
 		return MV_PEX_MAX_IF;
 	}
diff -ur a/arch/arm/mach-armada370/core.c b/arch/arm/mach-armada370/core.c
--- a/arch/arm/mach-armada370/core.c	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/mach-armada370/core.c	2014-02-17 11:56:30.000000000 +0100
@@ -132,9 +132,6 @@
 
 struct mbus_dram_target_info armadaxp_mbus_dram_info;
 
-/* USB Station */
-extern int gSynoUSBStation;
-
 /*********************************************************************************/
 /**************                 Early Printk Support                **************/
 /*********************************************************************************/
@@ -1225,6 +1222,7 @@
 #define SOFTWARE_REBOOT                 0x43
 extern void synology_gpio_init(void);
 
+void (*syno_power_off_indicator)(void) = NULL;
 static void synology_power_off(void)
 {
 #ifdef MY_ABC_HERE
@@ -1232,20 +1230,18 @@
 	syno_mv_net_shutdown();
 #endif
 
-	if (!gSynoUSBStation) {
-		writel(SET8N1, UART1_REG(LCR));
-		writel(SOFTWARE_SHUTDOWN, UART1_REG(TX));
+	writel(SET8N1, UART1_REG(LCR));
+	writel(SOFTWARE_SHUTDOWN, UART1_REG(TX));
+
+	if (syno_power_off_indicator) {
+		syno_power_off_indicator();
 	}
 }
 
 static void synology_restart(char mode, const char *cmd)
 {
-	if (gSynoUSBStation) {
-		mvBoardReset();
-	} else {
-		writel(SET8N1, UART1_REG(LCR));
-		writel(SOFTWARE_REBOOT, UART1_REG(TX));
-	}
+	writel(SET8N1, UART1_REG(LCR));
+	writel(SOFTWARE_REBOOT, UART1_REG(TX));
 
 	/* Calls original reset function for models those do not use uP
 	* I.e. USB Station. */
diff -ur a/arch/arm/mach-armada370/Makefile b/arch/arm/mach-armada370/Makefile
--- a/arch/arm/mach-armada370/Makefile	2013-08-03 09:59:49.000000000 +0200
+++ b/arch/arm/mach-armada370/Makefile	2014-01-21 09:36:45.000000000 +0100
@@ -112,7 +112,7 @@
 		  $(HAL_QD_DIR)/src/msapi/gtPIRL2.o $(HAL_QD_DIR)/src/msapi/gtCCPVT.o		\
 		  $(HAL_QD_DIR)/src/msapi/gtPCSCtrl.o $(HAL_QD_DIR)/src/msapi/gtBrgStu.o
 
-LSP_OBJS        = core.o irq.o time.o leds.o sysmap.o export.o clock.o synology-gpio.o
+LSP_OBJS        = core.o irq.o time.o leds.o sysmap.o export.o clock.o synology-gpio.o synology-platform.o
 
 obj-y   				:=  armada370.o
 armada370-objs  			:=$(LSP_OBJS) $(COMMON_OBJS) $(OSSERVICES_OBJS) $(HAL_OBJS) 	\
diff -ur a/arch/arm/mach-armada370/synology-gpio.c b/arch/arm/mach-armada370/synology-gpio.c
--- a/arch/arm/mach-armada370/synology-gpio.c	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/mach-armada370/synology-gpio.c	2014-02-17 11:56:30.000000000 +0100
@@ -40,16 +40,11 @@
 #define DISK_LED_ORANGE_SOLID	2
 #define DISK_LED_ORANGE_BLINK	3
 #define DISK_LED_GREEN_BLINK    4
-#define DISK_LED_BLUE			5
 
 #define SYNO_LED_OFF		0
 #define SYNO_LED_ON			1
 #define SYNO_LED_BLINKING	2
 
-#ifdef  MY_ABC_HERE
-extern char gszSynoHWVersion[];
-#endif
-
 typedef struct __tag_SYNO_ARMADA_HDD_PM_GPIO {
 	u8 hdd1_pm;
 	u8 hdd2_pm;
@@ -84,6 +79,7 @@
 	u8 hdd4_led_1;
 	u8 hdd5_led_0;
 	u8 hdd5_led_1;
+	u8 hdd_led_mask;
 } SYNO_ARMADA_EXT_HDD_LED_GPIO;
 
 typedef struct __tag_SYNO_ARMADA_MULTI_BAY_GPIO {
@@ -107,6 +103,10 @@
 	u8 power_led;
 } SYNO_ARMADA_STATUS_LED_GPIO;
 
+typedef struct __tag_SYNO_ARMADA_USB_GPIO {
+	u8 usb_power;
+} SYNO_ARMADA_USB_GPIO;
+
 typedef struct __tag_SYNO_ARMADA_GENERIC_GPIO {
 	SYNO_ARMADA_EXT_HDD_LED_GPIO	ext_sata_led;
 	SYNO_ARMADA_SOC_HDD_LED_GPIO	soc_sata_led;
@@ -116,6 +116,7 @@
 	SYNO_ARMADA_RACK_GPIO			rack;
 	SYNO_ARMADA_MULTI_BAY_GPIO		multi_bay;
 	SYNO_ARMADA_STATUS_LED_GPIO		status;
+	SYNO_ARMADA_USB_GPIO			usb;
 }SYNO_ARMADA_GENERIC_GPIO;
 
 static SYNO_ARMADA_GENERIC_GPIO generic_gpio;
@@ -185,8 +186,14 @@
 	return MV_OK;
 }
 
+void SYNO_ENABLE_HDD_LED(int blEnable)
+{
+	if (GPIO_UNDEF != generic_gpio.ext_sata_led.hdd_led_mask)
+		gpio_set_value(generic_gpio.ext_sata_led.hdd_led_mask, blEnable ? 0 : 1);
+}
+
 int
-SYNO_CTRL_INTERNAL_HDD_LED_SET(int index, int status)
+SYNO_SOC_HDD_LED_SET(int index, int status)
 {
 	int ret = -1;
 	int mpp_pin;
@@ -196,7 +203,7 @@
 	int active = 0; //note: led is low active
 
 #ifdef MY_ABC_HERE
-	if (0 == strcmp(gszSynoHWVersion, "RS214v10-j")) {
+	if (syno_is_hw_version(HW_RS214v10)) {
 		// RS214 led is high active
 		active = 1;
 	}
@@ -241,12 +248,17 @@
 		gpio_set_value(fail_led, active);
 	}
 	else if ( DISK_LED_GREEN_SOLID == status ||
-			  DISK_LED_OFF == status ||
-			  DISK_LED_BLUE == status)
+			  DISK_LED_GREEN_BLINK == status )
 	{
 		SYNOMppCtrlRegWrite(mpp_pin, mode_sata_present);  // change MPP to sata present mode
 		gpio_set_value(fail_led, !active);
 	}
+	else if (DISK_LED_OFF == status)
+	{
+		SYNOMppCtrlRegWrite(mpp_pin, mode_gpio);
+		gpio_set_value(mpp_pin, !active);
+		gpio_set_value(fail_led, !active);
+	}
 	else
 	{
 		printk("Wrong HDD led status [%d]\n", status);
@@ -335,7 +347,6 @@
 		blink2 = 0;
 		break;
 	case DISK_LED_GREEN_SOLID:
-	case DISK_LED_BLUE:
 		bit1 = 0;
 		bit2 = 1;
 		blink1 = 0;
@@ -554,6 +565,9 @@
 	case SYNO_DS214se_ID:
 		def_max_disk = 2;
 		break;
+	case SYNO_DS414slim_ID:
+		def_max_disk = 0;
+		break;
 
 	default:
 		break;
@@ -565,7 +579,7 @@
 	if (table) {
 		int i;
 		for (i = 0; i < table_cnt; i++) {
-			if (0 == strcmp(table[i].hw_version, gszSynoHWVersion)) {
+			if (syno_is_hw_version(table[i].hw_version)) {
 				if (disk_id <= table[i].max_disk_id) {
 					ret = 1;
 				}
@@ -587,10 +601,17 @@
 	return 1;
 }
 
+void SYNO_ENABLE_USB_POWER(int blEnable)
+{
+	if (GPIO_UNDEF != generic_gpio.usb.usb_power)
+		gpio_set_value(generic_gpio.usb.usb_power, blEnable ? 0 : 1);
+}
+
 EXPORT_SYMBOL(SYNOArmadaIsBoardNeedPowerUpHDD);
 EXPORT_SYMBOL(SYNO_ARMADA_GPIO_PIN);
 EXPORT_SYMBOL(SYNO_ARMADA_GPIO_BLINK);
-EXPORT_SYMBOL(SYNO_CTRL_INTERNAL_HDD_LED_SET);
+EXPORT_SYMBOL(SYNO_ENABLE_HDD_LED);
+EXPORT_SYMBOL(SYNO_SOC_HDD_LED_SET);
 EXPORT_SYMBOL(SYNO_CTRL_EXT_CHIP_HDD_LED_SET);
 EXPORT_SYMBOL(SYNO_CTRL_USB_HDD_LED_SET);
 EXPORT_SYMBOL(SYNO_CTRL_POWER_LED_SET);
@@ -601,6 +622,7 @@
 EXPORT_SYMBOL(SYNO_CTRL_BACKPLANE_STATUS_GET);
 EXPORT_SYMBOL(SYNO_CTRL_BUZZER_CLEARED_GET);
 EXPORT_SYMBOL(SYNO_CHECK_HDD_PRESENT);
+EXPORT_SYMBOL(SYNO_ENABLE_USB_POWER);
 
 /*
  Pin 		Mode	Signal select and definition	Input/output	Pull-up/pull-down
@@ -644,6 +666,7 @@
 							.hdd4_led_1 = GPIO_UNDEF,
 							.hdd5_led_0 = GPIO_UNDEF,
 							.hdd5_led_1 = GPIO_UNDEF,
+							.hdd_led_mask = GPIO_UNDEF,
 						},
 		.soc_sata_led = {
 							.hdd2_fail_led = 32,
@@ -682,11 +705,28 @@
 							.power_led = GPIO_UNDEF,
 							.alarm_led = GPIO_UNDEF,
 						},
+		.usb		  = {
+							.usb_power = GPIO_UNDEF,
+						},
 	};
 
 	*global_gpio = gpio_213j;
 }
 
+extern void (*syno_power_off_indicator)(void);
+static void us3_power_off(void)
+{
+	/* since US3 has no microP to power off,
+	 * we need an indicator for system halt */
+	printk("Set US3 shutdown indicator\n");
+	/* set power green off */
+	gpio_set_value(42, 1);
+	SYNO_ARMADA_GPIO_BLINK(42, 0);
+	/* set power orange on */
+	gpio_set_value(43, 0);
+	SYNO_ARMADA_GPIO_BLINK(43, 0);
+}
+
 static void
 Armada_370_us3_GPIO_init(SYNO_ARMADA_GENERIC_GPIO *global_gpio)
 {
@@ -702,6 +742,7 @@
 							.hdd4_led_1 = GPIO_UNDEF,
 							.hdd5_led_0 = GPIO_UNDEF,
 							.hdd5_led_1 = GPIO_UNDEF,
+							.hdd_led_mask = GPIO_UNDEF,
 						},
 		.soc_sata_led = {
 							.hdd2_fail_led = GPIO_UNDEF,
@@ -740,9 +781,15 @@
 							.power_led = GPIO_UNDEF,
 							.alarm_led = GPIO_UNDEF,
 						},
+		.usb		  = {
+							.usb_power = GPIO_UNDEF,
+						},
 	};
 
 	*global_gpio = gpio_us3;
+
+	/* customize power off indicator */
+	syno_power_off_indicator = us3_power_off;
 }
 
 static void
@@ -760,6 +807,7 @@
 							.hdd4_led_1 = GPIO_UNDEF,
 							.hdd5_led_0 = GPIO_UNDEF,
 							.hdd5_led_1 = GPIO_UNDEF,
+							.hdd_led_mask = GPIO_UNDEF,
 						},
 		.soc_sata_led = {
 							.hdd2_fail_led = 32,
@@ -798,6 +846,9 @@
 							.power_led = GPIO_UNDEF,
 							.alarm_led = GPIO_UNDEF,
 						},
+		.usb		  = {
+							.usb_power = GPIO_UNDEF,
+						},
 	};
 
 	*global_gpio = gpio_rs214;
@@ -818,6 +869,7 @@
 							.hdd4_led_1 = GPIO_UNDEF,
 							.hdd5_led_0 = GPIO_UNDEF,
 							.hdd5_led_1 = GPIO_UNDEF,
+							.hdd_led_mask = GPIO_UNDEF,
 						},
 		.soc_sata_led = {
 							.hdd2_fail_led = 32,
@@ -856,10 +908,75 @@
 							.power_led = GPIO_UNDEF,
 							.alarm_led = GPIO_UNDEF,
 						},
+		.usb		  = {
+							.usb_power = GPIO_UNDEF,
+						},
 	};
 
 	*global_gpio = gpio_214se;
 }
+
+static void
+Armada_370_414slim_GPIO_init(SYNO_ARMADA_GENERIC_GPIO *global_gpio)
+{
+	SYNO_ARMADA_GENERIC_GPIO gpio_414slim = {
+		.ext_sata_led = {
+							.hdd1_led_0 = GPIO_UNDEF,
+							.hdd1_led_1 = GPIO_UNDEF,
+							.hdd2_led_0 = GPIO_UNDEF,
+							.hdd2_led_1 = GPIO_UNDEF,
+							.hdd3_led_0 = GPIO_UNDEF,
+							.hdd3_led_1 = GPIO_UNDEF,
+							.hdd4_led_0 = GPIO_UNDEF,
+							.hdd4_led_1 = GPIO_UNDEF,
+							.hdd5_led_0 = GPIO_UNDEF,
+							.hdd5_led_1 = GPIO_UNDEF,
+							.hdd_led_mask = 39,
+						},
+		.soc_sata_led = {
+							.hdd2_fail_led = GPIO_UNDEF,
+							.hdd1_fail_led = GPIO_UNDEF,
+						},
+		.model		  = {
+							.model_id_0 = 55,
+							.model_id_1 = 56,
+							.model_id_2 = 57,
+							.model_id_3 = 58,
+						},
+		.fan		  = {
+							.fan_1 = 65,
+							.fan_2 = 64,
+							.fan_3 = 63,
+							.fan_fail = 38,
+							.fan_fail_2 = GPIO_UNDEF,
+							.fan_fail_3 = GPIO_UNDEF,
+						},
+		.hdd_pm		  = {
+							.hdd1_pm = GPIO_UNDEF,
+							.hdd2_pm = GPIO_UNDEF,
+							.hdd3_pm = GPIO_UNDEF,
+							.hdd4_pm = GPIO_UNDEF,
+						},
+		.rack		  = {
+							.buzzer_mute_req = GPIO_UNDEF,
+							.buzzer_mute_ack = GPIO_UNDEF,
+							.rps1_on = GPIO_UNDEF,
+							.rps2_on = GPIO_UNDEF,
+						},
+		.multi_bay	  = {
+							.inter_lock = GPIO_UNDEF,
+						},
+		.status		  = {
+							.power_led = GPIO_UNDEF,
+							.alarm_led = GPIO_UNDEF,
+						},
+		.usb		  = {
+							.usb_power = 44,
+						},
+	};
+
+	*global_gpio = gpio_414slim;
+}
 static void
 ARMADA_default_GPIO_init(SYNO_ARMADA_GENERIC_GPIO *global_gpio)
 {
@@ -875,6 +992,7 @@
 							.hdd4_led_1 = GPIO_UNDEF,
 							.hdd5_led_0 = GPIO_UNDEF,
 							.hdd5_led_1 = GPIO_UNDEF,
+							.hdd_led_mask = GPIO_UNDEF,
 						},
 		.soc_sata_led = {
 							.hdd2_fail_led = GPIO_UNDEF,
@@ -913,6 +1031,9 @@
 							.power_led = GPIO_UNDEF,
 							.alarm_led = GPIO_UNDEF,
 						},
+		.usb		  = {
+							.usb_power = GPIO_UNDEF,
+						},
 	};
 
 	*global_gpio = gpio_default;
@@ -938,6 +1059,10 @@
 		Armada_370_214se_GPIO_init(&generic_gpio);
 		printk("Synology Armada370 DS214se GPIO Init\n");
 		break;
+	case SYNO_DS414slim_ID:
+		Armada_370_414slim_GPIO_init(&generic_gpio);
+		printk("Synology Armada370 DS414slim GPIO Init\n");
+		break;
 
 	default:
 		printk("%s BoardID not match\n", __FUNCTION__);
Only in b/arch/arm/mach-armada370: synology-platform.c.
diff -ur a/arch/arm/mach-armada370/sysmap.c b/arch/arm/mach-armada370/sysmap.c
--- a/arch/arm/mach-armada370/sysmap.c	2013-08-24 11:36:25.000000000 +0200
+++ b/arch/arm/mach-armada370/sysmap.c	2014-02-17 11:56:30.000000000 +0100
@@ -84,6 +84,7 @@
 		case SYNO_US3_ID:
 		case SYNO_RS214_ID:
 		case SYNO_DS214se_ID:
+		case SYNO_DS414slim_ID:
 #endif
 			return SYSMAP_ARMADA_370;
 		default:
diff -ur a/arch/arm/mach-armadaxp/synology-gpio.c b/arch/arm/mach-armadaxp/synology-gpio.c
--- a/arch/arm/mach-armadaxp/synology-gpio.c	2013-08-24 11:36:23.000000000 +0200
+++ b/arch/arm/mach-armadaxp/synology-gpio.c	2014-02-17 11:56:25.000000000 +0100
@@ -40,16 +40,11 @@
 #define DISK_LED_ORANGE_SOLID	2
 #define DISK_LED_ORANGE_BLINK	3
 #define DISK_LED_GREEN_BLINK    4
-#define DISK_LED_BLUE			5
 
 #define SYNO_LED_OFF		0
 #define SYNO_LED_ON		1
 #define SYNO_LED_BLINKING	2
 
-#ifdef  MY_ABC_HERE
-extern char gszSynoHWVersion[];
-#endif
-
 typedef struct __tag_SYNO_HDD_DETECT_GPIO {
 	u8 hdd1_present_detect;
 	u8 hdd2_present_detect;
@@ -255,8 +250,7 @@
 		gpio_set_value(mpp_pin, !active);
 		gpio_set_value(fail_led, active);
 	}
-	else if ( DISK_LED_GREEN_SOLID == status ||
-			  DISK_LED_BLUE == status)
+	else if ( DISK_LED_GREEN_SOLID == status )
 	{
 		SYNOMppCtrlRegWrite(mpp_pin, mode_sata_present);  // change MPP to sata present mode
 		gpio_set_value(fail_led, !active);
@@ -355,7 +349,6 @@
 		blink2 = 0;
 		break;
 	case DISK_LED_GREEN_SOLID:
-	case DISK_LED_BLUE:
 		bit1 = 0;
 		bit2 = 1;
 		blink1 = 0;
@@ -602,7 +595,7 @@
 {
 	int i=0;
 	while (tbl[i].hw_version) {
-		if (0 == strcmp(tbl[i].hw_version, gszSynoHWVersion))
+		if (syno_is_hw_version(tbl[i].hw_version))
 			return tbl[i].max_disk_id;
 		i++;
 	}
Only in b/arch/arm: mach-comcerto.
diff -ur a/arch/arm/mach-kirkwood/include/mach/synology.h b/arch/arm/mach-kirkwood/include/mach/synology.h
--- a/arch/arm/mach-kirkwood/include/mach/synology.h	2013-08-24 11:36:23.000000000 +0200
+++ b/arch/arm/mach-kirkwood/include/mach/synology.h	2014-02-17 11:56:27.000000000 +0100
@@ -37,7 +37,6 @@
 #define DISK_LED_GREEN_SOLID    1
 #define DISK_LED_ORANGE_SOLID   2
 #define DISK_LED_ORANGE_BLINK   3
-#define DISK_LED_BLUE			5
 
 typedef struct __tag_SYNO_FAN_GPIO {
 	u8 fan_1;
diff -ur a/arch/arm/mach-kirkwood/synology-setup.c b/arch/arm/mach-kirkwood/synology-setup.c
--- a/arch/arm/mach-kirkwood/synology-setup.c	2013-08-24 11:36:23.000000000 +0200
+++ b/arch/arm/mach-kirkwood/synology-setup.c	2014-02-17 11:56:27.000000000 +0100
@@ -356,8 +356,7 @@
 	//note: hd led is active low
 	if ( DISK_LED_OFF == status ) {
 		fail_led = 1;
-	} else if ( DISK_LED_GREEN_SOLID == status ||
-				DISK_LED_BLUE == status) {
+	} else if ( DISK_LED_GREEN_SOLID == status ) {
 		fail_led = 1;
 	} else if ( DISK_LED_ORANGE_SOLID == status ||
 		DISK_LED_ORANGE_BLINK == status ) {
diff -ur a/arch/arm/Makefile b/arch/arm/Makefile
--- a/arch/arm/Makefile	2013-08-03 09:59:49.000000000 +0200
+++ b/arch/arm/Makefile	2014-01-21 09:36:45.000000000 +0100
@@ -27,7 +27,11 @@
 
 # Do not use arch/arm/defconfig - it's always outdated.
 # Select a platform tht is kept up-to-date
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+KBUILD_DEFCONFIG := c2krtsm_defconfig
+else
 KBUILD_DEFCONFIG := versatile_defconfig
+endif
 
 # defines filename extension depending memory management type.
 ifeq ($(CONFIG_MMU),)
@@ -140,6 +144,10 @@
 machine-$(CONFIG_ARCH_BCMRING)		:= bcmring
 machine-$(CONFIG_ARCH_CLPS711X)		:= clps711x
 machine-$(CONFIG_ARCH_CNS3XXX)		:= cns3xxx
+machine-$(CONFIG_ARCH_COMCERTO)         := comcerto
+ifeq ($(CONFIG_ARCH_COMCERTO),y)
+textofs-$(CONFIG_ZONE_DMA) := 0x04008000
+endif
 machine-$(CONFIG_ARCH_DAVINCI)		:= davinci
 machine-$(CONFIG_ARCH_DOVE)		:= dove
 machine-$(CONFIG_ARCH_EBSA110)		:= ebsa110
@@ -202,6 +210,8 @@
 machine-$(CONFIG_ARCH_ZYNQ)		:= zynq
 machine-$(CONFIG_ARCH_ARMADA_XP)	:= armadaxp
 
+
+
 # Platform directory name.  This list is sorted alphanumerically
 # by CONFIG_* macro name.
 plat-$(CONFIG_PLAT_ARMADA)	:= armada
diff -ur a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c
--- a/arch/arm/mm/cache-l2x0.c	2013-08-24 11:36:24.000000000 +0200
+++ b/arch/arm/mm/cache-l2x0.c	2014-02-17 11:56:29.000000000 +0100
@@ -299,8 +299,10 @@
 		lockregs = 1;
 
 	for (i = 0; i < lockregs; i++) {
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_L2X0_INSTRUCTION_ONLY)
 		writel_relaxed(0x0, l2x0_base + L2X0_LOCKDOWN_WAY_D_BASE +
 			       i * L2X0_LOCKDOWN_STRIDE);
+#endif
 		writel_relaxed(0x0, l2x0_base + L2X0_LOCKDOWN_WAY_I_BASE +
 			       i * L2X0_LOCKDOWN_STRIDE);
 	}
@@ -311,6 +313,9 @@
 	__u32 aux;
 	__u32 cache_id;
 	__u32 way_size = 0;
+#if defined(CONFIG_SYNO_COMCERTO)
+	__u32 prefetch = 0;
+#endif
 	int ways;
 	const char *type;
 
@@ -330,11 +335,25 @@
 		else
 			ways = 8;
 		type = "L310";
+
 #ifdef CONFIG_PL310_ERRATA_753970
 		/* Unmapped register. */
 		sync_reg_offset = L2X0_DUMMY_REG;
 #endif
 		outer_cache.set_debug = pl310_set_debug;
+
+#if defined(CONFIG_SYNO_COMCERTO)
+		prefetch = readl_relaxed(l2x0_base + L2X0_PREFETCH_CTRL);
+
+#ifdef CONFIG_PL310_DOUBLE_LINE_FILL
+		prefetch |= (1 << 30) | (1 << 24);
+#ifdef CONFIG_PL310_INCR_DOUBLE_LINE_FILL
+		prefetch |= (1 << 23);
+#endif
+#endif
+		writel_relaxed(prefetch, l2x0_base + L2X0_PREFETCH_CTRL);
+#endif
+
 		break;
 	case L2X0_CACHE_ID_PART_L210:
 		ways = (aux >> 13) & 0xf;
@@ -385,8 +404,13 @@
 	outer_cache.disable = l2x0_disable;
 
 	printk(KERN_INFO "%s cache controller enabled\n", type);
+#if defined(CONFIG_SYNO_COMCERTO)
+	printk(KERN_INFO "l2x0: %d ways, CACHE_ID 0x%08x, AUX_CTRL 0x%08x, PREFETCH_CTRL 0x%08x, Cache size: %d B\n",
+			ways, cache_id, aux, prefetch, l2x0_size);
+#else
 	printk(KERN_INFO "l2x0: %d ways, CACHE_ID 0x%08x, AUX_CTRL 0x%08x, Cache size: %d B\n",
 			ways, cache_id, aux, l2x0_size);
+#endif
 }
 
 #ifdef CONFIG_OF
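
/*
 * Illustrative sketch only -- not part of the patch above. It restates the
 * bit arithmetic the cache-l2x0.c hunk applies to the PL310 prefetch control
 * register when CONFIG_PL310_DOUBLE_LINE_FILL (and optionally
 * CONFIG_PL310_INCR_DOUBLE_LINE_FILL) are enabled. The bit positions come
 * straight from the diff (bits 30, 24, 23); the symbolic names below are
 * local to this example and assume the usual PL310 TRM meaning of those bits.
 */
#include <stdint.h>
#include <stdio.h>

#define PF_DOUBLE_LINEFILL      (1u << 30)	/* bit the patch always sets */
#define PF_PREFETCH_DROP        (1u << 24)	/* bit the patch always sets */
#define PF_INCR_DOUBLE_LINEFILL (1u << 23)	/* only with the INCR option */

static uint32_t pl310_prefetch_value(uint32_t current, int incr_double_fill)
{
	uint32_t v = current;

	v |= PF_DOUBLE_LINEFILL | PF_PREFETCH_DROP;
	if (incr_double_fill)
		v |= PF_INCR_DOUBLE_LINEFILL;
	return v;
}

int main(void)
{
	/* pretend the register currently reads 0, both options enabled */
	printf("PREFETCH_CTRL -> 0x%08x\n", pl310_prefetch_value(0, 1));
	return 0;
}
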
diff -ur a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
--- a/arch/arm/mm/cache-v7.S	2013-08-24 11:36:24.000000000 +0200
+++ b/arch/arm/mm/cache-v7.S	2014-02-17 11:56:28.000000000 +0100
@@ -211,9 +211,19 @@
  * isn't mapped, just try the next page.
  */
 9001:
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARM_ERRATA_775420)
+	dsb
+#endif
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 	mov	r12, r12, lsr #12
 	mov	r12, r12, lsl #12
 	add	r12, r12, #4096
+#else
+	mov	r12, r12, lsr #16
+	mov	r12, r12, lsl #16
+	add	r12, r12, #65536
+
+#endif
 	b	3b
  UNWIND(.fnend		)
 ENDPROC(v7_coherent_kern_range)
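
/*
 * Illustrative sketch only -- equivalent C for the 9001: fixup path above:
 * the lsr/lsl pair clears the page-offset bits of the faulting address and
 * the add steps to the start of the next page. The shift is 12 for 4 KiB
 * pages and 16 when CONFIG_COMCERTO_64K_PAGES is set (an assumption about
 * that option, inferred from the constants 4096 and 65536 in the hunk).
 */
#include <stdint.h>
#include <stdio.h>

static uintptr_t next_page(uintptr_t addr, unsigned int page_shift)
{
	/* mask off the offset bits, then advance by one page */
	return ((addr >> page_shift) << page_shift) + ((uintptr_t)1 << page_shift);
}

int main(void)
{
	printf("0x%lx\n", (unsigned long)next_page(0x12345, 12));	/* 0x13000 */
	printf("0x%lx\n", (unsigned long)next_page(0x12345, 16));	/* 0x20000 */
	return 0;
}
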
diff -ur a/arch/arm/mm/copypage-v6.c b/arch/arm/mm/copypage-v6.c
--- a/arch/arm/mm/copypage-v6.c	2013-08-24 11:36:24.000000000 +0200
+++ b/arch/arm/mm/copypage-v6.c	2014-02-17 11:56:28.000000000 +0100
@@ -20,9 +20,16 @@
 
 #include "mm.h"
 
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 #if SHMLBA > 16384
 #error FIX ME
 #endif
+#else
+#if SHMLBA > PAGE_SIZE
+#error FIX ME
+#endif
+#endif
+
 
 #define from_address	(0xffff8000)
 #define to_address	(0xffffc000)
diff -ur a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
--- a/arch/arm/mm/dma-mapping.c	2013-08-24 11:36:24.000000000 +0200
+++ b/arch/arm/mm/dma-mapping.c	2014-02-17 11:56:28.000000000 +0100
@@ -29,6 +29,10 @@
 
 #include "mm.h"
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_ZONE_DMA_NCNB)
+extern unsigned long arm_dma_zone_size;
+#endif
+
 static u64 get_coherent_dma_mask(struct device *dev)
 {
 	u64 mask = (u64)arm_dma_limit;
@@ -168,8 +172,11 @@
 	pte_t *pte;
 	int i = 0;
 	unsigned long base = consistent_base;
+#if defined(CONFIG_SYNO_COMCERTO)
+	unsigned long num_ptes = (CONSISTENT_END - base + PMD_SIZE -1) >> PMD_SHIFT;
+#else
 	unsigned long num_ptes = (CONSISTENT_END - base) >> PMD_SHIFT;
-
+#endif
 	consistent_pte = kmalloc(num_ptes * sizeof(pte_t), GFP_KERNEL);
 	if (!consistent_pte) {
 		pr_err("%s: no memory\n", __func__);
@@ -195,8 +202,9 @@
 			ret = -ENOMEM;
 			break;
 		}
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 		WARN_ON(!pmd_none(*pmd));
-
+#endif
 		pte = pte_alloc_kernel(pmd, base);
 		if (!pte) {
 			printk(KERN_ERR "%s: no pte tables\n", __func__);
@@ -205,8 +213,13 @@
 		}
 
 		consistent_pte[i++] = pte;
+#if defined(CONFIG_SYNO_COMCERTO)
+		base = (base + PMD_SIZE) & PMD_MASK;
+	} while ((base-1) < (CONSISTENT_END - 1));
+#else
 		base += PMD_SIZE;
 	} while (base < CONSISTENT_END);
+#endif
 
 	return ret;
 }
@@ -455,6 +468,21 @@
 }
 EXPORT_SYMBOL(dma_free_coherent);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+static inline void __dmac_unmap_area(const void *kaddr, size_t size,
+	int dir)
+{
+#if !defined(CONFIG_CPU_SPECULATIVE_ACCESS_DISABLED)
+	dmac_unmap_area(kaddr, size, dir);
+#else
+	size_t size_inv = min_t(size_t, 32, size);
+
+	dmac_unmap_area(kaddr, size_inv, dir);
+	dmac_unmap_area(kaddr + size - size_inv, size_inv, dir);
+#endif
+}
+#endif
+
 /*
  * Make an area consistent for devices.
  * Note: Drivers should NOT use this function directly, as it will break
@@ -464,28 +492,71 @@
 void ___dma_single_cpu_to_dev(const void *kaddr, size_t size,
 	enum dma_data_direction dir)
 {
+#if defined(CONFIG_SYNO_COMCERTO)
+	unsigned long paddr = __pa(kaddr);
+#else
 	unsigned long paddr;
+#endif
 
 	BUG_ON(!virt_addr_valid(kaddr) || !virt_addr_valid(kaddr + size - 1));
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_ZONE_DMA_NCNB)
+	if ((paddr + size) <= arm_dma_zone_size)
+		return;
+#endif
+
 	dmac_map_area(kaddr, size, dir);
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 	paddr = __pa(kaddr);
+#endif
+
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_L2X0_INSTRUCTION_ONLY)
 	if (dir == DMA_FROM_DEVICE) {
 		outer_inv_range(paddr, paddr + size);
 	} else {
 		outer_clean_range(paddr, paddr + size);
 	}
 	/* FIXME: non-speculating: flush on bidirectional mappings? */
+#endif
 }
 EXPORT_SYMBOL(___dma_single_cpu_to_dev);
 
 void ___dma_single_dev_to_cpu(const void *kaddr, size_t size,
 	enum dma_data_direction dir)
 {
+#if defined(CONFIG_SYNO_COMCERTO)
+	unsigned long paddr = __pa(kaddr);
+#endif
+
 	BUG_ON(!virt_addr_valid(kaddr) || !virt_addr_valid(kaddr + size - 1));
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_ZONE_DMA_NCNB)
+	if ((paddr + size) <= arm_dma_zone_size)
+		return;
+#endif
+
+#if !defined(CONFIG_SYNO_COMCERTO)
 	/* FIXME: non-speculating: not required */
+#endif
+
+#if defined(CONFIG_SYNO_COMCERTO)
+#if !defined(CONFIG_L2X0_INSTRUCTION_ONLY)
+	/* don't bother invalidating if DMA to device */
+	if (dir != DMA_TO_DEVICE) {
+#if !defined(CONFIG_CPU_SPECULATIVE_ACCESS_DISABLED)
+		outer_inv_range(paddr, paddr + size);
+#else
+		size_t size_inv = min_t(size_t, 32, size);
+
+		outer_inv_range(paddr, paddr + size_inv);
+		outer_inv_range(paddr + size - size_inv, paddr + size);
+#endif
+	}
+#endif
+
+	__dmac_unmap_area(kaddr, size, dir);
+#else
 	/* don't bother invalidating if DMA to device */
 	if (dir != DMA_TO_DEVICE) {
 		unsigned long paddr = __pa(kaddr);
@@ -493,6 +564,8 @@
 	}
 
 	dmac_unmap_area(kaddr, size, dir);
+
+#endif
 }
 EXPORT_SYMBOL(___dma_single_dev_to_cpu);
 
@@ -545,16 +618,30 @@
 void ___dma_page_cpu_to_dev(struct page *page, unsigned long off,
 	size_t size, enum dma_data_direction dir)
 {
+#if defined(CONFIG_SYNO_COMCERTO)
+	unsigned long paddr = page_to_phys(page) + off;
+#else
 	unsigned long paddr;
+#endif
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_ZONE_DMA_NCNB)
+	if ((paddr + size) <= arm_dma_zone_size)
+		return;
+#endif
 
 	dma_cache_maint_page(page, off, size, dir, dmac_map_area);
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 	paddr = page_to_phys(page) + off;
+#endif
+
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_L2X0_INSTRUCTION_ONLY)
 	if (dir == DMA_FROM_DEVICE) {
 		outer_inv_range(paddr, paddr + size);
 	} else {
 		outer_clean_range(paddr, paddr + size);
 	}
+#endif
 	/* FIXME: non-speculating: flush on bidirectional mappings? */
 }
 EXPORT_SYMBOL(___dma_page_cpu_to_dev);
@@ -564,13 +651,39 @@
 {
 	unsigned long paddr = page_to_phys(page) + off;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_ZONE_DMA_NCNB)
+	if ((paddr + size) <= arm_dma_zone_size)
+		return;
+#endif
+
+#if !defined(CONFIG_SYNO_COMCERTO)
 	/* FIXME: non-speculating: not required */
+#endif
+
+#if defined(CONFIG_SYNO_COMCERTO)
+#if !defined(CONFIG_L2X0_INSTRUCTION_ONLY)
+	/* don't bother invalidating if DMA to device */
+	if (dir != DMA_TO_DEVICE) {
+#if !defined(CONFIG_CPU_SPECULATIVE_ACCESS_DISABLED)
+		outer_inv_range(paddr, paddr + size);
+#else
+		size_t size_inv = min_t(size_t, 32, size);
+
+		outer_inv_range(paddr, paddr + size_inv);
+		outer_inv_range(paddr + size - size_inv, paddr + size);
+#endif
+	}
+#endif
+	dma_cache_maint_page(page, off, size, dir, __dmac_unmap_area);
+#else
 	/* don't bother invalidating if DMA to device */
 	if (dir != DMA_TO_DEVICE)
 		outer_inv_range(paddr, paddr + size);
 
 	dma_cache_maint_page(page, off, size, dir, dmac_unmap_area);
 
+#endif
+
 	/*
 	 * Mark the D-cache clean for this page to avoid extra flushing.
 	 */
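
/*
 * Illustrative sketch only -- it restates the early-return check the patch
 * adds at the top of the ___dma_* helpers when CONFIG_COMCERTO_ZONE_DMA_NCNB
 * is set: a buffer that lies entirely below arm_dma_zone_size sits in the
 * non-cacheable DMA zone, so CPU/L2 cache maintenance can be skipped for it.
 * Names here are local to the example and the zone size is a made-up value.
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

static unsigned long arm_dma_zone_size = 32UL << 20;	/* assume a 32 MiB NCNB zone */

static bool buffer_in_ncnb_zone(unsigned long paddr, size_t size)
{
	/* same condition as the patch: the buffer must not cross the zone end */
	return (paddr + size) <= arm_dma_zone_size;
}

int main(void)
{
	printf("%d\n", buffer_in_ncnb_zone(0x100000, 4096));	/* 1: skip maintenance */
	printf("%d\n", buffer_in_ncnb_zone(32UL << 20, 64));	/* 0: outside the zone */
	return 0;
}
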
diff -ur a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c
--- a/arch/arm/mm/flush.c	2013-08-24 11:36:24.000000000 +0200
+++ b/arch/arm/mm/flush.c	2014-02-17 11:56:28.000000000 +0100
@@ -232,6 +232,26 @@
 	flush_dcache_mmap_unlock(mapping);
 }
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_L2X0_INSTRUCTION_ONLY)
+
+void __sync_outer_cache(pte_t *ptep, pte_t pteval)
+{
+	if (pte_present(*ptep) && pte_exec(*ptep) && (!pte_present(pteval) || !pte_exec(pteval))) {
+		unsigned long phys = __pfn_to_phys(pte_pfn(*ptep));
+
+//		printk(KERN_INFO "outer flush range: %x %x %lx-%lx\n", pte_val(*ptep), pteval, phys, phys + PAGE_SIZE);
+		outer_flush_range(phys, phys + PAGE_SIZE);
+	}
+}
+
+static void sync_outer_cache(struct page *page)
+{
+	unsigned long phys = page_to_phys(page);
+
+	outer_flush_range(phys, phys + PAGE_SIZE);
+}
+#endif
+
 #if __LINUX_ARM_ARCH__ >= 6
 void __sync_icache_dcache(pte_t pteval)
 {
@@ -301,6 +321,10 @@
 			__flush_dcache_aliases(mapping, page);
 		else if (mapping)
 			__flush_icache_all();
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_L2X0_INSTRUCTION_ONLY)
+		sync_outer_cache(page);
+#endif
 		set_bit(PG_dcache_clean, &page->flags);
 	}
 }
diff -ur a/arch/arm/mm/init.c b/arch/arm/mm/init.c
--- a/arch/arm/mm/init.c	2013-08-24 11:36:24.000000000 +0200
+++ b/arch/arm/mm/init.c	2014-02-17 11:56:28.000000000 +0100
@@ -298,7 +298,11 @@
 	if (arm_dma_zone_size) {
 		arm_adjust_dma_zone(zone_size, zhole_size,
 			arm_dma_zone_size >> PAGE_SHIFT);
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_ZONE_DMA_NCNB)
+		arm_dma_limit = 0xffffffff;
+#else
 		arm_dma_limit = PHYS_OFFSET + arm_dma_zone_size - 1;
+#endif
 	} else
 		arm_dma_limit = 0xffffffff;
 #endif
@@ -437,9 +441,15 @@
 
 	for (; pfn < end; pfn++) {
 		struct page *page = pfn_to_page(pfn);
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_L2X0_INSTRUCTION_ONLY)
+		unsigned long phys = page_to_phys(page);
+#endif
 		ClearPageReserved(page);
 		init_page_count(page);
 		__free_page(page);
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_L2X0_INSTRUCTION_ONLY)
+		outer_flush_range(phys, phys + PAGE_SIZE);
+#endif
 		pages++;
 	}
 
diff -ur a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
--- a/arch/arm/mm/Kconfig	2013-08-03 09:59:49.000000000 +0200
+++ b/arch/arm/mm/Kconfig	2014-01-21 09:36:46.000000000 +0100
@@ -790,6 +790,13 @@
 	  Say Y here to use the predictable round-robin cache replacement
 	  policy.  Unless you specifically require this or are unsure, say N.
 
+config CPU_SPECULATIVE_ACCESS_DISABLED
+	bool "Disable speculative accesses"
+	depends on CPU_V7 && ARCH_COMCERTO && SYNO_COMCERTO
+	help
+	  Disable speculative accesses for the processor. This reduces overhead of cache maintenance for
+	  DMA mapped memory zones but may reduce overall system memory performance. If unsure, say N.
+
 config CPU_BPREDICT_DISABLE
 	bool "Disable branch prediction"
 	depends on CPU_ARM1020 || CPU_V6 || CPU_V6K || CPU_MOHAWK || CPU_XSC3 || CPU_V7 || CPU_FA526
@@ -1211,7 +1218,7 @@
 		   REALVIEW_EB_A9MP || ARCH_IMX_V6_V7 || MACH_REALVIEW_PBX || \
 		   ARCH_NOMADIK || ARCH_OMAP4 || ARCH_EXYNOS4 || ARCH_TEGRA || \
 		   ARCH_U8500 || ARCH_VEXPRESS_CA9X4 || ARCH_SHMOBILE || \
-		   ARCH_PRIMA2 || ARCH_ZYNQ || ARCH_CNS3XXX || ARCH_HIGHBANK
+		   ARCH_PRIMA2 || ARCH_ZYNQ || ARCH_CNS3XXX || ARCH_HIGHBANK || ARCH_COMCERTO
 	default y
 	select OUTER_CACHE
 	select OUTER_CACHE_SYNC
@@ -1226,6 +1233,51 @@
 	  This option enables optimisations for the PL310 cache
 	  controller.
 
+config L2X0_INSTRUCTION_ONLY
+	bool "Use L2 cache for instructions only"
+	depends on CACHE_PL310 && SYNO_COMCERTO
+	default n
+
+config PL310_EXCLUSIVE_CACHE
+	bool "Exclusive cache"
+	depends on CACHE_PL310 && SYNO_COMCERTO
+	default n
+
+config PL310_EARLY_WRITE_RESPONSE
+	bool "Early write response"
+	depends on CACHE_PL310 && SYNO_COMCERTO
+	default n
+
+config PL310_FULL_LINE_OF_ZERO
+	bool "Full line of zero"
+	depends on CACHE_PL310 && SYNO_COMCERTO
+	default n
+
+config PL310_STORE_BUFFER_DEVICE_LIMITATION
+	bool "Store buffer device limitation"
+	depends on CACHE_PL310 && SYNO_COMCERTO
+	default n
+
+config PL310_INSTRUCTION_PREFETCH
+	bool "Instruction prefetch"
+	depends on CACHE_PL310 && SYNO_COMCERTO
+	default n
+
+config PL310_DATA_PREFETCH
+	bool "Data prefetch"
+	depends on CACHE_PL310 && SYNO_COMCERTO
+	default n
+
+config PL310_DOUBLE_LINE_FILL
+	bool "Double line fill"
+	depends on CACHE_PL310 && SYNO_COMCERTO
+	default n
+
+config PL310_INCR_DOUBLE_LINE_FILL
+	bool "Incremental double line fill"
+	depends on PL310_DOUBLE_LINE_FILL && SYNO_COMCERTO
+	default n
+
 config CACHE_TAUROS2
 	bool "Enable the Tauros2 L2 cache controller"
 	depends on (ARCH_DOVE || ARCH_MMP || CPU_PJ4)
@@ -1256,7 +1308,7 @@
 config ARM_DMA_MEM_BUFFERABLE
 	bool "Use non-cacheable memory for DMA" if (CPU_V6 || CPU_V6K) && !CPU_V7
 	depends on !(MACH_REALVIEW_PB1176 || REALVIEW_EB_ARM11MP || \
-		     MACH_REALVIEW_PB11MP)
+		     MACH_REALVIEW_PB11MP || COMCERTO_DDR_ECC_SUPPORT)
 	default y if CPU_V6 || CPU_V6K || CPU_V7
 	help
 	  Historically, the kernel has used strongly ordered mappings to
diff -ur a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile
--- a/arch/arm/mm/Makefile	2013-08-03 09:59:49.000000000 +0200
+++ b/arch/arm/mm/Makefile	2014-01-21 09:36:46.000000000 +0100
@@ -1,10 +1,11 @@
 #
 # Makefile for the linux arm-specific parts of the memory manager.
 #
-
+ifeq ($(CONFIG_SYNO_ARMADA_ARCH), y)
 ifneq ($(MACHINE),)
 include $(srctree)/$(MACHINE)/config/mvRules.mk
 endif
+endif
 
 obj-y				:= dma-mapping.o extable.o fault.o init.o \
 				   iomap.o
diff -ur a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
--- a/arch/arm/mm/mmu.c	2013-08-24 11:36:24.000000000 +0200
+++ b/arch/arm/mm/mmu.c	2014-02-17 11:56:28.000000000 +0100
@@ -294,6 +294,20 @@
 				PMD_SECT_UNCACHED | PMD_SECT_XN,
 		.domain    = DOMAIN_KERNEL,
 	},
+#if defined(CONFIG_SYNO_COMCERTO)
+	[MT_MSP] = {
+		.prot_pte  = PROT_PTE_DEVICE | L_PTE_MT_DEV_CACHED | L_PTE_XN,
+		.prot_l1   = PMD_TYPE_TABLE,
+		.prot_sect = PROT_SECT_DEVICE | PMD_SECT_WB,
+		.domain    = DOMAIN_IO,
+	},
+	[MT_MSP_NCNB] = {
+		.prot_pte  = PROT_PTE_DEVICE | L_PTE_XN,
+		.prot_l1   = PMD_TYPE_TABLE,
+		.prot_sect = PROT_SECT_DEVICE | PMD_SECT_S,
+		.domain    = DOMAIN_IO,
+	},
+#endif
 };
 
 const struct mem_type *get_mem_type(unsigned int type)
@@ -612,8 +626,12 @@
 	if (((addr | end | phys) & ~SECTION_MASK) == 0) {
 		pmd_t *p = pmd;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+		pmd += (addr & (~PGDIR_MASK)) >> SECTION_SHIFT;
+#else
 		if (addr & SECTION_SIZE)
 			pmd++;
+#endif
 
 		do {
 			*pmd = __pmd(phys | type->prot_sect);
@@ -1116,7 +1134,10 @@
 static void __init map_lowmem(void)
 {
 	struct memblock_region *reg;
-
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_ZONE_DMA_NCNB)
+	extern unsigned long arm_dma_zone_size;
+	phys_addr_t length_ncnb = arm_dma_zone_size, length_ncnb_now = 0;
+#endif
 	/* Map all the lowmem memory banks. */
 	for_each_memblock(memory, reg) {
 		phys_addr_t start = reg->base;
@@ -1127,7 +1148,23 @@
 			end = lowmem_limit;
 		if (start >= end)
 			break;
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_ZONE_DMA_NCNB)
+		if (length_ncnb > 0)
+		{
+			length_ncnb_now = min(length_ncnb, end - start);
+			map.pfn = __phys_to_pfn(start);
+			map.virtual = __phys_to_virt(start);
+			map.length = length_ncnb_now;
+			map.type = MT_MSP_NCNB;
+			printk("Comcerto: zone_dma mapping size=%lx type=%lx\n", (unsigned long) map.length, (unsigned long) map.type);
+			create_mapping(&map);
+			start += length_ncnb_now;
+			length_ncnb -= length_ncnb_now;
+			if (start == end)
+				continue;
 
+		}
+#endif
 		map.pfn = __phys_to_pfn(start);
 		map.virtual = __phys_to_virt(start);
 		map.length = end - start;
diff -ur a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
--- a/arch/arm/mm/pgd.c	2013-08-24 11:36:24.000000000 +0200
+++ b/arch/arm/mm/pgd.c	2014-02-17 11:56:28.000000000 +0100
@@ -42,6 +42,8 @@
 
 #if defined(CONFIG_SYNO_ARMADA_ARCH)
 	new_pgd = __pgd_alloc();
+#elif defined(CONFIG_SYNO_COMCERTO)
+	new_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, get_order(16384));
 #else
 	new_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL, 2);
 #endif
@@ -112,6 +114,8 @@
 no_pud:
 #if defined(CONFIG_SYNO_ARMADA_ARCH)
 	__pgd_free(new_pgd);
+#elif defined(CONFIG_SYNO_COMCERTO)
+	free_pages((unsigned long)new_pgd, get_order(16384));
 #else
 	free_pages((unsigned long)new_pgd, 2);
 #endif
@@ -172,6 +176,8 @@
 	}
 #endif
 	__pgd_free(pgd_base);
+#elif defined(CONFIG_SYNO_COMCERTO)
+	free_pages((unsigned long) pgd_base, get_order(16384));
 #else
 	free_pages((unsigned long) pgd_base, 2);
 #endif
diff -ur a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S
--- a/arch/arm/mm/proc-v7.S	2013-08-24 11:36:24.000000000 +0200
+++ b/arch/arm/mm/proc-v7.S	2014-02-17 11:56:28.000000000 +0100
@@ -46,7 +46,9 @@
 	mrc	p15, 0, r0, c1, c0, 0		@ ctrl register
 	bic	r0, r0, #0x1000			@ ...i............
 	bic	r0, r0, #0x0006			@ .............ca.
+#if !defined(CONFIG_SYNO_COMCERTO)
 	mcr	p15, 0, r0, c1, c0, 0		@ disable caches
+#endif
 	mov	pc, lr
 ENDPROC(cpu_v7_proc_fin)
 
@@ -158,7 +160,11 @@
 	bic	r3, r1, #0x000003f0
 	bic	r3, r3, #PTE_TYPE_MASK
 	orr	r3, r3, r2
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
 	orr	r3, r3, #PTE_EXT_AP0 | 2
+#else
+	orr	r3, r3, #PTE_EXT_AP0 | 1
+#endif
 
 	tst	r1, #1 << 4
 	orrne	r3, r3, #PTE_EXT_TEX(1)
@@ -182,10 +188,26 @@
 	tstne	r1, #L_PTE_PRESENT
 	moveq	r3, #0
 
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(CONFIG_COMCERTO_64K_PAGES)
  ARM(	str	r3, [r0, #2048]! )
  THUMB(	add	r0, r0, #2048 )
  THUMB(	str	r3, [r0] )
 	mcr	p15, 0, r0, c7, c10, 1		@ flush_pte
+ #else
+	ldr r1, =32768		@ PTE_HWTABLE_OFF
+	add r1, r1, r0
+	add r2, r1, #64		@ 16*4 bytes
+ 1:
+	sub r2, r2, #4
+ 	str	r3, [r2]!
+	cmp r1, r2
+	blo 1b
+	mcr	p15, 0, r1, c7, c10, 1		@ flush_pte
+	add r1, r1, #32				@ 1 cache line, since HW PTE takes 64 bytes
+	mcr	p15, 0, r1, c7, c10, 1		@ flush_pte
+ #endif
+
+
 #endif
 	mov	pc, lr
 ENDPROC(cpu_v7_set_pte_ext)
@@ -286,9 +308,18 @@
  *	- cache type register is implemented
  */
 __v7_ca5mp_setup:
+#if defined(CONFIG_SYNO_COMCERTO)
+	mov	r10, #(1 << 0)			@ TLB ops broadcasting
+	b	1f
+#endif
+
 __v7_ca9mp_setup:
 	mov	r10, #(1 << 0)			@ TLB ops broadcasting
+#if defined(CONFIG_SYNO_COMCERTO) && !defined(CONFIG_CPU_SPECULATIVE_ACCESS_DISABLED)
+	orr	r10, #(3 << 1)			@ L1 data prefetch, L2 prefetch hints enable
+#endif
 	b	1f
+
 __v7_ca15mp_setup:
 	mov	r10, #0
 1:
diff -ur a/arch/arm/tools/mach-types b/arch/arm/tools/mach-types
--- a/arch/arm/tools/mach-types	2013-08-03 09:59:49.000000000 +0200
+++ b/arch/arm/tools/mach-types	2014-01-21 09:36:49.000000000 +0100
@@ -214,6 +214,7 @@
 onearm			MACH_ONEARM		ONEARM			1075
 smdk2443		MACH_SMDK2443		SMDK2443		1084
 fsg			MACH_FSG		FSG			1091
+comcerto		MACH_COMCERTO		COMCERTO		1094
 at91sam9260ek		MACH_AT91SAM9260EK	AT91SAM9260EK		1099
 glantank		MACH_GLANTANK		GLANTANK		1100
 n2100			MACH_N2100		N2100			1101
diff -ur a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
--- a/arch/powerpc/include/asm/systbl.h	2013-08-24 11:36:12.000000000 +0200
+++ b/arch/powerpc/include/asm/systbl.h	2014-02-17 11:56:08.000000000 +0100
@@ -404,11 +404,7 @@
 SYSCALL(ni_syscall)
 SYSCALL(ni_syscall)
 SYSCALL(ni_syscall)
-#ifdef MY_ABC_HERE
-SYSCALL(SYNOmmap)			/* 400 */
-#else
 SYSCALL(ni_syscall)
-#endif
 SYSCALL(ni_syscall)
 #ifdef MY_ABC_HERE
 SYSCALL(SYNOUtime)             /* 402 */
diff -ur a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
--- a/arch/powerpc/include/asm/unistd.h	2013-08-24 11:36:12.000000000 +0200
+++ b/arch/powerpc/include/asm/unistd.h	2014-02-17 11:56:09.000000000 +0100
@@ -381,11 +381,6 @@
 #define __NR_process_vm_writev	352
 
 #ifdef MY_ABC_HERE
-#define __NR_SYNOmmap				400
-#define SYNOmmap(x)					syscall(__NR_SYNOmmap, x)
-#endif
-
-#ifdef MY_ABC_HERE
 #define __NR_SYNOMTDAlloc			405
 #define SYNOMTDAlloc(x)				syscall(__NR_SYNOMTDAlloc, x)
 #endif
diff -ur a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c
--- a/arch/powerpc/kernel/syscalls.c	2013-08-24 11:36:14.000000000 +0200
+++ b/arch/powerpc/kernel/syscalls.c	2014-02-17 11:56:12.000000000 +0100
@@ -76,21 +76,6 @@
 	return do_mmap2(addr, len, prot, flags, fd, offset, PAGE_SHIFT);
 }
 
-#ifdef MY_ABC_HERE
-unsigned long sys_SYNOmmap(SYNO_MMAP_ARG __user *arg)
-{
-	int error = -EFAULT;
-	SYNO_MMAP_ARG a;
-
-	if (copy_from_user(&a, arg, sizeof(a)))
-		goto out;;
-
-	error = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.pgoff, PAGE_SHIFT-12);
-out:
-	return error;
-}
-#endif
-
 #ifdef CONFIG_PPC32
 /*
  * Due to some executables calling the wrong select we sometimes
diff -ur a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
--- a/arch/powerpc/mm/hugetlbpage.c	2013-08-24 11:36:14.000000000 +0200
+++ b/arch/powerpc/mm/hugetlbpage.c	2014-02-17 11:56:11.000000000 +0100
@@ -297,7 +297,8 @@
 	int i;
 
 	strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE);
-	parse_args("hugetlb gpages", cmdline, NULL, 0, &do_gpage_early_setup);
+	parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0,
+			&do_gpage_early_setup);
 
 	/*
 	 * Walk gpage list in reverse, allocating larger page sizes first.
diff -ur a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
--- a/arch/s390/kvm/Kconfig	2013-08-03 09:59:50.000000000 +0200
+++ b/arch/s390/kvm/Kconfig	2014-01-21 09:36:53.000000000 +0100
@@ -21,6 +21,7 @@
 	depends on HAVE_KVM && EXPERIMENTAL
 	select PREEMPT_NOTIFIERS
 	select ANON_INODES
+	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	---help---
 	  Support hosting paravirtualized guest machines using the SIE
 	  virtualization capability on the mainframe. This should work
diff -ur a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
--- a/arch/x86/ia32/ia32entry.S	2013-08-24 11:36:39.000000000 +0200
+++ b/arch/x86/ia32/ia32entry.S	2014-02-17 11:56:48.000000000 +0100
@@ -904,11 +904,7 @@
 	.quad sys_ni_syscall
 	.quad sys_ni_syscall
 	.quad sys_ni_syscall
-#ifdef MY_ABC_HERE
-	.quad compat_sys_SYNOmmap  /* 400 */
-#else
 	.quad sys_ni_syscall
-#endif
 	.quad sys_ni_syscall
 #ifdef MY_ABC_HERE
 	.quad compat_sys_SYNOUtime		/* 402 */
diff -ur a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
--- a/arch/x86/ia32/sys_ia32.c	2013-08-24 11:36:39.000000000 +0200
+++ b/arch/x86/ia32/sys_ia32.c	2014-02-17 11:56:48.000000000 +0100
@@ -104,7 +104,9 @@
 
 #include <linux/namei.h>
 
-extern int __SYNOCaselessStat(char __user * filename, int isLink, struct kstat *stat, int *lastComponent);
+extern int __SYNOCaselessStat(char __user * filename, int nofollowLink, struct kstat *stat, int *lastComponent, int flags);
+extern int syno_vfs_stat(const char __user *name, struct kstat *stat, int flags, int stat_flags);
+extern int syno_vfs_fstat(unsigned int fd, struct kstat *stat, int stat_flags);
 
 asmlinkage long sys32_SYNOCaselessStat(char __user * filename, struct stat64 __user *statbuf)
 {
@@ -112,7 +114,7 @@
 	long error = -1;
 	struct kstat stat;
 
-	error =  __SYNOCaselessStat(filename, 0, &stat, &lastComponent);
+	error =  __SYNOCaselessStat(filename, 0, &stat, &lastComponent, 0);
 	if (!error) {
 		error = cp_stat64(statbuf, &stat);
 	}
@@ -126,7 +128,7 @@
 	long error = -1;
 	struct kstat stat;
 
-	error =  __SYNOCaselessStat(filename, 1, &stat, &lastComponent);
+	error =  __SYNOCaselessStat(filename, 1, &stat, &lastComponent, 0);
 	if (!error) {
 		error = cp_stat64(statbuf, &stat);
 	}
@@ -195,15 +197,15 @@
 	return error;
 }
 
-static long do_SYNOStat32(char __user * filename, int isLink, unsigned int f, struct SYNOSTAT64 __user * pSt)
+static long do_SYNOStat32(char __user * filename, int nofollowLink, unsigned int flags, struct SYNOSTAT64 __user * pSt)
 {
 	long error = -EINVAL;
 	int lastComponent = 0;
 	struct kstat kst;
 
-	if (f & SYNOST_IS_CASELESS) {
+	if (flags & SYNOST_IS_CASELESS) {
 #ifdef MY_ABC_HERE
-		error = __SYNOCaselessStat(filename, isLink, &kst, &lastComponent);
+		error = __SYNOCaselessStat(filename, nofollowLink, &kst, &lastComponent, flags);
 		if (-ENOENT == error) {
 			if (__put_user(lastComponent, &pSt->ext.lastComponent)){
 				goto Out;
@@ -213,10 +215,10 @@
 		error = -EOPNOTSUPP;
 #endif
 	} else {
-		if (isLink) {
-			error = vfs_lstat(filename, &kst);
+		if (nofollowLink) {
+			error = syno_vfs_stat(filename, &kst, 0, flags);
 		} else {
-			error = vfs_stat(filename, &kst);
+			error = syno_vfs_stat(filename, &kst, LOOKUP_FOLLOW, flags);
 #ifdef MY_ABC_HERE
 			if(syno_hibernation_log_sec > 0) {
 				syno_do_hibernation_log(filename);
@@ -229,7 +231,7 @@
 		goto Out;
 	}
 
-	error = SYNOStatCopyToUser(&kst, f, pSt);
+	error = SYNOStatCopyToUser(&kst, flags, pSt);
 Out:
 	return error;
 }
@@ -244,7 +246,7 @@
 	int error;
 	struct kstat kst;
 
-	error = vfs_fstat(fd, &kst);
+	error = syno_vfs_fstat(fd, &kst, flags);
 	if (!error) {
 		error = SYNOStatCopyToUser(&kst, flags, pSt);
 	}
diff -ur a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h
--- a/arch/x86/include/asm/serial.h	2013-08-24 11:36:38.000000000 +0200
+++ b/arch/x86/include/asm/serial.h	2014-02-17 11:56:47.000000000 +0100
@@ -46,7 +46,7 @@
     { 0, BASE_BAUD, 0x3E8, 4, STD_COM_FLAGS },  /* ttyS2 */
 #endif
 #else
-#if !defined(CONFIG_SYNO_CEDARVIEW)
+#if !defined(CONFIG_SYNO_CEDARVIEW) && !defined(CONFIG_SYNO_AVOTON)
 #define SERIAL_PORT_DFNS            \
 	/* UART CLK   PORT IRQ     FLAGS        */          \
 	{ 0, BASE_BAUD, 0x2F8, 3, STD_COM_FLAGS },  /* ttyS0 */ \
diff -ur a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
--- a/arch/x86/include/asm/unistd_32.h	2013-08-24 11:36:38.000000000 +0200
+++ b/arch/x86/include/asm/unistd_32.h	2014-02-17 11:56:47.000000000 +0100
@@ -360,11 +360,6 @@
 #define __NR_process_vm_writev	348
 
 #ifdef MY_ABC_HERE
-#define __NR_SYNOmmap		400
-#define SYNOmmap(x)		syscall(__NR_SYNOmmap, x)
-#endif
-
-#ifdef MY_ABC_HERE
 #define __NR_SYNOUtime                          402
 #define SYNOUtime(arg1, arg2)                   syscall(__NR_SYNOUtime, arg1, arg2)
 #endif
@@ -376,12 +371,12 @@
 
 #ifdef MY_ABC_HERE
 #define __NR_recvfile                           404
-#define recvfile(arg1,arg2,arg3,arg4,arg5)      syscall(__NR_recvfile,arg1,arg2,arg3,arg4,arg5)
+#define recvfile(arg1,arg2,arg3,arg4,arg5)      syscall(__NR_recvfile, arg1, arg2, arg3, arg4, arg5)
 #endif
 
 #ifdef MY_ABC_HERE
-#define __NR_SYNOMTDAlloc			405
-#define SYNOMTDAlloc(x)				syscall(__NR_SYNOMTDAlloc, x)
+#define __NR_SYNOMTDAlloc                       405
+#define SYNOMTDAlloc(arg1)                      syscall(__NR_SYNOMTDAlloc, arg1)
 #endif
 
 #ifdef MY_ABC_HERE
@@ -389,81 +384,81 @@
 #define __NR_SYNOCaselessLStat64                407
 #define __NR_SYNOCaselessStat                   408
 #define __NR_SYNOCaselessLStat                  409
+
 #if !defined(__KERNEL__)
 /* direct SYNOCaselessStat to stat64 in 32-bit platform
  * 64-bits arch has no stat64 support */
 #include <bits/wordsize.h>
 #if __WORDSIZE == 64
-#define SYNOCaselessStat(arg1,arg2)		syscall(__NR_SYNOCaselessStat ,arg1,arg2)
-#define SYNOCaselessLStat(arg1,arg2)	syscall(__NR_SYNOCaselessLStat ,arg1,arg2)
+#define SYNOCaselessStat(arg1, arg2)            syscall(__NR_SYNOCaselessStat, arg1, arg2)
+#define SYNOCaselessLStat(arg1, arg2)           syscall(__NR_SYNOCaselessLStat, arg1, arg2)
 #elif (_FILE_OFFSET_BITS == 64)
-#define SYNOCaselessStat(arg1,arg2)		syscall(__NR_SYNOCaselessStat64 ,arg1,arg2)
-#define SYNOCaselessLStat(arg1,arg2)	syscall(__NR_SYNOCaselessLStat64 ,arg1,arg2)
+#define SYNOCaselessStat(arg1, arg2)            syscall(__NR_SYNOCaselessStat64, arg1, arg2)
+#define SYNOCaselessLStat(arg1, arg2)           syscall(__NR_SYNOCaselessLStat64, arg1, arg2)
 #endif
 /* define stat64 interface for compatibility
    These should be removed after AP modification */
-#define SYNOCaselessStat64(arg1,arg2)	syscall(__NR_SYNOCaselessStat64 ,arg1,arg2)
-#define SYNOCaselessLStat64(arg1,arg2)	syscall(__NR_SYNOCaselessLStat64 ,arg1,arg2)
+#define SYNOCaselessStat64(arg1, arg2)          syscall(__NR_SYNOCaselessStat64, arg1, arg2)
+#define SYNOCaselessLStat64(arg1, arg2)         syscall(__NR_SYNOCaselessLStat64, arg1, arg2)
 #endif
 #endif /* MY_ABC_HERE */
 
 #ifdef MY_ABC_HERE
-#define __NR_SYNOEcryptName                 410
-#define __NR_SYNODecryptName                411
-#define SYNOEcryptName(arg1, arg2)          syscall(__NR_SYNOEcryptName, arg1, arg2)
-#define SYNODecryptName(arg1, arg2, arg3)         syscall(__NR_SYNODecryptName, arg1, arg2, arg3)
+#define __NR_SYNOEcryptName                     410
+#define __NR_SYNODecryptName                    411
+#define SYNOEcryptName(arg1, arg2)              syscall(__NR_SYNOEcryptName, arg1, arg2)
+#define SYNODecryptName(arg1, arg2, arg3)       syscall(__NR_SYNODecryptName, arg1, arg2, arg3)
 #endif
 
 #ifdef MY_ABC_HERE
-#define __NR_SYNOACLCheckPerm               412
-#define SYNOACLSysCheckPerm(arg1, arg2)            syscall(__NR_SYNOACLCheckPerm, arg1, arg2)
-#define __NR_SYNOACLIsSupport               413
-#define SYNOACLSysIsSupport(arg1, arg2, arg3)            syscall(__NR_SYNOACLIsSupport, arg1, arg2, arg3)
-#define __NR_SYNOACLGetPerm               414
-#define SYNOACLSysGetPerm(arg1, arg2)            syscall(__NR_SYNOACLGetPerm, arg1, arg2)
+#define __NR_SYNOACLCheckPerm                   412
+#define SYNOACLSysCheckPerm(arg1, arg2)         syscall(__NR_SYNOACLCheckPerm, arg1, arg2)
+#define __NR_SYNOACLIsSupport                   413
+#define SYNOACLSysIsSupport(arg1, arg2, arg3)   syscall(__NR_SYNOACLIsSupport, arg1, arg2, arg3)
+#define __NR_SYNOACLGetPerm                     414
+#define SYNOACLSysGetPerm(arg1, arg2)           syscall(__NR_SYNOACLGetPerm, arg1, arg2)
 #endif
 
-
 #ifdef MY_ABC_HERE
-#define __NR_SYNOStat              416
-#define __NR_SYNOFStat              417
-#define __NR_SYNOLStat              418
-#define __NR_SYNOStat64              419
-#define __NR_SYNOFStat64             420
-#define __NR_SYNOLStat64              421
+#define __NR_SYNOStat                           416
+#define __NR_SYNOFStat                          417
+#define __NR_SYNOLStat                          418
+#define __NR_SYNOStat64                         419
+#define __NR_SYNOFStat64                        420
+#define __NR_SYNOLStat64                        421
 
 #if !defined(__KERNEL__)
 /* direct SYNOStat to stat64 in 32-bit platform
  * 64-bits arch has no stat64 support */
 #include <bits/wordsize.h>
 #if __WORDSIZE == 64
-#define SYNOStat(arg1, arg2, arg3)  syscall(__NR_SYNOStat, arg1, arg2, arg3)
-#define SYNOFStat(arg1, arg2, arg3) syscall(__NR_SYNOFStat, arg1, arg2, arg3)
-#define SYNOLStat(arg1, arg2, arg3) syscall(__NR_SYNOLStat, arg1, arg2, arg3)
+#define SYNOStat(arg1, arg2, arg3)              syscall(__NR_SYNOStat, arg1, arg2, arg3)
+#define SYNOFStat(arg1, arg2, arg3)             syscall(__NR_SYNOFStat, arg1, arg2, arg3)
+#define SYNOLStat(arg1, arg2, arg3)             syscall(__NR_SYNOLStat, arg1, arg2, arg3)
 #elif (_FILE_OFFSET_BITS == 64)
-#define SYNOStat(arg1, arg2, arg3)  syscall(__NR_SYNOStat64, arg1, arg2, arg3)
-#define SYNOFStat(arg1, arg2, arg3) syscall(__NR_SYNOFStat64, arg1, arg2, arg3)
-#define SYNOLStat(arg1, arg2, arg3) syscall(__NR_SYNOLStat64, arg1, arg2, arg3)
+#define SYNOStat(arg1, arg2, arg3)              syscall(__NR_SYNOStat64, arg1, arg2, arg3)
+#define SYNOFStat(arg1, arg2, arg3)             syscall(__NR_SYNOFStat64, arg1, arg2, arg3)
+#define SYNOLStat(arg1, arg2, arg3)             syscall(__NR_SYNOLStat64, arg1, arg2, arg3)
 #endif
 #endif /* __KERNEL__ */
 
 #endif /* MY_ABC_HERE */
 #ifdef CONFIG_SYNO_NOTIFY
-#define __NR_SYNONotifyInit		422
-#define SYNONotifyInit(arg1)	syscall(__NR_SYNONotifyInit, arg1)
-#define __NR_SYNONotifyAddWatch		423
-#define SYNONotifyAddWatch(arg1, arg2, arg3)	syscall(__NR_SYNONotifyAddWatch, arg1, arg2, arg3)
-#define __NR_SYNONotifyRemoveWatch		424
-#define SYNONotifyRemoveWatch(arg1, arg2, arg3)	syscall(__NR_SYNONotifyRemoveWatch, arg1, arg2, arg3)
-#define __NR_SYNONotifyAddWatch32	425
-#define SYNONotifyAddWatch32(arg1, arg2, arg3)	syscall(__NR_SYNONotifyAddWatch32, arg1, arg2, arg3)
-#define __NR_SYNONotifyRemoveWatch32	426
-#define SYNONotifyRemoveWatch32(arg1, arg2, arg3)	syscall(__NR_SYNONotifyRemoveWatch32, arg1, arg2, arg3)
+#define __NR_SYNONotifyInit                     422
+#define SYNONotifyInit(arg1)                    syscall(__NR_SYNONotifyInit, arg1)
+#define __NR_SYNONotifyAddWatch                 423
+#define SYNONotifyAddWatch(arg1, arg2, arg3)    syscall(__NR_SYNONotifyAddWatch, arg1, arg2, arg3)
+#define __NR_SYNONotifyRemoveWatch              424
+#define SYNONotifyRemoveWatch(arg1, arg2, arg3) syscall(__NR_SYNONotifyRemoveWatch, arg1, arg2, arg3)
+#define __NR_SYNONotifyAddWatch32               425
+#define SYNONotifyAddWatch32(arg1, arg2, arg3)  syscall(__NR_SYNONotifyAddWatch32, arg1, arg2, arg3)
+#define __NR_SYNONotifyRemoveWatch32            426
+#define SYNONotifyRemoveWatch32(arg1,arg2,arg3) syscall(__NR_SYNONotifyRemoveWatch32, arg1, arg2, arg3)
 #endif /* CONFIG_SYNO_NOTIFY */
 
 #ifdef MY_ABC_HERE
-#define __NR_SYNOArchiveOverwrite	427
-#define SYNOArchiveOverwrite(arg1, arg2)	syscall(__NR_SYNOArchiveOverwrite, arg1, arg2)
+#define __NR_SYNOArchiveOverwrite               427
+#define SYNOArchiveOverwrite(arg1, arg2)        syscall(__NR_SYNOArchiveOverwrite, arg1, arg2)
 #endif
 
 #ifdef MY_ABC_HERE
diff -ur a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
--- a/arch/x86/include/asm/unistd_64.h	2013-08-24 11:36:38.000000000 +0200
+++ b/arch/x86/include/asm/unistd_64.h	2014-02-17 11:56:47.000000000 +0100
@@ -692,12 +692,6 @@
 __SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
 
 #ifdef MY_ABC_HERE
-#define __NR_SYNOmmap                           400
-#define SYNOmmap(x)                             syscall(__NR_SYNOmmap, x)
-__SYSCALL(__NR_SYNOmmap, sys_SYNOmmap)
-#endif
-
-#ifdef MY_ABC_HERE
 #define __NR_SYNOUtime                          402
 #define SYNOUtime(arg1, arg2)                   syscall(__NR_SYNOUtime, arg1, arg2)
 __SYSCALL(__NR_SYNOUtime, sys_SYNOUtime)
@@ -711,13 +705,13 @@
 
 #ifdef MY_ABC_HERE
 #define __NR_recvfile                           404
-#define recvfile(arg1,arg2,arg3,arg4,arg5)      syscall(__NR_recvfile,arg1,arg2,arg3,arg4,arg5)
+#define recvfile(arg1,arg2,arg3,arg4,arg5)      syscall(__NR_recvfile, arg1, arg2, arg3, arg4, arg5)
 __SYSCALL(__NR_recvfile, sys_recvfile)
 #endif
 
 #ifdef MY_ABC_HERE
-#define __NR_SYNOMTDAlloc			405
-#define SYNOMTDAlloc(x)				syscall(__NR_SYNOMTDAlloc, x)
+#define __NR_SYNOMTDAlloc                       405
+#define SYNOMTDAlloc(arg1)                      syscall(__NR_SYNOMTDAlloc, arg1)
 #endif
 
 #ifdef MY_ABC_HERE
@@ -725,102 +719,103 @@
 #define __NR_SYNOCaselessLStat64                407
 #define __NR_SYNOCaselessStat                   408
 #define __NR_SYNOCaselessLStat                  409
+
 #if !defined(__KERNEL__)
 /* direct SYNOCaselessStat to stat64 in 32-bit platform
  * 64-bits arch has no stat64 support */
 #include <bits/wordsize.h>
 #if __WORDSIZE == 64
-#define SYNOCaselessStat(arg1,arg2)		syscall(__NR_SYNOCaselessStat , arg1,arg2)
-#define SYNOCaselessLStat(arg1,arg2)	syscall(__NR_SYNOCaselessLStat , arg1,arg2)
+#define SYNOCaselessStat(arg1, arg2)            syscall(__NR_SYNOCaselessStat, arg1, arg2)
+#define SYNOCaselessLStat(arg1, arg2)           syscall(__NR_SYNOCaselessLStat, arg1, arg2)
 __SYSCALL(__NR_SYNOCaselessStat, sys_SYNOCaselessStat)
 __SYSCALL(__NR_SYNOCaselessLStat, sys_SYNOCaselessLStat)
 #elif (_FILE_OFFSET_BITS == 64)
-#define SYNOCaselessStat(arg1,arg2)		syscall(__NR_SYNOCaselessStat64 , arg1,arg2)
-#define SYNOCaselessLStat(arg1,arg2)	syscall(__NR_SYNOCaselessLStat64 , arg1,arg2)
+#define SYNOCaselessStat(arg1, arg2)            syscall(__NR_SYNOCaselessStat64, arg1, arg2)
+#define SYNOCaselessLStat(arg1, arg2)           syscall(__NR_SYNOCaselessLStat64, arg1, arg2)
 __SYSCALL(__NR_SYNOCaselessStat64, sys_SYNOCaselessStat64)
 __SYSCALL(__NR_SYNOCaselessLStat64, sys_SYNOCaselessLStat64)
 #endif
 /* define stat64 interface for compatibility
    These should be removed after AP modification */
-#define SYNOCaselessStat64(arg1,arg2)	syscall(__NR_SYNOCaselessStat64 , arg1,arg2)
-#define SYNOCaselessLStat64(arg1,arg2)	syscall(__NR_SYNOCaselessLStat64 , arg1,arg2)
+#define SYNOCaselessStat64(arg1, arg2)          syscall(__NR_SYNOCaselessStat64, arg1, arg2)
+#define SYNOCaselessLStat64(arg1, arg2)         syscall(__NR_SYNOCaselessLStat64, arg1, arg2)
 __SYSCALL(__NR_SYNOCaselessStat64, sys_SYNOCaselessStat64)
 __SYSCALL(__NR_SYNOCaselessLStat64, sys_SYNOCaselessLStat64)
 #endif
 #endif /* MY_ABC_HERE */
 
 #ifdef MY_ABC_HERE
-#define __NR_SYNOEcryptName                 410
+#define __NR_SYNOEcryptName                     410
 #define SYNOEcryptName(arg1, arg2)              syscall(__NR_SYNOEcryptName, arg1, arg2)
 __SYSCALL(__NR_SYNOEcryptName, sys_SYNOEcryptName)
-#define __NR_SYNODecryptName                411
-#define SYNODecryptName(arg1, arg2, arg3)              syscall(__NR_SYNODecryptName, arg1, arg2, arg3)
+#define __NR_SYNODecryptName                    411
+#define SYNODecryptName(arg1, arg2, arg3)       syscall(__NR_SYNODecryptName, arg1, arg2, arg3)
 __SYSCALL(__NR_SYNODecryptName, sys_SYNODecryptName)
 #endif
 
 #ifdef MY_ABC_HERE
-#define __NR_SYNOACLCheckPerm               412
-#define SYNOACLSysCheckPerm(arg1, arg2)            syscall(__NR_SYNOACLCheckPerm, arg1, arg2)
+#define __NR_SYNOACLCheckPerm                   412
+#define SYNOACLSysCheckPerm(arg1, arg2)         syscall(__NR_SYNOACLCheckPerm, arg1, arg2)
 __SYSCALL(__NR_SYNOACLCheckPerm, sys_SYNOACLCheckPerm)
-#define __NR_SYNOACLIsSupport               413
-#define SYNOACLSysIsSupport(arg1, arg2, arg3)            syscall(__NR_SYNOACLIsSupport, arg1, arg2, arg3)
+#define __NR_SYNOACLIsSupport                   413
+#define SYNOACLSysIsSupport(arg1, arg2, arg3)   syscall(__NR_SYNOACLIsSupport, arg1, arg2, arg3)
 __SYSCALL(__NR_SYNOACLIsSupport, sys_SYNOACLIsSupport)
-#define __NR_SYNOACLGetPerm               414
-#define SYNOACLSysGetPerm(arg1, arg2)            syscall(__NR_SYNOACLGetPerm, arg1, arg2)
+#define __NR_SYNOACLGetPerm                     414
+#define SYNOACLSysGetPerm(arg1, arg2)           syscall(__NR_SYNOACLGetPerm, arg1, arg2)
 __SYSCALL(__NR_SYNOACLGetPerm, sys_SYNOACLGetPerm)
 #endif
 
-
 #ifdef MY_ABC_HERE
-#define __NR_SYNOStat              416
-#define __NR_SYNOFStat              417
-#define __NR_SYNOLStat              418
-#define __NR_SYNOStat64                 419
-#define __NR_SYNOFStat64                420
-#define __NR_SYNOLStat64                421
+#define __NR_SYNOStat                           416
+#define __NR_SYNOFStat                          417
+#define __NR_SYNOLStat                          418
+#define __NR_SYNOStat64                         419
+#define __NR_SYNOFStat64                        420
+#define __NR_SYNOLStat64                        421
 
 #if !defined(__KERNEL__)
 /* direct SYNOStat to stat64 in 32-bit platform
  * 64-bits arch has no stat64 support */
 #include <bits/wordsize.h>
 #if __WORDSIZE == 64
-#define SYNOStat(arg1, arg2, arg3)  syscall(__NR_SYNOStat, arg1, arg2, arg3)
-#define SYNOFStat(arg1, arg2, arg3) syscall(__NR_SYNOFStat, arg1, arg2, arg3)
-#define SYNOLStat(arg1, arg2, arg3) syscall(__NR_SYNOLStat, arg1, arg2, arg3)
+#define SYNOStat(arg1, arg2, arg3)              syscall(__NR_SYNOStat, arg1, arg2, arg3)
+#define SYNOFStat(arg1, arg2, arg3)             syscall(__NR_SYNOFStat, arg1, arg2, arg3)
+#define SYNOLStat(arg1, arg2, arg3)             syscall(__NR_SYNOLStat, arg1, arg2, arg3)
 __SYSCALL(__NR_SYNOStat, sys_SYNOStat)
 __SYSCALL(__NR_SYNOFStat, sys_SYNOFStat)
 __SYSCALL(__NR_SYNOLStat, sys_SYNOLStat)
 #elif (_FILE_OFFSET_BITS == 64)
-#define SYNOStat(arg1, arg2, arg3)  syscall(__NR_SYNOStat64, arg1, arg2, arg3)
-#define SYNOFStat(arg1, arg2, arg3) syscall(__NR_SYNOFStat64, arg1, arg2, arg3)
-#define SYNOLStat(arg1, arg2, arg3) syscall(__NR_SYNOLStat64, arg1, arg2, arg3)
+#define SYNOStat(arg1, arg2, arg3)              syscall(__NR_SYNOStat64, arg1, arg2, arg3)
+#define SYNOFStat(arg1, arg2, arg3)             syscall(__NR_SYNOFStat64, arg1, arg2, arg3)
+#define SYNOLStat(arg1, arg2, arg3)             syscall(__NR_SYNOLStat64, arg1, arg2, arg3)
 __SYSCALL(__NR_SYNOStat64, sys_SYNOStat64)
 __SYSCALL(__NR_SYNOFStat64, sys_SYNOFStat64)
 __SYSCALL(__NR_SYNOLStat64, sys_SYNOLStat64)
 #endif
 #endif /* __KERNEL__ */
 #endif /* MY_ABC_HERE */
+
 #ifdef CONFIG_SYNO_NOTIFY
-#define __NR_SYNONotifyInit         422
-#define SYNONotifyInit(arg1)      syscall(__NR_SYNONotifyInit, arg1)
+#define __NR_SYNONotifyInit                     422
+#define SYNONotifyInit(arg1)                    syscall(__NR_SYNONotifyInit, arg1)
 __SYSCALL(__NR_SYNONotifyInit, sys_SYNONotifyInit)
-#define __NR_SYNONotifyAddWatch         423
-#define SYNONotifyAddWatch(arg1, arg2, arg3)      syscall(__NR_SYNONotifyAddWatch, arg1, arg2, arg3)
+#define __NR_SYNONotifyAddWatch                 423
+#define SYNONotifyAddWatch(arg1, arg2, arg3)    syscall(__NR_SYNONotifyAddWatch, arg1, arg2, arg3)
 __SYSCALL(__NR_SYNONotifyAddWatch, sys_SYNONotifyAddWatch)
-#define __NR_SYNONotifyRemoveWatch         424
-#define SYNONotifyRemoveWatch(arg1, arg2, arg3)      syscall(__NR_SYNONotifyRemoveWatch, arg1, arg2, arg3)
+#define __NR_SYNONotifyRemoveWatch              424
+#define SYNONotifyRemoveWatch(arg1, arg2, arg3) syscall(__NR_SYNONotifyRemoveWatch, arg1, arg2, arg3)
 __SYSCALL(__NR_SYNONotifyRemoveWatch, sys_SYNONotifyRemoveWatch)
-#define __NR_SYNONotifyAddWatch32         425
-#define SYNONotifyAddWatch32(arg1, arg2, arg3)      syscall(__NR_SYNONotifyAddWatch32, arg1, arg2, arg3)
+#define __NR_SYNONotifyAddWatch32               425
+#define SYNONotifyAddWatch32(arg1, arg2, arg3)  syscall(__NR_SYNONotifyAddWatch32, arg1, arg2, arg3)
 __SYSCALL(__NR_SYNONotifyAddWatch32, sys_SYNONotifyAddWatch32)
-#define __NR_SYNONotifyRemoveWatch32        426
-#define SYNONotifyRemoveWatch32(arg1, arg2, arg3)      syscall(__NR_SYNONotifyRemoveWatch32, arg1, arg2, arg3)
+#define __NR_SYNONotifyRemoveWatch32            426
+#define SYNONotifyRemoveWatch32(arg1,arg2,arg3) syscall(__NR_SYNONotifyRemoveWatch32, arg1, arg2, arg3)
 __SYSCALL(__NR_SYNONotifyRemoveWatch32, sys_SYNONotifyRemoveWatch32)
 #endif /* CONFIG_SYNO_NOTIFY */
 
 #ifdef MY_ABC_HERE
-#define __NR_SYNOArchiveOverwrite	427
-#define SYNOArchiveOverwrite(arg1, arg2)	syscall(__NR_SYNOArchiveOverwrite, arg1, arg2)
+#define __NR_SYNOArchiveOverwrite               427
+#define SYNOArchiveOverwrite(arg1, arg2)        syscall(__NR_SYNOArchiveOverwrite, arg1, arg2)
 __SYSCALL(__NR_SYNOArchiveOverwrite, sys_SYNOArchiveOverwrite)
 #endif
 
diff -ur a/arch/x86/Kconfig b/arch/x86/Kconfig
--- a/arch/x86/Kconfig	2013-08-03 09:59:52.000000000 +0200
+++ b/arch/x86/Kconfig	2014-01-21 09:36:54.000000000 +0100
@@ -97,8 +97,12 @@
 	bool "Intel-Cedarview"
 	depends on X86_64
 
-config SYNO_AMD_RICHLAND
-	bool "AMD-Richland"
+config SYNO_AVOTON
+	bool "Intel-Avoton"
+	depends on X86_64
+
+config SYNO_KVMX64
+	bool "KVM X64 Virtual machine"
 	depends on X86_64
 
 config OUTPUT_FORMAT
diff -ur a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
--- a/arch/x86/kernel/setup.c	2013-08-24 11:36:40.000000000 +0200
+++ b/arch/x86/kernel/setup.c	2014-02-17 11:56:49.000000000 +0100
@@ -186,6 +186,10 @@
 extern int gSynoFactoryUSB3Disable;
 #endif
 
+#ifdef CONFIG_SYNO_DUAL_HEAD
+extern int gSynoDualHead;
+#endif
+
 /*
  * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
  * The direct mapping extends to max_pfn_mapped, so that we can directly access
@@ -854,6 +858,18 @@
 __setup("syno_disable_usb3=", early_factory_usb3_disable);
 #endif
 
+#ifdef CONFIG_SYNO_DUAL_HEAD
+static int __init early_dual_head(char *p)
+{
+	gSynoDualHead = simple_strtol(p, NULL, 10);
+
+	printk("Synology Dual Head: %d\n", gSynoDualHead);
+
+	return 1;
+}
+__setup("dual_head=", early_dual_head);
+#endif
+
 /*
  * --------- Crashkernel reservation ------------------------------
  */
diff -ur a/arch/x86/kernel/synology-gpio.c b/arch/x86/kernel/synology-gpio.c
--- a/arch/x86/kernel/synology-gpio.c	2013-08-24 11:36:40.000000000 +0200
+++ b/arch/x86/kernel/synology-gpio.c	2014-02-17 11:56:49.000000000 +0100
@@ -44,16 +44,11 @@
 #define DISK_LED_ORANGE_SOLID	2
 #define DISK_LED_ORANGE_BLINK	3
 #define DISK_LED_GREEN_BLINK    4
-#define DISK_LED_BLUE			5
 
 #define SYNO_LED_OFF		0
 #define SYNO_LED_ON		1
 #define SYNO_LED_BLINKING	2
 
-#ifdef  MY_ABC_HERE
-extern char gszSynoHWVersion[];
-#endif
-
 #define SYNO_DS214p_GPP_SCHEDULE_ON		8
 #define SYNO_DS214p_GPP_HDD1_PWR_EN		9
 #define SYNO_DS214p_GPP_HDD2_PWR_EN		10
@@ -187,8 +182,7 @@
 	if ( DISK_LED_OFF == status ) {
 		fail_led = 1;
 		act_led = 1;
-	} else if ( DISK_LED_GREEN_SOLID == status ||
-				DISK_LED_BLUE == status) {
+	} else if ( DISK_LED_GREEN_SOLID == status ) {
 		fail_led = 1;
 		act_led = 0;
 	} else if ( DISK_LED_ORANGE_SOLID == status ||
@@ -351,11 +345,11 @@
 	u8 ret = 0;
 
 #ifdef  MY_ABC_HERE
-	if ( 0 == strncmp(gszSynoHWVersion, HW_DS214p, strlen(HW_DS214p)) ) {
+	if (syno_is_hw_version(HW_DS214play)) {
 		if (2 >= disk_id ) {
 			ret = 1;
 		}
-	} else if ( 0 == strncmp(gszSynoHWVersion, HW_DS114p, strlen(HW_DS114p)) ) {
+	} else if (syno_is_hw_version(HW_DS114p)) {
 		if (1 >= disk_id ) {
 			ret = 1;
 		}
@@ -647,10 +641,10 @@
 void synology_gpio_init(void)
 {
 #ifdef  MY_ABC_HERE
-	if ( 0 == strncmp(gszSynoHWVersion, HW_DS214play, strlen(HW_DS214play)) ) {
+	if (syno_is_hw_version(HW_DS214play)) {
 		EVANSPORT_214p_GPIO_init(&generic_gpio);
 		printk("Synology Evansport 2 bay GPIO Init\n");
-	} else if ( 0 == strncmp(gszSynoHWVersion, HW_DS114p, strlen(HW_DS114p)) ) {
+	} else if (syno_is_hw_version(HW_DS114p)) {
 		EVANSPORT_114p_GPIO_init(&generic_gpio);
 		printk("Synology Evansport 1 bay GPIO Init\n");
 	} else {
diff -ur a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
--- a/arch/x86/kernel/syscall_table_32.S	2013-08-24 11:36:40.000000000 +0200
+++ b/arch/x86/kernel/syscall_table_32.S	2014-02-17 11:56:49.000000000 +0100
@@ -402,11 +402,7 @@
     .long sys_ni_syscall
     .long sys_ni_syscall
     .long sys_ni_syscall
-#ifdef MY_ABC_HERE
-    .long sys_SYNOmmap			/* 400 */
-#else
     .long sys_ni_syscall
-#endif
     .long sys_ni_syscall
 #ifdef MY_ABC_HERE
     .long sys_SYNOUtime		/* 402 */
diff -ur a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
--- a/arch/x86/kernel/sys_i386_32.c	2013-08-24 11:36:40.000000000 +0200
+++ b/arch/x86/kernel/sys_i386_32.c	2014-02-17 11:56:49.000000000 +0100
@@ -39,17 +39,3 @@
 	return __res;
 }
 
-#ifdef MY_ABC_HERE
-asmlinkage int sys_SYNOmmap(SYNO_MMAP_ARG __user *arg)
-{
-	int error = -EFAULT;
-	SYNO_MMAP_ARG a;
-
-	if (copy_from_user(&a, arg, sizeof(a)))
-		goto out;;
-
-	error = sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, a.pgoff);
-out:
-	return error;
-}
-#endif
diff -ur a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
--- a/arch/x86/kernel/sys_x86_64.c	2013-08-24 11:36:40.000000000 +0200
+++ b/arch/x86/kernel/sys_x86_64.c	2014-02-17 11:56:49.000000000 +0100
@@ -281,16 +281,3 @@
 
 	return addr;
 }
-
-#ifdef MY_ABC_HERE
-SYSCALL_DEFINE1(SYNOmmap, SYNO_MMAP_ARG __user *, arg)
-{
-	long error = -EFAULT;
-	SYNO_MMAP_ARG a;
-
-	if (copy_from_user(&a, arg, sizeof(a)))
-		return error;
-
-	return sys_mmap(a.addr, a.len, a.prot, a.flags, a.fd, a.pgoff << PAGE_SHIFT);
-}
-#endif
diff -ur a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
--- a/arch/x86/kvm/Kconfig	2013-08-03 09:59:50.000000000 +0200
+++ b/arch/x86/kvm/Kconfig	2014-01-21 09:36:55.000000000 +0100
@@ -35,6 +35,9 @@
 	select KVM_MMIO
 	select TASKSTATS
 	select TASK_DELAY_ACCT
+	select PERF_EVENTS
+	select HAVE_KVM_MSI
+	select HAVE_KVM_CPU_RELAX_INTERCEPT
 	---help---
 	  Support hosting fully virtualized guest machines using hardware
 	  virtualization extensions.  You will need a fairly recent
diff -ur a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
--- a/arch/x86/platform/efi/efi.c	2013-08-24 11:36:39.000000000 +0200
+++ b/arch/x86/platform/efi/efi.c	2014-02-17 11:56:48.000000000 +0100
@@ -83,6 +83,15 @@
 }
 EXPORT_SYMBOL(efi_enabled);
 
+#ifdef CONFIG_SYNO_EFI
+static bool disable_runtime = true;
+static int __init setup_withefi(char *arg)
+{
+	disable_runtime = false;
+	return 0;
+}
+early_param("withefi", setup_withefi);
+#else
 static bool disable_runtime = false;
 static int __init setup_noefi(char *arg)
 {
@@ -90,6 +99,7 @@
 	return 0;
 }
 early_param("noefi", setup_noefi);
+#endif
 
 int add_efi_memmap;
 EXPORT_SYMBOL(add_efi_memmap);
diff -ur a/crypto/Kconfig b/crypto/Kconfig
--- a/crypto/Kconfig	2013-08-16 08:07:18.000000000 +0200
+++ b/crypto/Kconfig	2014-01-21 09:36:56.000000000 +0100
@@ -31,7 +31,7 @@
 	  this is.
 
 config CRYPTO_ALGAPI
-	tristate
+	tristate "ALGAPI"
 	select CRYPTO_ALGAPI2
 	help
 	  This option provides the API for cryptographic algorithms.
@@ -40,7 +40,7 @@
 	tristate
 
 config CRYPTO_AEAD
-	tristate
+	tristate "AEAD"
 	select CRYPTO_AEAD2
 	select CRYPTO_ALGAPI
 
@@ -49,7 +49,7 @@
 	select CRYPTO_ALGAPI2
 
 config CRYPTO_BLKCIPHER
-	tristate
+	tristate "BLKCIPHER"
 	select CRYPTO_BLKCIPHER2
 	select CRYPTO_ALGAPI
 
@@ -60,7 +60,7 @@
 	select CRYPTO_WORKQUEUE
 
 config CRYPTO_HASH
-	tristate
+	tristate "HASH"
 	select CRYPTO_HASH2
 	select CRYPTO_ALGAPI
 
@@ -69,7 +69,7 @@
 	select CRYPTO_ALGAPI2
 
 config CRYPTO_RNG
-	tristate
+	tristate "RNG"
 	select CRYPTO_RNG2
 	select CRYPTO_ALGAPI
 
@@ -926,3 +926,4 @@
 
 source "crypto/ocf/Kconfig"
 
+source "crypto/ocf/Kconfig"
diff -ur a/crypto/ocf/cryptodev.h b/crypto/ocf/cryptodev.h
--- a/crypto/ocf/cryptodev.h	2013-08-24 11:36:50.000000000 +0200
+++ b/crypto/ocf/cryptodev.h	2014-02-17 11:57:09.000000000 +0100
@@ -61,6 +61,9 @@
 #ifndef _CRYPTO_CRYPTO_H_
 #define _CRYPTO_CRYPTO_H_
 
+#if  defined(CONFIG_OCF_M86XXX_MODULE) && (defined(__KERNEL__) || !defined(__GLIBC__) || (__GLIBC__ < 2))
+#include <linux/in.h>
+#endif
 /* Some initial values */
 #define CRYPTO_DRIVERS_INITIAL	4
 #define CRYPTO_SW_SESSIONS	32
@@ -157,7 +160,20 @@
 #define CRYPTO_SHA2_512			24
 #define CRYPTO_RIPEMD160		25
 #define	CRYPTO_LZS_COMP			26
+#if defined(CONFIG_OCF_M86XXX_MODULE)
+#define CRYPTO_ESP_RFC2406 		27
+//#define CRYPTO_ESP_RFC2406_TRANSPORT 20
+#define CRYPTO_ESP_RFC4303  		28
+#define CRYPTO_ESP4_RFC4303  		28
+#define CRYPTO_ESP6_RFC4303  		29
+#define CRYPTO_AH			30
+#define CRYPTO_AH4			30
+#define CRYPTO_AH6			31
+#define CRYPTO_SHA2_HMAC		32 /*TODO is it a duplicate entry*/
+#define CRYPTO_ALGORITHM_MAX		32 /* Keep updated - see below */
+#else
 #define CRYPTO_ALGORITHM_MAX	26 /* Keep updated - see above */
+#endif
 
 /* Algorithm flags */
 #define CRYPTO_ALG_FLAG_SUPPORTED	0x01 /* Algorithm is supported */
@@ -309,13 +325,60 @@
 /* Standard initialization structure beginning */
 struct cryptoini {
 	int		cri_alg;	/* Algorithm to use */
+#if defined(CONFIG_OCF_M86XXX_MODULE)
+	int		cri_flags;
+	union {
+		struct {
+			int		cri_mlen;	/* Number of bytes we want from the
+					   entire hash. 0 means all. */
+			int			cri_klen;	/* Key length, in bits */
+			caddr_t		cri_key;	/* key to use */
+			u_int8_t	cri_iv[EALG_MAX_BLOCK_LEN];	/* IV to use */
+		} cri_alg;
+		struct {
+			u_int32_t basealg;
+			struct sockaddr_in tun_source;
+			struct sockaddr_in tun_destination;
+			int tun_df_mode;
+			int tun_ds_mode;
+		 	int tun_ttl_value;
+		 	int tun_replay_windowsize;
+		 	int spivalue ;
+		 	int replayinit;  /* set to 0 to disable replay on receive */
+		 	u_int64_t time_hard_lifetime;
+		 	u_int64_t time_soft_lifetime;
+		 	u_int64_t byte_hard_lifetime;
+		 	u_int64_t byte_soft_lifetime;
+		} cri_pack;	
+	} u;
+#else
 	int		cri_klen;	/* Key length, in bits */
 	int		cri_mlen;	/* Number of bytes we want from the
 					   entire hash. 0 means all. */
 	caddr_t		cri_key;	/* key to use */
 	u_int8_t	cri_iv[EALG_MAX_BLOCK_LEN];	/* IV to use */
+#endif
 	struct cryptoini *cri_next;
 };
+#if defined(CONFIG_OCF_M86XXX_MODULE)
+#define cri_mlen		u.cri_alg.cri_mlen
+#define cri_klen		u.cri_alg.cri_klen
+#define cri_key			u.cri_alg.cri_key
+#define cri_iv			u.cri_alg.cri_iv
+#define crip_basealg			u.cri_pack.basealg
+#define crip_tun_source 		u.cri_pack.tun_source
+#define crip_tun_destination	u.cri_pack.tun_destination
+#define crip_tun_df_mode		u.cri_pack.tun_df_mode
+#define crip_tun_ds_mode	u.cri_pack.tun_ds_mode
+#define crip_tun_ttl_value	u.cri_pack.tun_ttl_value
+#define crip_tun_replay_windowsize u.cri_pack.tun_replay_windowsize
+#define crip_spivalue 		u.cri_pack.spivalue
+#define crip_replayinit		u.cri_pack.replayinit
+#define crip_time_hard_lifetime 	 u.cri_pack.time_hard_lifetime
+#define crip_time_soft_lifetime 	 u.cri_pack.time_soft_lifetime
+#define crip_byte_hard_lifetime 	 u.cri_pack.byte_hard_lifetime
+#define crip_byte_soft_lifetime 	 u.cri_pack.byte_soft_lifetime
+#endif
 
 /* Describe boundaries of a single crypto operation */
 struct cryptodesc {
@@ -377,6 +440,37 @@
 
 	int (*crp_callback)(struct cryptop *); /* Callback function */
 };
+#if defined(CONFIG_OCF_M86XXX_MODULE)
+enum crypto_packet_return_code {
+		CRYPTO_OK=0,
+		CRYPTO_SOFT_TTL = 2,
+ 		CRYPTO_HARD_TTL,
+ 		CRYPTO_SA_INACTIVE,
+ 		CRYPTO_REPLAY,
+ 		CRYPTO_ICV_FAIL,
+ 		CRYPTO_SEQ_ROLL,
+ 		CRYPTO_MEM_ERROR,
+ 		CRYPTO_VERS_ERROR,
+ 		CRYPTO_PROT_ERROR,
+ 		CRYPTO_PYLD_ERROR,
+ 		CRYPTO_PAD_ERROR 
+};
+
+enum crypto_accel_type {
+                  CRYPTO_PACKET  =0x2,    /* OR together desired bits */
+                  CRYPTO_HARDWARE=0x1,
+                  CRYPTO_SOFTWARE=0x0
+};
+
+enum crypto_flags {
+                  CRYPTO_ENCRYPT=0x1, 	// same for encap (OCF l2)
+                  CRYPTO_DECRYPT=0x2,		// same for decap (OCF l2)
+                  CRYPTO_MAC_GEN=0x4,
+                  CRYPTO_MAC_CHECK=0x08,
+                  CRYPTO_COMPRESS_SMALLER=0x10,
+                  CRYPTO_COMPRESS_BIGGER=0x20
+};
+#endif
 
 #define CRYPTO_BUF_CONTIG	0x0
 #define CRYPTO_BUF_IOV		0x1
diff -ur a/crypto/ocf/cryptosoft.c b/crypto/ocf/cryptosoft.c
--- a/crypto/ocf/cryptosoft.c	2013-08-24 11:36:50.000000000 +0200
+++ b/crypto/ocf/cryptosoft.c	2014-02-17 11:57:09.000000000 +0100
@@ -277,6 +277,9 @@
 	DEVMETHOD(cryptodev_process,	swcr_process),
 };
 
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+static DEFINE_SPINLOCK(syno_softcrypto_lock);
+#endif
 #define debug swcr_debug
 int swcr_debug = 0;
 module_param(swcr_debug, int, 0644);
@@ -343,6 +346,9 @@
 	int error;
 	char *algo;
 	int mode;
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+	unsigned long flags;
+#endif
 
 	dprintk("%s()\n", __FUNCTION__);
 	if (sid == NULL || cri == NULL) {
@@ -350,6 +356,9 @@
 		return EINVAL;
 	}
 
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+	spin_lock_irqsave(&syno_softcrypto_lock, flags);
+#endif
 	if (swcr_sessions) {
 		for (i = 1; i < swcr_sesnum; i++)
 			if (swcr_sessions[i] == NULL)
@@ -371,6 +380,9 @@
 				swcr_sesnum = 0;
 			else
 				swcr_sesnum /= 2;
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+			spin_unlock_irqrestore(&syno_softcrypto_lock, flags);
+#endif
 			dprintk("%s,%d: ENOBUFS\n", __FILE__, __LINE__);
 			return ENOBUFS;
 		}
@@ -393,6 +405,9 @@
 		*swd = (struct swcr_data *) kmalloc(sizeof(struct swcr_data),
 				SLAB_ATOMIC);
 		if (*swd == NULL) {
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+			spin_unlock_irqrestore(&syno_softcrypto_lock, flags);
+#endif
 			swcr_freesession(NULL, i);
 			dprintk("%s,%d: ENOBUFS\n", __FILE__, __LINE__);
 			return ENOBUFS;
@@ -401,6 +416,9 @@
 
 		if (cri->cri_alg < 0 ||
 				cri->cri_alg>=sizeof(crypto_details)/sizeof(crypto_details[0])){
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+			spin_unlock_irqrestore(&syno_softcrypto_lock, flags);
+#endif
 			printk("cryptosoft: Unknown algorithm 0x%x\n", cri->cri_alg);
 			swcr_freesession(NULL, i);
 			return EINVAL;
@@ -408,6 +426,9 @@
 
 		algo = crypto_details[cri->cri_alg].alg_name;
 		if (!algo || !*algo) {
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+			spin_unlock_irqrestore(&syno_softcrypto_lock, flags);
+#endif
 			printk("cryptosoft: Unsupported algorithm 0x%x\n", cri->cri_alg);
 			swcr_freesession(NULL, i);
 			return EINVAL;
@@ -446,6 +467,9 @@
 			}
 			if (!(*swd)->sw_tfm || IS_ERR((*swd)->sw_tfm)) {
 				int err;
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+				spin_unlock_irqrestore(&syno_softcrypto_lock, flags);
+#endif
 				dprintk("cryptosoft: crypto_alloc_blkcipher failed(%s, 0x%x)\n",
 						algo,mode);
 				err = IS_ERR((*swd)->sw_tfm) ? -(PTR_ERR((*swd)->sw_tfm)) : EINVAL;
@@ -480,6 +504,9 @@
 								cri->cri_key, (cri->cri_klen + 7) / 8);
 			}
 			if (error) {
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+				spin_unlock_irqrestore(&syno_softcrypto_lock, flags);
+#endif
 				printk("cryptosoft: setkey failed %d (crt_flags=0x%x)\n", error,
 						(*swd)->sw_tfm->crt_flags);
 				swcr_freesession(NULL, i);
@@ -502,6 +529,9 @@
 			}
 
 			if (!(*swd)->sw_tfm) {
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+				spin_unlock_irqrestore(&syno_softcrypto_lock, flags);
+#endif
 				dprintk("cryptosoft: crypto_alloc_hash failed(%s,0x%x)\n",
 						algo, mode);
 				swcr_freesession(NULL, i);
@@ -512,6 +542,9 @@
 			(*swd)->u.hmac.sw_key = (char *)kmalloc((*swd)->u.hmac.sw_klen,
 					SLAB_ATOMIC);
 			if ((*swd)->u.hmac.sw_key == NULL) {
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+				spin_unlock_irqrestore(&syno_softcrypto_lock, flags);
+#endif
 				swcr_freesession(NULL, i);
 				dprintk("%s,%d: ENOBUFS\n", __FILE__, __LINE__);
 				return ENOBUFS;
@@ -530,6 +563,9 @@
 			(*swd)->sw_tfm = crypto_comp_tfm(
 					crypto_alloc_comp(algo, 0, CRYPTO_ALG_ASYNC));
 			if (!(*swd)->sw_tfm) {
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+				spin_unlock_irqrestore(&syno_softcrypto_lock, flags);
+#endif
 				dprintk("cryptosoft: crypto_alloc_comp failed(%s,0x%x)\n",
 						algo, mode);
 				swcr_freesession(NULL, i);
@@ -537,11 +573,17 @@
 			}
 			(*swd)->u.sw_comp_buf = kmalloc(CRYPTO_MAX_DATA_LEN, SLAB_ATOMIC);
 			if ((*swd)->u.sw_comp_buf == NULL) {
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+				spin_unlock_irqrestore(&syno_softcrypto_lock, flags);
+#endif
 				swcr_freesession(NULL, i);
 				dprintk("%s,%d: ENOBUFS\n", __FILE__, __LINE__);
 				return ENOBUFS;
 			}
 		} else {
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+			spin_unlock_irqrestore(&syno_softcrypto_lock, flags);
+#endif
 			printk("cryptosoft: Unhandled sw_type %d\n", (*swd)->sw_type);
 			swcr_freesession(NULL, i);
 			return EINVAL;
@@ -550,6 +592,9 @@
 		cri = cri->cri_next;
 		swd = &((*swd)->sw_next);
 	}
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+	spin_unlock_irqrestore(&syno_softcrypto_lock, flags);
+#endif
 	return 0;
 }
 
@@ -561,6 +606,9 @@
 {
 	struct swcr_data *swd;
 	u_int32_t sid = CRYPTO_SESID2LID(tid);
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+	unsigned long flags;
+#endif
 
 	dprintk("%s()\n", __FUNCTION__);
 	if (sid > swcr_sesnum || swcr_sessions == NULL ||
@@ -573,6 +621,9 @@
 	if (sid == 0)
 		return(0);
 
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+	spin_lock_irqsave(&syno_softcrypto_lock, flags);
+#endif
 	while ((swd = swcr_sessions[sid]) != NULL) {
 		swcr_sessions[sid] = swd->sw_next;
 		if (swd->sw_tfm) {
@@ -616,6 +667,9 @@
 		}
 		kfree(swd);
 	}
+#if defined(CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE)
+	spin_unlock_irqrestore(&syno_softcrypto_lock, flags);
+#endif
 	return 0;
 }
 
diff -ur a/crypto/ocf/Kconfig b/crypto/ocf/Kconfig
--- a/crypto/ocf/Kconfig	2013-08-16 08:07:18.000000000 +0200
+++ b/crypto/ocf/Kconfig	2014-01-21 09:36:56.000000000 +0100
@@ -88,6 +88,12 @@
 	help
 	  OCF driver for the Intel EP80579 Integrated Processor Product Line.
 
+config OCF_M86XXX
+	tristate "M86XXX (HW crypto engine)"
+	depends on OCF_OCF && SYNO_COMCERTO
+	help
+	  OCF driver for the Mindspeed Comcerto2000 Processor Product Line.
+
 config OCF_CRYPTOCTEON
 	tristate "cryptocteon (HW crypto engine)"
 	depends on OCF_OCF
Only in b/crypto/ocf: m86xxx.
diff -ur a/crypto/ocf/Makefile b/crypto/ocf/Makefile
--- a/crypto/ocf/Makefile	2013-08-16 08:07:18.000000000 +0200
+++ b/crypto/ocf/Makefile	2014-01-21 09:36:56.000000000 +0100
@@ -48,6 +48,7 @@
 $(_obj)-$(CONFIG_OCF_OCFNULL) += ocfnull$(_slash)
 $(_obj)-$(CONFIG_OCF_C7108) += c7108$(_slash)
 $(_obj)-$(CONFIG_OCF_UBSEC_SSB) += ubsec_ssb$(_slash)
+$(_obj)-$(CONFIG_OCF_M86XXX) += m86xxx$(_slash)
 
 ocf-objs := $(OCF_OBJS)
 
diff -ur a/Documentation/devices.txt b/Documentation/devices.txt
--- a/Documentation/devices.txt	2013-08-03 09:59:49.000000000 +0200
+++ b/Documentation/devices.txt	2014-01-21 09:36:41.000000000 +0100
@@ -447,6 +447,9 @@
 		234 = /dev/btrfs-control	Btrfs control device
 		235 = /dev/autofs	Autofs control device
 		236 = /dev/mapper/control	Device-Mapper control device
+		237 = /dev/loop-control Loopback control device
+		238 = /dev/vhost-net	Host kernel accelerator for virtio net
+
 		240-254			Reserved for local use
 		255			Reserved for MISC_DYNAMIC_MINOR
 
diff -ur a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
--- a/Documentation/filesystems/Locking	2013-08-03 09:59:52.000000000 +0200
+++ b/Documentation/filesystems/Locking	2014-01-21 09:36:41.000000000 +0100
@@ -62,6 +62,7 @@
 	int (*removexattr) (struct dentry *, const char *);
 	void (*truncate_range)(struct inode *, loff_t, loff_t);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
+	void (*update_time)(struct inode *, struct timespec *, int);
 
 locking rules:
 	all may block
@@ -89,6 +90,8 @@
 removexattr:	yes
 truncate_range:	yes
 fiemap:		no
+update_time:	no
+
 	Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
 victim.
 	cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
diff -ur a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
--- a/Documentation/filesystems/vfs.txt	2013-08-03 09:59:52.000000000 +0200
+++ b/Documentation/filesystems/vfs.txt	2014-01-21 09:36:41.000000000 +0100
@@ -364,6 +364,7 @@
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
 	void (*truncate_range)(struct inode *, loff_t, loff_t);
+	void (*update_time)(struct inode *, struct timespec *, int);
 };
 
 Again, all methods are called without any locks being held, unless
@@ -475,6 +476,9 @@
   truncate_range: a method provided by the underlying filesystem to truncate a
   	range of blocks , i.e. punch a hole somewhere in a file.
 
+  update_time: called by the VFS to update a specific time or the i_version of
+  	an inode.  If this is not defined the VFS will update the inode itself
+  	and call mark_inode_dirty_sync.
 
 The Address Space Object
 ========================
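
A rough illustration of the update_time hook documented above; the filesystem name and surrounding operations table are hypothetical, and the void-returning prototype simply follows the text quoted here:

	static void myfs_update_time(struct inode *inode, struct timespec *now, int flags)
	{
		/* flags indicate which fields the VFS wants refreshed */
		if (flags & S_ATIME)
			inode->i_atime = *now;
		if (flags & S_MTIME)
			inode->i_mtime = *now;
		if (flags & S_CTIME)
			inode->i_ctime = *now;
		if (flags & S_VERSION)
			inode_inc_iversion(inode);
		mark_inode_dirty_sync(inode);
	}

	static const struct inode_operations myfs_inode_operations = {
		/* ... other methods ... */
		.update_time	= myfs_update_time,
	};

A filesystem that leaves .update_time NULL keeps the default behaviour described above: the VFS updates the inode fields itself and calls mark_inode_dirty_sync().
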
diff -ur a/Documentation/i2c/busses/i2c-i801 b/Documentation/i2c/busses/i2c-i801
--- a/Documentation/i2c/busses/i2c-i801	2013-08-03 09:59:49.000000000 +0200
+++ b/Documentation/i2c/busses/i2c-i801	2014-01-21 09:36:41.000000000 +0100
@@ -22,6 +22,7 @@
   * Intel Panther Point (PCH)
   * Intel Lynx Point (PCH)
   * Intel Lynx Point-LP (PCH)
+  * Intel Avoton (SOC)
    Datasheets: Publicly available at the Intel website
 
 On Intel Patsburg and later chipsets, both the normal host SMBus controller
diff -ur a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
--- a/Documentation/kernel-parameters.txt	2013-08-03 09:59:49.000000000 +0200
+++ b/Documentation/kernel-parameters.txt	2014-01-21 09:36:41.000000000 +0100
@@ -110,6 +110,7 @@
 	USB	USB support is enabled.
 	USBHID	USB Human Interface Device support is enabled.
 	V4L	Video For Linux support is enabled.
+	VMMIO   Driver for memory mapped virtio devices is enabled.
 	VGA	The VGA console has been enabled.
 	VT	Virtual terminal support is enabled.
 	WDT	Watchdog support is enabled.
@@ -2725,6 +2726,22 @@
 	video=		[FB] Frame buffer configuration
 			See Documentation/fb/modedb.txt.
 
+	virtio_mmio.device=
+			[VMMIO] Memory mapped virtio (platform) device.
+
+				<size>@<baseaddr>:<irq>[:<id>]
+			where:
+				<size>     := size (can use standard suffixes
+						like K, M and G)
+				<baseaddr> := physical base address
+				<irq>      := interrupt number (as passed to
+						request_irq())
+				<id>       := (optional) platform device id
+			example:
+				virtio_mmio.device=1K@0x100b0000:48:7
+
+			Can be used multiple times for multiple devices.
+
 	vga=		[BOOT,X86-32] Select a particular video mode
 			See Documentation/x86/boot.txt and
 			Documentation/svga.txt.
diff -ur a/Documentation/power/freezing-of-tasks.txt b/Documentation/power/freezing-of-tasks.txt
--- a/Documentation/power/freezing-of-tasks.txt	2013-08-03 09:59:52.000000000 +0200
+++ b/Documentation/power/freezing-of-tasks.txt	2014-01-21 09:36:41.000000000 +0100
@@ -21,7 +21,7 @@
 try_to_freeze_tasks() that sets TIF_FREEZE for all of the freezable tasks and
 either wakes them up, if they are kernel threads, or sends fake signals to them,
 if they are user space processes.  A task that has TIF_FREEZE set, should react
-to it by calling the function called refrigerator() (defined in
+to it by calling the function called __refrigerator() (defined in
 kernel/freezer.c), which sets the task's PF_FROZEN flag, changes its state
 to TASK_UNINTERRUPTIBLE and makes it loop until PF_FROZEN is cleared for it.
 Then, we say that the task is 'frozen' and therefore the set of functions
@@ -29,10 +29,10 @@
 defined in kernel/power/process.c, kernel/freezer.c & include/linux/freezer.h).
 User space processes are generally frozen before kernel threads.
 
-It is not recommended to call refrigerator() directly.  Instead, it is
-recommended to use the try_to_freeze() function (defined in
-include/linux/freezer.h), that checks the task's TIF_FREEZE flag and makes the
-task enter refrigerator() if the flag is set.
+__refrigerator() must not be called directly.  Instead, use the
+try_to_freeze() function (defined in include/linux/freezer.h), that checks
+the task's TIF_FREEZE flag and makes the task enter __refrigerator() if the
+flag is set.
 
 For user space processes try_to_freeze() is called automatically from the
 signal-handling code, but the freezable kernel threads need to call it
@@ -61,7 +61,7 @@
 After the system memory state has been restored from a hibernation image and
 devices have been reinitialized, the function thaw_processes() is called in
 order to clear the PF_FROZEN flag for each frozen task.  Then, the tasks that
-have been frozen leave refrigerator() and continue running.
+have been frozen leave __refrigerator() and continue running.
 
 III. Which kernel threads are freezable?
 
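
A minimal sketch of the pattern described above for freezable kernel threads; the thread function and do_scan_work() are made-up placeholders, while set_freezable(), try_to_freeze() and kthread_should_stop() are the real helpers from include/linux/freezer.h and include/linux/kthread.h:

	static int myfs_scan_thread(void *data)
	{
		set_freezable();			/* opt in to freezing */

		while (!kthread_should_stop()) {
			try_to_freeze();		/* enters __refrigerator() when TIF_FREEZE is set */
			do_scan_work(data);		/* hypothetical payload of this thread */
			schedule_timeout_interruptible(HZ);
		}
		return 0;
	}
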
diff -ur a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
--- a/Documentation/virtual/kvm/api.txt	2013-08-03 09:59:49.000000000 +0200
+++ b/Documentation/virtual/kvm/api.txt	2014-01-21 09:36:44.000000000 +0100
@@ -1466,6 +1466,167 @@
 an RMA, or 1 if the processor can use an RMA but doesn't require it,
 because it supports the Virtual RMA (VRMA) facility.
 
+4.64 KVM_NMI
+
+Capability: KVM_CAP_USER_NMI
+Architectures: x86
+Type: vcpu ioctl
+Parameters: none
+Returns: 0 on success, -1 on error
+
+Queues an NMI on the thread's vcpu.  Note this is well defined only
+when KVM_CREATE_IRQCHIP has not been called, since this is an interface
+between the virtual cpu core and virtual local APIC.  After KVM_CREATE_IRQCHIP
+has been called, this interface is completely emulated within the kernel.
+
+To use this to emulate the LINT1 input with KVM_CREATE_IRQCHIP, use the
+following algorithm:
+
+  - pause the vcpu
+  - read the local APIC's state (KVM_GET_LAPIC)
+  - check whether changing LINT1 will queue an NMI (see the LVT entry for LINT1)
+  - if so, issue KVM_NMI
+  - resume the vcpu
+
+Some guests configure the LINT1 NMI input to cause a panic, aiding in
+debugging.
+
+4.65 KVM_S390_UCAS_MAP
+
+Capability: KVM_CAP_S390_UCONTROL
+Architectures: s390
+Type: vcpu ioctl
+Parameters: struct kvm_s390_ucas_mapping (in)
+Returns: 0 in case of success
+
+The parameter is defined like this:
+	struct kvm_s390_ucas_mapping {
+		__u64 user_addr;
+		__u64 vcpu_addr;
+		__u64 length;
+	};
+
+This ioctl maps the memory at "user_addr" with the length "length" to
+the vcpu's address space starting at "vcpu_addr". All parameters need to
+be aligned to 1 megabyte.
+
+4.66 KVM_S390_UCAS_UNMAP
+
+Capability: KVM_CAP_S390_UCONTROL
+Architectures: s390
+Type: vcpu ioctl
+Parameters: struct kvm_s390_ucas_mapping (in)
+Returns: 0 in case of success
+
+The parameter is defined like this:
+	struct kvm_s390_ucas_mapping {
+		__u64 user_addr;
+		__u64 vcpu_addr;
+		__u64 length;
+	};
+
+This ioctl unmaps the memory in the vcpu's address space starting at
+"vcpu_addr" with the length "length". The field "user_addr" is ignored.
+All parameters need to be aligned to 1 megabyte.
+
+4.67 KVM_S390_VCPU_FAULT
+
+Capability: KVM_CAP_S390_UCONTROL
+Architectures: s390
+Type: vcpu ioctl
+Parameters: vcpu absolute address (in)
+Returns: 0 in case of success
+
+This call creates a page table entry on the virtual cpu's address space
+(for user controlled virtual machines) or the virtual machine's address
+space (for regular virtual machines). This only works for minor faults,
+thus it's recommended to access subject memory page via the user page
+table upfront. This is useful to handle validity intercepts for user
+controlled virtual machines to fault in the virtual cpu's lowcore pages
+prior to calling the KVM_RUN ioctl.
+
+4.68 KVM_SET_ONE_REG
+
+Capability: KVM_CAP_ONE_REG
+Architectures: all
+Type: vcpu ioctl
+Parameters: struct kvm_one_reg (in)
+Returns: 0 on success, negative value on failure
+
+struct kvm_one_reg {
+       __u64 id;
+       __u64 addr;
+};
+
+Using this ioctl, a single vcpu register can be set to a specific value
+defined by user space with the passed in struct kvm_one_reg, where id
+refers to the register identifier as described below and addr is a pointer
+to a variable with the respective size. There can be architecture agnostic
+and architecture specific registers. Each has its own range of operation
+and its own constants and width. To keep track of the implemented
+registers, find a list below:
+
+  Arch  |       Register        | Width (bits)
+        |                       |
+  PPC   | KVM_REG_PPC_HIOR      | 64
+
+4.69 KVM_GET_ONE_REG
+
+Capability: KVM_CAP_ONE_REG
+Architectures: all
+Type: vcpu ioctl
+Parameters: struct kvm_one_reg (in and out)
+Returns: 0 on success, negative value on failure
+
+This ioctl allows userspace to receive the value of a single register implemented
+in a vcpu. The register to read is indicated by the "id" field of the
+kvm_one_reg struct passed in. On success, the register value can be found
+at the memory location pointed to by "addr".
+
+The list of registers accessible using this interface is identical to the
+list in 4.64.
+
+4.70 KVM_KVMCLOCK_CTRL
+
+Capability: KVM_CAP_KVMCLOCK_CTRL
+Architectures: Any that implement pvclocks (currently x86 only)
+Type: vcpu ioctl
+Parameters: None
+Returns: 0 on success, -1 on error
+
+This signals to the host kernel that the specified guest is being paused by
+userspace.  The host will set a flag in the pvclock structure that is checked
+from the soft lockup watchdog.  The flag is part of the pvclock structure that
+is shared between guest and host, specifically the second bit of the flags
+field of the pvclock_vcpu_time_info structure.  It will be set exclusively by
+the host and read/cleared exclusively by the guest.  The guest operation of
+checking and clearing the flag must be an atomic operation, so
+load-link/store-conditional or an equivalent must be used.  There are two cases
+where the guest will clear the flag: when the soft lockup watchdog timer resets
+itself or when a soft lockup is detected.  This ioctl can be called any time
+after pausing the vcpu, but before it is resumed.
+
+4.71 KVM_SIGNAL_MSI
+
+Capability: KVM_CAP_SIGNAL_MSI
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_msi (in)
+Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error
+
+Directly inject an MSI message. Only valid with an in-kernel irqchip that
+handles MSI messages.
+
+struct kvm_msi {
+	__u32 address_lo;
+	__u32 address_hi;
+	__u32 data;
+	__u32 flags;
+	__u8  pad[16];
+};
+
+No flags are defined so far. The corresponding field must be 0.
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by
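
A minimal userspace sketch of the ONE_REG calls documented above, assuming a vcpu file descriptor already obtained via KVM_CREATE_VCPU; KVM_REG_PPC_HIOR is simply the example id from the register table, and the helper name is made up:

	#include <stdint.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int rw_one_reg(int vcpu_fd, uint64_t id, uint64_t *val)
	{
		struct kvm_one_reg reg = {
			.id   = id,
			.addr = (uint64_t)(unsigned long)val,
		};

		if (ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
			return -1;
		/* read the value back; *val is overwritten on success */
		return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
	}

Calling rw_one_reg(vcpu_fd, KVM_REG_PPC_HIOR, &hior) would thus first write and then re-read that register through the two ioctls.
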
diff -ur a/drivers/ata/ahci.c b/drivers/ata/ahci.c
--- a/drivers/ata/ahci.c	2013-08-24 11:37:08.000000000 +0200
+++ b/drivers/ata/ahci.c	2014-02-17 11:57:44.000000000 +0100
@@ -312,6 +312,38 @@
 	{ PCI_VDEVICE(INTEL, 0x1e06), board_ahci }, /* Panther Point RAID */
 	{ PCI_VDEVICE(INTEL, 0x1e07), board_ahci }, /* Panther Point RAID */
 	{ PCI_VDEVICE(INTEL, 0x1e0e), board_ahci }, /* Panther Point RAID */
+	{ PCI_VDEVICE(INTEL, 0x8c02), board_ahci }, /* Lynx Point AHCI */
+	{ PCI_VDEVICE(INTEL, 0x8c03), board_ahci }, /* Lynx Point AHCI */
+	{ PCI_VDEVICE(INTEL, 0x8c04), board_ahci }, /* Lynx Point RAID */
+	{ PCI_VDEVICE(INTEL, 0x8c05), board_ahci }, /* Lynx Point RAID */
+	{ PCI_VDEVICE(INTEL, 0x8c06), board_ahci }, /* Lynx Point RAID */
+	{ PCI_VDEVICE(INTEL, 0x8c07), board_ahci }, /* Lynx Point RAID */
+	{ PCI_VDEVICE(INTEL, 0x8c0e), board_ahci }, /* Lynx Point RAID */
+	{ PCI_VDEVICE(INTEL, 0x8c0f), board_ahci }, /* Lynx Point RAID */
+	{ PCI_VDEVICE(INTEL, 0x9c02), board_ahci }, /* Lynx Point-LP AHCI */
+	{ PCI_VDEVICE(INTEL, 0x9c03), board_ahci }, /* Lynx Point-LP AHCI */
+	{ PCI_VDEVICE(INTEL, 0x9c04), board_ahci }, /* Lynx Point-LP RAID */
+	{ PCI_VDEVICE(INTEL, 0x9c05), board_ahci }, /* Lynx Point-LP RAID */
+	{ PCI_VDEVICE(INTEL, 0x9c06), board_ahci }, /* Lynx Point-LP RAID */
+	{ PCI_VDEVICE(INTEL, 0x9c07), board_ahci }, /* Lynx Point-LP RAID */
+	{ PCI_VDEVICE(INTEL, 0x9c0e), board_ahci }, /* Lynx Point-LP RAID */
+	{ PCI_VDEVICE(INTEL, 0x9c0f), board_ahci }, /* Lynx Point-LP RAID */
+	{ PCI_VDEVICE(INTEL, 0x1f22), board_ahci }, /* Avoton AHCI */
+	{ PCI_VDEVICE(INTEL, 0x1f23), board_ahci }, /* Avoton AHCI */
+	{ PCI_VDEVICE(INTEL, 0x1f24), board_ahci }, /* Avoton RAID */
+	{ PCI_VDEVICE(INTEL, 0x1f25), board_ahci }, /* Avoton RAID */
+	{ PCI_VDEVICE(INTEL, 0x1f26), board_ahci }, /* Avoton RAID */
+	{ PCI_VDEVICE(INTEL, 0x1f27), board_ahci }, /* Avoton RAID */
+	{ PCI_VDEVICE(INTEL, 0x1f2e), board_ahci }, /* Avoton RAID */
+	{ PCI_VDEVICE(INTEL, 0x1f2f), board_ahci }, /* Avoton RAID */
+	{ PCI_VDEVICE(INTEL, 0x1f32), board_ahci }, /* Avoton AHCI */
+	{ PCI_VDEVICE(INTEL, 0x1f33), board_ahci }, /* Avoton AHCI */
+	{ PCI_VDEVICE(INTEL, 0x1f34), board_ahci }, /* Avoton RAID */
+	{ PCI_VDEVICE(INTEL, 0x1f35), board_ahci }, /* Avoton RAID */
+	{ PCI_VDEVICE(INTEL, 0x1f36), board_ahci }, /* Avoton RAID */
+	{ PCI_VDEVICE(INTEL, 0x1f37), board_ahci }, /* Avoton RAID */
+	{ PCI_VDEVICE(INTEL, 0x1f3e), board_ahci }, /* Avoton RAID */
+	{ PCI_VDEVICE(INTEL, 0x1f3f), board_ahci }, /* Avoton RAID */
 
 	/* JMicron 360/1/3/5/6, match class to avoid IDE function */
 	{ PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID,
@@ -538,10 +570,44 @@
 	syno_mv_9235_gpio_reg_set(host, MV_9235_GPIO_ACTIVE, 0x00B6D8D1);
 }
 
+int syno_mv_9235_disk_led_get(const unsigned short hostnum)
+{
+	struct Scsi_Host *shost = scsi_host_lookup(hostnum);
+	struct ata_port *ap = NULL;
+	int ret = -1;
+	u32 value;
+	int led_idx;
+
+	if (NULL == shost) {
+		goto END;
+	}
+
+	if (NULL == (ap = ata_shost_to_port(shost))) {
+		goto END;
+	}
+
+	// fault LEDs are wired to the last 4 GPIO pins
+	led_idx = ap->print_id - ap->host->ports[0]->print_id + 4;
+
+	value = syno_mv_9235_gpio_reg_read(ap->host, MV_9235_GPIO_DATA_OUT);
+
+	if (value & (1 << led_idx)) {
+		ret = 1;
+	} else {
+		ret = 0;
+	}
+END:
+	if (NULL != shost) {
+		scsi_host_put(shost);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(syno_mv_9235_disk_led_get);
+
 /*
  *	Write value to 9235 gpio
  */
-int syno_mv_9235_disk_led_set(const unsigned short hostnum, SYNO_DISK_LED status)
+int syno_mv_9235_disk_led_set(const unsigned short hostnum, int iValue)
 {
 	struct Scsi_Host *shost = scsi_host_lookup(hostnum);
 	struct ata_port *ap = NULL;
@@ -554,21 +620,22 @@
 	}
 
 	if(NULL == (ap = ata_shost_to_port(shost))) {
-		scsi_host_put(shost);
 		goto END;
 	}
 
 	led_idx = ap->print_id - ap->host->ports[0]->print_id + 4;
 	value = syno_mv_9235_gpio_reg_read(ap->host, MV_9235_GPIO_DATA_OUT);
-	if (DISK_LED_ORANGE_BLINK == status || DISK_LED_ORANGE_SOLID == status) {
+	if (1 == iValue) {
 		value |= (1 << led_idx);
 	} else {
 		value &= ~(1 << led_idx);
 	}
 	syno_mv_9235_gpio_reg_set(ap->host, MV_9235_GPIO_DATA_OUT, value);
-	scsi_host_put(shost);
 	ret = 0;
 END:
+	if (NULL != shost) {
+		scsi_host_put(shost);
+	}
 	return ret;
 }
 
@@ -1399,8 +1466,10 @@
 
 #ifdef MY_ABC_HERE
 	if (pdev->vendor == 0x1b4b && pdev->device == 0x9235) {
-		hpriv->flags |= AHCI_HFLAG_YES_MV9235_FIX;
-		dev_info(&pdev->dev, "enable MV_9235_WORKAROUND\n");
+		for (i = 0; i < host->n_ports; i++) {
+			struct ata_port *ap = host->ports[i];
+			ap->link.uiStsFlags |= SYNO_STATUS_IS_MV9235;
+		}
 	}
 #endif
 
diff -ur a/drivers/ata/ahci.h b/drivers/ata/ahci.h
--- a/drivers/ata/ahci.h	2013-08-24 11:37:08.000000000 +0200
+++ b/drivers/ata/ahci.h	2014-02-17 11:57:45.000000000 +0100
@@ -224,9 +224,6 @@
 #ifdef MY_DEF_HERE
 	AHCI_HFLAG_REPROBE		= (1 << 15),
 #endif
-#ifdef MY_ABC_HERE
-	AHCI_HFLAG_YES_MV9235_FIX	= (1 << 31), /* port0|port2 , port1|port3 */
-#endif
 
 	/* ap->flags bits */
 
diff -ur a/drivers/ata/ahci_platform.c b/drivers/ata/ahci_platform.c
--- a/drivers/ata/ahci_platform.c	2013-08-24 11:37:08.000000000 +0200
+++ b/drivers/ata/ahci_platform.c	2014-02-17 11:57:45.000000000 +0100
@@ -21,8 +21,19 @@
 #include <linux/platform_device.h>
 #include <linux/libata.h>
 #include <linux/ahci_platform.h>
+#if defined(CONFIG_SYNO_COMCERTO)
+#include <linux/clk.h>
+#include <mach/reset.h>
+#endif
 #include "ahci.h"
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+/* SATA Clocks */
+static struct clk *sata_oob_clk; /* Core clock */
+static struct clk *sata_pmu_clk; /* PMU alive clock */
+static struct clk *sata_clk;	/* Sata AXI ref clock */
+#endif 
+
 enum ahci_type {
 	AHCI,		/* standard platform ahci */
 	IMX53_AHCI,	/* ahci on i.mx53 */
@@ -62,6 +73,50 @@
 	AHCI_SHT("ahci_platform"),
 };
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_PM)
+static int ahci_platform_suspend(struct platform_device *pdev, pm_message_t state)
+{
+        struct ata_host *host = platform_get_drvdata(pdev);
+	int ret=0;
+        if (host)
+		ret = ata_host_suspend(host, state);
+
+#ifdef CONFIG_ARCH_M86XXX
+	if (!ret) /* successfully completed the host suspend */
+	{
+		/* Now disable the PMU, OOB and AXI clocks */
+		clk_disable(sata_clk);
+		clk_disable(sata_oob_clk);
+		clk_disable(sata_pmu_clk);
+	}
+#endif
+	
+        return ret;
+}
+
+static int ahci_platform_resume(struct platform_device *pdev)
+{
+        struct ata_host *host = platform_get_drvdata(pdev);
+
+#ifdef CONFIG_ARCH_M86XXX
+	/* Enable the PMU, OOB and AXI clocks */
+	clk_enable(sata_clk);
+	clk_enable(sata_oob_clk);
+	clk_enable(sata_pmu_clk);
+#endif
+
+        if (host) 
+		ata_host_resume(host);
+
+	return 0;
+}
+#else
+#define ahci_platform_suspend NULL
+#define ahci_platform_resume NULL
+#endif
+
+
+
 static int __init ahci_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
@@ -76,6 +131,53 @@
 	int n_ports;
 	int i;
 	int rc;
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+	/* Get references to the SATA clocks and enable them */
+
+	sata_clk = clk_get(NULL,"sata");
+	/* Error handling: if there is no SATA (AXI) clock reference, return an error */
+	if (IS_ERR(sata_clk)) {
+		pr_err("%s: Unable to obtain SATA(AXI) clock: %ld\n",__func__,PTR_ERR(sata_clk));
+		return PTR_ERR(sata_clk);
+ 	}
+
+	/*Enable the SATA(AXI) clock here */
+        rc = clk_enable(sata_clk);
+	if (rc){
+		pr_err("%s: SATA(AXI) clock enable failed \n",__func__);
+                return rc;
+	}
+	sata_oob_clk = clk_get(NULL,"sata_oob");
+	/* Error handling: if there is no SATA_OOB clock reference, return an error */
+	if (IS_ERR(sata_oob_clk)) {
+		pr_err("%s: Unable to obtain SATA_OOB clock: %ld\n",__func__,PTR_ERR(sata_oob_clk));
+		return PTR_ERR(sata_oob_clk);
+ 	}
+
+	sata_pmu_clk = clk_get(NULL,"sata_pmu");
+	/* Error handling: if there is no SATA_PMU clock reference, return an error */
+	if (IS_ERR(sata_pmu_clk)) {
+		pr_err("%s: Unable to obtain SATA_PMU clock: %ld\n",__func__,PTR_ERR(sata_pmu_clk));
+		return PTR_ERR(sata_pmu_clk);
+	}
+	/*Enable the SATA(PMU and OOB) clocks here */
+        rc = clk_enable(sata_oob_clk);
+	if (rc){
+		pr_err("%s: SATA_OOB clock enable failed \n",__func__);
+                return rc;
+	}
+
+        rc = clk_enable(sata_pmu_clk);
+	if (rc){
+		pr_err("%s: SATA_PMU clock enable failed \n",__func__);
+		return rc;
+	}
+
+	/* Set the SATA PMU clock to 30 MHz and the OOB clock to 125 MHz */
+	clk_set_rate(sata_oob_clk,125000000);
+	clk_set_rate(sata_pmu_clk,30000000);
+	
+#endif
 
 	mem = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (!mem) {
@@ -164,6 +266,12 @@
 		if (ap->flags & ATA_FLAG_EM)
 			ap->em_message_type = hpriv->em_msg_type;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+		/* Optimize the PFE/SATA DDR interaction by
+		   limiting the SATA controller burst size */
+		writel(0, ahci_port_base(ap) + 0x70);
+#endif
+
 		/* disabled/not-implemented port */
 		if (!(hpriv->port_map & (1 << i)))
 			ap->ops = &ata_dummy_port_ops;
@@ -198,6 +306,28 @@
 
 	if (pdata && pdata->exit)
 		pdata->exit(dev);
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+	/* Disable the SATA clocks here */
+	clk_disable(sata_clk);
+	clk_put(sata_clk);
+	clk_disable(sata_oob_clk);
+	clk_put(sata_oob_clk);
+	clk_disable(sata_pmu_clk);
+	clk_put(sata_pmu_clk);
+
+	/* Put SATA into the reset state:
+	 * - SATA AXI clock domain in reset
+	 * - SerDes 1/2 in reset (shared with PCIe1 and SGMII)
+	 * - SATA 0/1 SerDes controllers in reset
+	 */
+	c2000_block_reset(COMPONENT_AXI_SATA,1);
+
+	c2000_block_reset(COMPONENT_SERDES1,1);
+	c2000_block_reset(COMPONENT_SERDES_SATA0,1);
+
+	c2000_block_reset(COMPONENT_SERDES2,1);
+	c2000_block_reset(COMPONENT_SERDES_SATA1,1);
+#endif
 
 	return 0;
 }
@@ -209,13 +339,17 @@
 MODULE_DEVICE_TABLE(of, ahci_of_match);
 
 static struct platform_driver ahci_driver = {
-	.remove = __devexit_p(ahci_remove),
-	.driver = {
-		.name = "ahci",
-		.owner = THIS_MODULE,
-		.of_match_table = ahci_of_match,
+	.remove  = __devexit_p(ahci_remove),
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_PM)
+	.suspend = ahci_platform_suspend,
+	.resume  = ahci_platform_resume,
+#endif
+	.driver  = {
+		 .name = "ahci",
+		 .owner = THIS_MODULE,
+		 .of_match_table = ahci_of_match,
 	},
-	.id_table	= ahci_devtype,
+	.id_table = ahci_devtype,
 };
 
 static int __init ahci_init(void)
diff -ur a/drivers/ata/libahci.c b/drivers/ata/libahci.c
--- a/drivers/ata/libahci.c	2013-08-24 11:37:08.000000000 +0200
+++ b/drivers/ata/libahci.c	2014-02-17 11:57:44.000000000 +0100
@@ -48,8 +48,11 @@
 #endif
 #include <linux/libata.h>
 #include "ahci.h"
+#if defined(CONFIG_SYNO_AVOTON)
+#include <linux/pci.h>
+#endif
 
-#ifdef CONFIG_ARCH_GEN3
+#if defined(CONFIG_ARCH_GEN3) || defined(CONFIG_SYNO_AVOTON)
 extern int SYNO_CTRL_HDD_ACT_NOTIFY(int index);
 #endif
 
@@ -113,9 +116,6 @@
 				      struct device_attribute *attr, char *buf);
 static ssize_t ahci_show_port_cmd(struct device *dev,
 				  struct device_attribute *attr, char *buf);
-#ifdef MY_ABC_HERE
-static void ahci_port_intr(struct ata_port *ap);
-#endif
 static ssize_t ahci_read_em_buffer(struct device *dev,
 				   struct device_attribute *attr, char *buf);
 static ssize_t ahci_store_em_buffer(struct device *dev,
@@ -269,9 +269,6 @@
 	&dev_attr_syno_manutil_power_disable,
 	&dev_attr_syno_pm_gpio,
 	&dev_attr_syno_pm_info,
-#ifdef MY_ABC_HERE
-	&dev_attr_syno_port_thaw,
-#endif
 #endif
 #ifdef MY_ABC_HERE
 	&dev_attr_syno_diskname_trans,
@@ -293,10 +290,6 @@
 	&dev_attr_syno_disk_serial,
 #endif
 #ifdef MY_ABC_HERE
-	&dev_attr_syno_fake_error_ctrl,
-	&dev_attr_syno_pwr_reset_count,
-#endif
-#ifdef MY_ABC_HERE
 	&dev_attr_sw_locate,
 	&dev_attr_sw_fault,
 #endif
@@ -338,9 +331,6 @@
 #endif
 	.port_start		= ahci_port_start,
 	.port_stop		= ahci_port_stop,
-#ifdef MY_ABC_HERE
-	.syno_force_intr	= ahci_port_intr,
-#endif
 };
 EXPORT_SYMBOL_GPL(ahci_ops);
 
@@ -899,6 +889,29 @@
 }
 #endif
 
+#if defined(CONFIG_SYNO_AVOTON)
+static int syno_is_avoton_ahci(struct ata_port *ap)
+{
+	struct pci_dev *pdev = NULL;
+	int ret = 0;
+
+	if (ap != NULL) {
+		pdev = to_pci_dev(ap->dev);
+		if (pdev != NULL && pdev->vendor == 0x8086) {
+			switch (pdev->device) {
+				case 0x1f22:
+					ret = 1;
+					break;
+				default:
+					break;
+			}
+		}
+	}
+
+	return ret;
+}
+#endif
+
 static void ahci_start_port(struct ata_port *ap)
 {
 	struct ahci_port_priv *pp = ap->private_data;
@@ -931,6 +944,11 @@
 		}
 	}
 
+#if defined(CONFIG_SYNO_AVOTON)
+	if (syno_is_avoton_ahci(ap)) {
+		ap->flags |= ATA_FLAG_SW_ACTIVITY;
+	}
+#endif
 	if (ap->flags & ATA_FLAG_SW_ACTIVITY)
 		ata_for_each_link(link, ap, EDGE)
 			ahci_init_sw_activity(link);
@@ -1101,7 +1119,7 @@
 	 * toggle state of LED and reset timer.  If not,
 	 * turn LED to desired idle state.
 	 */
-#if defined(CONFIG_ARCH_GEN3)
+#if defined(CONFIG_ARCH_GEN3) || defined(CONFIG_SYNO_AVOTON)
 	spin_lock_irqsave(ap->lock, flags);
 	if (emp->saved_activity != emp->activity) {
 		emp->saved_activity = emp->activity;
@@ -1150,7 +1168,7 @@
 	emp->saved_activity = emp->activity = 0;
 	setup_timer(&emp->timer, ahci_sw_activity_blink, (unsigned long)link);
 
-#ifdef CONFIG_ARCH_GEN3
+#if defined(CONFIG_ARCH_GEN3) || defined(CONFIG_SYNO_AVOTON)
 #else
 	/* check our blink policy and set flag for link if it's enabled */
 	if (emp->blink_policy)
@@ -1523,24 +1541,7 @@
 
 	/* issue the second D2H Register FIS */
 	tf.ctl &= ~ATA_SRST;
-#ifdef MY_ABC_HERE
-	if (!(hpriv->flags & AHCI_HFLAG_YES_MV9235_FIX)) {
-#endif
 	ahci_exec_polled_cmd(ap, pmp, &tf, 0, 0, 0);
-#ifdef MY_ABC_HERE
-	} else {
-		/* 9235 may fail at 2nd D2H, so we use the same check as 1st D2H */
-		msecs = 0;
-		now = jiffies;
-		if (time_after(deadline, now))
-			msecs = jiffies_to_msecs(deadline - now);
-		if(ahci_exec_polled_cmd(ap, pmp, &tf, 0, 0, msecs)) {
-			rc = -EIO;
-			reason = "2nd FIS failed";
-			goto fail;
-		}
-	}
-#endif
 
 	/* wait for link to become ready */
 	rc = ata_wait_after_reset(link, deadline, check_ready);
@@ -1564,12 +1565,6 @@
 
  fail:
 	ata_link_err(link, "softreset failed (%s)\n", reason);
-#ifdef MY_ABC_HERE
-	if (-EBUSY == rc) {
-		ata_link_printk(link, KERN_ERR, "SRST fail, set srst fail flag\n");
-		link->uiSflags |= ATA_SYNO_FLAG_SRST_FAIL;
-	}
-#endif
 	return rc;
 }
 
@@ -1889,18 +1884,7 @@
 		ata_ehi_push_desc(host_ehi, "interface fatal error");
 	}
 
-#ifdef MY_ABC_HERE
-	if ((irq_stat & (PORT_IRQ_CONNECT | PORT_IRQ_PHYRDY)) || (ap->uiSflags & ATA_SYNO_FLAG_FORCE_INTR)) {
-		if (ap->uiSflags & ATA_SYNO_FLAG_FORCE_INTR) {
-			ap->uiSflags &= ~ATA_SYNO_FLAG_FORCE_INTR;
-			DBGMESG("ata%u: clear ATA_SYNO_FLAG_FORCE_INTR\n", ap->print_id);
-		} else {
-			ap->iDetectStat = 1;
-			DBGMESG("ata%u: set detect stat check\n", ap->print_id);
-		}
-#else
 	if (irq_stat & (PORT_IRQ_CONNECT | PORT_IRQ_PHYRDY)) {
-#endif
 #ifdef MY_ABC_HERE
 		syno_ata_info_print(ap);
 #endif
@@ -1949,11 +1933,7 @@
 		ahci_scr_write(&ap->link, SCR_ERROR, SERR_PHYRDY_CHG);
 	}
 
-#ifdef MY_ABC_HERE
-	if (unlikely(status & PORT_IRQ_ERROR) || (ap->uiSflags & ATA_SYNO_FLAG_FORCE_INTR)) {
-#else
 	if (unlikely(status & PORT_IRQ_ERROR)) {
-#endif
 		ahci_error_intr(ap, status);
 		return;
 	}
@@ -2018,6 +1998,26 @@
 	}
 }
 
+#if defined(CONFIG_SYNO_COMCERTO)
+/*
+*/
+
+#if defined (CONFIG_COMCERTO_AHCI_PROF)
+unsigned int ahci_time_counter[256]; // 4 ms -> 1S
+unsigned int ahci_data_counter[256]; // 4K-> 1020K
+unsigned int ahci_int_before_req;
+static struct timeval last_ahci_req;
+unsigned int init_ahci_prof = 0;
+unsigned int enable_ahci_prof = 0;
+extern struct timeval ahci_last_qc_comp[32];
+extern unsigned int ahci_last_qc_comp_flag[32];
+#endif
+
+static struct timeval time;
+
+#endif
+
+
 irqreturn_t ahci_interrupt(int irq, void *dev_instance)
 {
 	struct ata_host *host = dev_instance;
@@ -2036,16 +2036,6 @@
 	if (!irq_stat)
 		return IRQ_NONE;
 
-#ifdef MY_ABC_HERE
-	if (hpriv->flags & AHCI_HFLAG_YES_MV9235_FIX) {
-		u32 port_mask[2] = {0x5, 0xa};
-		for (i = 0; i < (host->n_ports/2) ; i++) {
-			if (irq_stat & port_mask[i])
-				irq_stat |= port_mask[i];
-		}
-	}
-#endif
-
 	irq_masked = irq_stat & hpriv->port_map;
 
 	spin_lock(&host->lock);
@@ -2095,6 +2085,33 @@
 	void __iomem *port_mmio = ahci_port_base(ap);
 	struct ahci_port_priv *pp = ap->private_data;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_AHCI_PROF)
+	struct timeval now;
+
+	if (enable_ahci_prof) {
+		do_gettimeofday(&now);
+
+		if (init_ahci_prof) {
+			int diff_time_ms;
+			diff_time_ms = ((now.tv_sec - last_ahci_req.tv_sec) * 1000) + ((now.tv_usec - last_ahci_req.tv_usec) / 1000);
+			if (diff_time_ms < 1000) {//Don't record more than 1s
+				ahci_time_counter[diff_time_ms >> 3]++;
+			}
+			else
+				ahci_time_counter[255]++;
+		}
+		else {
+			init_ahci_prof = 1;
+		}
+		last_ahci_req = now;
+
+		if (qc->nbytes < (1 << 21))
+			ahci_data_counter[(qc->nbytes >> 13) & 0xFF]++;
+		else
+			ahci_data_counter[255]++;
+	}
+#endif
+
 	/* Keep track of the currently active link.  It will be used
 	 * in completion path to determine whether NCQ phase is in
 	 * progress.
@@ -2111,6 +2128,12 @@
 		writel(fbs, port_mmio + PORT_FBS);
 		pp->fbs_last_dev = qc->dev->link->pmp;
 	}
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_AHCI_PROF)
+	if (enable_ahci_prof) {
+		ahci_last_qc_comp[qc->tag] = now;
+		ahci_last_qc_comp_flag[qc->tag] = 1;
+	}
+#endif
 
 	writel(1 << qc->tag, port_mmio + PORT_CMD_ISSUE);
 
@@ -2520,7 +2543,7 @@
 		}
 #endif
 	}
-#ifdef CONFIG_ARCH_GEN3
+#if defined(CONFIG_ARCH_GEN3)
 	pi->flags |= ATA_FLAG_SW_ACTIVITY;
 #endif
 }
diff -ur a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
--- a/drivers/ata/libata-core.c	2013-08-24 11:37:08.000000000 +0200
+++ b/drivers/ata/libata-core.c	2014-02-17 11:57:44.000000000 +0100
@@ -145,6 +145,12 @@
 extern int SYNO_CHECK_HDD_PRESENT(int index);
 #endif
 
+#if defined(CONFIG_ARCH_COMCERTO)
+extern unsigned char SYNOComcerto2kIsBoardNeedPowerUpHDD(u32);
+extern int SYNO_CTRL_HDD_POWERON(int index, int value);
+extern int SYNO_CHECK_HDD_PRESENT(int index);
+#endif
+
 /* param_buf is thrown away after initialization, disallow read */
 module_param_string(force, ata_force_param_buf, sizeof(ata_force_param_buf), 0);
 MODULE_PARM_DESC(force, "Force ATA configurations including cable type, link speed and transfer mode (see Documentation/kernel-parameters.txt for details)");
@@ -312,27 +318,6 @@
 	return dev;
 }
 
-#ifdef MY_ABC_HERE
-unsigned int uiCheckPortLinksFlags(struct ata_port *pAp) {
-	struct ata_link *pLink = NULL;
-	unsigned int uiFlags = 0x0;
-
-	if (!pAp) {
-		goto END;
-	}
-
-	ata_for_each_link(pLink, pAp, EDGE) {
-		if (pLink->uiSflags) {
-			ata_link_printk(pLink, KERN_ERR, "get error flags 0x%x\n", pLink->uiSflags);
-			uiFlags |= pLink->uiSflags;
-		}
-	}
-
-END:
-	return uiFlags;
-}
-#endif
-
 /**
  *	ata_dev_phys_link - find physical link for a device
  *	@dev: ATA device to look up physical link for
@@ -1919,8 +1904,27 @@
 	}
 
 #ifdef SYNO_SATA_PM_DEVICE_GPIO
-	if (IS_SYNO_PMP_CMD(tf) && !(ehc->i.action & ATA_EH_REVALIDATE))
+	if (IS_SYNO_PMP_CMD(tf) &&
+		(!(ehc->i.action & ATA_EH_REVALIDATE)))
+
 		return syno_ata_exec_internal_gpio(dev, tf, timeout);
+	else if (1 == dev->link->ap->PMSynoPowerDisable &&
+			 IS_SYNO_PMP_CMD(tf) &&
+			 (ehc->i.action & ATA_EH_REVALIDATE)) {
+		u16 reg;
+		u32 val;
+
+		ata_port_printk(dev->link->ap, KERN_ERR,
+				"Syno PMP cmd but REVALIDATE flag set, Manutil is running\n");
+
+		reg = ((tf->hob_feature << 8) | tf->feature);
+		val = ((tf->lbah << 24) | (tf->lbam << 16) | (tf->lbal << 8) | tf->nsect);
+
+		ata_port_printk(dev->link->ap, KERN_ERR, "Syno PMP cmd %x reg %x addr %x\n",
+				tf->command, reg, val);
+
+		return syno_ata_exec_internal_gpio(dev, tf, timeout);
+	}
 	else
 #endif
 	return ata_exec_internal_sg(dev, tf, cdb, dma_dir, psg, n_elem,
@@ -2361,7 +2365,7 @@
 	dev->horkage |= ata_dev_blacklisted(dev);
 #if defined(SYNO_SATA_PM_DEVICE_GPIO) && defined(MY_ABC_HERE)
 	if(ap->nr_pmp_links) {
-		if (0 == strncmp(gszSynoHWVersion, HW_DS1812p, strlen(HW_DS1812p)) &&
+		if (syno_is_hw_version(HW_DS1812p) &&
 				IS_SYNOLOGY_DX510(ap->PMSynoUnique) && (1 == ap->PMSynoCpldVer)) {
 
 			if (!(dev->horkage & ATA_HORKAGE_1_5_GBPS)) {
@@ -2373,13 +2377,13 @@
 
 		/*For DS412+, qoriq, 6282 with DX513, the link should be limited to 1.5G*/
 		} else if (IS_SYNOLOGY_DX513(ap->PMSynoUnique) &&
-				(0 == strncmp(gszSynoHWVersion, HW_DS412p, strlen(HW_DS412p)) ||
-				 0 == strncmp(gszSynoHWVersion, HW_DS112 , strlen(HW_DS112)) ||
-				 0 == strncmp(gszSynoHWVersion, HW_DS112pv10, strlen(HW_DS112pv10)) ||
-				 0 == strncmp(gszSynoHWVersion, HW_DS213pv10, strlen(HW_DS213pv10)) ||
-				 0 == strncmp(gszSynoHWVersion, HW_DS413, strlen(HW_DS413)) ||
-				 0 == strncmp(gszSynoHWVersion, HW_DS212pv10, strlen(HW_DS212pv10)) ||
-				 0 == strncmp(gszSynoHWVersion, HW_DS212pv20, strlen(HW_DS212pv20)))) {
+				(syno_is_hw_version(HW_DS412p) ||
+				 syno_is_hw_version(HW_DS112) ||
+				 syno_is_hw_version(HW_DS112pv10) ||
+				 syno_is_hw_version(HW_DS213pv10) ||
+				 syno_is_hw_version(HW_DS413) ||
+				 syno_is_hw_version(HW_DS212pv10) ||
+				 syno_is_hw_version(HW_DS212pv20))) {
 			if (!(dev->horkage & ATA_HORKAGE_1_5_GBPS)) {
 				ata_dev_printk(dev, KERN_ERR,
 						"DX513 workaround, limit the speed to 1.5 GBPS\n");
@@ -3504,12 +3508,6 @@
 		     ata_mode_string(ata_xfer_mode2mask(dev->xfer_mode)),
 		     dev_err_whine);
 
-#ifdef MY_ABC_HERE
-	if (ap->uiSflags & ATA_SYNO_FLAG_REVALID_FAIL) {
-		DBGMESG("port %d set mode sucessfully , clear revalid fail flag\n", ap->print_id);
-		ap->uiSflags &= ~ATA_SYNO_FLAG_REVALID_FAIL;
-	}
-#endif
 	return 0;
 
  fail:
@@ -4007,6 +4005,10 @@
 {
 	u32 scontrol;
 	int rc;
+#if defined(CONFIG_SYNO_COMCERTO)
+	int try_count=0;
+	u32 sstatus;
+#endif
 
 	DPRINTK("ENTER\n");
 
@@ -4030,12 +4032,21 @@
 		sata_set_spd(link);
 	}
 
+#if defined(CONFIG_SYNO_COMCERTO)
+keep_trying:
+#endif
+
 	/* issue phy wake/reset */
 	if ((rc = sata_scr_read(link, SCR_CONTROL, &scontrol)))
 		goto out;
 
 	scontrol = (scontrol & 0x0f0) | 0x301;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	// Limit the max speed to 3.0 Gbps
+	scontrol = (scontrol & ~(0xf0)) | 0x20;
+#endif
+
 	if ((rc = sata_scr_write_flush(link, SCR_CONTROL, scontrol)))
 		goto out;
 
@@ -4048,6 +4059,24 @@
 	rc = sata_link_resume(link, timing, deadline);
 	if (rc)
 		goto out;
+
+#if defined(CONFIG_SYNO_COMCERTO)
+	try_count++;
+	sata_scr_read(link, SCR_STATUS, &sstatus);
+
+	//Check if PHY not ready
+	if (((sstatus & 0xf) == 0x1) && (try_count < 7))
+	{
+		printk("!!!!!!!!!!! PHY Not Ready : SStatus 0x%x !!!!!!!!!!!\n",sstatus);
+		goto keep_trying;
+	}
+	else
+	{
+		if((sstatus & 0xf) == 0x3)
+		printk("!!!!!!!!!!! PHY Ready : SStatus 0x%x !!!!!!!!!!!\n",sstatus);
+	}
+#endif
+
 	/* if link is offline nothing more to do */
 	if (ata_phys_link_offline(link))
 		goto out;
@@ -4079,26 +4108,11 @@
 	if (check_ready)
 		rc = ata_wait_ready(link, deadline, check_ready);
  out:
-#ifdef MY_ABC_HERE
-	if (0 < link->ap->iFakeError) {
-		ata_link_printk(link, KERN_ERR, "generate fake error, Fake count %d\n", link->ap->iFakeError);
-		if (SYNO_ERROR_MAX > link->ap->iFakeError) {
-			--(link->ap->iFakeError);
-		}
-		rc = -EBUSY;
-	}
-#endif
 	if (rc && rc != -EAGAIN) {
 		/* online is set iff link is online && reset succeeded */
 		if (online)
 			*online = false;
 		ata_link_err(link, "COMRESET failed (errno=%d)\n", rc);
-#ifdef MY_ABC_HERE
-		if (-EBUSY == rc || -EIO == rc) {
-			ata_link_printk(link, KERN_ERR, "COMRESET fail, set COMRESET fail flag\n");
-			link->uiSflags |= ATA_SYNO_FLAG_COMRESET_FAIL;
-		}
-#endif
 	}
 #ifdef MY_ABC_HERE
 	link->uiStsFlags &= ~SYNO_STATUS_IS_SIL3132PM;
@@ -4336,12 +4350,6 @@
 	dev->n_sectors = n_sectors;
  fail:
 	ata_dev_err(dev, "revalidation failed (errno=%d)\n", rc);
-#ifdef MY_ABC_HERE
-	if (-EIO == rc) {
-		DBGMESG("port %d revalidation failed, set revalid fail flag\n", dev->link->ap->print_id);
-		dev->link->ap->uiSflags |= ATA_SYNO_FLAG_REVALID_FAIL;
-	}
-#endif
 	return rc;
 }
 
@@ -5023,6 +5031,14 @@
 #endif /* __BIG_ENDIAN */
 }
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_AHCI_PROF)
+unsigned int ahci_qc_comp_counter[33];
+struct timeval ahci_last_qc_comp[32];
+unsigned int ahci_last_qc_comp_flag[32];
+unsigned int ahci_qc_no_free_slot = 0;
+extern unsigned int enable_ahci_prof;
+#endif
+
 /**
  *	ata_qc_new - Request an available ATA command, for queueing
  *	@ap: target port
@@ -5050,6 +5066,13 @@
 	if (qc)
 		qc->tag = i;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_AHCI_PROF)
+	if (enable_ahci_prof)
+		if (qc == NULL) {
+			ahci_qc_no_free_slot++;
+		}
+#endif
+
 	return qc;
 }
 
@@ -5093,6 +5116,11 @@
 	struct ata_port *ap;
 	unsigned int tag;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_AHCI_PROF)
+	struct timeval now;
+	int diff_time_ms;
+#endif
+
 	WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */
 	ap = qc->ap;
 
@@ -5101,6 +5129,26 @@
 	if (likely(ata_tag_valid(tag))) {
 		qc->tag = ATA_TAG_POISON;
 		clear_bit(tag, &ap->qc_allocated);
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_AHCI_PROF)
+	if (enable_ahci_prof) {
+		if (ahci_last_qc_comp_flag[tag]) {
+			int inx = 32;
+
+			do_gettimeofday(&now);
+
+			diff_time_ms = ((now.tv_sec - ahci_last_qc_comp[tag].tv_sec) * 1000) + 
+                                ((now.tv_usec - ahci_last_qc_comp[tag].tv_usec) / 1000);
+
+			if (diff_time_ms < 512) 
+				inx = diff_time_ms >> 4;
+
+			ahci_qc_comp_counter[inx]++;
+
+			ahci_last_qc_comp_flag[tag] = 0;
+		}
+	}
+#endif
 	}
 }
 
@@ -5767,9 +5815,6 @@
 	link->pmp = pmp;
 	link->active_tag = ATA_TAG_POISON;
 	link->hw_sata_spd_limit = UINT_MAX;
-#ifdef MY_ABC_HERE
-	link->uiSflags = 0x0;
-#endif
 
 	/* can't use iterator, ap isn't initialized yet */
 	for (i = 0; i < ATA_MAX_DEVICES; i++) {
@@ -5812,7 +5857,11 @@
 		return rc;
 
 	spd = (link->saved_scontrol >> 4) & 0xf;
+#ifdef MY_ABC_HERE
+	if (spd && !(link->ap->PMSynoUnique && IS_SYNOLOGY_RX410(link->ap->PMSynoUnique)))
+#else
 	if (spd)
+#endif
 		link->hw_sata_spd_limit &= (1 << spd) - 1;
 
 	ata_force_link_limits(link);
@@ -5881,13 +5930,6 @@
 	ap->stats.idle_irq = 1;
 #endif
 #ifdef MY_ABC_HERE
-	ap->uiSflags = 0x0;
-	ap->iFakeError = 0;
-	ap->iDetectStat = 0;
-	INIT_WORK(&ap->SendPwrResetEventTask, SendPwrResetEvent);
-	INIT_WORK(&ap->SendPortDisEventTask, SendPortDisEvent);
-#endif
-#ifdef MY_ABC_HERE
 	init_completion(&(ap->synoHotplugWait));
 #endif
 	ata_sff_port_init(ap);
@@ -6308,10 +6350,10 @@
 
 #if defined(CONFIG_SYNO_ARMADA)
 	if(SYNOArmadaIsBoardNeedPowerUpHDD(pAp->print_id)) {
+		SYNO_CTRL_HDD_POWERON(pAp->print_id, 1);
 		if (0 == SYNO_CHECK_HDD_PRESENT(pAp->print_id)) {
 			goto END;
 		}
-		SYNO_CTRL_HDD_POWERON(pAp->print_id, 1);
 		SleepForLatency();
 		iIsDoLatency = 1;
 	}
@@ -6319,10 +6361,21 @@
 
 #if defined(CONFIG_ARCH_GEN3)
 	if(SYNOEvansportIsBoardNeedPowerUpHDD(pAp->print_id)) {
+		SYNO_CTRL_HDD_POWERON(pAp->print_id, 1);
 		if (0 == SYNO_CHECK_HDD_PRESENT(pAp->print_id)) {
 			goto END;
 		}
+		SleepForLatency();
+		iIsDoLatency = 1;
+	}
+#endif
+
+#if defined(CONFIG_ARCH_COMCERTO)
+	if(SYNOComcerto2kIsBoardNeedPowerUpHDD(pAp->print_id)) {
 		SYNO_CTRL_HDD_POWERON(pAp->print_id, 1);
+		if (0 == SYNO_CHECK_HDD_PRESENT(pAp->print_id)) {
+			goto END;
+		}
 		SleepForLatency();
 		iIsDoLatency = 1;
 	}
@@ -6334,12 +6387,12 @@
 		SleepForHD(pAp->print_id);
 #else
 		    /* 710+, 411+ is also power on each HD ports every 7s, so we use old delay 10s */
-		if (0 == strncmp(gszSynoHWVersion, HW_DS710p, strlen(HW_DS710p)) ||
-			0 == strncmp(gszSynoHWVersion, HW_DS411p, strlen(HW_DS411p)) ||
-			0 == strncmp(gszSynoHWVersion, HW_DS411pII, strlen(HW_DS411pII)) ||
-			0 == strncmp(gszSynoHWVersion, HW_DS409, strlen(HW_DS409)) ||
-			0 == strncmp(gszSynoHWVersion, HW_DS410j, strlen(HW_DS410j)) ||
-			0 == strncmp(gszSynoHWVersion, HW_DS411j, strlen(HW_DS411j))) {
+		if (syno_is_hw_version(HW_DS710p) ||
+			syno_is_hw_version(HW_DS411p) ||
+			syno_is_hw_version(HW_DS411pII) ||
+			syno_is_hw_version(HW_DS409) ||
+			syno_is_hw_version(HW_DS410j) ||
+			syno_is_hw_version(HW_DS411j)) {
 			SleepForHD(pAp->print_id);
 		} else {
 			/* New model needn't dely 10s, so we speed it up useing new SleepForHW function */
@@ -7352,12 +7405,14 @@
 EXPORT_SYMBOL_GPL(ata_cable_ignore);
 EXPORT_SYMBOL_GPL(ata_cable_sata);
 
-#ifdef MY_ABC_HERE
 int (*funcSYNOSendDiskResetPwrEvent)(unsigned int, unsigned int) = NULL;
 EXPORT_SYMBOL(funcSYNOSendDiskResetPwrEvent);
 int (*funcSYNOSendDiskPortDisEvent)(unsigned int, unsigned int) = NULL;
 EXPORT_SYMBOL(funcSYNOSendDiskPortDisEvent);
-#endif /* MY_ABC_HERE */
+int (*funcSYNOSataErrorReport)(unsigned int, unsigned int, unsigned int, unsigned int, unsigned int) = NULL;
+EXPORT_SYMBOL(funcSYNOSataErrorReport);
+int (*funcSYNODeepSleepEvent)(unsigned int, unsigned int) = NULL;
+EXPORT_SYMBOL(funcSYNODeepSleepEvent);
 
 #ifdef MY_DEF_HERE
 int (*funcSYNOSendEboxRefreshEvent)(int portIndex) = NULL;
@@ -7367,3 +7422,4 @@
 #ifdef MY_ABC_HERE
 EXPORT_SYMBOL_GPL(ata_dev_set_feature);
 #endif
+
diff -ur a/drivers/ata/libata-eh.c b/drivers/ata/libata-eh.c
--- a/drivers/ata/libata-eh.c	2013-08-24 11:37:08.000000000 +0200
+++ b/drivers/ata/libata-eh.c	2014-02-17 11:57:44.000000000 +0100
@@ -100,11 +100,6 @@
 #endif
 
 
-#ifdef MY_ABC_HERE
-extern int (*funcSYNOSendDiskResetPwrEvent)(unsigned int, unsigned int);
-extern int (*funcSYNOSendDiskPortDisEvent)(unsigned int, unsigned int);
-#endif
-
 /* The following table determines how we sequence resets.  Each entry
  * represents timeout for that try.  The first try can be soft or
  * hardreset.  All others are hardreset if available.  In most cases
@@ -587,27 +582,6 @@
 	spin_unlock_irqrestore(ap->lock, flags);
 }
 
-#ifdef MY_ABC_HERE
-void SendPwrResetEvent(struct work_struct *work)
-{
-	if (funcSYNOSendDiskResetPwrEvent) {
-		funcSYNOSendDiskResetPwrEvent(0, 0);
-	}
-
-	return;
-}
-
-void SendPortDisEvent(struct work_struct *work)
-{
-	if (funcSYNOSendDiskPortDisEvent) {
-		funcSYNOSendDiskPortDisEvent(0, 0);
-	}
-
-	return;
-}
-
-#endif
-
 /**
  *	ata_scsi_error - SCSI layer error handler callback
  *	@host: SCSI host on which error occurred
@@ -753,30 +727,6 @@
 void ata_scsi_port_error_handler(struct Scsi_Host *host, struct ata_port *ap)
 {
 	unsigned long flags;
-#ifdef MY_ABC_HERE
-	int iDetectTries = ATA_EH_PMP_TRIES;
-	int iForceDetect = 0;
-	unsigned int uiStatStart = 0x0;
-	unsigned int uiStatEnd = 0x0;
-#endif
-
-#ifdef MY_ABC_HERE
-	if (ap->iDetectStat) {
-		struct ata_link *link = NULL;
-		struct ata_device *dev = NULL;
-		int i = 0;
-		ata_for_each_link(link, ap, EDGE) {
-			ata_for_each_dev(dev, link, ALL) {
-				if(!(dev->ulSflags)) {
-					uiStatStart |= (ata_dev_enabled(dev)) << i;
-				}
-				++i;
-			}
-		}
-		DBGMESG("ata%u: detect stat 0x%x", ap->print_id, uiStatStart);
-	}
-#endif
-
 #ifdef SYNO_SATA_PM_DEVICE_GPIO
 	spin_lock_irqsave(ap->lock, flags);
 	while(ap->pflags & ATA_PFLAG_PMP_PMCTL) {
@@ -791,9 +741,6 @@
 	if (ap->ops->error_handler) {
 		struct ata_link *link;
 
-#ifdef MY_ABC_HERE
-acquire_repeat:
-#endif
 		/* acquire EH ownership */
 		ata_eh_acquire(ap);
  repeat:
@@ -880,129 +827,6 @@
 
 	scsi_eh_flush_done_q(&ap->eh_done_q);
 
-#ifdef MY_ABC_HERE
-	if (ap->iDetectStat) {
-		if (!(ap->pflags & ATA_PFLAG_FROZEN)) {
-			ap->iDetectStat = 0;
-			spin_lock_irqsave(ap->lock, flags);
-			if (ap->uiSflags & ATA_SYNO_FLAG_FORCE_RETRY) {
-				DBGMESG("ata%u: clear ATA_SYNO_FLAG_FORCE_RETRY\n", ap->print_id);
-				ap->uiSflags &= ~ATA_SYNO_FLAG_FORCE_RETRY;
-			}
-			spin_unlock_irqrestore(ap->lock, flags);
-		} else {
-			struct ata_link *link = NULL;
-			struct ata_device *dev = NULL;
-			int i = 0;
-
-			ata_for_each_link(link, ap, EDGE) {
-				ata_for_each_dev(dev, link, ALL) {
-					if(!(dev->ulSflags)) {
-						uiStatEnd |= (ata_dev_enabled(dev)) << i;
-					}
-					++i;
-				}
-			}
-			spin_lock_irqsave(ap->lock, flags);
-			if (uiStatStart == uiStatEnd) {
-				/* We received plugged/un-plugged events, but the status is still the same.
-				 * No device plugged/un-plugged but it frozen, we think it's a abnormal status */
-				ata_port_printk(ap, KERN_ERR, "detect abnormal stat 0x%x\n", uiStatEnd);
-				ap->uiSflags |= ATA_SYNO_FLAG_FORCE_RETRY;
-			} else {
-				ata_port_printk(ap, KERN_ERR, "didn't detect abnormal stat, but port frozen \n");
-				ap->iDetectStat = 0;
-				if (ap->uiSflags & ATA_SYNO_FLAG_FORCE_RETRY) {
-					ap->uiSflags &= ~ATA_SYNO_FLAG_FORCE_RETRY;
-				}
-			}
-			spin_unlock_irqrestore(ap->lock, flags);
-		}
-	}
-
-	spin_lock_irqsave(ap->lock, flags);
-	if (ap->uiSflags) {
-		iForceDetect = 1;
-		ap->eh_tries = 1; /* FIXME: set eh_tries to 1 to prevent it retry recursively */
-	}
-	spin_unlock_irqrestore(ap->lock, flags);
-
-	if (iForceDetect) {
-		iForceDetect = 0;
-		if (SYNO_ERROR_TILL_TO_FORCE == ap->iFakeError) {
-			DBGMESG("port %d unset Fake Error\n", ap->print_id);
-			ap->iFakeError = 0;
-		}
-		if (0 < iDetectTries) {
-			ata_port_printk(ap, KERN_ERR, "do detect tries %d\n", iDetectTries);
-			if (ap->ops->syno_force_intr) {
-				/* set force bit to force it occur fake sw plugged */
-				spin_lock_irqsave(ap->lock, flags);
-				ap->uiSflags |= ATA_SYNO_FLAG_FORCE_INTR;
-				spin_unlock_irqrestore(ap->lock, flags);
-				ap->ops->syno_force_intr(ap);
-			}
-			--iDetectTries;
-			goto acquire_repeat;
-		}
-	}
-	spin_lock_irqsave(ap->lock, flags);
-	if (!ap->uiSflags) {
-		/* FIXME: I can't find another properly place to clear them.
-		 * So I clear them here when EH complete and no error flags */
-		struct ata_link *link = NULL;
-		struct ata_device *dev = NULL;
-		/* if no our error flag, clear dev flags */
-		ata_for_each_link(link, ap, EDGE) {
-			ata_for_each_dev(dev, link, ALL) {
-				dev->ulSflags = 0;
-			}
-		}
-	} else {
-		struct ata_link *link = NULL;
-		struct ata_device *dev = NULL;
-
-		/* clear port error flags */
-		DBGMESG("ata%u: detect error flags 0x%x\n", ap->print_id, ap->uiSflags);
-		ap->uiSflags = 0;
-
-		/* if had on our action flag, we must take action now. Some action may cause deadlock (ex.detach),
-		 * so we must unlock now. */
-		spin_unlock_irqrestore(ap->lock, flags);
-		ata_for_each_link(link, ap, EDGE) {
-			link->uiSflags = 0;
-			ata_for_each_dev(dev, link, ALL) {
-#ifdef SYNO_SATA_PM_DEVICE_GPIO
-				if (dev->ulSflags & ATA_SYNO_DFLAG_PMP_DETACH) {
-					ata_dev_printk(dev, KERN_WARNING,
-							"force pmp detach\n");
-					sata_pmp_detach(dev);
-				}
-#endif
-				if (dev->ulSflags & ATA_SYNO_DFLAG_DETACH) {
-					ata_dev_printk(dev, KERN_WARNING,
-							"force dev detach\n");
-					ata_eh_detach_dev(dev);
-				}
-				if (dev->ulSflags & ATA_SYNO_DFLAG_DISABLE) {
-					ata_dev_printk(dev, KERN_WARNING,
-							"force dev disable\n");
-					ata_dev_disable(dev);
-				}
-				dev->ulSflags = 0;
-			}
-		}
-		spin_lock_irqsave(ap->lock, flags);
-	}
-	spin_unlock_irqrestore(ap->lock, flags);
-	if (ap->pflags & ATA_PFLAG_FROZEN) {
-		ata_port_printk(ap, KERN_ERR, "send port disabled event\n");
-		/* send event */
-		schedule_work(&(ap->SendPortDisEventTask));
-	}
-#endif /* MY_ABC_HERE */
-
-
 	/* clean up */
 	spin_lock_irqsave(ap->lock, flags);
 
@@ -1557,17 +1381,6 @@
 	if (!ata_dev_enabled(dev))
 		return;
 
-#ifdef MY_ABC_HERE
-	if ((dev->link->uiSflags || (dev->link->ap->uiSflags & ATA_SYNO_FLAG_GSCR_FAIL))
-		&& ata_dev_enabled(dev)) {
-		ata_dev_printk(dev, KERN_WARNING,
-					   "still have recovery flags link 0x%x ap 0x%x, don't disabled it\n", dev->link->uiSflags, dev->link->ap->uiSflags);
-		dev->ulSflags |= ATA_SYNO_DFLAG_DISABLE;
-		return;
-	}
-	dev->ulSflags &= ~ATA_SYNO_DFLAG_DISABLE;
-#endif
-
 	if (ata_msg_drv(dev->link->ap))
 		ata_dev_warn(dev, "disabled\n");
 	ata_acpi_on_disable(dev);
@@ -1596,17 +1409,6 @@
 	struct ata_eh_context *ehc = &link->eh_context;
 	unsigned long flags;
 
-#ifdef MY_ABC_HERE
-	if ((dev->link->uiSflags || (dev->link->ap->uiSflags & ATA_SYNO_FLAG_GSCR_FAIL))
-		&& ata_dev_enabled(dev)) {
-		ata_dev_printk(dev, KERN_WARNING,
-					   "still have recovery flags link 0x%x ap 0x%x, don't detach it\n", dev->link->uiSflags, dev->link->ap->uiSflags);
-		dev->ulSflags |= ATA_SYNO_DFLAG_DETACH;
-		return;
-	}
-	dev->ulSflags &= ~ATA_SYNO_DFLAG_DETACH;
-#endif
-
 	ata_dev_disable(dev);
 
 	spin_lock_irqsave(ap->lock, flags);
@@ -2784,6 +2586,10 @@
 		  ehc->i.serror & SERR_DEV_XCHG ? "DevExch " : "");
 #endif
 
+#ifdef SYNO_SATA_ERROR_REPORT
+	link->uiSError = ehc->i.serror;
+#endif
+
 	for (tag = 0; tag < ATA_MAX_QUEUE; tag++) {
 		struct ata_queued_cmd *qc = __ata_qc_from_tag(ap, tag);
 		struct ata_taskfile *cmd = &qc->tf, *res = &qc->result_tf;
@@ -2871,7 +2677,16 @@
 			  res->feature & ATA_IDNF ? "IDNF " : "",
 			  res->feature & ATA_ABORTED ? "ABRT " : "");
 #endif
+#ifdef SYNO_SATA_ERROR_REPORT
+		if (cmd->command != ATA_CMD_PACKET &&
+		    (res->feature & (ATA_ICRC | ATA_UNC | ATA_IDNF | ATA_ABORTED))) {
+			link->uiError = res->feature & (ATA_ICRC | ATA_UNC | ATA_IDNF | ATA_ABORTED);
+		}
+#endif
 	}
+#ifdef SYNO_SATA_ERROR_REPORT
+	schedule_work(&(link->SendSataErrEventTask));
+#endif
 }
 
 /**
@@ -3260,19 +3075,6 @@
 	}
 #endif
 
-#ifdef MY_ABC_HERE
-	spin_lock_irqsave(ap->lock, flags);
-	if (!rc && link->uiSflags) {
-		/* GSCR is pmp fail flag, we shouldn't clear it here */
-		if (link->uiSflags & ATA_SYNO_FLAG_GSCR_FAIL) {
-			link->uiSflags = ATA_SYNO_FLAG_GSCR_FAIL;
-		} else {
-			ata_link_printk(link, KERN_ERR, "link reset sucessfully clear error flags\n");
-			link->uiSflags = 0;
-		}
-	}
-	spin_unlock_irqrestore(ap->lock, flags);
-#endif
 	return rc;
 
  fail:
@@ -3528,12 +3330,6 @@
 		ehc->i.flags |= ATA_EHI_SETMODE;
 	}
 
-#ifdef MY_ABC_HERE
-	if (ap->uiSflags & ATA_SYNO_FLAG_REVALID_FAIL) {
-		DBGMESG("port %d revalid sucessfully , clear revalid fail flag\n", ap->print_id);
-		ap->uiSflags &= ~ATA_SYNO_FLAG_REVALID_FAIL;
-	}
-#endif
 	return 0;
 
  err:
@@ -4128,27 +3924,13 @@
 	ata_for_each_link(link, ap, EDGE) {
 		struct ata_eh_context *ehc = &link->eh_context;
 
-#ifdef MY_ABC_HERE
-		if (0 >= ap->iFakeError && !(ehc->i.action & ATA_EH_RESET))
-#else
 		if (!(ehc->i.action & ATA_EH_RESET))
-#endif
 			continue;
 
 		rc = ata_eh_reset(link, ata_link_nr_vacant(link),
 				  prereset, softreset, hardreset, postreset);
 		if (rc) {
 			ata_link_err(link, "reset failed, giving up\n");
-#ifdef MY_ABC_HERE
-			if (link->uiSflags) {
-				ata_for_each_dev(dev, link, ALL) {
-					if (ATA_DEV_ATA == dev->class) {
-						dev->ulSflags |= ATA_SYNO_DFLAG_DETACH;
-						ata_dev_printk(dev, KERN_ERR, "detect reset link fail, set detach flag\n");
-					}
-				}
-			}
-#endif
 			goto out;
 		}
 	}
@@ -4314,13 +4096,6 @@
 	if (rc && r_failed_link)
 		*r_failed_link = link;
 
-#ifdef MY_ABC_HERE
-	/* if not pmp, set link flags to ata port flags for ata port error handling.
-	 * pmp handler will handle pmp case by itself */
-	if (!ap->nr_pmp_links) {
-		ap->uiSflags = uiCheckPortLinksFlags(ap);
-	}
-#endif
 	DPRINTK("EXIT, rc=%d\n", rc);
 	return rc;
 }
diff -ur a/drivers/ata/libata.h b/drivers/ata/libata.h
--- a/drivers/ata/libata.h	2013-08-24 11:37:08.000000000 +0200
+++ b/drivers/ata/libata.h	2014-02-17 11:57:45.000000000 +0100
@@ -169,11 +169,6 @@
 extern void ata_qc_schedule_eh(struct ata_queued_cmd *qc);
 extern void ata_dev_disable(struct ata_device *dev);
 extern void ata_eh_detach_dev(struct ata_device *dev);
-#ifdef MY_ABC_HERE
-extern void sata_pmp_detach(struct ata_device *dev);
-extern void SendPwrResetEvent(struct work_struct *work);
-extern void SendPortDisEvent(struct work_struct *work);
-#endif
 extern void ata_eh_about_to_do(struct ata_link *link, struct ata_device *dev,
 			       unsigned int action);
 extern void ata_eh_done(struct ata_link *link, struct ata_device *dev,
@@ -201,6 +196,7 @@
 extern int sata_pmp_set_lpm(struct ata_link *link, enum ata_lpm_policy policy,
 			    unsigned hints);
 extern int sata_pmp_attach(struct ata_device *dev);
+extern int syno_libata_pmp_deepsleep_indicator_set(struct ata_port *ap, const int blCLR);
 #else /* CONFIG_SATA_PMP */
 static inline int sata_pmp_scr_read(struct ata_link *link, int reg, u32 *val)
 {
diff -ur a/drivers/ata/libata-pmp.c b/drivers/ata/libata-pmp.c
--- a/drivers/ata/libata-pmp.c	2013-08-24 11:37:08.000000000 +0200
+++ b/drivers/ata/libata-pmp.c	2014-02-17 11:57:45.000000000 +0100
@@ -16,9 +16,6 @@
 #include <linux/slab.h>
 #include "libata.h"
 #include "libata-transport.h"
-#ifdef MY_ABC_HERE
-#include "ahci.h"
-#endif
 
 #ifdef MY_DEF_HERE
 extern int (*funcSYNOSendEboxRefreshEvent)(int portIndex);
@@ -74,7 +71,6 @@
 	return 0;
 }
 
-
 /**
  *	sata_pmp_write - write PMP register
  *	@link: link to write PMP register for
@@ -742,7 +738,6 @@
 syno_pmp_ports_num(struct ata_port *ap)
 {
 	u32 ret = 1;
-	struct ahci_host_priv *hpriv = ap->host->private_data;
 
 	if (syno_is_synology_pm(ap)) {
 		ret = sata_pmp_gscr_ports(ap->link.device->gscr);
@@ -759,7 +754,7 @@
 
 #ifdef MY_ABC_HERE
 		/* Block sata 6Gbps host + sata 3Gbps expansion unit case*/
-		if (syno_pm_is_synology_3xxx(ap) && (hpriv->flags & AHCI_HFLAG_YES_MV9235_FIX)){
+		if (syno_pm_is_synology_3xxx(ap) && (ap->link.uiStsFlags & SYNO_STATUS_IS_MV9235)) {
 			ata_port_printk(ap, KERN_ERR, "This expansion unit is unsupported\n");
 			ret = 0;
 		}
@@ -805,6 +800,49 @@
 	syno_libata_pm_power_ctl(ap, 1, 1);
 }
 
+void
+syno_9705_workaround(struct ata_port *ap)
+{
+	struct Scsi_Host *pMaster_host = NULL;
+	struct ata_port *pAp_master = NULL;
+	int i = 0;
+
+	for (i = 1; i < ata_print_id; i++) {
+		if (NULL == (pMaster_host = scsi_host_lookup(i - 1))) {
+			continue;
+		}
+
+		if (NULL == (pAp_master = ata_shost_to_port(pMaster_host))) {
+			goto CONTINUE_FOR;
+		}
+
+		if (ap->host == pAp_master->host || ap->port_no == pAp_master->port_no) {
+			if (ap->PMSynoUnique != pAp_master->PMSynoUnique) {
+				if (syno_pm_is_synology_9705(pAp_master)) {
+					ata_port_printk(ap, KERN_ERR,
+							"replace unique %x with master unique %x\n",
+							ap->PMSynoUnique, pAp_master->PMSynoUnique);
+					ap->PMSynoUnique = pAp_master->PMSynoUnique;
+				} else {
+					ata_port_printk(ap, KERN_ERR,
+							"WARNING : master unique is not syno 9705, don't replace\n");
+				}
+
+				break;
+			}
+		}
+
+CONTINUE_FOR:
+		scsi_host_put(pMaster_host);
+		pMaster_host = NULL;
+		pAp_master = NULL;
+	}
+
+	if (NULL != pMaster_host) {
+		scsi_host_put(pMaster_host);
+	}
+}
+
 int
 syno_libata_pm_power_ctl(struct ata_port *ap, u8 blPowerOn, u8 blCustomInfo)
 {
@@ -850,6 +888,10 @@
 		} else if (syno_pm_is_9705(sata_pmp_gscr_vendor(ap->link.device->gscr),
 								   sata_pmp_gscr_devid(ap->link.device->gscr))) {
 			ap->PMSynoUnique = pm_pkg.var & 0x1f;
+
+			if (!syno_pm_is_synology_9705(ap)) {
+				syno_9705_workaround(ap);
+			}
 		}
 	}
 
@@ -868,10 +910,8 @@
 	for (iRetry = 0; blPowerOn ^ syno_pm_is_poweron(ap)
 					 && iRetry < SYNO_PMP_PWR_TRIES; ++iRetry) {
 
-		if (!blPowerOn) {
-			if (syno_sata_pmp_check_powerbtn(ap)) {
-				printk("check Eunit port %d power button fail\n", ap->print_id);
-			}
+		if (syno_sata_pmp_check_powerbtn(ap)) {
+			printk("check Eunit port %d power button fail\n", ap->print_id);
 		}
 
 		syno_pm_poweron_pkg_init(sata_pmp_gscr_vendor(ap->link.device->gscr),
@@ -1101,19 +1141,6 @@
 		if (err_mask) {
 			ata_dev_err(dev, "failed to read PMP GSCR[%d] (Emask=0x%x)\n",
 				    reg, err_mask);
-#ifdef MY_ABC_HERE
-			if ((AC_ERR_OTHER == err_mask || AC_ERR_SYSTEM == err_mask) &&
-				(dev->link->ap->pflags & ATA_PFLAG_FROZEN) &&
-				ata_phys_link_online(dev->link) && ata_is_host_link(dev->link)) {
-				ata_link_printk(dev->link, KERN_INFO,
-								"!!!!!!!!!!link still online, wait 7000ms and thaw for hw ready\n");
-				ata_eh_thaw_port(dev->link->ap);
-				schedule_timeout_uninterruptible(7UL*HZ);
-				/* set link and port flags to prevent pmp detach */
-				dev->link->uiSflags |= ATA_SYNO_FLAG_GSCR_FAIL;
-				dev->link->ap->uiSflags |= ATA_SYNO_FLAG_GSCR_FAIL;
-			}
-#endif
 			return -EIO;
 		}
 	}
@@ -1426,7 +1453,7 @@
 	syno_prepare_custom_info(ap);
 #ifdef MY_ABC_HERE
 	/*For DS1812+ with older version of DX510, the link should be limited to 1.5G*/
-	if (0 == strncmp(gszSynoHWVersion, HW_DS1812p, strlen(HW_DS1812p))) {
+	if (syno_is_hw_version(HW_DS1812p)) {
 		/* The old version should be b000 */
 		if(IS_SYNOLOGY_DX510(ap->PMSynoUnique) && (1 == ap->PMSynoCpldVer)) {
 			target = 1;
@@ -1441,13 +1468,13 @@
 		}
 	/*For DS412+, qoriq, 6282 with DX513, the link should be limited to 1.5G*/
 	} else if (IS_SYNOLOGY_DX513(ap->PMSynoUnique) &&
-			(0 == strncmp(gszSynoHWVersion, HW_DS412p, strlen(HW_DS412p)) ||
-			 0 == strncmp(gszSynoHWVersion, HW_DS112 , strlen(HW_DS112)) ||
-			 0 == strncmp(gszSynoHWVersion, HW_DS112pv10, strlen(HW_DS112pv10)) ||
-			 0 == strncmp(gszSynoHWVersion, HW_DS213pv10, strlen(HW_DS213pv10)) ||
-			 0 == strncmp(gszSynoHWVersion, HW_DS413, strlen(HW_DS413)) ||
-			 0 == strncmp(gszSynoHWVersion, HW_DS212pv10, strlen(HW_DS212pv10)) ||
-			 0 == strncmp(gszSynoHWVersion, HW_DS212pv20, strlen(HW_DS212pv20)))) {
+			(syno_is_hw_version(HW_DS412p)    ||
+			 syno_is_hw_version(HW_DS112)     ||
+			 syno_is_hw_version(HW_DS112pv10) ||
+			 syno_is_hw_version(HW_DS213pv10) ||
+			 syno_is_hw_version(HW_DS413)     ||
+			 syno_is_hw_version(HW_DS212pv10) ||
+			 syno_is_hw_version(HW_DS212pv20))) {
 		target = 1;
 		target_limit = (1 << target) - 1;
 
@@ -1459,11 +1486,11 @@
 		}
 	/*For DS412+, qoriq, 212p with and DX213, the link should be limited to 1.5G*/
 	} else if (IS_SYNOLOGY_DX213(ap->PMSynoUnique) &&
-			(0 == strncmp(gszSynoHWVersion, HW_DS412p, strlen(HW_DS412p)) ||
-			 0 == strncmp(gszSynoHWVersion, HW_DS213pv10, strlen(HW_DS213pv10)) ||
-			 0 == strncmp(gszSynoHWVersion, HW_DS413, strlen(HW_DS413)) ||
-			 0 == strncmp(gszSynoHWVersion, HW_DS212pv10, strlen(HW_DS212pv10)) ||
-			 0 == strncmp(gszSynoHWVersion, HW_DS212pv20, strlen(HW_DS212pv20)))) {
+			(syno_is_hw_version(HW_DS412p)     ||
+			 syno_is_hw_version(HW_DS213pv10)  ||
+			 syno_is_hw_version(HW_DS413)      ||
+			 syno_is_hw_version(HW_DS212pv10)  ||
+			 syno_is_hw_version(HW_DS212pv20))) {
 		target = 1;
 		target_limit = (1 << target) - 1;
 
@@ -1533,11 +1560,7 @@
  *	LOCKING:
  *	Kernel thread context (may sleep).
  */
-#ifdef MY_ABC_HERE
-void sata_pmp_detach(struct ata_device *dev)
-#else
 static void sata_pmp_detach(struct ata_device *dev)
-#endif
 {
 	struct ata_link *link = dev->link;
 	struct ata_port *ap = link->ap;
@@ -1550,20 +1573,10 @@
 		link->pmp != SATA_PMP_CTRL_PORT);
 
 #ifdef MY_ABC_HERE
-	if ((dev->link->uiSflags || dev->link->ap->uiSflags) && ata_dev_enabled(dev)) {
-		ata_dev_printk(dev, KERN_WARNING,
-				"still have recovery flags link 0x%x ap 0x%x, don't detach this pmp dev\n", dev->link->uiSflags, dev->link->ap->uiSflags);
-		dev->ulSflags |= ATA_SYNO_DFLAG_PMP_DETACH;
-		/*FIXME: set detach flag, copy form ata_eh_detach_dev */
-		ata_for_each_link(tlink, ap, EDGE) {
-			tlink->device->ulSflags |= ATA_SYNO_DFLAG_DETACH;
-		}
-		return;
-	}
-	dev->ulSflags &= ~ATA_SYNO_DFLAG_PMP_DETACH;
 	ata_for_each_link(tlink, ap, EDGE) {
-		DBGMESG("ata%u: do pmp detach, clear all link uiSflags\n", dev->link->ap->print_id);
-		tlink->uiSflags = 0;
+		unsigned int *classes = tlink->eh_context.classes;
+		struct ata_device *tdev = tlink->device;
+		classes[tdev->devno] = ATA_DEV_UNKNOWN;
 	}
 #endif
 #ifdef SYNO_SATA_PM_DEVICE_GPIO
@@ -1797,9 +1810,6 @@
 	int tries = ATA_EH_PMP_TRIES;
 	int detach = 0, rc = 0;
 	int reval_failed = 0;
-#ifdef MY_ABC_HERE
-	unsigned int uiSflags = 0x0;
-#endif
 
 	DPRINTK("ENTER\n");
 
@@ -1870,31 +1880,9 @@
 	ehc->i.flags = 0;
 
 	DPRINTK("EXIT, rc=0\n");
-#ifdef MY_ABC_HERE
-	/* GSCR fail is not only attached to link, we must check pmp gscr fail here.
-	 * If pmp recover success, we must clear it here. */
-	if (ap->uiSflags & ATA_SYNO_FLAG_GSCR_FAIL ||
-		link->uiSflags & ATA_SYNO_FLAG_GSCR_FAIL) {
-		ata_port_printk(ap, KERN_ERR, "recovery success, clear gscr fail flag");
-		ap->uiSflags &= ~ATA_SYNO_FLAG_GSCR_FAIL;
-		link->uiSflags &= ~ATA_SYNO_FLAG_GSCR_FAIL;
-	}
-#endif
 	return 0;
 
  fail:
-#ifdef MY_ABC_HERE
-	/* set link error flags to ata port for ata port error handling.
-	 * GSCR may clear by link reset, but it may still have GSCR error,
-	 * so we must check port GSCR fail */
-	if ((uiSflags = uiCheckPortLinksFlags(ap))) {
-		if (ap->uiSflags & ATA_SYNO_FLAG_GSCR_FAIL) {
-			ap->uiSflags = uiSflags | ATA_SYNO_FLAG_GSCR_FAIL;
-		} else {
-			ap->uiSflags = uiSflags;
-		}
-	}
-#endif
 	sata_pmp_detach(dev);
 	if (detach)
 		ata_eh_detach_dev(dev);
@@ -1996,9 +1984,6 @@
 	unsigned int err_mask;
 	u32 gscr_error, sntf;
 	int cnt, rc;
-#ifdef MY_ABC_HERE
-	unsigned int uiSflags = 0x0;
-#endif
 
 	pmp_tries = ATA_EH_PMP_TRIES;
 	ata_for_each_link(link, ap, EDGE)
@@ -2124,17 +2109,6 @@
 		goto retry;
 	}
 
-#ifdef MY_ABC_HERE
-	/* set link error flags to ata port for ata port error handling.
-	 * GSCR may clear by link reset, but it may still have GSCR error,
-	 * so we must check port GSCR fail */
-	uiSflags = uiCheckPortLinksFlags(ap);
-	if (ap->uiSflags & ATA_SYNO_FLAG_GSCR_FAIL) {
-		ap->uiSflags = uiSflags | ATA_SYNO_FLAG_GSCR_FAIL;
-	} else {
-		ap->uiSflags = uiSflags;
-	}
-#endif
 	return 0;
 
  link_fail:
@@ -2161,18 +2135,6 @@
 
 	ata_port_err(ap, "failed to recover PMP after %d tries, giving up\n",
 		     ATA_EH_PMP_TRIES);
-#ifdef MY_ABC_HERE
-	/* set link error flags to ata port for ata port error handling.
-	 * GSCR may clear by link reset, but it may still have GSCR error,
-	 * so we must check port GSCR fail */
-	if ((uiSflags = uiCheckPortLinksFlags(ap))) {
-		if (ap->uiSflags & ATA_SYNO_FLAG_GSCR_FAIL) {
-			ap->uiSflags = uiSflags | ATA_SYNO_FLAG_GSCR_FAIL;
-		} else {
-			ap->uiSflags = uiSflags;
-		}
-	}
-#endif
 	sata_pmp_detach(pmp_dev);
 	ata_dev_disable(pmp_dev);
 
diff -ur a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
--- a/drivers/ata/libata-scsi.c	2013-08-24 11:37:08.000000000 +0200
+++ b/drivers/ata/libata-scsi.c	2014-02-17 11:57:44.000000000 +0100
@@ -66,19 +66,16 @@
 #include <linux/list.h>
 extern unsigned int guiWakeupDisksNum;
 extern int giDenoOfTimeInterval;
-static unsigned long CurPendingListSleep = 0;
-static unsigned long CurPendingListWaking = 0;
 static int giGroupDisks = 0;
 static int giWakingDisks = 0;
 static unsigned long gulLastWake = 0;
 DEFINE_SPINLOCK(SYNOLastWakeLock);
 #endif
 
-#if defined(MY_ABC_HERE) && defined(MY_ABC_HERE)
-extern char gszSynoHWVersion[];
+#ifdef SYNO_SATA_ERROR_REPORT
+extern int (*funcSYNOSataErrorReport)(unsigned int, unsigned int, unsigned int, unsigned int, unsigned int);
 #endif
 
-
 #ifdef MY_DEF_HERE
 extern EUNIT_PWRON_TYPE (*funcSynoEunitPowerctlType)(void);
 #endif
@@ -211,151 +208,6 @@
 EXPORT_SYMBOL(look_up_scsi_dev_from_ap);
 #endif
 
-#ifdef MY_ABC_HERE
-static ssize_t
-syno_port_thaw_store(struct device *dev, struct device_attribute *attr, const char * buf, size_t count)
-{
-	struct Scsi_Host *shost = class_to_shost(dev);
-	struct ata_port *ap = ata_shost_to_port(shost);
-	ssize_t ret = -EIO;
-	int iThaw = 1;
-
-	if(!ap) {
-		goto END;
-	}
-
-
-	sscanf(buf, "%d", &iThaw);
-	if (iThaw) {
-		ata_eh_thaw_port(ap);
-		ata_port_schedule_eh(ap);
-	} else {
-		ata_port_printk(ap, KERN_ERR, "port freeze from sysfs control\n");
-		ata_eh_freeze_port(ap);
-		schedule_work(&(ap->SendPortDisEventTask));
-	}
-
-	ret = count;
-
-END:
-	return ret;
-}
-
-static ssize_t
-syno_port_thaw_show(struct device *dev, struct device_attribute *attr, char * buf)
-{
-	struct Scsi_Host *shost = class_to_shost(dev);
-	struct ata_port *ap = ata_shost_to_port(shost);
-	ssize_t len = -EIO;
-
-	if(!ap) {
-		goto END;
-	}
-
-
-	if (ap->pflags & ATA_PFLAG_FROZEN) {
-		len = sprintf(buf, "%d%s", 0, "\n");
-	} else {
-		len = sprintf(buf, "%d%s", 1, "\n");
-	}
-
-END:
-	return len;
-}
-DEVICE_ATTR(syno_port_thaw, S_IRUGO | S_IWUGO, syno_port_thaw_show, syno_port_thaw_store);
-EXPORT_SYMBOL_GPL(dev_attr_syno_port_thaw);
-
-/**
- * show this port remaining fake errors
- **/
-static ssize_t
-syno_fake_error_ctrl_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct scsi_device *sdev = to_scsi_device(dev);
-	struct ata_port *ap = ata_shost_to_port(sdev->host);
-	ssize_t len = -EIO;
-
-	if (!ap) {
-		goto END;
-	}
-
-	len = sprintf(buf, "%d%s", ap->iFakeError, "\n");
-
-END:
-	return len;
-}
-
-/**
- * set this port fake errors
- **/
-static ssize_t
-syno_fake_error_ctrl_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
-{
-	struct scsi_device *sdev = to_scsi_device(dev);
-	struct ata_port *ap = ata_shost_to_port(sdev->host);
-	int iFakeError = 0;
-	ssize_t ret = -EIO;
-
-	if (!ap) {
-		goto END;
-	}
-
-	sscanf(buf, "%d", &iFakeError);
-	ap->iFakeError = iFakeError;
-
-	ret = count;
-
-END:
-	return ret;
-}
-DEVICE_ATTR(syno_fake_error_ctrl, S_IRUGO | S_IWUGO, syno_fake_error_ctrl_show, syno_fake_error_ctrl_store);
-EXPORT_SYMBOL_GPL(dev_attr_syno_fake_error_ctrl);
-
-/**
- * show this dev power reset count
- **/
-static ssize_t
-syno_pwr_reset_count_show(struct device *dev, struct device_attribute *attr, char *buf)
-{
-	struct scsi_device *sdev = to_scsi_device(dev);
-	ssize_t len = -EIO;
-
-	if (!sdev) {
-		goto END;
-	}
-
-	len = sprintf(buf, "%d%s", sdev->iResetPwrCount, "\n");
-
-END:
-	return len;
-}
-
-/**
- * set this dev power reset count
- **/
-static ssize_t
-syno_pwr_reset_count_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
-{
-	struct scsi_device *sdev = to_scsi_device(dev);
-	int iSet = 0;
-	ssize_t ret = -EIO;
-
-	if (!sdev) {
-		goto END;
-	}
-
-	sscanf(buf, "%d", &iSet);
-	sdev->iResetPwrCount = iSet;
-
-	ret = count;
-
-END:
-	return ret;
-}
-DEVICE_ATTR(syno_pwr_reset_count, S_IRUGO | S_IWUGO, syno_pwr_reset_count_show, syno_pwr_reset_count_store);
-EXPORT_SYMBOL_GPL(dev_attr_syno_pwr_reset_count);
-#endif /* MY_ABC_HERE */
-
 #ifdef SYNO_SATA_PM_DEVICE_GPIO
 /**
  * Eliminate CPU usage in scemd. while there is no disks in the
@@ -979,7 +831,13 @@
 	if (ap->nr_pmp_links &&
 		syno_is_synology_pm(ap)) {
 		char szTmp[BDEVNAME_SIZE];
-		char szTmp1[PAGE_SIZE];
+		char *szTmp1 = NULL;
+		szTmp1 = (char*) kmalloc(PAGE_SIZE, GFP_KERNEL);
+		if (NULL == szTmp1) {
+			printk(KERN_WARNING "%s kmalloc failed\n", __FUNCTION__);
+			len = 0;
+			goto END;
+		}
 
 		NumOfPMPorts = syno_support_disk_num(sata_pmp_gscr_vendor(ap->link.device->gscr),
 											 sata_pmp_gscr_devid(ap->link.device->gscr),
@@ -1155,10 +1013,12 @@
 
 		/* put it together */
 		len = snprintf(buf, PAGE_SIZE, "%s%s", buf, szTmp1);
+		kfree(szTmp1);
 	} else {
 		len = snprintf(buf, PAGE_SIZE, "%s%s%s", EBOX_INFO_DEV_LIST_KEY, "=\"\"", "\n");
 	}
 
+END:
 	return len;
 }
 
@@ -1577,10 +1437,6 @@
 	&dev_attr_syno_wcache,
 #endif
 #ifdef MY_ABC_HERE
-	&dev_attr_syno_fake_error_ctrl,
-	&dev_attr_syno_pwr_reset_count,
-#endif
-#ifdef MY_ABC_HERE
 	&dev_attr_syno_sata_disk_led_ctrl,
 #endif
 	NULL
@@ -3092,94 +2948,6 @@
 static int ata_scsi_translate(struct ata_device *dev, struct scsi_cmnd *cmd,
 						ata_xlat_func_t xlat_func);
 
-/**
- * completion function used for in-the-middle chk_power 
- * command to reissue pending command
- */
-void ata_qc_complete_chkpower(struct ata_queued_cmd *qc)
-{
-	u8 blSpinDown = 0;
-
-	if (qc->err_mask) {
-		DBGMESG("qc->err_mask != 0 print_id %u pmp %u\n", qc->ap->print_id, qc->dev->link->pmp);
-		goto END;
-	}
-
-	if (qc->flags & ATA_QCFLAG_FAILED) {
-		qc->dev->ulLastCmd = jiffies;
-		blSpinDown = 1;
-		DBGMESG("This qc is failed 0 print_id %u pmp %u schedule wale it up\n", qc->ap->print_id, qc->dev->link->pmp);
-		goto END;
-	}
-
-	/* 0 == qc->result_tf.nsect might not have a good asm code*/
-	if (!qc->result_tf.nsect) {
-		blSpinDown = 1;
-	}
-END:
-	if (blSpinDown) {
-		DBGMESG("disk %d is sleeping, need wakeup it\n", qc->ap->print_id);
-		set_bit(CHKPOWER_FIRST_CMD, &(qc->dev->ulSpinupState));
-		set_bit(qc->dev->link->ap->print_id, &CurPendingListSleep);
-	}
-	DBGMESG("ata%u: clear CHKPOWER_CHECKING\n", qc->ap->print_id);
-	clear_bit(CHKPOWER_CHECKING, &(qc->dev->ulSpinupState));
-	ata_qc_free(qc);
-}
-
-static int SynoInsertCheckPW(struct ata_device *dev)
-{
-	struct ata_queued_cmd *qc;
-	struct ata_port *ap = dev->link->ap;
-	int rc;
-
-	/* wake up sleeping disks if necessary */
-	if (test_and_set_bit(CHKPOWER_CHECKING, &(dev->ulSpinupState))) {
-		printk("%s: there is already cmnd processing print_id %d link->pmp %d\n",
-			   __FUNCTION__, ap->print_id, dev->link->pmp);
-		WARN_ON(1);
-		goto ERR_MEM;
-	}
-
-	/* issue a chk_power ata command to check disk power status */
-	qc = ata_qc_new_init(dev);
-	if (NULL == qc) {
-		DBGMESG("%s: NULL == qc print_id %d link->pmp %d\n",
-			   __FUNCTION__, ap->print_id, dev->link->pmp);
-		clear_bit(CHKPOWER_CHECKING, &(dev->ulSpinupState));
-		goto ERR_MEM;
-	}
-
-	qc->tf.command = ATA_CMD_CHK_POWER;
-	qc->tf.flags |= ATA_TFLAG_ISADDR | ATA_TFLAG_DEVICE;
-	qc->tf.protocol = ATA_PROT_NODATA;
-	qc->flags |= ATA_QCFLAG_RESULT_TF;
-
-	qc->complete_fn = ata_qc_complete_chkpower;
-
-	if (ap->ops->qc_defer) {
-		if ((rc = ap->ops->qc_defer(qc))){
-			clear_bit(CHKPOWER_CHECKING, &(dev->ulSpinupState));
-			DBGMESG("%s qc_defer, print_id %d pmp %d tag %d\n", __FUNCTION__, ap->print_id, dev->link->pmp, qc->tag);
-			goto DEFER;
-		}
-	}
-
-	dev->ulLastCmd = jiffies;
-	ata_qc_issue(qc);
-	return SCSI_MLQUEUE_HOST_BUSY;
-
-ERR_MEM:
-	dev->ulLastCmd = jiffies;
-	return SCSI_MLQUEUE_HOST_BUSY;
-DEFER:
-	ata_qc_free(qc);
-	if (rc == ATA_DEFER_LINK)
-		return SCSI_MLQUEUE_DEVICE_BUSY;
-	else
-		return SCSI_MLQUEUE_HOST_BUSY;
-}
-
 void ata_qc_complete_read(struct ata_queued_cmd *qc)
 {
 	if (qc->err_mask) {
@@ -3218,7 +2986,6 @@
 		WARN_ON(1);
 		goto ERR_MEM;
 	}
-	clear_bit(CHKPOWER_FIRST_CMD, &(dev->ulSpinupState));
 
 	/* issue a chk_power ata command to check disk power status */
 	qc = ata_qc_new_init(dev);
@@ -3258,10 +3025,6 @@
 	/* issue read and update gulLastWake */
 	spin_lock(&SYNOLastWakeLock);
 	gulLastWake = jiffies;
-	set_bit(ap->print_id, &CurPendingListWaking);
-	if (CurPendingListSleep == CurPendingListWaking) {
-		CurPendingListWaking = CurPendingListSleep = 0;
-	}
 	/* count waking disks */
 	++giWakingDisks;
 	/* if all disks in group were waking, reset group */
@@ -3310,11 +3073,8 @@
  			DBGMESG("port %d ATA_PFLAG_FROZEN or ATA_FLAG_DISABLED, clear all bits\n", ap->print_id);
 			ata_port_schedule_eh(ap);
  		}
-		clear_bit(CHKPOWER_CHECKING, &(dev->ulSpinupState));
 		clear_bit(CHKPOWER_FIRST_CMD, &(dev->ulSpinupState));
 		clear_bit(CHKPOWER_FIRST_WAIT, &(dev->ulSpinupState));
-		clear_bit(ap->print_id, &CurPendingListWaking);
-		clear_bit(ap->print_id, &CurPendingListSleep);
 		goto PASS;
 	}
 
@@ -3326,8 +3086,9 @@
 	/* The ATA_CMD_CHK_POWER command won't wake up disk. So we don't check whether
 	 * DS is sleeping now.
 	 */
-	if (!(scsicmd[0] == ATA_16 && scsicmd[14] == ATA_CMD_CHK_POWER)) {
-
+	if (scsicmd[0] == ATA_16 && scsicmd[14] == ATA_CMD_CHK_POWER) {
+		goto PASS_ONCE;
+	} else {
 		/* we need insert read as the first cmd to wakeup disk */
 		if (dev->iCheckPwr || test_bit(CHKPOWER_FIRST_CMD, &(dev->ulSpinupState))) {
 			/* check if this port need wait other disks wakeup */
@@ -3355,11 +3116,6 @@
 			spin_unlock(&SYNOLastWakeLock);
 
 			if (!iNeedWait) {
-				if (dev->iCheckPwr) {
-					set_bit(ap->print_id, &CurPendingListSleep);
-					clear_bit(ap->print_id, &CurPendingListWaking);
-					dev->ulSpinupState = 0;
-				}
 				goto ISSUE_READ;
 			} else {
 				/* These msg will appear very much, so we mark it.
@@ -3371,45 +3127,18 @@
 				goto WAIT;
 			}
 		}
-
-		if ((scsicmd[0] == ATA_16 && scsicmd[14] == ATA_CMD_STANDBYNOW1) ||
-			test_bit(ap->print_id, &CurPendingListWaking)) {
-			/* These msg will appear very much, so we mark it.
-			 * But it is useful for debug, I leave it here */
-			/*if (printk_ratelimit()) {
-				DBGMESG("skip this disk %d scsicmd[0] 0x%x scsicmd[14] 0x%x\n",
-						 ap->print_id, scsicmd[0], scsicmd[14]);
-			}*/
-			goto PASS;
-		}
-
-		/* The follwing case this port will goto CHKPOWER to let disk check hibernation and
-		 * spinup group by group
-		 * 1. This port is already received standby command
-		 * 2. some disks may go hibernation by itself, so if this disk is idle for a while we
-		 *    must check it
-		 **/
-		if (time_after(jiffies, dev->ulLastCmd + (ata_print_id * WAKEINTERVAL))) {
-			DBGMESG("disk %d go CHKPOWER,clear Waking/Sleep bit, scsicmd[0] 0x%x scsicmd[14] 0x%x\n",
-					ap->print_id, scsicmd[0], scsicmd[14]);
-			clear_bit(ap->print_id, &CurPendingListWaking);
-			clear_bit(ap->print_id, &CurPendingListSleep);
-			dev->ulSpinupState = 0;
-			goto CHKPOWER;
-		}
 	}
 
 PASS:
+	dev->iCheckPwr = 0;
+PASS_ONCE:
 	/* update time-bookkeeping of last command */
 	dev->ulLastCmd = jiffies;
-	dev->iCheckPwr = 0;
 	return ata_scsi_translate(dev, cmd, xlat_func);
 ISSUE_READ:
 	dev->iCheckPwr = 0;
+	dev->ulSpinupState = 0;
 	return SynoIssueRead(dev);
-CHKPOWER:
-	dev->iCheckPwr = 0;
-	return SynoInsertCheckPW(dev);
 WAIT:
 	return SCSI_MLQUEUE_HOST_BUSY;
 }
@@ -4933,12 +4662,6 @@
 			goto RETRY;
 		}
 #endif
-#ifdef MY_ABC_HERE
-		if (0 < dev->link->ap->iFakeError) {
-			ata_port_schedule_eh(dev->link->ap);
-			goto RETRY;
-		}
-#endif
 		/* 0 == g_internal_hd_num means this model no need spinup one by one,
 		 * guiWakeupDisksNum means how many disks in one group needed to be waking up.
 		 * So if 0 == g_internal_hd_num && 1 == guiWakeupDisksNum means we needn't
@@ -4956,14 +4679,6 @@
 				}
 				goto RETRY;
 			}
-			if (test_bit(CHKPOWER_CHECKING, &(dev->ulSpinupState))) {
-				if (time_after(jiffies, dev->ulLastCmd + WAKEINTERVAL)) {
-					DBGMESG("ata%u: checking timeout\n", dev->link->ap->print_id);
-					WARN_ON(1 != dev->link->ap->nr_active_links);
-					ata_port_schedule_eh(dev->link->ap);
-				}
-				goto RETRY;
-			}
 			rc = syno_ata_scsi_translate(dev, scmd, xlat_func);
 		}
 	}
@@ -5765,8 +5480,8 @@
 {
 	int index = -1;
 
-	if ( !strncmp(gszSynoHWVersion, HW_RS810p, strlen(HW_RS810p) ) ||
-		!strncmp(gszSynoHWVersion, HW_RS810rpp, strlen(HW_RS810rpp) ) ) {
+	if (syno_is_hw_version(HW_RS810p) ||
+		syno_is_hw_version(HW_RS810rpp) )  {
 		printk("This is RS810+/RS810rp+, reverse host!\n");
 		if ( host_no >= 0 && host_no <= 3 )
 			index = 3 - host_no;
@@ -5784,6 +5499,10 @@
 	char szMapStr[SYNO_DISK_INDEX_MAP_FIGURE + 1] = {0};
 	int cStrCp;
 
+	if (8 <= host->host_no) {
+		goto END;
+	}
+
 	cStrCp = snprintf(szMapStr, sizeof(szMapStr), "%s", &gszDiskIdxMap[SYNO_DISK_INDEX_MAP_FIGURE * host->host_no]);
 
 	if( SYNO_DISK_INDEX_MAP_FIGURE > cStrCp || SYNO_DISK_INDEX_MAP_FIGURE > strlen(szMapStr)) {
@@ -5890,6 +5609,7 @@
 
 int syno_libata_disk_map_table_gen(int *iDiskMapTable)
 {
+	int iAtaHostCount = 0;
 	int iScsiHostIdx;
 	int iAtaHostIdx;
 	int iDiskIdx;
@@ -5901,12 +5621,16 @@
 		goto END;
 	}
 
-	for(iScsiHostIdx = 0; iScsiHostIdx < (ata_print_id - 1); iScsiHostIdx++) {
-
+	for(iScsiHostIdx = 0; iAtaHostCount < (ata_print_id - 1); iScsiHostIdx++) {
 		if (NULL == (pScsiHost = scsi_host_lookup(iScsiHostIdx))) {
 			continue;
 		}
 
+		if (SYNO_PORT_TYPE_SAS == pScsiHost->hostt->syno_port_type) {
+			/* skip SAS hosts; drop the reference taken by scsi_host_lookup() */
+			scsi_host_put(pScsiHost);
+			continue;
+		}
+		iAtaHostCount++;
+
 		pAp = ata_shost_to_port(pScsiHost);
 		if(!pAp) {
 			scsi_host_put(pScsiHost);
diff -ur a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
--- a/drivers/ata/libata-sff.c	2013-08-24 11:37:08.000000000 +0200
+++ b/drivers/ata/libata-sff.c	2014-02-17 11:57:44.000000000 +0100
@@ -1472,9 +1472,6 @@
 unsigned int ata_sff_qc_issue(struct ata_queued_cmd *qc)
 {
 	struct ata_port *ap = qc->ap;
-#ifdef MY_ABC_HERE
-	u8 status;
-#endif
 	struct ata_link *link = qc->dev->link;
 
 	/* Use polling pio if the LLD doesn't handle
@@ -1495,31 +1492,7 @@
 		ata_tf_to_host(ap, &qc->tf);
 		ap->hsm_task_state = HSM_ST_LAST;
 
-#ifdef MY_ABC_HERE
-		/* copy from ata_pio_task() to send chkpwr cmd directly to prevent work queue timeout issue
-		 * Now we only find sata_mv have timeout issue, so we only on ATA_TFLAG_DIRECT in sata_mv */
-		if (ATA_TFLAG_DIRECT & qc->tf.flags) {
-			DBGMESG("ata%u: try to use directly issue cmd 0x%x\n", ap->print_id, qc->tf.command);
-			qc->tf.flags &= ~ATA_TFLAG_DIRECT;
-			status = ata_sff_busy_wait(ap, ATA_BUSY, 5);
-			if (status & ATA_BUSY) {
-				mdelay(2);
-				status = ata_sff_busy_wait(ap, ATA_BUSY, 10);
-				if (status & ATA_BUSY) {
-					/*if the status is still BUSY, we use original way ata_pio_queue_task() */
-					ata_sff_queue_pio_task(link, 0);
-					DBGMESG("ata%u: directly issue cmd 0x%x fail, using queue_task\n", ap->print_id, qc->tf.command);
-				} else {
-					ata_sff_hsm_move(ap, qc, status, 0);
-				}
-			} else {
-				ata_sff_hsm_move(ap, qc, status, 0);
-			}
-		}
-		else if (qc->tf.flags & ATA_TFLAG_POLLING)
-#else
 		if (qc->tf.flags & ATA_TFLAG_POLLING)
-#endif
 			ata_sff_queue_pio_task(link, 0);
 
 		break;
@@ -2149,23 +2122,8 @@
 	DPRINTK("about to softreset, devmask=%x\n", devmask);
 	rc = ata_bus_softreset(ap, devmask, deadline);
 	/* if link is occupied, -ENODEV too is an error */
-#ifdef MY_ABC_HERE
-	if (0 < ap->iFakeError) {
-		ata_link_printk(link, KERN_ERR, "generate fake SRST, Fake count %d\n", ap->iFakeError);
-		if (SYNO_ERROR_MAX > ap->iFakeError) {
-			--(ap->iFakeError);
-		}
-		rc = -EBUSY;
-	}
-#endif
 	if (rc && (rc != -ENODEV || sata_scr_valid(link))) {
 		ata_link_err(link, "SRST failed (errno=%d)\n", rc);
-#ifdef MY_ABC_HERE
-		if (-EBUSY == rc) {
-			ata_link_printk(link, KERN_ERR, "SRST fail, set srst fail flag\n");
-			link->uiSflags |= ATA_SYNO_FLAG_SRST_FAIL;
-		}
-#endif
 		return rc;
 	}
 
diff -ur a/drivers/ata/Makefile b/drivers/ata/Makefile
--- a/drivers/ata/Makefile	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/ata/Makefile	2014-01-21 09:37:00.000000000 +0100
@@ -4,7 +4,12 @@
 # non-SFF interface
 obj-$(CONFIG_SATA_AHCI)		+= ahci.o libahci.o
 obj-$(CONFIG_SATA_ACARD_AHCI)	+= acard-ahci.o libahci.o
+
+# For Mindspeed's platform, ahci_platform.o and libahci.o should be after sata_mv.o for disk ordering
+ifneq ($(CONFIG_SYNO_COMCERTO),y)
 obj-$(CONFIG_SATA_AHCI_PLATFORM) += ahci_platform.o libahci.o
+endif
+
 obj-$(CONFIG_SATA_FSL)		+= sata_fsl.o
 obj-$(CONFIG_SATA_INIC162X)	+= sata_inic162x.o
 obj-$(CONFIG_SCSI_MVSATA_BSP422)   += mvSata_4_2_2/
@@ -29,6 +34,12 @@
 obj-$(CONFIG_SATA_VIA)		+= sata_via.o
 obj-$(CONFIG_SATA_VITESSE)	+= sata_vsc.o
 
+# For Mindspeed's platform, ahci_platform.o and libahci.o should be after sata_mv.o for disk ordering
+# So we move them here
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+obj-$(CONFIG_SATA_AHCI_PLATFORM) += ahci_platform.o libahci.o
+endif
+
 # Synology modified, for disk ordering. sata_sil24.o sould be after sata_mv.o
 obj-$(CONFIG_SATA_SIL24)        += sata_sil24.o
 
diff -ur a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c
--- a/drivers/ata/sata_mv.c	2013-08-24 11:37:08.000000000 +0200
+++ b/drivers/ata/sata_mv.c	2014-02-17 11:57:45.000000000 +0100
@@ -175,8 +175,12 @@
 
 	MV_GEN_I_FLAGS		= MV_COMMON_FLAGS | ATA_FLAG_NO_ATAPI,
 
+#ifdef CONFIG_SYNO_COMCERTO
+	MV_GEN_II_FLAGS     = MV_COMMON_FLAGS | ATA_FLAG_PMP | ATA_FLAG_ACPI_SATA,
+#else
 	MV_GEN_II_FLAGS		= MV_COMMON_FLAGS | ATA_FLAG_NCQ |
 				  ATA_FLAG_PMP | ATA_FLAG_ACPI_SATA,
+#endif
 
 	MV_GEN_IIE_FLAGS	= MV_GEN_II_FLAGS | ATA_FLAG_AN,
 
@@ -658,10 +662,6 @@
 static u8 mv_sff_check_status(struct ata_port *ap);
 
 #ifdef MY_ABC_HERE
-static void mv_err_intr(struct ata_port *ap);
-#endif
-
-#ifdef MY_ABC_HERE
 static ssize_t
 syno_mv_phy_ctl_store(struct device *dev, struct device_attribute *attr, const char * buf, size_t count);
 DEVICE_ATTR(syno_phy_ctl, S_IWUGO, NULL, syno_mv_phy_ctl_store);
@@ -676,9 +676,6 @@
 	&dev_attr_syno_phy_ctl,
 #endif
 #ifdef MY_ABC_HERE
-	&dev_attr_syno_port_thaw,
-#endif
-#ifdef MY_ABC_HERE
 	&dev_attr_syno_diskname_trans,
 #endif
 #ifdef MY_ABC_HERE
@@ -760,11 +757,6 @@
 	.bmdma_start		= mv_bmdma_start,
 	.bmdma_stop		= mv_bmdma_stop,
 	.bmdma_status		= mv_bmdma_status,
-
-#ifdef MY_ABC_HERE
-	.syno_force_intr	= mv_err_intr,
-#endif
-
 	.port_start		= mv_port_start,
 	.port_stop		= mv_port_stop,
 };
@@ -2468,11 +2460,6 @@
 		if (IS_GEN_II(hpriv))
 			return mv_qc_issue_fis(qc);
 	}
-#ifdef MY_ABC_HERE
-	if (NULL == qc->scsicmd && ATA_CMD_CHK_POWER == qc->tf.command) {
-		qc->tf.flags |= ATA_TFLAG_DIRECT;
-	}
-#endif
 	return ata_bmdma_qc_issue(qc);
 }
 
@@ -2770,19 +2757,7 @@
 		action |= ATA_EH_RESET;
 		ata_ehi_push_desc(ehi, "parity error");
 	}
-#ifdef MY_ABC_HERE
-	if ((edma_err_cause & (EDMA_ERR_DEV_DCON | EDMA_ERR_DEV_CON)) ||
-		(ap->uiSflags & ATA_SYNO_FLAG_FORCE_INTR)) {
-		if (ap->uiSflags & ATA_SYNO_FLAG_FORCE_INTR) {
-			ap->uiSflags &= ~ATA_SYNO_FLAG_FORCE_INTR;
-			DBGMESG("ata%u: clear ATA_SYNO_FLAG_FORCE_INTR\n", ap->print_id);
-		} else {
-			ap->iDetectStat = 1;
-			DBGMESG("ata%u: set detect stat check\n", ap->print_id);
-		}
-#else
 	if (edma_err_cause & (EDMA_ERR_DEV_DCON | EDMA_ERR_DEV_CON)) {
-#endif
 #ifdef MY_ABC_HERE
 		syno_ata_info_print(ap);
 #endif
@@ -3689,6 +3664,45 @@
 }
 
 #ifdef MY_ABC_HERE
+int syno_sata_mv_gpio_read(const unsigned short hostnum)
+{
+	struct Scsi_Host *shost = scsi_host_lookup(hostnum);
+	struct ata_port *ap = NULL;
+	void __iomem *host_mmio = NULL;
+	u32 gpio_value = 0;
+	int led_idx;
+	int ret = -1;
+
+	if (NULL == shost) {
+		goto END;
+	}
+
+	if (NULL == (ap = ata_shost_to_port(shost))) {
+		goto END;
+	}
+
+	if (NULL == (host_mmio = mv_host_base(ap->host))) {
+		goto END;
+	}
+
+	led_idx = ap->print_id - ap->host->ports[0]->print_id;
+
+	gpio_value = readl(host_mmio + GPIO_CTL_DATA);
+
+	if (gpio_value & (1 << led_idx)) {
+		ret = 1;
+	} else {
+		ret = 0;
+	}
+
+END:
+	if (NULL != shost) {
+		scsi_host_put(shost);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(syno_sata_mv_gpio_read);
+
 /*FIXME - Too brutal and directly, should separate into levels*/
 void syno_sata_mv_gpio_write(u8 blFaulty, const unsigned short hostnum)
 {
@@ -3703,12 +3717,10 @@
 	}
 
 	if(NULL == (ap = ata_shost_to_port(shost))) {
-		scsi_host_put(shost);
 		goto END;
 	}
 
 	if(NULL == (host_mmio = mv_host_base(ap->host))) {
-		scsi_host_put(shost);
 		goto END;
 	}
 	
@@ -3723,9 +3735,11 @@
 	}
 
 	writel(gpio_value, host_mmio + GPIO_CTL_DATA);
-	scsi_host_put(shost);
 
 END:
+	if (NULL != shost) {
+		scsi_host_put(shost);
+	}
 	return;
 }
 EXPORT_SYMBOL(syno_sata_mv_gpio_write);
diff -ur a/drivers/ata/sata_sil24.c b/drivers/ata/sata_sil24.c
--- a/drivers/ata/sata_sil24.c	2013-08-24 11:37:08.000000000 +0200
+++ b/drivers/ata/sata_sil24.c	2014-02-17 11:57:44.000000000 +0100
@@ -358,10 +358,6 @@
 static int sil24_port_resume(struct ata_port *ap);
 #endif
 
-#ifdef MY_ABC_HERE
-static inline void sil24_host_intr(struct ata_port *ap);
-#endif
-
 static const struct pci_device_id sil24_pci_tbl[] = {
 	{ PCI_VDEVICE(CMD, 0x3124), BID_SIL3124 },
 	{ PCI_VDEVICE(INTEL, 0x3124), BID_SIL3124 },
@@ -448,9 +444,6 @@
 	&dev_attr_syno_pm_gpio,
 	&dev_attr_syno_pm_info,
 #ifdef MY_ABC_HERE
-	&dev_attr_syno_port_thaw,
-#endif
-#ifdef MY_ABC_HERE
 	&dev_attr_syno_diskname_trans,
 #endif
 #ifdef MY_ABC_HERE
@@ -497,9 +490,6 @@
 #ifdef CONFIG_PM
 	.port_resume		= sil24_port_resume,
 #endif
-#ifdef MY_ABC_HERE
-	.syno_force_intr	= sil24_host_intr,
-#endif
 };
 
 static int sata_sil24_msi;    /* Disable MSI */
@@ -770,15 +760,7 @@
 	ata_tf_init(link->device, &tf);	/* doesn't really matter */
 	rc = sil24_exec_polled_cmd(ap, pmp, &tf, 0, PRB_CTRL_SRST,
 				   timeout_msec);
-#ifdef MY_ABC_HERE
-	if (0 < ap->iFakeError) {
-		ata_link_printk(link, KERN_ERR, "generate fake softreset error, Fake count %d\n", ap->iFakeError);
-		if (SYNO_ERROR_MAX > ap->iFakeError) {
-			--(ap->iFakeError);
-		}
-		rc = -EBUSY;
-	}
-#endif
+
 	if (rc == -EBUSY) {
 		reason = "timeout";
 		goto err;
@@ -809,10 +791,7 @@
 
  err:
 	ata_link_err(link, "softreset failed (%s)\n", reason);
-#ifdef MY_ABC_HERE
-	ata_link_printk(link, KERN_ERR, "softreset failed, set srst fail flag\n");
-	link->uiSflags |= ATA_SYNO_FLAG_SRST_FAIL;
-#endif
+
 	return -EIO;
 }
 
@@ -1161,19 +1140,7 @@
 		sata_async_notification(ap);
 	}
 
-#ifdef MY_ABC_HERE
-	if ((irq_stat & (PORT_IRQ_PHYRDY_CHG | PORT_IRQ_DEV_XCHG)) ||
-		(ap->uiSflags & ATA_SYNO_FLAG_FORCE_INTR)) {
-		if (ap->uiSflags & ATA_SYNO_FLAG_FORCE_INTR) {
-			ap->uiSflags &= ~ATA_SYNO_FLAG_FORCE_INTR;
-			DBGMESG("ata%u: clear ATA_SYNO_FLAG_FORCE_INTR\n", ap->print_id);
-		} else {
-			ap->iDetectStat = 1;
-			DBGMESG("ata%u: set detect stat check\n", ap->print_id);
-		}
-#else
 	if (irq_stat & (PORT_IRQ_PHYRDY_CHG | PORT_IRQ_DEV_XCHG)) {
-#endif
 #ifdef MY_ABC_HERE
 		syno_ata_info_print(ap);
 #endif
@@ -1298,11 +1265,7 @@
 
 	slot_stat = readl(port + PORT_SLOT_STAT);
 
-#ifdef MY_ABC_HERE
-	if (unlikely(slot_stat & HOST_SSTAT_ATTN) || (ap->uiSflags & ATA_SYNO_FLAG_FORCE_INTR)) {
-#else
 	if (unlikely(slot_stat & HOST_SSTAT_ATTN)) {
-#endif
 		sil24_error_intr(ap);
 		return;
 	}
@@ -1450,6 +1413,9 @@
 
 		/* configure port */
 		sil24_config_port(ap);
+#ifdef MY_ABC_HERE
+		mdelay(1000);
+#endif
 	}
 
 	/* Turn on interrupts */
diff -ur a/drivers/base/cpu.c b/drivers/base/cpu.c
--- a/drivers/base/cpu.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/base/cpu.c	2014-02-17 11:57:39.000000000 +0100
@@ -18,8 +18,6 @@
 
 #if defined(MY_ABC_HERE) && defined(MY_ABC_HERE)
 #include <linux/synobios.h>
-
-extern char gszSynoHWVersion[];
 #endif
 
 struct sysdev_class cpu_sysdev_class = {
@@ -46,7 +44,7 @@
 	ssize_t ret;
 
 #ifdef MY_ABC_HERE
-	if(!strncmp(gszSynoHWVersion, HW_DS712pv20, strlen(HW_DS712pv20))) {
+	if(syno_is_hw_version(HW_DS712pv20)) {
 		if( 1 == cpu->sysdev.id || 3 == cpu->sysdev.id ) {
 			printk(KERN_ERR "This model does not allow changing the specified cpu state.\n");
 			ret = count;
diff -ur a/drivers/base/power/main.c b/drivers/base/power/main.c
--- a/drivers/base/power/main.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/base/power/main.c	2014-02-17 11:57:39.000000000 +0100
@@ -1166,3 +1166,193 @@
 	return async_error;
 }
 EXPORT_SYMBOL_GPL(device_pm_wait_for_dev);
+
+#if defined(CONFIG_SYNO_COMCERTO)
+/*
+ * Code added to support device SUSPEND (L1 and L2) and RESUME
+ * (L1 = clock gating, L2 = clock gating + reset).
+ * Depends upon config option CONFIG_PM_SYSFS_MANUAL
+ *
+ */
+
+#ifdef CONFIG_PM_SYSFS_MANUAL
+
+static DEFINE_MUTEX(dpm_lock);
+
+/**
+ *	dpm_manual_resume - resume the device.
+ *      Uses device_resume and device_complete to achieve this.
+ *      Part of the code borrowed from dpm_resume and dpm_complete.
+ *	@dev:   Device.
+ * 	@state: State to enter.
+ */
+
+void dpm_manual_resume(struct device *dev,pm_message_t state)
+{
+	int error;
+	struct list_head list;
+	ktime_t starttime = ktime_get();
+
+	might_sleep();
+
+	/* Device resume prepare starts here */
+	mutex_lock(&dpm_list_mtx);
+        pm_transition = state;
+	INIT_COMPLETION(dev->power.completion);
+	mutex_unlock(&dpm_list_mtx);
+
+	error = device_resume(dev, state, false);
+	if (error) {
+		suspend_stats.failed_resume++;
+		dpm_save_failed_step(SUSPEND_RESUME);
+		dpm_save_failed_dev(dev_name(dev));
+		pm_dev_err(dev, state, "", error);
+	}
+
+	mutex_lock(&dpm_list_mtx);
+	if (!list_empty(&dev->power.entry))
+		list_move_tail(&dev->power.entry, &dpm_prepared_list);
+	mutex_unlock(&dpm_list_mtx);
+	
+	/* DPM complete start */
+	INIT_LIST_HEAD(&list);
+	mutex_lock(&dpm_list_mtx);
+	dev->power.is_prepared = false;
+	list_move(&dev->power.entry, &list);
+	mutex_unlock(&dpm_list_mtx);
+
+	device_complete(dev, state);
+	dev->power.power_state=state;
+	dpm_show_time(starttime, state, NULL);
+}
+
+
+/**
+ *	dpm_manual_resume_start - Start the process of powering one device back on.
+ *	@dev:   Device.
+ *	@state: State to enter.
+ *      Code inspired by dpm_resume_end().
+ *
+ *	Bring one device back to the on state by first powering it
+ *	on, then restoring state. We only operate on devices that aren't
+ *	already on.
+ */
+
+void dpm_manual_resume_start(struct device * dev,pm_message_t state)
+{
+	mutex_lock(&dpm_lock);
+	if (dev->power.power_state.event == state.event){
+		printk(KERN_ERR "PM: We are already in the resume state \n");
+		goto done;
+        }
+	/* Device resume starts from here */
+	dpm_manual_resume(dev,state);
+done:
+	mutex_unlock(&dpm_lock);
+
+}
+
+/**
+ *	dpm_manual_prepare - prepare the device for power transition.
+ *	Part of the code borrowed from dpm_prepare.
+ *	@dev:   Device.
+ *	@state: State to enter.
+ */
+static int dpm_manual_prepare(struct device * dev , pm_message_t state)
+{
+	/* This part of the code is borrowed from dpm_prepare:
+	 * prepare the device for the power transition.
+	 */
+	int error = 0;
+	might_sleep();
+	
+	/* Call the device prepare */
+	error = device_prepare(dev, state);
+
+	mutex_lock(&dpm_list_mtx);
+	if (error){
+		printk(KERN_INFO "PM: Device %s not prepared " "for power transition: code %d\n",
+			dev_name(dev), error);
+		goto done;
+	}
+	dev->power.is_prepared = true;
+	if (!list_empty(&dev->power.entry))
+		list_move_tail(&dev->power.entry, &dpm_prepared_list);
+
+done:
+	mutex_unlock(&dpm_list_mtx);	
+	return error;
+}
+
+/**
+ *	dpm_manual_suspend - Helper routine to call device_suspend.
+ *	Part of the code borrowed from dpm_suspend().
+ *	@dev:   Device.
+ *	@state: State to enter.
+ */
+static int dpm_manual_suspend(struct device * dev, pm_message_t state)
+{
+	ktime_t starttime = ktime_get();
+	int error = 0;
+
+	might_sleep();
+
+	mutex_lock(&dpm_list_mtx);
+	pm_transition = state;
+	mutex_unlock(&dpm_list_mtx);
+	
+	error = device_suspend(dev);
+	
+	mutex_lock(&dpm_list_mtx);
+	if (error){
+		pm_dev_err(dev, state, "", error);
+                dpm_save_failed_dev(dev_name(dev));
+	}	
+	if (!list_empty(&dev->power.entry))
+		list_move(&dev->power.entry, &dpm_suspended_list);
+	mutex_unlock(&dpm_list_mtx);
+
+	dev->power.power_state=state;
+	dpm_show_time(starttime, state, NULL);
+	return error;
+}
+
+/**
+ *      dpm_manual_suspend_start - Put one device into the power-off L1/L2 state.
+ *      Power off L1 - clock gating, Power off L2 - clock gating + device reset.
+ *  	Part of the code borrowed from dpm_suspend_start.
+ *      @dev:   Device.
+ *      @state: State to enter.
+ */
+int dpm_manual_suspend_start(struct device * dev, pm_message_t state)
+{
+	int error=0;
+
+	/* Start the global mutex value*/
+	mutex_lock(&dpm_lock);
+
+	if (dev->power.power_state.event == state.event){
+		if ( state.event == PM_EVENT_SUSPEND )
+			printk(KERN_ERR "PM: We are already in the suspend (power off L1) state \n");
+#if 0
+		else if ( state.event == PM_EVENT_SUSPEND_L2)
+			printk(KERN_ERR "PM: We are already in the suspend (Power off L2) state \n");
+#endif
+		goto done;
+        }
+
+	/* Device PM prepare starts from here */
+	error=dpm_manual_prepare(dev,state);
+	
+	if (error){
+		suspend_stats.failed_prepare++;
+		dpm_save_failed_step(SUSPEND_PREPARE);
+		goto done;
+	} else
+		error = dpm_manual_suspend(dev, state);
+done:
+	mutex_unlock(&dpm_lock);
+	return error;
+}
+#endif
+#endif
diff -ur a/drivers/base/power/opp.c b/drivers/base/power/opp.c
--- a/drivers/base/power/opp.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/base/power/opp.c	2014-02-17 11:57:39.000000000 +0100
@@ -330,6 +330,9 @@
 {
 	struct device_opp *dev_opp;
 	struct opp *temp_opp, *opp = ERR_PTR(-ENODEV);
+#if defined(CONFIG_SYNO_COMCERTO)
+	int c = 0;
+#endif
 
 	if (!dev || !freq) {
 		dev_err(dev, "%s: Invalid argument freq=%p\n", __func__, freq);
@@ -341,10 +344,21 @@
 		return opp;
 
 	list_for_each_entry_rcu(temp_opp, &dev_opp->opp_list, node) {
+#if defined(CONFIG_SYNO_COMCERTO)
+		++c;
+#endif
 		if (temp_opp->available) {
 			/* go to the next node, before choosing prev */
 			if (temp_opp->rate > *freq)
+#if defined(CONFIG_SYNO_COMCERTO)	
+			{
+				if (c == 1)
+					opp = temp_opp;
 				break;
+			}
+#else
+				break;
+#endif
 			else
 				opp = temp_opp;
 		}
diff -ur a/drivers/base/power/power.h b/drivers/base/power/power.h
--- a/drivers/base/power/power.h	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/base/power/power.h	2014-02-17 11:57:39.000000000 +0100
@@ -81,3 +81,15 @@
 static inline void wakeup_sysfs_remove(struct device *dev) {}
 
 #endif
+
+/* Added for sysfs support to handle device power management from
+ * user space (manual PM configuration).
+ */
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_PM_SYSFS_MANUAL)
+
+extern int dpm_manual_suspend_start(struct device * , pm_message_t );
+extern void dpm_manual_resume_start(struct device * , pm_message_t);
+
+#endif /* CONFIG_SYNO_COMCERTO && CONFIG_PM_SYSFS_MANUAL */
+
diff -ur a/drivers/base/power/sysfs.c b/drivers/base/power/sysfs.c
--- a/drivers/base/power/sysfs.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/base/power/sysfs.c	2014-02-17 11:57:39.000000000 +0100
@@ -89,6 +89,14 @@
  *	value are used only if the driver calls pm_runtime_use_autosuspend().
  *
  *	wakeup_count - Report the number of wakeup events related to the device
+#if defined(CONFIG_SYNO_COMCERTO)
+ *      
+ *      MSPD: Added the support for manual PM operation for NON-CPU devices.
+ *      This is operated through the power/state file and operates in two states:
+ *      PM_EVENT_SUSPEND = Power off L1 state (device clock gating)
+ *      PM_EVENT_SUSPEND_L2 = Power off L2 state (device clock gating + reset).
+ *      
+#endif
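+ *
+ *      A minimal usage sketch (the device path below is illustrative, not
+ *      something defined by this patch):
+ *        echo 2 > /sys/devices/platform/<device>/power/state   # suspend (Power off L1)
+ *        echo 0 > /sys/devices/platform/<device>/power/state   # resume
+ *        cat /sys/devices/platform/<device>/power/state        # 2 = suspended, 0 = active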
  */
 
 static const char enabled[] = "enabled";
@@ -97,6 +105,44 @@
 const char power_group_name[] = "power";
 EXPORT_SYMBOL_GPL(power_group_name);
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_PM_SYSFS_MANUAL)
+static ssize_t state_show(struct device * dev, struct device_attribute *attr, char * buf)
+{
+	if (dev->power.power_state.event == PM_EVENT_SUSPEND) /* Power off L1 state */
+		return sprintf(buf, "2\n");
+#if 0
+	else if (dev->power.power_state.event == PM_EVENT_SUSPEND_L2) /* Power off L2 state */
+		return sprintf(buf, "3\n");
+#endif
+	else 
+		return sprintf(buf, "0\n");
+}
+
+static ssize_t state_store(struct device * dev, struct device_attribute *attr, const char * buf, size_t n)
+{
+	pm_message_t state;
+	int error = -EINVAL;
+	
+	if ((n == 2) && (buf[0] == '2')) {
+                state.event = PM_EVENT_SUSPEND; 
+                error = dpm_manual_suspend_start(dev, state); /* Power off L1 state */
+        }
+#if 0
+	if ((n == 2) && (buf[0] == '3')) {
+                state.event = PM_EVENT_SUSPEND_L2;
+                error = dpm_manual_suspend(dev, state);       /* Power off L2 state */
+        }	
+#endif
+	if ((n == 2) && (buf[0] == '0')) {
+                state.event = PM_EVENT_RESUME;
+                dpm_manual_resume_start(dev, state);
+		error = 0;
+        }
+	return error ? error : n;
+}
+static DEVICE_ATTR(state, 0644, state_show, state_store);
+#endif
+
 #ifdef CONFIG_PM_RUNTIME
 static const char ctrl_auto[] = "auto";
 static const char ctrl_on[] = "on";
@@ -437,6 +483,9 @@
 #endif /* CONFIG_PM_ADVANCED_DEBUG */
 
 static struct attribute *power_attrs[] = {
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_PM_SYSFS_MANUAL)
+	 &dev_attr_state.attr,
+#endif
 #ifdef CONFIG_PM_ADVANCED_DEBUG
 #ifdef CONFIG_PM_SLEEP
 	&dev_attr_async.attr,
diff -ur a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
--- a/drivers/block/virtio_blk.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/block/virtio_blk.c	2014-02-17 11:57:39.000000000 +0100
@@ -172,7 +172,7 @@
 		}
 	}
 
-	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) {
+	if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr, GFP_ATOMIC) < 0) {
 		mempool_free(vbr, vblk->pool);
 		return false;
 	}
diff -ur a/drivers/char/hw_random/virtio-rng.c b/drivers/char/hw_random/virtio-rng.c
--- a/drivers/char/hw_random/virtio-rng.c	2013-08-24 11:37:15.000000000 +0200
+++ b/drivers/char/hw_random/virtio-rng.c	2014-02-17 11:57:54.000000000 +0100
@@ -47,7 +47,7 @@
 	sg_init_one(&sg, buf, size);
 
 	/* There should always be room for one buffer. */
-	if (virtqueue_add_buf(vq, &sg, 0, 1, buf) < 0)
+	if (virtqueue_add_buf(vq, &sg, 0, 1, buf, GFP_KERNEL) < 0)
 		BUG();
 
 	virtqueue_kick(vq);
diff -ur a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c
--- a/drivers/char/virtio_console.c	2013-08-24 11:37:15.000000000 +0200
+++ b/drivers/char/virtio_console.c	2014-02-17 11:57:54.000000000 +0100
@@ -392,7 +392,7 @@
 
 	sg_init_one(sg, buf->buf, buf->size);
 
-	ret = virtqueue_add_buf(vq, sg, 0, 1, buf);
+	ret = virtqueue_add_buf(vq, sg, 0, 1, buf, GFP_ATOMIC);
 	virtqueue_kick(vq);
 	return ret;
 }
@@ -457,7 +457,7 @@
 	vq = portdev->c_ovq;
 
 	sg_init_one(sg, &cpkt, sizeof(cpkt));
-	if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt) >= 0) {
+	if (virtqueue_add_buf(vq, sg, 1, 0, &cpkt, GFP_ATOMIC) >= 0) {
 		virtqueue_kick(vq);
 		while (!virtqueue_get_buf(vq, &len))
 			cpu_relax();
@@ -506,7 +506,7 @@
 	reclaim_consumed_buffers(port);
 
 	sg_init_one(sg, in_buf, in_count);
-	ret = virtqueue_add_buf(out_vq, sg, 1, 0, in_buf);
+	ret = virtqueue_add_buf(out_vq, sg, 1, 0, in_buf, GFP_ATOMIC);
 
 	/* Tell Host to go! */
 	virtqueue_kick(out_vq);
Only in b/drivers/cpufreq: c2k-cpufreq.c.
diff -ur a/drivers/cpufreq/Kconfig b/drivers/cpufreq/Kconfig
--- a/drivers/cpufreq/Kconfig	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/cpufreq/Kconfig	2014-01-21 09:37:02.000000000 +0100
@@ -45,6 +45,7 @@
 choice
 	prompt "Default CPUFreq governor"
 	default CPU_FREQ_DEFAULT_GOV_USERSPACE if CPU_FREQ_SA1100 || CPU_FREQ_SA1110
+	default CPU_FREQ_DEFAULT_GOV_ONDEMAND if SYNO_COMCERTO
 	default CPU_FREQ_DEFAULT_GOV_PERFORMANCE
 	help
 	  This option sets which CPUFreq governor shall be loaded at
diff -ur a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm
--- a/drivers/cpufreq/Kconfig.arm	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/cpufreq/Kconfig.arm	2014-01-21 09:37:02.000000000 +0100
@@ -30,3 +30,8 @@
 	  SoC (S5PV310 or S5PC210).
 
 	  If in doubt, say N.
+
+config CPU_FREQ_C2K
+	bool "CPUfreq driver for Comcerto"
+	depends on ARCH_COMCERTO && CPU_FREQ && SYNO_COMCERTO
+	default y
diff -ur a/drivers/cpufreq/Makefile b/drivers/cpufreq/Makefile
--- a/drivers/cpufreq/Makefile	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/cpufreq/Makefile	2014-01-21 09:37:02.000000000 +0100
@@ -47,3 +47,4 @@
 ##################################################################################
 # PowerPC platform drivers
 obj-$(CONFIG_CPU_FREQ_MAPLE)		+= maple-cpufreq.o
+obj-$(CONFIG_CPU_FREQ_C2K)             += c2k-cpufreq.o
Only in b/drivers/devfreq: c2k-devfreq.c.
diff -ur a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig
--- a/drivers/devfreq/Kconfig	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/devfreq/Kconfig	2014-01-21 09:37:02.000000000 +0100
@@ -65,4 +65,11 @@
 
 comment "DEVFREQ Drivers"
 
+config COMCERTO_DEVFREQ_SUPPORT
+	bool "Enable Comcerto devfreq"
+	depends on SYNO_COMCERTO
+	select DEVFREQ_GOV_SIMPLE_ONDEMAND
+	help
+		Adds devfreq support in Comcerto.
+
 endif # PM_DEVFREQ
diff -ur a/drivers/devfreq/Makefile b/drivers/devfreq/Makefile
--- a/drivers/devfreq/Makefile	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/devfreq/Makefile	2014-01-21 09:37:02.000000000 +0100
@@ -3,3 +3,4 @@
 obj-$(CONFIG_DEVFREQ_GOV_PERFORMANCE)	+= governor_performance.o
 obj-$(CONFIG_DEVFREQ_GOV_POWERSAVE)	+= governor_powersave.o
 obj-$(CONFIG_DEVFREQ_GOV_USERSPACE)	+= governor_userspace.o
+obj-$(CONFIG_COMCERTO_DEVFREQ_SUPPORT)	+= c2k-devfreq.o
Only in b/drivers/dma: c2k_dma.c.
Only in b/drivers/dma: comcerto_xor.c.
Only in b/drivers/dma: comcerto_xor.h.
diff -ur a/drivers/dma/Kconfig b/drivers/dma/Kconfig
--- a/drivers/dma/Kconfig	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/dma/Kconfig	2014-01-21 09:37:02.000000000 +0100
@@ -122,6 +122,22 @@
 	---help---
 	  Enable support for the Marvell XOR engine.
 
+config COMCERTO_XOR
+	tristate "Mindspeed Comcerto 2000 XOR engine support"
+	depends on ARCH_M86XXX && SYNO_COMCERTO
+	select DMA_ENGINE
+	select ASYNC_TX_ENABLE_CHANNEL_SWITCH
+	---help---
+	  Enable support for the Mindspeed XOR engine.
+
+config COMCERTO_DMA_BASIC
+	tristate "Mindspeed Comcerto 2000 MDMA engine basic support"
+	depends on ARCH_M86XXX && SYNO_COMCERTO
+	select DMA_ENGINE
+	select ASYNC_TX_ENABLE_CHANNEL_SWITCH
+	---help---
+	  Enable support for the Mindspeed mdma engine.
+
 config MX3_IPU
 	bool "MX3x Image Processing Unit support"
 	depends on SOC_IMX31 || SOC_IMX35
@@ -282,6 +298,15 @@
 
 	  If unsure, say N.
 
+config RAID_ZERO_COPY
+        bool "Optimized DMA/XOR offload: reduce raid5 memcpy operations offloaded to DMA"
+        depends on ASYNC_TX_DMA && SYNO_COMCERTO
+        help
+          This allows the async_tx API to try to reduce raid5 memcpy operations
+          offloaded to DMA. If you have a DMA device that supports memcpy
+          offloading, say Y here; otherwise say N.
+
+
 config DMATEST
 	tristate "DMA Test client"
 	depends on DMA_ENGINE
diff -ur a/drivers/dma/Makefile b/drivers/dma/Makefile
--- a/drivers/dma/Makefile	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/dma/Makefile	2014-01-21 09:37:02.000000000 +0100
@@ -26,3 +26,5 @@
 obj-$(CONFIG_PCH_DMA) += pch_dma.o
 obj-$(CONFIG_AMBA_PL08X) += amba-pl08x.o
 obj-$(CONFIG_EP93XX_DMA) += ep93xx_dma.o
+obj-$(CONFIG_COMCERTO_XOR) += comcerto_xor.o
+obj-$(CONFIG_COMCERTO_DMA_BASIC) += c2k_dma.o
Only in b/drivers/gpio: gpio-c2k.c.
diff -ur a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
--- a/drivers/gpio/Kconfig	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/gpio/Kconfig	2014-01-21 09:37:04.000000000 +0100
@@ -147,6 +147,13 @@
 	help
 	  Say yes here to support the Xilinx FPGA GPIO device
 
+config GPIO_C2K
+	def_bool y
+	depends on ARCH_M86XXX && SYNO_COMCERTO
+	bool "Comcerto 2000 GPIO support"
+	help
+	  Say yes here to support the Comcerto 2000 GPIO device
+
 config GPIO_VR41XX
 	tristate "NEC VR4100 series General-purpose I/O Uint support"
 	depends on CPU_VR41XX
diff -ur a/drivers/gpio/Makefile b/drivers/gpio/Makefile
--- a/drivers/gpio/Makefile	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/gpio/Makefile	2014-01-21 09:37:04.000000000 +0100
@@ -69,3 +69,5 @@
 obj-$(CONFIG_GPIO_WM8350)	+= gpio-wm8350.o
 obj-$(CONFIG_GPIO_WM8994)	+= gpio-wm8994.o
 obj-$(CONFIG_GPIO_XILINX)	+= gpio-xilinx.o
+obj-$(CONFIG_GPIO_C2K)		+= gpio-c2k.o
+
diff -ur a/drivers/hwmon/adt7475.c b/drivers/hwmon/adt7475.c
--- a/drivers/hwmon/adt7475.c	2013-08-24 11:37:14.000000000 +0200
+++ b/drivers/hwmon/adt7475.c	2014-02-17 11:57:53.000000000 +0100
@@ -1282,12 +1282,17 @@
 			    char *buf)
 {
 	struct i2c_client *client = to_i2c_client(dev);
-	struct adt7475_data *data = adt7475_update_device(dev);
+	struct adt7475_data *data = i2c_get_clientdata(client);
 	struct sensor_device_attribute_2 *sattr = to_sensor_dev_attr_2(attr);
 
 	if (!SYNO_IS_ADT7490(client)) {
 		return -EINVAL;
 	}
+	mutex_lock(&data->lock);
+	/* Read Modify Write PWM values */
+	adt7475_read_pwm(client, sattr->index);
+	mutex_unlock(&data->lock);
+
 	return sprintf(buf, "%d\n", data->pwmsynoctl[sattr->index]);
 }
 
@@ -1321,13 +1326,9 @@
 	data->pwm[CONTROL][index] = adt7475_read(PWM_CONFIG_REG(index));
 
 	switch (inputVal) {
-		case 0:
-			valt = 1;
-			val = 0x04;	/* Run at 100% duty cycle */
-			break;
 		case 1:
 			valt = 0;
-			val = 0x03;	/* Run at full speed */
+			val = 0x03;	/* Run at maximum duty cycle (set by pwmMax) */
 			break;
 		case 2:
 			valt = 0;
@@ -1378,12 +1379,11 @@
 			val = 0x02;	/* Source from remote2 */
 			break;
 		default:
-			mutex_unlock(&data->lock);
-			return -EINVAL;
+			valt = 0;
+			val = 0x03;	/* Run at maximum duty cycle (set by pwmMax) */
+			break;
 	}
 
-	data->pwmsynoctl[index] = inputVal;
-
 	data->pwm[CONTROL][index] &= ~0xE8;
 	data->pwm[CONTROL][index] |= (valt & 1) << 3;
 	data->pwm[CONTROL][index] |= (val & 7) << 5;
@@ -2106,6 +2106,69 @@
 #endif
 }
 
+#ifdef CONFIG_SYNO_ADT7490_FEATURES
+static unsigned int adt7490_pwmctl_read(const unsigned int pwmReg)
+{
+	unsigned int valt = pwmReg & 0x8;
+	unsigned int pwmSource = (pwmReg >> 5) & 7;
+	unsigned int ret = 1;
+
+	if (valt) {
+		switch (pwmSource) {
+			case 0x0:
+				ret = 5;
+				break;
+			case 0x1:
+				ret = 6;
+				break;
+			case 0x2:
+				ret = 7;
+				break;
+			case 0x3:
+				ret = 8;
+				break;
+			case 0x5:
+				ret = 4;
+				break;
+			case 0x7:
+				ret = 3;
+				break;
+			default:
+				ret = 0;
+				break;
+		}
+	} else {
+		switch (pwmSource) {
+			case 0x0:
+				ret = 11;
+				break;
+			case 0x1:
+				ret = 12;
+				break;
+			case 0x2:
+				ret = 13;
+				break;
+			case 0x3:
+				ret = 1;
+				break;
+			case 0x5:
+				ret = 10;
+				break;
+			case 0x6:
+				ret = 9;
+				break;
+			case 0x7:
+				ret = 2;
+				break;
+			default:
+				ret = 0;
+				break;
+		}
+	}
+	return ret;
+}
+#endif
+
 static void adt7475_read_pwm(struct i2c_client *client, int index)
 {
 	struct adt7475_data *data = i2c_get_clientdata(client);
@@ -2117,7 +2180,9 @@
 	/* Figure out the internal value for pwmctrl and pwmchan
 	   based on the current settings */
 	v = (data->pwm[CONTROL][index] >> 5) & 7;
-
+#ifdef CONFIG_SYNO_ADT7490_FEATURES
+	data->pwmsynoctl[index] = adt7490_pwmctl_read(data->pwm[CONTROL][index]);
+#endif
 	if (v == 3)
 		data->pwmctl[index] = 0;
 	else if (v == 7)
diff -ur a/drivers/hwmon/syno_hddmon.c b/drivers/hwmon/syno_hddmon.c
--- a/drivers/hwmon/syno_hddmon.c	2013-08-24 11:37:14.000000000 +0200
+++ b/drivers/hwmon/syno_hddmon.c	2014-02-17 11:57:53.000000000 +0100
@@ -24,7 +24,7 @@
 
 #define GPIO_UNDEF				0xFF
 
-#if defined(CONFIG_ARCH_GEN3) || defined(CONFIG_SYNO_ARMADA)
+#if defined(CONFIG_ARCH_GEN3) || defined(CONFIG_SYNO_ARMADA) || defined(CONFIG_ARCH_COMCERTO)
 extern int SYNO_CHECK_HDD_PRESENT(int index);
 extern int SYNO_CTRL_HDD_POWERON(int index, int value);
 extern int SYNO_SUPPORT_HDD_DYNAMIC_ENABLE_POWER(void);
Only in b/drivers/i2c/busses: i2c-comcerto.c.
diff -ur a/drivers/i2c/busses/i2c-i801.c b/drivers/i2c/busses/i2c-i801.c
--- a/drivers/i2c/busses/i2c-i801.c	2013-08-24 11:37:06.000000000 +0200
+++ b/drivers/i2c/busses/i2c-i801.c	2014-02-17 11:57:42.000000000 +0100
@@ -53,6 +53,7 @@
   Panther Point (PCH)   0x1e22     32     hard     yes     yes     yes
   Lynx Point (PCH)      0x8c22     32     hard     yes     yes     yes
   Lynx Point-LP (PCH)   0x9c22     32     hard     yes     yes     yes
+  Avoton (SOC)          0x1f3c     32     hard     yes     yes     yes
 
   Features supported by this driver:
   Software PEC                     no
@@ -145,6 +146,7 @@
 #define PCI_DEVICE_ID_INTEL_PATSBURG_SMBUS_IDF1	0x1d71
 #define PCI_DEVICE_ID_INTEL_PATSBURG_SMBUS_IDF2	0x1d72
 #define PCI_DEVICE_ID_INTEL_PANTHERPOINT_SMBUS	0x1e22
+#define PCI_DEVICE_ID_INTEL_AVOTON_SMBUS	0x1f3c
 #define PCI_DEVICE_ID_INTEL_DH89XXCC_SMBUS	0x2330
 #define PCI_DEVICE_ID_INTEL_5_3400_SERIES_SMBUS	0x3b30
 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_SMBUS	0x8c22
@@ -639,6 +641,7 @@
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_PANTHERPOINT_SMBUS) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_LYNXPOINT_SMBUS) },
 	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_SMBUS) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_AVOTON_SMBUS) },
 	{ 0, }
 };
 
diff -ur a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig
--- a/drivers/i2c/busses/Kconfig	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/i2c/busses/Kconfig	2014-01-21 09:37:05.000000000 +0100
@@ -105,6 +105,7 @@
 	    Panther Point (PCH)
 	    Lynx Point (PCH)
 	    Lynx Point-LP (PCH)
+	    Avoton (SOC)
 
 	  This driver can also be built as a module.  If so, the module
 	  will be called i2c-i801.
@@ -310,6 +311,16 @@
 	  This driver can also be built as a module.  If so, the module
 	  will be called i2c-au1550.
 
+config I2C_COMCERTO
+	tristate "Comcerto I2C interface"
+	depends on I2C && (ARCH_COMCERTO) && SYNO_COMCERTO
+	help
+	  If you say yes to this option, support will be included for the
+	  Comcerto I2C interface.
+
+	  This driver can also be built as a module.  If so, the module
+	  will be called i2c-comcerto.
+
 config I2C_BLACKFIN_TWI
 	tristate "Blackfin TWI I2C support"
 	depends on BLACKFIN
diff -ur a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile
--- a/drivers/i2c/busses/Makefile	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/i2c/busses/Makefile	2014-01-21 09:37:05.000000000 +0100
@@ -31,6 +31,7 @@
 obj-$(CONFIG_I2C_AT91)		+= i2c-at91.o
 obj-$(CONFIG_I2C_AU1550)	+= i2c-au1550.o
 obj-$(CONFIG_I2C_BLACKFIN_TWI)	+= i2c-bfin-twi.o
+obj-$(CONFIG_I2C_COMCERTO)	+= i2c-comcerto.o
 obj-$(CONFIG_I2C_CPM)		+= i2c-cpm.o
 obj-$(CONFIG_I2C_DAVINCI)	+= i2c-davinci.o
 obj-$(CONFIG_I2C_DESIGNWARE_CORE)	+= i2c-designware-core.o
Only in b/drivers/i2c: chips.
diff -ur a/drivers/i2c/Kconfig b/drivers/i2c/Kconfig
--- a/drivers/i2c/Kconfig	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/i2c/Kconfig	2014-01-21 09:37:05.000000000 +0100
@@ -51,7 +51,7 @@
 	  programs use the I2C bus.  Information on how to do this is
 	  contained in the file <file:Documentation/i2c/dev-interface>.
 
-	  This support is also available as a module.  If so, the module 
+	  This support is also available as a module.  If so, the module
 	  will be called i2c-dev.
 
 config I2C_MUX
@@ -93,6 +93,7 @@
 
 source drivers/i2c/algos/Kconfig
 source drivers/i2c/busses/Kconfig
+source drivers/i2c/chips/Kconfig
 
 config I2C_DEBUG_CORE
 	bool "I2C Core debugging messages"
diff -ur a/drivers/i2c/Makefile b/drivers/i2c/Makefile
--- a/drivers/i2c/Makefile	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/i2c/Makefile	2014-01-21 09:37:05.000000000 +0100
@@ -7,7 +7,11 @@
 obj-$(CONFIG_I2C_SMBUS)		+= i2c-smbus.o
 obj-$(CONFIG_I2C_CHARDEV)	+= i2c-dev.o
 obj-$(CONFIG_I2C_MUX)		+= i2c-mux.o
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+obj-y				+= algos/ busses/ chips/ muxes/
+else
 obj-y				+= algos/ busses/ muxes/
+endif
 
 ccflags-$(CONFIG_I2C_DEBUG_CORE) := -DDEBUG
 CFLAGS_i2c-core.o := -Wno-deprecated-declarations
Only in b/drivers/iommu: amd_iommu_v2.c.
diff -ur a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
--- a/drivers/iommu/Kconfig	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/iommu/Kconfig	2014-01-21 09:37:05.000000000 +0100
@@ -58,6 +58,14 @@
 	  information to userspace via debugfs.
 	  If unsure, say N.
 
+config AMD_IOMMU_V2
+	tristate "AMD IOMMU Version 2 driver (EXPERIMENTAL)"
+	depends on AMD_IOMMU && EXPERIMENTAL
+	---help---
+	  This option enables support for the AMD IOMMUv2 features of the IOMMU
+	  hardware. Select this option if you want to use devices that support
+	  the PCI PRI and PASID interface.
+
 # Intel IOMMU support
 config DMAR_TABLE
 	bool
diff -ur a/drivers/iommu/Makefile b/drivers/iommu/Makefile
--- a/drivers/iommu/Makefile	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/iommu/Makefile	2014-01-21 09:37:05.000000000 +0100
@@ -1,6 +1,7 @@
 obj-$(CONFIG_IOMMU_API) += iommu.o
 obj-$(CONFIG_MSM_IOMMU) += msm_iommu.o msm_iommu_dev.o
 obj-$(CONFIG_AMD_IOMMU) += amd_iommu.o amd_iommu_init.o
+obj-$(CONFIG_AMD_IOMMU_V2) += amd_iommu_v2.o
 obj-$(CONFIG_DMAR_TABLE) += dmar.o
 obj-$(CONFIG_INTEL_IOMMU) += iova.o intel-iommu.o
 obj-$(CONFIG_IRQ_REMAP) += intr_remapping.o
diff -ur a/drivers/Kconfig b/drivers/Kconfig
--- a/drivers/Kconfig	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/Kconfig	2014-01-21 09:36:57.000000000 +0100
@@ -52,6 +52,8 @@
 
 source "drivers/spi/Kconfig"
 
+source "drivers/spi2/Kconfig"
+
 source "drivers/pps/Kconfig"
 
 source "drivers/ptp/Kconfig"
diff -ur a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
--- a/drivers/lguest/lguest_device.c	2013-08-24 11:37:15.000000000 +0200
+++ b/drivers/lguest/lguest_device.c	2014-02-17 11:57:54.000000000 +0100
@@ -292,10 +292,12 @@
 
 	/*
 	 * OK, tell virtio_ring.c to set up a virtqueue now we know its size
-	 * and we've got a pointer to its pages.
+	 * and we've got a pointer to its pages.  Note that we set weak_barriers
+	 * to 'true': the host is just a(nother) SMP CPU, so we only need inter-cpu
+	 * barriers.
 	 */
-	vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN,
-				 vdev, lvq->pages, lg_notify, callback, name);
+	vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, vdev,
+				 true, lvq->pages, lg_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
 		goto unmap;
diff -ur a/drivers/Makefile b/drivers/Makefile
--- a/drivers/Makefile	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/Makefile	2014-01-21 09:36:57.000000000 +0100
@@ -53,6 +53,7 @@
 obj-$(CONFIG_TARGET_CORE)	+= target/
 obj-$(CONFIG_MTD)		+= mtd/
 obj-$(CONFIG_SPI)		+= spi/
+obj-$(CONFIG_SPI2)		+= spi2/
 obj-y				+= net/
 obj-$(CONFIG_ATM)		+= atm/
 obj-$(CONFIG_FUSION)		+= message/
diff -ur a/drivers/md/dm-io.c b/drivers/md/dm-io.c
--- a/drivers/md/dm-io.c	2013-08-24 11:37:06.000000000 +0200
+++ b/drivers/md/dm-io.c	2014-02-17 11:57:41.000000000 +0100
@@ -40,6 +40,9 @@
 	void *context;
 	void *vma_invalidate_address;
 	unsigned long vma_invalidate_size;
+#ifdef MY_ABC_HERE
+	int is_return_err;
+#endif
 } __attribute__((aligned(DM_IO_MAX_REGIONS)));
 
 static struct kmem_cache *_dm_io_cache;
@@ -318,6 +321,11 @@
 		bio->bi_bdev = where->bdev;
 		bio->bi_end_io = endio;
 		bio->bi_destructor = dm_bio_destructor;
+#ifdef MY_ABC_HERE
+		if (1 == io->is_return_err) {
+			set_bit(BIO_MD_RETURN_ERROR, &bio->bi_flags);
+		}
+#endif
 		store_io_and_region_in_bio(bio, io, region);
 
 		if (rw & REQ_DISCARD) {
@@ -372,9 +380,15 @@
 	dec_count(io, 0, 0);
 }
 
+#ifdef MY_ABC_HERE
+static int sync_io(struct dm_io_client *client, unsigned int num_regions,
+		   struct dm_io_region *where, int rw, struct dpages *dp,
+		   unsigned long *error_bits, int is_return_err)
+#else
 static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 		   struct dm_io_region *where, int rw, struct dpages *dp,
 		   unsigned long *error_bits)
+#endif
 {
 	/*
 	 * gcc <= 4.3 can't do the alignment for stack variables, so we must
@@ -397,6 +411,9 @@
 
 	io->vma_invalidate_address = dp->vma_invalidate_address;
 	io->vma_invalidate_size = dp->vma_invalidate_size;
+#ifdef MY_ABC_HERE
+	io->is_return_err = is_return_err;
+#endif
 
 	dispatch_io(rw, num_regions, where, dp, io, 1);
 
@@ -416,9 +433,15 @@
 	return io->error_bits ? -EIO : 0;
 }
 
+#ifdef MY_ABC_HERE
+static int async_io(struct dm_io_client *client, unsigned int num_regions,
+		    struct dm_io_region *where, int rw, struct dpages *dp,
+		    io_notify_fn fn, void *context, int is_return_err)
+#else
 static int async_io(struct dm_io_client *client, unsigned int num_regions,
 		    struct dm_io_region *where, int rw, struct dpages *dp,
 		    io_notify_fn fn, void *context)
+#endif
 {
 	struct io *io;
 
@@ -435,6 +458,9 @@
 	io->client = client;
 	io->callback = fn;
 	io->context = context;
+#ifdef MY_ABC_HERE
+	io->is_return_err = is_return_err;
+#endif
 
 	io->vma_invalidate_address = dp->vma_invalidate_address;
 	io->vma_invalidate_size = dp->vma_invalidate_size;
@@ -479,6 +505,27 @@
 
 	return 0;
 }
+#ifdef MY_ABC_HERE
+int syno_dm_io(struct dm_io_request *io_req, unsigned num_regions,
+	  struct dm_io_region *where, unsigned long *sync_error_bits)
+{
+	int r;
+	struct dpages dp;
+	int is_return_err = 1;
+
+	r = dp_init(io_req, &dp, (unsigned long)where->count << SECTOR_SHIFT);
+	if (r)
+		return r;
+
+	/* XXX: set is_return_err = 1 */
+	if (!io_req->notify.fn)
+		return sync_io(io_req->client, num_regions, where,
+			       io_req->bi_rw, &dp, sync_error_bits, is_return_err);
+	return async_io(io_req->client, num_regions, where, io_req->bi_rw,
+			&dp, io_req->notify.fn, io_req->notify.context, is_return_err);
+}
+EXPORT_SYMBOL(syno_dm_io);
+#endif
 
 /*
  * New collapsed (a)synchronous interface.
@@ -498,12 +545,21 @@
 	if (r)
 		return r;
 
+#ifdef MY_ABC_HERE
+	if (!io_req->notify.fn)
+		return sync_io(io_req->client, num_regions, where,
+			       io_req->bi_rw, &dp, sync_error_bits, 0);
+	return async_io(io_req->client, num_regions, where, io_req->bi_rw,
+			&dp, io_req->notify.fn, io_req->notify.context, 0);
+#else
 	if (!io_req->notify.fn)
 		return sync_io(io_req->client, num_regions, where,
 			       io_req->bi_rw, &dp, sync_error_bits);
 
 	return async_io(io_req->client, num_regions, where, io_req->bi_rw,
 			&dp, io_req->notify.fn, io_req->notify.context);
+#endif
+
 }
 EXPORT_SYMBOL(dm_io);
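The new syno_dm_io() export mirrors dm_io() but issues its bios with BIO_MD_RETURN_ERROR set, so the md layer can fail the request instead of completing it silently once a RAID member has no live disks left. A minimal, hypothetical caller sketch follows; it assumes the syno_dm_io() declaration that the rest of this patch is expected to add to <linux/dm-io.h>, and the function and buffer names are illustrative only, not part of the patch:

#include <linux/fs.h>
#include <linux/dm-io.h>

/* Synchronously read 4 KiB from 'sector' of 'bdev' into the kernel buffer
 * 'buf'. Through syno_dm_io() a fully degraded RAID0/1 member reports an
 * error instead of pretending the read succeeded. */
static int example_sync_read(struct dm_io_client *client,
			     struct block_device *bdev,
			     sector_t sector, void *buf)
{
	unsigned long err_bits = 0;
	struct dm_io_region where = {
		.bdev   = bdev,
		.sector = sector,
		.count  = 8,			/* 8 sectors = 4 KiB */
	};
	struct dm_io_request req = {
		.bi_rw        = READ,
		.mem.type     = DM_IO_KMEM,
		.mem.ptr.addr = buf,
		.notify.fn    = NULL,		/* NULL notify => synchronous I/O */
		.client       = client,
	};

	return syno_dm_io(&req, 1, &where, &err_bits);
}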
 
diff -ur a/drivers/md/md.c b/drivers/md/md.c
--- a/drivers/md/md.c	2013-08-24 11:37:06.000000000 +0200
+++ b/drivers/md/md.c	2014-02-17 11:57:41.000000000 +0100
@@ -367,7 +367,6 @@
 #ifdef MY_ABC_HERE
 	unsigned char blActive = 1;
 #endif
-
 	if (mddev == NULL || mddev->pers == NULL
 	    || !mddev->ready) {
 		bio_io_error(bio);
@@ -6288,69 +6287,6 @@
 }
 EXPORT_SYMBOL(md_set_array_sectors);
 
-#ifdef MY_ABC_HERE
-/*
- * Duplicate from mdadm/mdadm-3.1.4/super1.c
- * choose an appropriate space for bitmap.
- */
-static unsigned long choose_bm_space(unsigned long long devsize)
-{
-	/* if the device is bigger than 8Gig, save 64k for bitmap usage,
-	 * if bigger than 200Gig, save 128k
-	 * NOTE: result must be multiple of 4K else bad things happen
-	 * on 4K-sector devices.
-	 */
-	if (devsize < 64*2) return 0;
-	if (devsize - 64*2 >= 200*1024*1024*2)
-		return 128*2;
-	if (devsize - 4*2 > 8*1024*1024*2)
-		return 64*2;
-	return 4*2;
-}
-
-/*
- * Examine the max device size that can used for data and aligned with
- * 64KB chunk size.
- */
-static sector_t max_avail_data_size(struct md_rdev *rdev, int minor_version)
-{
-	unsigned long long avail = 0;
-	unsigned long long devsize = i_size_read(rdev->bdev->bd_inode) / 512;
-
-	if (devsize < 24)
-		return 0;
-
-	devsize -= choose_bm_space(devsize);
-	if (minor_version > 1) {
-		if (devsize > 1024*1024*2)
-			devsize -= 1024*2;
-	}
-
-	switch(minor_version) {
-		case 0:
-			/* at end */
-			avail = ((devsize - 8*2 ) & ~(4*2-1));
-			break;
-		case 1:
-			/* at start, 4K for superblock and possible bitmap */
-			avail = devsize - 4*2;
-			break;
-		case 2:
-			/* 4k from start, 4K for superblock and possible bitmap */
-			avail = devsize - (4+4)*2;
-			break;
-		default:
-			return 0;
-	}
-
-	//aligned with 64 KB chunk
-	if (avail) {
-		avail &= ~(unsigned long long)(0x0000007fULL);
-	}
-	return (sector_t)avail;
-}
-#endif
-
 static int update_size(struct mddev *mddev, sector_t num_sectors)
 {
 	struct md_rdev *rdev;
@@ -6378,11 +6314,6 @@
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		sector_t avail = rdev->sectors;
 
-#ifdef MY_ABC_HERE
-		if (fit) {
-			avail = max_avail_data_size(rdev, mddev->minor_version);
-		}
-#endif
 		if (fit && (num_sectors == 0 || num_sectors > avail))
 			num_sectors = avail;
 		if (avail < num_sectors)
@@ -8776,6 +8707,7 @@
 
 	if ((ret = register_blkdev(0, "mdp")) < 0)
 		goto err_mdp;
+
 	mdp_major = ret;
 
 	blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
@@ -8859,7 +8791,10 @@
 		goto END;
 	}
 	// this disk are not going to be used as system disk
-#ifdef MY_ABC_HERE
+#ifdef CONFIG_SYNO_KVMX64
+	/* CONFIG_SYNO_KVMX64 */
+	/* XXX TODO FIXME: ugly hack, to be removed for production */
+#else
 	if (!(pBDev->bd_disk->systemDisk)) {
 		goto END;
 	}
diff -ur a/drivers/md/raid0.c b/drivers/md/raid0.c
--- a/drivers/md/raid0.c	2013-08-24 11:37:06.000000000 +0200
+++ b/drivers/md/raid0.c	2014-02-17 11:57:41.000000000 +0100
@@ -569,6 +569,19 @@
 	}
 }
 
+#ifdef MY_ABC_HERE
+static void syno_flashcache_return_error(struct bio *bio)
+{
+	/* defined in blk_types.h */
+	if (bio_flagged(bio, BIO_MD_RETURN_ERROR)) {
+		printk(KERN_DEBUG "Got flashcache read error, returning error code\n");
+		bio_endio(bio, 1);
+	} else {
+		bio_endio(bio, 0);
+	}
+}
+#endif
+
 static void raid0_make_request(struct mddev *mddev, struct bio *bio)
 {
 	unsigned int chunk_sects;
@@ -594,7 +607,11 @@
 #else
 	if (mddev->degraded) {
 #endif
+#ifdef  MY_ABC_HERE
+		syno_flashcache_return_error(bio);
+#else
 		bio_endio(bio, 0);
+#endif
 		return;
 	}
 #endif
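syno_flashcache_return_error() only fires for bios that carry the new BIO_MD_RETURN_ERROR flag; all other bios keep the old behaviour of completing successfully on a dead array. As a rough sketch (assuming the BIO_MD_RETURN_ERROR bit this patch adds to include/linux/blk_types.h, per the comment above), a submitter that wants the failure reported back would flag the bio before handing it to md; names here are illustrative:

#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/blk_types.h>

/* Mark a read bio so raid0/raid1 make_request() ends it with an error
 * (instead of silent success) when no member disks remain. */
static void example_submit_checked_read(struct bio *bio)
{
	set_bit(BIO_MD_RETURN_ERROR, &bio->bi_flags);
	submit_bio(READ, bio);
}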
diff -ur a/drivers/md/raid10.c b/drivers/md/raid10.c
--- a/drivers/md/raid10.c	2013-08-24 11:37:06.000000000 +0200
+++ b/drivers/md/raid10.c	2014-02-17 11:57:41.000000000 +0100
@@ -2459,7 +2459,10 @@
 	rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
 
 	bio = r10_bio->devs[slot].bio;
+#ifdef MY_ABC_HERE
+#else
 	bdevname(bio->bi_bdev, b);
+#endif
 	r10_bio->devs[slot].bio =
 		mddev->ro ? IO_BLOCKED : NULL;
 read_more:
@@ -2473,11 +2476,19 @@
 					(unsigned long long)r10_bio->sector);
 		}else
 #endif
+#ifdef MY_ABC_HERE
+		printk(KERN_ALERT "md/raid10:%s: unrecoverable I/O"
+		       " read error for block %llu\n",
+		       mdname(mddev),
+		       (unsigned long long)r10_bio->sector);
+		raid_end_bio_io(r10_bio);
+#else
 		printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
 		       " read error for block %llu\n",
 		       mdname(mddev), b,
 		       (unsigned long long)r10_bio->sector);
 		raid_end_bio_io(r10_bio);
+#endif
 		bio_put(bio);
 		return;
 	}
diff -ur a/drivers/md/raid1.c b/drivers/md/raid1.c
--- a/drivers/md/raid1.c	2013-08-24 11:37:06.000000000 +0200
+++ b/drivers/md/raid1.c	2014-02-17 11:57:41.000000000 +0100
@@ -920,6 +920,19 @@
 	pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 }
 
+#ifdef MY_ABC_HERE
+static void syno_flashcache_return_error(struct bio *bio)
+{
+	/* defined in blk_types.h */
+	if (bio_flagged(bio, BIO_MD_RETURN_ERROR)) {
+		printk(KERN_DEBUG "Got flashcache read error, returning error code\n");
+		bio_endio(bio, 1);
+	} else {
+		bio_endio(bio, 0);
+	}
+}
+#endif
+
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r1conf *conf = mddev->private;
@@ -947,7 +960,12 @@
 	if (0 == conf->raid_disks - mddev->degraded) {
 #endif
 		/* when there are no any disk, just pass it */
+
+#ifdef  MY_ABC_HERE
+		syno_flashcache_return_error(bio);
+#else
 		bio_endio(bio, 0);
+#endif
 		return;
 	}
 #endif /* MY_ABC_HERE */
@@ -2313,7 +2331,10 @@
 		md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
 
 	bio = r1_bio->bios[r1_bio->read_disk];
+#ifdef MY_ABC_HERE
+#else
 	bdevname(bio->bi_bdev, b);
+#endif
 read_more:
 	disk = read_balance(conf, r1_bio, &max_sectors);
 	if (disk == -1) {
@@ -2325,10 +2346,17 @@
 					(unsigned long long)r1_bio->sector);
 		}else
 #endif
+#ifdef MY_ABC_HERE
+		printk(KERN_ALERT "md/raid1:%s: unrecoverable I/O"
+		       " read error for block %llu\n",
+		       mdname(mddev), (unsigned long long)r1_bio->sector);
+		raid_end_bio_io(r1_bio);
+#else
 		printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
 		       " read error for block %llu\n",
 		       mdname(mddev), b, (unsigned long long)r1_bio->sector);
 		raid_end_bio_io(r1_bio);
+#endif
 	} else {
 		const unsigned long do_sync
 			= r1_bio->master_bio->bi_rw & REQ_SYNC;
diff -ur a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
--- a/drivers/mfd/Kconfig	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/mfd/Kconfig	2014-01-21 09:37:09.000000000 +0100
@@ -666,6 +666,15 @@
 	  LPC bridge function of the Intel SCH provides support for
 	  System Management Bus and General Purpose I/O.
 
+config LPC_ICH
+	tristate "Intel ICH LPC"
+	depends on PCI
+	select MFD_CORE
+	help
+	  The LPC bridge function of the Intel ICH provides support for
+	  many functional units. This driver provides needed support for
+	  other drivers to control these functions, currently GPIO.
+
 config MFD_RDC321X
 	tristate "Support for RDC-R321x southbridge"
 	select MFD_CORE
Only in b/drivers/mfd: lpc_ich.c.
diff -ur a/drivers/mfd/Makefile b/drivers/mfd/Makefile
--- a/drivers/mfd/Makefile	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/mfd/Makefile	2014-01-21 09:37:09.000000000 +0100
@@ -91,6 +91,7 @@
 obj-$(CONFIG_MFD_TIMBERDALE)    += timberdale.o
 obj-$(CONFIG_PMIC_ADP5520)	+= adp5520.o
 obj-$(CONFIG_LPC_SCH)		+= lpc_sch.o
+obj-$(CONFIG_LPC_ICH)		+= lpc_ich.o
 obj-$(CONFIG_MFD_RDC321X)	+= rdc321x-southbridge.o
 obj-$(CONFIG_MFD_JANZ_CMODIO)	+= janz-cmodio.o
 obj-$(CONFIG_MFD_JZ4740_ADC)	+= jz4740-adc.o
diff -ur a/drivers/mtd/devices/doc2000.c b/drivers/mtd/devices/doc2000.c
--- a/drivers/mtd/devices/doc2000.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/devices/doc2000.c	2014-02-17 11:57:40.000000000 +0100
@@ -564,7 +564,7 @@
 	mtd->flags = MTD_CAP_NANDFLASH;
 	mtd->size = 0;
 	mtd->erasesize = 0;
-	mtd->writesize = 512;
+	mtd->writebufsize = mtd->writesize = 512;
 	mtd->oobsize = 16;
 	mtd->owner = THIS_MODULE;
 	mtd->erase = doc_erase;
diff -ur a/drivers/mtd/devices/doc2001.c b/drivers/mtd/devices/doc2001.c
--- a/drivers/mtd/devices/doc2001.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/devices/doc2001.c	2014-02-17 11:57:40.000000000 +0100
@@ -348,7 +348,7 @@
 	/* FIXME: erase size is not always 8KiB */
 	mtd->erasesize = 0x2000;
 
-	mtd->writesize = 512;
+	mtd->writebufsize = mtd->writesize = 512;
 	mtd->oobsize = 16;
 	mtd->owner = THIS_MODULE;
 	mtd->erase = doc_erase;
diff -ur a/drivers/mtd/devices/doc2001plus.c b/drivers/mtd/devices/doc2001plus.c
--- a/drivers/mtd/devices/doc2001plus.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/devices/doc2001plus.c	2014-02-17 11:57:40.000000000 +0100
@@ -470,7 +470,7 @@
 	mtd->size = 0;
 
 	mtd->erasesize = 0;
-	mtd->writesize = 512;
+	mtd->writebufsize = mtd->writesize = 512;
 	mtd->oobsize = 16;
 	mtd->owner = THIS_MODULE;
 	mtd->erase = doc_erase;
diff -ur a/drivers/mtd/devices/docg3.c b/drivers/mtd/devices/docg3.c
--- a/drivers/mtd/devices/docg3.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/devices/docg3.c	2014-02-17 11:57:40.000000000 +0100
@@ -962,7 +962,7 @@
 	mtd->flags = MTD_CAP_ROM;
 	mtd->size = (docg3->max_block + 1) * DOC_LAYOUT_BLOCK_SIZE;
 	mtd->erasesize = DOC_LAYOUT_BLOCK_SIZE * DOC_LAYOUT_NBPLANES;
-	mtd->writesize = DOC_LAYOUT_PAGE_SIZE;
+	mtd->writebufsize = mtd->writesize = DOC_LAYOUT_PAGE_SIZE;
 	mtd->oobsize = DOC_LAYOUT_OOB_SIZE;
 	mtd->owner = THIS_MODULE;
 	mtd->erase = NULL;
Only in b/drivers/mtd/devices: m25p80_mindspeed.c.
diff -ur a/drivers/mtd/devices/Makefile b/drivers/mtd/devices/Makefile
--- a/drivers/mtd/devices/Makefile	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/mtd/devices/Makefile	2014-01-21 09:37:10.000000000 +0100
@@ -16,7 +16,11 @@
 obj-$(CONFIG_MTD_LART)		+= lart.o
 obj-$(CONFIG_MTD_BLOCK2MTD)	+= block2mtd.o
 obj-$(CONFIG_MTD_DATAFLASH)	+= mtd_dataflash.o
+ifeq ($(CONFIG_SYNO_C2K_SPI_PARTITION),y)
+obj-$(CONFIG_MTD_M25P80)	+= m25p80_mindspeed.o
+else
 obj-$(CONFIG_MTD_M25P80)	+= m25p80.o
+endif
 obj-$(CONFIG_MTD_SST25L)	+= sst25l.o
 
-CFLAGS_docg3.o			+= -I$(src)
\ No newline at end of file
+CFLAGS_docg3.o			+= -I$(src)
diff -ur a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig
--- a/drivers/mtd/Kconfig	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/mtd/Kconfig	2014-01-21 09:37:10.000000000 +0100
@@ -23,6 +23,16 @@
 	  WARNING: some of the tests will ERASE entire MTD device which they
 	  test. Do not use these tests unless you really know what you do.
 
+config MTD_ROOTFS_ROOT_DEV
+	bool "Automatically set 'rootfs' partition to be root filesystem"
+	depends on SYNO_COMCERTO
+	default y
+
+config MTD_ROOTFS_SPLIT
+	bool "Automatically split 'rootfs' partition for squashfs"
+	depends on SYNO_COMCERTO
+	default y
+
 config MTD_REDBOOT_PARTS
 	tristate "RedBoot partition table parsing"
 	---help---
Only in b/drivers/mtd/maps: comcerto-nor.c.
diff -ur a/drivers/mtd/maps/Makefile b/drivers/mtd/maps/Makefile
--- a/drivers/mtd/maps/Makefile	2013-08-03 09:59:50.000000000 +0200
+++ b/drivers/mtd/maps/Makefile	2014-01-21 09:37:10.000000000 +0100
@@ -66,3 +66,4 @@
 	obj-$(CONFIG_MV_INCLUDE_SPI) += ../../../$(MACHINE)/flashmap.o
 endif
 endif
+obj-$(CONFIG_MTD_COMCERTO_NOR)  += comcerto-nor.o
diff -ur a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c
--- a/drivers/mtd/mtdchar.c	2013-08-24 11:37:06.000000000 +0200
+++ b/drivers/mtd/mtdchar.c	2014-02-17 11:57:40.000000000 +0100
@@ -1187,6 +1187,14 @@
 		break;
 	}
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	case MTDREFRESH:
+	{
+		ret = mtd_device_refresh(mtd);
+		break;
+	}
+#endif
+
 	default:
 		ret = -ENOTTY;
 	}
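The new MTDREFRESH ioctl simply forwards to mtd_device_refresh(), which re-runs the rootfs split logic added to mtdpart.c below. A hedged user-space sketch follows; it assumes the MTDREFRESH request code that the rest of this patch adds to the MTD user ABI header, and the device path and helper name are examples only:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <mtd/mtd-user.h>

/* Ask the kernel to re-detect the squashfs end and refresh the rootfs_data
 * split partition, e.g. after the rootfs image has been rewritten. */
static int example_mtd_refresh(const char *dev)	/* e.g. "/dev/mtd3" */
{
	int fd = open(dev, O_RDWR);
	int ret;

	if (fd < 0)
		return -1;
	ret = ioctl(fd, MTDREFRESH, NULL);
	close(fd);
	return ret;
}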
diff -ur a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c
--- a/drivers/mtd/mtdpart.c	2013-08-24 11:37:06.000000000 +0200
+++ b/drivers/mtd/mtdpart.c	2014-02-17 11:57:40.000000000 +0100
@@ -29,6 +29,10 @@
 #include <linux/kmod.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
+#if defined(CONFIG_SYNO_COMCERTO)
+#include <linux/root_dev.h>
+#include <linux/magic.h>
+#endif
 #include <linux/err.h>
 #ifdef MY_ABC_HERE
 #include <linux/rtnetlink.h>
@@ -50,6 +54,10 @@
 
 #include "mtdcore.h"
 
+#if defined(CONFIG_SYNO_COMCERTO)
+#define MTD_ERASE_PARTIAL	0x8000 /* partition only covers parts of an erase block */
+#endif
+
 /* Our partition linked list */
 static LIST_HEAD(mtd_partitions);
 static DEFINE_MUTEX(mtd_partitions_mutex);
@@ -67,7 +75,9 @@
  * the pointer to that structure with this macro.
  */
 #define PART(x)  ((struct mtd_part *)(x))
-
+#if defined(CONFIG_SYNO_COMCERTO)
+#define IS_PART(mtd) (mtd->read == part_read)
+#endif
 
 /*
  * MTD methods which simply translate the effective address and pass through
@@ -273,12 +283,62 @@
 		return -EROFS;
 	if (instr->addr >= mtd->size)
 		return -EINVAL;
+
+#if defined(CONFIG_SYNO_COMCERTO)
+	instr->partial_start = false;
+	if (mtd->flags & MTD_ERASE_PARTIAL) {
+		size_t readlen = 0;
+		u64 mtd_ofs;
+
+		instr->erase_buf = kmalloc(part->master->erasesize, GFP_ATOMIC);
+		if (!instr->erase_buf)
+			return -ENOMEM;
+
+		mtd_ofs = part->offset + instr->addr;
+		instr->erase_buf_ofs = do_div(mtd_ofs, part->master->erasesize);
+
+		if (instr->erase_buf_ofs > 0) {
+			instr->addr -= instr->erase_buf_ofs;
+			ret = part->master->read(part->master,
+				instr->addr + part->offset,
+				part->master->erasesize,
+				&readlen, instr->erase_buf);
+
+			instr->partial_start = true;
+		} else {
+			mtd_ofs = part->offset + part->mtd.size;
+			instr->erase_buf_ofs = part->master->erasesize -
+				do_div(mtd_ofs, part->master->erasesize);
+
+			if (instr->erase_buf_ofs > 0) {
+				instr->len += instr->erase_buf_ofs;
+				ret = part->master->read(part->master,
+					part->offset + instr->addr +
+					instr->len - part->master->erasesize,
+					part->master->erasesize, &readlen,
+					instr->erase_buf);
+			} else {
+				ret = 0;
+			}
+		}
+		if (ret < 0) {
+			kfree(instr->erase_buf);
+			return ret;
+		}
+
+	}
+#endif
+
 	instr->addr += part->offset;
 	ret = part->master->erase(part->master, instr);
 	if (ret) {
 		if (instr->fail_addr != MTD_FAIL_ADDR_UNKNOWN)
 			instr->fail_addr -= part->offset;
 		instr->addr -= part->offset;
+#if defined(CONFIG_SYNO_COMCERTO)
+		if (mtd->flags & MTD_ERASE_PARTIAL)
+			kfree(instr->erase_buf);
+#endif
 	}
 	return ret;
 }
@@ -287,7 +347,29 @@
 {
 	if (instr->mtd->erase == part_erase) {
 		struct mtd_part *part = PART(instr->mtd);
+#if defined(CONFIG_SYNO_COMCERTO)
+		size_t wrlen = 0;
+#endif
 
+#if defined(CONFIG_SYNO_COMCERTO)
+		if (instr->mtd->flags & MTD_ERASE_PARTIAL) {
+			if (instr->partial_start) {
+				part->master->write(part->master,
+					instr->addr, instr->erase_buf_ofs,
+					&wrlen, instr->erase_buf);
+				instr->addr += instr->erase_buf_ofs;
+			} else {
+				instr->len -= instr->erase_buf_ofs;
+				part->master->write(part->master,
+					instr->addr + instr->len,
+					instr->erase_buf_ofs, &wrlen,
+					instr->erase_buf +
+					part->master->erasesize -
+					instr->erase_buf_ofs);
+			}
+			kfree(instr->erase_buf);
+		}
+#endif
 		if (instr->fail_addr != MTD_FAIL_ADDR_UNKNOWN)
 			instr->fail_addr -= part->offset;
 		instr->addr -= part->offset;
@@ -558,18 +640,41 @@
 	if ((slave->mtd.flags & MTD_WRITEABLE) &&
 	    mtd_mod_by_eb(slave->offset, &slave->mtd)) {
 		/* Doesn't start on a boundary of major erase size */
+#if defined(CONFIG_SYNO_COMCERTO)
+		slave->mtd.flags |= MTD_ERASE_PARTIAL;
+		if (((u32) slave->mtd.size) > master->erasesize)
+			slave->mtd.flags &= ~MTD_WRITEABLE;
+		else
+			slave->mtd.erasesize = slave->mtd.size;
+#else
 		/* FIXME: Let it be writable if it is on a boundary of
 		 * _minor_ erase size though */
 		slave->mtd.flags &= ~MTD_WRITEABLE;
 		printk(KERN_WARNING"mtd: partition \"%s\" doesn't start on an erase block boundary -- force read-only\n",
 			part->name);
+#endif
 	}
 	if ((slave->mtd.flags & MTD_WRITEABLE) &&
+#if defined(CONFIG_SYNO_COMCERTO)
+	    mtd_mod_by_eb(slave->offset + slave->mtd.size, &slave->mtd)) {
+		slave->mtd.flags |= MTD_ERASE_PARTIAL;
+
+		if ((u32) slave->mtd.size > master->erasesize)
+			slave->mtd.flags &= ~MTD_WRITEABLE;
+		else
+			slave->mtd.erasesize = slave->mtd.size;
+#else
 	    mtd_mod_by_eb(slave->mtd.size, &slave->mtd)) {
 		slave->mtd.flags &= ~MTD_WRITEABLE;
 		printk(KERN_WARNING"mtd: partition \"%s\" doesn't end on an erase block -- force read-only\n",
 			part->name);
+#endif
 	}
+#if defined(CONFIG_SYNO_COMCERTO)
+	if ((slave->mtd.flags & (MTD_ERASE_PARTIAL|MTD_WRITEABLE)) == MTD_ERASE_PARTIAL)
+		printk(KERN_WARNING"mtd: partition \"%s\" must either start or end on an erase block boundary or be smaller than an erase block -- forcing read-only\n",
+				part->name);
+#endif
 
 	slave->mtd.ecclayout = master->ecclayout;
 	if (master->block_isbad) {
@@ -805,6 +910,155 @@
 }
 EXPORT_SYMBOL_GPL(mtd_del_partition);
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_MTD_ROOTFS_SPLIT)
+#define ROOTFS_SPLIT_NAME "rootfs_data"
+#define ROOTFS_REMOVED_NAME "<removed>"
+
+struct squashfs_super_block {
+	__le32 s_magic;
+	__le32 pad0[9];
+	__le64 bytes_used;
+};
+
+
+static int split_squashfs(struct mtd_info *master, int offset, int *split_offset)
+{
+	struct squashfs_super_block sb;
+	int len, ret;
+
+	ret = master->read(master, offset, sizeof(sb), &len, (void *) &sb);
+	if (ret || (len != sizeof(sb))) {
+		printk(KERN_ALERT "split_squashfs: error occurred while reading "
+			"from \"%s\"\n", master->name);
+		return -EINVAL;
+	}
+
+	if (SQUASHFS_MAGIC != le32_to_cpu(sb.s_magic) ) {
+		printk(KERN_ALERT "split_squashfs: no squashfs found in \"%s\"\n",
+			master->name);
+		*split_offset = 0;
+		return 0;
+	}
+
+	if (le64_to_cpu((sb.bytes_used)) <= 0) {
+		printk(KERN_ALERT "split_squashfs: squashfs is empty in \"%s\"\n",
+			master->name);
+		*split_offset = 0;
+		return 0;
+	}
+
+	len = (u32) le64_to_cpu(sb.bytes_used);
+	len += (offset & 0x000fffff);
+	len +=  (master->erasesize - 1);
+	len &= ~(master->erasesize - 1);
+	len -= (offset & 0x000fffff);
+	*split_offset = offset + len;
+
+	return 0;
+}
+
+static int split_rootfs_data(struct mtd_info *master, struct mtd_info *rpart, const struct mtd_partition *part)
+{
+	struct mtd_partition *dpart;
+	struct mtd_part *slave = NULL;
+	struct mtd_part *spart;
+	int ret, split_offset = 0;
+
+	spart = PART(rpart);
+	ret = split_squashfs(master, spart->offset, &split_offset);
+	if (ret)
+		return ret;
+
+	if (split_offset <= 0)
+		return 0;
+
+	dpart = kmalloc(sizeof(*part)+sizeof(ROOTFS_SPLIT_NAME)+1, GFP_KERNEL);
+	if (dpart == NULL) {
+		printk(KERN_INFO "split_squashfs: no memory for partition \"%s\"\n",
+			ROOTFS_SPLIT_NAME);
+		return -ENOMEM;
+	}
+
+	memcpy(dpart, part, sizeof(*part));
+	dpart->name = (unsigned char *)&dpart[1];
+	strcpy(dpart->name, ROOTFS_SPLIT_NAME);
+
+	dpart->size = rpart->size - (split_offset - spart->offset);
+	dpart->offset = split_offset;
+
+	if (dpart == NULL)
+		return 1;
+
+	printk(KERN_INFO "mtd: partition \"%s\" created automatically, ofs=%llX, len=%llX \n",
+		ROOTFS_SPLIT_NAME, dpart->offset, dpart->size);
+
+	slave = allocate_partition(master, dpart, 0, split_offset);
+	if (IS_ERR(slave))
+		return PTR_ERR(slave);
+	mutex_lock(&mtd_partitions_mutex);
+	list_add(&slave->list, &mtd_partitions);
+	mutex_unlock(&mtd_partitions_mutex);
+
+	add_mtd_device(&slave->mtd);
+
+	rpart->split = &slave->mtd;
+
+	return 0;
+}
+
+static int refresh_rootfs_split(struct mtd_info *mtd)
+{
+	struct mtd_partition tpart;
+	struct mtd_part *part;
+	char *name;
+	//int index = 0;
+	int offset, size;
+	int ret;
+
+	part = PART(mtd);
+
+	/* check for the new squashfs offset first */
+	ret = split_squashfs(part->master, part->offset, &offset);
+	if (ret)
+		return ret;
+
+	if ((offset > 0) && !mtd->split) {
+		printk(KERN_INFO "%s: creating new split partition for \"%s\"\n", __func__, mtd->name);
+		/* if we don't have a rootfs split partition, create a new one */
+		tpart.name = (char *) mtd->name;
+		tpart.size = mtd->size;
+		tpart.offset = part->offset;
+
+		return split_rootfs_data(part->master, &part->mtd, &tpart);
+	} else if ((offset > 0) && mtd->split) {
+		/* update the offsets of the existing partition */
+		size = mtd->size + part->offset - offset;
+
+		part = PART(mtd->split);
+		part->offset = offset;
+		part->mtd.size = size;
+		printk(KERN_INFO "%s: %s partition \"" ROOTFS_SPLIT_NAME "\", offset: 0x%06x (0x%06x)\n",
+			__func__, (!strcmp(part->mtd.name, ROOTFS_SPLIT_NAME) ? "updating" : "creating"),
+			(u32) part->offset, (u32) part->mtd.size);
+		name = kmalloc(sizeof(ROOTFS_SPLIT_NAME) + 1, GFP_KERNEL);
+		strcpy(name, ROOTFS_SPLIT_NAME);
+		part->mtd.name = name;
+	} else if ((offset <= 0) && mtd->split) {
+		printk(KERN_INFO "%s: removing partition \"%s\"\n", __func__, mtd->split->name);
+
+		/* mark existing partition as removed */
+		part = PART(mtd->split);
+		name = kmalloc(sizeof(ROOTFS_SPLIT_NAME) + 1, GFP_KERNEL);
+		strcpy(name, ROOTFS_REMOVED_NAME);
+		part->mtd.name = name;
+		part->offset = 0;
+		part->mtd.size = 0;
+	}
+
+	return 0;
+}
+#endif /* CONFIG_SYNO_COMCERTO && CONFIG_MTD_ROOTFS_SPLIT */
+
 /*
  * This function, given a master MTD object and a partition table, creates
  * and registers slave MTD objects which are bound to the master according to
@@ -821,6 +1075,9 @@
 	struct mtd_part *slave;
 	uint64_t cur_offset = 0;
 	int i;
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_MTD_ROOTFS_SPLIT)
+	int ret;
+#endif
 
 	printk(KERN_NOTICE "Creating %d MTD partitions on \"%s\":\n", nbparts, master->name);
 
@@ -835,12 +1092,56 @@
 
 		add_mtd_device(&slave->mtd);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+		if (!strcmp(parts[i].name, "rootfs")) {
+#ifdef CONFIG_MTD_ROOTFS_ROOT_DEV
+			if (ROOT_DEV == 0) {
+				printk(KERN_NOTICE "mtd: partition \"rootfs\" "
+					"set to be root filesystem\n");
+				ROOT_DEV = MKDEV(MTD_BLOCK_MAJOR, slave->mtd.index);
+			}
+#endif
+#ifdef CONFIG_MTD_ROOTFS_SPLIT
+			ret = split_rootfs_data(master, &slave->mtd, &parts[i]);
+			/* if (ret == 0)
+			 * 	j++; */
+#endif
+		}
+#endif
 		cur_offset = slave->offset + slave->mtd.size;
 	}
 
 	return 0;
 }
 
+#if defined(CONFIG_SYNO_COMCERTO)
+int mtd_device_refresh(struct mtd_info *mtd)
+{
+	int ret = 0;
+
+	if (IS_PART(mtd)) {
+		struct mtd_part *part;
+		struct mtd_info *master;
+
+		part = PART(mtd);
+		master = part->master;
+		if (master->refresh_device)
+			ret = master->refresh_device(master);
+	}
+
+	if (!ret && mtd->refresh_device)
+		ret = mtd->refresh_device(mtd);
+
+#ifdef CONFIG_MTD_ROOTFS_SPLIT
+	if (!ret && IS_PART(mtd) && !strcmp(mtd->name, "rootfs"))
+		refresh_rootfs_split(mtd);
+#endif
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mtd_device_refresh);
+#endif
+
 static DEFINE_SPINLOCK(part_parser_lock);
 static LIST_HEAD(part_parsers);
 
@@ -939,7 +1240,7 @@
 	return ret;
 }
 
-int mtd_is_partition(struct mtd_info *mtd)
+int mtd_is_partition(const struct mtd_info *mtd)
 {
 	struct mtd_part *part;
 	int ispart = 0;
@@ -956,6 +1257,16 @@
 }
 EXPORT_SYMBOL_GPL(mtd_is_partition);
 
+/* Returns the size of the entire flash chip */
+uint64_t mtd_get_device_size(const struct mtd_info *mtd)
+{
+	if (!mtd_is_partition(mtd))
+		return mtd->size;
+
+	return PART(mtd)->master->size;
+}
+EXPORT_SYMBOL_GPL(mtd_get_device_size);
+
 #ifdef MY_ABC_HERE
 int SYNOMTDModifyPartInfo(struct mtd_info *mtd, unsigned long offset, unsigned long length)
 {
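split_squashfs() above places the automatically created rootfs_data partition on the first erase-block boundary past the end of the squashfs image. A small stand-alone model of that rounding, with example numbers (the helper name and the values are illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Mirror of the offset math in split_squashfs(): round the squashfs length
 * up to a whole erase block, measured from the image start within its 1 MiB
 * window, and return where rootfs_data would begin. */
static uint32_t rootfs_data_offset(uint32_t offset, uint32_t bytes_used,
				   uint32_t erasesize)
{
	uint32_t len = bytes_used;

	len += offset & 0x000fffff;
	len += erasesize - 1;
	len &= ~(erasesize - 1);
	len -= offset & 0x000fffff;

	return offset + len;
}

int main(void)
{
	/* 0x152000-byte squashfs at 0xa0000 with 128 KiB erase blocks:
	 * rootfs_data starts at 0x200000. */
	printf("0x%x\n", rootfs_data_offset(0xa0000, 0x152000, 0x20000));
	return 0;
}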
Only in b/drivers/mtd/nand: comcerto_nand.c.
diff -ur a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
--- a/drivers/mtd/nand/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/mtd/nand/Kconfig	2014-01-21 09:37:10.000000000 +0100
@@ -46,6 +46,18 @@
 	  ECC codes. They are used with NAND devices requiring more than 1 bit
 	  of error correction.
 
+config NAND_COMCERTO_ECC_8_HW_BCH
+	bool
+	depends on MTD_NAND_COMCERTO && SYNO_COMCERTO
+	default n
+	prompt "Comcerto ECC-8 syndrome calculation using BCH"
+
+config NAND_COMCERTO_ECC_24_HW_BCH
+	bool
+	depends on MTD_NAND_COMCERTO && SYNO_COMCERTO
+	default y
+	prompt "Comcerto ECC-24 syndrome calculation using BCH"
+
 config MTD_SM_COMMON
 	tristate
 	default n
@@ -543,4 +555,10 @@
 	  Enables support for NAND Flash chips on the ST Microelectronics
 	  Flexible Static Memory Controller (FSMC)
 
+config MTD_NAND_COMCERTO
+	tristate "Comcerto NAND driver"
+	depends on MTD_NAND && (ARCH_COMCERTO) && SYNO_COMCERTO
+	help
+	  Comcerto NAND driver.
+
 endif # MTD_NAND
diff -ur a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile
--- a/drivers/mtd/nand/Makefile	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/mtd/nand/Makefile	2014-01-21 09:37:10.000000000 +0100
@@ -49,6 +49,7 @@
 obj-$(CONFIG_MTD_NAND_RICOH)		+= r852.o
 obj-$(CONFIG_MTD_NAND_JZ4740)		+= jz4740_nand.o
 obj-$(CONFIG_MTD_NAND_GPMI_NAND)	+= gpmi-nand/
+obj-$(CONFIG_MTD_NAND_COMCERTO)		+= comcerto_nand.o
 
 
 nand-objs := nand_base.o nand_bbt.o
diff -ur a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
--- a/drivers/mtd/nand/nand_base.c	2013-08-24 11:37:06.000000000 +0200
+++ b/drivers/mtd/nand/nand_base.c	2014-02-17 11:57:40.000000000 +0100
@@ -3314,6 +3314,11 @@
 		case 128:
 			chip->ecc.layout = &nand_oob_128;
 			break;
+#if defined(CONFIG_SYNO_COMCERTO)
+		case 224:
+			chip->ecc.layout = &nand_oob_128;
+			break;
+#endif
 		default:
 			pr_warn("No oob scheme defined for oobsize %d\n",
 				   mtd->oobsize);
diff -ur a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c
--- a/drivers/mtd/nand/nandsim.c	2013-08-24 11:37:06.000000000 +0200
+++ b/drivers/mtd/nand/nandsim.c	2014-02-17 11:57:40.000000000 +0100
@@ -1402,10 +1402,7 @@
 	unsigned int page_no = ns->regs.row;
 
 	if (read_error(page_no)) {
-		int i;
-		memset(ns->buf.byte, 0xFF, num);
-		for (i = 0; i < num; ++i)
-			ns->buf.byte[i] = random32();
+		prandom_bytes(ns->buf.byte, num);
 		NS_WARN("simulating read error in page %u\n", page_no);
 		return 1;
 	}
Only in b/drivers/mtd/ubi: attach.c.
diff -ur a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c
--- a/drivers/mtd/ubi/build.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/build.c	2014-02-17 11:57:40.000000000 +0100
@@ -27,10 +27,6 @@
  * module load parameters or the kernel boot parameters. If MTD devices were
  * specified, UBI does not attach any MTD device, but it is possible to do
  * later using the "UBI control device".
- *
- * At the moment we only attach UBI devices by scanning, which will become a
- * bottleneck when flashes reach certain large size. Then one may improve UBI
- * and add other methods, although it does not seem to be easy to do.
  */
 
 #include <linux/err.h>
@@ -40,6 +36,7 @@
 #include <linux/namei.h>
 #include <linux/stat.h>
 #include <linux/miscdevice.h>
+#include <linux/mtd/partitions.h>
 #include <linux/log2.h>
 #include <linux/kthread.h>
 #include <linux/kernel.h>
@@ -49,6 +46,12 @@
 /* Maximum length of the 'mtd=' parameter */
 #define MTD_PARAM_LEN_MAX 64
 
+/* Maximum number of comma-separated items in the 'mtd=' parameter */
+#define MTD_PARAM_MAX_COUNT 4
+
+/* Maximum value for the number of bad PEBs per 1024 PEBs */
+#define MAX_MTD_UBI_BEB_LIMIT 768
+
 #ifdef CONFIG_MTD_UBI_MODULE
 #define ubi_is_module() 1
 #else
@@ -60,10 +63,13 @@
  * @name: MTD character device node path, MTD device name, or MTD device number
  *        string
  * @vid_hdr_offs: VID header offset
+ * @max_beb_per1024: maximum expected number of bad PEBs per 1024 PEBs
  */
 struct mtd_dev_param {
 	char name[MTD_PARAM_LEN_MAX];
+	int ubi_num;
 	int vid_hdr_offs;
+	int max_beb_per1024;
 };
 
 /* Numbers of elements set in the @mtd_dev_param array */
@@ -71,7 +77,10 @@
 
 /* MTD devices specification parameters */
 static struct mtd_dev_param __initdata mtd_dev_param[UBI_MAX_DEVICES];
-
+#ifdef CONFIG_MTD_UBI_FASTMAP
+/* UBI module parameter to enable fastmap automatically on non-fastmap images */
+static bool fm_autoconvert;
+#endif
 /* Root UBI "class" object (corresponds to '/<sysfs>/class/ubi/') */
 struct class *ubi_class;
 
@@ -148,6 +157,19 @@
 
 	ubi_do_get_device_info(ubi, &nt.di);
 	ubi_do_get_volume_info(ubi, vol, &nt.vi);
+
+#ifdef CONFIG_MTD_UBI_FASTMAP
+	switch (ntype) {
+	case UBI_VOLUME_ADDED:
+	case UBI_VOLUME_REMOVED:
+	case UBI_VOLUME_RESIZED:
+	case UBI_VOLUME_RENAMED:
+		if (ubi_update_fastmap(ubi)) {
+			ubi_err("Unable to update fastmap!");
+			ubi_ro_mode(ubi);
+		}
+	}
+#endif
 	return blocking_notifier_call_chain(&ubi_notifiers, ntype, &nt);
 }
 
@@ -554,10 +576,10 @@
 }
 
 /**
- * free_internal_volumes - free internal volumes.
+ * ubi_free_internal_volumes - free internal volumes.
  * @ubi: UBI device description object
  */
-static void free_internal_volumes(struct ubi_device *ubi)
+void ubi_free_internal_volumes(struct ubi_device *ubi)
 {
 	int i;
 
@@ -568,62 +590,38 @@
 	}
 }
 
-/**
- * attach_by_scanning - attach an MTD device using scanning method.
- * @ubi: UBI device descriptor
- *
- * This function returns zero in case of success and a negative error code in
- * case of failure.
- *
- * Note, currently this is the only method to attach UBI devices. Hopefully in
- * the future we'll have more scalable attaching methods and avoid full media
- * scanning. But even in this case scanning will be needed as a fall-back
- * attaching method if there are some on-flash table corruptions.
- */
-static int attach_by_scanning(struct ubi_device *ubi)
+static int get_bad_peb_limit(const struct ubi_device *ubi, int max_beb_per1024)
 {
-	int err;
-	struct ubi_scan_info *si;
-
-	si = ubi_scan(ubi);
-	if (IS_ERR(si))
-		return PTR_ERR(si);
-
-	ubi->bad_peb_count = si->bad_peb_count;
-	ubi->good_peb_count = ubi->peb_count - ubi->bad_peb_count;
-	ubi->corr_peb_count = si->corr_peb_count;
-	ubi->max_ec = si->max_ec;
-	ubi->mean_ec = si->mean_ec;
-	ubi_msg("max. sequence number:       %llu", si->max_sqnum);
+	int limit, device_pebs;
+	uint64_t device_size;
 
-	err = ubi_read_volume_table(ubi, si);
-	if (err)
-		goto out_si;
-
-	err = ubi_wl_init_scan(ubi, si);
-	if (err)
-		goto out_vtbl;
-
-	err = ubi_eba_init_scan(ubi, si);
-	if (err)
-		goto out_wl;
+	if (!max_beb_per1024)
+		return 0;
 
-	ubi_scan_destroy_si(si);
-	return 0;
+	/*
+	 * Here we are using size of the entire flash chip and
+	 * not just the MTD partition size because the maximum
+	 * number of bad eraseblocks is a percentage of the
+	 * whole device and bad eraseblocks are not fairly
+	 * distributed over the flash chip. So the worst case
+	 * is that all the bad eraseblocks of the chip are in
+	 * the MTD partition we are attaching (ubi->mtd).
+	 */
+	device_size = mtd_get_device_size(ubi->mtd);
+	device_pebs = mtd_div_by_eb(device_size, ubi->mtd);
+	limit = mult_frac(device_pebs, max_beb_per1024, 1024);
+
+	/* Round it up */
+	if (mult_frac(limit, 1024, max_beb_per1024) < device_pebs)
+		limit += 1;
 
-out_wl:
-	ubi_wl_close(ubi);
-out_vtbl:
-	free_internal_volumes(ubi);
-	vfree(ubi->vtbl);
-out_si:
-	ubi_scan_destroy_si(si);
-	return err;
+	return limit;
 }
 
 /**
  * io_init - initialize I/O sub-system for a given UBI device.
  * @ubi: UBI device description object
+ * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs
  *
  * If @ubi->vid_hdr_offset or @ubi->leb_start is zero, default offsets are
  * assumed:
@@ -636,8 +634,11 @@
  * This function returns zero in case of success and a negative error code in
  * case of failure.
  */
-static int io_init(struct ubi_device *ubi)
+static int io_init(struct ubi_device *ubi, int max_beb_per1024)
 {
+	dbg_gen("sizeof(struct ubi_ainf_peb) %zu", sizeof(struct ubi_ainf_peb));
+	dbg_gen("sizeof(struct ubi_wl_entry) %zu", sizeof(struct ubi_wl_entry));
+
 	if (ubi->mtd->numeraseregions != 0) {
 		/*
 		 * Some flashes have several erase regions. Different regions
@@ -664,8 +665,10 @@
 	ubi->peb_count  = mtd_div_by_eb(ubi->mtd->size, ubi->mtd);
 	ubi->flash_size = ubi->mtd->size;
 
-	if (ubi->mtd->block_isbad && ubi->mtd->block_markbad)
+	if (ubi->mtd->block_isbad && ubi->mtd->block_markbad) {
 		ubi->bad_allowed = 1;
+		ubi->bad_peb_limit = get_bad_peb_limit(ubi, max_beb_per1024);
+	}
 
 	if (ubi->mtd->type == MTD_NORFLASH) {
 		ubi_assert(ubi->mtd->writesize == 1);
@@ -707,11 +710,11 @@
 	ubi->ec_hdr_alsize = ALIGN(UBI_EC_HDR_SIZE, ubi->hdrs_min_io_size);
 	ubi->vid_hdr_alsize = ALIGN(UBI_VID_HDR_SIZE, ubi->hdrs_min_io_size);
 
-	dbg_msg("min_io_size      %d", ubi->min_io_size);
-	dbg_msg("max_write_size   %d", ubi->max_write_size);
-	dbg_msg("hdrs_min_io_size %d", ubi->hdrs_min_io_size);
-	dbg_msg("ec_hdr_alsize    %d", ubi->ec_hdr_alsize);
-	dbg_msg("vid_hdr_alsize   %d", ubi->vid_hdr_alsize);
+	dbg_gen("min_io_size      %d", ubi->min_io_size);
+	dbg_gen("max_write_size   %d", ubi->max_write_size);
+	dbg_gen("hdrs_min_io_size %d", ubi->hdrs_min_io_size);
+	dbg_gen("ec_hdr_alsize    %d", ubi->ec_hdr_alsize);
+	dbg_gen("vid_hdr_alsize   %d", ubi->vid_hdr_alsize);
 
 	if (ubi->vid_hdr_offset == 0)
 		/* Default offset */
@@ -728,10 +731,10 @@
 	ubi->leb_start = ubi->vid_hdr_offset + UBI_VID_HDR_SIZE;
 	ubi->leb_start = ALIGN(ubi->leb_start, ubi->min_io_size);
 
-	dbg_msg("vid_hdr_offset   %d", ubi->vid_hdr_offset);
-	dbg_msg("vid_hdr_aloffset %d", ubi->vid_hdr_aloffset);
-	dbg_msg("vid_hdr_shift    %d", ubi->vid_hdr_shift);
-	dbg_msg("leb_start        %d", ubi->leb_start);
+	dbg_gen("vid_hdr_offset   %d", ubi->vid_hdr_offset);
+	dbg_gen("vid_hdr_aloffset %d", ubi->vid_hdr_aloffset);
+	dbg_gen("vid_hdr_shift    %d", ubi->vid_hdr_shift);
+	dbg_gen("leb_start        %d", ubi->leb_start);
 
 	/* The shift must be aligned to 32-bit boundary */
 	if (ubi->vid_hdr_shift % 4) {
@@ -757,7 +760,7 @@
 	ubi->max_erroneous = ubi->peb_count / 10;
 	if (ubi->max_erroneous < 16)
 		ubi->max_erroneous = 16;
-	dbg_msg("max_erroneous    %d", ubi->max_erroneous);
+	dbg_gen("max_erroneous    %d", ubi->max_erroneous);
 
 	/*
 	 * It may happen that EC and VID headers are situated in one minimal
@@ -765,36 +768,24 @@
 	 * read-only mode.
 	 */
 	if (ubi->vid_hdr_offset + UBI_VID_HDR_SIZE <= ubi->hdrs_min_io_size) {
-		ubi_warn("EC and VID headers are in the same minimal I/O unit, "
-			 "switch to read-only mode");
+		ubi_warn("EC and VID headers are in the same minimal I/O unit, switch to read-only mode");
 		ubi->ro_mode = 1;
 	}
 
 	ubi->leb_size = ubi->peb_size - ubi->leb_start;
 
 	if (!(ubi->mtd->flags & MTD_WRITEABLE)) {
-		ubi_msg("MTD device %d is write-protected, attach in "
-			"read-only mode", ubi->mtd->index);
+		ubi_msg("MTD device %d is write-protected, attach in read-only mode",
+			ubi->mtd->index);
 		ubi->ro_mode = 1;
 	}
 
-	ubi_msg("physical eraseblock size:   %d bytes (%d KiB)",
-		ubi->peb_size, ubi->peb_size >> 10);
-	ubi_msg("logical eraseblock size:    %d bytes", ubi->leb_size);
-	ubi_msg("smallest flash I/O unit:    %d", ubi->min_io_size);
-	if (ubi->hdrs_min_io_size != ubi->min_io_size)
-		ubi_msg("sub-page size:              %d",
-			ubi->hdrs_min_io_size);
-	ubi_msg("VID header offset:          %d (aligned %d)",
-		ubi->vid_hdr_offset, ubi->vid_hdr_aloffset);
-	ubi_msg("data offset:                %d", ubi->leb_start);
-
 	/*
-	 * Note, ideally, we have to initialize ubi->bad_peb_count here. But
+	 * Note, ideally, we have to initialize @ubi->bad_peb_count here. But
 	 * unfortunately, MTD does not provide this information. We should loop
 	 * over all physical eraseblocks and invoke mtd->block_is_bad() for
-	 * each physical eraseblock. So, we skip ubi->bad_peb_count
-	 * uninitialized and initialize it after scanning.
+	 * each physical eraseblock. So, we leave @ubi->bad_peb_count
+	 * uninitialized so far.
 	 */
 
 	return 0;
@@ -805,7 +796,7 @@
  * @ubi: UBI device description object
  * @vol_id: ID of the volume to re-size
  *
- * This function re-sizes the volume marked by the @UBI_VTBL_AUTORESIZE_FLG in
+ * This function re-sizes the volume marked by the %UBI_VTBL_AUTORESIZE_FLG in
  * the volume table to the largest possible size. See comments in ubi-header.h
  * for more description of the flag. Returns zero in case of success and a
  * negative error code in case of failure.
@@ -835,8 +826,7 @@
 		 * No available PEBs to re-size the volume, clear the flag on
 		 * flash and exit.
 		 */
-		memcpy(&vtbl_rec, &ubi->vtbl[vol_id],
-		       sizeof(struct ubi_vtbl_record));
+		vtbl_rec = ubi->vtbl[vol_id];
 		err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
 		if (err)
 			ubi_err("cannot clean auto-resize flag for volume %d",
@@ -862,6 +852,7 @@
  * @mtd: MTD device description object
  * @ubi_num: number to assign to the new UBI device
  * @vid_hdr_offset: VID header offset
+ * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs
  *
  * This function attaches MTD device @mtd_dev to UBI and assign @ubi_num number
  * to the newly created UBI device, unless @ubi_num is %UBI_DEV_NUM_AUTO, in
@@ -872,11 +863,18 @@
  * Note, the invocations of this function has to be serialized by the
  * @ubi_devices_mutex.
  */
-int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset)
+int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num,
+		       int vid_hdr_offset, int max_beb_per1024)
 {
 	struct ubi_device *ubi;
 	int i, err, ref = 0;
 
+	if (max_beb_per1024 < 0 || max_beb_per1024 > MAX_MTD_UBI_BEB_LIMIT)
+		return -EINVAL;
+
+	if (!max_beb_per1024)
+		max_beb_per1024 = CONFIG_MTD_UBI_BEB_LIMIT;
+
 	/*
 	 * Check if we already have the same MTD device attached.
 	 *
@@ -886,7 +884,7 @@
 	for (i = 0; i < UBI_MAX_DEVICES; i++) {
 		ubi = ubi_devices[i];
 		if (ubi && mtd->index == ubi->mtd->index) {
-			dbg_err("mtd%d is already attached to ubi%d",
+			ubi_err("mtd%d is already attached to ubi%d",
 				mtd->index, i);
 			return -EEXIST;
 		}
@@ -901,8 +899,8 @@
 	 * no sense to attach emulated MTD devices, so we prohibit this.
 	 */
 	if (mtd->type == MTD_UBIVOLUME) {
-		ubi_err("refuse attaching mtd%d - it is already emulated on "
-			"top of UBI", mtd->index);
+		ubi_err("refuse attaching mtd%d - it is already emulated on top of UBI",
+			mtd->index);
 		return -EINVAL;
 	}
 
@@ -912,7 +910,7 @@
 			if (!ubi_devices[ubi_num])
 				break;
 		if (ubi_num == UBI_MAX_DEVICES) {
-			dbg_err("only %d UBI devices may be created",
+			ubi_err("only %d UBI devices may be created",
 				UBI_MAX_DEVICES);
 			return -ENFILE;
 		}
@@ -922,7 +920,7 @@
 
 		/* Make sure ubi_num is not busy */
 		if (ubi_devices[ubi_num]) {
-			dbg_err("ubi%d already exists", ubi_num);
+			ubi_err("ubi%d already exists", ubi_num);
 			return -EEXIST;
 		}
 	}
@@ -936,36 +934,62 @@
 	ubi->vid_hdr_offset = vid_hdr_offset;
 	ubi->autoresize_vol_id = -1;
 
+#ifdef CONFIG_MTD_UBI_FASTMAP
+	ubi->fm_pool.used = ubi->fm_pool.size = 0;
+	ubi->fm_wl_pool.used = ubi->fm_wl_pool.size = 0;
+
+	/*
+	 * fm_pool.max_size is 5% of the total number of PEBs but it's also
+	 * between UBI_FM_MAX_POOL_SIZE and UBI_FM_MIN_POOL_SIZE.
+	 */
+	ubi->fm_pool.max_size = min(((int)mtd_div_by_eb(ubi->mtd->size,
+		ubi->mtd) / 100) * 5, UBI_FM_MAX_POOL_SIZE);
+	if (ubi->fm_pool.max_size < UBI_FM_MIN_POOL_SIZE)
+		ubi->fm_pool.max_size = UBI_FM_MIN_POOL_SIZE;
+
+	ubi->fm_wl_pool.max_size = UBI_FM_WL_POOL_SIZE;
+	ubi->fm_disabled = !fm_autoconvert;
+
+	if (!ubi->fm_disabled && (int)mtd_div_by_eb(ubi->mtd->size, ubi->mtd)
+	    <= UBI_FM_MAX_START) {
+		ubi_err("More than %i PEBs are needed for fastmap, sorry.",
+			UBI_FM_MAX_START);
+		ubi->fm_disabled = 1;
+	}
+
+	ubi_msg("default fastmap pool size: %d", ubi->fm_pool.max_size);
+	ubi_msg("default fastmap WL pool size: %d", ubi->fm_wl_pool.max_size);
+#else
+	ubi->fm_disabled = 1;
+#endif
 	mutex_init(&ubi->buf_mutex);
 	mutex_init(&ubi->ckvol_mutex);
 	mutex_init(&ubi->device_mutex);
 	spin_lock_init(&ubi->volumes_lock);
+	mutex_init(&ubi->fm_mutex);
+	init_rwsem(&ubi->fm_sem);
 
 	ubi_msg("attaching mtd%d to ubi%d", mtd->index, ubi_num);
-	dbg_msg("sizeof(struct ubi_scan_leb) %zu", sizeof(struct ubi_scan_leb));
-	dbg_msg("sizeof(struct ubi_wl_entry) %zu", sizeof(struct ubi_wl_entry));
 
-	err = io_init(ubi);
+	err = io_init(ubi, max_beb_per1024);
 	if (err)
 		goto out_free;
 
 	err = -ENOMEM;
-	ubi->peb_buf1 = vmalloc(ubi->peb_size);
-	if (!ubi->peb_buf1)
+	ubi->peb_buf = vmalloc(ubi->peb_size);
+	if (!ubi->peb_buf)
 		goto out_free;
 
-	ubi->peb_buf2 = vmalloc(ubi->peb_size);
-	if (!ubi->peb_buf2)
+#ifdef CONFIG_MTD_UBI_FASTMAP
+	ubi->fm_size = ubi_calc_fm_size(ubi);
+	ubi->fm_buf = vzalloc(ubi->fm_size);
+	if (!ubi->fm_buf)
 		goto out_free;
-
-	err = ubi_debugging_init_dev(ubi);
-	if (err)
-		goto out_free;
-
-	err = attach_by_scanning(ubi);
+#endif
+	err = ubi_attach(ubi, 0);
 	if (err) {
-		dbg_err("failed to attach by scanning, error %d", err);
-		goto out_debugging;
+		ubi_err("failed to attach mtd%d, error %d", mtd->index, err);
+		goto out_free;
 	}
 
 	if (ubi->autoresize_vol_id != -1) {
@@ -990,23 +1014,24 @@
 		goto out_debugfs;
 	}
 
-	ubi_msg("attached mtd%d to ubi%d", mtd->index, ubi_num);
-	ubi_msg("MTD device name:            \"%s\"", mtd->name);
-	ubi_msg("MTD device size:            %llu MiB", ubi->flash_size >> 20);
-	ubi_msg("number of good PEBs:        %d", ubi->good_peb_count);
-	ubi_msg("number of bad PEBs:         %d", ubi->bad_peb_count);
-	ubi_msg("number of corrupted PEBs:   %d", ubi->corr_peb_count);
-	ubi_msg("max. allowed volumes:       %d", ubi->vtbl_slots);
-	ubi_msg("wear-leveling threshold:    %d", CONFIG_MTD_UBI_WL_THRESHOLD);
-	ubi_msg("number of internal volumes: %d", UBI_INT_VOL_COUNT);
-	ubi_msg("number of user volumes:     %d",
-		ubi->vol_count - UBI_INT_VOL_COUNT);
-	ubi_msg("available PEBs:             %d", ubi->avail_pebs);
-	ubi_msg("total number of reserved PEBs: %d", ubi->rsvd_pebs);
-	ubi_msg("number of PEBs reserved for bad PEB handling: %d",
-		ubi->beb_rsvd_pebs);
-	ubi_msg("max/mean erase counter: %d/%d", ubi->max_ec, ubi->mean_ec);
-	ubi_msg("image sequence number:  %d", ubi->image_seq);
+	ubi_msg("attached mtd%d (name \"%s\", size %llu MiB) to ubi%d",
+		mtd->index, mtd->name, ubi->flash_size >> 20, ubi_num);
+	ubi_msg("PEB size: %d bytes (%d KiB), LEB size: %d bytes",
+		ubi->peb_size, ubi->peb_size >> 10, ubi->leb_size);
+	ubi_msg("min./max. I/O unit sizes: %d/%d, sub-page size %d",
+		ubi->min_io_size, ubi->max_write_size, ubi->hdrs_min_io_size);
+	ubi_msg("VID header offset: %d (aligned %d), data offset: %d",
+		ubi->vid_hdr_offset, ubi->vid_hdr_aloffset, ubi->leb_start);
+	ubi_msg("good PEBs: %d, bad PEBs: %d, corrupted PEBs: %d",
+		ubi->good_peb_count, ubi->bad_peb_count, ubi->corr_peb_count);
+	ubi_msg("user volumes: %d, internal volumes: %d, max. volumes count: %d",
+		ubi->vol_count - UBI_INT_VOL_COUNT, UBI_INT_VOL_COUNT,
+		ubi->vtbl_slots);
+	ubi_msg("max/mean erase counter: %d/%d, WL threshold: %d, image sequence number: %u",
+		ubi->max_ec, ubi->mean_ec, CONFIG_MTD_UBI_WL_THRESHOLD,
+		ubi->image_seq);
+	ubi_msg("available PEBs: %d, total reserved PEBs: %d, PEBs reserved for bad PEB handling: %d",
+		ubi->avail_pebs, ubi->rsvd_pebs, ubi->beb_rsvd_pebs);
 
 	/*
 	 * The below lock makes sure we do not race with 'ubi_thread()' which
@@ -1029,13 +1054,11 @@
 	uif_close(ubi);
 out_detach:
 	ubi_wl_close(ubi);
-	free_internal_volumes(ubi);
+	ubi_free_internal_volumes(ubi);
 	vfree(ubi->vtbl);
-out_debugging:
-	ubi_debugging_exit_dev(ubi);
 out_free:
-	vfree(ubi->peb_buf1);
-	vfree(ubi->peb_buf2);
+	vfree(ubi->peb_buf);
+	vfree(ubi->fm_buf);
 	if (ref)
 		put_device(&ubi->dev);
 	else
@@ -1084,8 +1107,12 @@
 
 	ubi_assert(ubi_num == ubi->ubi_num);
 	ubi_notify_all(ubi, UBI_VOLUME_REMOVED, NULL);
-	dbg_msg("detaching mtd%d from ubi%d", ubi->mtd->index, ubi_num);
-
+	ubi_msg("detaching mtd%d from ubi%d", ubi->mtd->index, ubi_num);
+#ifdef CONFIG_MTD_UBI_FASTMAP
+	/* If we don't write a new fastmap at detach time we lose all
+	 * EC updates that have been made since the last written fastmap. */
+	ubi_update_fastmap(ubi);
+#endif
 	/*
 	 * Before freeing anything, we have to stop the background thread to
 	 * prevent it from doing anything on this device while we are freeing.
@@ -1101,13 +1128,13 @@
 
 	ubi_debugfs_exit_dev(ubi);
 	uif_close(ubi);
+
 	ubi_wl_close(ubi);
-	free_internal_volumes(ubi);
+	ubi_free_internal_volumes(ubi);
 	vfree(ubi->vtbl);
 	put_mtd_device(ubi->mtd);
-	ubi_debugging_exit_dev(ubi);
-	vfree(ubi->peb_buf1);
-	vfree(ubi->peb_buf2);
+	vfree(ubi->peb_buf);
+	vfree(ubi->fm_buf);
 	ubi_msg("mtd%d is detached from ubi%d", ubi->mtd->index, ubi->ubi_num);
 	put_device(&ubi->dev);
 	return 0;
@@ -1235,12 +1262,16 @@
 		mtd = open_mtd_device(p->name);
 		if (IS_ERR(mtd)) {
 			err = PTR_ERR(mtd);
-			goto out_detach;
+			ubi_err("cannot open mtd %s, error %d", p->name, err);
+			/* See the comment below regarding ubi_is_module(). */
+			if (ubi_is_module())
+				goto out_detach;
+			continue;
 		}
 
 		mutex_lock(&ubi_devices_mutex);
-		err = ubi_attach_mtd_dev(mtd, UBI_DEV_NUM_AUTO,
-					 p->vid_hdr_offs);
+		err = ubi_attach_mtd_dev(mtd, p->ubi_num,
+					 p->vid_hdr_offs, p->max_beb_per1024);
 		mutex_unlock(&ubi_devices_mutex);
 		if (err < 0) {
 			ubi_err("cannot attach mtd%d", mtd->index);
@@ -1283,10 +1314,10 @@
 out_class:
 	class_destroy(ubi_class);
 out:
-	ubi_err("UBI error: cannot initialize UBI, error %d", err);
+	ubi_err("cannot initialize UBI, error %d", err);
 	return err;
 }
-module_init(ubi_init);
+late_initcall(ubi_init);
 
 static void __exit ubi_exit(void)
 {
@@ -1320,8 +1351,7 @@
 
 	result = simple_strtoul(str, &endp, 0);
 	if (str == endp || result >= INT_MAX) {
-		printk(KERN_ERR "UBI error: incorrect bytes count: \"%s\"\n",
-		       str);
+		ubi_err("incorrect bytes count: \"%s\"\n", str);
 		return -EINVAL;
 	}
 
@@ -1337,8 +1367,7 @@
 	case '\0':
 		break;
 	default:
-		printk(KERN_ERR "UBI error: incorrect bytes count: \"%s\"\n",
-		       str);
+		ubi_err("incorrect bytes count: \"%s\"\n", str);
 		return -EINVAL;
 	}
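The hunks above only rewrite the error messages of bytes_str_to_int(); the suffix handling itself lies outside the diff context. Assuming the usual UBI convention of optional K/M/G (or KiB/MiB/GiB) suffixes, a userspace sketch of such a parser could look as follows; it is illustrative, not copied from the file.

/* Editor's sketch: parse "1984", "512KiB", "4M", ... into a byte count. */
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static long long bytes_str_to_int(const char *str)
{
	char *endp;
	unsigned long long result;

	result = strtoull(str, &endp, 0);
	if (str == endp || result >= INT_MAX)
		return -1;		/* "incorrect bytes count" */

	switch (*endp) {
	case 'G':
		result *= 1024;
		/* fall through */
	case 'M':
		result *= 1024;
		/* fall through */
	case 'K':
		result *= 1024;
		if (endp[1] == 'i' && endp[2] == 'B')
			endp += 2;
	case '\0':
		break;
	default:
		return -1;
	}

	return result;
}

int main(void)
{
	printf("%lld\n", bytes_str_to_int("2MiB"));	/* prints 2097152 */
	return 0;
}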
 
@@ -1359,27 +1388,26 @@
 	struct mtd_dev_param *p;
 	char buf[MTD_PARAM_LEN_MAX];
 	char *pbuf = &buf[0];
-	char *tokens[2] = {NULL, NULL};
+	char *tokens[MTD_PARAM_MAX_COUNT], *token;
 
 	if (!val)
 		return -EINVAL;
 
 	if (mtd_devs == UBI_MAX_DEVICES) {
-		printk(KERN_ERR "UBI error: too many parameters, max. is %d\n",
-		       UBI_MAX_DEVICES);
+		ubi_err("too many parameters, max. is %d\n",
+			UBI_MAX_DEVICES);
 		return -EINVAL;
 	}
 
 	len = strnlen(val, MTD_PARAM_LEN_MAX);
 	if (len == MTD_PARAM_LEN_MAX) {
-		printk(KERN_ERR "UBI error: parameter \"%s\" is too long, "
-		       "max. is %d\n", val, MTD_PARAM_LEN_MAX);
+		ubi_err("parameter \"%s\" is too long, max. is %d\n",
+			val, MTD_PARAM_LEN_MAX);
 		return -EINVAL;
 	}
 
 	if (len == 0) {
-		printk(KERN_WARNING "UBI warning: empty 'mtd=' parameter - "
-		       "ignored\n");
+		pr_warn("UBI warning: empty 'mtd=' parameter - ignored\n");
 		return 0;
 	}
 
@@ -1389,42 +1417,69 @@
 	if (buf[len - 1] == '\n')
 		buf[len - 1] = '\0';
 
-	for (i = 0; i < 2; i++)
+	for (i = 0; i < MTD_PARAM_MAX_COUNT; i++)
 		tokens[i] = strsep(&pbuf, ",");
 
 	if (pbuf) {
-		printk(KERN_ERR "UBI error: too many arguments at \"%s\"\n",
-		       val);
+		ubi_err("too many arguments at \"%s\"\n", val);
 		return -EINVAL;
 	}
 
 	p = &mtd_dev_param[mtd_devs];
 	strcpy(&p->name[0], tokens[0]);
 
-	if (tokens[1])
-		p->vid_hdr_offs = bytes_str_to_int(tokens[1]);
+	token = tokens[1];
+	if (token) {
+		p->vid_hdr_offs = bytes_str_to_int(token);
 
-	if (p->vid_hdr_offs < 0)
-		return p->vid_hdr_offs;
+		if (p->vid_hdr_offs < 0)
+			return p->vid_hdr_offs;
+	}
+
+	token = tokens[2];
+	if (token) {
+		int err = kstrtoint(token, 10, &p->max_beb_per1024);
+
+		if (err) {
+			ubi_err("bad value for max_beb_per1024 parameter: %s",
+				token);
+			return -EINVAL;
+		}
+	}
+
+	token = tokens[3];
+	if (token) {
+		int err = kstrtoint(token, 10, &p->ubi_num);
+
+		if (err) {
+			ubi_err("bad value for ubi_num parameter: %s", token);
+			return -EINVAL;
+		}
+	} else
+		p->ubi_num = UBI_DEV_NUM_AUTO;
 
 	mtd_devs += 1;
 	return 0;
 }
 
 module_param_call(mtd, ubi_mtd_param_parse, NULL, NULL, 000);
-MODULE_PARM_DESC(mtd, "MTD devices to attach. Parameter format: "
-		      "mtd=<name|num|path>[,<vid_hdr_offs>].\n"
+MODULE_PARM_DESC(mtd, "MTD devices to attach. Parameter format: mtd=<name|num|path>[,<vid_hdr_offs>[,max_beb_per1024[,ubi_num]]].\n"
 		      "Multiple \"mtd\" parameters may be specified.\n"
-		      "MTD devices may be specified by their number, name, or "
-		      "path to the MTD character device node.\n"
-		      "Optional \"vid_hdr_offs\" parameter specifies UBI VID "
-		      "header position to be used by UBI.\n"
-		      "Example 1: mtd=/dev/mtd0 - attach MTD device "
-		      "/dev/mtd0.\n"
-		      "Example 2: mtd=content,1984 mtd=4 - attach MTD device "
-		      "with name \"content\" using VID header offset 1984, and "
-		      "MTD device number 4 with default VID header offset.");
-
+		      "MTD devices may be specified by their number, name, or path to the MTD character device node.\n"
+		      "Optional \"vid_hdr_offs\" parameter specifies UBI VID header position to be used by UBI. (the default offset is used if 0)\n"
+		      "Optional \"max_beb_per1024\" parameter specifies the maximum expected number of bad eraseblocks per 1024 eraseblocks. (the default value ("
+		      __stringify(CONFIG_MTD_UBI_BEB_LIMIT) ") is used if 0)\n"
+		      "Optional \"ubi_num\" parameter specifies the UBI device number which has to be assigned to the newly created UBI device (assigned automatically by default).\n"
+		      "\n"
+		      "Example 1: mtd=/dev/mtd0 - attach MTD device /dev/mtd0.\n"
+		      "Example 2: mtd=content,1984 mtd=4 - attach MTD device with name \"content\" using VID header offset 1984, and MTD device number 4 with default VID header offset.\n"
+		      "Example 3: mtd=/dev/mtd1,0,25 - attach MTD device /dev/mtd1 using default VID header offset and reserve 25*nand_size_in_blocks/1024 erase blocks for bad block handling\n"
+		      "\t(e.g. if the NAND *chipset* has 4096 PEBs, 100 will be reserved for this UBI device).\n"
+		      "Example 4: mtd=/dev/mtd1,0,0,5 - attach MTD device /dev/mtd1 to UBI 5, using default values for the other fields.");
+#ifdef CONFIG_MTD_UBI_FASTMAP
+module_param(fm_autoconvert, bool, 0644);
+MODULE_PARM_DESC(fm_autoconvert, "Set this parameter to enable fastmap automatically on images without a fastmap.");
+#endif
 MODULE_VERSION(__stringify(UBI_VERSION));
 MODULE_DESCRIPTION("UBI - Unsorted Block Images");
 MODULE_AUTHOR("Artem Bityutskiy");
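The extended mtd= parameter now carries up to four comma-separated fields: name, vid_hdr_offs, max_beb_per1024 and ubi_num. A small userspace illustration of the tokenisation and of the reservation arithmetic mentioned in the help text (4096 PEBs at 25 per 1024 gives 100 reserved blocks); the example value is hypothetical.

/* Editor's sketch of splitting an "mtd=" value the way the strsep() loop
 * in ubi_mtd_param_parse() does (userspace stand-in). */
#include <stdio.h>
#include <string.h>

#define MTD_PARAM_MAX_COUNT 4	/* name, vid_hdr_offs, max_beb_per1024, ubi_num */

int main(void)
{
	char buf[] = "/dev/mtd1,0,25,5";	/* hypothetical parameter value */
	char *pbuf = buf, *tokens[MTD_PARAM_MAX_COUNT] = { NULL };
	int i;

	for (i = 0; i < MTD_PARAM_MAX_COUNT; i++)
		tokens[i] = strsep(&pbuf, ",");

	printf("name=%s vid_hdr_offs=%s max_beb_per1024=%s ubi_num=%s\n",
	       tokens[0], tokens[1], tokens[2], tokens[3]);

	/* reservation arithmetic from the help text: 4096 * 25 / 1024 = 100 */
	printf("reserved for bad PEB handling: %d\n", 4096 * 25 / 1024);
	return 0;
}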
diff -ur a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c
--- a/drivers/mtd/ubi/cdev.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/cdev.c	2014-02-17 11:57:40.000000000 +0100
@@ -63,7 +63,7 @@
 	users = vol->readers + vol->writers + vol->exclusive;
 	ubi_assert(users > 0);
 	if (users > 1) {
-		dbg_err("%d users for volume %d", users, vol->vol_id);
+		ubi_err("%d users for volume %d", users, vol->vol_id);
 		err = -EBUSY;
 	} else {
 		vol->readers = vol->writers = 0;
@@ -140,9 +140,9 @@
 		vol->updating = 0;
 		vfree(vol->upd_buf);
 	} else if (vol->changing_leb) {
-		dbg_gen("only %lld of %lld bytes received for atomic LEB change"
-			" for volume %d:%d, cancel", vol->upd_received,
-			vol->upd_bytes, vol->ubi->ubi_num, vol->vol_id);
+		dbg_gen("only %lld of %lld bytes received for atomic LEB change for volume %d:%d, cancel",
+			vol->upd_received, vol->upd_bytes, vol->ubi->ubi_num,
+			vol->vol_id);
 		vol->changing_leb = 0;
 		vfree(vol->upd_buf);
 	}
@@ -159,7 +159,7 @@
 
 	if (vol->updating) {
 		/* Update is in progress, seeking is prohibited */
-		dbg_err("updating");
+		ubi_err("updating");
 		return -EBUSY;
 	}
 
@@ -178,7 +178,7 @@
 	}
 
 	if (new_offset < 0 || new_offset > vol->used_bytes) {
-		dbg_err("bad seek %lld", new_offset);
+		ubi_err("bad seek %lld", new_offset);
 		return -EINVAL;
 	}
 
@@ -189,7 +189,8 @@
 	return new_offset;
 }
 
-static int vol_cdev_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+static int vol_cdev_fsync(struct file *file, loff_t start, loff_t end,
+			  int datasync)
 {
 	struct ubi_volume_desc *desc = file->private_data;
 	struct ubi_device *ubi = desc->vol->ubi;
@@ -216,11 +217,11 @@
 		count, *offp, vol->vol_id);
 
 	if (vol->updating) {
-		dbg_err("updating");
+		ubi_err("updating");
 		return -EBUSY;
 	}
 	if (vol->upd_marker) {
-		dbg_err("damaged volume, update marker is set");
+		ubi_err("damaged volume, update marker is set");
 		return -EBADF;
 	}
 	if (*offp == vol->used_bytes || count == 0)
@@ -300,7 +301,7 @@
 
 	lnum = div_u64_rem(*offp, vol->usable_leb_size, &off);
 	if (off & (ubi->min_io_size - 1)) {
-		dbg_err("unaligned position");
+		ubi_err("unaligned position");
 		return -EINVAL;
 	}
 
@@ -309,7 +310,7 @@
 
 	/* We can write only in fractions of the minimum I/O unit */
 	if (count & (ubi->min_io_size - 1)) {
-		dbg_err("unaligned write length");
+		ubi_err("unaligned write length");
 		return -EINVAL;
 	}
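The two checks above reject direct writes whose start offset or length is not a multiple of the minimum I/O unit. The mask test relies on min_io_size being a power of two (which it is for NAND pages); a stand-alone sketch:

/* Editor's sketch of the "& (min_io_size - 1)" alignment tests above. */
#include <stdio.h>

static int check_write_alignment(int min_io_size, long long off, long long count)
{
	if (off & (min_io_size - 1))
		return -1;		/* "unaligned position" */
	if (count & (min_io_size - 1))
		return -1;		/* "unaligned write length" */
	return 0;
}

int main(void)
{
	printf("%d\n", check_write_alignment(2048, 4096, 6144));	/* 0 */
	printf("%d\n", check_write_alignment(2048, 4096, 5000));	/* -1 */
	return 0;
}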
 
@@ -334,8 +335,7 @@
 			break;
 		}
 
-		err = ubi_eba_write_leb(ubi, vol, lnum, tbuf, off, len,
-					UBI_UNKNOWN);
+		err = ubi_eba_write_leb(ubi, vol, lnum, tbuf, off, len);
 		if (err)
 			break;
 
@@ -477,9 +477,6 @@
 		if (req.lnum < 0 || req.lnum >= vol->reserved_pebs ||
 		    req.bytes < 0 || req.lnum >= vol->usable_leb_size)
 			break;
-		if (req.dtype != UBI_LONGTERM && req.dtype != UBI_SHORTTERM &&
-		    req.dtype != UBI_UNKNOWN)
-			break;
 
 		err = get_exclusive(desc);
 		if (err < 0)
@@ -518,7 +515,7 @@
 		if (err)
 			break;
 
-		err = ubi_wl_flush(ubi);
+		err = ubi_wl_flush(ubi, UBI_ALL, UBI_ALL);
 		break;
 	}
 
@@ -532,7 +529,7 @@
 			err = -EFAULT;
 			break;
 		}
-		err = ubi_leb_map(desc, req.lnum, req.dtype);
+		err = ubi_leb_map(desc, req.lnum);
 		break;
 	}
 
@@ -647,8 +644,8 @@
 	return 0;
 
 bad:
-	dbg_err("bad volume creation request");
-	ubi_dbg_dump_mkvol_req(req);
+	ubi_err("bad volume creation request");
+	ubi_dump_mkvol_req(req);
 	return err;
 }
 
@@ -713,12 +710,12 @@
 	for (i = 0; i < req->count - 1; i++) {
 		for (n = i + 1; n < req->count; n++) {
 			if (req->ents[i].vol_id == req->ents[n].vol_id) {
-				dbg_err("duplicated volume id %d",
+				ubi_err("duplicated volume id %d",
 					req->ents[i].vol_id);
 				return -EINVAL;
 			}
 			if (!strcmp(req->ents[i].name, req->ents[n].name)) {
-				dbg_err("duplicated volume name \"%s\"",
+				ubi_err("duplicated volume name \"%s\"",
 					req->ents[i].name);
 				return -EINVAL;
 			}
@@ -741,7 +738,7 @@
 		re->desc = ubi_open_volume(ubi->ubi_num, vol_id, UBI_EXCLUSIVE);
 		if (IS_ERR(re->desc)) {
 			err = PTR_ERR(re->desc);
-			dbg_err("cannot open volume %d, error %d", vol_id, err);
+			ubi_err("cannot open volume %d, error %d", vol_id, err);
 			kfree(re);
 			goto out_free;
 		}
@@ -757,7 +754,7 @@
 		re->new_name_len = name_len;
 		memcpy(re->new_name, name, name_len);
 		list_add_tail(&re->list, &rename_list);
-		dbg_msg("will rename volume %d from \"%s\" to \"%s\"",
+		dbg_gen("will rename volume %d from \"%s\" to \"%s\"",
 			vol_id, re->desc->vol->name, name);
 	}
 
@@ -800,7 +797,7 @@
 				continue;
 
 			/* The volume exists but busy, or an error occurred */
-			dbg_err("cannot open volume \"%s\", error %d",
+			ubi_err("cannot open volume \"%s\", error %d",
 				re->new_name, err);
 			goto out_free;
 		}
@@ -815,7 +812,7 @@
 		re1->remove = 1;
 		re1->desc = desc;
 		list_add(&re1->list, &rename_list);
-		dbg_msg("will remove volume %d, name \"%s\"",
+		dbg_gen("will remove volume %d, name \"%s\"",
 			re1->desc->vol->vol_id, re1->desc->vol->name);
 	}
 
@@ -946,7 +943,7 @@
 	{
 		struct ubi_rnvol_req *req;
 
-		dbg_msg("re-name volumes");
+		dbg_gen("re-name volumes");
 		req = kmalloc(sizeof(struct ubi_rnvol_req), GFP_KERNEL);
 		if (!req) {
 			err = -ENOMEM;
@@ -1014,7 +1011,8 @@
 		 * 'ubi_attach_mtd_dev()'.
 		 */
 		mutex_lock(&ubi_devices_mutex);
-		err = ubi_attach_mtd_dev(mtd, req.ubi_num, req.vid_hdr_offset);
+		err = ubi_attach_mtd_dev(mtd, req.ubi_num, req.vid_hdr_offset,
+					 req.max_beb_per1024);
 		mutex_unlock(&ubi_devices_mutex);
 		if (err < 0)
 			put_mtd_device(mtd);
@@ -1030,7 +1028,7 @@
 	{
 		int ubi_num;
 
-		dbg_gen("dettach MTD device");
+		dbg_gen("detach MTD device");
 		err = get_user(ubi_num, (__user int32_t *)argp);
 		if (err) {
 			err = -EFAULT;
diff -ur a/drivers/mtd/ubi/debug.c b/drivers/mtd/ubi/debug.c
--- a/drivers/mtd/ubi/debug.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/debug.c	2014-02-17 11:57:40.000000000 +0100
@@ -18,243 +18,203 @@
  * Author: Artem Bityutskiy (Битюцкий Артём)
  */
 
-/*
- * Here we keep all the UBI debugging stuff which should normally be disabled
- * and compiled-out, but it is extremely helpful when hunting bugs or doing big
- * changes.
- */
-
-#ifdef CONFIG_MTD_UBI_DEBUG
-
 #include "ubi.h"
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
 
+
+/**
+ * ubi_dump_flash - dump a region of flash.
+ * @ubi: UBI device description object
+ * @pnum: the physical eraseblock number to dump
+ * @offset: the starting offset within the physical eraseblock to dump
+ * @len: the length of the region to dump
+ */
+void ubi_dump_flash(struct ubi_device *ubi, int pnum, int offset, int len)
+{
+	int err;
+	size_t read;
+	void *buf;
+	loff_t addr = (loff_t)pnum * ubi->peb_size + offset;
+
+	buf = vmalloc(len);
+	if (!buf)
+		return;
+	err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf);
+	if (err && err != -EUCLEAN) {
+		ubi_err("error %d while reading %d bytes from PEB %d:%d, read %zd bytes",
+			err, len, pnum, offset, read);
+		goto out;
+	}
+
+	ubi_msg("dumping %d bytes of data from PEB %d, offset %d",
+		len, pnum, offset);
+	print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, buf, len, 1);
+out:
+	vfree(buf);
+	return;
+}
+
 /**
- * ubi_dbg_dump_ec_hdr - dump an erase counter header.
+ * ubi_dump_ec_hdr - dump an erase counter header.
  * @ec_hdr: the erase counter header to dump
  */
-void ubi_dbg_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr)
+void ubi_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr)
 {
-	printk(KERN_DEBUG "Erase counter header dump:\n");
-	printk(KERN_DEBUG "\tmagic          %#08x\n",
-	       be32_to_cpu(ec_hdr->magic));
-	printk(KERN_DEBUG "\tversion        %d\n", (int)ec_hdr->version);
-	printk(KERN_DEBUG "\tec             %llu\n",
-	       (long long)be64_to_cpu(ec_hdr->ec));
-	printk(KERN_DEBUG "\tvid_hdr_offset %d\n",
-	       be32_to_cpu(ec_hdr->vid_hdr_offset));
-	printk(KERN_DEBUG "\tdata_offset    %d\n",
-	       be32_to_cpu(ec_hdr->data_offset));
-	printk(KERN_DEBUG "\timage_seq      %d\n",
-	       be32_to_cpu(ec_hdr->image_seq));
-	printk(KERN_DEBUG "\thdr_crc        %#08x\n",
-	       be32_to_cpu(ec_hdr->hdr_crc));
-	printk(KERN_DEBUG "erase counter header hexdump:\n");
+	pr_err("Erase counter header dump:\n");
+	pr_err("\tmagic          %#08x\n", be32_to_cpu(ec_hdr->magic));
+	pr_err("\tversion        %d\n", (int)ec_hdr->version);
+	pr_err("\tec             %llu\n", (long long)be64_to_cpu(ec_hdr->ec));
+	pr_err("\tvid_hdr_offset %d\n", be32_to_cpu(ec_hdr->vid_hdr_offset));
+	pr_err("\tdata_offset    %d\n", be32_to_cpu(ec_hdr->data_offset));
+	pr_err("\timage_seq      %d\n", be32_to_cpu(ec_hdr->image_seq));
+	pr_err("\thdr_crc        %#08x\n", be32_to_cpu(ec_hdr->hdr_crc));
+	pr_err("erase counter header hexdump:\n");
 	print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1,
 		       ec_hdr, UBI_EC_HDR_SIZE, 1);
 }
 
 /**
- * ubi_dbg_dump_vid_hdr - dump a volume identifier header.
+ * ubi_dump_vid_hdr - dump a volume identifier header.
  * @vid_hdr: the volume identifier header to dump
  */
-void ubi_dbg_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr)
+void ubi_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr)
 {
-	printk(KERN_DEBUG "Volume identifier header dump:\n");
-	printk(KERN_DEBUG "\tmagic     %08x\n", be32_to_cpu(vid_hdr->magic));
-	printk(KERN_DEBUG "\tversion   %d\n",  (int)vid_hdr->version);
-	printk(KERN_DEBUG "\tvol_type  %d\n",  (int)vid_hdr->vol_type);
-	printk(KERN_DEBUG "\tcopy_flag %d\n",  (int)vid_hdr->copy_flag);
-	printk(KERN_DEBUG "\tcompat    %d\n",  (int)vid_hdr->compat);
-	printk(KERN_DEBUG "\tvol_id    %d\n",  be32_to_cpu(vid_hdr->vol_id));
-	printk(KERN_DEBUG "\tlnum      %d\n",  be32_to_cpu(vid_hdr->lnum));
-	printk(KERN_DEBUG "\tdata_size %d\n",  be32_to_cpu(vid_hdr->data_size));
-	printk(KERN_DEBUG "\tused_ebs  %d\n",  be32_to_cpu(vid_hdr->used_ebs));
-	printk(KERN_DEBUG "\tdata_pad  %d\n",  be32_to_cpu(vid_hdr->data_pad));
-	printk(KERN_DEBUG "\tsqnum     %llu\n",
+	pr_err("Volume identifier header dump:\n");
+	pr_err("\tmagic     %08x\n", be32_to_cpu(vid_hdr->magic));
+	pr_err("\tversion   %d\n",  (int)vid_hdr->version);
+	pr_err("\tvol_type  %d\n",  (int)vid_hdr->vol_type);
+	pr_err("\tcopy_flag %d\n",  (int)vid_hdr->copy_flag);
+	pr_err("\tcompat    %d\n",  (int)vid_hdr->compat);
+	pr_err("\tvol_id    %d\n",  be32_to_cpu(vid_hdr->vol_id));
+	pr_err("\tlnum      %d\n",  be32_to_cpu(vid_hdr->lnum));
+	pr_err("\tdata_size %d\n",  be32_to_cpu(vid_hdr->data_size));
+	pr_err("\tused_ebs  %d\n",  be32_to_cpu(vid_hdr->used_ebs));
+	pr_err("\tdata_pad  %d\n",  be32_to_cpu(vid_hdr->data_pad));
+	pr_err("\tsqnum     %llu\n",
 		(unsigned long long)be64_to_cpu(vid_hdr->sqnum));
-	printk(KERN_DEBUG "\thdr_crc   %08x\n", be32_to_cpu(vid_hdr->hdr_crc));
-	printk(KERN_DEBUG "Volume identifier header hexdump:\n");
+	pr_err("\thdr_crc   %08x\n", be32_to_cpu(vid_hdr->hdr_crc));
+	pr_err("Volume identifier header hexdump:\n");
 	print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1,
 		       vid_hdr, UBI_VID_HDR_SIZE, 1);
 }
 
 /**
- * ubi_dbg_dump_vol_info- dump volume information.
+ * ubi_dump_vol_info - dump volume information.
  * @vol: UBI volume description object
  */
-void ubi_dbg_dump_vol_info(const struct ubi_volume *vol)
+void ubi_dump_vol_info(const struct ubi_volume *vol)
 {
-	printk(KERN_DEBUG "Volume information dump:\n");
-	printk(KERN_DEBUG "\tvol_id          %d\n", vol->vol_id);
-	printk(KERN_DEBUG "\treserved_pebs   %d\n", vol->reserved_pebs);
-	printk(KERN_DEBUG "\talignment       %d\n", vol->alignment);
-	printk(KERN_DEBUG "\tdata_pad        %d\n", vol->data_pad);
-	printk(KERN_DEBUG "\tvol_type        %d\n", vol->vol_type);
-	printk(KERN_DEBUG "\tname_len        %d\n", vol->name_len);
-	printk(KERN_DEBUG "\tusable_leb_size %d\n", vol->usable_leb_size);
-	printk(KERN_DEBUG "\tused_ebs        %d\n", vol->used_ebs);
-	printk(KERN_DEBUG "\tused_bytes      %lld\n", vol->used_bytes);
-	printk(KERN_DEBUG "\tlast_eb_bytes   %d\n", vol->last_eb_bytes);
-	printk(KERN_DEBUG "\tcorrupted       %d\n", vol->corrupted);
-	printk(KERN_DEBUG "\tupd_marker      %d\n", vol->upd_marker);
+	pr_err("Volume information dump:\n");
+	pr_err("\tvol_id          %d\n", vol->vol_id);
+	pr_err("\treserved_pebs   %d\n", vol->reserved_pebs);
+	pr_err("\talignment       %d\n", vol->alignment);
+	pr_err("\tdata_pad        %d\n", vol->data_pad);
+	pr_err("\tvol_type        %d\n", vol->vol_type);
+	pr_err("\tname_len        %d\n", vol->name_len);
+	pr_err("\tusable_leb_size %d\n", vol->usable_leb_size);
+	pr_err("\tused_ebs        %d\n", vol->used_ebs);
+	pr_err("\tused_bytes      %lld\n", vol->used_bytes);
+	pr_err("\tlast_eb_bytes   %d\n", vol->last_eb_bytes);
+	pr_err("\tcorrupted       %d\n", vol->corrupted);
+	pr_err("\tupd_marker      %d\n", vol->upd_marker);
 
 	if (vol->name_len <= UBI_VOL_NAME_MAX &&
 	    strnlen(vol->name, vol->name_len + 1) == vol->name_len) {
-		printk(KERN_DEBUG "\tname            %s\n", vol->name);
+		pr_err("\tname            %s\n", vol->name);
 	} else {
-		printk(KERN_DEBUG "\t1st 5 characters of name: %c%c%c%c%c\n",
+		pr_err("\t1st 5 characters of name: %c%c%c%c%c\n",
 		       vol->name[0], vol->name[1], vol->name[2],
 		       vol->name[3], vol->name[4]);
 	}
 }
 
 /**
- * ubi_dbg_dump_vtbl_record - dump a &struct ubi_vtbl_record object.
+ * ubi_dump_vtbl_record - dump a &struct ubi_vtbl_record object.
  * @r: the object to dump
  * @idx: volume table index
  */
-void ubi_dbg_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx)
+void ubi_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx)
 {
 	int name_len = be16_to_cpu(r->name_len);
 
-	printk(KERN_DEBUG "Volume table record %d dump:\n", idx);
-	printk(KERN_DEBUG "\treserved_pebs   %d\n",
-	       be32_to_cpu(r->reserved_pebs));
-	printk(KERN_DEBUG "\talignment       %d\n", be32_to_cpu(r->alignment));
-	printk(KERN_DEBUG "\tdata_pad        %d\n", be32_to_cpu(r->data_pad));
-	printk(KERN_DEBUG "\tvol_type        %d\n", (int)r->vol_type);
-	printk(KERN_DEBUG "\tupd_marker      %d\n", (int)r->upd_marker);
-	printk(KERN_DEBUG "\tname_len        %d\n", name_len);
+	pr_err("Volume table record %d dump:\n", idx);
+	pr_err("\treserved_pebs   %d\n", be32_to_cpu(r->reserved_pebs));
+	pr_err("\talignment       %d\n", be32_to_cpu(r->alignment));
+	pr_err("\tdata_pad        %d\n", be32_to_cpu(r->data_pad));
+	pr_err("\tvol_type        %d\n", (int)r->vol_type);
+	pr_err("\tupd_marker      %d\n", (int)r->upd_marker);
+	pr_err("\tname_len        %d\n", name_len);
 
 	if (r->name[0] == '\0') {
-		printk(KERN_DEBUG "\tname            NULL\n");
+		pr_err("\tname            NULL\n");
 		return;
 	}
 
 	if (name_len <= UBI_VOL_NAME_MAX &&
 	    strnlen(&r->name[0], name_len + 1) == name_len) {
-		printk(KERN_DEBUG "\tname            %s\n", &r->name[0]);
+		pr_err("\tname            %s\n", &r->name[0]);
 	} else {
-		printk(KERN_DEBUG "\t1st 5 characters of name: %c%c%c%c%c\n",
+		pr_err("\t1st 5 characters of name: %c%c%c%c%c\n",
 			r->name[0], r->name[1], r->name[2], r->name[3],
 			r->name[4]);
 	}
-	printk(KERN_DEBUG "\tcrc             %#08x\n", be32_to_cpu(r->crc));
+	pr_err("\tcrc             %#08x\n", be32_to_cpu(r->crc));
 }
 
 /**
- * ubi_dbg_dump_sv - dump a &struct ubi_scan_volume object.
- * @sv: the object to dump
+ * ubi_dump_av - dump a &struct ubi_ainf_volume object.
+ * @av: the object to dump
  */
-void ubi_dbg_dump_sv(const struct ubi_scan_volume *sv)
+void ubi_dump_av(const struct ubi_ainf_volume *av)
 {
-	printk(KERN_DEBUG "Volume scanning information dump:\n");
-	printk(KERN_DEBUG "\tvol_id         %d\n", sv->vol_id);
-	printk(KERN_DEBUG "\thighest_lnum   %d\n", sv->highest_lnum);
-	printk(KERN_DEBUG "\tleb_count      %d\n", sv->leb_count);
-	printk(KERN_DEBUG "\tcompat         %d\n", sv->compat);
-	printk(KERN_DEBUG "\tvol_type       %d\n", sv->vol_type);
-	printk(KERN_DEBUG "\tused_ebs       %d\n", sv->used_ebs);
-	printk(KERN_DEBUG "\tlast_data_size %d\n", sv->last_data_size);
-	printk(KERN_DEBUG "\tdata_pad       %d\n", sv->data_pad);
+	pr_err("Volume attaching information dump:\n");
+	pr_err("\tvol_id         %d\n", av->vol_id);
+	pr_err("\thighest_lnum   %d\n", av->highest_lnum);
+	pr_err("\tleb_count      %d\n", av->leb_count);
+	pr_err("\tcompat         %d\n", av->compat);
+	pr_err("\tvol_type       %d\n", av->vol_type);
+	pr_err("\tused_ebs       %d\n", av->used_ebs);
+	pr_err("\tlast_data_size %d\n", av->last_data_size);
+	pr_err("\tdata_pad       %d\n", av->data_pad);
 }
 
 /**
- * ubi_dbg_dump_seb - dump a &struct ubi_scan_leb object.
- * @seb: the object to dump
+ * ubi_dump_aeb - dump a &struct ubi_ainf_peb object.
+ * @aeb: the object to dump
  * @type: object type: 0 - not corrupted, 1 - corrupted
  */
-void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb, int type)
+void ubi_dump_aeb(const struct ubi_ainf_peb *aeb, int type)
 {
-	printk(KERN_DEBUG "eraseblock scanning information dump:\n");
-	printk(KERN_DEBUG "\tec       %d\n", seb->ec);
-	printk(KERN_DEBUG "\tpnum     %d\n", seb->pnum);
+	pr_err("eraseblock attaching information dump:\n");
+	pr_err("\tec       %d\n", aeb->ec);
+	pr_err("\tpnum     %d\n", aeb->pnum);
 	if (type == 0) {
-		printk(KERN_DEBUG "\tlnum     %d\n", seb->lnum);
-		printk(KERN_DEBUG "\tscrub    %d\n", seb->scrub);
-		printk(KERN_DEBUG "\tsqnum    %llu\n", seb->sqnum);
+		pr_err("\tlnum     %d\n", aeb->lnum);
+		pr_err("\tscrub    %d\n", aeb->scrub);
+		pr_err("\tsqnum    %llu\n", aeb->sqnum);
 	}
 }
 
 /**
- * ubi_dbg_dump_mkvol_req - dump a &struct ubi_mkvol_req object.
+ * ubi_dump_mkvol_req - dump a &struct ubi_mkvol_req object.
  * @req: the object to dump
  */
-void ubi_dbg_dump_mkvol_req(const struct ubi_mkvol_req *req)
+void ubi_dump_mkvol_req(const struct ubi_mkvol_req *req)
 {
 	char nm[17];
 
-	printk(KERN_DEBUG "Volume creation request dump:\n");
-	printk(KERN_DEBUG "\tvol_id    %d\n",   req->vol_id);
-	printk(KERN_DEBUG "\talignment %d\n",   req->alignment);
-	printk(KERN_DEBUG "\tbytes     %lld\n", (long long)req->bytes);
-	printk(KERN_DEBUG "\tvol_type  %d\n",   req->vol_type);
-	printk(KERN_DEBUG "\tname_len  %d\n",   req->name_len);
+	pr_err("Volume creation request dump:\n");
+	pr_err("\tvol_id    %d\n",   req->vol_id);
+	pr_err("\talignment %d\n",   req->alignment);
+	pr_err("\tbytes     %lld\n", (long long)req->bytes);
+	pr_err("\tvol_type  %d\n",   req->vol_type);
+	pr_err("\tname_len  %d\n",   req->name_len);
 
 	memcpy(nm, req->name, 16);
 	nm[16] = 0;
-	printk(KERN_DEBUG "\t1st 16 characters of name: %s\n", nm);
-}
-
-/**
- * ubi_dbg_dump_flash - dump a region of flash.
- * @ubi: UBI device description object
- * @pnum: the physical eraseblock number to dump
- * @offset: the starting offset within the physical eraseblock to dump
- * @len: the length of the region to dump
- */
-void ubi_dbg_dump_flash(struct ubi_device *ubi, int pnum, int offset, int len)
-{
-	int err;
-	size_t read;
-	void *buf;
-	loff_t addr = (loff_t)pnum * ubi->peb_size + offset;
-
-	buf = vmalloc(len);
-	if (!buf)
-		return;
-	err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf);
-	if (err && err != -EUCLEAN) {
-		ubi_err("error %d while reading %d bytes from PEB %d:%d, "
-			"read %zd bytes", err, len, pnum, offset, read);
-		goto out;
-	}
-
-	dbg_msg("dumping %d bytes of data from PEB %d, offset %d",
-		len, pnum, offset);
-	print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, buf, len, 1);
-out:
-	vfree(buf);
-	return;
-}
-
-/**
- * ubi_debugging_init_dev - initialize debugging for an UBI device.
- * @ubi: UBI device description object
- *
- * This function initializes debugging-related data for UBI device @ubi.
- * Returns zero in case of success and a negative error code in case of
- * failure.
- */
-int ubi_debugging_init_dev(struct ubi_device *ubi)
-{
-	ubi->dbg = kzalloc(sizeof(struct ubi_debug_info), GFP_KERNEL);
-	if (!ubi->dbg)
-		return -ENOMEM;
-
-	return 0;
-}
-
-/**
- * ubi_debugging_exit_dev - free debugging data for an UBI device.
- * @ubi: UBI device description object
- */
-void ubi_debugging_exit_dev(struct ubi_device *ubi)
-{
-	kfree(ubi->dbg);
+	pr_err("\t1st 16 characters of name: %s\n", nm);
 }
 
 /*
@@ -271,6 +231,9 @@
  */
 int ubi_debugfs_init(void)
 {
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
+		return 0;
+
 	dfs_rootdir = debugfs_create_dir("ubi", NULL);
 	if (IS_ERR_OR_NULL(dfs_rootdir)) {
 		int err = dfs_rootdir ? -ENODEV : PTR_ERR(dfs_rootdir);
@@ -288,7 +251,8 @@
  */
 void ubi_debugfs_exit(void)
 {
-	debugfs_remove(dfs_rootdir);
+	if (IS_ENABLED(CONFIG_DEBUG_FS))
+		debugfs_remove(dfs_rootdir);
 }
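Replacing the CONFIG_MTD_UBI_DEBUG #ifdefs with IS_ENABLED(CONFIG_DEBUG_FS) checks keeps the debugfs code always compiled and type-checked, while the compiler drops the dead branch when debugfs is off. A simplified stand-alone illustration of the pattern; the real kernel macro also handles tristate symbols.

/* Editor's sketch: IS_ENABLED() as a compile-time constant branch.
 * The macro below is a simplification of the kernel's <linux/kconfig.h> one. */
#include <stdio.h>

#define CONFIG_DEBUG_FS 1		/* assumed enabled for the example */
#define IS_ENABLED(option) (option)

static int debugfs_init_dev(int ubi_num)
{
	if (!IS_ENABLED(CONFIG_DEBUG_FS))
		return 0;		/* folded away when the option is off */

	printf("creating debugfs entries for ubi%d\n", ubi_num);
	return 0;
}

int main(void)
{
	return debugfs_init_dev(0);
}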
 
 /* Read an UBI debugfs file */
@@ -305,7 +269,7 @@
 	ubi = ubi_get_device(ubi_num);
 	if (!ubi)
 		return -ENODEV;
-	d = ubi->dbg;
+	d = &ubi->dbg;
 
 	if (dent == d->dfs_chk_gen)
 		val = d->chk_gen;
@@ -351,7 +315,7 @@
 	ubi = ubi_get_device(ubi_num);
 	if (!ubi)
 		return -ENODEV;
-	d = ubi->dbg;
+	d = &ubi->dbg;
 
 	buf_size = min_t(size_t, count, (sizeof(buf) - 1));
 	if (copy_from_user(buf, user_buf, buf_size)) {
@@ -416,7 +380,10 @@
 	unsigned long ubi_num = ubi->ubi_num;
 	const char *fname;
 	struct dentry *dent;
-	struct ubi_debug_info *d = ubi->dbg;
+	struct ubi_debug_info *d = &ubi->dbg;
+
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
+		return 0;
 
 	n = snprintf(d->dfs_dir_name, UBI_DFS_DIR_LEN + 1, UBI_DFS_DIR_NAME,
 		     ubi->ubi_num);
@@ -485,7 +452,6 @@
  */
 void ubi_debugfs_exit_dev(struct ubi_device *ubi)
 {
-	debugfs_remove_recursive(ubi->dbg->dfs_dir);
+	if (IS_ENABLED(CONFIG_DEBUG_FS))
+		debugfs_remove_recursive(ubi->dbg.dfs_dir);
 }
-
-#endif /* CONFIG_MTD_UBI_DEBUG */
diff -ur a/drivers/mtd/ubi/debug.h b/drivers/mtd/ubi/debug.h
--- a/drivers/mtd/ubi/debug.h	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/debug.h	2014-02-17 11:57:40.000000000 +0100
@@ -21,31 +21,26 @@
 #ifndef __UBI_DEBUG_H__
 #define __UBI_DEBUG_H__
 
-#ifdef CONFIG_MTD_UBI_DEBUG
+void ubi_dump_flash(struct ubi_device *ubi, int pnum, int offset, int len);
+void ubi_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr);
+void ubi_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr);
+
 #include <linux/random.h>
 
 #define ubi_assert(expr)  do {                                               \
 	if (unlikely(!(expr))) {                                             \
-		printk(KERN_CRIT "UBI assert failed in %s at %u (pid %d)\n", \
+		pr_crit("UBI assert failed in %s at %u (pid %d)\n",          \
 		       __func__, __LINE__, current->pid);                    \
-		ubi_dbg_dump_stack();                                        \
+		dump_stack();                                                \
 	}                                                                    \
 } while (0)
 
-#define dbg_err(fmt, ...) ubi_err(fmt, ##__VA_ARGS__)
-
-#define ubi_dbg_dump_stack() dump_stack()
-
-#define ubi_dbg_print_hex_dump(l, ps, pt, r, g, b, len, a)  \
+#define ubi_dbg_print_hex_dump(l, ps, pt, r, g, b, len, a)                   \
 		print_hex_dump(l, ps, pt, r, g, b, len, a)
 
 #define ubi_dbg_msg(type, fmt, ...) \
-	pr_debug("UBI DBG " type ": " fmt "\n", ##__VA_ARGS__)
-
-/* Just a debugging messages not related to any specific UBI subsystem */
-#define dbg_msg(fmt, ...)                                    \
-	printk(KERN_DEBUG "UBI DBG (pid %d): %s: " fmt "\n", \
-	       current->pid, __func__, ##__VA_ARGS__)
+	pr_debug("UBI DBG " type " (pid %d): " fmt "\n", current->pid,       \
+		 ##__VA_ARGS__)
 
 /* General debugging messages */
 #define dbg_gen(fmt, ...) ubi_dbg_msg("gen", fmt, ##__VA_ARGS__)
@@ -58,62 +53,18 @@
 /* Initialization and build messages */
 #define dbg_bld(fmt, ...) ubi_dbg_msg("bld", fmt, ##__VA_ARGS__)
 
-void ubi_dbg_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr);
-void ubi_dbg_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr);
-void ubi_dbg_dump_vol_info(const struct ubi_volume *vol);
-void ubi_dbg_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx);
-void ubi_dbg_dump_sv(const struct ubi_scan_volume *sv);
-void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb, int type);
-void ubi_dbg_dump_mkvol_req(const struct ubi_mkvol_req *req);
-void ubi_dbg_dump_flash(struct ubi_device *ubi, int pnum, int offset, int len);
-int ubi_dbg_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len);
-int ubi_dbg_check_write(struct ubi_device *ubi, const void *buf, int pnum,
-			int offset, int len);
-int ubi_debugging_init_dev(struct ubi_device *ubi);
-void ubi_debugging_exit_dev(struct ubi_device *ubi);
+void ubi_dump_vol_info(const struct ubi_volume *vol);
+void ubi_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx);
+void ubi_dump_av(const struct ubi_ainf_volume *av);
+void ubi_dump_aeb(const struct ubi_ainf_peb *aeb, int type);
+void ubi_dump_mkvol_req(const struct ubi_mkvol_req *req);
+int ubi_self_check_all_ff(struct ubi_device *ubi, int pnum, int offset,
+			  int len);
 int ubi_debugfs_init(void);
 void ubi_debugfs_exit(void);
 int ubi_debugfs_init_dev(struct ubi_device *ubi);
 void ubi_debugfs_exit_dev(struct ubi_device *ubi);
 
-/*
- * The UBI debugfs directory name pattern and maximum name length (3 for "ubi"
- * + 2 for the number plus 1 for the trailing zero byte.
- */
-#define UBI_DFS_DIR_NAME "ubi%d"
-#define UBI_DFS_DIR_LEN  (3 + 2 + 1)
-
-/**
- * struct ubi_debug_info - debugging information for an UBI device.
- *
- * @chk_gen: if UBI general extra checks are enabled
- * @chk_io: if UBI I/O extra checks are enabled
- * @disable_bgt: disable the background task for testing purposes
- * @emulate_bitflips: emulate bit-flips for testing purposes
- * @emulate_io_failures: emulate write/erase failures for testing purposes
- * @dfs_dir_name: name of debugfs directory containing files of this UBI device
- * @dfs_dir: direntry object of the UBI device debugfs directory
- * @dfs_chk_gen: debugfs knob to enable UBI general extra checks
- * @dfs_chk_io: debugfs knob to enable UBI I/O extra checks
- * @dfs_disable_bgt: debugfs knob to disable the background task
- * @dfs_emulate_bitflips: debugfs knob to emulate bit-flips
- * @dfs_emulate_io_failures: debugfs knob to emulate write/erase failures
- */
-struct ubi_debug_info {
-	unsigned int chk_gen:1;
-	unsigned int chk_io:1;
-	unsigned int disable_bgt:1;
-	unsigned int emulate_bitflips:1;
-	unsigned int emulate_io_failures:1;
-	char dfs_dir_name[UBI_DFS_DIR_LEN + 1];
-	struct dentry *dfs_dir;
-	struct dentry *dfs_chk_gen;
-	struct dentry *dfs_chk_io;
-	struct dentry *dfs_disable_bgt;
-	struct dentry *dfs_emulate_bitflips;
-	struct dentry *dfs_emulate_io_failures;
-};
-
 /**
  * ubi_dbg_is_bgt_disabled - if the background thread is disabled.
  * @ubi: UBI device description object
@@ -123,7 +74,7 @@
  */
 static inline int ubi_dbg_is_bgt_disabled(const struct ubi_device *ubi)
 {
-	return ubi->dbg->disable_bgt;
+	return ubi->dbg.disable_bgt;
 }
 
 /**
@@ -134,7 +85,7 @@
  */
 static inline int ubi_dbg_is_bitflip(const struct ubi_device *ubi)
 {
-	if (ubi->dbg->emulate_bitflips)
+	if (ubi->dbg.emulate_bitflips)
 		return !(random32() % 200);
 	return 0;
 }
@@ -148,7 +99,7 @@
  */
 static inline int ubi_dbg_is_write_failure(const struct ubi_device *ubi)
 {
-	if (ubi->dbg->emulate_io_failures)
+	if (ubi->dbg.emulate_io_failures)
 		return !(random32() % 500);
 	return 0;
 }
@@ -162,78 +113,18 @@
  */
 static inline int ubi_dbg_is_erase_failure(const struct ubi_device *ubi)
 {
-	if (ubi->dbg->emulate_io_failures)
+	if (ubi->dbg.emulate_io_failures)
 		return !(random32() % 400);
 	return 0;
 }
 
-#else
-
-/* Use "if (0)" to make compiler check arguments even if debugging is off */
-#define ubi_assert(expr)  do {                                               \
-	if (0) {                                                             \
-		printk(KERN_CRIT "UBI assert failed in %s at %u (pid %d)\n", \
-		       __func__, __LINE__, current->pid);                    \
-	}                                                                    \
-} while (0)
-
-#define dbg_err(fmt, ...) do {                                               \
-	if (0)                                                               \
-		ubi_err(fmt, ##__VA_ARGS__);                                 \
-} while (0)
-
-#define ubi_dbg_msg(fmt, ...) do {                                           \
-	if (0)                                                               \
-		printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__);                  \
-} while (0)
-
-#define dbg_msg(fmt, ...)  ubi_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gen(fmt, ...)  ubi_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_eba(fmt, ...)  ubi_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_wl(fmt, ...)   ubi_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_io(fmt, ...)   ubi_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_bld(fmt, ...)  ubi_dbg_msg(fmt, ##__VA_ARGS__)
-
-static inline void ubi_dbg_dump_stack(void)                          { return; }
-static inline void
-ubi_dbg_dump_ec_hdr(const struct ubi_ec_hdr *ec_hdr)                 { return; }
-static inline void
-ubi_dbg_dump_vid_hdr(const struct ubi_vid_hdr *vid_hdr)              { return; }
-static inline void
-ubi_dbg_dump_vol_info(const struct ubi_volume *vol)                  { return; }
-static inline void
-ubi_dbg_dump_vtbl_record(const struct ubi_vtbl_record *r, int idx)   { return; }
-static inline void ubi_dbg_dump_sv(const struct ubi_scan_volume *sv) { return; }
-static inline void ubi_dbg_dump_seb(const struct ubi_scan_leb *seb,
-				    int type)                        { return; }
-static inline void
-ubi_dbg_dump_mkvol_req(const struct ubi_mkvol_req *req)              { return; }
-static inline void ubi_dbg_dump_flash(struct ubi_device *ubi,
-				      int pnum, int offset, int len) { return; }
-static inline void
-ubi_dbg_print_hex_dump(const char *l, const char *ps, int pt, int r,
-		       int g, const void *b, size_t len, bool a)     { return; }
-static inline int ubi_dbg_check_all_ff(struct ubi_device *ubi,
-				       int pnum, int offset,
-				       int len)                    { return 0; }
-static inline int ubi_dbg_check_write(struct ubi_device *ubi,
-				      const void *buf, int pnum,
-				      int offset, int len)         { return 0; }
-
-static inline int ubi_debugging_init_dev(struct ubi_device *ubi)   { return 0; }
-static inline void ubi_debugging_exit_dev(struct ubi_device *ubi)  { return; }
-static inline int ubi_debugfs_init(void)                           { return 0; }
-static inline void ubi_debugfs_exit(void)                          { return; }
-static inline int ubi_debugfs_init_dev(struct ubi_device *ubi)     { return 0; }
-static inline void ubi_debugfs_exit_dev(struct ubi_device *ubi)    { return; }
-
-static inline int
-ubi_dbg_is_bgt_disabled(const struct ubi_device *ubi)              { return 0; }
-static inline int ubi_dbg_is_bitflip(const struct ubi_device *ubi) { return 0; }
-static inline int
-ubi_dbg_is_write_failure(const struct ubi_device *ubi)             { return 0; }
-static inline int
-ubi_dbg_is_erase_failure(const struct ubi_device *ubi)             { return 0; }
+static inline int ubi_dbg_chk_io(const struct ubi_device *ubi)
+{
+	return ubi->dbg.chk_io;
+}
 
-#endif /* !CONFIG_MTD_UBI_DEBUG */
+static inline int ubi_dbg_chk_gen(const struct ubi_device *ubi)
+{
+	return ubi->dbg.chk_gen;
+}
 #endif /* !__UBI_DEBUG_H__ */
diff -ur a/drivers/mtd/ubi/eba.c b/drivers/mtd/ubi/eba.c
--- a/drivers/mtd/ubi/eba.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/eba.c	2014-02-17 11:57:40.000000000 +0100
@@ -57,7 +57,7 @@
  * global sequence counter value. It also increases the global sequence
  * counter.
  */
-static unsigned long long next_sqnum(struct ubi_device *ubi)
+unsigned long long ubi_next_sqnum(struct ubi_device *ubi)
 {
 	unsigned long long sqnum;
 
@@ -340,8 +340,10 @@
 
 	dbg_eba("erase LEB %d:%d, PEB %d", vol_id, lnum, pnum);
 
+	down_read(&ubi->fm_sem);
 	vol->eba_tbl[lnum] = UBI_LEB_UNMAPPED;
-	err = ubi_wl_put_peb(ubi, pnum, 0);
+	up_read(&ubi->fm_sem);
+	err = ubi_wl_put_peb(ubi, vol_id, lnum, pnum, 0);
 
 out_unlock:
 	leb_write_unlock(ubi, vol_id, lnum);
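From here on, every update of a single eba_tbl[] slot is bracketed by down_read()/up_read() on the new fm_sem; the fastmap writer (in the fastmap.c added by this patch, not shown) presumably takes the semaphore for writing so it sees a stable EBA table while serialising it. A userspace stand-in for that inverted reader/writer usage, with a pthread rwlock instead of the kernel rw_semaphore:

/* Editor's sketch (compile with -lpthread): many "readers" may update
 * individual table slots concurrently, the snapshot taker locks exclusively. */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t fm_sem = PTHREAD_RWLOCK_INITIALIZER;
static int eba_tbl[4] = { -1, -1, -1, -1 };

static void map_leb(int lnum, int pnum)
{
	pthread_rwlock_rdlock(&fm_sem);	/* shared: slot updates may overlap */
	eba_tbl[lnum] = pnum;
	pthread_rwlock_unlock(&fm_sem);
}

static void write_fastmap(void)
{
	int i;

	pthread_rwlock_wrlock(&fm_sem);	/* exclusive: stable snapshot */
	for (i = 0; i < 4; i++)
		printf("LEB %d -> PEB %d\n", i, eba_tbl[i]);
	pthread_rwlock_unlock(&fm_sem);
}

int main(void)
{
	map_leb(0, 17);
	map_leb(2, 42);
	write_fastmap();
	return 0;
}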
@@ -420,9 +422,8 @@
 				 */
 				if (err == UBI_IO_BAD_HDR_EBADMSG ||
 				    err == UBI_IO_BAD_HDR) {
-					ubi_warn("corrupted VID header at PEB "
-						 "%d, LEB %d:%d", pnum, vol_id,
-						 lnum);
+					ubi_warn("corrupted VID header at PEB %d, LEB %d:%d",
+						 pnum, vol_id, lnum);
 					err = -EBADMSG;
 				} else
 					ubi_ro_mode(ubi);
@@ -507,7 +508,7 @@
 		return -ENOMEM;
 
 retry:
-	new_pnum = ubi_wl_get_peb(ubi, UBI_UNKNOWN);
+	new_pnum = ubi_wl_get_peb(ubi);
 	if (new_pnum < 0) {
 		ubi_free_vid_hdr(ubi, vid_hdr);
 		return new_pnum;
@@ -522,25 +523,25 @@
 		goto out_put;
 	}
 
-	vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));
+	vid_hdr->sqnum = cpu_to_be64(ubi_next_sqnum(ubi));
 	err = ubi_io_write_vid_hdr(ubi, new_pnum, vid_hdr);
 	if (err)
 		goto write_error;
 
 	data_size = offset + len;
 	mutex_lock(&ubi->buf_mutex);
-	memset(ubi->peb_buf1 + offset, 0xFF, len);
+	memset(ubi->peb_buf + offset, 0xFF, len);
 
 	/* Read everything before the area where the write failure happened */
 	if (offset > 0) {
-		err = ubi_io_read_data(ubi, ubi->peb_buf1, pnum, 0, offset);
+		err = ubi_io_read_data(ubi, ubi->peb_buf, pnum, 0, offset);
 		if (err && err != UBI_IO_BITFLIPS)
 			goto out_unlock;
 	}
 
-	memcpy(ubi->peb_buf1 + offset, buf, len);
+	memcpy(ubi->peb_buf + offset, buf, len);
 
-	err = ubi_io_write_data(ubi, ubi->peb_buf1, new_pnum, 0, data_size);
+	err = ubi_io_write_data(ubi, ubi->peb_buf, new_pnum, 0, data_size);
 	if (err) {
 		mutex_unlock(&ubi->buf_mutex);
 		goto write_error;
@@ -549,8 +550,10 @@
 	mutex_unlock(&ubi->buf_mutex);
 	ubi_free_vid_hdr(ubi, vid_hdr);
 
+	down_read(&ubi->fm_sem);
 	vol->eba_tbl[lnum] = new_pnum;
-	ubi_wl_put_peb(ubi, pnum, 1);
+	up_read(&ubi->fm_sem);
+	ubi_wl_put_peb(ubi, vol_id, lnum, pnum, 1);
 
 	ubi_msg("data was successfully recovered");
 	return 0;
@@ -558,7 +561,7 @@
 out_unlock:
 	mutex_unlock(&ubi->buf_mutex);
 out_put:
-	ubi_wl_put_peb(ubi, new_pnum, 1);
+	ubi_wl_put_peb(ubi, vol_id, lnum, new_pnum, 1);
 	ubi_free_vid_hdr(ubi, vid_hdr);
 	return err;
 
@@ -568,7 +571,7 @@
 	 * get another one.
 	 */
 	ubi_warn("failed to write to PEB %d", new_pnum);
-	ubi_wl_put_peb(ubi, new_pnum, 1);
+	ubi_wl_put_peb(ubi, vol_id, lnum, new_pnum, 1);
 	if (++tries > UBI_IO_RETRIES) {
 		ubi_free_vid_hdr(ubi, vid_hdr);
 		return err;
@@ -585,7 +588,6 @@
  * @buf: the data to write
  * @offset: offset within the logical eraseblock where to write
  * @len: how many bytes to write
- * @dtype: data type
  *
  * This function writes data to logical eraseblock @lnum of a dynamic volume
  * @vol. Returns zero in case of success and a negative error code in case
@@ -593,7 +595,7 @@
  * written to the flash media, but may be some garbage.
  */
 int ubi_eba_write_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum,
-		      const void *buf, int offset, int len, int dtype)
+		      const void *buf, int offset, int len)
 {
 	int err, pnum, tries = 0, vol_id = vol->vol_id;
 	struct ubi_vid_hdr *vid_hdr;
@@ -634,14 +636,14 @@
 	}
 
 	vid_hdr->vol_type = UBI_VID_DYNAMIC;
-	vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));
+	vid_hdr->sqnum = cpu_to_be64(ubi_next_sqnum(ubi));
 	vid_hdr->vol_id = cpu_to_be32(vol_id);
 	vid_hdr->lnum = cpu_to_be32(lnum);
 	vid_hdr->compat = ubi_get_compat(ubi, vol_id);
 	vid_hdr->data_pad = cpu_to_be32(vol->data_pad);
 
 retry:
-	pnum = ubi_wl_get_peb(ubi, dtype);
+	pnum = ubi_wl_get_peb(ubi);
 	if (pnum < 0) {
 		ubi_free_vid_hdr(ubi, vid_hdr);
 		leb_write_unlock(ubi, vol_id, lnum);
@@ -661,14 +663,15 @@
 	if (len) {
 		err = ubi_io_write_data(ubi, buf, pnum, offset, len);
 		if (err) {
-			ubi_warn("failed to write %d bytes at offset %d of "
-				 "LEB %d:%d, PEB %d", len, offset, vol_id,
-				 lnum, pnum);
+			ubi_warn("failed to write %d bytes at offset %d of LEB %d:%d, PEB %d",
+				 len, offset, vol_id, lnum, pnum);
 			goto write_error;
 		}
 	}
 
+	down_read(&ubi->fm_sem);
 	vol->eba_tbl[lnum] = pnum;
+	up_read(&ubi->fm_sem);
 
 	leb_write_unlock(ubi, vol_id, lnum);
 	ubi_free_vid_hdr(ubi, vid_hdr);
@@ -687,7 +690,7 @@
 	 * eraseblock, so just put it and request a new one. We assume that if
 	 * this physical eraseblock went bad, the erase code will handle that.
 	 */
-	err = ubi_wl_put_peb(ubi, pnum, 1);
+	err = ubi_wl_put_peb(ubi, vol_id, lnum, pnum, 1);
 	if (err || ++tries > UBI_IO_RETRIES) {
 		ubi_ro_mode(ubi);
 		leb_write_unlock(ubi, vol_id, lnum);
@@ -695,7 +698,7 @@
 		return err;
 	}
 
-	vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));
+	vid_hdr->sqnum = cpu_to_be64(ubi_next_sqnum(ubi));
 	ubi_msg("try another PEB");
 	goto retry;
 }
@@ -707,7 +710,6 @@
  * @lnum: logical eraseblock number
  * @buf: data to write
  * @len: how many bytes to write
- * @dtype: data type
  * @used_ebs: how many logical eraseblocks will this volume contain
  *
  * This function writes data to logical eraseblock @lnum of static volume
@@ -724,8 +726,7 @@
  * code in case of failure.
  */
 int ubi_eba_write_leb_st(struct ubi_device *ubi, struct ubi_volume *vol,
-			 int lnum, const void *buf, int len, int dtype,
-			 int used_ebs)
+			 int lnum, const void *buf, int len, int used_ebs)
 {
 	int err, pnum, tries = 0, data_size = len, vol_id = vol->vol_id;
 	struct ubi_vid_hdr *vid_hdr;
@@ -750,7 +751,7 @@
 		return err;
 	}
 
-	vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));
+	vid_hdr->sqnum = cpu_to_be64(ubi_next_sqnum(ubi));
 	vid_hdr->vol_id = cpu_to_be32(vol_id);
 	vid_hdr->lnum = cpu_to_be32(lnum);
 	vid_hdr->compat = ubi_get_compat(ubi, vol_id);
@@ -763,7 +764,7 @@
 	vid_hdr->data_crc = cpu_to_be32(crc);
 
 retry:
-	pnum = ubi_wl_get_peb(ubi, dtype);
+	pnum = ubi_wl_get_peb(ubi);
 	if (pnum < 0) {
 		ubi_free_vid_hdr(ubi, vid_hdr);
 		leb_write_unlock(ubi, vol_id, lnum);
@@ -788,7 +789,9 @@
 	}
 
 	ubi_assert(vol->eba_tbl[lnum] < 0);
+	down_read(&ubi->fm_sem);
 	vol->eba_tbl[lnum] = pnum;
+	up_read(&ubi->fm_sem);
 
 	leb_write_unlock(ubi, vol_id, lnum);
 	ubi_free_vid_hdr(ubi, vid_hdr);
@@ -807,7 +810,7 @@
 		return err;
 	}
 
-	err = ubi_wl_put_peb(ubi, pnum, 1);
+	err = ubi_wl_put_peb(ubi, vol_id, lnum, pnum, 1);
 	if (err || ++tries > UBI_IO_RETRIES) {
 		ubi_ro_mode(ubi);
 		leb_write_unlock(ubi, vol_id, lnum);
@@ -815,7 +818,7 @@
 		return err;
 	}
 
-	vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));
+	vid_hdr->sqnum = cpu_to_be64(ubi_next_sqnum(ubi));
 	ubi_msg("try another PEB");
 	goto retry;
 }
@@ -827,7 +830,6 @@
  * @lnum: logical eraseblock number
  * @buf: data to write
  * @len: how many bytes to write
- * @dtype: data type
  *
  * This function changes the contents of a logical eraseblock atomically. @buf
  * has to contain new logical eraseblock data, and @len - the length of the
@@ -839,7 +841,7 @@
  * LEB change may be done at a time. This is ensured by @ubi->alc_mutex.
  */
 int ubi_eba_atomic_leb_change(struct ubi_device *ubi, struct ubi_volume *vol,
-			      int lnum, const void *buf, int len, int dtype)
+			      int lnum, const void *buf, int len)
 {
 	int err, pnum, tries = 0, vol_id = vol->vol_id;
 	struct ubi_vid_hdr *vid_hdr;
@@ -856,7 +858,7 @@
 		err = ubi_eba_unmap_leb(ubi, vol, lnum);
 		if (err)
 			return err;
-		return ubi_eba_write_leb(ubi, vol, lnum, NULL, 0, 0, dtype);
+		return ubi_eba_write_leb(ubi, vol, lnum, NULL, 0, 0);
 	}
 
 	vid_hdr = ubi_zalloc_vid_hdr(ubi, GFP_NOFS);
@@ -868,7 +870,7 @@
 	if (err)
 		goto out_mutex;
 
-	vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));
+	vid_hdr->sqnum = cpu_to_be64(ubi_next_sqnum(ubi));
 	vid_hdr->vol_id = cpu_to_be32(vol_id);
 	vid_hdr->lnum = cpu_to_be32(lnum);
 	vid_hdr->compat = ubi_get_compat(ubi, vol_id);
@@ -881,7 +883,7 @@
 	vid_hdr->data_crc = cpu_to_be32(crc);
 
 retry:
-	pnum = ubi_wl_get_peb(ubi, dtype);
+	pnum = ubi_wl_get_peb(ubi);
 	if (pnum < 0) {
 		err = pnum;
 		goto out_leb_unlock;
@@ -905,12 +907,14 @@
 	}
 
 	if (vol->eba_tbl[lnum] >= 0) {
-		err = ubi_wl_put_peb(ubi, vol->eba_tbl[lnum], 0);
+		err = ubi_wl_put_peb(ubi, vol_id, lnum, vol->eba_tbl[lnum], 0);
 		if (err)
 			goto out_leb_unlock;
 	}
 
+	down_read(&ubi->fm_sem);
 	vol->eba_tbl[lnum] = pnum;
+	up_read(&ubi->fm_sem);
 
 out_leb_unlock:
 	leb_write_unlock(ubi, vol_id, lnum);
@@ -930,13 +934,13 @@
 		goto out_leb_unlock;
 	}
 
-	err = ubi_wl_put_peb(ubi, pnum, 1);
+	err = ubi_wl_put_peb(ubi, vol_id, lnum, pnum, 1);
 	if (err || ++tries > UBI_IO_RETRIES) {
 		ubi_ro_mode(ubi);
 		goto out_leb_unlock;
 	}
 
-	vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));
+	vid_hdr->sqnum = cpu_to_be64(ubi_next_sqnum(ubi));
 	ubi_msg("try another PEB");
 	goto retry;
 }
@@ -979,7 +983,7 @@
  * physical eraseblock @to. The @vid_hdr buffer may be changed by this
  * function. Returns:
  *   o %0 in case of success;
- *   o %MOVE_CANCEL_RACE, %MOVE_TARGET_WR_ERR, %MOVE_CANCEL_BITFLIPS, etc;
+ *   o %MOVE_CANCEL_RACE, %MOVE_TARGET_WR_ERR, %MOVE_TARGET_BITFLIPS, etc;
  *   o a negative error code in case of failure.
  */
 int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,
@@ -1044,22 +1048,21 @@
 	 * cancel it.
 	 */
 	if (vol->eba_tbl[lnum] != from) {
-		dbg_wl("LEB %d:%d is no longer mapped to PEB %d, mapped to "
-		       "PEB %d, cancel", vol_id, lnum, from,
-		       vol->eba_tbl[lnum]);
+		dbg_wl("LEB %d:%d is no longer mapped to PEB %d, mapped to PEB %d, cancel",
+		       vol_id, lnum, from, vol->eba_tbl[lnum]);
 		err = MOVE_CANCEL_RACE;
 		goto out_unlock_leb;
 	}
 
 	/*
 	 * OK, now the LEB is locked and we can safely start moving it. Since
-	 * this function utilizes the @ubi->peb_buf1 buffer which is shared
+	 * this function utilizes the @ubi->peb_buf buffer which is shared
 	 * with some other functions - we lock the buffer by taking the
 	 * @ubi->buf_mutex.
 	 */
 	mutex_lock(&ubi->buf_mutex);
 	dbg_wl("read %d bytes of data", aldata_size);
-	err = ubi_io_read_data(ubi, ubi->peb_buf1, from, 0, aldata_size);
+	err = ubi_io_read_data(ubi, ubi->peb_buf, from, 0, aldata_size);
 	if (err && err != UBI_IO_BITFLIPS) {
 		ubi_warn("error %d while reading data from PEB %d",
 			 err, from);
@@ -1079,10 +1082,10 @@
 	 */
 	if (vid_hdr->vol_type == UBI_VID_DYNAMIC)
 		aldata_size = data_size =
-			ubi_calc_data_len(ubi, ubi->peb_buf1, data_size);
+			ubi_calc_data_len(ubi, ubi->peb_buf, data_size);
 
 	cond_resched();
-	crc = crc32(UBI_CRC32_INIT, ubi->peb_buf1, data_size);
+	crc = crc32(UBI_CRC32_INIT, ubi->peb_buf, data_size);
 	cond_resched();
 
 	/*
@@ -1096,7 +1099,7 @@
 		vid_hdr->data_size = cpu_to_be32(data_size);
 		vid_hdr->data_crc = cpu_to_be32(crc);
 	}
-	vid_hdr->sqnum = cpu_to_be64(next_sqnum(ubi));
+	vid_hdr->sqnum = cpu_to_be64(ubi_next_sqnum(ubi));
 
 	err = ubi_io_write_vid_hdr(ubi, to, vid_hdr);
 	if (err) {
@@ -1111,17 +1114,17 @@
 	err = ubi_io_read_vid_hdr(ubi, to, vid_hdr, 1);
 	if (err) {
 		if (err != UBI_IO_BITFLIPS) {
-			ubi_warn("error %d while reading VID header back from "
-				  "PEB %d", err, to);
+			ubi_warn("error %d while reading VID header back from PEB %d",
+				 err, to);
 			if (is_error_sane(err))
 				err = MOVE_TARGET_RD_ERR;
 		} else
-			err = MOVE_CANCEL_BITFLIPS;
+			err = MOVE_TARGET_BITFLIPS;
 		goto out_unlock_buf;
 	}
 
 	if (data_size > 0) {
-		err = ubi_io_write_data(ubi, ubi->peb_buf1, to, 0, aldata_size);
+		err = ubi_io_write_data(ubi, ubi->peb_buf, to, 0, aldata_size);
 		if (err) {
 			if (err == -EIO)
 				err = MOVE_TARGET_WR_ERR;
@@ -1134,31 +1137,33 @@
 		 * We've written the data and are going to read it back to make
 		 * sure it was written correctly.
 		 */
-
-		err = ubi_io_read_data(ubi, ubi->peb_buf2, to, 0, aldata_size);
+		memset(ubi->peb_buf, 0xFF, aldata_size);
+		err = ubi_io_read_data(ubi, ubi->peb_buf, to, 0, aldata_size);
 		if (err) {
 			if (err != UBI_IO_BITFLIPS) {
-				ubi_warn("error %d while reading data back "
-					 "from PEB %d", err, to);
+				ubi_warn("error %d while reading data back from PEB %d",
+					 err, to);
 				if (is_error_sane(err))
 					err = MOVE_TARGET_RD_ERR;
 			} else
-				err = MOVE_CANCEL_BITFLIPS;
+				err = MOVE_TARGET_BITFLIPS;
 			goto out_unlock_buf;
 		}
 
 		cond_resched();
 
-		if (memcmp(ubi->peb_buf1, ubi->peb_buf2, aldata_size)) {
-			ubi_warn("read data back from PEB %d and it is "
-				 "different", to);
+		if (crc != crc32(UBI_CRC32_INIT, ubi->peb_buf, data_size)) {
+			ubi_warn("read data back from PEB %d and it is different",
+				 to);
 			err = -EINVAL;
 			goto out_unlock_buf;
 		}
 	}
 
 	ubi_assert(vol->eba_tbl[lnum] == from);
+	down_read(&ubi->fm_sem);
 	vol->eba_tbl[lnum] = to;
+	up_read(&ubi->fm_sem);
 
 out_unlock_buf:
 	mutex_unlock(&ubi->buf_mutex);
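The rewritten verification above frees UBI from keeping a second PEB-sized buffer: the data read back from the target PEB is checked against the CRC32 computed before writing instead of being memcmp()'d against a copy. A small userspace sketch of write-then-verify-by-CRC; zlib's crc32() stands in for the kernel's, and UBI actually seeds the CRC with UBI_CRC32_INIT rather than 0.

/* Editor's sketch (compile with -lz): keep only the CRC of what was written
 * and recompute it over the data read back. */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
	unsigned char data[64] = "logical eraseblock payload";
	unsigned char readback[64];
	unsigned long crc_written, crc_read;

	crc_written = crc32(0L, data, sizeof(data));

	memcpy(readback, data, sizeof(data));	/* pretend flash round-trip */

	crc_read = crc32(0L, readback, sizeof(readback));
	if (crc_read != crc_written)
		fprintf(stderr, "read data back and it is different\n");
	else
		printf("copy verified, crc %#lx\n", crc_written);
	return 0;
}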
@@ -1171,7 +1176,7 @@
  * print_rsvd_warning - warn about not having enough reserved PEBs.
  * @ubi: UBI device description object
  *
- * This is a helper function for 'ubi_eba_init_scan()' which is called when UBI
+ * This is a helper function for 'ubi_eba_init()' which is called when UBI
  * cannot reserve enough PEBs for bad block handling. This function makes a
  * decision whether we have to print a warning or not. The algorithm is as
  * follows:
@@ -1186,13 +1191,13 @@
  * reported by real users.
  */
 static void print_rsvd_warning(struct ubi_device *ubi,
-			       struct ubi_scan_info *si)
+			       struct ubi_attach_info *ai)
 {
 	/*
 	 * The 1 << 18 (256KiB) number is picked randomly, just a reasonably
 	 * large number to distinguish between newly flashed and used images.
 	 */
-	if (si->max_sqnum > (1 << 18)) {
+	if (ai->max_sqnum > (1 << 18)) {
 		int min = ubi->beb_rsvd_level / 10;
 
 		if (!min)
@@ -1201,27 +1206,123 @@
 			return;
 	}
 
-	ubi_warn("cannot reserve enough PEBs for bad PEB handling, reserved %d,"
-		 " need %d", ubi->beb_rsvd_pebs, ubi->beb_rsvd_level);
+	ubi_warn("cannot reserve enough PEBs for bad PEB handling, reserved %d, need %d",
+		 ubi->beb_rsvd_pebs, ubi->beb_rsvd_level);
 	if (ubi->corr_peb_count)
 		ubi_warn("%d PEBs are corrupted and not used",
-			ubi->corr_peb_count);
+			 ubi->corr_peb_count);
+}
+
+/**
+ * self_check_eba - run a self check on the EBA table constructed by fastmap.
+ * @ubi: UBI device description object
+ * @ai_fastmap: UBI attach info object created by fastmap
+ * @ai_scan: UBI attach info object created by scanning
+ *
+ * Returns < 0 in case of an internal error, 0 otherwise.
+ * If a bad EBA table entry was found it will be printed out and
+ * ubi_assert() triggers.
+ */
+int self_check_eba(struct ubi_device *ubi, struct ubi_attach_info *ai_fastmap,
+		   struct ubi_attach_info *ai_scan)
+{
+	int i, j, num_volumes, ret = 0;
+	int **scan_eba, **fm_eba;
+	struct ubi_ainf_volume *av;
+	struct ubi_volume *vol;
+	struct ubi_ainf_peb *aeb;
+	struct rb_node *rb;
+
+	num_volumes = ubi->vtbl_slots + UBI_INT_VOL_COUNT;
+
+	scan_eba = kmalloc(sizeof(*scan_eba) * num_volumes, GFP_KERNEL);
+	if (!scan_eba)
+		return -ENOMEM;
+
+	fm_eba = kmalloc(sizeof(*fm_eba) * num_volumes, GFP_KERNEL);
+	if (!fm_eba) {
+		kfree(scan_eba);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < num_volumes; i++) {
+		vol = ubi->volumes[i];
+		if (!vol)
+			continue;
+
+		scan_eba[i] = kmalloc(vol->reserved_pebs * sizeof(**scan_eba),
+				      GFP_KERNEL);
+		if (!scan_eba[i]) {
+			ret = -ENOMEM;
+			goto out_free;
+		}
+
+		fm_eba[i] = kmalloc(vol->reserved_pebs * sizeof(**fm_eba),
+				    GFP_KERNEL);
+		if (!fm_eba[i]) {
+			ret = -ENOMEM;
+			goto out_free;
+		}
+
+		for (j = 0; j < vol->reserved_pebs; j++)
+			scan_eba[i][j] = fm_eba[i][j] = UBI_LEB_UNMAPPED;
+
+		av = ubi_find_av(ai_scan, idx2vol_id(ubi, i));
+		if (!av)
+			continue;
+
+		ubi_rb_for_each_entry(rb, aeb, &av->root, u.rb)
+			scan_eba[i][aeb->lnum] = aeb->pnum;
+
+		av = ubi_find_av(ai_fastmap, idx2vol_id(ubi, i));
+		if (!av)
+			continue;
+
+		ubi_rb_for_each_entry(rb, aeb, &av->root, u.rb)
+			fm_eba[i][aeb->lnum] = aeb->pnum;
+
+		for (j = 0; j < vol->reserved_pebs; j++) {
+			if (scan_eba[i][j] != fm_eba[i][j]) {
+				if (scan_eba[i][j] == UBI_LEB_UNMAPPED ||
+					fm_eba[i][j] == UBI_LEB_UNMAPPED)
+					continue;
+
+				ubi_err("LEB:%i:%i is PEB:%i instead of %i!",
+					vol->vol_id, i, fm_eba[i][j],
+					scan_eba[i][j]);
+				ubi_assert(0);
+			}
+		}
+	}
+
+out_free:
+	for (i = 0; i < num_volumes; i++) {
+		if (!ubi->volumes[i])
+			continue;
+
+		kfree(scan_eba[i]);
+		kfree(fm_eba[i]);
+	}
+
+	kfree(scan_eba);
+	kfree(fm_eba);
+	return ret;
 }
 
 /**
- * ubi_eba_init_scan - initialize the EBA sub-system using scanning information.
+ * ubi_eba_init - initialize the EBA sub-system using attaching information.
  * @ubi: UBI device description object
- * @si: scanning information
+ * @ai: attaching information
  *
  * This function returns zero in case of success and a negative error code in
  * case of failure.
  */
-int ubi_eba_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si)
+int ubi_eba_init(struct ubi_device *ubi, struct ubi_attach_info *ai)
 {
 	int i, j, err, num_volumes;
-	struct ubi_scan_volume *sv;
+	struct ubi_ainf_volume *av;
 	struct ubi_volume *vol;
-	struct ubi_scan_leb *seb;
+	struct ubi_ainf_peb *aeb;
 	struct rb_node *rb;
 
 	dbg_eba("initialize EBA sub-system");
@@ -1230,7 +1331,7 @@
 	mutex_init(&ubi->alc_mutex);
 	ubi->ltree = RB_ROOT;
 
-	ubi->global_sqnum = si->max_sqnum + 1;
+	ubi->global_sqnum = ai->max_sqnum + 1;
 	num_volumes = ubi->vtbl_slots + UBI_INT_VOL_COUNT;
 
 	for (i = 0; i < num_volumes; i++) {
@@ -1250,18 +1351,18 @@
 		for (j = 0; j < vol->reserved_pebs; j++)
 			vol->eba_tbl[j] = UBI_LEB_UNMAPPED;
 
-		sv = ubi_scan_find_sv(si, idx2vol_id(ubi, i));
-		if (!sv)
+		av = ubi_find_av(ai, idx2vol_id(ubi, i));
+		if (!av)
 			continue;
 
-		ubi_rb_for_each_entry(rb, seb, &sv->root, u.rb) {
-			if (seb->lnum >= vol->reserved_pebs)
+		ubi_rb_for_each_entry(rb, aeb, &av->root, u.rb) {
+			if (aeb->lnum >= vol->reserved_pebs)
 				/*
 				 * This may happen in case of an unclean reboot
 				 * during re-size.
 				 */
-				ubi_scan_move_to_list(sv, seb, &si->erase);
-			vol->eba_tbl[seb->lnum] = seb->pnum;
+				ubi_move_aeb_to_list(av, aeb, &ai->erase);
+			vol->eba_tbl[aeb->lnum] = aeb->pnum;
 		}
 	}
 
@@ -1283,7 +1384,7 @@
 		if (ubi->avail_pebs < ubi->beb_rsvd_level) {
 			/* No enough free physical eraseblocks */
 			ubi->beb_rsvd_pebs = ubi->avail_pebs;
-			print_rsvd_warning(ubi, si);
+			print_rsvd_warning(ubi, ai);
 		} else
 			ubi->beb_rsvd_pebs = ubi->beb_rsvd_level;
 
Only in b/drivers/mtd/ubi: fastmap.c.
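The self_check_eba() helper added above only gets a caller in the new fastmap attach path (fastmap.c, listed above but not part of this hunk). Purely as an illustrative sketch, with do_full_scan(), err and ai_fastmap being hypothetical names, a caller that cross-checks the fastmap-built attach info against a full scan would look roughly like:

	if (ubi->dbg.chk_gen) {
		struct ubi_attach_info *ai_scan;

		/* Hypothetical helper: attach info built by a full scan */
		ai_scan = do_full_scan(ubi);
		if (!IS_ERR(ai_scan)) {
			err = self_check_eba(ubi, ai_fastmap, ai_scan);
			ubi_destroy_ai(ai_scan);
		}
	}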
diff -ur a/drivers/mtd/ubi/gluebi.c b/drivers/mtd/ubi/gluebi.c
--- a/drivers/mtd/ubi/gluebi.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/gluebi.c	2014-02-17 11:57:40.000000000 +0100
@@ -41,7 +41,7 @@
 #include "ubi-media.h"
 
 #define err_msg(fmt, ...)                                   \
-	printk(KERN_DEBUG "gluebi (pid %d): %s: " fmt "\n", \
+	pr_err("gluebi (pid %d): %s: " fmt "\n",            \
 	       current->pid, __func__, ##__VA_ARGS__)
 
 /**
@@ -171,7 +171,7 @@
 static int gluebi_read(struct mtd_info *mtd, loff_t from, size_t len,
 		       size_t *retlen, unsigned char *buf)
 {
-	int err = 0, lnum, offs, total_read;
+	int err = 0, lnum, offs, bytes_left;
 	struct gluebi_device *gluebi;
 
 	if (len < 0 || from < 0 || from + len > mtd->size)
@@ -180,12 +180,12 @@
 	gluebi = container_of(mtd, struct gluebi_device, mtd);
 
 	lnum = div_u64_rem(from, mtd->erasesize, &offs);
-	total_read = len;
-	while (total_read) {
+	bytes_left = len;
+	while (bytes_left) {
 		size_t to_read = mtd->erasesize - offs;
 
-		if (to_read > total_read)
-			to_read = total_read;
+		if (to_read > bytes_left)
+			to_read = bytes_left;
 
 		err = ubi_read(gluebi->desc, lnum, buf, offs, to_read);
 		if (err)
@@ -193,11 +193,11 @@
 
 		lnum += 1;
 		offs = 0;
-		total_read -= to_read;
+		bytes_left -= to_read;
 		buf += to_read;
 	}
 
-	*retlen = len - total_read;
+	*retlen = len - bytes_left;
 	return err;
 }
 
@@ -215,7 +215,7 @@
 static int gluebi_write(struct mtd_info *mtd, loff_t to, size_t len,
 			size_t *retlen, const u_char *buf)
 {
-	int err = 0, lnum, offs, total_written;
+	int err = 0, lnum, offs, bytes_left;
 	struct gluebi_device *gluebi;
 
 	if (len < 0 || to < 0 || len + to > mtd->size)
@@ -231,24 +231,24 @@
 	if (len % mtd->writesize || offs % mtd->writesize)
 		return -EINVAL;
 
-	total_written = len;
-	while (total_written) {
+	bytes_left = len;
+	while (bytes_left) {
 		size_t to_write = mtd->erasesize - offs;
 
-		if (to_write > total_written)
-			to_write = total_written;
+		if (to_write > bytes_left)
+			to_write = bytes_left;
 
-		err = ubi_write(gluebi->desc, lnum, buf, offs, to_write);
+		err = ubi_leb_write(gluebi->desc, lnum, buf, offs, to_write);
 		if (err)
 			break;
 
 		lnum += 1;
 		offs = 0;
-		total_written -= to_write;
+		bytes_left -= to_write;
 		buf += to_write;
 	}
 
-	*retlen = len - total_written;
+	*retlen = len - bytes_left;
 	return err;
 }
 
@@ -360,9 +360,8 @@
 	mutex_lock(&devices_mutex);
 	g = find_gluebi_nolock(vi->ubi_num, vi->vol_id);
 	if (g)
-		err_msg("gluebi MTD device %d form UBI device %d volume %d "
-			"already exists", g->mtd.index, vi->ubi_num,
-			vi->vol_id);
+		err_msg("gluebi MTD device %d from UBI device %d volume %d already exists",
+			g->mtd.index, vi->ubi_num, vi->vol_id);
 	mutex_unlock(&devices_mutex);
 
 	if (mtd_device_register(mtd, NULL, 0)) {
@@ -395,8 +394,8 @@
 	mutex_lock(&devices_mutex);
 	gluebi = find_gluebi_nolock(vi->ubi_num, vi->vol_id);
 	if (!gluebi) {
-		err_msg("got remove notification for unknown UBI device %d "
-			"volume %d", vi->ubi_num, vi->vol_id);
+		err_msg("got remove notification for unknown UBI device %d volume %d",
+			vi->ubi_num, vi->vol_id);
 		err = -ENOENT;
 	} else if (gluebi->refcnt)
 		err = -EBUSY;
@@ -409,9 +408,8 @@
 	mtd = &gluebi->mtd;
 	err = mtd_device_unregister(mtd);
 	if (err) {
-		err_msg("cannot remove fake MTD device %d, UBI device %d, "
-			"volume %d, error %d", mtd->index, gluebi->ubi_num,
-			gluebi->vol_id, err);
+		err_msg("cannot remove fake MTD device %d, UBI device %d, volume %d, error %d",
+			mtd->index, gluebi->ubi_num, gluebi->vol_id, err);
 		mutex_lock(&devices_mutex);
 		list_add_tail(&gluebi->list, &gluebi_devices);
 		mutex_unlock(&devices_mutex);
@@ -441,8 +439,8 @@
 	gluebi = find_gluebi_nolock(vi->ubi_num, vi->vol_id);
 	if (!gluebi) {
 		mutex_unlock(&devices_mutex);
-		err_msg("got update notification for unknown UBI device %d "
-			"volume %d", vi->ubi_num, vi->vol_id);
+		err_msg("got update notification for unknown UBI device %d volume %d",
+			vi->ubi_num, vi->vol_id);
 		return -ENOENT;
 	}
 
@@ -468,8 +466,8 @@
 	gluebi = find_gluebi_nolock(vi->ubi_num, vi->vol_id);
 	if (!gluebi) {
 		mutex_unlock(&devices_mutex);
-		err_msg("got update notification for unknown UBI device %d "
-			"volume %d", vi->ubi_num, vi->vol_id);
+		err_msg("got update notification for unknown UBI device %d volume %d",
+			vi->ubi_num, vi->vol_id);
 		return -ENOENT;
 	}
 	gluebi->mtd.size = vi->used_bytes;
@@ -526,9 +524,9 @@
 
 		err = mtd_device_unregister(mtd);
 		if (err)
-			err_msg("error %d while removing gluebi MTD device %d, "
-				"UBI device %d, volume %d - ignoring", err,
-				mtd->index, gluebi->ubi_num, gluebi->vol_id);
+			err_msg("error %d while removing gluebi MTD device %d, UBI device %d, volume %d - ignoring",
+				err, mtd->index, gluebi->ubi_num,
+				gluebi->vol_id);
 		kfree(mtd->name);
 		kfree(gluebi);
 	}
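The gluebi_read() and gluebi_write() changes above are mostly a rename (total_read/total_written become bytes_left) plus the switch to ubi_leb_write(); the per-eraseblock chunking itself is unchanged. As a standalone, userspace-style sketch of that chunking arithmetic (walk_chunks() and its callback are made up for illustration; the kernel code uses div_u64_rem() instead of plain / and %):

#include <stdint.h>
#include <stddef.h>

/*
 * Split a byte range starting at 'from' into per-eraseblock chunks the
 * way gluebi_read() walks LEBs: first the tail of the starting LEB,
 * then whole LEBs from offset 0 until 'len' bytes are covered.
 */
static void walk_chunks(uint64_t from, size_t len, uint32_t erasesize,
			void (*cb)(int lnum, uint32_t offs, size_t count))
{
	int lnum = from / erasesize;
	uint32_t offs = from % erasesize;
	size_t bytes_left = len;

	while (bytes_left) {
		size_t chunk = erasesize - offs;

		if (chunk > bytes_left)
			chunk = bytes_left;
		cb(lnum, offs, chunk);
		lnum += 1;
		offs = 0;
		bytes_left -= chunk;
	}
}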
diff -ur a/drivers/mtd/ubi/io.c b/drivers/mtd/ubi/io.c
--- a/drivers/mtd/ubi/io.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/io.c	2014-02-17 11:57:40.000000000 +0100
@@ -91,21 +91,15 @@
 #include <linux/slab.h>
 #include "ubi.h"
 
-#ifdef CONFIG_MTD_UBI_DEBUG
-static int paranoid_check_not_bad(const struct ubi_device *ubi, int pnum);
-static int paranoid_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum);
-static int paranoid_check_ec_hdr(const struct ubi_device *ubi, int pnum,
-				 const struct ubi_ec_hdr *ec_hdr);
-static int paranoid_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum);
-static int paranoid_check_vid_hdr(const struct ubi_device *ubi, int pnum,
-				  const struct ubi_vid_hdr *vid_hdr);
-#else
-#define paranoid_check_not_bad(ubi, pnum) 0
-#define paranoid_check_peb_ec_hdr(ubi, pnum)  0
-#define paranoid_check_ec_hdr(ubi, pnum, ec_hdr)  0
-#define paranoid_check_peb_vid_hdr(ubi, pnum) 0
-#define paranoid_check_vid_hdr(ubi, pnum, vid_hdr) 0
-#endif
+static int self_check_not_bad(const struct ubi_device *ubi, int pnum);
+static int self_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum);
+static int self_check_ec_hdr(const struct ubi_device *ubi, int pnum,
+			     const struct ubi_ec_hdr *ec_hdr);
+static int self_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum);
+static int self_check_vid_hdr(const struct ubi_device *ubi, int pnum,
+			      const struct ubi_vid_hdr *vid_hdr);
+static int self_check_write(struct ubi_device *ubi, const void *buf, int pnum,
+			    int offset, int len);
 
 /**
  * ubi_io_read - read data from a physical eraseblock.
@@ -142,7 +136,7 @@
 	ubi_assert(offset >= 0 && offset + len <= ubi->peb_size);
 	ubi_assert(len > 0);
 
-	err = paranoid_check_not_bad(ubi, pnum);
+	err = self_check_not_bad(ubi, pnum);
 	if (err)
 		return err;
 
@@ -183,22 +177,21 @@
 			 * enabled. A corresponding message will be printed
 			 * later, when it is has been scrubbed.
 			 */
-			dbg_msg("fixable bit-flip detected at PEB %d", pnum);
+			ubi_msg("fixable bit-flip detected at PEB %d", pnum);
 			ubi_assert(len == read);
 			return UBI_IO_BITFLIPS;
 		}
 
 		if (retries++ < UBI_IO_RETRIES) {
-			dbg_io("error %d%s while reading %d bytes from PEB "
-			       "%d:%d, read only %zd bytes, retry",
-			       err, errstr, len, pnum, offset, read);
+			ubi_warn("error %d%s while reading %d bytes from PEB %d:%d, read only %zd bytes, retry",
+				 err, errstr, len, pnum, offset, read);
 			yield();
 			goto retry;
 		}
 
-		ubi_err("error %d%s while reading %d bytes from PEB %d:%d, "
-			"read %zd bytes", err, errstr, len, pnum, offset, read);
-		ubi_dbg_dump_stack();
+		ubi_err("error %d%s while reading %d bytes from PEB %d:%d, read %zd bytes",
+			err, errstr, len, pnum, offset, read);
+		dump_stack();
 
 		/*
 		 * The driver should never return -EBADMSG if it failed to read
@@ -257,14 +250,12 @@
 		return -EROFS;
 	}
 
-	/* The below has to be compiled out if paranoid checks are disabled */
-
-	err = paranoid_check_not_bad(ubi, pnum);
+	err = self_check_not_bad(ubi, pnum);
 	if (err)
 		return err;
 
 	/* The area we are writing to has to contain all 0xFF bytes */
-	err = ubi_dbg_check_all_ff(ubi, pnum, offset, len);
+	err = ubi_self_check_all_ff(ubi, pnum, offset, len);
 	if (err)
 		return err;
 
@@ -273,33 +264,33 @@
 		 * We write to the data area of the physical eraseblock. Make
 		 * sure it has valid EC and VID headers.
 		 */
-		err = paranoid_check_peb_ec_hdr(ubi, pnum);
+		err = self_check_peb_ec_hdr(ubi, pnum);
 		if (err)
 			return err;
-		err = paranoid_check_peb_vid_hdr(ubi, pnum);
+		err = self_check_peb_vid_hdr(ubi, pnum);
 		if (err)
 			return err;
 	}
 
 	if (ubi_dbg_is_write_failure(ubi)) {
-		dbg_err("cannot write %d bytes to PEB %d:%d "
-			"(emulated)", len, pnum, offset);
-		ubi_dbg_dump_stack();
+		ubi_err("cannot write %d bytes to PEB %d:%d (emulated)",
+			len, pnum, offset);
+		dump_stack();
 		return -EIO;
 	}
 
 	addr = (loff_t)pnum * ubi->peb_size + offset;
 	err = ubi->mtd->write(ubi->mtd, addr, len, &written, buf);
 	if (err) {
-		ubi_err("error %d while writing %d bytes to PEB %d:%d, written "
-			"%zd bytes", err, len, pnum, offset, written);
-		ubi_dbg_dump_stack();
-		ubi_dbg_dump_flash(ubi, pnum, offset, len);
+		ubi_err("error %d while writing %d bytes to PEB %d:%d, written %zd bytes",
+			err, len, pnum, offset, written);
+		dump_stack();
+		ubi_dump_flash(ubi, pnum, offset, len);
 	} else
 		ubi_assert(written == len);
 
 	if (!err) {
-		err = ubi_dbg_check_write(ubi, buf, pnum, offset, len);
+		err = self_check_write(ubi, buf, pnum, offset, len);
 		if (err)
 			return err;
 
@@ -310,7 +301,7 @@
 		offset += len;
 		len = ubi->peb_size - offset;
 		if (len)
-			err = ubi_dbg_check_all_ff(ubi, pnum, offset, len);
+			err = ubi_self_check_all_ff(ubi, pnum, offset, len);
 	}
 
 	return err;
@@ -364,13 +355,13 @@
 	err = ubi->mtd->erase(ubi->mtd, &ei);
 	if (err) {
 		if (retries++ < UBI_IO_RETRIES) {
-			dbg_io("error %d while erasing PEB %d, retry",
-			       err, pnum);
+			ubi_warn("error %d while erasing PEB %d, retry",
+				 err, pnum);
 			yield();
 			goto retry;
 		}
 		ubi_err("cannot erase PEB %d, error %d", pnum, err);
-		ubi_dbg_dump_stack();
+		dump_stack();
 		return err;
 	}
 
@@ -383,21 +374,21 @@
 
 	if (ei.state == MTD_ERASE_FAILED) {
 		if (retries++ < UBI_IO_RETRIES) {
-			dbg_io("error while erasing PEB %d, retry", pnum);
+			ubi_warn("error while erasing PEB %d, retry", pnum);
 			yield();
 			goto retry;
 		}
 		ubi_err("cannot erase PEB %d", pnum);
-		ubi_dbg_dump_stack();
+		dump_stack();
 		return -EIO;
 	}
 
-	err = ubi_dbg_check_all_ff(ubi, pnum, 0, ubi->peb_size);
+	err = ubi_self_check_all_ff(ubi, pnum, 0, ubi->peb_size);
 	if (err)
 		return err;
 
 	if (ubi_dbg_is_erase_failure(ubi)) {
-		dbg_err("cannot erase PEB %d (emulated)", pnum);
+		ubi_err("cannot erase PEB %d (emulated)", pnum);
 		return -EIO;
 	}
 
@@ -431,11 +422,11 @@
 			goto out;
 
 		/* Make sure the PEB contains only 0xFF bytes */
-		err = ubi_io_read(ubi, ubi->peb_buf1, pnum, 0, ubi->peb_size);
+		err = ubi_io_read(ubi, ubi->peb_buf, pnum, 0, ubi->peb_size);
 		if (err)
 			goto out;
 
-		err = ubi_check_pattern(ubi->peb_buf1, 0xFF, ubi->peb_size);
+		err = ubi_check_pattern(ubi->peb_buf, 0xFF, ubi->peb_size);
 		if (err == 0) {
 			ubi_err("erased PEB %d, but a non-0xFF byte found",
 				pnum);
@@ -444,17 +435,17 @@
 		}
 
 		/* Write a pattern and check it */
-		memset(ubi->peb_buf1, patterns[i], ubi->peb_size);
-		err = ubi_io_write(ubi, ubi->peb_buf1, pnum, 0, ubi->peb_size);
+		memset(ubi->peb_buf, patterns[i], ubi->peb_size);
+		err = ubi_io_write(ubi, ubi->peb_buf, pnum, 0, ubi->peb_size);
 		if (err)
 			goto out;
 
-		memset(ubi->peb_buf1, ~patterns[i], ubi->peb_size);
-		err = ubi_io_read(ubi, ubi->peb_buf1, pnum, 0, ubi->peb_size);
+		memset(ubi->peb_buf, ~patterns[i], ubi->peb_size);
+		err = ubi_io_read(ubi, ubi->peb_buf, pnum, 0, ubi->peb_size);
 		if (err)
 			goto out;
 
-		err = ubi_check_pattern(ubi->peb_buf1, patterns[i],
+		err = ubi_check_pattern(ubi->peb_buf, patterns[i],
 					ubi->peb_size);
 		if (err == 0) {
 			ubi_err("pattern %x checking failed for PEB %d",
@@ -521,8 +512,7 @@
 	 * It is important to first invalidate the EC header, and then the VID
 	 * header. Otherwise a power cut may lead to valid EC header and
 	 * invalid VID header, in which case UBI will treat this PEB as
-	 * corrupted and will try to preserve it, and print scary warnings (see
-	 * the header comment in scan.c for more information).
+	 * corrupted and will try to preserve it, and print scary warnings.
 	 */
 	addr = (loff_t)pnum * ubi->peb_size;
 	err = ubi->mtd->write(ubi->mtd, addr, 4, &written, (void *)&data);
@@ -564,7 +554,7 @@
 	 */
 	ubi_err("cannot invalidate PEB %d, write returned %d read returned %d",
 		pnum, err, err1);
-	ubi_dbg_dump_flash(ubi, pnum, 0, ubi->peb_size);
+	ubi_dump_flash(ubi, pnum, 0, ubi->peb_size);
 	return -EIO;
 }
 
@@ -590,7 +580,7 @@
 
 	ubi_assert(pnum >= 0 && pnum < ubi->peb_count);
 
-	err = paranoid_check_not_bad(ubi, pnum);
+	err = self_check_not_bad(ubi, pnum);
 	if (err != 0)
 		return err;
 
@@ -695,8 +685,7 @@
 	leb_start = be32_to_cpu(ec_hdr->data_offset);
 
 	if (ec_hdr->version != UBI_VERSION) {
-		ubi_err("node with incompatible UBI version found: "
-			"this UBI version is %d, image version is %d",
+		ubi_err("node with incompatible UBI version found: this UBI version is %d, image version is %d",
 			UBI_VERSION, (int)ec_hdr->version);
 		goto bad;
 	}
@@ -722,8 +711,8 @@
 
 bad:
 	ubi_err("bad EC header");
-	ubi_dbg_dump_ec_hdr(ec_hdr);
-	ubi_dbg_dump_stack();
+	ubi_dump_ec_hdr(ec_hdr);
+	dump_stack();
 	return 1;
 }
 
@@ -787,10 +776,10 @@
 		if (ubi_check_pattern(ec_hdr, 0xFF, UBI_EC_HDR_SIZE)) {
 			/* The physical eraseblock is supposedly empty */
 			if (verbose)
-				ubi_warn("no EC header found at PEB %d, "
-					 "only 0xFF bytes", pnum);
-			dbg_bld("no EC header found at PEB %d, "
-				"only 0xFF bytes", pnum);
+				ubi_warn("no EC header found at PEB %d, only 0xFF bytes",
+					 pnum);
+			dbg_bld("no EC header found at PEB %d, only 0xFF bytes",
+				pnum);
 			if (!read_err)
 				return UBI_IO_FF;
 			else
@@ -802,12 +791,12 @@
 		 * 0xFF bytes. Report that the header is corrupted.
 		 */
 		if (verbose) {
-			ubi_warn("bad magic number at PEB %d: %08x instead of "
-				 "%08x", pnum, magic, UBI_EC_HDR_MAGIC);
-			ubi_dbg_dump_ec_hdr(ec_hdr);
+			ubi_warn("bad magic number at PEB %d: %08x instead of %08x",
+				 pnum, magic, UBI_EC_HDR_MAGIC);
+			ubi_dump_ec_hdr(ec_hdr);
 		}
-		dbg_bld("bad magic number at PEB %d: %08x instead of "
-			"%08x", pnum, magic, UBI_EC_HDR_MAGIC);
+		dbg_bld("bad magic number at PEB %d: %08x instead of %08x",
+			pnum, magic, UBI_EC_HDR_MAGIC);
 		return UBI_IO_BAD_HDR;
 	}
 
@@ -816,12 +805,12 @@
 
 	if (hdr_crc != crc) {
 		if (verbose) {
-			ubi_warn("bad EC header CRC at PEB %d, calculated "
-				 "%#08x, read %#08x", pnum, crc, hdr_crc);
-			ubi_dbg_dump_ec_hdr(ec_hdr);
+			ubi_warn("bad EC header CRC at PEB %d, calculated %#08x, read %#08x",
+				 pnum, crc, hdr_crc);
+			ubi_dump_ec_hdr(ec_hdr);
 		}
-		dbg_bld("bad EC header CRC at PEB %d, calculated "
-			"%#08x, read %#08x", pnum, crc, hdr_crc);
+		dbg_bld("bad EC header CRC at PEB %d, calculated %#08x, read %#08x",
+			pnum, crc, hdr_crc);
 
 		if (!read_err)
 			return UBI_IO_BAD_HDR;
@@ -875,7 +864,7 @@
 	crc = crc32(UBI_CRC32_INIT, ec_hdr, UBI_EC_HDR_SIZE_CRC);
 	ec_hdr->hdr_crc = cpu_to_be32(crc);
 
-	err = paranoid_check_ec_hdr(ubi, pnum, ec_hdr);
+	err = self_check_ec_hdr(ubi, pnum, ec_hdr);
 	if (err)
 		return err;
 
@@ -906,40 +895,40 @@
 	int usable_leb_size = ubi->leb_size - data_pad;
 
 	if (copy_flag != 0 && copy_flag != 1) {
-		dbg_err("bad copy_flag");
+		ubi_err("bad copy_flag");
 		goto bad;
 	}
 
 	if (vol_id < 0 || lnum < 0 || data_size < 0 || used_ebs < 0 ||
 	    data_pad < 0) {
-		dbg_err("negative values");
+		ubi_err("negative values");
 		goto bad;
 	}
 
 	if (vol_id >= UBI_MAX_VOLUMES && vol_id < UBI_INTERNAL_VOL_START) {
-		dbg_err("bad vol_id");
+		ubi_err("bad vol_id");
 		goto bad;
 	}
 
 	if (vol_id < UBI_INTERNAL_VOL_START && compat != 0) {
-		dbg_err("bad compat");
+		ubi_err("bad compat");
 		goto bad;
 	}
 
 	if (vol_id >= UBI_INTERNAL_VOL_START && compat != UBI_COMPAT_DELETE &&
 	    compat != UBI_COMPAT_RO && compat != UBI_COMPAT_PRESERVE &&
 	    compat != UBI_COMPAT_REJECT) {
-		dbg_err("bad compat");
+		ubi_err("bad compat");
 		goto bad;
 	}
 
 	if (vol_type != UBI_VID_DYNAMIC && vol_type != UBI_VID_STATIC) {
-		dbg_err("bad vol_type");
+		ubi_err("bad vol_type");
 		goto bad;
 	}
 
 	if (data_pad >= ubi->leb_size / 2) {
-		dbg_err("bad data_pad");
+		ubi_err("bad data_pad");
 		goto bad;
 	}
 
@@ -951,45 +940,45 @@
 		 * mapped logical eraseblocks.
 		 */
 		if (used_ebs == 0) {
-			dbg_err("zero used_ebs");
+			ubi_err("zero used_ebs");
 			goto bad;
 		}
 		if (data_size == 0) {
-			dbg_err("zero data_size");
+			ubi_err("zero data_size");
 			goto bad;
 		}
 		if (lnum < used_ebs - 1) {
 			if (data_size != usable_leb_size) {
-				dbg_err("bad data_size");
+				ubi_err("bad data_size");
 				goto bad;
 			}
 		} else if (lnum == used_ebs - 1) {
 			if (data_size == 0) {
-				dbg_err("bad data_size at last LEB");
+				ubi_err("bad data_size at last LEB");
 				goto bad;
 			}
 		} else {
-			dbg_err("too high lnum");
+			ubi_err("too high lnum");
 			goto bad;
 		}
 	} else {
 		if (copy_flag == 0) {
 			if (data_crc != 0) {
-				dbg_err("non-zero data CRC");
+				ubi_err("non-zero data CRC");
 				goto bad;
 			}
 			if (data_size != 0) {
-				dbg_err("non-zero data_size");
+				ubi_err("non-zero data_size");
 				goto bad;
 			}
 		} else {
 			if (data_size == 0) {
-				dbg_err("zero data_size of copy");
+				ubi_err("zero data_size of copy");
 				goto bad;
 			}
 		}
 		if (used_ebs != 0) {
-			dbg_err("bad used_ebs");
+			ubi_err("bad used_ebs");
 			goto bad;
 		}
 	}
@@ -998,8 +987,8 @@
 
 bad:
 	ubi_err("bad VID header");
-	ubi_dbg_dump_vid_hdr(vid_hdr);
-	ubi_dbg_dump_stack();
+	ubi_dump_vid_hdr(vid_hdr);
+	dump_stack();
 	return 1;
 }
 
@@ -1042,10 +1031,10 @@
 
 		if (ubi_check_pattern(vid_hdr, 0xFF, UBI_VID_HDR_SIZE)) {
 			if (verbose)
-				ubi_warn("no VID header found at PEB %d, "
-					 "only 0xFF bytes", pnum);
-			dbg_bld("no VID header found at PEB %d, "
-				"only 0xFF bytes", pnum);
+				ubi_warn("no VID header found at PEB %d, only 0xFF bytes",
+					 pnum);
+			dbg_bld("no VID header found at PEB %d, only 0xFF bytes",
+				pnum);
 			if (!read_err)
 				return UBI_IO_FF;
 			else
@@ -1053,12 +1042,12 @@
 		}
 
 		if (verbose) {
-			ubi_warn("bad magic number at PEB %d: %08x instead of "
-				 "%08x", pnum, magic, UBI_VID_HDR_MAGIC);
-			ubi_dbg_dump_vid_hdr(vid_hdr);
+			ubi_warn("bad magic number at PEB %d: %08x instead of %08x",
+				 pnum, magic, UBI_VID_HDR_MAGIC);
+			ubi_dump_vid_hdr(vid_hdr);
 		}
-		dbg_bld("bad magic number at PEB %d: %08x instead of "
-			"%08x", pnum, magic, UBI_VID_HDR_MAGIC);
+		dbg_bld("bad magic number at PEB %d: %08x instead of %08x",
+			pnum, magic, UBI_VID_HDR_MAGIC);
 		return UBI_IO_BAD_HDR;
 	}
 
@@ -1067,12 +1056,12 @@
 
 	if (hdr_crc != crc) {
 		if (verbose) {
-			ubi_warn("bad CRC at PEB %d, calculated %#08x, "
-				 "read %#08x", pnum, crc, hdr_crc);
-			ubi_dbg_dump_vid_hdr(vid_hdr);
+			ubi_warn("bad CRC at PEB %d, calculated %#08x, read %#08x",
+				 pnum, crc, hdr_crc);
+			ubi_dump_vid_hdr(vid_hdr);
 		}
-		dbg_bld("bad CRC at PEB %d, calculated %#08x, "
-			"read %#08x", pnum, crc, hdr_crc);
+		dbg_bld("bad CRC at PEB %d, calculated %#08x, read %#08x",
+			pnum, crc, hdr_crc);
 		if (!read_err)
 			return UBI_IO_BAD_HDR;
 		else
@@ -1113,7 +1102,7 @@
 	dbg_io("write VID header to PEB %d", pnum);
 	ubi_assert(pnum >= 0 &&  pnum < ubi->peb_count);
 
-	err = paranoid_check_peb_ec_hdr(ubi, pnum);
+	err = self_check_peb_ec_hdr(ubi, pnum);
 	if (err)
 		return err;
 
@@ -1122,7 +1111,7 @@
 	crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_VID_HDR_SIZE_CRC);
 	vid_hdr->hdr_crc = cpu_to_be32(crc);
 
-	err = paranoid_check_vid_hdr(ubi, pnum, vid_hdr);
+	err = self_check_vid_hdr(ubi, pnum, vid_hdr);
 	if (err)
 		return err;
 
@@ -1132,34 +1121,32 @@
 	return err;
 }
 
-#ifdef CONFIG_MTD_UBI_DEBUG
-
 /**
- * paranoid_check_not_bad - ensure that a physical eraseblock is not bad.
+ * self_check_not_bad - ensure that a physical eraseblock is not bad.
  * @ubi: UBI device description object
  * @pnum: physical eraseblock number to check
  *
  * This function returns zero if the physical eraseblock is good, %-EINVAL if
  * it is bad and a negative error code if an error occurred.
  */
-static int paranoid_check_not_bad(const struct ubi_device *ubi, int pnum)
+static int self_check_not_bad(const struct ubi_device *ubi, int pnum)
 {
 	int err;
 
-	if (!ubi->dbg->chk_io)
+	if (!ubi_dbg_chk_io(ubi))
 		return 0;
 
 	err = ubi_io_is_bad(ubi, pnum);
 	if (!err)
 		return err;
 
-	ubi_err("paranoid check failed for PEB %d", pnum);
-	ubi_dbg_dump_stack();
+	ubi_err("self-check failed for PEB %d", pnum);
+	dump_stack();
 	return err > 0 ? -EINVAL : err;
 }
 
 /**
- * paranoid_check_ec_hdr - check if an erase counter header is all right.
+ * self_check_ec_hdr - check if an erase counter header is all right.
  * @ubi: UBI device description object
  * @pnum: physical eraseblock number the erase counter header belongs to
  * @ec_hdr: the erase counter header to check
@@ -1167,13 +1154,13 @@
  * This function returns zero if the erase counter header contains valid
  * values, and %-EINVAL if not.
  */
-static int paranoid_check_ec_hdr(const struct ubi_device *ubi, int pnum,
-				 const struct ubi_ec_hdr *ec_hdr)
+static int self_check_ec_hdr(const struct ubi_device *ubi, int pnum,
+			     const struct ubi_ec_hdr *ec_hdr)
 {
 	int err;
 	uint32_t magic;
 
-	if (!ubi->dbg->chk_io)
+	if (!ubi_dbg_chk_io(ubi))
 		return 0;
 
 	magic = be32_to_cpu(ec_hdr->magic);
@@ -1185,33 +1172,33 @@
 
 	err = validate_ec_hdr(ubi, ec_hdr);
 	if (err) {
-		ubi_err("paranoid check failed for PEB %d", pnum);
+		ubi_err("self-check failed for PEB %d", pnum);
 		goto fail;
 	}
 
 	return 0;
 
 fail:
-	ubi_dbg_dump_ec_hdr(ec_hdr);
-	ubi_dbg_dump_stack();
+	ubi_dump_ec_hdr(ec_hdr);
+	dump_stack();
 	return -EINVAL;
 }
 
 /**
- * paranoid_check_peb_ec_hdr - check erase counter header.
+ * self_check_peb_ec_hdr - check erase counter header.
  * @ubi: UBI device description object
  * @pnum: the physical eraseblock number to check
  *
  * This function returns zero if the erase counter header is all right and and
  * a negative error code if not or if an error occurred.
  */
-static int paranoid_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum)
+static int self_check_peb_ec_hdr(const struct ubi_device *ubi, int pnum)
 {
 	int err;
 	uint32_t crc, hdr_crc;
 	struct ubi_ec_hdr *ec_hdr;
 
-	if (!ubi->dbg->chk_io)
+	if (!ubi_dbg_chk_io(ubi))
 		return 0;
 
 	ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_NOFS);
@@ -1226,14 +1213,14 @@
 	hdr_crc = be32_to_cpu(ec_hdr->hdr_crc);
 	if (hdr_crc != crc) {
 		ubi_err("bad CRC, calculated %#08x, read %#08x", crc, hdr_crc);
-		ubi_err("paranoid check failed for PEB %d", pnum);
-		ubi_dbg_dump_ec_hdr(ec_hdr);
-		ubi_dbg_dump_stack();
+		ubi_err("self-check failed for PEB %d", pnum);
+		ubi_dump_ec_hdr(ec_hdr);
+		dump_stack();
 		err = -EINVAL;
 		goto exit;
 	}
 
-	err = paranoid_check_ec_hdr(ubi, pnum, ec_hdr);
+	err = self_check_ec_hdr(ubi, pnum, ec_hdr);
 
 exit:
 	kfree(ec_hdr);
@@ -1241,7 +1228,7 @@
 }
 
 /**
- * paranoid_check_vid_hdr - check that a volume identifier header is all right.
+ * self_check_vid_hdr - check that a volume identifier header is all right.
  * @ubi: UBI device description object
  * @pnum: physical eraseblock number the volume identifier header belongs to
  * @vid_hdr: the volume identifier header to check
@@ -1249,13 +1236,13 @@
  * This function returns zero if the volume identifier header is all right, and
  * %-EINVAL if not.
  */
-static int paranoid_check_vid_hdr(const struct ubi_device *ubi, int pnum,
-				  const struct ubi_vid_hdr *vid_hdr)
+static int self_check_vid_hdr(const struct ubi_device *ubi, int pnum,
+			      const struct ubi_vid_hdr *vid_hdr)
 {
 	int err;
 	uint32_t magic;
 
-	if (!ubi->dbg->chk_io)
+	if (!ubi_dbg_chk_io(ubi))
 		return 0;
 
 	magic = be32_to_cpu(vid_hdr->magic);
@@ -1267,36 +1254,36 @@
 
 	err = validate_vid_hdr(ubi, vid_hdr);
 	if (err) {
-		ubi_err("paranoid check failed for PEB %d", pnum);
+		ubi_err("self-check failed for PEB %d", pnum);
 		goto fail;
 	}
 
 	return err;
 
 fail:
-	ubi_err("paranoid check failed for PEB %d", pnum);
-	ubi_dbg_dump_vid_hdr(vid_hdr);
-	ubi_dbg_dump_stack();
+	ubi_err("self-check failed for PEB %d", pnum);
+	ubi_dump_vid_hdr(vid_hdr);
+	dump_stack();
 	return -EINVAL;
 
 }
 
 /**
- * paranoid_check_peb_vid_hdr - check volume identifier header.
+ * self_check_peb_vid_hdr - check volume identifier header.
  * @ubi: UBI device description object
  * @pnum: the physical eraseblock number to check
  *
  * This function returns zero if the volume identifier header is all right,
  * and a negative error code if not or if an error occurred.
  */
-static int paranoid_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum)
+static int self_check_peb_vid_hdr(const struct ubi_device *ubi, int pnum)
 {
 	int err;
 	uint32_t crc, hdr_crc;
 	struct ubi_vid_hdr *vid_hdr;
 	void *p;
 
-	if (!ubi->dbg->chk_io)
+	if (!ubi_dbg_chk_io(ubi))
 		return 0;
 
 	vid_hdr = ubi_zalloc_vid_hdr(ubi, GFP_NOFS);
@@ -1312,16 +1299,16 @@
 	crc = crc32(UBI_CRC32_INIT, vid_hdr, UBI_EC_HDR_SIZE_CRC);
 	hdr_crc = be32_to_cpu(vid_hdr->hdr_crc);
 	if (hdr_crc != crc) {
-		ubi_err("bad VID header CRC at PEB %d, calculated %#08x, "
-			"read %#08x", pnum, crc, hdr_crc);
-		ubi_err("paranoid check failed for PEB %d", pnum);
-		ubi_dbg_dump_vid_hdr(vid_hdr);
-		ubi_dbg_dump_stack();
+		ubi_err("bad VID header CRC at PEB %d, calculated %#08x, read %#08x",
+			pnum, crc, hdr_crc);
+		ubi_err("self-check failed for PEB %d", pnum);
+		ubi_dump_vid_hdr(vid_hdr);
+		dump_stack();
 		err = -EINVAL;
 		goto exit;
 	}
 
-	err = paranoid_check_vid_hdr(ubi, pnum, vid_hdr);
+	err = self_check_vid_hdr(ubi, pnum, vid_hdr);
 
 exit:
 	ubi_free_vid_hdr(ubi, vid_hdr);
@@ -1329,7 +1316,7 @@
 }
 
 /**
- * ubi_dbg_check_write - make sure write succeeded.
+ * self_check_write - make sure write succeeded.
  * @ubi: UBI device description object
  * @buf: buffer with data which were written
  * @pnum: physical eraseblock number the data were written to
@@ -1340,15 +1327,15 @@
  * the original data buffer - the data have to match. Returns zero if the data
  * match and a negative error code if not or in case of failure.
  */
-int ubi_dbg_check_write(struct ubi_device *ubi, const void *buf, int pnum,
-			int offset, int len)
+static int self_check_write(struct ubi_device *ubi, const void *buf, int pnum,
+			    int offset, int len)
 {
 	int err, i;
 	size_t read;
 	void *buf1;
 	loff_t addr = (loff_t)pnum * ubi->peb_size + offset;
 
-	if (!ubi->dbg->chk_io)
+	if (!ubi_dbg_chk_io(ubi))
 		return 0;
 
 	buf1 = __vmalloc(len, GFP_NOFS, PAGE_KERNEL);
@@ -1369,7 +1356,7 @@
 		if (c == c1)
 			continue;
 
-		ubi_err("paranoid check failed for PEB %d:%d, len %d",
+		ubi_err("self-check failed for PEB %d:%d, len %d",
 			pnum, offset, len);
 		ubi_msg("data differ at position %d", i);
 		dump_len = max_t(int, 128, len - i);
@@ -1381,7 +1368,7 @@
 			i, i + dump_len);
 		print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1,
 			       buf1 + i, dump_len, 1);
-		ubi_dbg_dump_stack();
+		dump_stack();
 		err = -EINVAL;
 		goto out_free;
 	}
@@ -1395,7 +1382,7 @@
 }
 
 /**
- * ubi_dbg_check_all_ff - check that a region of flash is empty.
+ * ubi_self_check_all_ff - check that a region of flash is empty.
  * @ubi: UBI device description object
  * @pnum: the physical eraseblock number to check
  * @offset: the starting offset within the physical eraseblock to check
@@ -1405,14 +1392,14 @@
  * @offset of the physical eraseblock @pnum, and a negative error code if not
  * or if an error occurred.
  */
-int ubi_dbg_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len)
+int ubi_self_check_all_ff(struct ubi_device *ubi, int pnum, int offset, int len)
 {
 	size_t read;
 	int err;
 	void *buf;
 	loff_t addr = (loff_t)pnum * ubi->peb_size + offset;
 
-	if (!ubi->dbg->chk_io)
+	if (!ubi_dbg_chk_io(ubi))
 		return 0;
 
 	buf = __vmalloc(len, GFP_NOFS, PAGE_KERNEL);
@@ -1423,15 +1410,15 @@
 
 	err = ubi->mtd->read(ubi->mtd, addr, len, &read, buf);
 	if (err && !mtd_is_bitflip(err)) {
-		ubi_err("error %d while reading %d bytes from PEB %d:%d, "
-			"read %zd bytes", err, len, pnum, offset, read);
+		ubi_err("error %d while reading %d bytes from PEB %d:%d, read %zd bytes",
+			err, len, pnum, offset, read);
 		goto error;
 	}
 
 	err = ubi_check_pattern(buf, 0xFF, len);
 	if (err == 0) {
-		ubi_err("flash region at PEB %d:%d, length %d does not "
-			"contain all 0xFF bytes", pnum, offset, len);
+		ubi_err("flash region at PEB %d:%d, length %d does not contain all 0xFF bytes",
+			pnum, offset, len);
 		goto fail;
 	}
 
@@ -1439,14 +1426,12 @@
 	return 0;
 
 fail:
-	ubi_err("paranoid check failed for PEB %d", pnum);
+	ubi_err("self-check failed for PEB %d", pnum);
 	ubi_msg("hex dump of the %d-%d region", offset, offset + len);
 	print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1, buf, len, 1);
 	err = -EINVAL;
 error:
-	ubi_dbg_dump_stack();
+	dump_stack();
 	vfree(buf);
 	return err;
 }
-
-#endif /* CONFIG_MTD_UBI_DEBUG */
diff -ur a/drivers/mtd/ubi/kapi.c b/drivers/mtd/ubi/kapi.c
--- a/drivers/mtd/ubi/kapi.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/kapi.c	2014-02-17 11:57:40.000000000 +0100
@@ -221,7 +221,7 @@
 	kfree(desc);
 out_put_ubi:
 	ubi_put_device(ubi);
-	dbg_err("cannot open device %d, volume %d, error %d",
+	ubi_err("cannot open device %d, volume %d, error %d",
 		ubi_num, vol_id, err);
 	return ERR_PTR(err);
 }
@@ -426,11 +426,9 @@
  * @buf: data to write
  * @offset: offset within the logical eraseblock where to write
  * @len: how many bytes to write
- * @dtype: expected data type
  *
  * This function writes @len bytes of data from @buf to offset @offset of
- * logical eraseblock @lnum. The @dtype argument describes expected lifetime of
- * the data.
+ * logical eraseblock @lnum.
  *
  * This function takes care of physical eraseblock write failures. If write to
  * the physical eraseblock write operation fails, the logical eraseblock is
@@ -447,7 +445,7 @@
  * returns immediately with %-EBADF code.
  */
 int ubi_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
-		  int offset, int len, int dtype)
+		  int offset, int len)
 {
 	struct ubi_volume *vol = desc->vol;
 	struct ubi_device *ubi = vol->ubi;
@@ -466,17 +464,13 @@
 	    offset & (ubi->min_io_size - 1) || len & (ubi->min_io_size - 1))
 		return -EINVAL;
 
-	if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM &&
-	    dtype != UBI_UNKNOWN)
-		return -EINVAL;
-
 	if (vol->upd_marker)
 		return -EBADF;
 
 	if (len == 0)
 		return 0;
 
-	return ubi_eba_write_leb(ubi, vol, lnum, buf, offset, len, dtype);
+	return ubi_eba_write_leb(ubi, vol, lnum, buf, offset, len);
 }
 EXPORT_SYMBOL_GPL(ubi_leb_write);
 
@@ -486,7 +480,6 @@
  * @lnum: logical eraseblock number to change
  * @buf: data to write
  * @len: how many bytes to write
- * @dtype: expected data type
  *
  * This function changes the contents of a logical eraseblock atomically. @buf
  * has to contain new logical eraseblock data, and @len - the length of the
@@ -497,7 +490,7 @@
  * code in case of failure.
  */
 int ubi_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
-		   int len, int dtype)
+		   int len)
 {
 	struct ubi_volume *vol = desc->vol;
 	struct ubi_device *ubi = vol->ubi;
@@ -515,17 +508,13 @@
 	    len > vol->usable_leb_size || len & (ubi->min_io_size - 1))
 		return -EINVAL;
 
-	if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM &&
-	    dtype != UBI_UNKNOWN)
-		return -EINVAL;
-
 	if (vol->upd_marker)
 		return -EBADF;
 
 	if (len == 0)
 		return 0;
 
-	return ubi_eba_atomic_leb_change(ubi, vol, lnum, buf, len, dtype);
+	return ubi_eba_atomic_leb_change(ubi, vol, lnum, buf, len);
 }
 EXPORT_SYMBOL_GPL(ubi_leb_change);
 
@@ -562,7 +551,7 @@
 	if (err)
 		return err;
 
-	return ubi_wl_flush(ubi);
+	return ubi_wl_flush(ubi, vol->vol_id, lnum);
 }
 EXPORT_SYMBOL_GPL(ubi_leb_erase);
 
@@ -626,7 +615,6 @@
  * ubi_leb_map - map logical eraseblock to a physical eraseblock.
  * @desc: volume descriptor
  * @lnum: logical eraseblock number
- * @dtype: expected data type
  *
  * This function maps an un-mapped logical eraseblock @lnum to a physical
  * eraseblock. This means, that after a successful invocation of this
@@ -639,7 +627,7 @@
  * eraseblock is already mapped, and other negative error codes in case of
  * other failures.
  */
-int ubi_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
+int ubi_leb_map(struct ubi_volume_desc *desc, int lnum)
 {
 	struct ubi_volume *vol = desc->vol;
 	struct ubi_device *ubi = vol->ubi;
@@ -652,17 +640,13 @@
 	if (lnum < 0 || lnum >= vol->reserved_pebs)
 		return -EINVAL;
 
-	if (dtype != UBI_LONGTERM && dtype != UBI_SHORTTERM &&
-	    dtype != UBI_UNKNOWN)
-		return -EINVAL;
-
 	if (vol->upd_marker)
 		return -EBADF;
 
 	if (vol->eba_tbl[lnum] >= 0)
 		return -EBADMSG;
 
-	return ubi_eba_write_leb(ubi, vol, lnum, NULL, 0, 0, dtype);
+	return ubi_eba_write_leb(ubi, vol, lnum, NULL, 0, 0);
 }
 EXPORT_SYMBOL_GPL(ubi_leb_map);
 
@@ -722,6 +706,33 @@
 }
 EXPORT_SYMBOL_GPL(ubi_sync);
 
+/**
+ * ubi_flush - flush UBI work queue.
+ * @ubi_num: UBI device to flush work queue
+ * @vol_id: volume id to flush for
+ * @lnum: logical eraseblock number to flush for
+ *
+ * This function executes all pending works for a particular volume id / logical
+ * eraseblock number pair. If either value is set to %UBI_ALL, then it acts as
+ * a wildcard for all of the corresponding volume numbers or logical
+ * eraseblock numbers. It returns zero in case of success and a negative error
+ * code in case of failure.
+ */
+int ubi_flush(int ubi_num, int vol_id, int lnum)
+{
+	struct ubi_device *ubi;
+	int err = 0;
+
+	ubi = ubi_get_device(ubi_num);
+	if (!ubi)
+		return -ENODEV;
+
+	err = ubi_wl_flush(ubi, vol_id, lnum);
+	ubi_put_device(ubi);
+	return err;
+}
+EXPORT_SYMBOL_GPL(ubi_flush);
+
 BLOCKING_NOTIFIER_HEAD(ubi_notifiers);
 
 /**
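With the @dtype argument removed throughout kapi.c, in-kernel UBI clients now call the volume API with one parameter less, and the new ubi_flush() lets them wait for queued erase work on a specific LEB. A rough sketch of a hypothetical caller using the updated signatures (desc, buf, len, ubi_num, vol_id and lnum are assumed to be set up elsewhere):

	err = ubi_leb_write(desc, lnum, buf, 0, len);	/* no dtype argument */
	if (err)
		return err;

	err = ubi_leb_map(desc, lnum + 1);		/* likewise */
	if (err && err != -EBADMSG)			/* already mapped is not fatal here */
		return err;

	/* Wait for queued erasures; UBI_ALL acts as a wildcard for either value */
	return ubi_flush(ubi_num, vol_id, lnum);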
diff -ur a/drivers/mtd/ubi/Kconfig b/drivers/mtd/ubi/Kconfig
--- a/drivers/mtd/ubi/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/mtd/ubi/Kconfig	2014-01-21 09:37:11.000000000 +0100
@@ -27,20 +27,55 @@
 	  life-cycle less than 10000, the threshold should be lessened (e.g.,
 	  to 128 or 256, although it does not have to be power of 2).
 
-config MTD_UBI_BEB_RESERVE
-	int "Percentage of reserved eraseblocks for bad eraseblocks handling"
-	default 1
-	range 0 25
+config MTD_UBI_BEB_LIMIT
+	int "Maximum expected bad eraseblock count per 1024 eraseblocks"
+	default 20
+	range 0 768
 	help
-	  If the MTD device admits of bad eraseblocks (e.g. NAND flash), UBI
-	  reserves some amount of physical eraseblocks to handle new bad
-	  eraseblocks. For example, if a flash physical eraseblock becomes bad,
-	  UBI uses these reserved physical eraseblocks to relocate the bad one.
-	  This option specifies how many physical eraseblocks will be reserved
-	  for bad eraseblock handling (percents of total number of good flash
-	  eraseblocks). If the underlying flash does not admit of bad
-	  eraseblocks (e.g. NOR flash), this value is ignored and nothing is
-	  reserved. Leave the default value if unsure.
+	  This option specifies the maximum bad physical eraseblocks UBI
+	  expects on the MTD device (per 1024 eraseblocks). If the underlying
+	  flash does not admit of bad eraseblocks (e.g. NOR flash), this value
+	  is ignored.
+
+	  NAND datasheets often specify the minimum and maximum NVB (Number of
+	  Valid Blocks) for the flashes' endurance lifetime. The maximum
+	  expected bad eraseblocks per 1024 eraseblocks then can be calculated
+	  as "1024 * (1 - MinNVB / MaxNVB)", which gives 20 for most NANDs
+	  (MaxNVB is basically the total count of eraseblocks on the chip).
+
+	  To put it differently, if this value is 20, UBI will try to reserve
+	  about 1.9% of physical eraseblocks for bad blocks handling. And that
+	  will be 1.9% of eraseblocks on the entire NAND chip, not just the MTD
+	  partition UBI attaches. This means that if you have, say, a NAND
+	  partition UBI attaches. This means that if you have, say, a NAND
+	  flash chip that admits at most 40 bad eraseblocks, split into two
+	  attaching a partition.
+
+	  This option can be overridden by the "mtd=" UBI module parameter or
+	  by the "attach" ioctl.
+
+	  Leave the default value if unsure.
+
+config MTD_UBI_FASTMAP
+	bool "UBI Fastmap (Experimental feature)"
+	default n
+	help
+	   Important: this feature is experimental so far and the on-flash
+	   format for fastmap may change in future kernel versions.
+
+	   Fastmap is a mechanism which allows attaching an UBI device
+	   in nearly constant time. Instead of scanning the whole MTD device it
+	   only has to locate a checkpoint (called fastmap) on the device.
+	   The on-flash fastmap contains all information needed to attach
+	   the device. Using fastmap only makes sense on large devices where
+	   attaching by scanning takes a long time. UBI will not automatically
+	   install a fastmap on old images, but you can set the UBI module
+	   parameter fm_autoconvert to 1 if you want it to. Fastmap-enabled
+	   images are still usable with UBI implementations without
+	   fastmap support. On typical flash devices the whole fastmap fits
+	   into one PEB. UBI will reserve PEBs to hold two fastmaps.
+
+	   If in doubt, say "N".
 
 config MTD_UBI_GLUEBI
 	tristate "MTD devices emulation driver (gluebi)"
@@ -52,12 +87,4 @@
 	   work on top of UBI. Do not enable this unless you use legacy
 	   software.
 
-config MTD_UBI_DEBUG
-	bool "UBI debugging"
-	depends on SYSFS
-	select DEBUG_FS
-	select KALLSYMS
-	help
-	  This option enables UBI debugging.
-
 endif # MTD_UBI
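To make the "1024 * (1 - MinNVB / MaxNVB)" rule in the MTD_UBI_BEB_LIMIT help text concrete, here is a standalone sketch with hypothetical datasheet numbers; it is not part of the patch:

#include <stdio.h>

int main(void)
{
	int chip_pebs = 2048;	/* eraseblocks on the whole NAND chip        */
	int min_nvb   = 2008;	/* minimum Number of Valid Blocks guaranteed */
	/* 1024 * (1 - MinNVB / MaxNVB), computed in integer arithmetic */
	int limit = 1024 * (chip_pebs - min_nvb) / chip_pebs;		/* 20 */
	/*
	 * The reservation is based on the whole chip, even when UBI only
	 * attaches a partition covering part of it (see the help text above).
	 */
	int reserved = chip_pebs * limit / 1024;			/* 40 */

	printf("bad PEB limit per 1024: %d, reserved PEBs: %d\n",
	       limit, reserved);
	return 0;
}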
diff -ur a/drivers/mtd/ubi/Makefile b/drivers/mtd/ubi/Makefile
--- a/drivers/mtd/ubi/Makefile	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/mtd/ubi/Makefile	2014-01-21 09:37:11.000000000 +0100
@@ -1,7 +1,7 @@
 obj-$(CONFIG_MTD_UBI) += ubi.o
 
-ubi-y += vtbl.o vmt.o upd.o build.o cdev.o kapi.o eba.o io.o wl.o scan.o
-ubi-y += misc.o
+ubi-y += vtbl.o vmt.o upd.o build.o cdev.o kapi.o eba.o io.o wl.o attach.o
+ubi-y += misc.o debug.o
+ubi-$(CONFIG_MTD_UBI_FASTMAP) += fastmap.o
 
-ubi-$(CONFIG_MTD_UBI_DEBUG) += debug.o
 obj-$(CONFIG_MTD_UBI_GLUEBI) += gluebi.o
diff -ur a/drivers/mtd/ubi/misc.c b/drivers/mtd/ubi/misc.c
--- a/drivers/mtd/ubi/misc.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/misc.c	2014-02-17 11:57:40.000000000 +0100
@@ -92,16 +92,45 @@
 }
 
 /**
- * ubi_calculate_rsvd_pool - calculate how many PEBs must be reserved for bad
+ * ubi_update_reserved - update bad eraseblock handling accounting data.
+ * @ubi: UBI device description object
+ *
+ * This function calculates the gap between the current number of PEBs reserved for
+ * bad eraseblock handling and the required level of PEBs that must be
+ * reserved, and if necessary, reserves more PEBs to fill that gap, according
+ * to availability. Should be called with ubi->volumes_lock held.
+ */
+void ubi_update_reserved(struct ubi_device *ubi)
+{
+	int need = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs;
+
+	if (need <= 0 || ubi->avail_pebs == 0)
+		return;
+
+	need = min_t(int, need, ubi->avail_pebs);
+	ubi->avail_pebs -= need;
+	ubi->rsvd_pebs += need;
+	ubi->beb_rsvd_pebs += need;
+	ubi_msg("reserved more %d PEBs for bad PEB handling", need);
+}
+
+/**
+ * ubi_calculate_reserved - calculate how many PEBs must be reserved for bad
  * eraseblock handling.
  * @ubi: UBI device description object
  */
 void ubi_calculate_reserved(struct ubi_device *ubi)
 {
-	ubi->beb_rsvd_level = ubi->good_peb_count/100;
-	ubi->beb_rsvd_level *= CONFIG_MTD_UBI_BEB_RESERVE;
-	if (ubi->beb_rsvd_level < MIN_RESEVED_PEBS)
-		ubi->beb_rsvd_level = MIN_RESEVED_PEBS;
+	/*
+	 * Calculate the actual number of PEBs currently needed to be reserved
+	 * for future bad eraseblock handling.
+	 */
+	ubi->beb_rsvd_level = ubi->bad_peb_limit - ubi->bad_peb_count;
+	if (ubi->beb_rsvd_level < 0) {
+		ubi->beb_rsvd_level = 0;
+		ubi_warn("number of bad PEBs (%d) is above the expected limit (%d), not reserving any PEBs for bad PEB handling, will use available PEBs (if any)",
+			 ubi->bad_peb_count, ubi->bad_peb_limit);
+	}
 }
 
 /**
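Continuing with the numbers from the Kconfig sketch above (hypothetical values, only to illustrate the new accounting): with ubi->bad_peb_limit = 40 and ubi->bad_peb_count = 3, ubi_calculate_reserved() sets beb_rsvd_level to 37. If only 30 PEBs are available at attach time, ubi_eba_init() reserves those 30 and print_rsvd_warning() decides whether that deserves a warning; ubi_update_reserved() later tops the reserve up towards 37 as PEBs become available.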
Only in a/drivers/mtd/ubi: scan.c.
Only in a/drivers/mtd/ubi: scan.h.
diff -ur a/drivers/mtd/ubi/ubi.h b/drivers/mtd/ubi/ubi.h
--- a/drivers/mtd/ubi/ubi.h	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/ubi.h	2014-02-17 11:57:40.000000000 +0100
@@ -43,7 +43,6 @@
 #include <asm/pgtable.h>
 
 #include "ubi-media.h"
-#include "scan.h"
 
 /* Maximum number of supported UBI devices */
 #define UBI_MAX_DEVICES 32
@@ -52,21 +51,21 @@
 #define UBI_NAME_STR "ubi"
 
 /* Normal UBI messages */
-#define ubi_msg(fmt, ...) printk(KERN_NOTICE "UBI: " fmt "\n", ##__VA_ARGS__)
+#define ubi_msg(fmt, ...) pr_notice("UBI: " fmt "\n", ##__VA_ARGS__)
 /* UBI warning messages */
-#define ubi_warn(fmt, ...) printk(KERN_WARNING "UBI warning: %s: " fmt "\n", \
-				  __func__, ##__VA_ARGS__)
+#define ubi_warn(fmt, ...) pr_warn("UBI warning: %s: " fmt "\n",  \
+				   __func__, ##__VA_ARGS__)
 /* UBI error messages */
-#define ubi_err(fmt, ...) printk(KERN_ERR "UBI error: %s: " fmt "\n", \
+#define ubi_err(fmt, ...) pr_err("UBI error: %s: " fmt "\n",      \
 				 __func__, ##__VA_ARGS__)
 
-/* Lowest number PEBs reserved for bad PEB handling */
-#define MIN_RESEVED_PEBS 2
-
 /* Background thread name pattern */
 #define UBI_BGT_NAME_PATTERN "ubi_bgt%dd"
 
-/* This marker in the EBA table means that the LEB is um-mapped */
+/*
+ * This marker in the EBA table means that the LEB is un-mapped.
+ * NOTE! It has to have the same value as %UBI_ALL.
+ */
 #define UBI_LEB_UNMAPPED -1
 
 /*
@@ -82,6 +81,16 @@
  */
 #define UBI_PROT_QUEUE_LEN 10
 
+/* The volume ID/LEB number/erase counter is unknown */
+#define UBI_UNKNOWN -1
+
+/*
+ * The UBI debugfs directory name pattern and maximum name length (3 for "ubi"
+ * + 2 for the number plus 1 for the trailing zero byte).
+ */
+#define UBI_DFS_DIR_NAME "ubi%d"
+#define UBI_DFS_DIR_LEN  (3 + 2 + 1)
+
 /*
  * Error codes returned by the I/O sub-system.
  *
@@ -118,7 +127,7 @@
  *                     PEB
  * MOVE_TARGET_WR_ERR: canceled because there was a write error to the target
  *                     PEB
- * MOVE_CANCEL_BITFLIPS: canceled because a bit-flip was detected in the
+ * MOVE_TARGET_BITFLIPS: canceled because a bit-flip was detected in the
  *                       target PEB
  * MOVE_RETRY: retry scrubbing the PEB
  */
@@ -127,10 +136,21 @@
 	MOVE_SOURCE_RD_ERR,
 	MOVE_TARGET_RD_ERR,
 	MOVE_TARGET_WR_ERR,
-	MOVE_CANCEL_BITFLIPS,
+	MOVE_TARGET_BITFLIPS,
 	MOVE_RETRY,
 };
 
+/*
+ * Return codes of the fastmap sub-system
+ *
+ * UBI_NO_FASTMAP: No fastmap super block was found
+ * UBI_BAD_FASTMAP: A fastmap was found but it's unusable
+ */
+enum {
+	UBI_NO_FASTMAP = 1,
+	UBI_BAD_FASTMAP,
+};
+
 /**
  * struct ubi_wl_entry - wear-leveling entry.
  * @u.rb: link in the corresponding (free/used) RB-tree
@@ -197,6 +217,41 @@
 struct ubi_volume_desc;
 
 /**
+ * struct ubi_fastmap_layout - in-memory fastmap data structure.
+ * @e: PEBs used by the current fastmap
+ * @to_be_tortured: if non-zero, this PEB has to be tortured
+ * @used_blocks: number of used PEBs
+ * @max_pool_size: maximal size of the user pool
+ * @max_wl_pool_size: maximal size of the pool used by the WL sub-system
+ */
+struct ubi_fastmap_layout {
+	struct ubi_wl_entry *e[UBI_FM_MAX_BLOCKS];
+	int to_be_tortured[UBI_FM_MAX_BLOCKS];
+	int used_blocks;
+	int max_pool_size;
+	int max_wl_pool_size;
+};
+
+/**
+ * struct ubi_fm_pool - in-memory fastmap pool
+ * @pebs: PEBs in this pool
+ * @used: number of used PEBs
+ * @size: total number of PEBs in this pool
+ * @max_size: maximal size of the pool
+ *
+ * A pool gets filled with up to @max_size PEBs.
+ * If all PEBs within the pool are used, a new fastmap will be written
+ * to the flash and the pool gets refilled with empty PEBs.
+ *
+ */
+struct ubi_fm_pool {
+	int pebs[UBI_FM_MAX_POOL_SIZE];
+	int used;
+	int size;
+	int max_size;
+};
+
+/**
  * struct ubi_volume - UBI volume description data structure.
  * @dev: device object to make use of the the Linux device model
  * @cdev: character device object to create character device
@@ -222,8 +277,6 @@
  * @upd_ebs: how many eraseblocks are expected to be updated
  * @ch_lnum: LEB number which is being changing by the atomic LEB change
  *           operation
- * @ch_dtype: data persistency type which is being changing by the atomic LEB
- *            change operation
  * @upd_bytes: how many bytes are expected to be received for volume update or
  *             atomic LEB change
  * @upd_received: how many bytes were already received for volume update or
@@ -270,7 +323,6 @@
 
 	int upd_ebs;
 	int ch_lnum;
-	int ch_dtype;
 	long long upd_bytes;
 	long long upd_received;
 	void *upd_buf;
@@ -297,6 +349,37 @@
 struct ubi_wl_entry;
 
 /**
+ * struct ubi_debug_info - debugging information for an UBI device.
+ *
+ * @chk_gen: if UBI general extra checks are enabled
+ * @chk_io: if UBI I/O extra checks are enabled
+ * @disable_bgt: disable the background task for testing purposes
+ * @emulate_bitflips: emulate bit-flips for testing purposes
+ * @emulate_io_failures: emulate write/erase failures for testing purposes
+ * @dfs_dir_name: name of debugfs directory containing files of this UBI device
+ * @dfs_dir: direntry object of the UBI device debugfs directory
+ * @dfs_chk_gen: debugfs knob to enable UBI general extra checks
+ * @dfs_chk_io: debugfs knob to enable UBI I/O extra checks
+ * @dfs_disable_bgt: debugfs knob to disable the background task
+ * @dfs_emulate_bitflips: debugfs knob to emulate bit-flips
+ * @dfs_emulate_io_failures: debugfs knob to emulate write/erase failures
+ */
+struct ubi_debug_info {
+	unsigned int chk_gen:1;
+	unsigned int chk_io:1;
+	unsigned int disable_bgt:1;
+	unsigned int emulate_bitflips:1;
+	unsigned int emulate_io_failures:1;
+	char dfs_dir_name[UBI_DFS_DIR_LEN + 1];
+	struct dentry *dfs_dir;
+	struct dentry *dfs_chk_gen;
+	struct dentry *dfs_chk_io;
+	struct dentry *dfs_disable_bgt;
+	struct dentry *dfs_emulate_bitflips;
+	struct dentry *dfs_emulate_io_failures;
+};
+
+/**
  * struct ubi_device - UBI device description structure
  * @dev: UBI device object to use the the Linux device model
  * @cdev: character device object to create character device
@@ -334,9 +417,21 @@
  * @ltree: the lock tree
  * @alc_mutex: serializes "atomic LEB change" operations
  *
+ * @fm_disabled: non-zero if fastmap is disabled (default)
+ * @fm: in-memory data structure of the currently used fastmap
+ * @fm_pool: in-memory data structure of the fastmap pool
+ * @fm_wl_pool: in-memory data structure of the fastmap pool used by the WL
+ *		sub-system
+ * @fm_mutex: serializes ubi_update_fastmap() and protects @fm_buf
+ * @fm_buf: vmalloc()'d buffer which holds the raw fastmap
+ * @fm_size: fastmap size in bytes
+ * @fm_sem: allows ubi_update_fastmap() to block EBA table changes
+ * @fm_work: fastmap work queue
+ *
  * @used: RB-tree of used physical eraseblocks
  * @erroneous: RB-tree of erroneous used physical eraseblocks
  * @free: RB-tree of free physical eraseblocks
+ * @free_count: Contains the number of elements in @free
  * @scrub: RB-tree of physical eraseblocks which need scrubbing
  * @pq: protection queue (contain physical eraseblocks which are temporarily
  *      protected from the wear-leveling worker)
@@ -361,6 +456,7 @@
  * @flash_size: underlying MTD device size (in bytes)
  * @peb_count: count of physical eraseblocks on the MTD device
  * @peb_size: physical eraseblock size
+ * @bad_peb_limit: top limit of expected bad physical eraseblocks
  * @bad_peb_count: count of bad physical eraseblocks
  * @good_peb_count: count of good physical eraseblocks
  * @corr_peb_count: count of corrupted physical eraseblocks (preserved and not
@@ -387,9 +483,8 @@
  *                  time (MTD write buffer size)
  * @mtd: MTD device descriptor
  *
- * @peb_buf1: a buffer of PEB size used for different purposes
- * @peb_buf2: another buffer of PEB size used for different purposes
- * @buf_mutex: protects @peb_buf1 and @peb_buf2
+ * @peb_buf: a buffer of PEB size used for different purposes
+ * @buf_mutex: protects @peb_buf
  * @ckvol_mutex: serializes static volume checking when opening
  *
  * @dbg: debugging information for this UBI device
@@ -409,6 +504,7 @@
 	int avail_pebs;
 	int beb_rsvd_pebs;
 	int beb_rsvd_level;
+	int bad_peb_limit;
 
 	int autoresize_vol_id;
 	int vtbl_slots;
@@ -426,10 +522,22 @@
 	struct rb_root ltree;
 	struct mutex alc_mutex;
 
+	/* Fastmap stuff */
+	int fm_disabled;
+	struct ubi_fastmap_layout *fm;
+	struct ubi_fm_pool fm_pool;
+	struct ubi_fm_pool fm_wl_pool;
+	struct rw_semaphore fm_sem;
+	struct mutex fm_mutex;
+	void *fm_buf;
+	size_t fm_size;
+	struct work_struct fm_work;
+
 	/* Wear-leveling sub-system's stuff */
 	struct rb_root used;
 	struct rb_root erroneous;
 	struct rb_root free;
+	int free_count;
 	struct rb_root scrub;
 	struct list_head pq[UBI_PROT_QUEUE_LEN];
 	int pq_head;
@@ -471,12 +579,155 @@
 	int max_write_size;
 	struct mtd_info *mtd;
 
-	void *peb_buf1;
-	void *peb_buf2;
+	void *peb_buf;
 	struct mutex buf_mutex;
 	struct mutex ckvol_mutex;
 
-	struct ubi_debug_info *dbg;
+	struct ubi_debug_info dbg;
+};
+
+/**
+ * struct ubi_ainf_peb - attach information about a physical eraseblock.
+ * @ec: erase counter (%UBI_UNKNOWN if it is unknown)
+ * @pnum: physical eraseblock number
+ * @vol_id: ID of the volume this LEB belongs to
+ * @lnum: logical eraseblock number
+ * @scrub: if this physical eraseblock needs scrubbing
+ * @copy_flag: this LEB is a copy (@copy_flag is set in VID header of this LEB)
+ * @sqnum: sequence number
+ * @u: union of the RB-tree and @list links
+ * @u.rb: link in the per-volume RB-tree of &struct ubi_ainf_peb objects
+ * @u.list: link in one of the eraseblock lists
+ *
+ * One object of this type is allocated for each physical eraseblock when
+ * attaching an MTD device. Note, if this PEB does not belong to any LEB /
+ * volume, the @vol_id and @lnum fields are initialized to %UBI_UNKNOWN.
+ */
+struct ubi_ainf_peb {
+	int ec;
+	int pnum;
+	int vol_id;
+	int lnum;
+	unsigned int scrub:1;
+	unsigned int copy_flag:1;
+	unsigned long long sqnum;
+	union {
+		struct rb_node rb;
+		struct list_head list;
+	} u;
+};
+
+/**
+ * struct ubi_ainf_volume - attaching information about a volume.
+ * @vol_id: volume ID
+ * @highest_lnum: highest logical eraseblock number in this volume
+ * @leb_count: number of logical eraseblocks in this volume
+ * @vol_type: volume type
+ * @used_ebs: number of used logical eraseblocks in this volume (only for
+ *            static volumes)
+ * @last_data_size: amount of data in the last logical eraseblock of this
+ *                  volume (always equivalent to the usable logical eraseblock
+ *                  size in case of dynamic volumes)
+ * @data_pad: how many bytes at the end of logical eraseblocks of this volume
+ *            are not used (due to volume alignment)
+ * @compat: compatibility flags of this volume
+ * @rb: link in the volume RB-tree
+ * @root: root of the RB-tree containing all the eraseblocks belonging to this
+ *        volume (&struct ubi_ainf_peb objects)
+ *
+ * One object of this type is allocated for each volume when attaching an MTD
+ * device.
+ */
+struct ubi_ainf_volume {
+	int vol_id;
+	int highest_lnum;
+	int leb_count;
+	int vol_type;
+	int used_ebs;
+	int last_data_size;
+	int data_pad;
+	int compat;
+	struct rb_node rb;
+	struct rb_root root;
+};
+
+/**
+ * struct ubi_attach_info - MTD device attaching information.
+ * @volumes: root of the volume RB-tree
+ * @corr: list of corrupted physical eraseblocks
+ * @free: list of free physical eraseblocks
+ * @erase: list of physical eraseblocks which have to be erased
+ * @alien: list of physical eraseblocks which should not be used by UBI (e.g.,
+ *         those belonging to "preserve"-compatible internal volumes)
+ * @corr_peb_count: count of PEBs in the @corr list
+ * @empty_peb_count: count of PEBs which are presumably empty (contain only
+ *                   0xFF bytes)
+ * @alien_peb_count: count of PEBs in the @alien list
+ * @bad_peb_count: count of bad physical eraseblocks
+ * @maybe_bad_peb_count: count of bad physical eraseblocks which are not marked
+ *                       as bad yet, but which look like bad
+ * @vols_found: number of volumes found
+ * @highest_vol_id: highest volume ID
+ * @is_empty: flag indicating whether the MTD device is empty or not
+ * @min_ec: lowest erase counter value
+ * @max_ec: highest erase counter value
+ * @max_sqnum: highest sequence number value
+ * @mean_ec: mean erase counter value
+ * @ec_sum: a temporary variable used when calculating @mean_ec
+ * @ec_count: a temporary variable used when calculating @mean_ec
+ * @aeb_slab_cache: slab cache for &struct ubi_ainf_peb objects
+ *
+ * This data structure contains the result of attaching an MTD device and may
+ * be used by other UBI sub-systems to build final UBI data structures, further
+ * error-recovery and so on.
+ */
+struct ubi_attach_info {
+	struct rb_root volumes;
+	struct list_head corr;
+	struct list_head free;
+	struct list_head erase;
+	struct list_head alien;
+	int corr_peb_count;
+	int empty_peb_count;
+	int alien_peb_count;
+	int bad_peb_count;
+	int maybe_bad_peb_count;
+	int vols_found;
+	int highest_vol_id;
+	int is_empty;
+	int min_ec;
+	int max_ec;
+	unsigned long long max_sqnum;
+	int mean_ec;
+	uint64_t ec_sum;
+	int ec_count;
+	struct kmem_cache *aeb_slab_cache;
+};
+
+/**
+ * struct ubi_work - UBI work description data structure.
+ * @list: a link in the list of pending works
+ * @func: worker function
+ * @e: physical eraseblock to erase
+ * @vol_id: the volume ID on which this erasure is being performed
+ * @lnum: the logical eraseblock number
+ * @torture: if the physical eraseblock has to be tortured
+ * @anchor: produce an anchor PEB to be used by fastmap
+ *
+ * The @func pointer points to the worker function. If the @cancel argument is
+ * not zero, the worker has to free the resources and exit immediately. The
+ * worker has to return zero in case of success and a negative error code in
+ * case of failure.
+ */
+struct ubi_work {
+	struct list_head list;
+	int (*func)(struct ubi_device *ubi, struct ubi_work *wrk, int cancel);
+	/* The below fields are only relevant to erasure works */
+	struct ubi_wl_entry *e;
+	int vol_id;
+	int lnum;
+	int torture;
+	int anchor;
 };
 
 #include "debug.h"
@@ -489,12 +740,23 @@
 extern struct mutex ubi_devices_mutex;
 extern struct blocking_notifier_head ubi_notifiers;
 
+/* attach.c */
+int ubi_add_to_av(struct ubi_device *ubi, struct ubi_attach_info *ai, int pnum,
+		  int ec, const struct ubi_vid_hdr *vid_hdr, int bitflips);
+struct ubi_ainf_volume *ubi_find_av(const struct ubi_attach_info *ai,
+				    int vol_id);
+void ubi_remove_av(struct ubi_attach_info *ai, struct ubi_ainf_volume *av);
+struct ubi_ainf_peb *ubi_early_get_peb(struct ubi_device *ubi,
+				       struct ubi_attach_info *ai);
+int ubi_attach(struct ubi_device *ubi, int force_scan);
+void ubi_destroy_ai(struct ubi_attach_info *ai);
+
 /* vtbl.c */
 int ubi_change_vtbl_record(struct ubi_device *ubi, int idx,
 			   struct ubi_vtbl_record *vtbl_rec);
 int ubi_vtbl_rename_volumes(struct ubi_device *ubi,
 			    struct list_head *rename_list);
-int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_scan_info *si);
+int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_attach_info *ai);
 
 /* vmt.c */
 int ubi_create_volume(struct ubi_device *ubi, struct ubi_mkvol_req *req);
@@ -518,6 +780,7 @@
 int ubi_calc_data_len(const struct ubi_device *ubi, const void *buf,
 		      int length);
 int ubi_check_volume(struct ubi_device *ubi, int vol_id);
+void ubi_update_reserved(struct ubi_device *ubi);
 void ubi_calculate_reserved(struct ubi_device *ubi);
 int ubi_check_pattern(const void *buf, uint8_t patt, int size);
 
@@ -527,24 +790,33 @@
 int ubi_eba_read_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum,
 		     void *buf, int offset, int len, int check);
 int ubi_eba_write_leb(struct ubi_device *ubi, struct ubi_volume *vol, int lnum,
-		      const void *buf, int offset, int len, int dtype);
+		      const void *buf, int offset, int len);
 int ubi_eba_write_leb_st(struct ubi_device *ubi, struct ubi_volume *vol,
-			 int lnum, const void *buf, int len, int dtype,
-			 int used_ebs);
+			 int lnum, const void *buf, int len, int used_ebs);
 int ubi_eba_atomic_leb_change(struct ubi_device *ubi, struct ubi_volume *vol,
-			      int lnum, const void *buf, int len, int dtype);
+			      int lnum, const void *buf, int len);
 int ubi_eba_copy_leb(struct ubi_device *ubi, int from, int to,
 		     struct ubi_vid_hdr *vid_hdr);
-int ubi_eba_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si);
+int ubi_eba_init(struct ubi_device *ubi, struct ubi_attach_info *ai);
+unsigned long long ubi_next_sqnum(struct ubi_device *ubi);
+int self_check_eba(struct ubi_device *ubi, struct ubi_attach_info *ai_fastmap,
+		   struct ubi_attach_info *ai_scan);
 
 /* wl.c */
-int ubi_wl_get_peb(struct ubi_device *ubi, int dtype);
-int ubi_wl_put_peb(struct ubi_device *ubi, int pnum, int torture);
-int ubi_wl_flush(struct ubi_device *ubi);
+int ubi_wl_get_peb(struct ubi_device *ubi);
+int ubi_wl_put_peb(struct ubi_device *ubi, int vol_id, int lnum,
+		   int pnum, int torture);
+int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum);
 int ubi_wl_scrub_peb(struct ubi_device *ubi, int pnum);
-int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si);
+int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai);
 void ubi_wl_close(struct ubi_device *ubi);
 int ubi_thread(void *u);
+struct ubi_wl_entry *ubi_wl_get_fm_peb(struct ubi_device *ubi, int anchor);
+int ubi_wl_put_fm_peb(struct ubi_device *ubi, struct ubi_wl_entry *used_e,
+		      int lnum, int torture);
+int ubi_is_erase_work(struct ubi_work *wrk);
+void ubi_refill_pools(struct ubi_device *ubi);
+int ubi_ensure_anchor_pebs(struct ubi_device *ubi);
 
 /* io.c */
 int ubi_io_read(const struct ubi_device *ubi, void *buf, int pnum, int offset,
@@ -564,7 +836,8 @@
 			 struct ubi_vid_hdr *vid_hdr);
 
 /* build.c */
-int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num, int vid_hdr_offset);
+int ubi_attach_mtd_dev(struct mtd_info *mtd, int ubi_num,
+		       int vid_hdr_offset, int max_beb_per1024);
 int ubi_detach_mtd_dev(int ubi_num, int anyway);
 struct ubi_device *ubi_get_device(int ubi_num);
 void ubi_put_device(struct ubi_device *ubi);
@@ -575,11 +848,21 @@
 int ubi_notify_all(struct ubi_device *ubi, int ntype,
 		   struct notifier_block *nb);
 int ubi_enumerate_volumes(struct notifier_block *nb);
+void ubi_free_internal_volumes(struct ubi_device *ubi);
 
 /* kapi.c */
 void ubi_do_get_device_info(struct ubi_device *ubi, struct ubi_device_info *di);
 void ubi_do_get_volume_info(struct ubi_device *ubi, struct ubi_volume *vol,
 			    struct ubi_volume_info *vi);
+/* scan.c */
+int ubi_compare_lebs(struct ubi_device *ubi, const struct ubi_ainf_peb *aeb,
+		      int pnum, const struct ubi_vid_hdr *vid_hdr);
+
+/* fastmap.c */
+size_t ubi_calc_fm_size(struct ubi_device *ubi);
+int ubi_update_fastmap(struct ubi_device *ubi);
+int ubi_scan_fastmap(struct ubi_device *ubi, struct ubi_attach_info *ai,
+		     int fm_anchor);
 
 /*
  * ubi_rb_for_each_entry - walk an RB-tree.
@@ -595,6 +878,21 @@
 	     rb = rb_next(rb),                                               \
 	     pos = (rb ? container_of(rb, typeof(*pos), member) : NULL))
 
+/*
+ * ubi_move_aeb_to_list - move a PEB from the volume tree to a list.
+ *
+ * @av: volume attaching information
+ * @aeb: attaching eraseblock information
+ * @list: the list to move to
+ */
+static inline void ubi_move_aeb_to_list(struct ubi_ainf_volume *av,
+					 struct ubi_ainf_peb *aeb,
+					 struct list_head *list)
+{
+		rb_erase(&aeb->u.rb, &av->root);
+		list_add_tail(&aeb->u.list, list);
+}
+
 /**
  * ubi_zalloc_vid_hdr - allocate a volume identifier header object.
  * @ubi: UBI device description object
@@ -669,7 +967,7 @@
 	if (!ubi->ro_mode) {
 		ubi->ro_mode = 1;
 		ubi_warn("switch to read-only mode");
-		ubi_dbg_dump_stack();
+		dump_stack();
 	}
 }
 
diff -ur a/drivers/mtd/ubi/ubi-media.h b/drivers/mtd/ubi/ubi-media.h
--- a/drivers/mtd/ubi/ubi-media.h	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/ubi-media.h	2014-02-17 11:57:40.000000000 +0100
@@ -149,10 +149,10 @@
  * The @image_seq field is used to validate a UBI image that has been prepared
  * for a UBI device. The @image_seq value can be any value, but it must be the
  * same on all eraseblocks. UBI will ensure that all new erase counter headers
- * also contain this value, and will check the value when scanning at start-up.
+ * also contain this value, and will check the value when attaching the flash.
  * One way to make use of @image_seq is to increase its value by one every time
  * an image is flashed over an existing image, then, if the flashing does not
- * complete, UBI will detect the error when scanning.
+ * complete, UBI will detect the error when attaching the media.
  */
 struct ubi_ec_hdr {
 	__be32  magic;
@@ -298,8 +298,8 @@
 #define UBI_INT_VOL_COUNT 1
 
 /*
- * Starting ID of internal volumes. There is reserved room for 4096 internal
- * volumes.
+ * Starting ID of internal volumes: 0x7fffefff.
+ * There is reserved room for 4096 internal volumes.
  */
 #define UBI_INTERNAL_VOL_START (0x7FFFFFFF - 4096)
 
@@ -375,4 +375,141 @@
 	__be32  crc;
 } __packed;
 
+/* UBI fastmap on-flash data structures */
+
+#define UBI_FM_SB_VOLUME_ID	(UBI_LAYOUT_VOLUME_ID + 1)
+#define UBI_FM_DATA_VOLUME_ID	(UBI_LAYOUT_VOLUME_ID + 2)
+
+/* fastmap on-flash data structure format version */
+#define UBI_FM_FMT_VERSION	1
+
+#define UBI_FM_SB_MAGIC		0x7B11D69F
+#define UBI_FM_HDR_MAGIC	0xD4B82EF7
+#define UBI_FM_VHDR_MAGIC	0xFA370ED1
+#define UBI_FM_POOL_MAGIC	0x67AF4D08
+#define UBI_FM_EBA_MAGIC	0xf0c040a8
+
+/* A fastmap super block can be located between PEB 0 and
+ * UBI_FM_MAX_START */
+#define UBI_FM_MAX_START	64
+
+/* A fastmap can use up to UBI_FM_MAX_BLOCKS PEBs */
+#define UBI_FM_MAX_BLOCKS	32
+
+/* 5% of the total number of PEBs have to be scanned while attaching
+ * from a fastmap.
+ * But the size of this pool is limited to be between UBI_FM_MIN_POOL_SIZE and
+ * UBI_FM_MAX_POOL_SIZE */
+#define UBI_FM_MIN_POOL_SIZE	8
+#define UBI_FM_MAX_POOL_SIZE	256
+
+#define UBI_FM_WL_POOL_SIZE	25
+
+/**
+ * struct ubi_fm_sb - UBI fastmap super block
+ * @magic: fastmap super block magic number (%UBI_FM_SB_MAGIC)
+ * @version: format version of this fastmap
+ * @data_crc: CRC over the fastmap data
+ * @used_blocks: number of PEBs used by this fastmap
+ * @block_loc: an array containing the location of all PEBs of the fastmap
+ * @block_ec: the erase counter of each used PEB
+ * @sqnum: highest sequence number value at the time the fastmap was taken
+ *
+ */
+struct ubi_fm_sb {
+	__be32 magic;
+	__u8 version;
+	__u8 padding1[3];
+	__be32 data_crc;
+	__be32 used_blocks;
+	__be32 block_loc[UBI_FM_MAX_BLOCKS];
+	__be32 block_ec[UBI_FM_MAX_BLOCKS];
+	__be64 sqnum;
+	__u8 padding2[32];
+} __packed;
+
+/**
+ * struct ubi_fm_hdr - header of the fastmap data set
+ * @magic: fastmap header magic number (%UBI_FM_HDR_MAGIC)
+ * @free_peb_count: number of free PEBs known by this fastmap
+ * @used_peb_count: number of used PEBs known by this fastmap
+ * @scrub_peb_count: number of to-be-scrubbed PEBs known by this fastmap
+ * @bad_peb_count: number of bad PEBs known by this fastmap
+ * @erase_peb_count: number of bad PEBs which have to be erased
+ * @vol_count: number of UBI volumes known by this fastmap
+ */
+struct ubi_fm_hdr {
+	__be32 magic;
+	__be32 free_peb_count;
+	__be32 used_peb_count;
+	__be32 scrub_peb_count;
+	__be32 bad_peb_count;
+	__be32 erase_peb_count;
+	__be32 vol_count;
+	__u8 padding[4];
+} __packed;
+
+/* struct ubi_fm_hdr is followed by two struct ubi_fm_scan_pool */
+
+/**
+ * struct ubi_fm_scan_pool - Fastmap pool PEBs to be scanned while attaching
+ * @magic: pool magic number (%UBI_FM_POOL_MAGIC)
+ * @size: current pool size
+ * @max_size: maximal pool size
+ * @pebs: an array containing the location of all PEBs in this pool
+ */
+struct ubi_fm_scan_pool {
+	__be32 magic;
+	__be16 size;
+	__be16 max_size;
+	__be32 pebs[UBI_FM_MAX_POOL_SIZE];
+	__be32 padding[4];
+} __packed;
+
+/* ubi_fm_scan_pool is followed by nfree+nused struct ubi_fm_ec records */
+
+/**
+ * struct ubi_fm_ec - stores the erase counter of a PEB
+ * @pnum: PEB number
+ * @ec: ec of this PEB
+ */
+struct ubi_fm_ec {
+	__be32 pnum;
+	__be32 ec;
+} __packed;
+
+/**
+ * struct ubi_fm_volhdr - Fastmap volume header
+ * It identifies the start of an EBA table.
+ * @magic: Fastmap volume header magic number (%UBI_FM_VHDR_MAGIC)
+ * @vol_id: volume id of the fastmapped volume
+ * @vol_type: type of the fastmapped volume
+ * @data_pad: data_pad value of the fastmapped volume
+ * @used_ebs: number of used LEBs within this volume
+ * @last_eb_bytes: number of bytes used in the last LEB
+ */
+struct ubi_fm_volhdr {
+	__be32 magic;
+	__be32 vol_id;
+	__u8 vol_type;
+	__u8 padding1[3];
+	__be32 data_pad;
+	__be32 used_ebs;
+	__be32 last_eb_bytes;
+	__u8 padding2[8];
+} __packed;
+
+/* struct ubi_fm_volhdr is followed by one struct ubi_fm_eba record */
+
+/**
+ * struct ubi_fm_eba - denotes an association between a PEB and a LEB
+ * @magic: EBA table magic number
+ * @reserved_pebs: number of table entries
+ * @pnum: PEB number of LEB (LEB is the index)
+ */
+struct ubi_fm_eba {
+	__be32 magic;
+	__be32 reserved_pebs;
+	__be32 pnum[0];
+} __packed;
 #endif /* !__UBI_MEDIA_H__ */
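One concrete note on the on-flash format above: an attaching kernel has to decide whether a candidate super block can be trusted before following its self-describing block list. The sanity checks in this sketch are reconstructed from the constants and fields defined above; the function name and return convention are assumptions, not code from the patch:

static int fm_sb_plausible(const struct ubi_fm_sb *fmsb)
{
	u32 used;

	if (be32_to_cpu(fmsb->magic) != UBI_FM_SB_MAGIC)
		return 0;	/* not a fastmap super block at all */

	if (fmsb->version != UBI_FM_FMT_VERSION)
		return 0;	/* unknown on-flash format version */

	used = be32_to_cpu(fmsb->used_blocks);
	if (used < 1 || used > UBI_FM_MAX_BLOCKS)
		return 0;	/* block list would overflow block_loc[] */

	return 1;
}
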
diff -ur a/drivers/mtd/ubi/upd.c b/drivers/mtd/ubi/upd.c
--- a/drivers/mtd/ubi/upd.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/upd.c	2014-02-17 11:57:40.000000000 +0100
@@ -64,8 +64,7 @@
 		return 0;
 	}
 
-	memcpy(&vtbl_rec, &ubi->vtbl[vol->vol_id],
-	       sizeof(struct ubi_vtbl_record));
+	vtbl_rec = ubi->vtbl[vol->vol_id];
 	vtbl_rec.upd_marker = 1;
 
 	mutex_lock(&ubi->device_mutex);
@@ -93,8 +92,7 @@
 
 	dbg_gen("clear update marker for volume %d", vol->vol_id);
 
-	memcpy(&vtbl_rec, &ubi->vtbl[vol->vol_id],
-	       sizeof(struct ubi_vtbl_record));
+	vtbl_rec = ubi->vtbl[vol->vol_id];
 	ubi_assert(vol->upd_marker && vtbl_rec.upd_marker);
 	vtbl_rec.upd_marker = 0;
 
@@ -147,7 +145,7 @@
 	}
 
 	if (bytes == 0) {
-		err = ubi_wl_flush(ubi);
+		err = ubi_wl_flush(ubi, UBI_ALL, UBI_ALL);
 		if (err)
 			return err;
 
@@ -186,14 +184,12 @@
 	dbg_gen("start changing LEB %d:%d, %u bytes",
 		vol->vol_id, req->lnum, req->bytes);
 	if (req->bytes == 0)
-		return ubi_eba_atomic_leb_change(ubi, vol, req->lnum, NULL, 0,
-						 req->dtype);
+		return ubi_eba_atomic_leb_change(ubi, vol, req->lnum, NULL, 0);
 
 	vol->upd_bytes = req->bytes;
 	vol->upd_received = 0;
 	vol->changing_leb = 1;
 	vol->ch_lnum = req->lnum;
-	vol->ch_dtype = req->dtype;
 
 	vol->upd_buf = vmalloc(req->bytes);
 	if (!vol->upd_buf)
@@ -246,8 +242,7 @@
 			return 0;
 		}
 
-		err = ubi_eba_write_leb(ubi, vol, lnum, buf, 0, len,
-					UBI_UNKNOWN);
+		err = ubi_eba_write_leb(ubi, vol, lnum, buf, 0, len);
 	} else {
 		/*
 		 * When writing static volume, and this is the last logical
@@ -259,8 +254,7 @@
 		 * contain zeros, not random trash.
 		 */
 		memset(buf + len, 0, vol->usable_leb_size - len);
-		err = ubi_eba_write_leb_st(ubi, vol, lnum, buf, len,
-					   UBI_UNKNOWN, used_ebs);
+		err = ubi_eba_write_leb_st(ubi, vol, lnum, buf, len, used_ebs);
 	}
 
 	return err;
@@ -365,7 +359,7 @@
 
 	ubi_assert(vol->upd_received <= vol->upd_bytes);
 	if (vol->upd_received == vol->upd_bytes) {
-		err = ubi_wl_flush(ubi);
+		err = ubi_wl_flush(ubi, UBI_ALL, UBI_ALL);
 		if (err)
 			return err;
 		/* The update is finished, clear the update marker */
@@ -421,7 +415,7 @@
 		       len - vol->upd_bytes);
 		len = ubi_calc_data_len(ubi, vol->upd_buf, len);
 		err = ubi_eba_atomic_leb_change(ubi, vol, vol->ch_lnum,
-						vol->upd_buf, len, UBI_UNKNOWN);
+						vol->upd_buf, len);
 		if (err)
 			return err;
 	}
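Two interface changes run through the upd.c hunks above: the data-type hint (UBI_LONGTERM/UBI_SHORTTERM/UBI_UNKNOWN) disappears from the EBA writers, and ubi_wl_flush() becomes selective by volume and LEB, with UBI_ALL meaning "flush everything", as used above. A hedged caller-side sketch of the new calling convention, with hypothetical variable names:

	/* before: ubi_eba_write_leb(ubi, vol, lnum, buf, 0, len, UBI_UNKNOWN); */
	err = ubi_eba_write_leb(ubi, vol, lnum, buf, 0, len);
	if (err)
		return err;

	/* wait only for pending work that touches this particular LEB ... */
	err = ubi_wl_flush(ubi, vol->vol_id, lnum);

	/* ... or, as upd.c does, for everything that is queued */
	err = ubi_wl_flush(ubi, UBI_ALL, UBI_ALL);
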
diff -ur a/drivers/mtd/ubi/vmt.c b/drivers/mtd/ubi/vmt.c
--- a/drivers/mtd/ubi/vmt.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/vmt.c	2014-02-17 11:57:40.000000000 +0100
@@ -29,11 +29,7 @@
 #include <linux/export.h>
 #include "ubi.h"
 
-#ifdef CONFIG_MTD_UBI_DEBUG
-static int paranoid_check_volumes(struct ubi_device *ubi);
-#else
-#define paranoid_check_volumes(ubi) 0
-#endif
+static int self_check_volumes(struct ubi_device *ubi);
 
 static ssize_t vol_attribute_show(struct device *dev,
 				  struct device_attribute *attr, char *buf);
@@ -227,7 +223,7 @@
 			}
 
 		if (vol_id == UBI_VOL_NUM_AUTO) {
-			dbg_err("out of volume IDs");
+			ubi_err("out of volume IDs");
 			err = -ENFILE;
 			goto out_unlock;
 		}
@@ -241,7 +237,7 @@
 	/* Ensure that this volume does not exist */
 	err = -EEXIST;
 	if (ubi->volumes[vol_id]) {
-		dbg_err("volume %d already exists", vol_id);
+		ubi_err("volume %d already exists", vol_id);
 		goto out_unlock;
 	}
 
@@ -250,7 +246,7 @@
 		if (ubi->volumes[i] &&
 		    ubi->volumes[i]->name_len == req->name_len &&
 		    !strcmp(ubi->volumes[i]->name, req->name)) {
-			dbg_err("volume \"%s\" exists (ID %d)", req->name, i);
+			ubi_err("volume \"%s\" exists (ID %d)", req->name, i);
 			goto out_unlock;
 		}
 
@@ -261,9 +257,9 @@
 
 	/* Reserve physical eraseblocks */
 	if (vol->reserved_pebs > ubi->avail_pebs) {
-		dbg_err("not enough PEBs, only %d available", ubi->avail_pebs);
+		ubi_err("not enough PEBs, only %d available", ubi->avail_pebs);
 		if (ubi->corr_peb_count)
-			dbg_err("%d PEBs are corrupted and not used",
+			ubi_err("%d PEBs are corrupted and not used",
 				ubi->corr_peb_count);
 		err = -ENOSPC;
 		goto out_unlock;
@@ -284,7 +280,7 @@
 	 * Finish all pending erases because there may be some LEBs belonging
 	 * to the same volume ID.
 	 */
-	err = ubi_wl_flush(ubi);
+	err = ubi_wl_flush(ubi, vol_id, UBI_ALL);
 	if (err)
 		goto out_acc;
 
@@ -360,8 +356,7 @@
 	spin_unlock(&ubi->volumes_lock);
 
 	ubi_volume_notify(ubi, vol, UBI_VOLUME_ADDED);
-	if (paranoid_check_volumes(ubi))
-		dbg_err("check failed while creating volume %d", vol_id);
+	self_check_volumes(ubi);
 	return err;
 
 out_sysfs:
@@ -448,21 +443,13 @@
 	spin_lock(&ubi->volumes_lock);
 	ubi->rsvd_pebs -= reserved_pebs;
 	ubi->avail_pebs += reserved_pebs;
-	i = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs;
-	if (i > 0) {
-		i = ubi->avail_pebs >= i ? i : ubi->avail_pebs;
-		ubi->avail_pebs -= i;
-		ubi->rsvd_pebs += i;
-		ubi->beb_rsvd_pebs += i;
-		if (i > 0)
-			ubi_msg("reserve more %d PEBs", i);
-	}
+	ubi_update_reserved(ubi);
 	ubi->vol_count -= 1;
 	spin_unlock(&ubi->volumes_lock);
 
 	ubi_volume_notify(ubi, vol, UBI_VOLUME_REMOVED);
-	if (!no_vtbl && paranoid_check_volumes(ubi))
-		dbg_err("check failed while removing volume %d", vol_id);
+	if (!no_vtbl)
+		self_check_volumes(ubi);
 
 	return err;
 
@@ -500,7 +487,7 @@
 
 	if (vol->vol_type == UBI_STATIC_VOLUME &&
 	    reserved_pebs < vol->used_ebs) {
-		dbg_err("too small size %d, %d LEBs contain data",
+		ubi_err("too small size %d, %d LEBs contain data",
 			reserved_pebs, vol->used_ebs);
 		return -EINVAL;
 	}
@@ -529,10 +516,10 @@
 	if (pebs > 0) {
 		spin_lock(&ubi->volumes_lock);
 		if (pebs > ubi->avail_pebs) {
-			dbg_err("not enough PEBs: requested %d, available %d",
+			ubi_err("not enough PEBs: requested %d, available %d",
 				pebs, ubi->avail_pebs);
 			if (ubi->corr_peb_count)
-				dbg_err("%d PEBs are corrupted and not used",
+				ubi_err("%d PEBs are corrupted and not used",
 					ubi->corr_peb_count);
 			spin_unlock(&ubi->volumes_lock);
 			err = -ENOSPC;
@@ -548,7 +535,7 @@
 	}
 
 	/* Change volume table record */
-	memcpy(&vtbl_rec, &ubi->vtbl[vol_id], sizeof(struct ubi_vtbl_record));
+	vtbl_rec = ubi->vtbl[vol_id];
 	vtbl_rec.reserved_pebs = cpu_to_be32(reserved_pebs);
 	err = ubi_change_vtbl_record(ubi, vol_id, &vtbl_rec);
 	if (err)
@@ -563,15 +550,7 @@
 		spin_lock(&ubi->volumes_lock);
 		ubi->rsvd_pebs += pebs;
 		ubi->avail_pebs -= pebs;
-		pebs = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs;
-		if (pebs > 0) {
-			pebs = ubi->avail_pebs >= pebs ? pebs : ubi->avail_pebs;
-			ubi->avail_pebs -= pebs;
-			ubi->rsvd_pebs += pebs;
-			ubi->beb_rsvd_pebs += pebs;
-			if (pebs > 0)
-				ubi_msg("reserve more %d PEBs", pebs);
-		}
+		ubi_update_reserved(ubi);
 		for (i = 0; i < reserved_pebs; i++)
 			new_mapping[i] = vol->eba_tbl[i];
 		kfree(vol->eba_tbl);
@@ -588,8 +567,7 @@
 	}
 
 	ubi_volume_notify(ubi, vol, UBI_VOLUME_RESIZED);
-	if (paranoid_check_volumes(ubi))
-		dbg_err("check failed while re-sizing volume %d", vol_id);
+	self_check_volumes(ubi);
 	return err;
 
 out_acc:
@@ -638,8 +616,8 @@
 		}
 	}
 
-	if (!err && paranoid_check_volumes(ubi))
-		;
+	if (!err)
+		self_check_volumes(ubi);
 	return err;
 }
 
@@ -686,8 +664,7 @@
 		return err;
 	}
 
-	if (paranoid_check_volumes(ubi))
-		dbg_err("check failed while adding volume %d", vol_id);
+	self_check_volumes(ubi);
 	return err;
 
 out_cdev:
@@ -712,16 +689,14 @@
 	volume_sysfs_close(vol);
 }
 
-#ifdef CONFIG_MTD_UBI_DEBUG
-
 /**
- * paranoid_check_volume - check volume information.
+ * self_check_volume - check volume information.
  * @ubi: UBI device description object
  * @vol_id: volume ID
  *
  * Returns zero if the volume is all right and a negative error code if not.
  */
-static int paranoid_check_volume(struct ubi_device *ubi, int vol_id)
+static int self_check_volume(struct ubi_device *ubi, int vol_id)
 {
 	int idx = vol_id2idx(ubi, vol_id);
 	int reserved_pebs, alignment, data_pad, vol_type, name_len, upd_marker;
@@ -771,7 +746,7 @@
 	}
 
 	if (vol->upd_marker && vol->corrupted) {
-		dbg_err("update marker and corrupted simultaneously");
+		ubi_err("update marker and corrupted simultaneously");
 		goto fail;
 	}
 
@@ -853,34 +828,33 @@
 	return 0;
 
 fail:
-	ubi_err("paranoid check failed for volume %d", vol_id);
+	ubi_err("self-check failed for volume %d", vol_id);
 	if (vol)
-		ubi_dbg_dump_vol_info(vol);
-	ubi_dbg_dump_vtbl_record(&ubi->vtbl[vol_id], vol_id);
+		ubi_dump_vol_info(vol);
+	ubi_dump_vtbl_record(&ubi->vtbl[vol_id], vol_id);
 	dump_stack();
 	spin_unlock(&ubi->volumes_lock);
 	return -EINVAL;
 }
 
 /**
- * paranoid_check_volumes - check information about all volumes.
+ * self_check_volumes - check information about all volumes.
  * @ubi: UBI device description object
  *
  * Returns zero if volumes are all right and a negative error code if not.
  */
-static int paranoid_check_volumes(struct ubi_device *ubi)
+static int self_check_volumes(struct ubi_device *ubi)
 {
 	int i, err = 0;
 
-	if (!ubi->dbg->chk_gen)
+	if (!ubi_dbg_chk_gen(ubi))
 		return 0;
 
 	for (i = 0; i < ubi->vtbl_slots; i++) {
-		err = paranoid_check_volume(ubi, i);
+		err = self_check_volume(ubi, i);
 		if (err)
 			break;
 	}
 
 	return err;
 }
-#endif
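Both blocks removed from vmt.c above open-coded the same thing: topping up the bad-PEB reserve from the pool of available PEBs after a volume shrinks or disappears. The new ubi_update_reserved() helper declared in ubi.h presumably centralizes exactly that logic; the sketch below is reconstructed from the removed code, and the assumption that ubi->volumes_lock is already held by the caller (as it is at both call sites) is mine:

void ubi_update_reserved(struct ubi_device *ubi)
{
	int need = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs;

	if (need <= 0)
		return;

	/* never grab more than what is actually available */
	need = min(need, ubi->avail_pebs);
	ubi->avail_pebs -= need;
	ubi->rsvd_pebs += need;
	ubi->beb_rsvd_pebs += need;
	ubi_msg("reserve more %d PEBs for bad PEB handling", need);
}
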
diff -ur a/drivers/mtd/ubi/vtbl.c b/drivers/mtd/ubi/vtbl.c
--- a/drivers/mtd/ubi/vtbl.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/vtbl.c	2014-02-17 11:57:40.000000000 +0100
@@ -37,16 +37,15 @@
  * LEB 1. This scheme guarantees recoverability from unclean reboots.
  *
  * In this UBI implementation the on-flash volume table does not contain any
- * information about how many data static volumes contain. This information may
- * be found from the scanning data.
+ * information about how much data static volumes contain.
  *
  * But it would still be beneficial to store this information in the volume
  * table. For example, suppose we have a static volume X, and all its physical
  * eraseblocks became bad for some reasons. Suppose we are attaching the
- * corresponding MTD device, the scanning has found no logical eraseblocks
+ * corresponding MTD device and, for some reason, find no logical eraseblocks
  * corresponding to the volume X. According to the volume table volume X does
  * exist. So we don't know whether it is just empty or all its physical
- * eraseblocks went bad. So we cannot alarm the user about this corruption.
+ * eraseblocks went bad. So we cannot alarm the user properly.
  *
  * The volume table also stores so-called "update marker", which is used for
  * volume updates. Before updating the volume, the update marker is set, and
@@ -62,11 +61,7 @@
 #include <asm/div64.h>
 #include "ubi.h"
 
-#ifdef CONFIG_MTD_UBI_DEBUG
-static void paranoid_vtbl_check(const struct ubi_device *ubi);
-#else
-#define paranoid_vtbl_check(ubi)
-#endif
+static void self_vtbl_check(const struct ubi_device *ubi);
 
 /* Empty volume table record */
 static struct ubi_vtbl_record empty_vtbl_record;
@@ -106,12 +101,12 @@
 			return err;
 
 		err = ubi_eba_write_leb(ubi, layout_vol, i, ubi->vtbl, 0,
-					ubi->vtbl_size, UBI_LONGTERM);
+					ubi->vtbl_size);
 		if (err)
 			return err;
 	}
 
-	paranoid_vtbl_check(ubi);
+	self_vtbl_check(ubi);
 	return 0;
 }
 
@@ -158,7 +153,7 @@
 			return err;
 
 		err = ubi_eba_write_leb(ubi, layout_vol, i, ubi->vtbl, 0,
-					ubi->vtbl_size, UBI_LONGTERM);
+					ubi->vtbl_size);
 		if (err)
 			return err;
 	}
@@ -197,7 +192,7 @@
 		if (be32_to_cpu(vtbl[i].crc) != crc) {
 			ubi_err("bad CRC at record %u: %#08x, not %#08x",
 				 i, crc, be32_to_cpu(vtbl[i].crc));
-			ubi_dbg_dump_vtbl_record(&vtbl[i], i);
+			ubi_dump_vtbl_record(&vtbl[i], i);
 			return 1;
 		}
 
@@ -229,7 +224,7 @@
 
 		n = ubi->leb_size % alignment;
 		if (data_pad != n) {
-			dbg_err("bad data_pad, has to be %d", n);
+			ubi_err("bad data_pad, has to be %d", n);
 			err = 6;
 			goto bad;
 		}
@@ -245,7 +240,7 @@
 		}
 
 		if (reserved_pebs > ubi->good_peb_count) {
-			dbg_err("too large reserved_pebs %d, good PEBs %d",
+			ubi_err("too large reserved_pebs %d, good PEBs %d",
 				reserved_pebs, ubi->good_peb_count);
 			err = 9;
 			goto bad;
@@ -275,10 +270,10 @@
 
 			if (len1 > 0 && len1 == len2 &&
 			    !strncmp(vtbl[i].name, vtbl[n].name, len1)) {
-				ubi_err("volumes %d and %d have the same name"
-					" \"%s\"", i, n, vtbl[i].name);
-				ubi_dbg_dump_vtbl_record(&vtbl[i], i);
-				ubi_dbg_dump_vtbl_record(&vtbl[n], n);
+				ubi_err("volumes %d and %d have the same name \"%s\"",
+					i, n, vtbl[i].name);
+				ubi_dump_vtbl_record(&vtbl[i], i);
+				ubi_dump_vtbl_record(&vtbl[n], n);
 				return -EINVAL;
 			}
 		}
@@ -288,65 +283,64 @@
 
 bad:
 	ubi_err("volume table check failed: record %d, error %d", i, err);
-	ubi_dbg_dump_vtbl_record(&vtbl[i], i);
+	ubi_dump_vtbl_record(&vtbl[i], i);
 	return -EINVAL;
 }
 
 /**
  * create_vtbl - create a copy of volume table.
  * @ubi: UBI device description object
- * @si: scanning information
+ * @ai: attaching information
  * @copy: number of the volume table copy
  * @vtbl: contents of the volume table
  *
  * This function returns zero in case of success and a negative error code in
  * case of failure.
  */
-static int create_vtbl(struct ubi_device *ubi, struct ubi_scan_info *si,
+static int create_vtbl(struct ubi_device *ubi, struct ubi_attach_info *ai,
 		       int copy, void *vtbl)
 {
 	int err, tries = 0;
 	struct ubi_vid_hdr *vid_hdr;
-	struct ubi_scan_leb *new_seb;
+	struct ubi_ainf_peb *new_aeb;
 
-	ubi_msg("create volume table (copy #%d)", copy + 1);
+	dbg_gen("create volume table (copy #%d)", copy + 1);
 
 	vid_hdr = ubi_zalloc_vid_hdr(ubi, GFP_KERNEL);
 	if (!vid_hdr)
 		return -ENOMEM;
 
 retry:
-	new_seb = ubi_scan_get_free_peb(ubi, si);
-	if (IS_ERR(new_seb)) {
-		err = PTR_ERR(new_seb);
+	new_aeb = ubi_early_get_peb(ubi, ai);
+	if (IS_ERR(new_aeb)) {
+		err = PTR_ERR(new_aeb);
 		goto out_free;
 	}
 
-	vid_hdr->vol_type = UBI_VID_DYNAMIC;
+	vid_hdr->vol_type = UBI_LAYOUT_VOLUME_TYPE;
 	vid_hdr->vol_id = cpu_to_be32(UBI_LAYOUT_VOLUME_ID);
 	vid_hdr->compat = UBI_LAYOUT_VOLUME_COMPAT;
 	vid_hdr->data_size = vid_hdr->used_ebs =
 			     vid_hdr->data_pad = cpu_to_be32(0);
 	vid_hdr->lnum = cpu_to_be32(copy);
-	vid_hdr->sqnum = cpu_to_be64(++si->max_sqnum);
+	vid_hdr->sqnum = cpu_to_be64(++ai->max_sqnum);
 
 	/* The EC header is already there, write the VID header */
-	err = ubi_io_write_vid_hdr(ubi, new_seb->pnum, vid_hdr);
+	err = ubi_io_write_vid_hdr(ubi, new_aeb->pnum, vid_hdr);
 	if (err)
 		goto write_error;
 
 	/* Write the layout volume contents */
-	err = ubi_io_write_data(ubi, vtbl, new_seb->pnum, 0, ubi->vtbl_size);
+	err = ubi_io_write_data(ubi, vtbl, new_aeb->pnum, 0, ubi->vtbl_size);
 	if (err)
 		goto write_error;
 
 	/*
-	 * And add it to the scanning information. Don't delete the old version
-	 * of this LEB as it will be deleted and freed in 'ubi_scan_add_used()'.
+	 * And add it to the attaching information. Don't delete the old version
+	 * of this LEB as it will be deleted and freed in 'ubi_add_to_av()'.
 	 */
-	err = ubi_scan_add_used(ubi, si, new_seb->pnum, new_seb->ec,
-				vid_hdr, 0);
-	kmem_cache_free(si->scan_leb_slab, new_seb);
+	err = ubi_add_to_av(ubi, ai, new_aeb->pnum, new_aeb->ec, vid_hdr, 0);
+	kmem_cache_free(ai->aeb_slab_cache, new_aeb);
 	ubi_free_vid_hdr(ubi, vid_hdr);
 	return err;
 
@@ -356,10 +350,10 @@
 		 * Probably this physical eraseblock went bad, try to pick
 		 * another one.
 		 */
-		list_add(&new_seb->u.list, &si->erase);
+		list_add(&new_aeb->u.list, &ai->erase);
 		goto retry;
 	}
-	kmem_cache_free(si->scan_leb_slab, new_seb);
+	kmem_cache_free(ai->aeb_slab_cache, new_aeb);
 out_free:
 	ubi_free_vid_hdr(ubi, vid_hdr);
 	return err;
@@ -369,20 +363,20 @@
 /**
  * process_lvol - process the layout volume.
  * @ubi: UBI device description object
- * @si: scanning information
- * @sv: layout volume scanning information
+ * @ai: attaching information
+ * @av: layout volume attaching information
  *
  * This function is responsible for reading the layout volume, ensuring it is
  * not corrupted, and recovering from corruptions if needed. Returns volume
  * table in case of success and a negative error code in case of failure.
  */
 static struct ubi_vtbl_record *process_lvol(struct ubi_device *ubi,
-					    struct ubi_scan_info *si,
-					    struct ubi_scan_volume *sv)
+					    struct ubi_attach_info *ai,
+					    struct ubi_ainf_volume *av)
 {
 	int err;
 	struct rb_node *rb;
-	struct ubi_scan_leb *seb;
+	struct ubi_ainf_peb *aeb;
 	struct ubi_vtbl_record *leb[UBI_LAYOUT_VOLUME_EBS] = { NULL, NULL };
 	int leb_corrupted[UBI_LAYOUT_VOLUME_EBS] = {1, 1};
 
@@ -414,14 +408,14 @@
 	dbg_gen("check layout volume");
 
 	/* Read both LEB 0 and LEB 1 into memory */
-	ubi_rb_for_each_entry(rb, seb, &sv->root, u.rb) {
-		leb[seb->lnum] = vzalloc(ubi->vtbl_size);
-		if (!leb[seb->lnum]) {
+	ubi_rb_for_each_entry(rb, aeb, &av->root, u.rb) {
+		leb[aeb->lnum] = vzalloc(ubi->vtbl_size);
+		if (!leb[aeb->lnum]) {
 			err = -ENOMEM;
 			goto out_free;
 		}
 
-		err = ubi_io_read_data(ubi, leb[seb->lnum], seb->pnum, 0,
+		err = ubi_io_read_data(ubi, leb[aeb->lnum], aeb->pnum, 0,
 				       ubi->vtbl_size);
 		if (err == UBI_IO_BITFLIPS || mtd_is_eccerr(err))
 			/*
@@ -429,12 +423,12 @@
 			 * uncorrectable ECC error, but we have our own CRC and
 			 * the data will be checked later. If the data is OK,
 			 * the PEB will be scrubbed (because we set
-			 * seb->scrub). If the data is not OK, the contents of
+			 * aeb->scrub). If the data is not OK, the contents of
 			 * the PEB will be recovered from the second copy, and
-			 * seb->scrub will be cleared in
-			 * 'ubi_scan_add_used()'.
+			 * aeb->scrub will be cleared in
+			 * 'ubi_add_to_av()'.
 			 */
-			seb->scrub = 1;
+			aeb->scrub = 1;
 		else if (err)
 			goto out_free;
 	}
@@ -453,7 +447,7 @@
 						  ubi->vtbl_size);
 		if (leb_corrupted[1]) {
 			ubi_warn("volume table copy #2 is corrupted");
-			err = create_vtbl(ubi, si, 1, leb[0]);
+			err = create_vtbl(ubi, ai, 1, leb[0]);
 			if (err)
 				goto out_free;
 			ubi_msg("volume table was restored");
@@ -476,7 +470,7 @@
 		}
 
 		ubi_warn("volume table copy #1 is corrupted");
-		err = create_vtbl(ubi, si, 0, leb[1]);
+		err = create_vtbl(ubi, ai, 0, leb[1]);
 		if (err)
 			goto out_free;
 		ubi_msg("volume table was restored");
@@ -494,13 +488,13 @@
 /**
  * create_empty_lvol - create empty layout volume.
  * @ubi: UBI device description object
- * @si: scanning information
+ * @ai: attaching information
  *
  * This function returns volume table contents in case of success and a
  * negative error code in case of failure.
  */
 static struct ubi_vtbl_record *create_empty_lvol(struct ubi_device *ubi,
-						 struct ubi_scan_info *si)
+						 struct ubi_attach_info *ai)
 {
 	int i;
 	struct ubi_vtbl_record *vtbl;
@@ -515,7 +509,7 @@
 	for (i = 0; i < UBI_LAYOUT_VOLUME_EBS; i++) {
 		int err;
 
-		err = create_vtbl(ubi, si, i, vtbl);
+		err = create_vtbl(ubi, ai, i, vtbl);
 		if (err) {
 			vfree(vtbl);
 			return ERR_PTR(err);
@@ -528,18 +522,19 @@
 /**
  * init_volumes - initialize volume information for existing volumes.
  * @ubi: UBI device description object
- * @si: scanning information
+ * @ai: attaching information
  * @vtbl: volume table
  *
  * This function allocates volume description objects for existing volumes.
  * Returns zero in case of success and a negative error code in case of
  * failure.
  */
-static int init_volumes(struct ubi_device *ubi, const struct ubi_scan_info *si,
+static int init_volumes(struct ubi_device *ubi,
+			const struct ubi_attach_info *ai,
 			const struct ubi_vtbl_record *vtbl)
 {
 	int i, reserved_pebs = 0;
-	struct ubi_scan_volume *sv;
+	struct ubi_ainf_volume *av;
 	struct ubi_volume *vol;
 
 	for (i = 0; i < ubi->vtbl_slots; i++) {
@@ -567,8 +562,8 @@
 		if (vtbl[i].flags & UBI_VTBL_AUTORESIZE_FLG) {
 			/* Auto re-size flag may be set only for one volume */
 			if (ubi->autoresize_vol_id != -1) {
-				ubi_err("more than one auto-resize volume (%d "
-					"and %d)", ubi->autoresize_vol_id, i);
+				ubi_err("more than one auto-resize volume (%d and %d)",
+					ubi->autoresize_vol_id, i);
 				kfree(vol);
 				return -EINVAL;
 			}
@@ -595,8 +590,8 @@
 		}
 
 		/* Static volumes only */
-		sv = ubi_scan_find_sv(si, i);
-		if (!sv) {
+		av = ubi_find_av(ai, i);
+		if (!av) {
 			/*
 			 * No eraseblocks belonging to this volume found. We
 			 * don't actually know whether this static volume is
@@ -608,22 +603,22 @@
 			continue;
 		}
 
-		if (sv->leb_count != sv->used_ebs) {
+		if (av->leb_count != av->used_ebs) {
 			/*
 			 * We found a static volume which misses several
 			 * eraseblocks. Treat it as corrupted.
 			 */
 			ubi_warn("static volume %d misses %d LEBs - corrupted",
-				 sv->vol_id, sv->used_ebs - sv->leb_count);
+				 av->vol_id, av->used_ebs - av->leb_count);
 			vol->corrupted = 1;
 			continue;
 		}
 
-		vol->used_ebs = sv->used_ebs;
+		vol->used_ebs = av->used_ebs;
 		vol->used_bytes =
 			(long long)(vol->used_ebs - 1) * vol->usable_leb_size;
-		vol->used_bytes += sv->last_data_size;
-		vol->last_eb_bytes = sv->last_data_size;
+		vol->used_bytes += av->last_data_size;
+		vol->last_eb_bytes = av->last_data_size;
 	}
 
 	/* And add the layout volume */
@@ -632,7 +627,7 @@
 		return -ENOMEM;
 
 	vol->reserved_pebs = UBI_LAYOUT_VOLUME_EBS;
-	vol->alignment = 1;
+	vol->alignment = UBI_LAYOUT_VOLUME_ALIGN;
 	vol->vol_type = UBI_DYNAMIC_VOLUME;
 	vol->name_len = sizeof(UBI_LAYOUT_VOLUME_NAME) - 1;
 	memcpy(vol->name, UBI_LAYOUT_VOLUME_NAME, vol->name_len + 1);
@@ -664,105 +659,104 @@
 }
 
 /**
- * check_sv - check volume scanning information.
+ * check_av - check volume attaching information.
  * @vol: UBI volume description object
- * @sv: volume scanning information
+ * @av: volume attaching information
  *
- * This function returns zero if the volume scanning information is consistent
+ * This function returns zero if the volume attaching information is consistent
  * with the data read from the volume table, and %-EINVAL if not.
  */
-static int check_sv(const struct ubi_volume *vol,
-		    const struct ubi_scan_volume *sv)
+static int check_av(const struct ubi_volume *vol,
+		    const struct ubi_ainf_volume *av)
 {
 	int err;
 
-	if (sv->highest_lnum >= vol->reserved_pebs) {
+	if (av->highest_lnum >= vol->reserved_pebs) {
 		err = 1;
 		goto bad;
 	}
-	if (sv->leb_count > vol->reserved_pebs) {
+	if (av->leb_count > vol->reserved_pebs) {
 		err = 2;
 		goto bad;
 	}
-	if (sv->vol_type != vol->vol_type) {
+	if (av->vol_type != vol->vol_type) {
 		err = 3;
 		goto bad;
 	}
-	if (sv->used_ebs > vol->reserved_pebs) {
+	if (av->used_ebs > vol->reserved_pebs) {
 		err = 4;
 		goto bad;
 	}
-	if (sv->data_pad != vol->data_pad) {
+	if (av->data_pad != vol->data_pad) {
 		err = 5;
 		goto bad;
 	}
 	return 0;
 
 bad:
-	ubi_err("bad scanning information, error %d", err);
-	ubi_dbg_dump_sv(sv);
-	ubi_dbg_dump_vol_info(vol);
+	ubi_err("bad attaching information, error %d", err);
+	ubi_dump_av(av);
+	ubi_dump_vol_info(vol);
 	return -EINVAL;
 }
 
 /**
- * check_scanning_info - check that scanning information.
+ * check_attaching_info - check the attaching information.
  * @ubi: UBI device description object
- * @si: scanning information
+ * @ai: attaching information
  *
  * Even though we protect on-flash data by CRC checksums, we still don't trust
- * the media. This function ensures that scanning information is consistent to
- * the information read from the volume table. Returns zero if the scanning
+ * the media. This function ensures that attaching information is consistent with
+ * the information read from the volume table. Returns zero if the attaching
  * information is OK and %-EINVAL if it is not.
  */
-static int check_scanning_info(const struct ubi_device *ubi,
-			       struct ubi_scan_info *si)
+static int check_attaching_info(const struct ubi_device *ubi,
+			       struct ubi_attach_info *ai)
 {
 	int err, i;
-	struct ubi_scan_volume *sv;
+	struct ubi_ainf_volume *av;
 	struct ubi_volume *vol;
 
-	if (si->vols_found > UBI_INT_VOL_COUNT + ubi->vtbl_slots) {
-		ubi_err("scanning found %d volumes, maximum is %d + %d",
-			si->vols_found, UBI_INT_VOL_COUNT, ubi->vtbl_slots);
+	if (ai->vols_found > UBI_INT_VOL_COUNT + ubi->vtbl_slots) {
+		ubi_err("found %d volumes while attaching, maximum is %d + %d",
+			ai->vols_found, UBI_INT_VOL_COUNT, ubi->vtbl_slots);
 		return -EINVAL;
 	}
 
-	if (si->highest_vol_id >= ubi->vtbl_slots + UBI_INT_VOL_COUNT &&
-	    si->highest_vol_id < UBI_INTERNAL_VOL_START) {
-		ubi_err("too large volume ID %d found by scanning",
-			si->highest_vol_id);
+	if (ai->highest_vol_id >= ubi->vtbl_slots + UBI_INT_VOL_COUNT &&
+	    ai->highest_vol_id < UBI_INTERNAL_VOL_START) {
+		ubi_err("too large volume ID %d found", ai->highest_vol_id);
 		return -EINVAL;
 	}
 
 	for (i = 0; i < ubi->vtbl_slots + UBI_INT_VOL_COUNT; i++) {
 		cond_resched();
 
-		sv = ubi_scan_find_sv(si, i);
+		av = ubi_find_av(ai, i);
 		vol = ubi->volumes[i];
 		if (!vol) {
-			if (sv)
-				ubi_scan_rm_volume(si, sv);
+			if (av)
+				ubi_remove_av(ai, av);
 			continue;
 		}
 
 		if (vol->reserved_pebs == 0) {
 			ubi_assert(i < ubi->vtbl_slots);
 
-			if (!sv)
+			if (!av)
 				continue;
 
 			/*
-			 * During scanning we found a volume which does not
+			 * During attaching we found a volume which does not
 			 * exist according to the information in the volume
 			 * table. This must have happened due to an unclean
 			 * reboot while the volume was being removed. Discard
 			 * these eraseblocks.
 			 */
-			ubi_msg("finish volume %d removal", sv->vol_id);
-			ubi_scan_rm_volume(si, sv);
-		} else if (sv) {
-			err = check_sv(vol, sv);
+			ubi_msg("finish volume %d removal", av->vol_id);
+			ubi_remove_av(ai, av);
+		} else if (av) {
+			err = check_av(vol, av);
 			if (err)
 				return err;
 		}
@@ -774,16 +768,16 @@
 /**
  * ubi_read_volume_table - read the volume table.
  * @ubi: UBI device description object
- * @si: scanning information
+ * @ai: attaching information
  *
  * This function reads volume table, checks it, recover from errors if needed,
  * or creates it if needed. Returns zero in case of success and a negative
  * error code in case of failure.
  */
-int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_scan_info *si)
+int ubi_read_volume_table(struct ubi_device *ubi, struct ubi_attach_info *ai)
 {
 	int i, err;
-	struct ubi_scan_volume *sv;
+	struct ubi_ainf_volume *av;
 
 	empty_vtbl_record.crc = cpu_to_be32(0xf116c36b);
 
@@ -798,8 +792,8 @@
 	ubi->vtbl_size = ubi->vtbl_slots * UBI_VTBL_RECORD_SIZE;
 	ubi->vtbl_size = ALIGN(ubi->vtbl_size, ubi->min_io_size);
 
-	sv = ubi_scan_find_sv(si, UBI_LAYOUT_VOLUME_ID);
-	if (!sv) {
+	av = ubi_find_av(ai, UBI_LAYOUT_VOLUME_ID);
+	if (!av) {
 		/*
 		 * No logical eraseblocks belonging to the layout volume were
 		 * found. This could mean that the flash is just empty. In
@@ -808,8 +802,8 @@
 		 * But if flash is not empty this must be a corruption or the
 		 * MTD device just contains garbage.
 		 */
-		if (si->is_empty) {
-			ubi->vtbl = create_empty_lvol(ubi, si);
+		if (ai->is_empty) {
+			ubi->vtbl = create_empty_lvol(ubi, ai);
 			if (IS_ERR(ubi->vtbl))
 				return PTR_ERR(ubi->vtbl);
 		} else {
@@ -817,14 +811,14 @@
 			return -EINVAL;
 		}
 	} else {
-		if (sv->leb_count > UBI_LAYOUT_VOLUME_EBS) {
+		if (av->leb_count > UBI_LAYOUT_VOLUME_EBS) {
 			/* This must not happen with proper UBI images */
-			dbg_err("too many LEBs (%d) in layout volume",
-				sv->leb_count);
+			ubi_err("too many LEBs (%d) in layout volume",
+				av->leb_count);
 			return -EINVAL;
 		}
 
-		ubi->vtbl = process_lvol(ubi, si, sv);
+		ubi->vtbl = process_lvol(ubi, ai, av);
 		if (IS_ERR(ubi->vtbl))
 			return PTR_ERR(ubi->vtbl);
 	}
@@ -835,15 +829,15 @@
 	 * The layout volume is OK, initialize the corresponding in-RAM data
 	 * structures.
 	 */
-	err = init_volumes(ubi, si, ubi->vtbl);
+	err = init_volumes(ubi, ai, ubi->vtbl);
 	if (err)
 		goto out_free;
 
 	/*
-	 * Make sure that the scanning information is consistent to the
+	 * Make sure that the attaching information is consistent to the
 	 * information stored in the volume table.
 	 */
-	err = check_scanning_info(ubi, si);
+	err = check_attaching_info(ubi, ai);
 	if (err)
 		goto out_free;
 
@@ -858,21 +852,17 @@
 	return err;
 }
 
-#ifdef CONFIG_MTD_UBI_DEBUG
-
 /**
- * paranoid_vtbl_check - check volume table.
+ * self_vtbl_check - check volume table.
  * @ubi: UBI device description object
  */
-static void paranoid_vtbl_check(const struct ubi_device *ubi)
+static void self_vtbl_check(const struct ubi_device *ubi)
 {
-	if (!ubi->dbg->chk_gen)
+	if (!ubi_dbg_chk_gen(ubi))
 		return;
 
 	if (vtbl_check(ubi, ubi->vtbl)) {
-		ubi_err("paranoid check failed");
+		ubi_err("self-check failed");
 		BUG();
 	}
 }
-
-#endif /* CONFIG_MTD_UBI_DEBUG */
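The "bad CRC at record" path in vtbl_check() above compares a stored big-endian CRC against one computed over everything in the record except the CRC field itself. A minimal sketch of that per-record check, assuming the usual UBI_CRC32_INIT seed and UBI_VTBL_RECORD_SIZE_CRC length from the UBI headers (and linux/crc32.h for crc32()):

static int vtbl_record_crc_ok(const struct ubi_vtbl_record *rec)
{
	u32 crc;

	/* the CRC covers the record up to, but not including, the crc field */
	crc = crc32(UBI_CRC32_INIT, rec, UBI_VTBL_RECORD_SIZE_CRC);

	return crc == be32_to_cpu(rec->crc);
}
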
diff -ur a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
--- a/drivers/mtd/ubi/wl.c	2013-08-24 11:37:05.000000000 +0200
+++ b/drivers/mtd/ubi/wl.c	2014-02-17 11:57:40.000000000 +0100
@@ -1,5 +1,4 @@
 /*
- * @ubi: UBI device description object
  * Copyright (c) International Business Machines Corp., 2006
  *
  * This program is free software; you can redistribute it and/or modify
@@ -41,12 +40,6 @@
  * physical eraseblocks with low erase counter to free physical eraseblocks
  * with high erase counter.
  *
- * The 'ubi_wl_get_peb()' function accepts data type hints which help to pick
- * an "optimal" physical eraseblock. For example, when it is known that the
- * physical eraseblock will be "put" soon because it contains short-term data,
- * the WL sub-system may pick a free physical eraseblock with low erase
- * counter, and so forth.
- *
  * If the WL sub-system fails to erase a physical eraseblock, it marks it as
  * bad.
  *
@@ -70,8 +63,7 @@
  *    to the user; instead, we first want to let users fill them up with data;
  *
  *  o there is a chance that the user will put the physical eraseblock very
- *    soon, so it makes sense not to move it for some time, but wait; this is
- *    especially important in case of "short term" physical eraseblocks.
+ *    soon, so it makes sense not to move it for some time, but wait.
  *
  * Physical eraseblocks stay protected only for limited time. But the "time" is
  * measured in erase cycles in this case. This is implemented with help of the
@@ -142,37 +134,46 @@
  */
 #define WL_MAX_FAILURES 32
 
+static int self_check_ec(struct ubi_device *ubi, int pnum, int ec);
+static int self_check_in_wl_tree(const struct ubi_device *ubi,
+				 struct ubi_wl_entry *e, struct rb_root *root);
+static int self_check_in_pq(const struct ubi_device *ubi,
+			    struct ubi_wl_entry *e);
+
+#ifdef CONFIG_MTD_UBI_FASTMAP
 /**
- * struct ubi_work - UBI work description data structure.
- * @list: a link in the list of pending works
- * @func: worker function
- * @e: physical eraseblock to erase
- * @torture: if the physical eraseblock has to be tortured
- *
- * The @func pointer points to the worker function. If the @cancel argument is
- * not zero, the worker has to free the resources and exit immediately. The
- * worker has to return zero in case of success and a negative error code in
- * case of failure.
+ * update_fastmap_work_fn - calls ubi_update_fastmap from a work queue
+ * @wrk: the work description object
  */
-struct ubi_work {
-	struct list_head list;
-	int (*func)(struct ubi_device *ubi, struct ubi_work *wrk, int cancel);
-	/* The below fields are only relevant to erasure works */
-	struct ubi_wl_entry *e;
-	int torture;
-};
+static void update_fastmap_work_fn(struct work_struct *wrk)
+{
+	struct ubi_device *ubi = container_of(wrk, struct ubi_device, fm_work);
+	ubi_update_fastmap(ubi);
+}
+
+/**
+ *  ubi_is_fm_block - returns 1 if a PEB is currently used in a fastmap.
+ *  @ubi: UBI device description object
+ *  @pnum: the PEB to be checked
+ */
+static int ubi_is_fm_block(struct ubi_device *ubi, int pnum)
+{
+	int i;
 
-#ifdef CONFIG_MTD_UBI_DEBUG
-static int paranoid_check_ec(struct ubi_device *ubi, int pnum, int ec);
-static int paranoid_check_in_wl_tree(const struct ubi_device *ubi,
-				     struct ubi_wl_entry *e,
-				     struct rb_root *root);
-static int paranoid_check_in_pq(const struct ubi_device *ubi,
-				struct ubi_wl_entry *e);
+	if (!ubi->fm)
+		return 0;
+
+	for (i = 0; i < ubi->fm->used_blocks; i++)
+		if (ubi->fm->e[i]->pnum == pnum)
+			return 1;
+
+	return 0;
+}
 #else
-#define paranoid_check_ec(ubi, pnum, ec) 0
-#define paranoid_check_in_wl_tree(ubi, e, root)
-#define paranoid_check_in_pq(ubi, e) 0
+static int ubi_is_fm_block(struct ubi_device *ubi, int pnum)
+{
+	return 0;
+}
 #endif
 
 /**
@@ -271,18 +272,16 @@
 {
 	int err;
 
-	spin_lock(&ubi->wl_lock);
 	while (!ubi->free.rb_node) {
 		spin_unlock(&ubi->wl_lock);
 
 		dbg_wl("do one work synchronously");
 		err = do_work(ubi);
-		if (err)
-			return err;
 
 		spin_lock(&ubi->wl_lock);
+		if (err)
+			return err;
 	}
-	spin_unlock(&ubi->wl_lock);
 
 	return 0;
 }
@@ -349,19 +348,22 @@
 
 /**
  * find_wl_entry - find wear-leveling entry closest to certain erase counter.
+ * @ubi: UBI device description object
  * @root: the RB-tree where to look for
- * @max: highest possible erase counter
+ * @diff: maximum possible difference from the smallest erase counter
  *
  * This function looks for a wear leveling entry with erase counter closest to
- * @max and less than @max.
+ * min + @diff, where min is the smallest erase counter.
  */
-static struct ubi_wl_entry *find_wl_entry(struct rb_root *root, int max)
+static struct ubi_wl_entry *find_wl_entry(struct ubi_device *ubi,
+					  struct rb_root *root, int diff)
 {
 	struct rb_node *p;
-	struct ubi_wl_entry *e;
+	struct ubi_wl_entry *e, *prev_e = NULL;
+	int max;
 
 	e = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb);
-	max += e->ec;
+	max = e->ec + diff;
 
 	p = root->rb_node;
 	while (p) {
@@ -372,39 +374,143 @@
 			p = p->rb_left;
 		else {
 			p = p->rb_right;
+			prev_e = e;
 			e = e1;
 		}
 	}
 
+	/* If no fastmap has been written and this WL entry can be used
+	 * as anchor PEB, hold it back and return the second best WL entry
+	 * such that fastmap can use the anchor PEB later. */
+	if (prev_e && !ubi->fm_disabled &&
+	    !ubi->fm && e->pnum < UBI_FM_MAX_START)
+		return prev_e;
+
 	return e;
 }
 
 /**
- * ubi_wl_get_peb - get a physical eraseblock.
+ * find_mean_wl_entry - find wear-leveling entry with medium erase counter.
  * @ubi: UBI device description object
- * @dtype: type of data which will be stored in this physical eraseblock
+ * @root: the RB-tree where to look for
  *
- * This function returns a physical eraseblock in case of success and a
- * negative error code in case of failure. Might sleep.
+ * This function looks for a wear leveling entry with medium erase counter,
+ * but not greater than or equal to the lowest erase counter plus
+ * %WL_FREE_MAX_DIFF/2.
  */
-int ubi_wl_get_peb(struct ubi_device *ubi, int dtype)
+static struct ubi_wl_entry *find_mean_wl_entry(struct ubi_device *ubi,
+					       struct rb_root *root)
 {
-	int err;
 	struct ubi_wl_entry *e, *first, *last;
 
-	ubi_assert(dtype == UBI_LONGTERM || dtype == UBI_SHORTTERM ||
-		   dtype == UBI_UNKNOWN);
+	first = rb_entry(rb_first(root), struct ubi_wl_entry, u.rb);
+	last = rb_entry(rb_last(root), struct ubi_wl_entry, u.rb);
+
+	if (last->ec - first->ec < WL_FREE_MAX_DIFF) {
+		e = rb_entry(root->rb_node, struct ubi_wl_entry, u.rb);
+
+#ifdef CONFIG_MTD_UBI_FASTMAP
+		/* If no fastmap has been written and this WL entry can be used
+		 * as anchor PEB, hold it back and return the second best
+		 * WL entry such that fastmap can use the anchor PEB later. */
+		if (e && !ubi->fm_disabled && !ubi->fm &&
+		    e->pnum < UBI_FM_MAX_START)
+			e = rb_entry(rb_next(root->rb_node),
+				     struct ubi_wl_entry, u.rb);
+#endif
+	} else
+		e = find_wl_entry(ubi, root, WL_FREE_MAX_DIFF/2);
+
+	return e;
+}
+
+#ifdef CONFIG_MTD_UBI_FASTMAP
+/**
+ * find_anchor_wl_entry - find wear-leveling entry to be used as anchor PEB.
+ * @root: the RB-tree where to look for
+ */
+static struct ubi_wl_entry *find_anchor_wl_entry(struct rb_root *root)
+{
+	struct rb_node *p;
+	struct ubi_wl_entry *e, *victim = NULL;
+	int max_ec = UBI_MAX_ERASECOUNTER;
+
+	ubi_rb_for_each_entry(p, e, root, u.rb) {
+		if (e->pnum < UBI_FM_MAX_START && e->ec < max_ec) {
+			victim = e;
+			max_ec = e->ec;
+		}
+	}
+
+	return victim;
+}
+
+static int anchor_pebs_avalible(struct rb_root *root)
+{
+	struct rb_node *p;
+	struct ubi_wl_entry *e;
+
+	ubi_rb_for_each_entry(p, e, root, u.rb)
+		if (e->pnum < UBI_FM_MAX_START)
+			return 1;
+
+	return 0;
+}
+
+/**
+ * ubi_wl_get_fm_peb - find a physical erase block with a given maximal number.
+ * @ubi: UBI device description object
+ * @anchor: This PEB will be used as anchor PEB by fastmap
+ *
+ * The function returns a physical erase block with a given maximal number
+ * and removes it from the wl subsystem.
+ * Must be called with wl_lock held!
+ */
+struct ubi_wl_entry *ubi_wl_get_fm_peb(struct ubi_device *ubi, int anchor)
+{
+	struct ubi_wl_entry *e = NULL;
+
+	if (!ubi->free.rb_node || (ubi->free_count - ubi->beb_rsvd_pebs < 1))
+		goto out;
+
+	if (anchor)
+		e = find_anchor_wl_entry(&ubi->free);
+	else
+		e = find_mean_wl_entry(ubi, &ubi->free);
+
+	if (!e)
+		goto out;
+
+	self_check_in_wl_tree(ubi, e, &ubi->free);
+
+	/* remove it from the free list,
+	 * the wl subsystem no longer knows this erase block */
+	rb_erase(&e->u.rb, &ubi->free);
+	ubi->free_count--;
+out:
+	return e;
+}
+#endif
+
+/**
+ * __wl_get_peb - get a physical eraseblock.
+ * @ubi: UBI device description object
+ *
+ * This function returns a physical eraseblock in case of success and a
+ * negative error code in case of failure.
+ */
+static int __wl_get_peb(struct ubi_device *ubi)
+{
+	int err;
+	struct ubi_wl_entry *e;
 
 retry:
-	spin_lock(&ubi->wl_lock);
 	if (!ubi->free.rb_node) {
 		if (ubi->works_count == 0) {
-			ubi_assert(list_empty(&ubi->works));
 			ubi_err("no free eraseblocks");
-			spin_unlock(&ubi->wl_lock);
+			ubi_assert(list_empty(&ubi->works));
 			return -ENOSPC;
 		}
-		spin_unlock(&ubi->wl_lock);
 
 		err = produce_free_peb(ubi);
 		if (err < 0)
@@ -412,64 +518,186 @@
 		goto retry;
 	}
 
-	switch (dtype) {
-	case UBI_LONGTERM:
-		/*
-		 * For long term data we pick a physical eraseblock with high
-		 * erase counter. But the highest erase counter we can pick is
-		 * bounded by the the lowest erase counter plus
-		 * %WL_FREE_MAX_DIFF.
-		 */
-		e = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
-		break;
-	case UBI_UNKNOWN:
-		/*
-		 * For unknown data we pick a physical eraseblock with medium
-		 * erase counter. But we by no means can pick a physical
-		 * eraseblock with erase counter greater or equivalent than the
-		 * lowest erase counter plus %WL_FREE_MAX_DIFF/2.
-		 */
-		first = rb_entry(rb_first(&ubi->free), struct ubi_wl_entry,
-					u.rb);
-		last = rb_entry(rb_last(&ubi->free), struct ubi_wl_entry, u.rb);
-
-		if (last->ec - first->ec < WL_FREE_MAX_DIFF)
-			e = rb_entry(ubi->free.rb_node,
-					struct ubi_wl_entry, u.rb);
-		else
-			e = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF/2);
-		break;
-	case UBI_SHORTTERM:
-		/*
-		 * For short term data we pick a physical eraseblock with the
-		 * lowest erase counter as we expect it will be erased soon.
-		 */
-		e = rb_entry(rb_first(&ubi->free), struct ubi_wl_entry, u.rb);
-		break;
-	default:
-		BUG();
+	e = find_mean_wl_entry(ubi, &ubi->free);
+	if (!e) {
+		ubi_err("no free eraseblocks");
+		return -ENOSPC;
 	}
 
-	paranoid_check_in_wl_tree(ubi, e, &ubi->free);
+	self_check_in_wl_tree(ubi, e, &ubi->free);
 
 	/*
 	 * Move the physical eraseblock to the protection queue where it will
 	 * be protected from being moved for some time.
 	 */
 	rb_erase(&e->u.rb, &ubi->free);
+	ubi->free_count--;
 	dbg_wl("PEB %d EC %d", e->pnum, e->ec);
+#ifndef CONFIG_MTD_UBI_FASTMAP
+	/* We only have to enqueue e if fastmap is disabled;
+	 * if fastmap is enabled, prot_queue_add() will be called by
+	 * ubi_wl_get_peb() after removing e from the pool. */
 	prot_queue_add(ubi, e);
+#endif
+	return e->pnum;
+}
+
+#ifdef CONFIG_MTD_UBI_FASTMAP
+/**
+ * return_unused_pool_pebs - returns unused PEBs to the free tree.
+ * @ubi: UBI device description object
+ * @pool: fastmap pool description object
+ */
+static void return_unused_pool_pebs(struct ubi_device *ubi,
+				    struct ubi_fm_pool *pool)
+{
+	int i;
+	struct ubi_wl_entry *e;
+
+	for (i = pool->used; i < pool->size; i++) {
+		e = ubi->lookuptbl[pool->pebs[i]];
+		wl_tree_add(e, &ubi->free);
+		ubi->free_count++;
+	}
+}
+
+/**
+ * refill_wl_pool - refills the fastmap pool used by the
+ * WL sub-system.
+ * @ubi: UBI device description object
+ */
+static void refill_wl_pool(struct ubi_device *ubi)
+{
+	struct ubi_wl_entry *e;
+	struct ubi_fm_pool *pool = &ubi->fm_wl_pool;
+
+	return_unused_pool_pebs(ubi, pool);
+
+	for (pool->size = 0; pool->size < pool->max_size; pool->size++) {
+		if (!ubi->free.rb_node ||
+		   (ubi->free_count - ubi->beb_rsvd_pebs < 5))
+			break;
+
+		e = find_wl_entry(ubi, &ubi->free, WL_FREE_MAX_DIFF);
+		self_check_in_wl_tree(ubi, e, &ubi->free);
+		rb_erase(&e->u.rb, &ubi->free);
+		ubi->free_count--;
+
+		pool->pebs[pool->size] = e->pnum;
+	}
+	pool->used = 0;
+}
+
+/**
+ * refill_wl_user_pool - refills the fastmap pool used by ubi_wl_get_peb.
+ * @ubi: UBI device description object
+ */
+static void refill_wl_user_pool(struct ubi_device *ubi)
+{
+	struct ubi_fm_pool *pool = &ubi->fm_pool;
+
+	return_unused_pool_pebs(ubi, pool);
+
+	for (pool->size = 0; pool->size < pool->max_size; pool->size++) {
+		if (!ubi->free.rb_node ||
+		   (ubi->free_count - ubi->beb_rsvd_pebs < 1))
+			break;
+
+		pool->pebs[pool->size] = __wl_get_peb(ubi);
+		if (pool->pebs[pool->size] < 0)
+			break;
+	}
+	pool->used = 0;
+}
+
+/**
+ * ubi_refill_pools - refills all fastmap PEB pools.
+ * @ubi: UBI device description object
+ */
+void ubi_refill_pools(struct ubi_device *ubi)
+{
+	spin_lock(&ubi->wl_lock);
+	refill_wl_pool(ubi);
+	refill_wl_user_pool(ubi);
+	spin_unlock(&ubi->wl_lock);
+}
+
+/* ubi_wl_get_peb - works exactly like __wl_get_peb but keeps track of
+ * the fastmap pool.
+ */
+int ubi_wl_get_peb(struct ubi_device *ubi)
+{
+	int ret;
+	struct ubi_fm_pool *pool = &ubi->fm_pool;
+	struct ubi_fm_pool *wl_pool = &ubi->fm_wl_pool;
+
+	if (!pool->size || !wl_pool->size || pool->used == pool->size ||
+	    wl_pool->used == wl_pool->size)
+		ubi_update_fastmap(ubi);
+
+	/* we did not get a single free PEB */
+	if (!pool->size)
+		ret = -ENOSPC;
+	else {
+		spin_lock(&ubi->wl_lock);
+		ret = pool->pebs[pool->used++];
+		prot_queue_add(ubi, ubi->lookuptbl[ret]);
+		spin_unlock(&ubi->wl_lock);
+	}
+
+	return ret;
+}
+
+/* get_peb_for_wl - returns a PEB to be used internally by the WL sub-system.
+ *
+ * @ubi: UBI device description object
+ */
+static struct ubi_wl_entry *get_peb_for_wl(struct ubi_device *ubi)
+{
+	struct ubi_fm_pool *pool = &ubi->fm_wl_pool;
+	int pnum;
+
+	if (pool->used == pool->size || !pool->size) {
+		/* We cannot update the fastmap here because this
+		 * function is called in atomic context.
+		 * Let's fail here and refill/update it as soon as possible. */
+		schedule_work(&ubi->fm_work);
+		return NULL;
+	} else {
+		pnum = pool->pebs[pool->used++];
+		return ubi->lookuptbl[pnum];
+	}
+}
+#else
+static struct ubi_wl_entry *get_peb_for_wl(struct ubi_device *ubi)
+{
+	struct ubi_wl_entry *e;
+
+	e = find_wl_entry(ubi, &ubi->free, WL_FREE_MAX_DIFF);
+	self_check_in_wl_tree(ubi, e, &ubi->free);
+	rb_erase(&e->u.rb, &ubi->free);
+
+	return e;
+}
+
+int ubi_wl_get_peb(struct ubi_device *ubi)
+{
+	int peb, err;
+
+	spin_lock(&ubi->wl_lock);
+	peb = __wl_get_peb(ubi);
 	spin_unlock(&ubi->wl_lock);
 
-	err = ubi_dbg_check_all_ff(ubi, e->pnum, ubi->vid_hdr_aloffset,
-				   ubi->peb_size - ubi->vid_hdr_aloffset);
+	err = ubi_self_check_all_ff(ubi, peb, ubi->vid_hdr_aloffset,
+				    ubi->peb_size - ubi->vid_hdr_aloffset);
 	if (err) {
-		ubi_err("new PEB %d does not contain all 0xFF bytes", e->pnum);
+		ubi_err("new PEB %d does not contain all 0xFF bytes", peb);
 		return err;
 	}
 
-	return e->pnum;
+	return peb;
 }
+#endif
 
 /**
  * prot_queue_del - remove a physical eraseblock from the protection queue.
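The ubi_wl_get_peb()/get_peb_for_wl() pair in the hunk above hands out PEBs from two pre-filled pools, ubi->fm_pool for ordinary callers and ubi->fm_wl_pool for the WL sub-system itself. Reconstructed from the field accesses in that hunk, the pool presumably has roughly the shape sketched below; the comments are my reading of how the fields are used, not documentation taken from the patch:

struct ubi_fm_pool {
	int pebs[UBI_FM_MAX_POOL_SIZE];	/* PEB numbers filled in by ubi_refill_pools() */
	int used;			/* index of the next entry to hand out */
	int size;			/* how many entries are currently filled */
	int max_size;			/* capacity negotiated at attach time */
};
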
@@ -487,7 +715,7 @@
 	if (!e)
 		return -ENODEV;
 
-	if (paranoid_check_in_pq(ubi, e))
+	if (self_check_in_pq(ubi, e))
 		return -ENODEV;
 
 	list_del(&e->u.list);
@@ -513,7 +741,7 @@
 
 	dbg_wl("erase PEB %d, old EC %llu", e->pnum, ec);
 
-	err = paranoid_check_ec(ubi, e->pnum, e->ec);
+	err = self_check_ec(ubi, e->pnum, e->ec);
 	if (err)
 		return -EINVAL;
 
@@ -601,14 +829,14 @@
 }
 
 /**
- * schedule_ubi_work - schedule a work.
+ * __schedule_ubi_work - schedule a work.
  * @ubi: UBI device description object
  * @wrk: the work to schedule
  *
  * This function adds a work defined by @wrk to the tail of the pending works
- * list.
+ * list. Can only be used if ubi->work_sem is already held in read mode!
  */
-static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk)
+static void __schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk)
 {
 	spin_lock(&ubi->wl_lock);
 	list_add_tail(&wrk->list, &ubi->works);
@@ -619,23 +847,54 @@
 	spin_unlock(&ubi->wl_lock);
 }
 
+/**
+ * schedule_ubi_work - schedule a work.
+ * @ubi: UBI device description object
+ * @wrk: the work to schedule
+ *
+ * This function adds a work defined by @wrk to the tail of the pending works
+ * list.
+ */
+static void schedule_ubi_work(struct ubi_device *ubi, struct ubi_work *wrk)
+{
+	down_read(&ubi->work_sem);
+	__schedule_ubi_work(ubi, wrk);
+	up_read(&ubi->work_sem);
+}
+
 static int erase_worker(struct ubi_device *ubi, struct ubi_work *wl_wrk,
 			int cancel);
 
+#ifdef CONFIG_MTD_UBI_FASTMAP
+/**
+ * ubi_is_erase_work - checks whether a work is erase work.
+ * @wrk: The work object to be checked
+ */
+int ubi_is_erase_work(struct ubi_work *wrk)
+{
+	return wrk->func == erase_worker;
+}
+#endif
+
 /**
  * schedule_erase - schedule an erase work.
  * @ubi: UBI device description object
  * @e: the WL entry of the physical eraseblock to erase
+ * @vol_id: the volume ID that last used this PEB
+ * @lnum: the last used logical eraseblock number for the PEB
  * @torture: if the physical eraseblock has to be tortured
  *
  * This function returns zero in case of success and a %-ENOMEM in case of
  * failure.
  */
 static int schedule_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
-			  int torture)
+			  int vol_id, int lnum, int torture)
 {
 	struct ubi_work *wl_wrk;
 
+	ubi_assert(e);
+	ubi_assert(!ubi_is_fm_block(ubi, e->pnum));
+
 	dbg_wl("schedule erasure of PEB %d, EC %d, torture %d",
 	       e->pnum, e->ec, torture);
 
@@ -645,6 +904,8 @@
 
 	wl_wrk->func = &erase_worker;
 	wl_wrk->e = e;
+	wl_wrk->vol_id = vol_id;
+	wl_wrk->lnum = lnum;
 	wl_wrk->torture = torture;
 
 	schedule_ubi_work(ubi, wl_wrk);
@@ -652,6 +913,79 @@
 }
 
 /**
+ * do_sync_erase - run the erase worker synchronously.
+ * @ubi: UBI device description object
+ * @e: the WL entry of the physical eraseblock to erase
+ * @vol_id: the volume ID that last used this PEB
+ * @lnum: the last used logical eraseblock number for the PEB
+ * @torture: if the physical eraseblock has to be tortured
+ *
+ */
+static int do_sync_erase(struct ubi_device *ubi, struct ubi_wl_entry *e,
+			 int vol_id, int lnum, int torture)
+{
+	struct ubi_work *wl_wrk;
+
+	dbg_wl("sync erase of PEB %i", e->pnum);
+
+	wl_wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS);
+	if (!wl_wrk)
+		return -ENOMEM;
+
+	wl_wrk->e = e;
+	wl_wrk->vol_id = vol_id;
+	wl_wrk->lnum = lnum;
+	wl_wrk->torture = torture;
+
+	return erase_worker(ubi, wl_wrk, 0);
+}
+
+#ifdef CONFIG_MTD_UBI_FASTMAP
+/**
+ * ubi_wl_put_fm_peb - returns a PEB used in a fastmap to the wear-leveling
+ * sub-system.
+ * see: ubi_wl_put_peb()
+ *
+ * @ubi: UBI device description object
+ * @fm_e: physical eraseblock to return
+ * @lnum: the last used logical eraseblock number for the PEB
+ * @torture: if this physical eraseblock has to be tortured
+ */
+int ubi_wl_put_fm_peb(struct ubi_device *ubi, struct ubi_wl_entry *fm_e,
+		      int lnum, int torture)
+{
+	struct ubi_wl_entry *e;
+	int vol_id, pnum = fm_e->pnum;
+
+	dbg_wl("PEB %d", pnum);
+
+	ubi_assert(pnum >= 0);
+	ubi_assert(pnum < ubi->peb_count);
+
+	spin_lock(&ubi->wl_lock);
+	e = ubi->lookuptbl[pnum];
+
+	/* This can happen if we recovered from a fastmap the very
+	 * first time and are now writing a new one. In this case the wl
+	 * system has never seen any PEB used by the original fastmap.
+	 */
+	if (!e) {
+		e = fm_e;
+		ubi_assert(e->ec >= 0);
+		ubi->lookuptbl[pnum] = e;
+	} else {
+		e->ec = fm_e->ec;
+		kfree(fm_e);
+	}
+
+	spin_unlock(&ubi->wl_lock);
+
+	vol_id = lnum ? UBI_FM_DATA_VOLUME_ID : UBI_FM_SB_VOLUME_ID;
+	return schedule_erase(ubi, e, vol_id, lnum, torture);
+}
+#endif
+
+/**
  * wear_leveling_worker - wear-leveling worker function.
  * @ubi: UBI device description object
  * @wrk: the work object
@@ -666,6 +1000,9 @@
 {
 	int err, scrubbing = 0, torture = 0, protect = 0, erroneous = 0;
 	int vol_id = -1, uninitialized_var(lnum);
+#ifdef CONFIG_MTD_UBI_FASTMAP
+	int anchor = wrk->anchor;
+#endif
 	struct ubi_wl_entry *e1, *e2;
 	struct ubi_vid_hdr *vid_hdr;
 
@@ -699,21 +1036,42 @@
 		goto out_cancel;
 	}
 
+#ifdef CONFIG_MTD_UBI_FASTMAP
+	/* Check whether we need to produce an anchor PEB */
+	if (!anchor)
+		anchor = !anchor_pebs_avalible(&ubi->free);
+
+	if (anchor) {
+		e1 = find_anchor_wl_entry(&ubi->used);
+		if (!e1)
+			goto out_cancel;
+		e2 = get_peb_for_wl(ubi);
+		if (!e2)
+			goto out_cancel;
+
+		self_check_in_wl_tree(ubi, e1, &ubi->used);
+		rb_erase(&e1->u.rb, &ubi->used);
+		dbg_wl("anchor-move PEB %d to PEB %d", e1->pnum, e2->pnum);
+	} else if (!ubi->scrub.rb_node) {
+#else
 	if (!ubi->scrub.rb_node) {
+#endif
 		/*
 		 * Now pick the least worn-out used physical eraseblock and a
 		 * highly worn-out free physical eraseblock. If the erase
 		 * counters differ much enough, start wear-leveling.
 		 */
 		e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);
-		e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
+		e2 = get_peb_for_wl(ubi);
+		if (!e2)
+			goto out_cancel;
 
 		if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD)) {
 			dbg_wl("no WL needed: min used EC %d, max free EC %d",
 			       e1->ec, e2->ec);
 			goto out_cancel;
 		}
-		paranoid_check_in_wl_tree(ubi, e1, &ubi->used);
+		self_check_in_wl_tree(ubi, e1, &ubi->used);
 		rb_erase(&e1->u.rb, &ubi->used);
 		dbg_wl("move PEB %d EC %d to PEB %d EC %d",
 		       e1->pnum, e1->ec, e2->pnum, e2->ec);
@@ -721,14 +1079,15 @@
 		/* Perform scrubbing */
 		scrubbing = 1;
 		e1 = rb_entry(rb_first(&ubi->scrub), struct ubi_wl_entry, u.rb);
-		e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
-		paranoid_check_in_wl_tree(ubi, e1, &ubi->scrub);
+		e2 = get_peb_for_wl(ubi);
+		if (!e2)
+			goto out_cancel;
+
+		self_check_in_wl_tree(ubi, e1, &ubi->scrub);
 		rb_erase(&e1->u.rb, &ubi->scrub);
 		dbg_wl("scrub PEB %d to PEB %d", e1->pnum, e2->pnum);
 	}
 
-	paranoid_check_in_wl_tree(ubi, e2, &ubi->free);
-	rb_erase(&e2->u.rb, &ubi->free);
 	ubi->move_from = e1;
 	ubi->move_to = e2;
 	spin_unlock(&ubi->wl_lock);
@@ -797,7 +1156,7 @@
 			scrubbing = 1;
 			goto out_not_moved;
 		}
-		if (err == MOVE_CANCEL_BITFLIPS || err == MOVE_TARGET_WR_ERR ||
+		if (err == MOVE_TARGET_BITFLIPS || err == MOVE_TARGET_WR_ERR ||
 		    err == MOVE_TARGET_RD_ERR) {
 			/*
 			 * Target PEB had bit-flips or write error - torture it.
@@ -845,7 +1204,7 @@
 	ubi->move_to_put = ubi->wl_scheduled = 0;
 	spin_unlock(&ubi->wl_lock);
 
-	err = schedule_erase(ubi, e1, 0);
+	err = do_sync_erase(ubi, e1, vol_id, lnum, 0);
 	if (err) {
 		kmem_cache_free(ubi_wl_entry_slab, e1);
 		if (e2)
@@ -860,7 +1219,7 @@
 		 */
 		dbg_wl("PEB %d (LEB %d:%d) was put meanwhile, erase",
 		       e2->pnum, vol_id, lnum);
-		err = schedule_erase(ubi, e2, 0);
+		err = do_sync_erase(ubi, e2, vol_id, lnum, 0);
 		if (err) {
 			kmem_cache_free(ubi_wl_entry_slab, e2);
 			goto out_ro;
@@ -899,7 +1258,7 @@
 	spin_unlock(&ubi->wl_lock);
 
 	ubi_free_vid_hdr(ubi, vid_hdr);
-	err = schedule_erase(ubi, e2, torture);
+	err = do_sync_erase(ubi, e2, vol_id, lnum, torture);
 	if (err) {
 		kmem_cache_free(ubi_wl_entry_slab, e2);
 		goto out_ro;
@@ -940,12 +1299,13 @@
 /**
  * ensure_wear_leveling - schedule wear-leveling if it is needed.
  * @ubi: UBI device description object
+ * @nested: set to non-zero if this function is called from UBI worker
  *
  * This function checks if it is time to start wear-leveling and schedules it
  * if yes. This function returns zero in case of success and a negative error
  * code in case of failure.
  */
-static int ensure_wear_leveling(struct ubi_device *ubi)
+static int ensure_wear_leveling(struct ubi_device *ubi, int nested)
 {
 	int err = 0;
 	struct ubi_wl_entry *e1;
@@ -973,7 +1333,7 @@
 		 * %UBI_WL_THRESHOLD.
 		 */
 		e1 = rb_entry(rb_first(&ubi->used), struct ubi_wl_entry, u.rb);
-		e2 = find_wl_entry(&ubi->free, WL_FREE_MAX_DIFF);
+		e2 = find_wl_entry(ubi, &ubi->free, WL_FREE_MAX_DIFF);
 
 		if (!(e2->ec - e1->ec >= UBI_WL_THRESHOLD))
 			goto out_unlock;
@@ -990,8 +1350,12 @@
 		goto out_cancel;
 	}
 
+	wrk->anchor = 0;
 	wrk->func = &wear_leveling_worker;
-	schedule_ubi_work(ubi, wrk);
+	if (nested)
+		__schedule_ubi_work(ubi, wrk);
+	else
+		schedule_ubi_work(ubi, wrk);
 	return err;
 
 out_cancel:
@@ -1002,6 +1366,38 @@
 	return err;
 }
 
+#ifdef CONFIG_MTD_UBI_FASTMAP
+/**
+ * ubi_ensure_anchor_pebs - schedule wear-leveling to produce an anchor PEB.
+ * @ubi: UBI device description object
+ */
+int ubi_ensure_anchor_pebs(struct ubi_device *ubi)
+{
+	struct ubi_work *wrk;
+
+	spin_lock(&ubi->wl_lock);
+	if (ubi->wl_scheduled) {
+		spin_unlock(&ubi->wl_lock);
+		return 0;
+	}
+	ubi->wl_scheduled = 1;
+	spin_unlock(&ubi->wl_lock);
+
+	wrk = kmalloc(sizeof(struct ubi_work), GFP_NOFS);
+	if (!wrk) {
+		spin_lock(&ubi->wl_lock);
+		ubi->wl_scheduled = 0;
+		spin_unlock(&ubi->wl_lock);
+		return -ENOMEM;
+	}
+
+	wrk->anchor = 1;
+	wrk->func = &wear_leveling_worker;
+	schedule_ubi_work(ubi, wrk);
+	return 0;
+}
+#endif
+
 /**
  * erase_worker - physical eraseblock erase worker function.
  * @ubi: UBI device description object
@@ -1017,7 +1413,10 @@
 			int cancel)
 {
 	struct ubi_wl_entry *e = wl_wrk->e;
-	int pnum = e->pnum, err, need;
+	int pnum = e->pnum;
+	int vol_id = wl_wrk->vol_id;
+	int lnum = wl_wrk->lnum;
+	int err, available_consumed = 0;
 
 	if (cancel) {
 		dbg_wl("cancel erasure of PEB %d EC %d", pnum, e->ec);
@@ -1026,7 +1425,10 @@
 		return 0;
 	}
 
-	dbg_wl("erase PEB %d EC %d", pnum, e->ec);
+	dbg_wl("erase PEB %d EC %d LEB %d:%d",
+	       pnum, e->ec, wl_wrk->vol_id, wl_wrk->lnum);
+
+	ubi_assert(!ubi_is_fm_block(ubi, e->pnum));
 
 	err = sync_erase(ubi, e, wl_wrk->torture);
 	if (!err) {
@@ -1035,6 +1437,7 @@
 
 		spin_lock(&ubi->wl_lock);
 		wl_tree_add(e, &ubi->free);
+		ubi->free_count++;
 		spin_unlock(&ubi->wl_lock);
 
 		/*
@@ -1044,7 +1447,7 @@
 		serve_prot_queue(ubi);
 
 		/* And take care about wear-leveling */
-		err = ensure_wear_leveling(ubi);
+		err = ensure_wear_leveling(ubi, 1);
 		return err;
 	}
 
@@ -1056,7 +1459,7 @@
 		int err1;
 
 		/* Re-schedule the LEB for erasure */
-		err1 = schedule_erase(ubi, e, 0);
+		err1 = schedule_erase(ubi, e, vol_id, lnum, 0);
 		if (err1) {
 			err = err1;
 			goto out_ro;
@@ -1081,20 +1484,14 @@
 	}
 
 	spin_lock(&ubi->volumes_lock);
-	need = ubi->beb_rsvd_level - ubi->beb_rsvd_pebs + 1;
-	if (need > 0) {
-		need = ubi->avail_pebs >= need ? need : ubi->avail_pebs;
-		ubi->avail_pebs -= need;
-		ubi->rsvd_pebs += need;
-		ubi->beb_rsvd_pebs += need;
-		if (need > 0)
-			ubi_msg("reserve more %d PEBs", need);
-	}
-
 	if (ubi->beb_rsvd_pebs == 0) {
-		spin_unlock(&ubi->volumes_lock);
-		ubi_err("no reserved physical eraseblocks");
-		goto out_ro;
+		if (ubi->avail_pebs == 0) {
+			spin_unlock(&ubi->volumes_lock);
+			ubi_err("no reserved/available physical eraseblocks");
+			goto out_ro;
+		}
+		ubi->avail_pebs -= 1;
+		available_consumed = 1;
 	}
 	spin_unlock(&ubi->volumes_lock);
 
@@ -1104,19 +1501,36 @@
 		goto out_ro;
 
 	spin_lock(&ubi->volumes_lock);
-	ubi->beb_rsvd_pebs -= 1;
+	if (ubi->beb_rsvd_pebs > 0) {
+		if (available_consumed) {
+			/*
+			 * The amount of reserved PEBs increased since we last
+			 * checked.
+			 */
+			ubi->avail_pebs += 1;
+			available_consumed = 0;
+		}
+		ubi->beb_rsvd_pebs -= 1;
+	}
 	ubi->bad_peb_count += 1;
 	ubi->good_peb_count -= 1;
 	ubi_calculate_reserved(ubi);
-	if (ubi->beb_rsvd_pebs)
+	if (available_consumed)
+		ubi_warn("no PEBs in the reserved pool, used an available PEB");
+	else if (ubi->beb_rsvd_pebs)
 		ubi_msg("%d PEBs left in the reserve", ubi->beb_rsvd_pebs);
 	else
-		ubi_warn("last PEB from the reserved pool was used");
+		ubi_warn("last PEB from the reserve was used");
 	spin_unlock(&ubi->volumes_lock);
 
 	return err;
 
 out_ro:
+	if (available_consumed) {
+		spin_lock(&ubi->volumes_lock);
+		ubi->avail_pebs += 1;
+		spin_unlock(&ubi->volumes_lock);
+	}
 	ubi_ro_mode(ubi);
 	return err;
 }
@@ -1124,6 +1538,8 @@
 /**
  * ubi_wl_put_peb - return a PEB to the wear-leveling sub-system.
  * @ubi: UBI device description object
+ * @vol_id: the volume ID that last used this PEB
+ * @lnum: the last used logical eraseblock number for the PEB
  * @pnum: physical eraseblock to return
  * @torture: if this physical eraseblock has to be tortured
  *
@@ -1132,7 +1548,8 @@
  * occurred to this @pnum and it has to be tested. This function returns zero
  * in case of success, and a negative error code in case of failure.
  */
-int ubi_wl_put_peb(struct ubi_device *ubi, int pnum, int torture)
+int ubi_wl_put_peb(struct ubi_device *ubi, int vol_id, int lnum,
+		   int pnum, int torture)
 {
 	int err;
 	struct ubi_wl_entry *e;
@@ -1174,13 +1591,13 @@
 		return 0;
 	} else {
 		if (in_wl_tree(e, &ubi->used)) {
-			paranoid_check_in_wl_tree(ubi, e, &ubi->used);
+			self_check_in_wl_tree(ubi, e, &ubi->used);
 			rb_erase(&e->u.rb, &ubi->used);
 		} else if (in_wl_tree(e, &ubi->scrub)) {
-			paranoid_check_in_wl_tree(ubi, e, &ubi->scrub);
+			self_check_in_wl_tree(ubi, e, &ubi->scrub);
 			rb_erase(&e->u.rb, &ubi->scrub);
 		} else if (in_wl_tree(e, &ubi->erroneous)) {
-			paranoid_check_in_wl_tree(ubi, e, &ubi->erroneous);
+			self_check_in_wl_tree(ubi, e, &ubi->erroneous);
 			rb_erase(&e->u.rb, &ubi->erroneous);
 			ubi->erroneous_peb_count -= 1;
 			ubi_assert(ubi->erroneous_peb_count >= 0);
@@ -1198,7 +1615,7 @@
 	}
 	spin_unlock(&ubi->wl_lock);
 
-	err = schedule_erase(ubi, e, torture);
+	err = schedule_erase(ubi, e, vol_id, lnum, torture);
 	if (err) {
 		spin_lock(&ubi->wl_lock);
 		wl_tree_add(e, &ubi->used);
@@ -1222,7 +1639,7 @@
 {
 	struct ubi_wl_entry *e;
 
-	dbg_msg("schedule PEB %d for scrubbing", pnum);
+	ubi_msg("schedule PEB %d for scrubbing", pnum);
 
 retry:
 	spin_lock(&ubi->wl_lock);
@@ -1247,7 +1664,7 @@
 	}
 
 	if (in_wl_tree(e, &ubi->used)) {
-		paranoid_check_in_wl_tree(ubi, e, &ubi->used);
+		self_check_in_wl_tree(ubi, e, &ubi->used);
 		rb_erase(&e->u.rb, &ubi->used);
 	} else {
 		int err;
@@ -1268,29 +1685,60 @@
 	 * Technically scrubbing is the same as wear-leveling, so it is done
 	 * by the WL worker.
 	 */
-	return ensure_wear_leveling(ubi);
+	return ensure_wear_leveling(ubi, 0);
 }
 
 /**
  * ubi_wl_flush - flush all pending works.
  * @ubi: UBI device description object
+ * @vol_id: the volume id to flush for
+ * @lnum: the logical eraseblock number to flush for
  *
- * This function returns zero in case of success and a negative error code in
- * case of failure.
+ * This function executes all pending works for a particular volume id /
+ * logical eraseblock number pair. If either value is set to %UBI_ALL, then it
+ * acts as a wildcard for all of the corresponding volume numbers or logical
+ * eraseblock numbers. It returns zero in case of success and a negative error
+ * code in case of failure.
  */
-int ubi_wl_flush(struct ubi_device *ubi)
+int ubi_wl_flush(struct ubi_device *ubi, int vol_id, int lnum)
 {
-	int err;
+	int err = 0;
+	int found = 1;
 
 	/*
 	 * Erase while the pending works queue is not empty, but not more than
 	 * the number of currently pending works.
 	 */
-	dbg_wl("flush (%d pending works)", ubi->works_count);
-	while (ubi->works_count) {
-		err = do_work(ubi);
-		if (err)
-			return err;
+	dbg_wl("flush pending work for LEB %d:%d (%d pending works)",
+	       vol_id, lnum, ubi->works_count);
+
+	while (found) {
+		struct ubi_work *wrk;
+		found = 0;
+
+		down_read(&ubi->work_sem);
+		spin_lock(&ubi->wl_lock);
+		list_for_each_entry(wrk, &ubi->works, list) {
+			if ((vol_id == UBI_ALL || wrk->vol_id == vol_id) &&
+			    (lnum == UBI_ALL || wrk->lnum == lnum)) {
+				list_del(&wrk->list);
+				ubi->works_count -= 1;
+				ubi_assert(ubi->works_count >= 0);
+				spin_unlock(&ubi->wl_lock);
+
+				err = wrk->func(ubi, wrk, 0);
+				if (err) {
+					up_read(&ubi->work_sem);
+					return err;
+				}
+
+				spin_lock(&ubi->wl_lock);
+				found = 1;
+				break;
+			}
+		}
+		spin_unlock(&ubi->wl_lock);
+		up_read(&ubi->work_sem);
 	}
 
 	/*
@@ -1300,18 +1748,7 @@
 	down_write(&ubi->work_sem);
 	up_write(&ubi->work_sem);
 
-	/*
-	 * And in case last was the WL worker and it canceled the LEB
-	 * movement, flush again.
-	 */
-	while (ubi->works_count) {
-		dbg_wl("flush more (%d pending works)", ubi->works_count);
-		err = do_work(ubi);
-		if (err)
-			return err;
-	}
-
-	return 0;
+	return err;
 }
 
 /**
@@ -1420,27 +1857,30 @@
 }
 
 /**
- * ubi_wl_init_scan - initialize the WL sub-system using scanning information.
+ * ubi_wl_init - initialize the WL sub-system using attaching information.
  * @ubi: UBI device description object
- * @si: scanning information
+ * @ai: attaching information
  *
  * This function returns zero in case of success, and a negative error code in
  * case of failure.
  */
-int ubi_wl_init_scan(struct ubi_device *ubi, struct ubi_scan_info *si)
+int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai)
 {
-	int err, i;
+	int err, i, reserved_pebs, found_pebs = 0;
 	struct rb_node *rb1, *rb2;
-	struct ubi_scan_volume *sv;
-	struct ubi_scan_leb *seb, *tmp;
+	struct ubi_ainf_volume *av;
+	struct ubi_ainf_peb *aeb, *tmp;
 	struct ubi_wl_entry *e;
 
 	ubi->used = ubi->erroneous = ubi->free = ubi->scrub = RB_ROOT;
 	spin_lock_init(&ubi->wl_lock);
 	mutex_init(&ubi->move_mutex);
 	init_rwsem(&ubi->work_sem);
-	ubi->max_ec = si->max_ec;
+	ubi->max_ec = ai->max_ec;
 	INIT_LIST_HEAD(&ubi->works);
+#ifdef CONFIG_MTD_UBI_FASTMAP
+	INIT_WORK(&ubi->fm_work, update_fastmap_work_fn);
+#endif
 
 	sprintf(ubi->bgt_name, UBI_BGT_NAME_PATTERN, ubi->ubi_num);
 
@@ -1453,48 +1893,59 @@
 		INIT_LIST_HEAD(&ubi->pq[i]);
 	ubi->pq_head = 0;
 
-	list_for_each_entry_safe(seb, tmp, &si->erase, u.list) {
+	list_for_each_entry_safe(aeb, tmp, &ai->erase, u.list) {
 		cond_resched();
 
 		e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL);
 		if (!e)
 			goto out_free;
 
-		e->pnum = seb->pnum;
-		e->ec = seb->ec;
+		e->pnum = aeb->pnum;
+		e->ec = aeb->ec;
+		ubi_assert(!ubi_is_fm_block(ubi, e->pnum));
 		ubi->lookuptbl[e->pnum] = e;
-		if (schedule_erase(ubi, e, 0)) {
+		if (schedule_erase(ubi, e, aeb->vol_id, aeb->lnum, 0)) {
 			kmem_cache_free(ubi_wl_entry_slab, e);
 			goto out_free;
 		}
+
+		found_pebs++;
 	}
 
-	list_for_each_entry(seb, &si->free, u.list) {
+	ubi->free_count = 0;
+	list_for_each_entry(aeb, &ai->free, u.list) {
 		cond_resched();
 
 		e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL);
 		if (!e)
 			goto out_free;
 
-		e->pnum = seb->pnum;
-		e->ec = seb->ec;
+		e->pnum = aeb->pnum;
+		e->ec = aeb->ec;
 		ubi_assert(e->ec >= 0);
+		ubi_assert(!ubi_is_fm_block(ubi, e->pnum));
+
 		wl_tree_add(e, &ubi->free);
+		ubi->free_count++;
+
 		ubi->lookuptbl[e->pnum] = e;
+
+		found_pebs++;
 	}
 
-	ubi_rb_for_each_entry(rb1, sv, &si->volumes, rb) {
-		ubi_rb_for_each_entry(rb2, seb, &sv->root, u.rb) {
+	ubi_rb_for_each_entry(rb1, av, &ai->volumes, rb) {
+		ubi_rb_for_each_entry(rb2, aeb, &av->root, u.rb) {
 			cond_resched();
 
 			e = kmem_cache_alloc(ubi_wl_entry_slab, GFP_KERNEL);
 			if (!e)
 				goto out_free;
 
-			e->pnum = seb->pnum;
-			e->ec = seb->ec;
+			e->pnum = aeb->pnum;
+			e->ec = aeb->ec;
 			ubi->lookuptbl[e->pnum] = e;
-			if (!seb->scrub) {
+
+			if (!aeb->scrub) {
 				dbg_wl("add PEB %d EC %d to the used tree",
 				       e->pnum, e->ec);
 				wl_tree_add(e, &ubi->used);
@@ -1503,22 +1954,38 @@
 				       e->pnum, e->ec);
 				wl_tree_add(e, &ubi->scrub);
 			}
+
+			found_pebs++;
 		}
 	}
 
-	if (ubi->avail_pebs < WL_RESERVED_PEBS) {
+	dbg_wl("found %i PEBs", found_pebs);
+
+	if (ubi->fm)
+		ubi_assert(ubi->good_peb_count == \
+			   found_pebs + ubi->fm->used_blocks);
+	else
+		ubi_assert(ubi->good_peb_count == found_pebs);
+
+	reserved_pebs = WL_RESERVED_PEBS;
+#ifdef CONFIG_MTD_UBI_FASTMAP
+	/* Reserve enough LEBs to store two fastmaps. */
+	reserved_pebs += (ubi->fm_size / ubi->leb_size) * 2;
+#endif
+
+	if (ubi->avail_pebs < reserved_pebs) {
 		ubi_err("no enough physical eraseblocks (%d, need %d)",
-			ubi->avail_pebs, WL_RESERVED_PEBS);
+			ubi->avail_pebs, reserved_pebs);
 		if (ubi->corr_peb_count)
 			ubi_err("%d PEBs are corrupted and not used",
 				ubi->corr_peb_count);
 		goto out_free;
 	}
-	ubi->avail_pebs -= WL_RESERVED_PEBS;
-	ubi->rsvd_pebs += WL_RESERVED_PEBS;
+	ubi->avail_pebs -= reserved_pebs;
+	ubi->rsvd_pebs += reserved_pebs;
 
 	/* Schedule wear-leveling if needed */
-	err = ensure_wear_leveling(ubi);
+	err = ensure_wear_leveling(ubi, 0);
 	if (err)
 		goto out_free;
 
@@ -1566,10 +2033,8 @@
 	kfree(ubi->lookuptbl);
 }
 
-#ifdef CONFIG_MTD_UBI_DEBUG
-
 /**
- * paranoid_check_ec - make sure that the erase counter of a PEB is correct.
+ * self_check_ec - make sure that the erase counter of a PEB is correct.
  * @ubi: UBI device description object
  * @pnum: the physical eraseblock number to check
  * @ec: the erase counter to check
@@ -1578,13 +2043,13 @@
  * is equivalent to @ec, and a negative error code if not or if an error
  * occurred.
  */
-static int paranoid_check_ec(struct ubi_device *ubi, int pnum, int ec)
+static int self_check_ec(struct ubi_device *ubi, int pnum, int ec)
 {
 	int err;
 	long long read_ec;
 	struct ubi_ec_hdr *ec_hdr;
 
-	if (!ubi->dbg->chk_gen)
+	if (!ubi_dbg_chk_gen(ubi))
 		return 0;
 
 	ec_hdr = kzalloc(ubi->ec_hdr_alsize, GFP_NOFS);
@@ -1599,10 +2064,10 @@
 	}
 
 	read_ec = be64_to_cpu(ec_hdr->ec);
-	if (ec != read_ec) {
-		ubi_err("paranoid check failed for PEB %d", pnum);
+	if (ec != read_ec && read_ec - ec > 1) {
+		ubi_err("self-check failed for PEB %d", pnum);
 		ubi_err("read EC is %lld, should be %d", read_ec, ec);
-		ubi_dbg_dump_stack();
+		dump_stack();
 		err = 1;
 	} else
 		err = 0;
@@ -1613,7 +2078,7 @@
 }
 
 /**
- * paranoid_check_in_wl_tree - check that wear-leveling entry is in WL RB-tree.
+ * self_check_in_wl_tree - check that wear-leveling entry is in WL RB-tree.
  * @ubi: UBI device description object
  * @e: the wear-leveling entry to check
  * @root: the root of the tree
@@ -1621,37 +2086,36 @@
  * This function returns zero if @e is in the @root RB-tree and %-EINVAL if it
  * is not.
  */
-static int paranoid_check_in_wl_tree(const struct ubi_device *ubi,
-				     struct ubi_wl_entry *e,
-				     struct rb_root *root)
+static int self_check_in_wl_tree(const struct ubi_device *ubi,
+				 struct ubi_wl_entry *e, struct rb_root *root)
 {
-	if (!ubi->dbg->chk_gen)
+	if (!ubi_dbg_chk_gen(ubi))
 		return 0;
 
 	if (in_wl_tree(e, root))
 		return 0;
 
-	ubi_err("paranoid check failed for PEB %d, EC %d, RB-tree %p ",
+	ubi_err("self-check failed for PEB %d, EC %d, RB-tree %p ",
 		e->pnum, e->ec, root);
-	ubi_dbg_dump_stack();
+	dump_stack();
 	return -EINVAL;
 }
 
 /**
- * paranoid_check_in_pq - check if wear-leveling entry is in the protection
+ * self_check_in_pq - check if wear-leveling entry is in the protection
  *                        queue.
  * @ubi: UBI device description object
  * @e: the wear-leveling entry to check
  *
  * This function returns zero if @e is in @ubi->pq and %-EINVAL if it is not.
  */
-static int paranoid_check_in_pq(const struct ubi_device *ubi,
-				struct ubi_wl_entry *e)
+static int self_check_in_pq(const struct ubi_device *ubi,
+			    struct ubi_wl_entry *e)
 {
 	struct ubi_wl_entry *p;
 	int i;
 
-	if (!ubi->dbg->chk_gen)
+	if (!ubi_dbg_chk_gen(ubi))
 		return 0;
 
 	for (i = 0; i < UBI_PROT_QUEUE_LEN; ++i)
@@ -1659,10 +2123,8 @@
 			if (p == e)
 				return 0;
 
-	ubi_err("paranoid check failed for PEB %d, EC %d, Protect queue",
+	ubi_err("self-check failed for PEB %d, EC %d, Protect queue",
 		e->pnum, e->ec);
-	ubi_dbg_dump_stack();
+	dump_stack();
 	return -EINVAL;
 }
-
-#endif /* CONFIG_MTD_UBI_DEBUG */
diff -ur a/drivers/net/ethernet/intel/e1000/e1000_hw.c b/drivers/net/ethernet/intel/e1000/e1000_hw.c
--- a/drivers/net/ethernet/intel/e1000/e1000_hw.c	2013-08-24 11:36:57.000000000 +0200
+++ b/drivers/net/ethernet/intel/e1000/e1000_hw.c	2014-02-17 11:57:26.000000000 +0100
@@ -107,6 +107,7 @@
 };
 
 static DEFINE_SPINLOCK(e1000_eeprom_lock);
+static DEFINE_SPINLOCK(e1000_phy_lock);
 
 /**
  * e1000_set_phy_type - Set the phy type member in the hw struct.
@@ -2988,19 +2989,25 @@
 s32 e1000_read_phy_reg(struct e1000_hw *hw, u32 reg_addr, u16 *phy_data)
 {
 	u32 ret_val;
+	unsigned long flags;
 
 	e_dbg("e1000_read_phy_reg");
 
+	spin_lock_irqsave(&e1000_phy_lock, flags);
+
 	if ((hw->phy_type == e1000_phy_igp) &&
 	    (reg_addr > MAX_PHY_MULTI_PAGE_REG)) {
 		ret_val = e1000_write_phy_reg_ex(hw, IGP01E1000_PHY_PAGE_SELECT,
 						 (u16) reg_addr);
-		if (ret_val)
+		if (ret_val) {
+			spin_unlock_irqrestore(&e1000_phy_lock, flags);
 			return ret_val;
+		}
 	}
 
 	ret_val = e1000_read_phy_reg_ex(hw, MAX_PHY_REG_ADDRESS & reg_addr,
 					phy_data);
+	spin_unlock_irqrestore(&e1000_phy_lock, flags);
 
 	return ret_val;
 }
@@ -3123,19 +3130,25 @@
 s32 e1000_write_phy_reg(struct e1000_hw *hw, u32 reg_addr, u16 phy_data)
 {
 	u32 ret_val;
+	unsigned long flags;
 
 	e_dbg("e1000_write_phy_reg");
 
+	spin_lock_irqsave(&e1000_phy_lock, flags);
+
 	if ((hw->phy_type == e1000_phy_igp) &&
 	    (reg_addr > MAX_PHY_MULTI_PAGE_REG)) {
 		ret_val = e1000_write_phy_reg_ex(hw, IGP01E1000_PHY_PAGE_SELECT,
 						 (u16) reg_addr);
-		if (ret_val)
+		if (ret_val) {
+			spin_unlock_irqrestore(&e1000_phy_lock, flags);
 			return ret_val;
+		}
 	}
 
 	ret_val = e1000_write_phy_reg_ex(hw, MAX_PHY_REG_ADDRESS & reg_addr,
 					 phy_data);
+	spin_unlock_irqrestore(&e1000_phy_lock, flags);
 
 	return ret_val;
 }
diff -ur a/drivers/net/ethernet/intel/e1000/e1000_main.c b/drivers/net/ethernet/intel/e1000/e1000_main.c
--- a/drivers/net/ethernet/intel/e1000/e1000_main.c	2013-08-24 11:36:57.000000000 +0200
+++ b/drivers/net/ethernet/intel/e1000/e1000_main.c	2014-02-17 11:57:26.000000000 +0100
@@ -236,6 +236,46 @@
 	return adapter->netdev;
 }
 
+#ifdef MY_ABC_HERE
+void e1000_syno_led_switch(int iEnable)
+{
+#ifdef CONFIG_ARCH_GEN3
+	struct net_device *dev = NULL;
+	struct e1000_adapter *adapter = NULL;
+	struct e1000_hw *hw = NULL;
+	u16 uiActLedCtrl = 0, uiLinkLedCtrl = 0;
+
+	dev = first_net_device(&init_net);
+	adapter = netdev_priv(dev);
+	hw = &adapter->hw;
+	/* The structure returned by first_net_device()
+	 * does not contain the value of mac_type, which
+	 * leads to wrong operations when reading/writing PHY regs.
+	 * So we assign the value for Evansport's NIC here.
+	 */
+	hw->mac_type = e1000_ce4100;
+
+	e1000_write_phy_reg(hw, 31, 0x0007);
+	e1000_write_phy_reg(hw, 30, 0x002C);
+	e1000_read_phy_reg(hw, 26, &uiActLedCtrl);
+	e1000_read_phy_reg(hw, 28, &uiLinkLedCtrl);
+
+	if (iEnable) {
+		uiActLedCtrl |= 0x0040;
+		uiLinkLedCtrl |= 0x0700;
+	} else {
+		uiActLedCtrl &= 0xFFBF;
+		uiLinkLedCtrl &= 0xF8FF;
+	}
+
+	e1000_write_phy_reg(hw, 26, uiActLedCtrl);
+	e1000_write_phy_reg(hw, 28, uiLinkLedCtrl);
+	e1000_write_phy_reg(hw, 31, 0x0000);
+#endif
+}
+EXPORT_SYMBOL(e1000_syno_led_switch);
+#endif /*MY_ABC_HERE*/
+
 /**
  * e1000_init_module - Driver Registration Routine
  *
@@ -4841,9 +4881,21 @@
 
 			/* Enable WoL for selected modes */
 			e1000_write_phy_reg(hw, 31, 0x0007);
+#ifdef MY_DEF_HERE
+			udelay(10);
+#endif
 			e1000_write_phy_reg(hw, 30, 0x006d);
+#ifdef MY_DEF_HERE
+			udelay(10);
+#endif
 			e1000_write_phy_reg(hw, 22, 0x9fff);
+#ifdef MY_DEF_HERE
+			udelay(10);
+#endif
 			e1000_write_phy_reg(hw, 21, (u16) phy_wol);
+#ifdef MY_DEF_HERE
+			udelay(10);
+#endif
 
 			/* Disable GMII/RGMII pad for power saving */
 			/* FIXME: for the S5 -> S0 wake to work, the BIOS
diff -ur a/drivers/net/irda/stir4200.c b/drivers/net/irda/stir4200.c
--- a/drivers/net/irda/stir4200.c	2013-08-24 11:36:55.000000000 +0200
+++ b/drivers/net/irda/stir4200.c	2014-02-17 11:57:21.000000000 +0100
@@ -750,7 +750,7 @@
 
 			write_reg(stir, REG_CTRL1, CTRL1_TXPWD|CTRL1_RXPWD);
 
-			refrigerator();
+			try_to_freeze();
 
 			if (change_speed(stir, stir->speed))
 				break;
Only in b/drivers/net/phy: atheros.c.
diff -ur a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
--- a/drivers/net/phy/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/net/phy/Kconfig	2014-01-21 09:37:12.000000000 +0100
@@ -19,6 +19,12 @@
 	tristate "Drivers for Marvell PHYs"
 	---help---
 	  Currently has a driver for the 88E1011S
+
+config ATHEROS_PHY
+	tristate "Drivers for Atheros PHYs"
+	depends on SYNO_COMCERTO
+	---help---
+	  Currently supports AR8035 and AR8327
 	
 config DAVICOM_PHY
 	tristate "Drivers for Davicom PHYs"
diff -ur a/drivers/net/phy/Makefile b/drivers/net/phy/Makefile
--- a/drivers/net/phy/Makefile	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/net/phy/Makefile	2014-01-21 09:37:12.000000000 +0100
@@ -4,6 +4,7 @@
 
 obj-$(CONFIG_PHYLIB)		+= libphy.o
 obj-$(CONFIG_MARVELL_PHY)	+= marvell.o
+obj-$(CONFIG_ATHEROS_PHY)	+= atheros.o
 obj-$(CONFIG_DAVICOM_PHY)	+= davicom.o
 obj-$(CONFIG_CICADA_PHY)	+= cicada.o
 obj-$(CONFIG_LXT_PHY)		+= lxt.o
diff -ur a/drivers/net/ppp/ppp_generic.c b/drivers/net/ppp/ppp_generic.c
--- a/drivers/net/ppp/ppp_generic.c	2013-08-24 11:36:58.000000000 +0200
+++ b/drivers/net/ppp/ppp_generic.c	2014-02-17 11:57:28.000000000 +0100
@@ -56,6 +56,9 @@
 #endif
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+#include <linux/jiffies.h>
+#endif
 
 #define PPP_VERSION	"2.4.2"
 
@@ -562,6 +565,9 @@
 	struct ppp *ppp;
 	int err = -EFAULT, val, val2, i;
 	struct ppp_idle idle;
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+        struct ppp_idle fppidle;
+#endif
 	struct npioctl npi;
 	int unit, cflags;
 	struct slcompress *vj;
@@ -741,6 +747,30 @@
 		err = 0;
 		break;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+	case PPPIOCSFPPIDLE:
+		if (copy_from_user(&fppidle, argp, sizeof(fppidle)))
+			break;
+
+		ppp_xmit_lock(ppp);
+
+		if (time_after((jiffies - (fppidle.xmit_idle * HZ)) , ppp->last_xmit))
+			ppp->last_xmit = (jiffies - fppidle.xmit_idle * HZ);
+
+		ppp_xmit_unlock(ppp);
+
+		ppp_recv_lock(ppp);
+
+		if (time_after((jiffies - (fppidle.recv_idle * HZ)) , ppp->last_recv))
+			ppp->last_recv = (jiffies - fppidle.recv_idle * HZ);
+
+		ppp_recv_unlock(ppp);
+
+		err = 0;
+		break;
+#endif
+
+
 #ifdef CONFIG_PPP_FILTER
 	case PPPIOCSPASS:
 	{
@@ -2829,7 +2859,14 @@
 	write_lock_bh(&pch->upl);
 	ret = -EINVAL;
 	if (pch->ppp)
+#if defined(CONFIG_SYNO_COMCERTO)
+	{
+		write_unlock_bh(&pch->upl);
+		goto out;
+	}
+#else
 		goto outl;
+#endif
 
 	ppp_lock(ppp);
 	if (pch->file.hdrlen > ppp->file.hdrlen)
@@ -2842,10 +2879,22 @@
 	pch->ppp = ppp;
 	atomic_inc(&ppp->file.refcnt);
 	ppp_unlock(ppp);
+#if !defined(CONFIG_SYNO_COMCERTO)
 	ret = 0;
 
  outl:
+#endif
 	write_unlock_bh(&pch->upl);
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+	if ((ppp->dev) && (!ppp->closing)) {
+		rtnl_lock();
+		rtmsg_ifinfo(RTM_NEWLINK, ppp->dev, 0);
+		rtnl_unlock();
+	}
+
+	ret = 0;
+#endif
  out:
 	mutex_unlock(&pn->all_ppp_mutex);
 	return ret;
@@ -2871,6 +2920,15 @@
 		if (--ppp->n_channels == 0)
 			wake_up_interruptible(&ppp->file.rwait);
 		ppp_unlock(ppp);
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+		if ((ppp->dev) && (!ppp->closing)) {
+			rtnl_lock();
+			rtmsg_ifinfo(RTM_NEWLINK, ppp->dev, 0);
+			rtnl_unlock();
+		}
+#endif
+
 		if (atomic_dec_and_test(&ppp->file.refcnt))
 			ppp_destroy_interface(ppp);
 		err = 0;
diff -ur a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
--- a/drivers/net/ppp/pppoe.c	2013-08-24 11:36:58.000000000 +0200
+++ b/drivers/net/ppp/pppoe.c	2014-02-17 11:57:28.000000000 +0100
@@ -913,15 +913,24 @@
 		goto end;
 
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	skb = sock_wmalloc(sk, total_len + dev->hard_header_len + 32 + NET_SKB_PAD,
+			   0, GFP_KERNEL);
+#else
 	skb = sock_wmalloc(sk, total_len + dev->hard_header_len + 32,
 			   0, GFP_KERNEL);
+#endif
 	if (!skb) {
 		error = -ENOMEM;
 		goto end;
 	}
 
 	/* Reserve space for headers. */
+#if defined(CONFIG_SYNO_COMCERTO)
+	skb_reserve(skb, dev->hard_header_len + NET_SKB_PAD);
+#else
 	skb_reserve(skb, dev->hard_header_len);
+#endif
 	skb_reset_network_header(skb);
 
 	skb->dev = dev;
@@ -997,7 +1006,15 @@
 
 	skb->protocol = cpu_to_be16(ETH_P_PPP_SES);
 	skb->dev = dev;
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	if((skb->ipsec_offload == 1) && (!skb->sp))
+	{
+		  dev_hard_header(skb, dev, ETH_P_PPP_SES,
+			 dev->dev_addr, po->pppoe_pa.remote, data_len);
 
+	}
+	else
+#endif
 	dev_hard_header(skb, dev, ETH_P_PPP_SES,
 			po->pppoe_pa.remote, NULL, data_len);
 
@@ -1061,17 +1078,33 @@
 {
 	struct pppox_sock *po;
 	char *dev_name;
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+	char *ppp_name;
+#endif
 
 	if (v == SEQ_START_TOKEN) {
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+		seq_puts(seq, "Id       Address              Device  PPPDevice\n");
+#else
 		seq_puts(seq, "Id       Address              Device\n");
+#endif
 		goto out;
 	}
 
 	po = v;
 	dev_name = po->pppoe_pa.dev;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+	ppp_name = ppp_dev_name(&po->chan);
+	if (!ppp_name)
+		goto out;
+
+	seq_printf(seq, "%04X %pM %8s       %s\n",
+		ntohs(po->pppoe_pa.sid), po->pppoe_pa.remote, dev_name, ppp_name);
+#else
 	seq_printf(seq, "%08X %pM %8s\n",
 		po->pppoe_pa.sid, po->pppoe_pa.remote, dev_name);
+#endif
 out:
 	return 0;
 }
diff -ur a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
--- a/drivers/net/virtio_net.c	2013-08-24 11:36:55.000000000 +0200
+++ b/drivers/net/virtio_net.c	2014-02-17 11:57:22.000000000 +0100
@@ -368,7 +368,7 @@
 
 	skb_to_sgvec(skb, vi->rx_sg + 1, 0, skb->len);
 
-	err = virtqueue_add_buf_gfp(vi->rvq, vi->rx_sg, 0, 2, skb, gfp);
+	err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, 2, skb, gfp);
 	if (err < 0)
 		dev_kfree_skb(skb);
 
@@ -413,8 +413,8 @@
 
 	/* chain first in list head */
 	first->private = (unsigned long)list;
-	err = virtqueue_add_buf_gfp(vi->rvq, vi->rx_sg, 0, MAX_SKB_FRAGS + 2,
-				    first, gfp);
+	err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, MAX_SKB_FRAGS + 2,
+				first, gfp);
 	if (err < 0)
 		give_pages(vi, first);
 
@@ -432,7 +432,7 @@
 
 	sg_init_one(vi->rx_sg, page_address(page), PAGE_SIZE);
 
-	err = virtqueue_add_buf_gfp(vi->rvq, vi->rx_sg, 0, 1, page, gfp);
+	err = virtqueue_add_buf(vi->rvq, vi->rx_sg, 0, 1, page, gfp);
 	if (err < 0)
 		give_pages(vi, page);
 
@@ -601,7 +601,7 @@
 
 	hdr->num_sg = skb_to_sgvec(skb, vi->tx_sg + 1, 0, skb->len) + 1;
 	return virtqueue_add_buf(vi->svq, vi->tx_sg, hdr->num_sg,
-					0, skb);
+				 0, skb, GFP_ATOMIC);
 }
 
 static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev)
@@ -754,7 +754,7 @@
 		sg_set_buf(&sg[i + 1], sg_virt(s), s->length);
 	sg_set_buf(&sg[out + in - 1], &status, sizeof(status));
 
-	BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi) < 0);
+	BUG_ON(virtqueue_add_buf(vi->cvq, sg, out, in, vi, GFP_ATOMIC) < 0);
 
 	virtqueue_kick(vi->cvq);
 
diff -ur a/drivers/pci/quirks.c b/drivers/pci/quirks.c
--- a/drivers/pci/quirks.c	2013-08-24 11:37:07.000000000 +0200
+++ b/drivers/pci/quirks.c	2014-02-17 11:57:44.000000000 +0100
@@ -598,12 +598,12 @@
 static u32 SynoGpioCount = 0;
 
 #if defined(CONFIG_SYNO_CEDARVIEW)
-static u32 ich9_writable_pin[] = {1, 6, 7, 10, 15, 16, 17, 18, 20, 21, 24, 25, 29, 30, 31, 32, 33, 34, 35, 36, 37, 45, 46, 47, 49, 55, 57};
+static u32 ich9_writable_pin[] = {1, 6, 7, 10, 15, 16, 17, 18, 20, 21, 24, 25, 29, 30, 31, 32, 33, 34, 35, 36, 37, 42, 43, 45, 46, 47, 49, 55, 57};
 #else
 static u32 ich9_writable_pin[] = {1, 6, 7, 10, 15, 16, 17, 18, 20, 21, 24, 25, 30, 31, 32, 33, 34, 35, 36, 37, 46, 47, 49, 55, 57};
 #endif
 static u32 c206_writable_pin[] = {0, 5, 16, 20, 21, 22, 34, 38, 48, 52, 54, 69, 70, 71};
-static u32 c226_writable_pin[] = {5, 16, 18, 19, 20, 21, 23, 29, 30, 32, 33, 34, 35, 36, 37, 58, 62, 64, 75};
+static u32 c226_writable_pin[] = {5, 16, 18, 19, 20, 21, 23, 32, 33, 34, 35, 36, 37, 45};
 
 u32 syno_pch_lpc_gpio_pin(int pin, int *pValue, int isWrite)
 {
diff -ur a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
--- a/drivers/rtc/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/rtc/Kconfig	2014-01-21 09:37:15.000000000 +0100
@@ -762,6 +762,13 @@
 	  This driver can also be build as a module. If so, the module
 	  will be called rtc-s3c.
 
+config RTC_DRV_C2K
+	tristate "Mindspeed C2000 SoC RTC"
+	depends on ARCH_M86XXX && SYNO_COMCERTO
+	default y
+	help
+		Comcerto C2000 RTC (Realtime Clock) driver.
+
 config RTC_DRV_EP93XX
 	tristate "Cirrus Logic EP93XX"
 	depends on ARCH_EP93XX
diff -ur a/drivers/rtc/Makefile b/drivers/rtc/Makefile
--- a/drivers/rtc/Makefile	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/rtc/Makefile	2014-01-21 09:37:15.000000000 +0100
@@ -91,6 +91,7 @@
 obj-$(CONFIG_RTC_DRV_RX8581)	+= rtc-rx8581.o
 obj-$(CONFIG_RTC_DRV_S35390A)	+= rtc-s35390a.o
 obj-$(CONFIG_RTC_DRV_S3C)	+= rtc-s3c.o
+obj-$(CONFIG_RTC_DRV_C2K)      += rtc-c2k.o
 obj-$(CONFIG_RTC_DRV_SA1100)	+= rtc-sa1100.o
 obj-$(CONFIG_RTC_DRV_SH)	+= rtc-sh.o
 obj-$(CONFIG_RTC_DRV_SPEAR)	+= rtc-spear.o
Only in b/drivers/rtc: rtc-c2k.c.
diff -ur a/drivers/s390/kvm/kvm_virtio.c b/drivers/s390/kvm/kvm_virtio.c
--- a/drivers/s390/kvm/kvm_virtio.c	2013-08-24 11:36:53.000000000 +0200
+++ b/drivers/s390/kvm/kvm_virtio.c	2014-02-17 11:57:16.000000000 +0100
@@ -198,7 +198,7 @@
 		goto out;
 
 	vq = vring_new_virtqueue(config->num, KVM_S390_VIRTIO_RING_ALIGN,
-				 vdev, (void *) config->address,
+				 vdev, true, (void *) config->address,
 				 kvm_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
diff -ur a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c
--- a/drivers/scsi/fcoe/fcoe_ctlr.c	2013-08-24 11:37:00.000000000 +0200
+++ b/drivers/scsi/fcoe/fcoe_ctlr.c	2014-02-17 11:57:31.000000000 +0100
@@ -2030,7 +2030,7 @@
 	 */
 	port_id = fip->port_id;
 	if (fip->probe_tries)
-		port_id = prandom32(&fip->rnd_state) & 0xffff;
+		port_id = prandom_u32_state(&fip->rnd_state) & 0xffff;
 	else if (!port_id)
 		port_id = fip->lp->wwpn & 0xffff;
 	if (!port_id || port_id == 0xffff)
@@ -2055,7 +2055,7 @@
 static void fcoe_ctlr_vn_start(struct fcoe_ctlr *fip)
 {
 	fip->probe_tries = 0;
-	prandom32_seed(&fip->rnd_state, fip->lp->wwpn);
+	prandom_seed_state(&fip->rnd_state, fip->lp->wwpn);
 	fcoe_ctlr_vn_restart(fip);
 }
 
diff -ur a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
--- a/drivers/scsi/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/scsi/Kconfig	2014-01-21 09:37:15.000000000 +0100
@@ -1908,6 +1908,14 @@
 	  To compile this driver as a module, choose M here. The module will
 	  be called bfa.
 
+config SCSI_VIRTIO
+	tristate "virtio-scsi support (EXPERIMENTAL)"
+	depends on EXPERIMENTAL && VIRTIO
+	help
+          This is the virtual HBA driver for virtio.  If the kernel will
+          be used in a virtual machine, say Y or M.
+
+
 endif # SCSI_LOWLEVEL
 
 source "drivers/scsi/pcmcia/Kconfig"
diff -ur a/drivers/scsi/Makefile b/drivers/scsi/Makefile
--- a/drivers/scsi/Makefile	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/scsi/Makefile	2014-01-21 09:37:15.000000000 +0100
@@ -142,6 +142,7 @@
 obj-$(CONFIG_SCSI_BNX2_ISCSI)	+= libiscsi.o bnx2i/
 obj-$(CONFIG_BE2ISCSI)		+= libiscsi.o be2iscsi/
 obj-$(CONFIG_SCSI_PMCRAID)	+= pmcraid.o
+obj-$(CONFIG_SCSI_VIRTIO)	+= virtio_scsi.o
 obj-$(CONFIG_VMWARE_PVSCSI)	+= vmw_pvscsi.o
 
 obj-$(CONFIG_ARM)		+= arm/
diff -ur a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
--- a/drivers/scsi/scsi.c	2013-08-24 11:37:00.000000000 +0200
+++ b/drivers/scsi/scsi.c	2014-02-17 11:57:30.000000000 +0100
@@ -56,7 +56,7 @@
 #include <linux/mutex.h>
 #if defined(MY_ABC_HERE)
 #include <linux/ata.h>
-#endif
+#endif /* MY_ABC_HERE */
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_cmnd.h>
@@ -689,7 +689,7 @@
 		}
 	}
 }
-#endif
+#endif /* MY_ABC_HERE */
 
 
 /**
diff -ur a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
--- a/drivers/scsi/scsi_scan.c	2013-08-24 11:37:00.000000000 +0200
+++ b/drivers/scsi/scsi_scan.c	2014-02-17 11:57:31.000000000 +0100
@@ -50,10 +50,21 @@
 #if defined(MY_ABC_HERE)
 #define SYNO_INQUIRY_TMP_LEN 32
 #define SZ_STAT_DISK_VENDOR "ATA     "
+#define SYNO_INQUIRY_VENDOR_LEN 8
+typedef struct _tag_SYNO_DISK_VENDOR {
+	const char *szName;    /* name of vendor, or NULL for list end */
+	const int iLength; /* length of vendor */
+}SYNO_DISK_VENDOR;
+
+SYNO_DISK_VENDOR gDiskVendor[] = {
+	{"OCZ", 3},
+	{NULL, 0}
+};
 #if defined(MY_ABC_HERE)
 #define SYNO_RESULT_LEN 512
 /* The IDENTIFY DEVICE command will get most 40 characters */
 #define SYNO_IDENTIFY_DEVICE_TMP_LEN 40
+extern int syno_get_ata_identity(struct scsi_device *sdev, u16 *id);
 #endif
 #endif
 
@@ -549,6 +560,8 @@
 	char szTmpStr[SYNO_INQUIRY_TMP_LEN] = {'\0'};
 	int iCharIdx;
 	int blPreIsSpace = 0, blSegmented = 0;
+	int blSpecialVendor = 0;
+	int i = 0;
 
 	if (NULL == szInqStr || 0 == uiLen) {
 		goto END;
@@ -561,23 +574,34 @@
 	memcpy(szRevStr, szInqStr + uiLen - 4, 4);
 	memcpy(szTmpStr, szInqStr + 8, uiLen - 4 - 8);
 
-	for (iCharIdx = 0; iCharIdx < sizeof(szTmpStr); iCharIdx++) {
-		if ('\0' == szTmpStr[iCharIdx]) {
+	for (i = 0; NULL != gDiskVendor[i].szName; i++) {
+		if (!strncmp(gDiskVendor[i].szName, szTmpStr, gDiskVendor[i].iLength)) {
+			blSpecialVendor = 1;
 			break;
 		}
+	}
 
-		if (' ' == szTmpStr[iCharIdx]) {
-			blPreIsSpace = 1;
-		} else {
-			if (blPreIsSpace) {
-				blSegmented =1;
+	if (1 == blSpecialVendor) {
+		iCharIdx = gDiskVendor[i].iLength;
+	} else {
+		for (iCharIdx = 0; iCharIdx < sizeof(szTmpStr); iCharIdx++) {
+			if ('\0' == szTmpStr[iCharIdx]) {
 				break;
 			}
+
+			if (' ' == szTmpStr[iCharIdx]) {
+				blPreIsSpace = 1;
+			} else {
+				if (blPreIsSpace) {
+					blSegmented =1;
+					break;
+				}
+			}
 		}
-	}
 
-	if (!blSegmented) {
-		goto END;
+		if (!blSegmented) {
+			goto END;
+		}
 	}
 
 	memset(szInqStr, 0, uiLen);
@@ -588,6 +612,40 @@
 END:
 	return;
 }
+
+/**
+ * syno_standard_vendor_string - refine the vendor strings of SATA disks
+ * @szInqStr: INQUIRY result string to be refined
+ * @uiLen: length of the string
+ *
+ * Description:
+ * 	The verder name is not correct in some disk. For example, the vendor name of
+ * 	"OCZ-VERTEX3 MI" is "OCZ-VERT".
+ *	This function refines the INQUIRY result of SATA disks to correctly
+ *	fill the vendor name by vendor list gDiskVendor.
+ * 	The vendor name is not correct on some disks. For example, the disk model
+ * 	"OCZ-VERTEX3 MI" reports its vendor as "OCZ-VERT".
+{
+	int i = 0;
+
+	if (NULL == szInqStr || 0 == uiLen) {
+		goto END;
+	}
+
+	if (uiLen > SYNO_INQUIRY_VENDOR_LEN) {
+		uiLen = SYNO_INQUIRY_VENDOR_LEN;
+	}
+
+	for (i = 0; NULL != gDiskVendor[i].szName; i++) {
+		if (!strncmp(gDiskVendor[i].szName, szInqStr, gDiskVendor[i].iLength)) {
+			memset(szInqStr, 0, uiLen);
+			memcpy(szInqStr, gDiskVendor[i].szName, gDiskVendor[i].iLength);
+			break;
+		}
+	}
+END:
+	return;
+}
 #endif
 
 #ifdef MY_ABC_HERE
@@ -614,32 +672,16 @@
 
 static void scsi_ata_identify_device_get_model_name(struct scsi_device *sdev, unsigned char *szInqReturn)
 {
-	unsigned char szScsiCmd[MAX_COMMAND_SIZE] = {0};
-	unsigned char szInqResult[SYNO_RESULT_LEN] = {0};
 	int i = 0;
-	int iResid;
+	int blSpecialVendor = 0;
 	int blPreIsSpace = 0;
 	int blSegmented = 0;
 	int iRes = 0;
-	struct scsi_sense_hdr sshdr;
-
-	memset(szScsiCmd, 0, MAX_COMMAND_SIZE);
-
-	/* ATA PASS-THROUGH (16) command */
-	szScsiCmd[0] = 0x85;
-	/* PROTOCOL=PIO Data-In */
-	szScsiCmd[1] = 0x08;
-	/* T_DIR=1, BYT_BLOK=1, T_LENGTH=2 */
-	szScsiCmd[2] = 0x0e;
-	/* ATA IDENTIFY DEVICE command */
-	szScsiCmd[14] = 0xec;
-
-	iRes = scsi_execute_req(sdev, szScsiCmd, DMA_FROM_DEVICE,
-		szInqResult, SYNO_RESULT_LEN, &sshdr, HZ*10, 5,	&iResid);
+	u16 id[SYNO_RESULT_LEN / 2] = {0};
+	unsigned char szInqResult[SYNO_RESULT_LEN] = {0};
 
-	if (iRes || '\0' == szInqResult[54+i]) {
-		return;
-	}
+	iRes = syno_get_ata_identity(sdev, id);
+	memcpy(szInqResult, id, SYNO_RESULT_LEN);
 
 	/* Swap string for endian problems */
 	for (i = 0; i < SYNO_RESULT_LEN - 1; i += 2)
@@ -649,25 +691,45 @@
 		szInqResult[i+1] = tmp;
 	}
 
-	/* Search the end of vendor name */
-	for (i = 0; i < SYNO_IDENTIFY_DEVICE_TMP_LEN; i++) {
-		if ('\0' == szInqResult[54+i]) {
+	if (!iRes || '\0' == szInqResult[54]) {
+		return;
+	}
+
+	for (i = 0; NULL != gDiskVendor[i].szName; i++) {
+		/* The disk model name starts from word 27 in the buffer */
+		if (!strncmp(gDiskVendor[i].szName, &szInqResult[54], gDiskVendor[i].iLength)) {
+			blSpecialVendor = 1;
 			break;
 		}
+	}
 
-		if (' ' == szInqResult[54+i]) {
-			if(1 == blPreIsSpace){
+	if (1 == blSpecialVendor) {
+		if (' ' == szInqResult[54 + gDiskVendor[i].iLength] || '-' == szInqResult[54 + gDiskVendor[i].iLength]) {
+			i = gDiskVendor[i].iLength + 1;
+		} else {
+			i = gDiskVendor[i].iLength;
+		}
+	} else {
+		/* Search the end of vendor name */
+		for (i = 0; i < SYNO_IDENTIFY_DEVICE_TMP_LEN; i++) {
+			if ('\0' == szInqResult[54+i]) {
+				break;
+			}
+
+			if (' ' == szInqResult[54+i]) {
+				if(1 == blPreIsSpace){
+					break;
+				}
+				blPreIsSpace = 1;
+			} else if (blPreIsSpace) {
+				blSegmented = 1;
 				break;
 			}
-			blPreIsSpace = 1;
-		} else if (blPreIsSpace) {
-			blSegmented = 1;
-			break;
 		}
-	}
 
-	if (!blSegmented){
-		i = 0;
+		if (!blSegmented){
+			i = 0;
+		}
 	}
 
 	/* The disk model name start from word 27 in the buffer */
@@ -794,6 +856,8 @@
 		 */
 		if (!strncmp(&inq_result[8], SZ_STAT_DISK_VENDOR, 8)) {
 			syno_standard_inquiry_string(&inq_result[8], 28);
+		} else {
+			syno_standard_vendor_string(&inq_result[8], 8);
 		}
 #endif
 		sanitize_inquiry_string(&inq_result[8], 8);
diff -ur a/drivers/scsi/scsi_sysfs.c b/drivers/scsi/scsi_sysfs.c
--- a/drivers/scsi/scsi_sysfs.c	2013-08-24 11:37:01.000000000 +0200
+++ b/drivers/scsi/scsi_sysfs.c	2014-02-17 11:57:33.000000000 +0100
@@ -577,14 +577,32 @@
 	if (NULL == (sdev = to_scsi_device(dev))) {
 		goto END;
 	}
-	
+
 	iRet = snprintf (buf, 20, "%lu\n", (jiffies - sdev->idle) / HZ + 1);
-	
+
 END:
 	return iRet;
 }
 
-static DEVICE_ATTR(syno_idle_time, S_IRUGO, sdev_show_syno_idle_time, NULL);
+static ssize_t
+sdev_store_syno_idle_time(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct scsi_device *sdev;
+	unsigned long idletime;
+
+	if (NULL == (sdev = to_scsi_device(dev))) {
+		goto END;
+	}
+
+	sscanf(buf, "%lu", &idletime);
+	// idletime = (jiffies - sdev->idle) / HZ + 1
+	sdev->idle = jiffies - (idletime -1) * HZ;
+
+END:
+	return count;
+}
+
+static DEVICE_ATTR(syno_idle_time, S_IRUGO | S_IWUSR, sdev_show_syno_idle_time, sdev_store_syno_idle_time);
 
 static ssize_t
 sdev_show_syno_spindown(struct device *dev, struct device_attribute *attr, char *buf)
diff -ur a/drivers/scsi/sd.c b/drivers/scsi/sd.c
--- a/drivers/scsi/sd.c	2013-08-24 11:37:00.000000000 +0200
+++ b/drivers/scsi/sd.c	2014-02-17 11:57:31.000000000 +0100
@@ -101,15 +101,11 @@
 #endif
 
 #ifdef MY_ABC_HERE
-extern int syno_hibernation_log_sec;
-#endif
-
-#ifdef MY_ABC_HERE
 extern int gSynoHasDynModule;
 #endif
 
-#ifdef CONFIG_SYNO_ARMADA
-extern int gSynoUSBStation;
+#ifdef CONFIG_SYNO_DUAL_HEAD
+extern int gSynoDualHead;
 #endif
 
 #ifdef MY_ABC_HERE
@@ -2606,6 +2602,37 @@
 	return;
 }
 
+#if defined(MY_ABC_HERE)
+/**
+ * syno_get_ata_identity - Get ATA IDENTITY via ATA PASS-THRU command
+ * @sdev: the disk you want to get ata identity
+ * @id: ata identity result will stored in here
+ *
+ * return 0: if it's SAS disk or failed
+ *        1: success
+ */
+int
+syno_get_ata_identity(struct scsi_device *sdev, u16 *id)
+{
+	unsigned char scsi_cmd[MAX_COMMAND_SIZE] = {0};
+
+	/* ATA IDENTIFY DEVICE via ATA PASS-THRU(16)*/
+	scsi_cmd[0] = ATA_16;
+	scsi_cmd[1] = 0x08; /* PIO Data-in */
+	scsi_cmd[2] = 0x0e; /* T_DIR=1, BYT_BLOK=1, T_LENGTH=2 */
+	scsi_cmd[14] = ATA_CMD_ID_ATA;
+
+	/* if it is a SAS disk, ATA PASS-THROUGH will fail; return 0 */
+	if (scsi_execute_req(sdev, scsi_cmd, DMA_FROM_DEVICE,
+		id, 512, NULL, 10 * HZ, 5, NULL)) {
+		return 0;
+	}
+
+	return 1;
+}
+EXPORT_SYMBOL(syno_get_ata_identity);
+#endif
+
 /**
  * sd_read_block_limits - Query disk device for preferred I/O sizes.
  * @disk: disk to query
@@ -3035,6 +3062,12 @@
 	}
 
 	if (SYNO_PORT_TYPE_SATA == sdp->host->hostt->syno_port_type) {
+#ifdef CONFIG_SYNO_DUAL_HEAD
+		if (1 == gSynoDualHead && (!strncmp(SYNO_SATA_DOM_VENDOR, sdp->vendor, strlen(SYNO_SATA_DOM_VENDOR))
+					|| !strncmp(SYNO_SATA_DOM_MODEL, sdp->model, strlen(SYNO_SATA_DOM_MODEL)))){
+			return SYNO_DISK_SYNOBOOT;
+		}
+#endif
 		// else treat as internal disks
 		return SYNO_DISK_SATA;
 	}
@@ -3133,7 +3166,7 @@
 		sdp->idle = jiffies;
 		sdp->nospindown = 0;
 		sdp->spindown = 0;
-#endif
+#endif /* MY_ABC_HERE */
 
 		spin_lock(&sd_index_lock);
 
@@ -3164,24 +3197,7 @@
 					break;
 				}
 #endif
-
-#ifdef CONFIG_SYNO_ARMADA
-				/* The device node of internal micro SD card of USB station 3 should be fixed to sda. */
-				if (1 == gSynoHasDynModule && 1 == gSynoUSBStation) {
-					struct us_data *us = host_to_us(sdp->host);
-					struct usb_device *usbdev = us->pusb_dev;
-
-					if (0 == strcmp((&(&usbdev->dev)->kobj)->name, SYNO_INTERNAL_MICROSD_NAME)) {
-						want_idx = 0;
-					} else {
-						want_idx = SYNO_MAX_INTERNAL_DISK + 1;
-					}
-				} else {
-					want_idx = SYNO_MAX_INTERNAL_DISK + 1;
-				}
-#else /* CONFIG_SYNO_ARMADA */
 				want_idx = SYNO_MAX_INTERNAL_DISK + 1;
-#endif /* CONFIG_SYNO_ARMADA */
 				break;
 			case SYNO_DISK_SAS:
 			case SYNO_DISK_SATA:
Only in b/drivers/scsi: virtio_scsi.c.
Only in b/drivers/spi: comcerto_spi.c.
diff -ur a/drivers/spi/Kconfig b/drivers/spi/Kconfig
--- a/drivers/spi/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/spi/Kconfig	2014-01-21 09:37:18.000000000 +0100
@@ -399,6 +399,22 @@
 
 	  Or for the DS570, see "XPS Serial Peripheral Interface (SPI) (v2.00b)"
 
+config SPI_MSPD_LOW_SPEED
+	tristate "MSPD SPI low-speed controller module"
+	depends on SYNO_COMCERTO
+	select SPI_DESIGNWARE
+	default SPI_MASTER
+	help
+		This exposes the MSPD SPI low-speed controller.
+
+config SPI_MSPD_HIGH_SPEED
+	tristate "MSPD SPI high-speed controller module"
+	depends on SYNO_COMCERTO
+	select SPI_DESIGNWARE
+	default SPI_MASTER
+	help
+		This exposes the MSPD SPI high-speed controller.
+
 config SPI_NUC900
 	tristate "Nuvoton NUC900 series SPI"
 	depends on ARCH_W90X900 && EXPERIMENTAL
diff -ur a/drivers/spi/Makefile b/drivers/spi/Makefile
--- a/drivers/spi/Makefile	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/spi/Makefile	2014-01-21 09:37:18.000000000 +0100
@@ -22,6 +22,25 @@
 obj-$(CONFIG_SPI_BITBANG)		+= spi-bitbang.o
 obj-$(CONFIG_SPI_BUTTERFLY)		+= spi-butterfly.o
 obj-$(CONFIG_SPI_COLDFIRE_QSPI)		+= spi-coldfire-qspi.o
+
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+SPI_MSPD=n
+ifeq ($(CONFIG_SPI_MSPD_HIGH_SPEED),m)
+    SPI_MSPD=m
+endif
+ifeq ($(CONFIG_SPI_MSPD_LOW_SPEED),m)
+    SPI_MSPD=m
+endif
+ifeq ($(CONFIG_SPI_MSPD_HIGH_SPEED),y)
+    SPI_MSPD=y
+endif
+ifeq ($(CONFIG_SPI_MSPD_LOW_SPEED),y)
+    SPI_MSPD=y
+endif
+obj-$(SPI_MSPD)				+= spi-c2000-dma.o
+obj-$(SPI_MSPD)				+= spi-c2000.o
+endif
+
 obj-$(CONFIG_SPI_DAVINCI)		+= spi-davinci.o
 obj-$(CONFIG_SPI_DESIGNWARE)		+= spi-dw.o
 obj-$(CONFIG_SPI_DW_MMIO)		+= spi-dw-mmio.o
Only in b/drivers/spi: spi-c2000.c.
Only in b/drivers/spi: spi-c2000-dma.c.
Only in b/drivers/spi: spi-c2000-dma.h.
diff -ur a/drivers/spi/spi-dw.c b/drivers/spi/spi-dw.c
--- a/drivers/spi/spi-dw.c	2013-08-24 11:36:50.000000000 +0200
+++ b/drivers/spi/spi-dw.c	2014-02-17 11:57:10.000000000 +0100
@@ -24,6 +24,14 @@
 #include <linux/delay.h>
 #include <linux/slab.h>
 #include <linux/spi/spi.h>
+#if defined(CONFIG_SYNO_COMCERTO)
+#include <linux/clk.h>
+#endif
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_C2K_DEVFREQ_DW)
+	#include <linux/c2k-devfreq.h>
+	#include <linux/devfreq.h>
+#endif
 
 #include "spi-dw.h"
 
@@ -62,6 +70,48 @@
 	void (*cs_control)(u32 command);
 };
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_C2K_DEVFREQ_DW)
+static devfreq_counters dc;
+
+static int set_spi_freq(struct c2k_devfreq_data *data, unsigned long *freq)
+{
+	struct spi_device *spi = container_of(data->dev, struct spi_device, dev);
+	struct dw_spi *dws = container_of(&spi, struct dw_spi, cur_dev);
+	struct chip_data *chip;
+	u32 clk_div;
+
+	/* Only alloc on first setup */
+	chip = spi_get_ctldata(spi);
+
+	if (!chip)
+	{
+		chip = spi_get_ctldata(spi);
+		if (!chip) {
+			chip = kzalloc(sizeof(struct chip_data), GFP_KERNEL);
+			if (!chip)
+				return -ENOMEM;
+		}
+	}
+
+	if ((*freq <= data->max_freq) && (*freq >= data->min_freq))
+	{
+		chip->speed_hz = (u32)*freq;
+		clk_div = dws->max_freq / chip->speed_hz;
+		clk_div = (clk_div + 1) & 0xfffe;
+
+		chip->clk_div = clk_div;
+		spi_set_clk(dws, chip->clk_div);
+	}
+	else
+	{
+		printk (KERN_ERR "%s: Trying to set out of range spi freq: %lu\n\
+			", __func__, *freq);
+	}
+
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_DEBUG_FS
 static int spi_show_regs_open(struct inode *inode, struct file *file)
 {
@@ -394,6 +444,10 @@
 	u32 speed = 0;
 	u32 cr0 = 0;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_C2K_DEVFREQ_DW)
+	devfreq_func_start(&dc);
+#endif
+
 	/* Get current state information */
 	message = dws->cur_msg;
 	transfer = dws->cur_transfer;
@@ -548,6 +602,10 @@
 	if (chip->poll_mode)
 		poll_transfer(dws);
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_C2K_DEVFREQ_DW)
+	devfreq_func_end(&dc);
+#endif
+
 	return;
 
 early_exit:
@@ -598,6 +656,9 @@
 {
 	struct dw_spi *dws = spi_master_get_devdata(spi->master);
 	unsigned long flags;
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_C2K_DEVFREQ_DW)
+	devfreq_func_start(&dc);
+#endif
 
 	spin_lock_irqsave(&dws->lock, flags);
 
@@ -626,6 +687,9 @@
 	}
 
 	spin_unlock_irqrestore(&dws->lock, flags);
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_C2K_DEVFREQ_DW)
+	devfreq_func_end(&dc);
+#endif
 	return 0;
 }
 
@@ -801,6 +865,49 @@
 	}
 }
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_C2K_DEVFREQ_DW)
+#define	SPI_MAXFREQ_MHZ	2000000
+#define	SPI_MINFREQ_MHZ	1000000
+#define	POLLING_MS	1000
+#define	OPP_TABLE_SIZE	4
+
+struct devfreq_dev_profile spi_devfreq_profile; 
+struct c2k_devfreq_data devfreq_spi_data;
+static struct c2k_devfreq_opp_table spi_opp_tbl[OPP_TABLE_SIZE];
+
+/* 
+ * initialize OPP table, profile data (initial freq, polling interval),
+ * max/min freq supported by SPI controller.
+ */
+static void init_spi_devfreq_data(struct dw_spi *dws)
+{
+	int i = 0;
+	struct c2k_devfreq_opp_table opp_tbl[OPP_TABLE_SIZE] = {
+		{1, 1000000, 0},
+		{2, 2000000, 0},
+		{3, 4000000, 0},
+		{0, 0, 0},
+	};
+
+	while (i < OPP_TABLE_SIZE)
+	{
+		spi_opp_tbl[i].idx = opp_tbl[i].idx;
+		spi_opp_tbl[i].freq = opp_tbl[i].freq;
+		spi_opp_tbl[i].volt = opp_tbl[i].volt;
+		i++;
+	}
+
+	spi_devfreq_profile.initial_freq = dws->max_freq;
+	spi_devfreq_profile.polling_ms = POLLING_MS;
+
+	devfreq_spi_data.devfreq_profile = &spi_devfreq_profile;
+	devfreq_spi_data.opp_table = &spi_opp_tbl[0];
+	devfreq_spi_data.set_freq = set_spi_freq;
+	devfreq_spi_data.max_freq = SPI_MAXFREQ_MHZ;
+	devfreq_spi_data.min_freq = SPI_MINFREQ_MHZ;
+}
+#endif
+
 int __devinit dw_spi_add_host(struct dw_spi *dws)
 {
 	struct spi_master *master;
@@ -814,6 +921,10 @@
 		goto exit;
 	}
 
+#if defined(CONFIG_SYNO_COMCERTO) 
+	clk_enable(dws->clk_spi);
+#endif
+
 	dws->master = master;
 	dws->type = SSI_MOTO_SPI;
 	dws->prev_chip = NULL;
@@ -867,6 +978,14 @@
 	}
 
 	mrst_spi_debugfs_init(dws);
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_C2K_DEVFREQ_DW)
+	init_spi_devfreq_data(dws);
+
+	ret = c2k_driver_devfreq(&master->dev, &devfreq_spi_data);
+	if(ret < 0)
+		goto err_queue_alloc;
+#endif
 	return 0;
 
 err_queue_alloc:
@@ -877,6 +996,9 @@
 	spi_enable_chip(dws, 0);
 	free_irq(dws->irq, dws);
 err_free_master:
+#if defined(CONFIG_SYNO_COMCERTO)
+	clk_disable(dws->clk_spi);
+#endif
 	spi_master_put(master);
 exit:
 	return ret;
@@ -900,12 +1022,18 @@
 	if (dws->dma_ops && dws->dma_ops->dma_exit)
 		dws->dma_ops->dma_exit(dws);
 	spi_enable_chip(dws, 0);
+#if !defined(CONFIG_SYNO_COMCERTO)
 	/* Disable clk */
 	spi_set_clk(dws, 0);
+#endif
 	free_irq(dws->irq, dws);
 
 	/* Disconnect from the SPI framework */
 	spi_unregister_master(dws->master);
+#if defined(CONFIG_SYNO_COMCERTO)
+	/* Disable clk */
+	clk_disable(dws->clk_spi);
+#endif
 }
 EXPORT_SYMBOL_GPL(dw_spi_remove_host);
 
@@ -917,7 +1045,12 @@
 	if (ret)
 		return ret;
 	spi_enable_chip(dws, 0);
+#if defined(CONFIG_SYNO_COMCERTO)
+	clk_disable(dws->clk_spi);
+#else
 	spi_set_clk(dws, 0);
+#endif
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(dw_spi_suspend_host);
@@ -926,6 +1059,9 @@
 {
 	int ret;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	clk_enable(dws->clk_spi);
+#endif
 	spi_hw_init(dws);
 	ret = start_queue(dws);
 	if (ret)
diff -ur a/drivers/spi/spi-dw.h b/drivers/spi/spi-dw.h
--- a/drivers/spi/spi-dw.h	2013-08-24 11:36:50.000000000 +0200
+++ b/drivers/spi/spi-dw.h	2014-02-17 11:57:10.000000000 +0100
@@ -92,6 +92,9 @@
 struct dw_spi {
 	struct spi_master	*master;
 	struct spi_device	*cur_dev;
+#if defined(CONFIG_SYNO_COMCERTO)
+	struct clk		*clk_spi;
+#endif
 	struct device		*parent_dev;
 	enum dw_ssi_type	type;
 	char			name[16];
Only in b/drivers: spi2.
diff -ur a/drivers/staging/Kconfig b/drivers/staging/Kconfig
--- a/drivers/staging/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/staging/Kconfig	2014-01-21 09:37:19.000000000 +0100
@@ -88,6 +88,8 @@
 
 source "drivers/staging/iio/Kconfig"
 
+source "drivers/staging/zsmalloc/Kconfig"
+
 source "drivers/staging/zram/Kconfig"
 
 source "drivers/staging/zcache/Kconfig"
diff -ur a/drivers/staging/Makefile b/drivers/staging/Makefile
--- a/drivers/staging/Makefile	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/staging/Makefile	2014-01-21 09:37:19.000000000 +0100
@@ -35,6 +35,7 @@
 obj-$(CONFIG_VME_BUS)		+= vme/
 obj-$(CONFIG_DX_SEP)            += sep/
 obj-$(CONFIG_IIO)		+= iio/
+obj-$(CONFIG_ZSMALLOC)		+= zsmalloc/
 obj-$(CONFIG_ZRAM)		+= zram/
 obj-$(CONFIG_XVMALLOC)		+= zram/
 obj-$(CONFIG_ZCACHE)		+= zcache/
diff -ur a/drivers/staging/zram/Kconfig b/drivers/staging/zram/Kconfig
--- a/drivers/staging/zram/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/staging/zram/Kconfig	2014-01-21 09:37:20.000000000 +0100
@@ -1,11 +1,6 @@
-config XVMALLOC
-	bool
-	default n
-
 config ZRAM
 	tristate "Compressed RAM block device support"
-	depends on BLOCK && SYSFS
-	select XVMALLOC
+	depends on BLOCK && SYSFS && ZSMALLOC
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
 	default n
@@ -19,7 +14,7 @@
 	  disks and maybe many more.
 
 	  See zram.txt for more information.
-	  Project home: http://compcache.googlecode.com/
+	  Project home: <https://compcache.googlecode.com/>
 
 config ZRAM_DEBUG
 	bool "Compressed RAM block device debug support"
diff -ur a/drivers/staging/zram/Makefile b/drivers/staging/zram/Makefile
--- a/drivers/staging/zram/Makefile	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/staging/zram/Makefile	2014-01-21 09:37:20.000000000 +0100
@@ -1,4 +1,3 @@
-zram-y	:=	zram_drv.o zram_sysfs.o
+zram-y	:=	zram_drv.o
 
 obj-$(CONFIG_ZRAM)	+=	zram.o
-obj-$(CONFIG_XVMALLOC)	+=	xvmalloc.o
\ No newline at end of file
Only in a/drivers/staging/zram: xvmalloc.c.
Only in a/drivers/staging/zram: xvmalloc.h.
Only in a/drivers/staging/zram: xvmalloc_int.h.
diff -ur a/drivers/staging/zram/zram_drv.c b/drivers/staging/zram/zram_drv.c
--- a/drivers/staging/zram/zram_drv.c	2013-08-24 11:37:13.000000000 +0200
+++ b/drivers/staging/zram/zram_drv.c	2014-02-17 11:57:51.000000000 +0100
@@ -37,301 +37,371 @@
 
 /* Globals */
 static int zram_major;
-struct zram *zram_devices;
+static struct zram *zram_devices;
 
 /* Module params (documentation at end) */
-unsigned int zram_num_devices;
+static unsigned int num_devices = 1;
 
-static void zram_stat_inc(u32 *v)
+static inline struct zram *dev_to_zram(struct device *dev)
 {
-	*v = *v + 1;
+	return (struct zram *)dev_to_disk(dev)->private_data;
 }
 
-static void zram_stat_dec(u32 *v)
+static ssize_t disksize_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
-	*v = *v - 1;
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n", zram->disksize);
+}
+
+static ssize_t initstate_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%u\n", zram->init_done);
+}
+
+static ssize_t num_reads_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n",
+			(u64)atomic64_read(&zram->stats.num_reads));
 }
 
-static void zram_stat64_add(struct zram *zram, u64 *v, u64 inc)
+static ssize_t num_writes_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
-	spin_lock(&zram->stat64_lock);
-	*v = *v + inc;
-	spin_unlock(&zram->stat64_lock);
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n",
+			(u64)atomic64_read(&zram->stats.num_writes));
 }
 
-static void zram_stat64_sub(struct zram *zram, u64 *v, u64 dec)
+static ssize_t invalid_io_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
-	spin_lock(&zram->stat64_lock);
-	*v = *v - dec;
-	spin_unlock(&zram->stat64_lock);
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n",
+			(u64)atomic64_read(&zram->stats.invalid_io));
 }
 
-static void zram_stat64_inc(struct zram *zram, u64 *v)
+static ssize_t notify_free_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
 {
-	zram_stat64_add(zram, v, 1);
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n",
+			(u64)atomic64_read(&zram->stats.notify_free));
 }
 
-static int zram_test_flag(struct zram *zram, u32 index,
+static ssize_t zero_pages_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%u\n", zram->stats.pages_zero);
+}
+
+static ssize_t orig_data_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n",
+		(u64)(zram->stats.pages_stored) << PAGE_SHIFT);
+}
+
+static ssize_t compr_data_size_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return sprintf(buf, "%llu\n",
+			(u64)atomic64_read(&zram->stats.compr_size));
+}
+
+static ssize_t mem_used_total_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	u64 val = 0;
+	struct zram *zram = dev_to_zram(dev);
+	struct zram_meta *meta = zram->meta;
+
+	down_read(&zram->init_lock);
+	if (zram->init_done)
+		val = zs_get_total_size_bytes(meta->mem_pool);
+	up_read(&zram->init_lock);
+
+	return sprintf(buf, "%llu\n", val);
+}
+
+static int zram_test_flag(struct zram_meta *meta, u32 index,
 			enum zram_pageflags flag)
 {
-	return zram->table[index].flags & BIT(flag);
+	return meta->table[index].flags & BIT(flag);
 }
 
-static void zram_set_flag(struct zram *zram, u32 index,
+static void zram_set_flag(struct zram_meta *meta, u32 index,
 			enum zram_pageflags flag)
 {
-	zram->table[index].flags |= BIT(flag);
+	meta->table[index].flags |= BIT(flag);
 }
 
-static void zram_clear_flag(struct zram *zram, u32 index,
+static void zram_clear_flag(struct zram_meta *meta, u32 index,
 			enum zram_pageflags flag)
 {
-	zram->table[index].flags &= ~BIT(flag);
+	meta->table[index].flags &= ~BIT(flag);
 }
 
-static int page_zero_filled(void *ptr)
+static inline int is_partial_io(struct bio_vec *bvec)
 {
-	unsigned int pos;
-	unsigned long *page;
+	return bvec->bv_len != PAGE_SIZE;
+}
 
-	page = (unsigned long *)ptr;
+/*
+ * Check if request is within bounds and aligned on zram logical blocks.
+ */
+static inline int valid_io_request(struct zram *zram, struct bio *bio)
+{
+	u64 start, end, bound;
 
-	for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
-		if (page[pos])
-			return 0;
-	}
+	/* unaligned request */
+	if (unlikely(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
+		return 0;
+	if (unlikely(bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
+		return 0;
+
+	start = bio->bi_sector;
+	end = start + (bio->bi_size >> SECTOR_SHIFT);
+	bound = zram->disksize >> SECTOR_SHIFT;
+	/* out of range */
+	if (unlikely(start >= bound || end > bound || start > end))
+		return 0;
 
+	/* I/O request is valid */
 	return 1;
 }
 
-static void zram_set_disksize(struct zram *zram, size_t totalram_bytes)
+static void zram_meta_free(struct zram_meta *meta)
 {
-	if (!zram->disksize) {
-		pr_info(
-		"disk size not provided. You can use disksize_kb module "
-		"param to specify size.\nUsing default: (%u%% of RAM).\n",
-		default_disksize_perc_ram
-		);
-		zram->disksize = default_disksize_perc_ram *
-					(totalram_bytes / 100);
-	}
-
-	if (zram->disksize > 2 * (totalram_bytes)) {
-		pr_info(
-		"There is little point creating a zram of greater than "
-		"twice the size of memory since we expect a 2:1 compression "
-		"ratio. Note that zram uses about 0.1%% of the size of "
-		"the disk when not in use so a huge zram is "
-		"wasteful.\n"
-		"\tMemory Size: %zu kB\n"
-		"\tSize you selected: %llu kB\n"
-		"Continuing anyway ...\n",
-		totalram_bytes >> 10, zram->disksize
-		);
-	}
-
-	zram->disksize &= PAGE_MASK;
+	zs_destroy_pool(meta->mem_pool);
+	kfree(meta->compress_workmem);
+	free_pages((unsigned long)meta->compress_buffer, 1);
+	vfree(meta->table);
+	kfree(meta);
 }
 
-static void zram_free_page(struct zram *zram, size_t index)
+static struct zram_meta *zram_meta_alloc(u64 disksize)
 {
-	u32 clen;
-	void *obj;
-
-	struct page *page = zram->table[index].page;
-	u32 offset = zram->table[index].offset;
+	size_t num_pages;
+	struct zram_meta *meta = kmalloc(sizeof(*meta), GFP_KERNEL);
+	if (!meta)
+		goto out;
 
-	if (unlikely(!page)) {
-		/*
-		 * No memory is allocated for zero filled pages.
-		 * Simply clear zero page flag.
-		 */
-		if (zram_test_flag(zram, index, ZRAM_ZERO)) {
-			zram_clear_flag(zram, index, ZRAM_ZERO);
-			zram_stat_dec(&zram->stats.pages_zero);
-		}
-		return;
+	meta->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+	if (!meta->compress_workmem)
+		goto free_meta;
+
+	meta->compress_buffer =
+		(void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
+	if (!meta->compress_buffer) {
+		pr_err("Error allocating compressor buffer space\n");
+		goto free_workmem;
 	}
 
-	if (unlikely(zram_test_flag(zram, index, ZRAM_UNCOMPRESSED))) {
-		clen = PAGE_SIZE;
-		__free_page(page);
-		zram_clear_flag(zram, index, ZRAM_UNCOMPRESSED);
-		zram_stat_dec(&zram->stats.pages_expand);
-		goto out;
+	num_pages = disksize >> PAGE_SHIFT;
+	meta->table = vzalloc(num_pages * sizeof(*meta->table));
+	if (!meta->table) {
+		pr_err("Error allocating zram address table\n");
+		goto free_buffer;
 	}
 
-	obj = kmap_atomic(page, KM_USER0) + offset;
-	clen = xv_get_object_size(obj) - sizeof(struct zobj_header);
-	kunmap_atomic(obj, KM_USER0);
+	meta->mem_pool = zs_create_pool(GFP_NOIO | __GFP_HIGHMEM);
+	if (!meta->mem_pool) {
+		pr_err("Error creating memory pool\n");
+		goto free_table;
+	}
 
-	xv_free(zram->mem_pool, page, offset);
-	if (clen <= PAGE_SIZE / 2)
-		zram_stat_dec(&zram->stats.good_compress);
+	return meta;
 
+free_table:
+	vfree(meta->table);
+free_buffer:
+	free_pages((unsigned long)meta->compress_buffer, 1);
+free_workmem:
+	kfree(meta->compress_workmem);
+free_meta:
+	kfree(meta);
+	meta = NULL;
 out:
-	zram_stat64_sub(zram, &zram->stats.compr_size, clen);
-	zram_stat_dec(&zram->stats.pages_stored);
+	return meta;
+}
 
-	zram->table[index].page = NULL;
-	zram->table[index].offset = 0;
+static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
+{
+	if (*offset + bvec->bv_len >= PAGE_SIZE)
+		(*index)++;
+	*offset = (*offset + bvec->bv_len) % PAGE_SIZE;
 }
 
-static void handle_zero_page(struct bio_vec *bvec)
+static int page_zero_filled(void *ptr)
 {
-	struct page *page = bvec->bv_page;
-	void *user_mem;
+	unsigned int pos;
+	unsigned long *page;
 
-	user_mem = kmap_atomic(page, KM_USER0);
-	memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
-	kunmap_atomic(user_mem, KM_USER0);
+	page = (unsigned long *)ptr;
 
-	flush_dcache_page(page);
+	for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
+		if (page[pos])
+			return 0;
+	}
+
+	return 1;
 }
 
-static void handle_uncompressed_page(struct zram *zram, struct bio_vec *bvec,
-				     u32 index, int offset)
+static void handle_zero_page(struct bio_vec *bvec)
 {
 	struct page *page = bvec->bv_page;
-	unsigned char *user_mem, *cmem;
-
-	user_mem = kmap_atomic(page, KM_USER0);
-	cmem = kmap_atomic(zram->table[index].page, KM_USER1);
+	void *user_mem;
 
-	memcpy(user_mem + bvec->bv_offset, cmem + offset, bvec->bv_len);
-	kunmap_atomic(cmem, KM_USER1);
-	kunmap_atomic(user_mem, KM_USER0);
+	user_mem = kmap_atomic(page);
+	if (is_partial_io(bvec))
+		memset(user_mem + bvec->bv_offset, 0, bvec->bv_len);
+	else
+		clear_page(user_mem);
+	kunmap_atomic(user_mem);
 
 	flush_dcache_page(page);
 }
 
-static inline int is_partial_io(struct bio_vec *bvec)
-{
-	return bvec->bv_len != PAGE_SIZE;
-}
-
-static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
-			  u32 index, int offset, struct bio *bio)
+static void zram_free_page(struct zram *zram, size_t index)
 {
-	int ret;
-	size_t clen;
-	struct page *page;
-	struct zobj_header *zheader;
-	unsigned char *user_mem, *cmem, *uncmem = NULL;
-
-	page = bvec->bv_page;
+	struct zram_meta *meta = zram->meta;
+	unsigned long handle = meta->table[index].handle;
+	u16 size = meta->table[index].size;
 
-	if (zram_test_flag(zram, index, ZRAM_ZERO)) {
-		handle_zero_page(bvec);
-		return 0;
+	if (unlikely(!handle)) {
+		/*
+		 * No memory is allocated for zero filled pages.
+		 * Simply clear zero page flag.
+		 */
+		if (zram_test_flag(meta, index, ZRAM_ZERO)) {
+			zram_clear_flag(meta, index, ZRAM_ZERO);
+			zram->stats.pages_zero--;
+		}
+		return;
 	}
 
-	/* Requested page is not present in compressed area */
-	if (unlikely(!zram->table[index].page)) {
-		pr_debug("Read before write: sector=%lu, size=%u",
-			 (ulong)(bio->bi_sector), bio->bi_size);
-		handle_zero_page(bvec);
-		return 0;
-	}
+	if (unlikely(size > max_zpage_size))
+		zram->stats.bad_compress--;
 
-	/* Page is stored uncompressed since it's incompressible */
-	if (unlikely(zram_test_flag(zram, index, ZRAM_UNCOMPRESSED))) {
-		handle_uncompressed_page(zram, bvec, index, offset);
-		return 0;
-	}
+	zs_free(meta->mem_pool, handle);
 
-	if (is_partial_io(bvec)) {
-		/* Use  a temporary buffer to decompress the page */
-		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
-		if (!uncmem) {
-			pr_info("Error allocating temp memory!\n");
-			return -ENOMEM;
-		}
-	}
+	if (size <= PAGE_SIZE / 2)
+		zram->stats.good_compress--;
 
-	user_mem = kmap_atomic(page, KM_USER0);
-	if (!is_partial_io(bvec))
-		uncmem = user_mem;
-	clen = PAGE_SIZE;
+	atomic64_sub(meta->table[index].size, &zram->stats.compr_size);
+	zram->stats.pages_stored--;
 
-	cmem = kmap_atomic(zram->table[index].page, KM_USER1) +
-		zram->table[index].offset;
+	meta->table[index].handle = 0;
+	meta->table[index].size = 0;
+}
 
-	ret = lzo1x_decompress_safe(cmem + sizeof(*zheader),
-				    xv_get_object_size(cmem) - sizeof(*zheader),
-				    uncmem, &clen);
+static int zram_decompress_page(struct zram *zram, char *mem, u32 index)
+{
+	int ret = LZO_E_OK;
+	size_t clen = PAGE_SIZE;
+	unsigned char *cmem;
+	struct zram_meta *meta = zram->meta;
+	unsigned long handle = meta->table[index].handle;
 
-	if (is_partial_io(bvec)) {
-		memcpy(user_mem + bvec->bv_offset, uncmem + offset,
-		       bvec->bv_len);
-		kfree(uncmem);
+	if (!handle || zram_test_flag(meta, index, ZRAM_ZERO)) {
+		clear_page(mem);
+		return 0;
 	}
 
-	kunmap_atomic(cmem, KM_USER1);
-	kunmap_atomic(user_mem, KM_USER0);
+	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO);
+	if (meta->table[index].size == PAGE_SIZE)
+		copy_page(mem, cmem);
+	else
+		ret = lzo1x_decompress_safe(cmem, meta->table[index].size,
+						mem, &clen);
+	zs_unmap_object(meta->mem_pool, handle);
 
 	/* Should NEVER happen. Return bio error if it does. */
 	if (unlikely(ret != LZO_E_OK)) {
 		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
-		zram_stat64_inc(zram, &zram->stats.failed_reads);
+		atomic64_inc(&zram->stats.failed_reads);
 		return ret;
 	}
 
-	flush_dcache_page(page);
-
 	return 0;
 }
 
-static int zram_read_before_write(struct zram *zram, char *mem, u32 index)
+static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
+			  u32 index, int offset, struct bio *bio)
 {
 	int ret;
-	size_t clen = PAGE_SIZE;
-	struct zobj_header *zheader;
-	unsigned char *cmem;
+	struct page *page;
+	unsigned char *user_mem, *uncmem = NULL;
+	struct zram_meta *meta = zram->meta;
+	page = bvec->bv_page;
 
-	if (zram_test_flag(zram, index, ZRAM_ZERO) ||
-	    !zram->table[index].page) {
-		memset(mem, 0, PAGE_SIZE);
+	if (unlikely(!meta->table[index].handle) ||
+			zram_test_flag(meta, index, ZRAM_ZERO)) {
+		handle_zero_page(bvec);
 		return 0;
 	}
 
-	cmem = kmap_atomic(zram->table[index].page, KM_USER0) +
-		zram->table[index].offset;
+	if (is_partial_io(bvec))
+		/* Use  a temporary buffer to decompress the page */
+		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
 
-	/* Page is stored uncompressed since it's incompressible */
-	if (unlikely(zram_test_flag(zram, index, ZRAM_UNCOMPRESSED))) {
-		memcpy(mem, cmem, PAGE_SIZE);
-		kunmap_atomic(cmem, KM_USER0);
-		return 0;
-	}
+	user_mem = kmap_atomic(page);
+	if (!is_partial_io(bvec))
+		uncmem = user_mem;
 
-	ret = lzo1x_decompress_safe(cmem + sizeof(*zheader),
-				    xv_get_object_size(cmem) - sizeof(*zheader),
-				    mem, &clen);
-	kunmap_atomic(cmem, KM_USER0);
+	if (!uncmem) {
+		pr_info("Unable to allocate temp memory\n");
+		ret = -ENOMEM;
+		goto out_cleanup;
+	}
 
+	ret = zram_decompress_page(zram, uncmem, index);
 	/* Should NEVER happen. Return bio error if it does. */
-	if (unlikely(ret != LZO_E_OK)) {
-		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
-		zram_stat64_inc(zram, &zram->stats.failed_reads);
-		return ret;
-	}
+	if (unlikely(ret != LZO_E_OK))
+		goto out_cleanup;
 
-	return 0;
+	if (is_partial_io(bvec))
+		memcpy(user_mem + bvec->bv_offset, uncmem + offset,
+				bvec->bv_len);
+
+	flush_dcache_page(page);
+	ret = 0;
+out_cleanup:
+	kunmap_atomic(user_mem);
+	if (is_partial_io(bvec))
+		kfree(uncmem);
+	return ret;
 }
 
 static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index,
 			   int offset)
 {
-	int ret;
-	u32 store_offset;
+	int ret = 0;
 	size_t clen;
-	struct zobj_header *zheader;
-	struct page *page, *page_store;
+	unsigned long handle;
+	struct page *page;
 	unsigned char *user_mem, *cmem, *src, *uncmem = NULL;
+	struct zram_meta *meta = zram->meta;
 
 	page = bvec->bv_page;
-	src = zram->compress_buffer;
+	src = meta->compress_buffer;
 
 	if (is_partial_io(bvec)) {
 		/*
@@ -340,122 +410,123 @@
 		 */
 		uncmem = kmalloc(PAGE_SIZE, GFP_NOIO);
 		if (!uncmem) {
-			pr_info("Error allocating temp memory!\n");
 			ret = -ENOMEM;
 			goto out;
 		}
-		ret = zram_read_before_write(zram, uncmem, index);
-		if (ret) {
-			kfree(uncmem);
+		ret = zram_decompress_page(zram, uncmem, index);
+		if (ret)
 			goto out;
-		}
 	}
 
-	/*
-	 * System overwrites unused sectors. Free memory associated
-	 * with this sector now.
-	 */
-	if (zram->table[index].page ||
-	    zram_test_flag(zram, index, ZRAM_ZERO))
-		zram_free_page(zram, index);
-
-	user_mem = kmap_atomic(page, KM_USER0);
+	user_mem = kmap_atomic(page);
 
-	if (is_partial_io(bvec))
+	if (is_partial_io(bvec)) {
 		memcpy(uncmem + offset, user_mem + bvec->bv_offset,
 		       bvec->bv_len);
-	else
+		kunmap_atomic(user_mem);
+		user_mem = NULL;
+	} else {
 		uncmem = user_mem;
+	}
 
 	if (page_zero_filled(uncmem)) {
-		kunmap_atomic(user_mem, KM_USER0);
-		if (is_partial_io(bvec))
-			kfree(uncmem);
-		zram_stat_inc(&zram->stats.pages_zero);
-		zram_set_flag(zram, index, ZRAM_ZERO);
+		kunmap_atomic(user_mem);
+		/* Free memory associated with this sector now. */
+		zram_free_page(zram, index);
+
+		zram->stats.pages_zero++;
+		zram_set_flag(meta, index, ZRAM_ZERO);
 		ret = 0;
 		goto out;
 	}
 
+	/*
+	 * zram_slot_free_notify could miss free so that let's
+	 * double check.
+	 */
+	if (unlikely(meta->table[index].handle ||
+			zram_test_flag(meta, index, ZRAM_ZERO)))
+		zram_free_page(zram, index);
+
 	ret = lzo1x_1_compress(uncmem, PAGE_SIZE, src, &clen,
-			       zram->compress_workmem);
+			       meta->compress_workmem);
 
-	kunmap_atomic(user_mem, KM_USER0);
-	if (is_partial_io(bvec))
-			kfree(uncmem);
+	if (!is_partial_io(bvec)) {
+		kunmap_atomic(user_mem);
+		user_mem = NULL;
+		uncmem = NULL;
+	}
 
 	if (unlikely(ret != LZO_E_OK)) {
 		pr_err("Compression failed! err=%d\n", ret);
 		goto out;
 	}
 
-	/*
-	 * Page is incompressible. Store it as-is (uncompressed)
-	 * since we do not want to return too many disk write
-	 * errors which has side effect of hanging the system.
-	 */
 	if (unlikely(clen > max_zpage_size)) {
+		zram->stats.bad_compress++;
 		clen = PAGE_SIZE;
-		page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
-		if (unlikely(!page_store)) {
-			pr_info("Error allocating memory for "
-				"incompressible page: %u\n", index);
-			ret = -ENOMEM;
-			goto out;
-		}
-
-		store_offset = 0;
-		zram_set_flag(zram, index, ZRAM_UNCOMPRESSED);
-		zram_stat_inc(&zram->stats.pages_expand);
-		zram->table[index].page = page_store;
-		src = kmap_atomic(page, KM_USER0);
-		goto memstore;
+		src = NULL;
+		if (is_partial_io(bvec))
+			src = uncmem;
 	}
 
-	if (xv_malloc(zram->mem_pool, clen + sizeof(*zheader),
-		      &zram->table[index].page, &store_offset,
-		      GFP_NOIO | __GFP_HIGHMEM)) {
-		pr_info("Error allocating memory for compressed "
-			"page: %u, size=%zu\n", index, clen);
+	handle = zs_malloc(meta->mem_pool, clen);
+	if (!handle) {
+		pr_info("Error allocating memory for compressed page: %u, size=%zu\n",
+			index, clen);
 		ret = -ENOMEM;
 		goto out;
 	}
+	cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_WO);
 
-memstore:
-	zram->table[index].offset = store_offset;
+	if ((clen == PAGE_SIZE) && !is_partial_io(bvec)) {
+		src = kmap_atomic(page);
+		copy_page(cmem, src);
+		kunmap_atomic(src);
+	} else {
+		memcpy(cmem, src, clen);
+	}
 
-	cmem = kmap_atomic(zram->table[index].page, KM_USER1) +
-		zram->table[index].offset;
+	zs_unmap_object(meta->mem_pool, handle);
 
-#if 0
-	/* Back-reference needed for memory defragmentation */
-	if (!zram_test_flag(zram, index, ZRAM_UNCOMPRESSED)) {
-		zheader = (struct zobj_header *)cmem;
-		zheader->table_idx = index;
-		cmem += sizeof(*zheader);
-	}
-#endif
-
-	memcpy(cmem, src, clen);
-
-	kunmap_atomic(cmem, KM_USER1);
-	if (unlikely(zram_test_flag(zram, index, ZRAM_UNCOMPRESSED)))
-		kunmap_atomic(src, KM_USER0);
+	/*
+	 * Free memory associated with this sector
+	 * before overwriting unused sectors.
+	 */
+	zram_free_page(zram, index);
+
+	meta->table[index].handle = handle;
+	meta->table[index].size = clen;
 
 	/* Update stats */
-	zram_stat64_add(zram, &zram->stats.compr_size, clen);
-	zram_stat_inc(&zram->stats.pages_stored);
+	atomic64_add(clen, &zram->stats.compr_size);
+	zram->stats.pages_stored++;
 	if (clen <= PAGE_SIZE / 2)
-		zram_stat_inc(&zram->stats.good_compress);
-
-	return 0;
+		zram->stats.good_compress++;
 
 out:
+	if (is_partial_io(bvec))
+		kfree(uncmem);
+
 	if (ret)
-		zram_stat64_inc(zram, &zram->stats.failed_writes);
+		atomic64_inc(&zram->stats.failed_writes);
 	return ret;
 }
 
+static void handle_pending_slot_free(struct zram *zram)
+{
+	struct zram_slot_free *free_rq;
+
+	spin_lock(&zram->slot_free_lock);
+	while (zram->slot_free_rq) {
+		free_rq = zram->slot_free_rq;
+		zram->slot_free_rq = free_rq->next;
+		zram_free_page(zram, free_rq->index);
+		kfree(free_rq);
+	}
+	spin_unlock(&zram->slot_free_lock);
+}
+
 static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
 			int offset, struct bio *bio, int rw)
 {
@@ -463,10 +534,12 @@
 
 	if (rw == READ) {
 		down_read(&zram->lock);
+		handle_pending_slot_free(zram);
 		ret = zram_bvec_read(zram, bvec, index, offset, bio);
 		up_read(&zram->lock);
 	} else {
 		down_write(&zram->lock);
+		handle_pending_slot_free(zram);
 		ret = zram_bvec_write(zram, bvec, index, offset);
 		up_write(&zram->lock);
 	}
@@ -474,11 +547,135 @@
 	return ret;
 }
 
-static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
+static void zram_reset_device(struct zram *zram, bool reset_capacity)
 {
-	if (*offset + bvec->bv_len >= PAGE_SIZE)
-		(*index)++;
-	*offset = (*offset + bvec->bv_len) % PAGE_SIZE;
+	size_t index;
+	struct zram_meta *meta;
+
+	flush_work(&zram->free_work);
+
+	down_write(&zram->init_lock);
+	if (!zram->init_done) {
+		up_write(&zram->init_lock);
+		return;
+	}
+
+	meta = zram->meta;
+	zram->init_done = 0;
+
+	/* Free all pages that are still in this zram device */
+	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
+		unsigned long handle = meta->table[index].handle;
+		if (!handle)
+			continue;
+
+		zs_free(meta->mem_pool, handle);
+	}
+
+	zram_meta_free(zram->meta);
+	zram->meta = NULL;
+	/* Reset stats */
+	memset(&zram->stats, 0, sizeof(zram->stats));
+
+	zram->disksize = 0;
+	if (reset_capacity)
+		set_capacity(zram->disk, 0);
+	up_write(&zram->init_lock);
+}
+
+static void zram_init_device(struct zram *zram, struct zram_meta *meta)
+{
+	if (zram->disksize > 2 * (totalram_pages << PAGE_SHIFT)) {
+		pr_info(
+		"There is little point creating a zram of greater than "
+		"twice the size of memory since we expect a 2:1 compression "
+		"ratio. Note that zram uses about 0.1%% of the size of "
+		"the disk when not in use so a huge zram is "
+		"wasteful.\n"
+		"\tMemory Size: %lu kB\n"
+		"\tSize you selected: %llu kB\n"
+		"Continuing anyway ...\n",
+		(totalram_pages << PAGE_SHIFT) >> 10, zram->disksize >> 10
+		);
+	}
+
+	/* zram devices sort of resembles non-rotational disks */
+	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
+
+	zram->meta = meta;
+	zram->init_done = 1;
+
+	pr_debug("Initialization done!\n");
+}
+
+static ssize_t disksize_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	u64 disksize;
+	struct zram_meta *meta;
+	struct zram *zram = dev_to_zram(dev);
+
+	disksize = memparse(buf, NULL);
+	if (!disksize)
+		return -EINVAL;
+
+	disksize = PAGE_ALIGN(disksize);
+	meta = zram_meta_alloc(disksize);
+	down_write(&zram->init_lock);
+	if (zram->init_done) {
+		up_write(&zram->init_lock);
+		zram_meta_free(meta);
+		pr_info("Cannot change disksize for initialized device\n");
+		return -EBUSY;
+	}
+
+	zram->disksize = disksize;
+	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
+	zram_init_device(zram, meta);
+	up_write(&zram->init_lock);
+
+	return len;
+}
+
+static ssize_t reset_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	int ret;
+	unsigned short do_reset;
+	struct zram *zram;
+	struct block_device *bdev;
+
+	zram = dev_to_zram(dev);
+	bdev = bdget_disk(zram->disk, 0);
+
+	if (!bdev)
+		return -ENOMEM;
+
+	/* Do not reset an active device! */
+	if (bdev->bd_holders) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ret = kstrtou16(buf, 10, &do_reset);
+	if (ret)
+		goto out;
+
+	if (!do_reset) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* Make sure all pending I/O is finished */
+	fsync_bdev(bdev);
+	bdput(bdev);
+
+	zram_reset_device(zram, true);
+	return len;
+
+out:
+	bdput(bdev);
+	return ret;
 }
 
 static void __zram_make_request(struct zram *zram, struct bio *bio, int rw)
@@ -489,10 +686,10 @@
 
 	switch (rw) {
 	case READ:
-		zram_stat64_inc(zram, &zram->stats.num_reads);
+		atomic64_inc(&zram->stats.num_reads);
 		break;
 	case WRITE:
-		zram_stat64_inc(zram, &zram->stats.num_writes);
+		atomic64_inc(&zram->stats.num_writes);
 		break;
 	}
 
@@ -537,39 +734,19 @@
 }
 
 /*
- * Check if request is within bounds and aligned on zram logical blocks.
- */
-static inline int valid_io_request(struct zram *zram, struct bio *bio)
-{
-	if (unlikely(
-		(bio->bi_sector >= (zram->disksize >> SECTOR_SHIFT)) ||
-		(bio->bi_sector & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)) ||
-		(bio->bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))) {
-
-		return 0;
-	}
-
-	/* I/O request is valid */
-	return 1;
-}
-
-/*
  * Handler function for all zram I/O requests.
  */
 static void zram_make_request(struct request_queue *queue, struct bio *bio)
 {
 	struct zram *zram = queue->queuedata;
 
-	if (unlikely(!zram->init_done) && zram_init_device(zram))
-		goto error;
-
 	down_read(&zram->init_lock);
 	if (unlikely(!zram->init_done))
-		goto error_unlock;
+		goto error;
 
 	if (!valid_io_request(zram, bio)) {
-		zram_stat64_inc(zram, &zram->stats.invalid_io);
-		goto error_unlock;
+		atomic64_inc(&zram->stats.invalid_io);
+		goto error;
 	}
 
 	__zram_make_request(zram, bio, bio_data_dir(bio));
@@ -577,133 +754,45 @@
 
 	return;
 
-error_unlock:
-	up_read(&zram->init_lock);
 error:
+	up_read(&zram->init_lock);
 	bio_io_error(bio);
 }
 
-void __zram_reset_device(struct zram *zram)
+static void zram_slot_free(struct work_struct *work)
 {
-	size_t index;
-
-	zram->init_done = 0;
-
-	/* Free various per-device buffers */
-	kfree(zram->compress_workmem);
-	free_pages((unsigned long)zram->compress_buffer, 1);
-
-	zram->compress_workmem = NULL;
-	zram->compress_buffer = NULL;
-
-	/* Free all pages that are still in this zram device */
-	for (index = 0; index < zram->disksize >> PAGE_SHIFT; index++) {
-		struct page *page;
-		u16 offset;
-
-		page = zram->table[index].page;
-		offset = zram->table[index].offset;
-
-		if (!page)
-			continue;
-
-		if (unlikely(zram_test_flag(zram, index, ZRAM_UNCOMPRESSED)))
-			__free_page(page);
-		else
-			xv_free(zram->mem_pool, page, offset);
-	}
-
-	vfree(zram->table);
-	zram->table = NULL;
-
-	xv_destroy_pool(zram->mem_pool);
-	zram->mem_pool = NULL;
-
-	/* Reset stats */
-	memset(&zram->stats, 0, sizeof(zram->stats));
-
-	zram->disksize = 0;
-}
+	struct zram *zram;
 
-void zram_reset_device(struct zram *zram)
-{
-	down_write(&zram->init_lock);
-	__zram_reset_device(zram);
-	up_write(&zram->init_lock);
+	zram = container_of(work, struct zram, free_work);
+	down_write(&zram->lock);
+	handle_pending_slot_free(zram);
+	up_write(&zram->lock);
 }
 
-int zram_init_device(struct zram *zram)
-{
-	int ret;
-	size_t num_pages;
-
-	down_write(&zram->init_lock);
-
-	if (zram->init_done) {
-		up_write(&zram->init_lock);
-		return 0;
-	}
-
-	zram_set_disksize(zram, totalram_pages << PAGE_SHIFT);
-
-	zram->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
-	if (!zram->compress_workmem) {
-		pr_err("Error allocating compressor working memory!\n");
-		ret = -ENOMEM;
-		goto fail_no_table;
-	}
-
-	zram->compress_buffer = (void *)__get_free_pages(__GFP_ZERO, 1);
-	if (!zram->compress_buffer) {
-		pr_err("Error allocating compressor buffer space\n");
-		ret = -ENOMEM;
-		goto fail_no_table;
-	}
-
-	num_pages = zram->disksize >> PAGE_SHIFT;
-	zram->table = vzalloc(num_pages * sizeof(*zram->table));
-	if (!zram->table) {
-		pr_err("Error allocating zram address table\n");
-		ret = -ENOMEM;
-		goto fail_no_table;
-	}
-
-	set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT);
-
-	/* zram devices sort of resembles non-rotational disks */
-	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue);
-
-	zram->mem_pool = xv_create_pool();
-	if (!zram->mem_pool) {
-		pr_err("Error creating memory pool\n");
-		ret = -ENOMEM;
-		goto fail;
-	}
-
-	zram->init_done = 1;
-	up_write(&zram->init_lock);
-
-	pr_debug("Initialization done!\n");
-	return 0;
-
-fail_no_table:
-	/* To prevent accessing table entries during cleanup */
-	zram->disksize = 0;
-fail:
-	__zram_reset_device(zram);
-	up_write(&zram->init_lock);
-	pr_err("Initialization failed: err=%d\n", ret);
-	return ret;
+static void add_slot_free(struct zram *zram, struct zram_slot_free *free_rq)
+{
+	spin_lock(&zram->slot_free_lock);
+	free_rq->next = zram->slot_free_rq;
+	zram->slot_free_rq = free_rq;
+	spin_unlock(&zram->slot_free_lock);
 }
 
 static void zram_slot_free_notify(struct block_device *bdev,
 				unsigned long index)
 {
 	struct zram *zram;
+	struct zram_slot_free *free_rq;
 
 	zram = bdev->bd_disk->private_data;
-	zram_free_page(zram, index);
-	zram_stat64_inc(zram, &zram->stats.notify_free);
+	atomic64_inc(&zram->stats.notify_free);
+
+	free_rq = kmalloc(sizeof(struct zram_slot_free), GFP_ATOMIC);
+	if (!free_rq)
+		return;
+
+	free_rq->index = index;
+	add_slot_free(zram, free_rq);
+	schedule_work(&zram->free_work);
 }
 
 static const struct block_device_operations zram_devops = {
@@ -711,19 +800,53 @@
 	.owner = THIS_MODULE
 };
 
+static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR,
+		disksize_show, disksize_store);
+static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL);
+static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store);
+static DEVICE_ATTR(num_reads, S_IRUGO, num_reads_show, NULL);
+static DEVICE_ATTR(num_writes, S_IRUGO, num_writes_show, NULL);
+static DEVICE_ATTR(invalid_io, S_IRUGO, invalid_io_show, NULL);
+static DEVICE_ATTR(notify_free, S_IRUGO, notify_free_show, NULL);
+static DEVICE_ATTR(zero_pages, S_IRUGO, zero_pages_show, NULL);
+static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL);
+static DEVICE_ATTR(compr_data_size, S_IRUGO, compr_data_size_show, NULL);
+static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL);
+
+static struct attribute *zram_disk_attrs[] = {
+	&dev_attr_disksize.attr,
+	&dev_attr_initstate.attr,
+	&dev_attr_reset.attr,
+	&dev_attr_num_reads.attr,
+	&dev_attr_num_writes.attr,
+	&dev_attr_invalid_io.attr,
+	&dev_attr_notify_free.attr,
+	&dev_attr_zero_pages.attr,
+	&dev_attr_orig_data_size.attr,
+	&dev_attr_compr_data_size.attr,
+	&dev_attr_mem_used_total.attr,
+	NULL,
+};
+
+static struct attribute_group zram_disk_attr_group = {
+	.attrs = zram_disk_attrs,
+};
+
 static int create_device(struct zram *zram, int device_id)
 {
-	int ret = 0;
+	int ret = -ENOMEM;
 
 	init_rwsem(&zram->lock);
 	init_rwsem(&zram->init_lock);
-	spin_lock_init(&zram->stat64_lock);
+
+	INIT_WORK(&zram->free_work, zram_slot_free);
+	spin_lock_init(&zram->slot_free_lock);
+	zram->slot_free_rq = NULL;
 
 	zram->queue = blk_alloc_queue(GFP_KERNEL);
 	if (!zram->queue) {
 		pr_err("Error allocating disk queue for device %d\n",
 			device_id);
-		ret = -ENOMEM;
 		goto out;
 	}
 
@@ -733,11 +856,9 @@
 	 /* gendisk structure */
 	zram->disk = alloc_disk(1);
 	if (!zram->disk) {
-		blk_cleanup_queue(zram->queue);
-		pr_warning("Error allocating disk structure for device %d\n",
+		pr_warn("Error allocating disk structure for device %d\n",
 			device_id);
-		ret = -ENOMEM;
-		goto out;
+		goto out_free_queue;
 	}
 
 	zram->disk->major = zram_major;
@@ -765,12 +886,18 @@
 	ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj,
 				&zram_disk_attr_group);
 	if (ret < 0) {
-		pr_warning("Error creating sysfs group");
-		goto out;
+		pr_warn("Error creating sysfs group");
+		goto out_free_disk;
 	}
 
 	zram->init_done = 0;
+	return 0;
 
+out_free_disk:
+	del_gendisk(zram->disk);
+	put_disk(zram->disk);
+out_free_queue:
+	blk_cleanup_queue(zram->queue);
 out:
 	return ret;
 }
@@ -780,52 +907,45 @@
 	sysfs_remove_group(&disk_to_dev(zram->disk)->kobj,
 			&zram_disk_attr_group);
 
-	if (zram->disk) {
-		del_gendisk(zram->disk);
-		put_disk(zram->disk);
-	}
+	del_gendisk(zram->disk);
+	put_disk(zram->disk);
 
-	if (zram->queue)
-		blk_cleanup_queue(zram->queue);
+	blk_cleanup_queue(zram->queue);
 }
 
 static int __init zram_init(void)
 {
 	int ret, dev_id;
 
-	if (zram_num_devices > max_num_devices) {
-		pr_warning("Invalid value for num_devices: %u\n",
-				zram_num_devices);
+	if (num_devices > max_num_devices) {
+		pr_warn("Invalid value for num_devices: %u\n",
+				num_devices);
 		ret = -EINVAL;
 		goto out;
 	}
 
 	zram_major = register_blkdev(0, "zram");
 	if (zram_major <= 0) {
-		pr_warning("Unable to get major number\n");
+		pr_warn("Unable to get major number\n");
 		ret = -EBUSY;
 		goto out;
 	}
 
-	if (!zram_num_devices) {
-		pr_info("num_devices not specified. Using default: 1\n");
-		zram_num_devices = 1;
-	}
-
 	/* Allocate the device array and initialize each one */
-	pr_info("Creating %u devices ...\n", zram_num_devices);
-	zram_devices = kzalloc(zram_num_devices * sizeof(struct zram), GFP_KERNEL);
+	zram_devices = kzalloc(num_devices * sizeof(struct zram), GFP_KERNEL);
 	if (!zram_devices) {
 		ret = -ENOMEM;
 		goto unregister;
 	}
 
-	for (dev_id = 0; dev_id < zram_num_devices; dev_id++) {
+	for (dev_id = 0; dev_id < num_devices; dev_id++) {
 		ret = create_device(&zram_devices[dev_id], dev_id);
 		if (ret)
 			goto free_devices;
 	}
 
+	pr_info("Created %u device(s) ...\n", num_devices);
+
 	return 0;
 
 free_devices:
@@ -843,12 +963,15 @@
 	int i;
 	struct zram *zram;
 
-	for (i = 0; i < zram_num_devices; i++) {
+	for (i = 0; i < num_devices; i++) {
 		zram = &zram_devices[i];
 
 		destroy_device(zram);
-		if (zram->init_done)
-			zram_reset_device(zram);
+		/*
+		 * Shouldn't access zram->disk after destroy_device
+		 * because destroy_device already released zram->disk.
+		 */
+		zram_reset_device(zram, false);
 	}
 
 	unregister_blkdev(zram_major, "zram");
@@ -857,12 +980,12 @@
 	pr_debug("Cleanup done!\n");
 }
 
-module_param(zram_num_devices, uint, 0);
-MODULE_PARM_DESC(zram_num_devices, "Number of zram devices");
-
 module_init(zram_init);
 module_exit(zram_exit);
 
+module_param(num_devices, uint, 0);
+MODULE_PARM_DESC(num_devices, "Number of zram devices");
+
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
 MODULE_DESCRIPTION("Compressed RAM Block Device");
diff -ur a/drivers/staging/zram/zram_drv.h b/drivers/staging/zram/zram_drv.h
--- a/drivers/staging/zram/zram_drv.h	2013-08-24 11:37:13.000000000 +0200
+++ b/drivers/staging/zram/zram_drv.h	2014-02-17 11:57:51.000000000 +0100
@@ -18,7 +18,7 @@
 #include <linux/spinlock.h>
 #include <linux/mutex.h>
 
-#include "xvmalloc.h"
+#include "../zsmalloc/zsmalloc.h"
 
 /*
  * Some arbitrary value. This is just to catch
@@ -26,23 +26,8 @@
  */
 static const unsigned max_num_devices = 32;
 
-/*
- * Stored at beginning of each compressed object.
- *
- * It stores back-reference to table entry which points to this
- * object. This is required to support memory defragmentation.
- */
-struct zobj_header {
-#if 0
-	u32 table_idx;
-#endif
-};
-
 /*-- Configurable parameters */
 
-/* Default zram disk size: 25% of total RAM */
-static const unsigned default_disksize_perc_ram = 25;
-
 /*
  * Pages that compress to size greater than this are stored
  * uncompressed in memory.
@@ -51,8 +36,8 @@
 
 /*
  * NOTE: max_zpage_size must be less than or equal to:
- *   XV_MAX_ALLOC_SIZE - sizeof(struct zobj_header)
- * otherwise, xv_malloc() would always return failure.
+ *   ZS_MAX_ALLOC_SIZE. Otherwise, zs_malloc() would
+ * always return failure.
  */
 
 /*-- End of configurable params */
@@ -68,9 +53,6 @@
 
 /* Flags for zram pages (table[page_no].flags) */
 enum zram_pageflags {
-	/* Page is stored uncompressed */
-	ZRAM_UNCOMPRESSED,
-
 	/* Page consists entirely of zeros */
 	ZRAM_ZERO,
 
@@ -81,34 +63,51 @@
 
 /* Allocated for each disk page */
 struct table {
-	struct page *page;
-	u16 offset;
+	unsigned long handle;
+	u16 size;	/* object size (excluding header) */
 	u8 count;	/* object ref count (not yet used) */
 	u8 flags;
-} __attribute__((aligned(4)));
+} __aligned(4);
 
+/*
+ * All 64bit fields should only be manipulated by 64bit atomic accessors.
+ * All modifications to 32bit counter should be protected by zram->lock.
+ */
 struct zram_stats {
-	u64 compr_size;		/* compressed size of pages stored */
-	u64 num_reads;		/* failed + successful */
-	u64 num_writes;		/* --do-- */
-	u64 failed_reads;	/* should NEVER! happen */
-	u64 failed_writes;	/* can happen when memory is too low */
-	u64 invalid_io;		/* non-page-aligned I/O requests */
-	u64 notify_free;	/* no. of swap slot free notifications */
+	atomic64_t compr_size;	/* compressed size of pages stored */
+	atomic64_t num_reads;	/* failed + successful */
+	atomic64_t num_writes;	/* --do-- */
+	atomic64_t failed_reads;	/* should NEVER! happen */
+	atomic64_t failed_writes;	/* can happen when memory is too low */
+	atomic64_t invalid_io;	/* non-page-aligned I/O requests */
+	atomic64_t notify_free;	/* no. of swap slot free notifications */
 	u32 pages_zero;		/* no. of zero filled pages */
 	u32 pages_stored;	/* no. of pages currently stored */
 	u32 good_compress;	/* % of pages with compression ratio<=50% */
-	u32 pages_expand;	/* % of incompressible pages */
+	u32 bad_compress;	/* % of pages with compression ratio>=75% */
 };
 
-struct zram {
-	struct xv_pool *mem_pool;
+struct zram_meta {
 	void *compress_workmem;
 	void *compress_buffer;
 	struct table *table;
-	spinlock_t stat64_lock;	/* protect 64-bit stats */
-	struct rw_semaphore lock; /* protect compression buffers and table
-				   * against concurrent read and writes */
+	struct zs_pool *mem_pool;
+};
+
+struct zram_slot_free {
+	unsigned long index;
+	struct zram_slot_free *next;
+};
+
+struct zram {
+	struct zram_meta *meta;
+	struct rw_semaphore lock; /* protect compression buffers, table,
+				   * 32bit stat counters against concurrent
+				   * notifications, reads and writes */
+
+	struct work_struct free_work;  /* handle pending free request */
+	struct zram_slot_free *slot_free_rq; /* list head of free request */
+
 	struct request_queue *queue;
 	struct gendisk *disk;
 	int init_done;
@@ -119,17 +118,8 @@
 	 * we can store in a disk.
 	 */
 	u64 disksize;	/* bytes */
+	spinlock_t slot_free_lock;
 
 	struct zram_stats stats;
 };
-
-extern struct zram *zram_devices;
-extern unsigned int zram_num_devices;
-#ifdef CONFIG_SYSFS
-extern struct attribute_group zram_disk_attr_group;
-#endif
-
-extern int zram_init_device(struct zram *zram);
-extern void __zram_reset_device(struct zram *zram);
-
 #endif
Only in a/drivers/staging/zram: zram_sysfs.c.
diff -ur a/drivers/staging/zram/zram.txt b/drivers/staging/zram/zram.txt
--- a/drivers/staging/zram/zram.txt	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/staging/zram/zram.txt	2014-01-21 09:37:20.000000000 +0100
@@ -23,17 +23,17 @@
 	This creates 4 devices: /dev/zram{0,1,2,3}
 	(num_devices parameter is optional. Default: 1)
 
-2) Set Disksize (Optional):
-	Set disk size by writing the value to sysfs node 'disksize'
-	(in bytes). If disksize is not given, default value of 25%
-	of RAM is used.
-
-	# Initialize /dev/zram0 with 50MB disksize
-	echo $((50*1024*1024)) > /sys/block/zram0/disksize
-
-	NOTE: disksize cannot be changed if the disk contains any
-	data. So, for such a disk, you need to issue 'reset' (see below)
-	before you can change its disksize.
+2) Set Disksize
+        Set disk size by writing the value to sysfs node 'disksize'.
+        The value can be either in bytes or you can use mem suffixes.
+        Examples:
+            # Initialize /dev/zram0 with 50MB disksize
+            echo $((50*1024*1024)) > /sys/block/zram0/disksize
+
+            # Using mem suffixes
+            echo 256K > /sys/block/zram0/disksize
+            echo 512M > /sys/block/zram0/disksize
+            echo 1G > /sys/block/zram0/disksize
 
 3) Activate:
 	mkswap /dev/zram0
@@ -65,8 +65,9 @@
 	echo 1 > /sys/block/zram0/reset
 	echo 1 > /sys/block/zram1/reset
 
-	(This frees all the memory allocated for the given device).
-
+	This frees all the memory allocated for the given device and
+	resets the disksize to zero. You must set the disksize again
+	before reusing the device.
 
 Please report any problems at:
  - Mailing list: linux-mm-cc at laptop dot org
Only in b/drivers/staging: zsmalloc.
diff -ur a/drivers/tty/Kconfig b/drivers/tty/Kconfig
--- a/drivers/tty/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/tty/Kconfig	2014-01-21 09:37:20.000000000 +0100
@@ -1,6 +1,6 @@
 config VT
 	bool "Virtual terminal" if EXPERT
-	depends on !S390 && !UML
+	depends on !S390 && !UML && !ARCH_COMCERTO
 	select INPUT
 	default y
 	---help---
diff -ur a/drivers/tty/serial/8250.c b/drivers/tty/serial/8250.c
--- a/drivers/tty/serial/8250.c	2013-08-24 11:36:53.000000000 +0200
+++ b/drivers/tty/serial/8250.c	2014-02-17 11:57:17.000000000 +0100
@@ -51,6 +51,12 @@
 #include "suncore.h"
 #endif
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+#include <linux/clk.h>
+#include <mach/reset.h>
+static struct clk *uart_clk;	/* UART clock (DUS) depends upon the AXI */
+#endif
+
 /*
  * Configuration:
  *   share_irqs - whether we pass IRQF_SHARED to request_irq().  This option
@@ -469,7 +475,7 @@
 	__raw_writel(value, p->membase + offset);
 }
 
-#if defined(CONFIG_SYNO_ARMADA_ARCH)
+#if defined(CONFIG_SYNO_ARMADA_ARCH) || defined(CONFIG_SYNO_C2K_SERIAL_FIX)
 /* Save the LCR value so it can be re-written when a Busy Detect IRQ occurs. */
 static inline void dwapb_save_out_value(struct uart_port *p, int offset,
                                        int value)
@@ -492,7 +498,7 @@
 {
        int save_offset = offset;
        offset = map_8250_out_reg(p, offset) << p->regshift;
-#ifdef CONFIG_PLAT_ARMADA
+#if defined(CONFIG_PLAT_ARMADA) || defined(CONFIG_SYNO_C2K_SERIAL_FIX)
        /* If we are accessing DLH (0x4), DLL (0x0), LCR(0xC) or 0x1C
        ** we need to make sure that the busy bit is cleared in USR register.
        */
@@ -543,7 +549,7 @@
 		break;
 
 	case UPIO_RM9000:
-#if defined(CONFIG_SYNO_ARMADA_ARCH)
+#if defined(CONFIG_SYNO_ARMADA_ARCH) || defined(CONFIG_SYNO_C2K_SERIAL_FIX)
 		p->serial_in = mem_serial_in;
 		p->serial_out = dwapb_serial_out;
 		break;
@@ -1114,7 +1120,7 @@
 			 */
 			DEBUG_AUTOCONF("Xscale ");
 			up->port.type = PORT_XSCALE;
-#if defined(CONFIG_SYNO_ARMADA_ARCH)
+#if defined(CONFIG_SYNO_ARMADA_ARCH) || defined(CONFIG_SYNO_C2K_SERIAL_FIX)
 			up->capabilities |= UART_CAP_UUE;
 #else
 			up->capabilities |= UART_CAP_UUE | UART_CAP_RTOIE;
@@ -1817,15 +1823,15 @@
 	do {
 		struct uart_8250_port *up;
 		struct uart_port *port;
-#if defined(CONFIG_SYNO_ARMADA_ARCH)
+#if defined(CONFIG_SYNO_ARMADA_ARCH) || defined(CONFIG_SYNO_C2K_SERIAL_FIX)
 		unsigned int iir;
 #endif
 
 		up = list_entry(l, struct uart_8250_port, list);
 		port = &up->port;
 
-#if defined(CONFIG_SYNO_ARMADA_ARCH)
-#if defined(CONFIG_ARCH_ARMADA370) || defined(CONFIG_ARCH_ARMADA_XP)
+#if defined(CONFIG_SYNO_ARMADA_ARCH) || defined(CONFIG_SYNO_C2K_SERIAL_FIX)
+#if defined(CONFIG_ARCH_ARMADA370) || defined(CONFIG_ARCH_ARMADA_XP) || defined(CONFIG_SYNO_C2K_SERIAL_FIX)
 
 		iir = serial_in(up, UART_IIR);
 		if (!(iir & UART_IIR_NO_INT)) {
@@ -3297,6 +3303,31 @@
 	struct uart_port port;
 	int ret, i, irqflag = 0;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+	unsigned long uart_rate;
+
+	/* Take the Fast-UART device out of reset */
+	c2000_block_reset(COMPONENT_AXI_FAST_UART, 0);
+
+	/* Get the FAST-UART clk structure from DUS */
+	uart_clk = clk_get(NULL, "DUS");
+
+	if (IS_ERR(uart_clk)) {
+		pr_err("%s: Unable to get UART clock: %ld\n", __func__, PTR_ERR(uart_clk));
+		return PTR_ERR(uart_clk);
+	}
+
+	/* Enable the FAST-UART clock */
+	ret = clk_enable(uart_clk);
+	if (ret) {
+		pr_err("%s: Failed to enable UART clock\n", __func__);
+		return ret;
+	}
+	
+	/* Get the UART Clock in Hz */
+	uart_rate = clk_get_rate(uart_clk);
+#endif
+
 	memset(&port, 0, sizeof(struct uart_port));
 
 	if (share_irqs)
@@ -3307,7 +3338,11 @@
 		port.membase		= p->membase;
 		port.irq		= p->irq;
 		port.irqflags		= p->irqflags;
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+		port.uartclk		= uart_rate;    /* Assigning the rate value to the ports */
+#else	
 		port.uartclk		= p->uartclk;
+#endif
 		port.regshift		= p->regshift;
 		port.iotype		= p->iotype;
 		port.flags		= p->flags;
@@ -3346,9 +3381,19 @@
 		if (up->port.dev == &dev->dev)
 			serial8250_unregister_port(i);
 	}
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+	/* Disable the Fast-UART clock here */
+	clk_disable(uart_clk);
+	clk_put(uart_clk);
+
+	/* Put the Fast-UART device in reset */
+	c2000_block_reset(COMPONENT_AXI_FAST_UART, 1);
+#endif
+
 	return 0;
 }
 
+#if !defined(CONFIG_SYNO_COMCERTO) || defined(CONFIG_PM)
 static int serial8250_suspend(struct platform_device *dev, pm_message_t state)
 {
 	int i;
@@ -3360,6 +3405,16 @@
 			uart_suspend_port(&serial8250_reg, &up->port);
 	}
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+	/* Gate the FAST-UART clock here; make sure no other devices are
+	 * still using the DUS clock before it is shut down.
+	 * The clock above is derived from DUS, hence it will not actually
+	 * be gated until DMA/FAST-SPI also disable the DUS clock and bring
+	 * its use count to 0.
+	 */
+	clk_disable(uart_clk);
+#endif
+
 	return 0;
 }
 
@@ -3367,6 +3422,20 @@
 {
 	int i;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+	/* Enable the FAST-UART clock here before resuming any
+	 * operations.
+	 */
+	if (clk_enable(uart_clk)) {
+		pr_err("%s: Unable to enable FAST-UART clock\n", __func__);
+		/* The FAST-UART clock could not be enabled because the
+		 * earlier clk_disable did not shut it down (its use count
+		 * is not zero due to the dependency on DMA and FAST-SPI),
+		 * so just resume the ports anyway.
+		 */
+	}
+#endif
+
 	for (i = 0; i < UART_NR; i++) {
 		struct uart_8250_port *up = &serial8250_ports[i];
 
@@ -3376,12 +3445,15 @@
 
 	return 0;
 }
+#endif
 
 static struct platform_driver serial8250_isa_driver = {
 	.probe		= serial8250_probe,
 	.remove		= __devexit_p(serial8250_remove),
+#if !defined(CONFIG_SYNO_COMCERTO) || defined(CONFIG_PM)
 	.suspend	= serial8250_suspend,
 	.resume		= serial8250_resume,
+#endif
 	.driver		= {
 		.name	= "serial8250",
 		.owner	= THIS_MODULE,
diff -ur a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c
--- a/drivers/tty/serial/serial_core.c	2013-08-24 11:36:53.000000000 +0200
+++ b/drivers/tty/serial/serial_core.c	2014-02-17 11:57:17.000000000 +0100
@@ -2061,8 +2061,8 @@
 	case UPIO_MEM32:
 	case UPIO_AU:
 	case UPIO_TSI:
-#if defined(CONFIG_SYNO_ARMADA_ARCH)
-#if defined(CONFIG_ARCH_ARMADA370) || defined(CONFIG_ARCH_ARMADA_XP)
+#if defined(CONFIG_SYNO_ARMADA_ARCH) || defined(CONFIG_SYNO_C2K_SERIAL_FIX)
+#if defined(CONFIG_ARCH_ARMADA370) || defined(CONFIG_ARCH_ARMADA_XP) || defined(CONFIG_SYNO_C2K_SERIAL_FIX)
 	case UPIO_DWAPB:
 #endif
 #endif
@@ -2479,8 +2479,8 @@
 	case UPIO_MEM32:
 	case UPIO_AU:
 	case UPIO_TSI:
-#if defined(CONFIG_SYNO_ARMADA_ARCH)
-#if defined(CONFIG_ARCH_ARMADA370) || defined(CONFIG_ARCH_ARMADA_XP)
+#if defined(CONFIG_SYNO_ARMADA_ARCH) || defined(CONFIG_SYNO_C2K_SERIAL_FIX)
+#if defined(CONFIG_ARCH_ARMADA370) || defined(CONFIG_ARCH_ARMADA_XP) || defined(CONFIG_SYNO_C2K_SERIAL_FIX)
 	case UPIO_DWAPB:
 #endif
 #endif
diff -ur a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c
--- a/drivers/usb/core/devices.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/core/devices.c	2014-02-17 11:57:11.000000000 +0100
@@ -546,7 +546,6 @@
 #endif
 
 #ifdef MY_ABC_HERE
-extern char gszSynoHWVersion[];
 int blIsCardReader(struct usb_device *usbdev)
 {
 	char buf[256];
@@ -572,8 +571,8 @@
 		}
 #endif
 #if defined(CONFIG_SYNO_ARMADA)
-		if (!strncmp(gszSynoHWVersion, HW_US3v10, strlen(HW_US3v10))) {
-			if (!strcmp(buf, "0000:00:01.0-1") || !strcmp(buf, "0000:00:01.0-4")) {
+		if (syno_is_hw_version(HW_US3v10)) {
+			if (!strcmp(buf, "0000:00:00.0-1")) {
 				return 1;
 			}
 		}
diff -ur a/drivers/usb/core/ethub.c b/drivers/usb/core/ethub.c
--- a/drivers/usb/core/ethub.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/core/ethub.c	2014-02-17 11:57:11.000000000 +0100
@@ -8,6 +8,7 @@
  *
  */
 
+#include <linux/version.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/module.h>
@@ -21,10 +22,10 @@
 #include <linux/usbdevice_fs.h>
 #include <linux/usb/hcd.h>
 #include <linux/usb/quirks.h>
+#include <linux/usb/storage.h>
 #include <linux/kthread.h>
 #include <linux/mutex.h>
 #include <linux/freezer.h>
-#include <linux/random.h>
 
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
@@ -65,6 +66,7 @@
 							device present */
 	unsigned long		wakeup_bits[1];	/* ports that have signaled
 							remote wakeup */
+	unsigned long		bot_mode_bits[1];
 #if USB_MAXCHILDREN > 31 /* 8*sizeof(unsigned long) - 1 */
 #error event_bits[] is too short!
 #endif
@@ -87,7 +89,7 @@
 
 static inline int hub_is_superspeed(struct usb_device *hdev)
 {
-	return (hdev->descriptor.bDeviceProtocol == 3);
+	return (hdev->descriptor.bDeviceProtocol == USB_HUB_PR_SS);
 }
 
 /* Protect struct usb_device->state and ->children members
@@ -172,6 +174,10 @@
  */
 static int clear_port_feature(struct usb_device *hdev, int port1, int feature)
 {
+struct usb_hub *hub = hdev_to_hub(hdev);
+dev_warn(hub->intfdev,
+	"%s - port %x feature %x\n", __func__, port1, feature);
+
 	return usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0),
 		USB_REQ_CLEAR_FEATURE, USB_RT_PORT, feature, port1,
 		NULL, 0, 1000);
@@ -182,6 +188,10 @@
  */
 static int set_port_feature(struct usb_device *hdev, int port1, int feature)
 {
+struct usb_hub *hub = hdev_to_hub(hdev);
+dev_warn(hub->intfdev,
+	"%s - port %x feature %x\n", __func__, port1, feature);
+
 	return usb_control_msg(hdev, usb_sndctrlpipe(hdev, 0),
 		USB_REQ_SET_FEATURE, USB_RT_PORT, feature, port1,
 		NULL, 0, 1000);
@@ -333,6 +343,8 @@
 		*change = le16_to_cpu(hub->status->port.wPortChange);
 
 		ret = 0;
+dev_warn(hub->intfdev,
+	"%s - port %x stauts %04x change %04x\n", __func__, port1, *status, *change);
 	}
 	mutex_unlock(&hub->status_mutex);
 	return ret;
@@ -370,8 +382,13 @@
  * device initiates resume, so the USB core will not receive notice of the
  * resume through the normal hub interrupt URB.
  */
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0))
 void usb_wakeup_notification(struct usb_device *hdev,
 		unsigned int portnum)
+#else
+void ethub_usb_wakeup_notification(struct usb_device *hdev,
+		unsigned int portnum)
+#endif
 {
 	struct usb_hub *hub;
 
@@ -384,7 +401,26 @@
 		kick_kethubd(hub);
 	}
 }
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0))
 EXPORT_SYMBOL_GPL(usb_wakeup_notification);
+#endif
+
+void usb_run_bot_mode_notification(struct usb_device *hdev,
+		unsigned int portnum)
+{
+	struct usb_hub *hub;
+
+	if (!hdev)
+		return;
+
+	hub = hdev_to_hub(hdev);
+	if (hub) {
+		set_bit(portnum, hub->bot_mode_bits);
+		set_bit(portnum, hub->change_bits);
+		kick_kethubd(hub);
+	}
+}
+EXPORT_SYMBOL_GPL(usb_run_bot_mode_notification);
 
 /* completion function, fires on port status changes and various faults */
 static void hub_irq(struct urb *urb)
@@ -489,7 +525,7 @@
 	int ret = 0;
 
 	if (hdev->children[port1-1] && set_state)
-		usb_set_device_state(hdev->children[port1-1],
+		ethub_usb_set_device_state(hdev->children[port1-1],
 				USB_STATE_NOTATTACHED);
 	if (!hub->error && !hub_is_superspeed(hub->hdev))
 		ret = clear_port_feature(hdev, port1, USB_PORT_FEAT_ENABLE);
@@ -742,7 +778,7 @@
 
 		} else {
 			/* The power session is gone; tell khubd */
-			usb_set_device_state(udev, USB_STATE_NOTATTACHED);
+			ethub_usb_set_device_state(udev, USB_STATE_NOTATTACHED);
 			set_bit(port1, hub->change_bits);
 		}
 	}
@@ -760,6 +796,12 @@
 
 		/* Don't do a long sleep inside a workqueue routine */
 		if (type == HUB_INIT2) {
+			if (hub_is_superspeed(hdev) &&
+				(hdev->descriptor.idVendor == cpu_to_le16(0x1c04) ||
+				hdev->descriptor.idProduct == cpu_to_le16(0x0008))) {
+				delay = 5000;
+			}
+
 			PREPARE_DELAYED_WORK(&hub->init_work, hub_init_func3);
 			schedule_delayed_work(&hub->init_work,
 					msecs_to_jiffies(delay));
@@ -816,7 +858,7 @@
 		/* Disconnect all the children */
 		for (i = 0; i < hdev->maxchild; ++i) {
 			if (hdev->children[i])
-				usb_disconnect(&hdev->children[i]);
+				ethub_usb_disconnect(&hdev->children[i]);
 		}
 	}
 
@@ -894,7 +936,13 @@
 		(hdev->maxchild == 1) ? "" : "s");
 
 	hub->port_owners = kzalloc(hdev->maxchild * sizeof(void *), GFP_KERNEL);
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,4,0))
+	hdev->children = kzalloc(hdev->maxchild *
+				sizeof(struct usb_device *), GFP_KERNEL);
+	if (!hdev->children || !hub->port_owners) {
+#else
 	if (!hub->port_owners) {
+#endif
 		ret = -ENOMEM;
 		goto fail;
 	}
@@ -917,27 +965,27 @@
 		dev_dbg(hub_dev, "standalone hub\n");
 
 	switch (wHubCharacteristics & HUB_CHAR_LPSM) {
-		case 0x00:
+	case HUB_CHAR_COMMON_LPSM:
 			dev_dbg(hub_dev, "ganged power switching\n");
 			break;
-		case 0x01:
+	case HUB_CHAR_INDV_PORT_LPSM:
 			dev_dbg(hub_dev, "individual port power switching\n");
 			break;
-		case 0x02:
-		case 0x03:
+	case HUB_CHAR_NO_LPSM:
+	case HUB_CHAR_LPSM:
 			dev_dbg(hub_dev, "no power switching (usb 1.0)\n");
 			break;
 	}
 
 	switch (wHubCharacteristics & HUB_CHAR_OCPM) {
-		case 0x00:
+	case HUB_CHAR_COMMON_OCPM:
 			dev_dbg(hub_dev, "global over-current protection\n");
 			break;
-		case 0x08:
+	case HUB_CHAR_INDV_PORT_OCPM:
 			dev_dbg(hub_dev, "individual port over-current protection\n");
 			break;
-		case 0x10:
-		case 0x18:
+	case HUB_CHAR_NO_OCPM:
+	case HUB_CHAR_OCPM:
 			dev_dbg(hub_dev, "no over-current protection\n");
                         break;
 	}
@@ -945,13 +993,13 @@
 	spin_lock_init (&hub->tt.lock);
 	INIT_LIST_HEAD (&hub->tt.clear_list);
 	switch (hdev->descriptor.bDeviceProtocol) {
-		case 0:
+	case USB_HUB_PR_FS:
 			break;
-		case 1:
+	case USB_HUB_PR_HS_SINGLE_TT:
 			dev_dbg(hub_dev, "Single TT\n");
 			hub->tt.hub = hdev;
 			break;
-		case 2:
+	case USB_HUB_PR_HS_MULTI_TT:
 			ret = usb_set_interface(hdev, 0, 1);
 			if (ret == 0) {
 				dev_dbg(hub_dev, "TT per port\n");
@@ -961,7 +1009,7 @@
 					ret);
 			hub->tt.hub = hdev;
 			break;
-		case 3:
+	case USB_HUB_PR_SS:
 			/* USB 3.0 hubs don't have a TT */
 			break;
 		default:
@@ -1120,7 +1168,10 @@
 
 static void hub_disconnect(struct usb_interface *intf)
 {
-	struct usb_hub *hub = usb_get_intfdata (intf);
+	struct usb_hub *hub = usb_get_intfdata(intf);
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,4,0))
+	struct usb_device *hdev = interface_to_usbdev(intf);
+#endif
 
 	/* Take the hub off the event list and don't let it be added again */
 	spin_lock_irq(&hub_event_lock);
@@ -1142,6 +1193,9 @@
 		highspeed_hubs--;
 
 	usb_free_urb(hub->urb);
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,4,0))
+	kfree(hdev->children);
+#endif
 	kfree(hub->port_owners);
 	kfree(hub->descriptor);
 	kfree(hub->status);
@@ -1163,7 +1217,13 @@
 	if (!usb_is_etron_hcd(hdev))
 		return -ENODEV;
 
-	/* Hubs have proper suspend/resume support. */
+	/* Hubs have proper suspend/resume support.  USB 3.0 device suspend is
+	 * different from USB 2.0/1.1 device suspend, and unfortunately we
+	 * don't support it yet.  So leave autosuspend disabled for USB 3.0
+	 * external hubs for now.  Enable autosuspend for USB 3.0 roothubs,
+	 * since that isn't a "real" hub.
+	 */
+	if (!hub_is_superspeed(hdev) || !hdev->parent)
 	usb_enable_autosuspend(hdev);
 
 	if (hdev->level == MAX_TOPO_LEVEL) {
@@ -1221,7 +1281,6 @@
 	return -ENODEV;
 }
 
-/* No BKL needed */
 static int
 hub_ioctl(struct usb_interface *intf, unsigned int code, void *user_data)
 {
@@ -1505,16 +1564,20 @@
 	 * this device (and any of its children) will fail immediately.
 	 * this quiesces everything except pending urbs.
 	 */
-	usb_set_device_state(udev, USB_STATE_NOTATTACHED);
+	ethub_usb_set_device_state(udev, USB_STATE_NOTATTACHED);
 	dev_info(&udev->dev, "USB disconnect, device number %d\n",
 			udev->devnum);
 
 	usb_lock_device(udev);
 
 	/* Free up all the children before we remove this device */
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,4,0))
+	for (i = 0; i < udev->maxchild; i++) {
+#else
 	for (i = 0; i < USB_MAXCHILDREN; i++) {
+#endif
 		if (udev->children[i])
-			usb_disconnect(&udev->children[i]);
+			ethub_usb_disconnect(&udev->children[i]);
 	}
 
 	/* deallocate hcd/hardware state ... nuking all pending urbs and
@@ -1611,10 +1674,87 @@
 						      udev->descriptor.iManufacturer);
 		udev->serial = usb_cache_string(udev, udev->descriptor.iSerialNumber);
 	}
+#if (LINUX_VERSION_CODE == KERNEL_VERSION(3,2,40))
+	usb_detect_interface_quirks(udev);
+#endif
 fail:
 	return err;
 }
 
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,4,0))
+static void set_usb_port_removable(struct usb_device *udev)
+{
+	struct usb_device *hdev = udev->parent;
+	struct usb_hub *hub;
+	u8 port = udev->portnum;
+	u16 wHubCharacteristics;
+	bool removable = true;
+
+	if (!hdev)
+		return;
+
+	hub = hdev_to_hub(udev->parent);
+
+	wHubCharacteristics = le16_to_cpu(hub->descriptor->wHubCharacteristics);
+
+	if (!(wHubCharacteristics & HUB_CHAR_COMPOUND))
+		return;
+
+	if (hub_is_superspeed(hdev)) {
+		if (hub->descriptor->u.ss.DeviceRemovable & (1 << port))
+			removable = false;
+	} else {
+		if (hub->descriptor->u.hs.DeviceRemovable[port / 8] & (1 << (port % 8)))
+			removable = false;
+	}
+
+	if (removable)
+		udev->removable = USB_DEVICE_REMOVABLE;
+	else
+		udev->removable = USB_DEVICE_FIXED;
+}
+#endif
+
+static void usb_set_uas_device_quirks(struct usb_device *udev)
+{
+	struct device_driver *driver;
+	struct usb_interface_cache *intfc;
+	struct usb_host_interface *alts;
+	int i;
+
+#define USB_QUIRK_UAS_MODE		0x80000000
+#define USB_QUIRK_BOT_MODE		0x40000000
+
+	driver = driver_find("uas", &usb_bus_type);
+	if (driver == NULL)
+		return;
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0))
+	put_driver(driver);
+#endif
+	if (!udev->config || udev->speed != USB_SPEED_SUPER)
+		return;
+
+	intfc = udev->config[0].intf_cache[0];
+	for (i = 0; i < intfc->num_altsetting; i++) {
+		alts = &intfc->altsetting[i];
+		if (alts->desc.bInterfaceClass == USB_CLASS_MASS_STORAGE &&
+			alts->desc.bInterfaceSubClass == USB_SC_SCSI &&
+			alts->desc.bInterfaceProtocol == USB_PR_UAS) {
+			struct usb_hub *hub = hdev_to_hub(udev->parent);
+
+			if (!test_bit(udev->portnum, hub->bot_mode_bits)) {
+				udev->quirks |= USB_QUIRK_UAS_MODE;
+				dev_info(&udev->dev, "set UAS mode quirk\n");
+			} else {
+				udev->quirks |= USB_QUIRK_BOT_MODE;
+				dev_info(&udev->dev, "set BOT mode quirk\n");
+			}
+			return;
+		}
+	}
+}
+
 #ifdef MY_ABC_HERE
 /* Return 1 if found the same serial in another usb device. Otherwise, return 0. */
 static int device_serial_match(struct usb_device *dev, struct usb_device *udev_search)
@@ -1766,15 +1906,19 @@
 	/* Tell the world! */
 	announce_device(udev);
 
-	if (udev->serial)
-		add_device_randomness(udev->serial, strlen(udev->serial));
-	if (udev->product)
-		add_device_randomness(udev->product, strlen(udev->product));
-	if (udev->manufacturer)
-		add_device_randomness(udev->manufacturer,
-				      strlen(udev->manufacturer));
-
+	usb_set_uas_device_quirks(udev);
 	device_enable_async_suspend(&udev->dev);
+
+	/*
+	 * check whether the hub marks this port as non-removable. Do it
+	 * now so that platform-specific data can override it in
+	 * device_add()
+	 */
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,4,0))
+	if (udev->parent)
+		set_usb_port_removable(udev);
+#endif
+
 	/* Register the device.  The device driver is responsible
 	 * for configuring the device and invoking the add-device
 	 * notifier chain (used by usbfs and possibly others).
@@ -1791,7 +1935,7 @@
 	return err;
 
 fail:
-	usb_set_device_state(udev, USB_STATE_NOTATTACHED);
+	ethub_usb_set_device_state(udev, USB_STATE_NOTATTACHED);
 	pm_runtime_disable(&udev->dev);
 	pm_runtime_set_suspended(&udev->dev);
 	return err;
@@ -1899,11 +2043,10 @@
 #define HUB_SHORT_RESET_TIME	10
 #define HUB_BH_RESET_TIME	50
 #define HUB_LONG_RESET_TIME	200
-#define HUB_RESET_TIMEOUT	500
+#define HUB_RESET_TIMEOUT	800
 
 static int hub_port_reset(struct usb_hub *hub, int port1,
-			struct usb_device *udev, unsigned int delay, bool warm, bool init);
-static int hub_port_debounce(struct usb_hub *hub, int port1);
+			struct usb_device *udev, unsigned int delay, bool warm);
 
 /* Is a USB 3.0 port in the Inactive or Compliance Mode state?
  * Port warm reset is required to recover
@@ -1914,13 +2057,11 @@
 		(((portstatus & USB_PORT_STAT_LINK_STATE) ==
 		  USB_SS_PORT_LS_SS_INACTIVE) ||
 		 ((portstatus & USB_PORT_STAT_LINK_STATE) ==
-		  USB_SS_PORT_LS_COMP_MOD) ||
-		 ((portstatus & USB_PORT_STAT_LINK_STATE) ==
-		  USB_SS_PORT_LS_POLLING)) ;
+		  USB_SS_PORT_LS_COMP_MOD)) ;
 }
 
 static int hub_port_wait_reset(struct usb_hub *hub, int port1,
-			struct usb_device *udev, unsigned int delay, bool warm, bool init)
+			struct usb_device *udev, unsigned int delay, bool warm)
 {
 	int delay_time, ret;
 	u16 portstatus;
@@ -1937,39 +2078,9 @@
 		if (ret < 0)
 			return ret;
 
-		/*
-		 * Some buggy devices require a warm reset to be issued even
-		 * when the port appears not to be connected.
-		 */
-		if (init) {
-			if (portchange & USB_PORT_STAT_C_CONNECTION) {
-				hub_port_debounce(hub, port1);
-				ret = hub_port_status(hub, port1, &portstatus, &portchange);
-				if (ret < 0)
-					return ret;
-			}
-
-			/* if we`ve finished resetting, then break out of
-			 * the loop
-			 */
-			if (!(portstatus & USB_PORT_STAT_RESET) &&
-			    (portstatus & USB_PORT_STAT_ENABLE)) {
-				if (hub_is_superspeed(hub->hdev))
-					udev->speed = USB_SPEED_SUPER;
-				else if (portstatus & USB_PORT_STAT_HIGH_SPEED)
-					udev->speed = USB_SPEED_HIGH;
-				else if (portstatus & USB_PORT_STAT_LOW_SPEED)
-					udev->speed = USB_SPEED_LOW;
-				else
-					udev->speed = USB_SPEED_FULL;
-				return 0;
-			}
-		}
-
-		if (warm) {
-			if (portchange & USB_PORT_STAT_C_BH_RESET)
-				return 0;
-		}
+		/* The port state is unknown until the reset completes. */
+		if (!(portstatus & USB_PORT_STAT_RESET))
+			break;
 
 		/* switch to the long delay after two short delay failures */
 		if (delay_time >= 2 * HUB_SHORT_RESET_TIME)
@@ -1980,100 +2091,108 @@
 			port1, warm ? "warm " : "", delay);
 	}
 
-	if (init) {
+	if ((portstatus & USB_PORT_STAT_RESET))
+		return -EBUSY;
+
+	if (hub_port_warm_reset_required(hub, portstatus))
+		return -ENOTCONN;
+
 		/* Device went away? */
 		if (!(portstatus & USB_PORT_STAT_CONNECTION))
 			return -ENOTCONN;
 
-		/* bomb out completely if the connection bounced */
-		if ((portchange & USB_PORT_STAT_C_CONNECTION))
+	/* bomb out completely if the connection bounced.  A USB 3.0
+	 * connection may bounce if multiple warm resets were issued,
+	 * but the device may have successfully re-connected. Ignore it.
+	 */
+	if (!hub_is_superspeed(hub->hdev) &&
+			(portchange & USB_PORT_STAT_C_CONNECTION))
 			return -ENOTCONN;
-	}
 
+	if (!(portstatus & USB_PORT_STAT_ENABLE))
 	return -EBUSY;
+
+	if (!udev)
+		return 0;
+
+	if (hub_is_superspeed(hub->hdev))
+		udev->speed = USB_SPEED_SUPER;
+	else if (portstatus & USB_PORT_STAT_HIGH_SPEED)
+		udev->speed = USB_SPEED_HIGH;
+	else if (portstatus & USB_PORT_STAT_LOW_SPEED)
+		udev->speed = USB_SPEED_LOW;
+	else
+		udev->speed = USB_SPEED_FULL;
+	return 0;
 }
 
 static void hub_port_finish_reset(struct usb_hub *hub, int port1,
-			struct usb_device *udev, int *status, bool warm, bool init)
+			struct usb_device *udev, int *status)
 {
 	switch (*status) {
 	case 0:
-		if (init) {
-			struct usb_hcd *hcd;
 			/* TRSTRCY = 10 ms; plus some extra */
 			msleep(10 + 40);
+		if (udev) {
+			struct usb_hcd *hcd = bus_to_hcd(udev->bus);
+
 			update_devnum(udev, 0);
-			hcd = bus_to_hcd(udev->bus);
-			if (hcd->driver->reset_device) {
-				*status = hcd->driver->reset_device(hcd, udev);
-				if (*status < 0) {
-					dev_err(&udev->dev, "Cannot reset "
-							"HCD device state\n");
-					break;
-				}
-			}
+			/* The xHC may think the device is already reset,
+			 * so ignore the status.
+			 */
+			if (hcd->driver->reset_device)
+				hcd->driver->reset_device(hcd, udev);
 		}
 		/* FALL THROUGH */
 	case -ENOTCONN:
 	case -ENODEV:
 		clear_port_feature(hub->hdev,
 				port1, USB_PORT_FEAT_C_RESET);
-		/* FIXME need disconnect() for NOTATTACHED device */
-		if (warm) {
+		if (hub_is_superspeed(hub->hdev)) {
 			clear_port_feature(hub->hdev, port1,
 					USB_PORT_FEAT_C_BH_PORT_RESET);
 			clear_port_feature(hub->hdev, port1,
 					USB_PORT_FEAT_C_PORT_LINK_STATE);
+			clear_port_feature(hub->hdev, port1,
+					USB_PORT_FEAT_C_CONNECTION);
 		}
-
-		if (init) {
-			usb_set_device_state(udev, *status
+		if (udev)
+			ethub_usb_set_device_state(udev, *status
 					? USB_STATE_NOTATTACHED
 					: USB_STATE_DEFAULT);
-		}
 		break;
 	}
 }
 
 /* Handle port reset and port warm(BH) reset (for USB3 protocol ports) */
 static int hub_port_reset(struct usb_hub *hub, int port1,
-			struct usb_device *udev, unsigned int delay, bool warm, bool init)
+			struct usb_device *udev, unsigned int delay, bool warm)
 {
 	int i, status;
-	u16 portstatus, portchange;
+	u16 portchange, portstatus;
 
-	if (warm) {
 		if (!hub_is_superspeed(hub->hdev)) {
+		if (warm) {
 			dev_err(hub->intfdev, "only USB3 hub support "
 						"warm reset\n");
 			return -EINVAL;
 		}
-	} else {
-		if (hub_is_superspeed(hub->hdev)) {
-			status = hub_port_status(hub, port1, &portstatus, &portchange);
+	} else if (!warm) {
+		/*
+		 * If the caller hasn't explicitly requested a warm reset,
+		 * double check and see if one is needed.
+		 */
+		status = hub_port_status(hub, port1,
+					&portstatus, &portchange);
 			if (status < 0)
 				goto done;
 
-			if (hub_port_warm_reset_required(hub, portstatus)) {
+		if (hub_port_warm_reset_required(hub, portstatus))
 				warm = true;
-				delay = HUB_LONG_RESET_TIME;
-			}
-		}
 	}
 
 	/* Reset the port */
 	for (i = 0; i < PORT_RESET_TRIES; i++) {
-		if (portchange & USB_PORT_STAT_C_CONNECTION) {
-			clear_port_feature(hub->hdev, port1,
-				USB_PORT_FEAT_C_CONNECTION);
-		}
-
-		if ((portchange & USB_PORT_STAT_C_LINK_STATE) &&
-				hub_is_superspeed(hub->hdev)) {
-			clear_port_feature(hub->hdev, port1,
-				USB_PORT_FEAT_C_PORT_LINK_STATE);
-		}
-
 		status = set_port_feature(hub->hdev, port1, (warm ?
 					USB_PORT_FEAT_BH_PORT_RESET :
 					USB_PORT_FEAT_RESET));
@@ -2083,28 +2202,40 @@
 					warm ? "warm " : "", port1, status);
 		} else {
 			status = hub_port_wait_reset(hub, port1, udev, delay,
-								warm, init);
+								warm);
 			if (status && status != -ENOTCONN)
 				dev_dbg(hub->intfdev,
 						"port_wait_reset: err = %d\n",
 						status);
 		}
 
-		if (hub_is_superspeed(hub->hdev)) {
-			status = hub_port_status(hub, port1, &portstatus, &portchange);
-			if (status < 0)
+		/* Check for disconnect or reset */
+		if (status == 0 || status == -ENOTCONN || status == -ENODEV) {
+			hub_port_finish_reset(hub, port1, udev, &status);
+
+			if (!hub_is_superspeed(hub->hdev))
 				goto done;
 
-			if (hub_port_warm_reset_required(hub, portstatus)) {
-				warm = true;
-				status = -EBUSY;
-			}
-		}
+			/*
+			 * If a USB 3.0 device migrates from reset to an error
+			 * state, re-issue the warm reset.
+			 */
+			if (hub_port_status(hub, port1,
+					&portstatus, &portchange) < 0)
+				goto done;
 
-		/* return on disconnect or reset */
-		if (status == 0 || status == -ENOTCONN || status == -ENODEV) {
-			hub_port_finish_reset(hub, port1, udev, &status, warm, init);
+			if (!hub_port_warm_reset_required(hub, portstatus))
 			goto done;
+
+			/*
+			 * If the port is in SS.Inactive or Compliance Mode, the
+			 * hot or warm reset failed.  Try another warm reset.
+			 */
+			if (!warm) {
+				dev_dbg(hub->intfdev, "hot reset failed, warm reset port %d\n",
+						port1);
+				warm = true;
+			}
 		}
 
 		dev_dbg (hub->intfdev,
@@ -2285,7 +2416,11 @@
 			dev_dbg(&udev->dev, "won't remote wakeup, status %d\n",
 					status);
 			/* bail if autosuspend is requested */
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,2,0))
 			if (PMSG_IS_AUTO(msg))
+#else
+			if (msg.event & PM_EVENT_AUTO)
+#endif
 				return status;
 		}
 	}
@@ -2310,14 +2445,23 @@
 				USB_CTRL_SET_TIMEOUT);
 
 		/* System sleep transitions should never fail */
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,2,0))
 		if (!PMSG_IS_AUTO(msg))
+#else
+		if (!(msg.event & PM_EVENT_AUTO))
+#endif
 			status = 0;
 	} else {
 		/* device has up to 10 msec to fully suspend */
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,2,0))
 		dev_dbg(&udev->dev, "usb %ssuspend, wakeup %d\n",
 				(PMSG_IS_AUTO(msg) ? "auto-" : ""),
 				udev->do_remote_wakeup);
-		usb_set_device_state(udev, USB_STATE_SUSPENDED);
+#else
+		dev_dbg(&udev->dev, "usb %ssuspend\n",
+				(msg.event & PM_EVENT_AUTO ? "auto-" : ""));
+#endif
+		ethub_usb_set_device_state(udev, USB_STATE_SUSPENDED);
 		msleep(10);
 	}
 	usb_mark_last_busy(hub->hdev);
@@ -2338,7 +2482,7 @@
 static int finish_port_resume(struct usb_device *udev)
 {
 	int	status = 0;
-	u16	devstatus;
+	u16	devstatus = 0;
 
 	/* caller owns the udev device lock */
 	dev_dbg(&udev->dev, "%s\n",
@@ -2349,7 +2493,7 @@
 	 * first two on the host side; they'd be inside hub_port_init()
 	 * during many timeouts, but khubd can't suspend until later.
 	 */
-	usb_set_device_state(udev, udev->actconfig
+	ethub_usb_set_device_state(udev, udev->actconfig
 			? USB_STATE_CONFIGURED
 			: USB_STATE_ADDRESS);
 
@@ -2383,7 +2527,13 @@
 	if (status) {
 		dev_dbg(&udev->dev, "gone after usb resume? status %d\n",
 				status);
-	} else if (udev->actconfig) {
+	/*
+	 * There are a few quirky devices which violate the standard
+	 * by claiming to have remote wakeup enabled after a reset,
+	 * which crash if the feature is cleared, hence check for
+	 * udev->reset_resume
+	 */
+	} else if (udev->actconfig && !udev->reset_resume) {
 		le16_to_cpus(&devstatus);
 		if (devstatus & (1 << USB_DEVICE_REMOTE_WAKEUP)) {
 			status = usb_control_msg(udev,
@@ -2466,8 +2616,13 @@
 				port1, status);
 	} else {
 		/* drive resume for at least 20 msec */
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,2,0))
 		dev_dbg(&udev->dev, "usb %sresume\n",
 				(PMSG_IS_AUTO(msg) ? "auto-" : ""));
+#else
+		dev_dbg(&udev->dev, "usb %sresume\n",
+				(msg.event & PM_EVENT_AUTO ? "auto-" : ""));
+#endif
 		msleep(25);
 
 		/* Virtual root hubs can trigger on GET_PORT_STATUS to
@@ -2570,7 +2725,11 @@
 		udev = hdev->children [port1-1];
 		if (udev && udev->can_submit) {
 			dev_warn(&intf->dev, "port %d nyet suspended\n", port1);
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,2,0))
 			if (PMSG_IS_AUTO(msg))
+#else
+			if (msg.event & PM_EVENT_AUTO)
+#endif
 				return -EBUSY;
 		}
 	}
@@ -2726,7 +2885,7 @@
 	if (retval == 0) {
 		update_devnum(udev, devnum);
 		/* Device now using proper address. */
-		usb_set_device_state(udev, USB_STATE_ADDRESS);
+		ethub_usb_set_device_state(udev, USB_STATE_ADDRESS);
 		usb_ep0_reinit(udev);
 	}
 	return retval;
@@ -2752,7 +2911,11 @@
 	int			i, j, retval;
 	unsigned		delay = HUB_SHORT_RESET_TIME;
 	enum usb_device_speed	oldspeed = udev->speed;
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,2,0))
 	const char		*speed;
+#else
+	char 			*speed, *type;
+#endif
 	int			devnum = udev->devnum;
 
 	/* root hub ports have a slightly longer reset period
@@ -2773,7 +2936,7 @@
 
 	/* Reset the device; full speed may morph to high speed */
 	/* FIXME a USB 2.0 device may morph into SuperSpeed on reset. */
-	retval = hub_port_reset(hub, port1, udev, delay, false, true);
+	retval = hub_port_reset(hub, port1, udev, delay, false);
 	if (retval < 0)		/* error or disconnect */
 		goto fail;
 	/* success, speed is known */
@@ -2813,6 +2976,7 @@
 		goto fail;
 	}
 
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,2,0))
 	if (udev->speed == USB_SPEED_WIRELESS)
 		speed = "variable speed Wireless";
 	else
@@ -2823,6 +2987,27 @@
 				"%s %s USB device number %d using %s\n",
 				(udev->config) ? "reset" : "new", speed,
 				devnum, udev->bus->controller->driver->name);
+#else
+	type = "";
+	switch (udev->speed) {
+	case USB_SPEED_LOW:	speed = "low";	break;
+	case USB_SPEED_FULL:	speed = "full";	break;
+	case USB_SPEED_HIGH:	speed = "high";	break;
+	case USB_SPEED_SUPER:
+				speed = "super";
+				break;
+	case USB_SPEED_WIRELESS:
+				speed = "variable";
+				type = "Wireless ";
+				break;
+	default: 		speed = "?";	break;
+	}
+	if (udev->speed != USB_SPEED_SUPER)
+		dev_info(&udev->dev,
+				"%s %s speed %sUSB device number %d using %s\n",
+				(udev->config) ? "reset" : "new", speed, type,
+				devnum, udev->bus->controller->driver->name);
+#endif
 
 	/* Set up TT records, if needed  */
 	if (hdev->tt) {
@@ -2911,7 +3096,7 @@
 		dev_err(&udev->dev, "got a wrong device descriptor, "
 				"warm reset device\n");
 		hub_port_reset(hub, port1, udev,
-				HUB_BH_RESET_TIME, true, false);
+				HUB_BH_RESET_TIME, true);
 		retval = -EINVAL;
 		goto fail;
 	}
@@ -2921,7 +3106,11 @@
 		i = 512;
 	else
 		i = udev->descriptor.bMaxPacketSize0;
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,2,0))
 	if (usb_endpoint_maxp(&udev->ep0.desc) != i) {
+#else
+	if (le16_to_cpu(udev->ep0.desc.wMaxPacketSize) != i) {
+#endif
 		if (udev->speed == USB_SPEED_LOW ||
 				!(i == 8 || i == 16 || i == 32 || i == 64)) {
 			dev_err(&udev->dev, "Invalid ep0 maxpacket: %d\n", i);
@@ -2945,8 +3134,10 @@
 		goto fail;
 	}
 
+#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,2,0))
 	if (udev->wusb == 0 && le16_to_cpu(udev->descriptor.bcdUSB) >= 0x0201)
 		usb_get_bos_descriptor(udev);
+#endif
 
 	retval = 0;
 	/* notify HCD that we have a device connected and addressed */
@@ -3060,15 +3251,6 @@
 				USB_PORT_STAT_C_ENABLE);
 #endif
 
-	if (hub_port_warm_reset_required(hub, portstatus)) {
-		hub_port_reset(hub, port1, NULL,
-				HUB_BH_RESET_TIME, true, false);
-		status = hub_port_status(hub, port1,
-				&portstatus, &portchange);
-		if (status < 0)
-			return;
-	}
-
 	/* Try to resuscitate an existing device */
 	udev = hdev->children[port1-1];
 	if ((portstatus & USB_PORT_STAT_CONNECTION) && udev &&
@@ -3084,7 +3266,7 @@
 			/* For a suspended device, treat this as a
 			 * remote wakeup event.
 			 */
-			status = usb_remote_wakeup(udev);
+			status = ethub_usb_remote_wakeup(udev);
 #endif
 
 		} else {
@@ -3100,15 +3282,17 @@
 
 	/* Disconnect any existing devices under this port */
 	if (udev)
-		usb_disconnect(&hdev->children[port1-1]);
+		ethub_usb_disconnect(&hdev->children[port1-1]);
 	clear_bit(port1, hub->change_bits);
 
 	/* We can forget about a "removed" device when there's a physical
 	 * disconnect or the connect status changes.
 	 */
 	if (!(portstatus & USB_PORT_STAT_CONNECTION) ||
-			(portchange & USB_PORT_STAT_C_CONNECTION))
+			(portchange & USB_PORT_STAT_C_CONNECTION)) {
 		clear_bit(port1, hub->removed_bits);
+		clear_bit(port1, hub->bot_mode_bits);
+	}
 
 	if (portchange & (USB_PORT_STAT_C_CONNECTION |
 				USB_PORT_STAT_C_ENABLE)) {
@@ -3137,10 +3321,6 @@
 		if (portstatus & USB_PORT_STAT_ENABLE)
   			goto done;
 
-		if (hub_port_warm_reset_required(hub, portstatus)) {
-			set_bit(port1, hub->change_bits);
-			kick_kethubd(hub);
-		}
 		return;
 	}
 
@@ -3157,7 +3337,7 @@
 			goto done;
 		}
 
-		usb_set_device_state(udev, USB_STATE_POWERED);
+		ethub_usb_set_device_state(udev, USB_STATE_POWERED);
  		udev->bus_mA = hub->mA_per_port;
 		udev->level = hdev->level + 1;
 		udev->wusb = 0;
@@ -3239,7 +3419,7 @@
 
 		/* Run it through the hoops (find a driver, etc) */
 		if (!status) {
-			status = usb_new_device(udev);
+			status = ethub_usb_new_device(udev);
 			if (status) {
 				spin_lock_irq(&device_state_lock);
 				hdev->children[port1-1] = NULL;
@@ -3305,7 +3485,7 @@
 		msleep(10);
 
 		usb_lock_device(udev);
-		ret = usb_remote_wakeup(udev);
+		ret = ethub_usb_remote_wakeup(udev);
 		usb_unlock_device(udev);
 		if (ret < 0)
 			connect_change = 1;
@@ -3393,7 +3573,7 @@
 			dev_dbg (hub_dev, "resetting for error %d\n",
 				hub->error);
 
-			ret = usb_reset_device(hdev);
+			ret = ethub_usb_reset_device(hdev);
 			if (ret) {
 				dev_dbg (hub_dev,
 					"error resetting hub: %d\n", ret);
@@ -3419,6 +3599,28 @@
 			if (ret < 0)
 				continue;
 
+			/* Warm reset a USB3 protocol port if it's in
+			 * SS.Inactive, Compliance Mode or Polling state.
+			 */
+			if (hub_port_warm_reset_required(hub, portstatus)) {
+				struct usb_device *udev =
+					hdev->children[i - 1];
+
+				dev_dbg(hub_dev, "warm reset port %d\n", i);
+				if (!udev) {
+					ret = hub_port_reset(hub, i, NULL,
+							HUB_BH_RESET_TIME, true);
+				} else {
+					usb_lock_device(udev);
+					ret = ethub_usb_reset_device(udev);
+					usb_unlock_device(udev);
+				}
+				ret = hub_port_status(hub, i,
+						&portstatus, &portchange);
+				if (ret < 0)
+					continue;
+			}
+
 			if (portchange & USB_PORT_STAT_C_CONNECTION) {
 				clear_port_feature(hdev, i,
 					USB_PORT_FEAT_C_CONNECTION);
@@ -3698,6 +3900,30 @@
 	return changed;
 }
 
+static void usb_stop_device(struct usb_device *udev)
+{
+	int i;
+	struct usb_hcd *hcd = bus_to_hcd(udev->bus);
+	struct usb_host_endpoint *ep;
+
+
+	for (i = 1; i < 16; i++) {
+		ep = udev->ep_out[i];
+		if (ep) {
+			ep->enabled = 0;
+			if (hcd->driver->stop_endpoint)
+				hcd->driver->stop_endpoint(hcd, udev, ep);
+		}
+
+		ep = udev->ep_in[i];
+		if (ep) {
+			ep->enabled = 0;
+			if (hcd->driver->stop_endpoint)
+				hcd->driver->stop_endpoint(hcd, udev, ep);
+		}
+	}
+}
+
 /**
  * usb_reset_and_verify_device - perform a USB port reset to reinitialize a device
  * @udev: device to reset (not in SUSPENDED or NOTATTACHED state)
@@ -3751,6 +3977,11 @@
 	}
 	parent_hub = hdev_to_hub(parent_hdev);
 
+	dev_warn(parent_hub->intfdev,
+		"%s - port %x\n", __func__, port1);
+
+	usb_stop_device(udev);
+
 	set_bit(port1, parent_hub->busy_bits);
 	for (i = 0; i < SET_CONFIG_TRIES; ++i) {
 
@@ -3798,7 +4029,7 @@
 		goto re_enumerate;
   	}
 	mutex_unlock(hcd->bandwidth_mutex);
-	usb_set_device_state(udev, USB_STATE_CONFIGURED);
+	ethub_usb_set_device_state(udev, USB_STATE_CONFIGURED);
 
 	/* Put interfaces back into the same altsettings as before.
 	 * Don't bother to send the Set-Interface request for interfaces
diff -ur a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c
--- a/drivers/usb/core/hub.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/core/hub.c	2014-02-17 11:57:11.000000000 +0100
@@ -2375,80 +2375,35 @@
 		if ((portstatus & USB_PORT_STAT_RESET))
 			goto delay;
 
-		/*
-		 * Some buggy devices require a warm reset to be issued even
-		 * when the port appears not to be connected.
+		if (hub_port_warm_reset_required(hub, portstatus))
+			return -ENOTCONN;
+
+		/* Device went away? */
+		if (!(portstatus & USB_PORT_STAT_CONNECTION))
+			return -ENOTCONN;
+
+		/* bomb out completely if the connection bounced.  A USB 3.0
+		 * connection may bounce if multiple warm resets were issued,
+		 * but the device may have successfully re-connected. Ignore it.
 		 */
-		if (!warm) {
-			/*
-			 * Some buggy devices can cause an NEC host controller
-			 * to transition to the "Error" state after a hot port
-			 * reset.  This will show up as the port state in
-			 * "Inactive", and the port may also report a
-			 * disconnect.  Forcing a warm port reset seems to make
-			 * the device work.
-			 *
-			 * See https://bugzilla.kernel.org/show_bug.cgi?id=41752
-			 */
-			if (hub_port_warm_reset_required(hub, portstatus)) {
-				int ret;
+		if (!hub_is_superspeed(hub->hdev) &&
+				(portchange & USB_PORT_STAT_C_CONNECTION))
+			return -ENOTCONN;
 
-				if ((portchange & USB_PORT_STAT_C_CONNECTION))
-					clear_port_feature(hub->hdev, port1,
-							USB_PORT_FEAT_C_CONNECTION);
-				if (portchange & USB_PORT_STAT_C_LINK_STATE)
-					clear_port_feature(hub->hdev, port1,
-							USB_PORT_FEAT_C_PORT_LINK_STATE);
-				if (portchange & USB_PORT_STAT_C_RESET)
-					clear_port_feature(hub->hdev, port1,
-							USB_PORT_FEAT_C_RESET);
-				dev_dbg(hub->intfdev, "hot reset failed, warm reset port %d\n",
-						port1);
-				ret = hub_port_reset(hub, port1,
-						udev, HUB_BH_RESET_TIME,
-						true);
-				if ((portchange & USB_PORT_STAT_C_CONNECTION))
-					clear_port_feature(hub->hdev, port1,
-							USB_PORT_FEAT_C_CONNECTION);
-				return ret;
-		}
-#ifdef MY_DEF_HERE
-			if (portchange & (USB_PORT_STAT_C_CONNECTION | USB_PORT_STAT_C_ENABLE))
-			{
-				hub_port_debounce(hub, port1);
-				ret = hub_port_status(hub, port1, &portstatus, &portchange);
-				if (ret < 0)
-					return ret;
-			}
-#else
-			/* Device went away? */
-			if (!(portstatus & USB_PORT_STAT_CONNECTION))
-				return -ENOTCONN;
-
-			/* bomb out completely if the connection bounced */
-			if ((portchange & USB_PORT_STAT_C_CONNECTION))
-				return -ENOTCONN;
-#endif
-
-			if ((portstatus & USB_PORT_STAT_ENABLE)) {
-				if (hub_is_wusb(hub))
-					udev->speed = USB_SPEED_WIRELESS;
-				else if (hub_is_superspeed(hub->hdev))
-					udev->speed = USB_SPEED_SUPER;
-				else if (portstatus & USB_PORT_STAT_HIGH_SPEED)
-					udev->speed = USB_SPEED_HIGH;
-				else if (portstatus & USB_PORT_STAT_LOW_SPEED)
-					udev->speed = USB_SPEED_LOW;
-				else
-					udev->speed = USB_SPEED_FULL;
+		if ((portstatus & USB_PORT_STAT_ENABLE)) {
+			if (!udev)
 				return 0;
-			}
-		} else {
-			if (!(portstatus & USB_PORT_STAT_CONNECTION) ||
-					hub_port_warm_reset_required(hub,
-						portstatus))
-				return -ENOTCONN;
 
+			if (hub_is_wusb(hub))
+				udev->speed = USB_SPEED_WIRELESS;
+			else if (hub_is_superspeed(hub->hdev))
+				udev->speed = USB_SPEED_SUPER;
+			else if (portstatus & USB_PORT_STAT_HIGH_SPEED)
+				udev->speed = USB_SPEED_HIGH;
+			else if (portstatus & USB_PORT_STAT_LOW_SPEED)
+				udev->speed = USB_SPEED_LOW;
+			else
+				udev->speed = USB_SPEED_FULL;
 			return 0;
 		}
 
@@ -2477,16 +2432,16 @@
 }
 
 static void hub_port_finish_reset(struct usb_hub *hub, int port1,
-			struct usb_device *udev, int *status, bool warm)
+			struct usb_device *udev, int *status)
 {
 	switch (*status) {
 	case 0:
-		if (!warm) {
-			struct usb_hcd *hcd;
-			/* TRSTRCY = 10 ms; plus some extra */
-			msleep(10 + 40);
+		/* TRSTRCY = 10 ms; plus some extra */
+		msleep(10 + 40);
+		if (udev) {
+			struct usb_hcd *hcd = bus_to_hcd(udev->bus);
+
 			update_devnum(udev, 0);
-			hcd = bus_to_hcd(udev->bus);
 			/* The xHC may think the device is already reset,
 			 * so ignore the status.
 			 */
@@ -2498,14 +2453,15 @@
 	case -ENODEV:
 		clear_port_feature(hub->hdev,
 				port1, USB_PORT_FEAT_C_RESET);
-		/* FIXME need disconnect() for NOTATTACHED device */
 		if (hub_is_superspeed(hub->hdev)) {
 			clear_port_feature(hub->hdev, port1,
 					USB_PORT_FEAT_C_BH_PORT_RESET);
 			clear_port_feature(hub->hdev, port1,
 					USB_PORT_FEAT_C_PORT_LINK_STATE);
+			clear_port_feature(hub->hdev, port1,
+					USB_PORT_FEAT_C_CONNECTION);
 		}
-		if (!warm)
+		if (udev)
 			usb_set_device_state(udev, *status
 					? USB_STATE_NOTATTACHED
 					: USB_STATE_DEFAULT);
@@ -2513,23 +2469,39 @@
 	}
 }
 
+#if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_USB_MARVELL_ERRATA_FE_9049667)
+extern void (*gpfn_ehci_marvell_hs_detect_wa_done)(struct usb_device *);
+#endif
+
 /* Handle port reset and port warm(BH) reset (for USB3 protocol ports) */
 static int hub_port_reset(struct usb_hub *hub, int port1,
 			struct usb_device *udev, unsigned int delay, bool warm)
 {
 	int i, status;
+	u16 portchange, portstatus;
 
-	if (!warm) {
-		/* Block EHCI CF initialization during the port reset.
-		 * Some companion controllers don't like it when they mix.
-		 */
-		down_read(&ehci_cf_port_reset_rwsem);
-	} else {
-		if (!hub_is_superspeed(hub->hdev)) {
+	if (!hub_is_superspeed(hub->hdev)) {
+		if (warm) {
 			dev_err(hub->intfdev, "only USB3 hub support "
 						"warm reset\n");
 			return -EINVAL;
 		}
+		/* Block EHCI CF initialization during the port reset.
+		 * Some companion controllers don't like it when they mix.
+		 */
+		down_read(&ehci_cf_port_reset_rwsem);
+	} else if (!warm) {
+		/*
+		 * If the caller hasn't explicitly requested a warm reset,
+		 * double check and see if one is needed.
+		 */
+		status = hub_port_status(hub, port1,
+					&portstatus, &portchange);
+		if (status < 0)
+			goto done;
+
+		if (hub_port_warm_reset_required(hub, portstatus))
+			warm = true;
 	}
 
 	/* Reset the port */
@@ -2550,10 +2522,33 @@
 						status);
 		}
 
-		/* return on disconnect or reset */
+		/* Check for disconnect or reset */
 		if (status == 0 || status == -ENOTCONN || status == -ENODEV) {
-			hub_port_finish_reset(hub, port1, udev, &status, warm);
-			goto done;
+			hub_port_finish_reset(hub, port1, udev, &status);
+
+			if (!hub_is_superspeed(hub->hdev))
+				goto done;
+
+			/*
+			 * If a USB 3.0 device migrates from reset to an error
+			 * state, re-issue the warm reset.
+			 */
+			if (hub_port_status(hub, port1,
+					&portstatus, &portchange) < 0)
+				goto done;
+
+			if (!hub_port_warm_reset_required(hub, portstatus))
+				goto done;
+
+			/*
+			 * If the port is in SS.Inactive or Compliance Mode, the
+			 * hot or warm reset failed.  Try another warm reset.
+			 */
+			if (!warm) {
+				dev_dbg(hub->intfdev, "hot reset failed, warm reset port %d\n",
+						port1);
+				warm = true;
+			}
 		}
 
 		dev_dbg (hub->intfdev,
@@ -2567,8 +2562,13 @@
 		port1);
 
 done:
-	if (!warm)
+	if (!hub_is_superspeed(hub->hdev))
 		up_read(&ehci_cf_port_reset_rwsem);
+#if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_USB_MARVELL_ERRATA_FE_9049667)
+	if (NULL != gpfn_ehci_marvell_hs_detect_wa_done) {
+		gpfn_ehci_marvell_hs_detect_wa_done(hub->hdev);
+	}
+#endif
 
 	return status;
 }
@@ -3220,11 +3220,15 @@
 	return retval;
 }
 
-#if defined(MY_DEF_HERE) || defined(MY_ABC_HERE)
+#if defined(MY_DEF_HERE) || defined(MY_ABC_HERE) || defined(MY_ABC_HERE)
 enum XHCI_SPECIAL_RESET_MODE xhci_special_reset = XHCI_SPECIAL_RESET_PAUSE;
 EXPORT_SYMBOL_GPL(xhci_special_reset);
 #define SPECIAL_RESET_RETRY 20 // times
+#if defined(CONFIG_SYNO_COMCERTO)
+#define IS_XHCI(hub) (!strcmp(hub->hdev->bus->controller->driver->name, "xhci-hcd"))
+#else
 #define IS_XHCI(hub) (!strcmp(hub->hdev->bus->controller->driver->name, "xhci_hcd"))
+#endif //CONFIG_SYNO_COMCERTO
 #endif
 
 /* Reset device, (re)assign address, get device descriptor.
@@ -3249,7 +3253,7 @@
 	enum usb_device_speed	oldspeed = udev->speed;
 	const char		*speed;
 	int			devnum = udev->devnum;
-#ifdef MY_DEF_HERE
+#if defined(MY_DEF_HERE) || defined(MY_ABC_HERE)
 #ifdef MY_DEF_HERE
 	bool reset_for_addr_err = true;
 #endif
@@ -3280,13 +3284,13 @@
 
 	mutex_lock(&usb_address0_mutex);
 
-#ifdef MY_DEF_HERE
+#if defined(MY_DEF_HERE) || defined(MY_ABC_HERE)
 port_init_retry:
 #endif
 
 	/* Reset the device; full speed may morph to high speed */
 	/* FIXME a USB 2.0 device may morph into SuperSpeed on reset. */
-#ifdef MY_DEF_HERE
+#if defined(MY_DEF_HERE) || defined(MY_ABC_HERE)
 	// special reset for xhci ports during reboot only
 	if (!IS_XHCI(hub) ||
 			(XHCI_SPECIAL_RESET_RUN!= xhci_special_reset)) {
@@ -3299,7 +3303,7 @@
 		for (xhci_retry = 0; xhci_retry < SPECIAL_RESET_RETRY; xhci_retry++) {
 			retval = hub_port_reset(hub, port1, udev, delay, false);
 			dev_dbg(&udev->dev, "hub_port_reset2. %dth. speed:%d. ret:%d.\n", xhci_retry, udev->speed, retval);
-			if (udev->speed !=USB_SPEED_HIGH || retval < 0) { // USB 2.0 device may morph into SuperSpeed
+			if (udev->speed == USB_SPEED_SUPER || retval < 0) { // USB 2.0 device may morph into SuperSpeed
 				break;
 			}
 		}
@@ -3556,18 +3560,16 @@
 		goto fail;
 	}
 
-#ifdef MY_DEF_HERE
-		dev_dbg(&udev->dev, "vid:0x%x.pid:0x%x.\n", le16_to_cpu(udev->descriptor.idVendor), le16_to_cpu(udev->descriptor.idProduct));
-		dev_dbg(&udev->dev, "bcdUSB:0x%x.\n", le16_to_cpu(udev->descriptor.bcdUSB));
-		// check bcdUSB or pid/vid if we need to do special reset to morph the speed to super
-		//if (0x??? == le16_to_cpu(udev->descriptor.idProduct) && 0x??? == le16_to_cpu(udev->descriptor.idVendor)) {
-		if (0x0210 <= le16_to_cpu(udev->descriptor.bcdUSB)) { // Innostor's bcdUSB is 0x210 if it's speed is high, 0x300 if it's speed is super
+#if defined(MY_DEF_HERE) || defined(MY_ABC_HERE)
+		//A USB2 device's bcdUSB must be equal to or less than 0x0200
+		//A USB3 device's bcdUSB should be 0x0300, but if a speed downgrade happens, its value could be 0x0210 or higher
+		if (0x0210 <= le16_to_cpu(udev->descriptor.bcdUSB)) {
 			if (IS_XHCI(hub) &&
-					udev->speed != USB_SPEED_SUPER &&
-					(XHCI_SPECIAL_RESET_PAUSE == xhci_special_reset) &&
-					NULL == hub->hdev->parent) { // skip special reset for device which is behind a external hub
+				USB_SPEED_SUPER != udev->speed &&
+				XHCI_SPECIAL_RESET_PAUSE == xhci_special_reset &&
+				NULL == hub->hdev->parent) { //skip special reset for a device behind an external hub
 				hub_port_disable(hub, port1, 0);
-				update_devnum(udev, devnum); /* for disconnect processing */
+				update_devnum(udev, devnum); //for disconnect processing
 				oldspeed = USB_SPEED_UNKNOWN;
 				xhci_special_reset = XHCI_SPECIAL_RESET_RUN;
 				dev_err(&udev->dev, "Special reset for xhci.\n");
@@ -4137,12 +4139,21 @@
 			 */
 			if (hub_port_warm_reset_required(hub, portstatus)) {
 				int status;
+				struct usb_device *udev =
+					hub->hdev->children[i - 1];
 
 				dev_dbg(hub_dev, "warm reset port %d\n", i);
-				status = hub_port_reset(hub, i, NULL,
-						HUB_BH_RESET_TIME, true);
-				if (status < 0)
-					hub_port_disable(hub, i, 1);
+				if (!udev) {
+					status = hub_port_reset(hub, i,
+							NULL, HUB_BH_RESET_TIME,
+							true);
+					if (status < 0)
+						hub_port_disable(hub, i, 1);
+				} else {
+					usb_lock_device(udev);
+					status = usb_reset_device(udev);
+					usb_unlock_device(udev);
+				}
 				connect_change = 0;
 			}
 
diff -ur a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c
--- a/drivers/usb/core/usb.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/core/usb.c	2014-02-17 11:57:11.000000000 +0100
@@ -59,6 +59,10 @@
 #define usb_autosuspend_delay		0
 #endif
 
+#if defined(CONFIG_SYNO_ARMADA_ARCH)
+void (*gpfn_ehci_marvell_hs_detect_wa_done)(struct usb_device *) = NULL;
+EXPORT_SYMBOL(gpfn_ehci_marvell_hs_detect_wa_done);
+#endif
 
 /**
  * usb_find_alt_setting() - Given a configuration, find the alternate setting
@@ -652,6 +656,73 @@
 }
 EXPORT_SYMBOL_GPL(__usb_get_extra_descriptor);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+static struct usb_device *match_device_name(struct usb_device *dev,
+					    const char *name)
+{
+	struct usb_device *ret_dev = NULL;
+	int child;
+
+	dev_dbg(&dev->dev, "check for name %s ...\n", name);
+
+	/* see if this device matches */
+	if (strcmp(dev_name(&dev->dev), name) == 0 ) {
+		dev_dbg(&dev->dev, "matched this device!\n");
+		ret_dev = usb_get_dev(dev);
+		goto exit;
+	}
+
+	/* look through all of the children of this device */
+	for (child = 0; child < dev->maxchild; ++child) {
+		if (dev->children[child]) {
+			usb_lock_device(dev->children[child]);
+			ret_dev = match_device_name(dev->children[child], name);
+			usb_unlock_device(dev->children[child]);
+			if (ret_dev)
+				goto exit;
+		}
+	}
+exit:
+	return ret_dev;
+}
+
+/**
+ * usb_find_device_by_name - find a specific usb device in the system
+ * @name: the name of the device to find
+ *
+ * Returns a pointer to a struct usb_device if a device with the
+ * specified name is currently present in the system.  The usage count
+ * of the device will be incremented if a device is found.  Make sure to
+ * call usb_put_dev() when the caller is finished with the device.
+ *
+ * If a device with the specified name is not found, NULL is returned.
+ */
+struct usb_device *usb_find_device_by_name(const char *name)
+{
+	struct list_head *buslist;
+	struct usb_bus *bus;
+	struct usb_device *dev = NULL;
+
+	mutex_lock(&usb_bus_list_lock);
+	for (buslist = usb_bus_list.next;
+	     buslist != &usb_bus_list;
+	     buslist = buslist->next) {
+		bus = container_of(buslist, struct usb_bus, bus_list);
+		if (!bus->root_hub)
+			continue;
+		usb_lock_device(bus->root_hub);
+		dev = match_device_name(bus->root_hub, name);
+		usb_unlock_device(bus->root_hub);
+		if (dev)
+			goto exit;
+	}
+exit:
+	mutex_unlock(&usb_bus_list_lock);
+	return dev;
+}
+EXPORT_SYMBOL_GPL(usb_find_device_by_name);
+#endif
+
 /**
  * usb_alloc_coherent - allocate dma-consistent buffer for URB_NO_xxx_DMA_MAP
  * @dev: device the buffer will be used with
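The usb_find_device_by_name() helper added to usb.c above walks every registered bus from its root hub down, compares dev_name() against the requested bus id, and returns the first match with its reference count raised. A minimal caller sketch, assuming the CONFIG_SYNO_COMCERTO export is enabled and that a prototype for the helper is visible to the caller (its declaration is not part of this excerpt); the function name and the "1-1.2" bus id below are purely illustrative:

#include <linux/usb.h>

static int example_lookup_by_name(void)
{
	/* usb bus ids look like "1-1.2": bus 1, root port 1, hub port 2 */
	struct usb_device *udev = usb_find_device_by_name("1-1.2");

	if (!udev)
		return -ENODEV;

	dev_info(&udev->dev, "found %04x:%04x\n",
		 le16_to_cpu(udev->descriptor.idVendor),
		 le16_to_cpu(udev->descriptor.idProduct));

	usb_put_dev(udev);	/* drop the reference taken by the lookup */
	return 0;
}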
diff -ur a/drivers/usb/core/usb.h b/drivers/usb/core/usb.h
--- a/drivers/usb/core/usb.h	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/core/usb.h	2014-02-17 11:57:11.000000000 +0100
@@ -59,8 +59,6 @@
 #if defined(CONFIG_USB_ETRON_HUB)
 extern int usb_is_etron_hcd(struct usb_device *udev);
 extern void ethub_usb_kick_kethubd(struct usb_device *hdev);
-void ethub_usb_wakeup_notification(struct usb_device *hdev,
-		unsigned int portnum);
 extern int ethub_usb_remove_device(struct usb_device *udev);
 extern int ethub_usb_hub_claim_port(struct usb_device *hdev, unsigned port1, void *owner);
 extern int ethub_usb_hub_release_port(struct usb_device *hdev, unsigned port1, void *owner);
@@ -74,12 +72,21 @@
 extern int ethub_usb_authorize_device(struct usb_device *usb_dev);
 extern int ethub_usb_port_suspend(struct usb_device *udev, pm_message_t msg);
 extern int ethub_usb_port_resume(struct usb_device *udev, pm_message_t msg);
-extern int ethub_usb_remote_wakeup(struct usb_device *udev);
 extern void ethub_usb_root_hub_lost_power(struct usb_device *rhdev);
 extern void ethub_usb_ep0_reinit(struct usb_device *udev);
 extern int ethub_usb_reset_device(struct usb_device *udev);
 extern int ethub_init(void);
 extern void ethub_cleanup(void);
+
+#ifdef CONFIG_USB_SUSPEND
+extern int ethub_usb_remote_wakeup(struct usb_device *udev);
+#else
+static inline int ethub_usb_remote_wakeup(struct usb_device *udev)
+{
+	return 0;
+}
+#endif
+
 #endif
 
 #ifdef	CONFIG_PM
Only in b/drivers/usb: dwc_otg.
diff -ur a/drivers/usb/host/ehci.h b/drivers/usb/host/ehci.h
--- a/drivers/usb/host/ehci.h	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/ehci.h	2014-02-17 11:57:11.000000000 +0100
@@ -147,6 +147,9 @@
 	unsigned		use_dummy_qh:1;	/* AMD Frame List table quirk*/
 	unsigned		has_synopsys_hc_bug:1; /* Synopsys HC */
 	unsigned		frame_index_bug:1; /* MosChip (AKA NetMos) */
+#if defined(CONFIG_SYNO_COMCERTO)
+	unsigned		ignore_oc:1;
+#endif
 
 	/* required for usb32 quirk */
 	#define OHCI_CTRL_HCFS          (3 << 6)
@@ -690,6 +693,10 @@
 { }
 #endif
 
+#if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_USB_MARVELL_ERRATA_FE_9049667)
+extern int ehci_marvell_hs_detect_wa(struct ehci_hcd *ehci, int busnum);
+#endif
+
 /*-------------------------------------------------------------------------*/
 
 /*
diff -ur a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c
--- a/drivers/usb/host/ehci-hcd.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/ehci-hcd.c	2014-02-17 11:57:11.000000000 +0100
@@ -763,7 +763,11 @@
 		"USB %x.%x started, EHCI %x.%02x%s\n",
 		((ehci->sbrn & 0xf0)>>4), (ehci->sbrn & 0x0f),
 		temp >> 8, temp & 0xff,
+#if defined(CONFIG_SYNO_COMCERTO)
+		(ignore_oc || ehci->ignore_oc) ? ", overcurrent ignored" : "");
+#else
 		ignore_oc ? ", overcurrent ignored" : "");
+#endif
 
 	ehci_writel(ehci, INTR_MASK,
 		    &ehci->regs->intr_enable); /* Turn On Interrupts */
diff -ur a/drivers/usb/host/ehci-hub.c b/drivers/usb/host/ehci-hub.c
--- a/drivers/usb/host/ehci-hub.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/ehci-hub.c	2014-02-17 11:57:11.000000000 +0100
@@ -578,7 +578,11 @@
 	 * always set, seem to clear PORT_OCC and PORT_CSC when writing to
 	 * PORT_POWER; that's surprising, but maybe within-spec.
 	 */
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (!ignore_oc && !ehci->ignore_oc)
+#else
 	if (!ignore_oc)
+#endif
 		mask = PORT_CSC | PORT_PEC | PORT_OCC;
 	else
 		mask = PORT_CSC | PORT_PEC;
@@ -803,7 +807,11 @@
 		if (temp & PORT_PEC)
 			status |= USB_PORT_STAT_C_ENABLE << 16;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+		if ((temp & PORT_OCC) && (!ignore_oc && !ehci->ignore_oc)){
+#else
 		if ((temp & PORT_OCC) && !ignore_oc){
+#endif
 			status |= USB_PORT_STAT_C_OVERCURRENT << 16;
 
 			/*
@@ -874,8 +882,22 @@
 			retval = handshake(ehci, status_reg,
 					PORT_RESET, 0, 1000);
 			if (retval != 0) {
+
+#if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_USB_MARVELL_ERRATA_FE_9049667)
+				/*
+				 * Attempt to resolve HS reset error by
+				 * applying the HS detect WA
+				 */
+				if (ehci_marvell_hs_detect_wa(ehci,
+							hcd->self.busnum)) {
+					ehci_err(ehci, "port %d reset error %d\n",
+						wIndex + 1, retval);
+				}
+#else
 				ehci_err (ehci, "port %d reset error %d\n",
 					wIndex + 1, retval);
+#endif
+
 				goto error;
 			}
 
@@ -938,6 +960,15 @@
 	if (status & ~0xffff)	/* only if wPortChange is interesting */
 #endif
 		dbg_port (ehci, "GetStatus", wIndex + 1, temp);
+
+#if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_USB_MARVELL_ERRATA_FE_9049667)
+		if ((temp & PORT_CONNECT) && (temp & PORT_PEC) &&
+				(temp & PORT_CSC)) {
+			if (!ehci_marvell_hs_detect_wa(ehci, hcd->self.busnum))
+				goto error;
+		}
+#endif
+
 		put_unaligned_le32(status, buf);
 		break;
 	case SetHubFeature:
diff -ur a/drivers/usb/host/ehci_marvell.c b/drivers/usb/host/ehci_marvell.c
--- a/drivers/usb/host/ehci_marvell.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/ehci_marvell.c	2014-02-17 11:57:11.000000000 +0100
@@ -40,6 +40,110 @@
 
 static int ehci_marvell_setup(struct usb_hcd *hcd);
 
+#if defined(CONFIG_SYNO_ARMADA_ARCH) && defined(CONFIG_USB_MARVELL_ERRATA_FE_9049667)
+/* In a370 and axp USB UTMI PHY there is an erratum which causes
+ * errors in the detection of high speed devices. For certain devices
+ * with low pull up values the USB MAC doesn't detect the end of the
+ * device chirp K signal and therefore remains stuck in the reset
+ * state.
+ * The workaround solves this issue by modifying the UTMI PHY
+ * squelch threshold once a high speed port reset error is detected.
+ * Modifying the squelch level enables the MAC to detect the end of
+ * the device chirp K signal and to come out of reset. Once the MAC
+ * comes out of reset, a consecutive reset attempt is made by the USB
+ * stack. This reset attempt succeeds due to the updated squelch level.
+ *
+ * Since the optimal squelch level is device dependent, the WA
+ * toggles between two verified squelch levels, 0xA and 0xE. */
+
+#define MAX_EHCI_PORTS		3
+#define PHY_RX_CTRL_REG_OFFSET(x) (0x708 + (0x40 * (x)))
+#define SQUELCH_TH_OFFSET	4
+#define SQUELCH_TH_MASK		0xF
+
+static int hs_wa_applied[MAX_EHCI_PORTS] = {0};
+
+static void ehci_marvell_toggle_squelch(struct ehci_hcd *ehci, int busnum)
+{
+	u32 __iomem *phy_rx_ctrl_reg;
+	u32 val, squelch_th;
+
+	phy_rx_ctrl_reg = (u32 __iomem *)(((u8 __iomem *)ehci->regs)
+			+ PHY_RX_CTRL_REG_OFFSET(busnum - 1));
+
+	val = ehci_readl(ehci, phy_rx_ctrl_reg);
+
+	squelch_th = (val >> SQUELCH_TH_OFFSET) & SQUELCH_TH_MASK;
+	if (squelch_th == 0xA)
+		squelch_th = 0xE;
+	else
+		squelch_th = 0xA;
+
+	val &= ~(SQUELCH_TH_MASK << SQUELCH_TH_OFFSET);
+	val |= (squelch_th & SQUELCH_TH_MASK) << SQUELCH_TH_OFFSET;
+
+	ehci_writel(ehci, val, phy_rx_ctrl_reg);
+}
+
+void ehci_marvell_hs_detect_wa_done(struct usb_device *udev)
+{
+	struct usb_hcd *hcd = bus_to_hcd(udev->bus);
+	struct ehci_hcd	*ehci = hcd_to_ehci(hcd);
+	int busnum = hcd->self.busnum;
+
+	if (hs_wa_applied[busnum])
+		ehci_marvell_toggle_squelch(ehci, busnum);
+
+	hs_wa_applied[busnum] = 0;
+}
+
+extern void (*gpfn_ehci_marvell_hs_detect_wa_done)(struct usb_device *udev);
+
+int ehci_marvell_hs_detect_wa(struct ehci_hcd *ehci, int busnum)
+{
+	u32 __iomem *portsc_reg;
+	u32 val = 0;
+	u32 timeout;
+	if (NULL == gpfn_ehci_marvell_hs_detect_wa_done) {
+		gpfn_ehci_marvell_hs_detect_wa_done = &ehci_marvell_hs_detect_wa_done;
+	}
+
+	/* Apply the WA only once in a reset cycle */
+	if (hs_wa_applied[busnum]++)
+		return 1;
+
+	ehci_marvell_toggle_squelch(ehci, busnum);
+
+	/*
+	 * After the squelch value is replaced we need to
+	 * wait up to 3 ms for the MAC to leave reset state.
+	 */
+	portsc_reg = &ehci->regs->port_status[0];
+	timeout = 30;
+	while (timeout--) {
+		udelay(100);
+		val = ehci_readl(ehci, portsc_reg);
+		if ((val & PORT_RESET) == 0)
+			break;
+	}
+
+	/* Return an error if the MAC doesn't come out of reset */
+	if (val & PORT_RESET)
+		return 1;
+
+	/*
+	 * Clear Connect Status Change, Port Enable, and Port Enable Change.
+	 * This returns the port status to its pre-reset state and allows a
+	 * successful consecutive reset.
+	 */
+	val = ehci_readl(ehci, portsc_reg);
+	val = val  & (~PORT_PE);
+	val = (val  & (~PORT_RWC_BITS)) | PORT_CSC | PORT_PEC;
+	ehci_writel(ehci, val, portsc_reg);
+
+	return 0;
+}
+#endif /* CONFIG_SYNO_ARMADA_ARCH && CONFIG_USB_MARVELL_ERRATA_FE_9049667 */
 
 void 	ehci_marvell_port_status_changed(struct ehci_hcd *ehci)
 {
@@ -223,5 +327,3 @@
 #endif
     .shutdown = usb_hcd_platform_shutdown, 
 };  
-
-
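For reference, the squelch threshold toggled by ehci_marvell_toggle_squelch() above sits in bits [7:4] of the per-port UTMI PHY RX control register at offset 0x708 + 0x40 * (busnum - 1) from the EHCI register base. A minimal read-back sketch built from the same macros, e.g. for debugging; the helper name is illustrative and not part of the patch:

static u32 ehci_marvell_read_squelch(struct ehci_hcd *ehci, int busnum)
{
	u32 __iomem *phy_rx_ctrl_reg;
	u32 val;

	phy_rx_ctrl_reg = (u32 __iomem *)(((u8 __iomem *)ehci->regs)
			+ PHY_RX_CTRL_REG_OFFSET(busnum - 1));

	val = ehci_readl(ehci, phy_rx_ctrl_reg);

	/* extract the 4-bit squelch threshold field (expected 0xA or 0xE) */
	return (val >> SQUELCH_TH_OFFSET) & SQUELCH_TH_MASK;
}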
diff -ur a/drivers/usb/host/etxhci.c b/drivers/usb/host/etxhci.c
--- a/drivers/usb/host/etxhci.c	2013-08-24 11:36:50.000000000 +0200
+++ b/drivers/usb/host/etxhci.c	2014-02-17 11:57:11.000000000 +0100
@@ -184,6 +184,9 @@
 		xhci->bus_state[i].port_c_suspend = 0;
 		xhci->bus_state[i].suspended_ports = 0;
 		xhci->bus_state[i].resuming_ports = 0;
+		xhci->bus_state[i].port_c_connection = 0;
+		xhci->bus_state[i].downgraded_ports = 0;
+		xhci->bus_state[i].downgraded_open = 0;
 	}
 
 	return ret;
@@ -817,7 +820,7 @@
 	command &= ~CMD_RUN;
 	xhci_writel(xhci, command, &xhci->op_regs->command);
 	if (handshake(xhci, &xhci->op_regs->status,
-		      STS_HALT, STS_HALT, 100*100)) {
+		      STS_HALT, STS_HALT, XHCI_MAX_HALT_USEC)) {
 		xhci_warn(xhci, "WARN: xHC CMD_RUN timeout\n");
 		spin_unlock_irq(&xhci->lock);
 		return -ETIMEDOUT;
@@ -1648,6 +1651,70 @@
 	return 0;
 }
 
+void etxhci_stop_endpoint(struct usb_hcd *hcd, struct usb_device *udev,
+		struct usb_host_endpoint *ep)
+{
+	struct xhci_hcd *xhci;
+	struct xhci_virt_device *virt_dev;
+	struct xhci_ep_ctx *ep_ctx;
+	struct xhci_command *cmd;
+	unsigned int ep_index;
+	unsigned long flags;
+	int ret, timeleft;
+
+#ifndef MY_ABC_HERE
+	printk("%s\n", __func__);
+#endif
+	ret = xhci_check_args(hcd, udev, ep, 1, true, __func__);
+	if (ret < 0)
+		return;
+
+	xhci = hcd_to_xhci(hcd);
+	virt_dev = xhci->devs[udev->slot_id];
+	ep_index = etxhci_get_endpoint_index(&ep->desc);
+	ep_ctx = etxhci_get_ep_ctx(xhci, virt_dev->out_ctx, ep_index);
+	if ((ep_ctx->ep_info & cpu_to_le32(EP_STATE_MASK)) != cpu_to_le32(EP_STATE_RUNNING)) {
+		xhci_dbg(xhci, "xHCI %s called with non-running ep %p\n",
+				__func__, ep);
+		return;
+	}
+
+	cmd = etxhci_alloc_command(xhci, false, true, GFP_NOIO);
+	if (!cmd) {
+		xhci_dbg(xhci, "Couldn't allocate command structure.\n");
+		return;
+	}
+
+	spin_lock_irqsave(&xhci->lock, flags);
+	cmd->command_trb = xhci->cmd_ring->enqueue;
+	if (TRB_TYPE_LINK_LE32(cmd->command_trb->link.control)) {
+		cmd->command_trb = xhci->cmd_ring->enq_seg->next->trbs;
+	}
+	list_add_tail(&cmd->cmd_list, &virt_dev->cmd_list);
+	etxhci_queue_stop_endpoint(xhci, udev->slot_id, ep_index, 0);
+	etxhci_ring_cmd_db(xhci);
+	spin_unlock_irqrestore(&xhci->lock, flags);
+
+	/* Wait for last stop endpoint command to finish */
+	timeleft = wait_for_completion_interruptible_timeout(
+			cmd->completion,
+			USB_CTRL_SET_TIMEOUT);
+	if (timeleft <= 0) {
+		xhci_warn(xhci, "%s while waiting for stop endpoint command\n",
+				timeleft == 0 ? "Timeout" : "Signal");
+		spin_lock_irqsave(&xhci->lock, flags);
+		/* The timeout might have raced with the event ring handler, so
+		 * only delete from the list if the item isn't poisoned.
+		 */
+		if (cmd->cmd_list.next != LIST_POISON1) {
+			list_del(&cmd->cmd_list);
+		}
+		spin_unlock_irqrestore(&xhci->lock, flags);
+	}
+
+	etxhci_free_command(xhci, cmd);
+}
+
 static void xhci_zero_in_ctx(struct xhci_hcd *xhci, struct xhci_virt_device *virt_dev)
 {
 	struct xhci_input_control_ctx *ctrl_ctx;
@@ -1943,6 +2010,12 @@
 	if (xhci->xhc_state & XHCI_STATE_DYING)
 		return -ENODEV;
 
+#ifndef MY_ABC_HERE
+	ret = xhci_downgrade_to_usb2(hcd, udev);
+	if (!ret)
+		return -ENODEV;
+#endif
+
 	xhci_dbg(xhci, "%s called for udev %p\n", __func__, udev);
 	virt_dev = xhci->devs[udev->slot_id];
 
@@ -2105,6 +2178,118 @@
 	}
 }
 
+static void etxhci_prev_endpoint_reset(struct usb_hcd *hcd,
+		struct usb_host_endpoint *ep)
+{
+	struct xhci_hcd *xhci;
+	struct usb_device *udev;
+	unsigned int ep_index;
+	unsigned int last_ctx = 0;
+	struct xhci_ep_ctx *out_ep_ctx;
+	struct xhci_ep_ctx *tmp_ep_ctx;
+	struct xhci_ep_ctx *in_ep_ctx;
+	struct xhci_container_ctx *in_ctx, *out_ctx;
+	struct xhci_input_control_ctx *ctrl_ctx;
+	struct xhci_slot_ctx *out_slot_ctx, *in_slot_ctx;
+	u32 cur_add_flags, drop_flag, add_flag;
+	int i, ret = 0;
+
+	xhci = hcd_to_xhci(hcd);
+	udev = (struct usb_device *) ep->hcpriv;
+	
+	if (!ep->hcpriv)
+		return;
+	
+	ep_index = etxhci_get_endpoint_index(&ep->desc);
+	in_ctx = xhci->devs[udev->slot_id]->in_ctx;
+	out_ctx = xhci->devs[udev->slot_id]->out_ctx;
+	
+	out_ep_ctx = etxhci_get_ep_ctx(xhci, out_ctx, ep_index);
+	in_ep_ctx = etxhci_get_ep_ctx(xhci, in_ctx, ep_index);
+
+	out_slot_ctx = etxhci_get_slot_ctx(xhci, out_ctx);
+	in_slot_ctx = etxhci_get_slot_ctx(xhci, in_ctx);
+
+	ctrl_ctx = etxhci_get_input_control_ctx(xhci, in_ctx);
+
+	if ((EP_STATE_RUNNING == (le32_to_cpu(out_ep_ctx->ep_info) & EP_STATE_MASK)) &&
+			(USB_SPEED_SUPER != udev->speed) &&
+			(SLOT_STATE_CONFIGURED == GET_SLOT_STATE(le32_to_cpu(out_slot_ctx->dev_state)))) {
+
+		drop_flag = etxhci_get_endpoint_flag_from_index(ep_index);
+		add_flag = 0;
+
+		etxhci_slot_copy(xhci, in_ctx, out_ctx);
+		etxhci_endpoint_copy(xhci, in_ctx, out_ctx, ep_index);
+
+		ctrl_ctx->drop_flags |= cpu_to_le32(drop_flag);
+		ctrl_ctx->add_flags &= cpu_to_le32(add_flag);
+
+		cur_add_flags = 2;
+		for (i = 1; i < 31; i++) {
+			tmp_ep_ctx = etxhci_get_ep_ctx(xhci, out_ctx, i);
+			if ((tmp_ep_ctx->ep_info & cpu_to_le32(EP_STATE_MASK)) != cpu_to_le32(EP_STATE_DISABLED) &&
+				!(le32_to_cpu(ctrl_ctx->add_flags) & etxhci_get_endpoint_flag_from_index(i)) &&
+				!(le32_to_cpu(ctrl_ctx->drop_flags) & etxhci_get_endpoint_flag_from_index(i)))
+				cur_add_flags |= etxhci_get_endpoint_flag_from_index(i);
+		}
+
+		last_ctx = etxhci_last_valid_endpoint(le32_to_cpu(ctrl_ctx->add_flags) | cur_add_flags);
+		/* Update the last valid endpoint context, if we deleted the last one */
+		in_slot_ctx->dev_info &= cpu_to_le32(~LAST_CTX_MASK);
+		in_slot_ctx->dev_info |= cpu_to_le32(LAST_CTX(last_ctx));
+
+		xhci_dbg(xhci, "drop ep 0x%x, slot id %d, drop flag = %#x, add flag = %#x, new in slot info = %#x\n",
+				(unsigned int) ep->desc.bEndpointAddress,
+				udev->slot_id,
+				(unsigned int) drop_flag,
+				(unsigned int) add_flag,
+				(unsigned int) in_slot_ctx->dev_info);
+
+		ret = xhci_configure_endpoint(xhci, udev, NULL,
+			false, false);
+		if (ret)
+			xhci_warn(xhci, "%s - ret %d\n", __func__,ret);
+
+		drop_flag = 0;
+		add_flag = etxhci_get_endpoint_flag_from_index(ep_index);
+
+		ctrl_ctx->drop_flags &= cpu_to_le32(drop_flag);
+		ctrl_ctx->add_flags |= cpu_to_le32(add_flag);
+
+		cur_add_flags = 2;
+		for (i = 1; i < 31; i++) {
+			tmp_ep_ctx = etxhci_get_ep_ctx(xhci, out_ctx, i);
+			if ((tmp_ep_ctx->ep_info & cpu_to_le32(EP_STATE_MASK)) != cpu_to_le32(EP_STATE_DISABLED) &&
+				!(le32_to_cpu(ctrl_ctx->add_flags) & etxhci_get_endpoint_flag_from_index(i)) &&
+				!(le32_to_cpu(ctrl_ctx->drop_flags) & etxhci_get_endpoint_flag_from_index(i)))
+				cur_add_flags |= etxhci_get_endpoint_flag_from_index(i);
+		}
+
+		last_ctx = etxhci_last_valid_endpoint(le32_to_cpu(ctrl_ctx->add_flags) | cur_add_flags );
+		/* Update the last valid endpoint context, if we deleted the last one */
+		in_slot_ctx->dev_info &= cpu_to_le32(~LAST_CTX_MASK);
+		in_slot_ctx->dev_info |= cpu_to_le32(LAST_CTX(last_ctx));
+
+		ep->hcpriv = udev;
+
+		xhci_dbg(xhci, "add ep 0x%x, slot id %d, drop flag = %#x, add flag = %#x, new in slot info = %#x\n",
+				(unsigned int) ep->desc.bEndpointAddress,
+				udev->slot_id,
+				(unsigned int) drop_flag,
+				(unsigned int) add_flag,
+				(unsigned int) in_slot_ctx->dev_info);
+
+		ret = xhci_configure_endpoint(xhci, udev, NULL,
+			false, false);
+		if (ret)
+			xhci_warn(xhci, "%s - ret %d\n", __func__,ret);
+
+	}
+
+	return ;
+}
+
 /* Deal with stalled endpoints.  The core should have sent the control message
  * to clear the halt condition.  However, we need to make the xHCI hardware
  * reset its sequence number, since a device will expect a sequence number of
@@ -2131,6 +2316,7 @@
 	ep_index = etxhci_get_endpoint_index(&ep->desc);
 	virt_ep = &xhci->devs[udev->slot_id]->eps[ep_index];
 	if (!virt_ep->stopped_td) {
+		etxhci_prev_endpoint_reset(hcd, ep);
 		xhci_dbg(xhci, "Endpoint 0x%x not halted, refusing to reset.\n",
 				ep->desc.bEndpointAddress);
 		return;
@@ -2341,14 +2527,14 @@
 		unsigned int num_streams, gfp_t mem_flags)
 {
 	int i, ret;
-	struct xhci_hcd *xhci;
+	struct xhci_hcd *xhci = hcd_to_xhci(hcd);
 	struct xhci_virt_device *vdev;
 	struct xhci_command *config_cmd;
 	struct xhci_slot_ctx *slot_ctx;
 	unsigned int ep_index;
 	unsigned int num_stream_ctxs;
 	unsigned long flags;
-	u32 changed_ep_bitmask = 0, temp;
+	u32 changed_ep_bitmask = 0;
 
 	if (!eps)
 		return -EINVAL;
@@ -2357,7 +2543,6 @@
 	 * stream 0 that is reserved for xHCI usage.
 	 */
 	num_streams += 1;
-	xhci = hcd_to_xhci(hcd);
 	xhci_dbg(xhci, "Driver wants %u stream IDs (including stream 0).\n",
 			num_streams);
 
@@ -2460,15 +2645,6 @@
 		vdev->eps[ep_index].ep_state |= EP_HAS_STREAMS;
 	}
 	etxhci_free_command(xhci, config_cmd);
-
-	if (!xhci->hcc_params1 || ((xhci->hcc_params1 & 0xff) == 0x30)) {
-		temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x40c0);
-		temp = (temp & 0xffffff00) | 0x00;
-		xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x40c0);
-		temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x40cc);
-		temp = (temp & 0xffffff00) | 0xc2;
-		xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x40cc);
-	}
 	spin_unlock_irqrestore(&xhci->lock, flags);
 
 	/* Subtract 1 for stream 0, which drivers can't use */
@@ -2507,7 +2683,7 @@
 	struct xhci_command *command;
 	unsigned int ep_index;
 	unsigned long flags;
-	u32 changed_ep_bitmask, temp;
+	u32 changed_ep_bitmask;
 
 	xhci = hcd_to_xhci(hcd);
 	vdev = xhci->devs[udev->slot_id];
@@ -2567,15 +2743,6 @@
 		vdev->eps[ep_index].ep_state &= ~EP_GETTING_NO_STREAMS;
 		vdev->eps[ep_index].ep_state &= ~EP_HAS_STREAMS;
 	}
-
-	if (!xhci->hcc_params1 || ((xhci->hcc_params1 & 0xff) == 0x30)) {
-		temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x40c0);
-		temp = (temp & 0xffffff00) | 0x0e;
-		xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x40c0);
-		temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x40cc);
-		temp = (temp & 0xffffff00) | 0xc0;
-		xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x40cc);
-	}
 	spin_unlock_irqrestore(&xhci->lock, flags);
 
 	return 0;
@@ -2753,6 +2920,36 @@
 	return ret;
 }
 
+static void etxhci_pre_free_dev(struct usb_hcd *hcd, struct usb_device *udev)
+{
+	struct xhci_hcd *xhci = hcd_to_xhci(hcd);
+	struct xhci_bus_state *bus_state;
+	__le32 __iomem **port_array;
+	unsigned long flags;
+	u32 temp;
+
+	spin_lock_irqsave(&xhci->lock, flags);
+
+	port_array = xhci->usb2_ports;
+	temp = xhci_readl(xhci, port_array[udev->portnum - 1]);
+	bus_state = &xhci->bus_state[hcd_index(hcd->shared_hcd)];
+
+	if (hcd->speed != HCD_USB3) {
+		if (bus_state->downgraded_ports & (1 << (udev->portnum - 1))) {
+			if (!(temp & PORT_CONNECT)) {
+				xhci_hub_power_port(hcd->shared_hcd, udev->portnum, true);
+				bus_state->downgraded_ports &= ~(1 << (udev->portnum - 1));
+			}
+			bus_state->downgraded_open |= 1 << (udev->portnum - 1);
+		}
+	}
+
+	if (bus_state->downgraded_ports & (1 << (udev->portnum - 1)))
+		bus_state->downgraded_ports &= ~(1 << (udev->portnum - 1));
+
+	spin_unlock_irqrestore(&xhci->lock, flags);
+}
+
 /*
  * At this point, the struct usb_device is about to go away, the device has
  * disconnected, and all traffic has been stopped and the endpoints have been
@@ -2775,6 +2972,10 @@
 
 	virt_dev = xhci->devs[udev->slot_id];
 
+#ifndef MY_ABC_HERE
+	etxhci_pre_free_dev(hcd, udev);
+#endif
+
 	/* Stop any wayward timer functions (which may grab the lock) */
 	for (i = 0; i < 31; ++i) {
 		virt_dev->eps[i].ep_state &= ~EP_HALT_PENDING;
@@ -2862,6 +3063,39 @@
 	return 1;
 }
 
+static void etxhci_post_address_device(struct usb_hcd *hcd, struct usb_device *udev)
+{
+	struct xhci_hcd	*xhci = hcd_to_xhci(hcd);
+	__le32 __iomem **port_array;
+	struct xhci_bus_state *bus_state;
+	unsigned long flags;
+	u32 ecount;
+
+	spin_lock_irqsave(&xhci->lock, flags);
+
+	if (!(udev->parent && !udev->parent->parent))
+		goto err_done;
+
+	if (udev->speed != USB_SPEED_SUPER)
+		goto err_done;
+
+	if (udev->state == USB_STATE_NOTATTACHED)
+		goto err_done;
+
+	bus_state = &xhci->bus_state[hcd_index(hcd)];
+	port_array = xhci->usb3_ports;
+
+	ecount = xhci_readl(xhci, port_array[udev->portnum - 1] + 2);
+
+	if (ecount == 0xffffffff)
+		goto err_done;
+
+	if ((ecount & 0xffff) > 0)
+		bus_state->downgraded_ports |= 1 << (udev->portnum - 1);
+
+err_done:
+	spin_unlock_irqrestore(&xhci->lock, flags);
+}
 /*
  * Issue an Address Device command (which will issue a SetAddress request to
  * the device).
@@ -3000,7 +3234,7 @@
 	xhci_zero_in_ctx(xhci, virt_dev);
 
 	xhci_dbg(xhci, "Internal device address = %d\n", virt_dev->address);
-
+	etxhci_post_address_device(hcd, udev);
 	return 0;
 }
 
@@ -3145,11 +3379,17 @@
 		return 0;
 	}
 
-	get_quirks(dev, xhci);
-
 	xhci->cap_regs = hcd->regs;
 	xhci->op_regs = hcd->regs +
 		HC_LENGTH(xhci_readl(xhci, &xhci->cap_regs->hc_capbase));
+
+	/* Make sure the HC is halted. */
+	retval = etxhci_halt(xhci);
+	if (retval)
+		goto error;
+
+	get_quirks(dev, xhci);
+
 	xhci->run_regs = hcd->regs +
 		(xhci_readl(xhci, &xhci->cap_regs->run_regs_off) & RTSOFF_MASK);
 	/* Cache read-only capability registers */
@@ -3161,11 +3401,6 @@
 	xhci->hcc_params = xhci_readl(xhci, &xhci->cap_regs->hcc_params);
 	etxhci_print_registers(xhci);
 
-	/* Make sure the HC is halted. */
-	retval = etxhci_halt(xhci);
-	if (retval)
-		goto error;
-
 	xhci_dbg(xhci, "Resetting HCD\n");
 	/* Reset the internal HC memory state and registers. */
 	retval = etxhci_reset(xhci);
@@ -3193,6 +3428,80 @@
 	return retval;
 }
 
+int etxhci_update_uas_device(struct usb_hcd *hcd, struct usb_device *udev,
+		int type)
+{
+	struct xhci_hcd *xhci = hcd_to_xhci(hcd);
+	unsigned long flags;
+	u32 temp;
+
+#define UAS_PROBE	0
+#define UAS_DISCONNECT	1
+#define UAS_PREV_RESET	2
+#define UAS_POST_RESET	3
+
+	spin_lock_irqsave(&xhci->lock, flags);
+	switch (type) {
+	case UAS_PROBE:
+		if (!xhci->hcc_params1 || ((xhci->hcc_params1 & 0xff) == 0x30)) {
+			temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x40c0);
+			temp = (temp & 0xffffff00) | 0x00;
+			xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x40c0);
+			temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x40cc);
+			temp = (temp & 0xffffff00) | 0xc2;
+			xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x40cc);
+		}
+		break;
+	case UAS_DISCONNECT:
+		if (!xhci->hcc_params1 || ((xhci->hcc_params1 & 0xff) == 0x30)) {
+			temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x40c0);
+			temp = (temp & 0xffffff00) | 0x0e;
+			xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x40c0);
+			temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x40cc);
+			temp = (temp & 0xffffff00) | 0xc0;
+			xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x40cc);
+		}
+		break;
+	case UAS_PREV_RESET:
+		if (!xhci->hcc_params1 || ((xhci->hcc_params1 & 0xff) == 0x30)) {
+			temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x4060);
+			temp |= 0x01;
+			xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x4060);
+		}
+
+		if (!xhci->hcc_params1 || ((xhci->hcc_params1 & 0xff) == 0x40)) {
+			temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x4210);
+			temp |= 0x01;
+			xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x4210);
+			temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x4250);
+			temp |= 0x01;
+			xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x4250);
+		}
+		break;
+	case UAS_POST_RESET:
+		if (!xhci->hcc_params1 || ((xhci->hcc_params1 & 0xff) == 0x30)) {
+			temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x4060);
+			temp &= ~0x01;
+			xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x4060);
+		}
+
+		if (!xhci->hcc_params1 || ((xhci->hcc_params1 & 0xff) == 0x40)) {
+			temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x4210);
+			temp &= ~0x01;
+			xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x4210);
+			temp = xhci_readl(xhci, (void __iomem *)xhci->cap_regs + 0x4250);
+			temp &= ~0x01;
+			xhci_writel(xhci, temp, (void __iomem *)xhci->cap_regs + 0x4250);
+		}
+		break;
+	default:
+		break;
+	}
+	spin_unlock_irqrestore(&xhci->lock, flags);
+
+	return 0;
+}
+
 MODULE_DESCRIPTION(DRIVER_DESC);
 MODULE_AUTHOR(DRIVER_AUTHOR);
 MODULE_LICENSE("GPL");
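
etxhci_update_uas_device() above is only exported through the host-controller ops (see the .update_uas_device entry added in etxhci-pci.c further down); no caller appears in this diff. The fragment below is a sketch of how a UAS/usb-storage side might drive it around a device reset. The helper name, the call site and the use of usb_reset_device() are assumptions made for illustration; only the type codes mirror the ones defined inside etxhci_update_uas_device().

#include <linux/usb.h>
#include <linux/usb/hcd.h>

#define UAS_PREV_RESET	2
#define UAS_POST_RESET	3

/* Hypothetical caller, for illustration only. */
static int uas_reset_device_with_quirk(struct usb_device *udev)
{
	struct usb_hcd *hcd = bus_to_hcd(udev->bus);
	int ret;

	/* Let the Etron controller prepare its vendor registers ... */
	if (hcd->driver->update_uas_device)
		hcd->driver->update_uas_device(hcd, udev, UAS_PREV_RESET);

	ret = usb_reset_device(udev);

	/* ... and restore them once the reset has completed. */
	if (hcd->driver->update_uas_device)
		hcd->driver->update_uas_device(hcd, udev, UAS_POST_RESET);

	return ret;
}
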
diff -ur a/drivers/usb/host/etxhci-dbg.c b/drivers/usb/host/etxhci-dbg.c
--- a/drivers/usb/host/etxhci-dbg.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/etxhci-dbg.c	2014-02-17 11:57:11.000000000 +0100
@@ -589,3 +589,63 @@
 			i, le64_to_cpu(stream_info->stream_ctx_array[i].stream_ring));
 	}
 }
+
+
+
+
+void etxhci_print_trbs(struct xhci_hcd *xhci,
+    struct xhci_segment *seg,
+    union xhci_trb *trb,
+    int num_trbs)
+{
+	int i;
+
+	for (i = 0; i < num_trbs; i++) {
+		if ((trb->generic.field[3] & TRB_TYPE_BITMASK) == TRB_TYPE(TRB_LINK)) {
+			printk("@%016llx %08x %08x %08x %08x\n",
+				(unsigned long long)etxhci_trb_virt_to_dma(seg, trb),
+				le32_to_cpu(trb->generic.field[0]),
+				le32_to_cpu(trb->generic.field[1]),
+				le32_to_cpu(trb->generic.field[2]),
+				le32_to_cpu(trb->generic.field[3]));
+			seg = seg->next;
+			trb = seg->trbs;
+		}
+
+		printk("@%016llx %08x %08x %08x %08x\n",
+			(unsigned long long)etxhci_trb_virt_to_dma(seg, trb),
+			le32_to_cpu(trb->generic.field[0]),
+			le32_to_cpu(trb->generic.field[1]),
+			le32_to_cpu(trb->generic.field[2]),
+			le32_to_cpu(trb->generic.field[3]));
+		trb++;
+	}
+}
+
+void etxhci_print_segment(struct xhci_hcd *xhci, struct xhci_segment *seg)
+{
+	int i;
+	union xhci_trb *trb = seg->trbs;
+
+	for (i = 0; i < TRBS_PER_SEGMENT; ++i) {
+		trb = &seg->trbs[i];
+		printk("@%016llx %08x %08x %08x %08x\n",
+			(unsigned long long)etxhci_trb_virt_to_dma(seg, trb),
+			le32_to_cpu(trb->generic.field[0]),
+			le32_to_cpu(trb->generic.field[1]),
+			le32_to_cpu(trb->generic.field[2]),
+			le32_to_cpu(trb->generic.field[3]));
+	}
+}
+
+void etxhci_print_ring(struct xhci_hcd *xhci, struct xhci_ring *ring)
+{
+	struct xhci_segment *seg;
+	struct xhci_segment *first_seg = ring->first_seg;
+
+	etxhci_print_segment(xhci, first_seg);
+	for (seg = first_seg->next; seg != first_seg; seg = seg->next) {
+		etxhci_print_segment(xhci, seg);
+	}
+}
+
diff -ur a/drivers/usb/host/etxhci-ej168v0.0660.c b/drivers/usb/host/etxhci-ej168v0.0660.c
--- a/drivers/usb/host/etxhci-ej168v0.0660.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/etxhci-ej168v0.0660.c	2014-02-17 11:57:11.000000000 +0100
@@ -91,9 +91,10 @@
 
 void xhci_init_ej168_v00660(struct xhci_hcd *xhci)
 {
-	int i;
+	int i, error_flag = 0;
 	struct usb_hcd *hcd = xhci_to_hcd(xhci);
 	struct pci_dev *pdev = to_pci_dev(hcd->self.controller);
+	u8 reg8 = 0;
 
 	for (i = 0; i < CFG_S1_ITEMS_V00660; i++) {
 		pci_write_config_byte(pdev, cfg_s1_items_v00660[i].offset,
@@ -110,7 +111,25 @@
 
 	for (i = 0; i < MMIO_ITEMS_V00660; i++) {
 		xhci_writeb(xhci, mmio_items_v00660[i].value,
-			hcd->regs + mmio_items_v00660[i].offset);
+			mmio_items_v00660[i].offset);
+	}
+
+	for (i = 0; i < MMIO_ITEMS_V00660; i++) {
+		if ((0x1811 != mmio_items_v00660[i].offset) && (0 == error_flag)) {
+			reg8 = xhci_readb(xhci, mmio_items_v00660[i].offset);
+			if (reg8 != (u8)mmio_items_v00660[i].value)
+				error_flag = 1;
+		}
+	}
+
+	if (error_flag) {
+		for (i = 0; i < MMIO_ITEMS_V00660; i++) {
+			if (0x1811 != mmio_items_v00660[i].offset) {
+				reg8 = xhci_readb(xhci, mmio_items_v00660[i].offset);
+				xhci_err(xhci, "%s - @%04x %02x\n",
+					__func__, mmio_items_v00660[i].offset, reg8);
+			}
+		}
 	}
 }
 
diff -ur a/drivers/usb/host/etxhci-ej188v0.01.00.900.c b/drivers/usb/host/etxhci-ej188v0.01.00.900.c
--- a/drivers/usb/host/etxhci-ej188v0.01.00.900.c	2013-08-24 11:36:50.000000000 +0200
+++ b/drivers/usb/host/etxhci-ej188v0.01.00.900.c	2014-02-17 11:57:11.000000000 +0100
@@ -135,9 +135,7 @@
 	{0x44e4, 0x0d},
 	{0x4213, 0x01},
 	{0x4253, 0x01},
-	{0x4280, 0x0e},
 	{0x4281, 0x01},
-	{0x42c0, 0x0e},
 	{0x42c1, 0x01},
 	{0x4286, 0x20},
 	{0x4287, 0x01},
@@ -183,9 +181,10 @@
 
 void xhci_init_ej188_v00100900(struct xhci_hcd *xhci)
 {
-	int i;
+	int i, error_flag = 0;
 	struct usb_hcd *hcd = xhci_to_hcd(xhci);
 	struct pci_dev *pdev = to_pci_dev(hcd->self.controller);
+	u8 reg8 = 0;
 
 	for (i = 0; i < CFG_ITEMS_V00100900; i++) {
 		pci_write_config_byte(pdev, cfg_items_v00100900[i].offset,
@@ -194,7 +193,25 @@
 
 	for (i = 0; i < MMIO_ITEMS_V00100900; i++) {
 		xhci_writeb(xhci, mmio_items_v00100900[i].value,
-			hcd->regs + mmio_items_v00100900[i].offset);
+			mmio_items_v00100900[i].offset);
+	}
+
+	for (i = 0; i < MMIO_ITEMS_V00100900; i++) {
+		if ((0x1811 != mmio_items_v00100900[i].offset) && (0 == error_flag)) {
+			reg8 = xhci_readb(xhci, mmio_items_v00100900[i].offset);
+			if (reg8 != (u8)mmio_items_v00100900[i].value)
+				error_flag = 1;
+		}
+	}
+
+	if (error_flag) {
+		for (i = 0; i < MMIO_ITEMS_V00100900; i++) {
+			if (0x1811 != mmio_items_v00100900[i].offset) {
+				reg8 = xhci_readb(xhci, mmio_items_v00100900[i].offset);
+				xhci_err(xhci, "%s - @%04x %02x\n",
+					__func__, mmio_items_v00100900[i].offset, reg8);
+			}
+		}
 	}
 }
 
diff -ur a/drivers/usb/host/etxhci-ejxxx.c b/drivers/usb/host/etxhci-ejxxx.c
--- a/drivers/usb/host/etxhci-ejxxx.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/etxhci-ejxxx.c	2014-02-17 11:57:11.000000000 +0100
@@ -38,9 +38,19 @@
 		reg32 = xhci_readl(xhci, hcd->regs + 0x40c0);
 		reg32 = (reg32 & 0xffff00ff) | 0x0100;
 		xhci_writel(xhci, reg32, hcd->regs + 0x40c0);
+		reg32 = xhci_readl(xhci, hcd->regs + 0x40d4);
+		reg32 = (reg32 & 0xfffffffe) | 0x01;
+		xhci_writel(xhci, reg32, hcd->regs + 0x40d4);
 		break;
 	case 0x40:
 		xhci_init_ej188_v00100900(xhci);
+
+		reg32 = xhci_readl(xhci, hcd->regs + 0x4294);
+		reg32 = (reg32 & 0xfffffffe) | 0x01;
+		xhci_writel(xhci, reg32, hcd->regs + 0x4294);
+		reg32 = xhci_readl(xhci, hcd->regs + 0x42d4);
+		reg32 = (reg32 & 0xfffffffe) | 0x01;
+		xhci_writel(xhci, reg32, hcd->regs + 0x42d4);
 		break;
 	default:
 		break;
diff -ur a/drivers/usb/host/etxhci.h b/drivers/usb/host/etxhci.h
--- a/drivers/usb/host/etxhci.h	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/etxhci.h	2014-02-17 11:57:11.000000000 +0100
@@ -277,6 +277,7 @@
 #define PORT_PLS_MASK	(0xf << 5)
 #define XDEV_U0		(0x0 << 5)
 #define XDEV_U3		(0x3 << 5)
+#define XDEV_INACTIVE	(0x6 << 5)
 #define XDEV_RESUME	(0xf << 5)
 /* true: port has power (see HCC_PPC) */
 #define PORT_POWER	(1 << 9)
@@ -611,7 +612,7 @@
  * 4 - TRB error
  * 5-7 - reserved
  */
-#define EP_STATE_MASK		(0xf)
+#define EP_STATE_MASK		(0x7)
 #define EP_STATE_DISABLED	0
 #define EP_STATE_RUNNING	1
 #define EP_STATE_HALTED		2
@@ -1163,6 +1164,7 @@
 	unsigned int		num_trbs_free;
 	unsigned int		num_trbs_free_temp;
 	enum xhci_ring_type	type;
+	bool			last_td_was_short;
 	struct radix_tree_root *trb_address_map;
 };
 
@@ -1231,8 +1233,11 @@
 	/* Port suspend arrays are indexed by the portnum of the fake roothub */
 	/* ports suspend status arrays - max 31 ports for USB2, 15 for USB3 */
 	u32			port_c_suspend;
+	u32			port_c_connection;
 	u32			suspended_ports;
 	u32			port_remote_wakeup;
+	u32			downgraded_ports;
+	u32			downgraded_open;
 	unsigned long		resume_done[USB_MAXCHILDREN];
 	/* which ports have started to resume */
 	unsigned long		resuming_ports;
@@ -1338,6 +1343,7 @@
 	unsigned int		quirks;
 #define	XHCI_LINK_TRB_QUIRK	(1 << 0)
 #define XHCI_RESET_EP_QUIRK	(1 << 1)
+#define XHCI_SPURIOUS_SUCCESS	(1 << 4)
 #define XHCI_BROKEN_MSI		(1 << 6)
 #define XHCI_RESET_ON_RESUME	(1 << 7)
 #define XHCI_TRUST_TX_LENGTH	(1 << 10)
@@ -1388,16 +1394,6 @@
 
 /* TODO: copied from ehci.h - can be refactored? */
 /* xHCI spec says all registers are little endian */
-static inline u8 xhci_readb(const struct xhci_hcd *xhci,
-		__le32 __iomem *regs)
-{
-	return readb(regs);
-}
-static inline void xhci_writeb(struct xhci_hcd *xhci,
-		const u8 val, __le32 __iomem *regs)
-{
-	writeb(val, regs);
-}
 static inline unsigned int xhci_readl(const struct xhci_hcd *xhci,
 		__le32 __iomem *regs)
 {
@@ -1408,6 +1404,31 @@
 {
 	writel(val, regs);
 }
+static inline u8 xhci_readb(struct xhci_hcd *xhci,
+		unsigned int offset)
+{
+	struct usb_hcd *hcd = xhci_to_hcd(xhci);
+	unsigned int temp;
+
+	temp = xhci_readl(xhci, hcd->regs + (offset & 0xfffc));
+	temp = 0x0ff & (temp >> (8 * (offset & 3)));
+
+	return temp;
+}
+static inline void xhci_writeb(struct xhci_hcd *xhci,
+		unsigned int val, unsigned int offset)
+{
+	struct usb_hcd *hcd = xhci_to_hcd(xhci);
+	unsigned int mask, temp;
+
+	mask = 0x0ff;
+	temp = xhci_readl(xhci, hcd->regs + (offset & 0xfffc));
+
+	temp &= ~(mask << (8 * (offset & 3)));
+	temp |= (val & mask) << (8 * (offset & 3));
+
+	xhci_writel(xhci, temp, hcd->regs + (offset & 0xfffc));
+}
 
 /*
  * Registers should always be accessed with double word or quad word accesses.
@@ -1462,6 +1483,12 @@
 		struct xhci_virt_ep *ep);
 void etxhci_dbg_stream_info(struct xhci_hcd *xhci,
 		unsigned int ep_index, struct xhci_stream_info *stream_info);
+void etxhci_print_trbs(struct xhci_hcd *xhci,
+    struct xhci_segment *seg,
+    union xhci_trb *trb,
+    int num_trbs);
+void etxhci_print_segment(struct xhci_hcd *xhci, struct xhci_segment *seg);
+void etxhci_print_ring(struct xhci_hcd *xhci, struct xhci_ring *ring);
 
 /* xHCI memory management */
 void etxhci_mem_cleanup(struct xhci_hcd *xhci);
@@ -1566,10 +1593,13 @@
 int etxhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status);
 int etxhci_add_endpoint(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint *ep);
 int etxhci_drop_endpoint(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint *ep);
+void etxhci_stop_endpoint(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint *ep);
 void etxhci_endpoint_reset(struct usb_hcd *hcd, struct usb_host_endpoint *ep);
 int etxhci_discover_or_reset_device(struct usb_hcd *hcd, struct usb_device *udev);
 int etxhci_check_bandwidth(struct usb_hcd *hcd, struct usb_device *udev);
 void etxhci_reset_bandwidth(struct usb_hcd *hcd, struct usb_device *udev);
+int etxhci_update_uas_device(struct usb_hcd *hcd, struct usb_device *udev,
+		int type);
 
 /* xHCI ring, segment, TRB, and TD functions */
 dma_addr_t etxhci_trb_virt_to_dma(struct xhci_segment *seg, union xhci_trb *trb);
@@ -1638,6 +1668,9 @@
 int etxhci_find_slot_id_by_port(struct usb_hcd *hcd, struct xhci_hcd *xhci,
 		u16 port);
 void etxhci_ring_device(struct xhci_hcd *xhci, int slot_id);
+void xhci_hub_power_port(struct usb_hcd *hcd, int port, bool onoff);
+int xhci_downgrade_to_usb2(struct usb_hcd *hcd, struct usb_device *udev);
+bool xhci_is_mass_storage_device(struct xhci_hcd *xhci, int slot_id);
 
 /* xHCI contexts */
 struct xhci_input_control_ctx *etxhci_get_input_control_ctx(struct xhci_hcd *xhci, struct xhci_container_ctx *ctx);
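
The replacement xhci_readb()/xhci_writeb() above no longer issue byte-wide MMIO; they emulate byte access with an aligned 32-bit read-modify-write. Below is a minimal user-space sketch of the same byte-lane arithmetic; the mmio[] array and main() stand in for the register window and are purely illustrative.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for the 32-bit register window; illustrative only. */
static uint32_t mmio[0x20];

static uint8_t demo_readb(unsigned int offset)
{
	uint32_t word = mmio[(offset & ~3u) / 4];

	/* Pick the byte lane selected by the low two offset bits. */
	return (word >> (8 * (offset & 3))) & 0xff;
}

static void demo_writeb(uint8_t val, unsigned int offset)
{
	uint32_t word = mmio[(offset & ~3u) / 4];
	unsigned int shift = 8 * (offset & 3);

	/* Clear the target byte lane, then merge the new value in. */
	word &= ~(0xffu << shift);
	word |= (uint32_t)val << shift;
	mmio[(offset & ~3u) / 4] = word;
}

int main(void)
{
	demo_writeb(0x5a, 0x11);		/* byte 1 of the word at 0x10 */
	printf("%02x\n", demo_readb(0x11));	/* prints 5a */
	return 0;
}
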
diff -ur a/drivers/usb/host/etxhci-hub.c b/drivers/usb/host/etxhci-hub.c
--- a/drivers/usb/host/etxhci-hub.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/etxhci-hub.c	2014-02-17 11:57:11.000000000 +0100
@@ -530,6 +530,9 @@
 	u16 link_state = 0;
 	u16 wake_mask = 0;
 
+#ifndef MY_ABC_HERE
+printk("%s - %04x %04x %04x %04x\n", __func__, typeReq, wValue, wIndex, wLength);
+#endif
 	max_ports = xhci_get_ports(hcd, &port_array);
 	bus_state = &xhci->bus_state[hcd_index(hcd)];
 
@@ -667,6 +670,8 @@
 		}
 		if (bus_state->port_c_suspend & (1 << wIndex))
 			status |= 1 << USB_PORT_FEAT_C_SUSPEND;
+		if (bus_state->port_c_connection & (1 << wIndex))
+			status |= USB_PORT_STAT_C_CONNECTION << 16;
 		xhci_dbg(xhci, "Get port status returned 0x%x\n", status);
 		put_unaligned(cpu_to_le32(status), (__le32 *) buf);
 		break;
@@ -854,11 +859,19 @@
 			}
 			etxhci_ring_device(xhci, slot_id);
 			break;
+		case USB_PORT_FEAT_C_CONNECTION:
+			if (bus_state->port_c_connection & (1 << wIndex)) {
+				bus_state->port_c_connection &= ~(1 << wIndex);
+				bus_state->resume_done[wIndex] = 0;
+			} else {
+				xhci_clear_port_change_bit(xhci, wValue, wIndex,
+						port_array[wIndex], temp);
+			}
+			break;
 		case USB_PORT_FEAT_C_SUSPEND:
 			bus_state->port_c_suspend &= ~(1 << wIndex);
 		case USB_PORT_FEAT_C_RESET:
 		case USB_PORT_FEAT_C_BH_PORT_RESET:
-		case USB_PORT_FEAT_C_CONNECTION:
 		case USB_PORT_FEAT_C_OVER_CURRENT:
 		case USB_PORT_FEAT_C_ENABLE:
 		case USB_PORT_FEAT_C_PORT_LINK_STATE:
@@ -926,6 +939,7 @@
 			break;
 		}
 		if ((temp & mask) != 0 ||
+			(bus_state->port_c_connection & 1 << i) ||
 			(bus_state->port_c_suspend & 1 << i) ||
 			(bus_state->resume_done[i] && time_after_eq(
 			    jiffies, bus_state->resume_done[i]))) {
@@ -937,6 +951,105 @@
 	return status ? retval : 0;
 }
 
+void xhci_hub_power_port(struct usb_hcd *hcd,
+		int port, bool onoff)
+{
+	struct xhci_hcd	*xhci = hcd_to_xhci(hcd);
+	__le32 __iomem **port_array;
+	int max_ports;
+	u32 temp;
+
+	max_ports = xhci_get_ports(hcd, &port_array);
+
+	if (!port || port > max_ports)
+		return;
+
+	port--;
+	temp = xhci_readl(xhci, port_array[port]);
+	if (temp == 0xffffffff)
+		return;
+
+	temp = etxhci_port_state_to_neutral(temp);
+	if (onoff)
+		temp |= PORT_POWER;
+	else
+		temp &= ~PORT_POWER;
+
+	xhci_writel(xhci, temp, port_array[port]);
+	temp = xhci_readl(xhci, port_array[port]);
+	xhci_dbg(xhci, "power %s port, actual port %p status  = 0x%x\n",
+			(onoff) ? "on" : "off", port_array[port], temp);
+}
+
+int xhci_downgrade_to_usb2(struct usb_hcd *hcd,
+		struct usb_device *udev)
+{
+	struct xhci_hcd	*xhci = hcd_to_xhci(hcd);
+	__le32 __iomem **port_array;
+	struct xhci_bus_state *bus_state;
+	unsigned long flags;
+	int max_ports, slot_id, ret = -ENODEV;
+	u32 portsc;
+
+	if (!(udev->parent && !udev->parent->parent))
+		goto err_done;
+
+	if (udev->speed != USB_SPEED_SUPER)
+		goto err_done;
+
+	if (udev->state == USB_STATE_NOTATTACHED)
+		goto err_done;
+
+	slot_id = etxhci_find_slot_id_by_port(hcd, xhci,
+					udev->portnum);
+
+	if (!xhci_is_mass_storage_device(xhci, slot_id))
+		goto err_done;
+
+	bus_state = &xhci->bus_state[hcd_index(hcd)];
+	max_ports = xhci_get_ports(hcd, &port_array);
+	if (udev->portnum > max_ports)
+		goto err_done;
+
+	if (udev->descriptor.idVendor == cpu_to_le16(0x1759) &&
+		udev->descriptor.idProduct == cpu_to_le16(0x5100))
+		goto err_done;
+
+	spin_lock_irqsave(&xhci->lock, flags);
+
+	portsc = xhci_readl(xhci, port_array[udev->portnum - 1]);
+	if (portsc == 0xffffffff) {
+		spin_unlock_irqrestore(&xhci->lock, flags);
+		goto err_done;
+	}
+
+	if (!(portsc & 0x080) &&
+		(bus_state->downgraded_ports & (1 << (udev->portnum - 1)))) {
+		xhci_hub_power_port(hcd, udev->portnum, false);
+		xhci_hub_power_port(hcd->shared_hcd, udev->portnum, false);
+
+		spin_unlock_irqrestore(&xhci->lock, flags);
+
+		msleep(500);
+
+		spin_lock_irqsave(&xhci->lock, flags);
+		xhci_hub_power_port(hcd->shared_hcd, udev->portnum, true);
+		bus_state->port_c_connection |= 1 << (udev->portnum - 1);
+		ret = 0;
+
+		spin_unlock_irqrestore(&xhci->lock, flags);
+		msleep(500);
+		usb_hcd_poll_rh_status(hcd);
+
+		spin_lock_irqsave(&xhci->lock, flags);
+	}
+
+	spin_unlock_irqrestore(&xhci->lock, flags);
+
+err_done:
+	return ret;
+}
+
 #ifdef CONFIG_PM
 
 int etxhci_bus_suspend(struct usb_hcd *hcd)
diff -ur a/drivers/usb/host/etxhci-mem.c b/drivers/usb/host/etxhci-mem.c
--- a/drivers/usb/host/etxhci-mem.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/etxhci-mem.c	2014-02-17 11:57:11.000000000 +0100
@@ -1196,6 +1196,8 @@
 static unsigned int xhci_parse_microframe_interval(struct usb_device *udev,
 		struct usb_host_endpoint *ep)
 {
+	if (ep->desc.bInterval == 0)
+		return 0;
 	return xhci_microframes_to_exponent(udev, ep,
 			ep->desc.bInterval, 0, 15);
 }
diff -ur a/drivers/usb/host/etxhci-pci.c b/drivers/usb/host/etxhci-pci.c
--- a/drivers/usb/host/etxhci-pci.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/etxhci-pci.c	2014-02-17 11:57:11.000000000 +0100
@@ -35,7 +35,7 @@
 unsigned short xhci_vendor = 0;
 #endif
 
-static const char hcd_name[] = "etxhci_hcd_130207";
+static const char hcd_name[] = "etxhci_hcd_130927d1";
 
 /* called after powerup, by probe or system-pm "wakeup" */
 static int xhci_pci_reinit(struct xhci_hcd *xhci, struct pci_dev *pdev)
@@ -78,6 +78,7 @@
 			hcd->chip_id = HCD_CHIP_ID_ETRON_EJ188;
 
 		xhci_dbg(xhci, "Etron chip ID %02x\n", hcd->chip_id);
+		xhci->quirks |= XHCI_SPURIOUS_SUCCESS;
 		xhci->quirks |= XHCI_HUB_INFO_QUIRK;
 		xhci->quirks |= XHCI_RESET_ON_RESUME;
 		xhci_dbg(xhci, "QUIRK: Resetting on resume\n");
@@ -244,12 +245,14 @@
 	.free_streams =		etxhci_free_streams,
 	.add_endpoint =		etxhci_add_endpoint,
 	.drop_endpoint =	etxhci_drop_endpoint,
+	.stop_endpoint =	etxhci_stop_endpoint,
 	.endpoint_reset =	etxhci_endpoint_reset,
 	.check_bandwidth =	etxhci_check_bandwidth,
 	.reset_bandwidth =	etxhci_reset_bandwidth,
 	.address_device =	etxhci_address_device,
 	.update_hub_device =	etxhci_update_hub_device,
 	.reset_device =		etxhci_discover_or_reset_device,
+	.update_uas_device = etxhci_update_uas_device,
 
 	/*
 	 * scheduling support
diff -ur a/drivers/usb/host/etxhci-ring.c b/drivers/usb/host/etxhci-ring.c
--- a/drivers/usb/host/etxhci-ring.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/etxhci-ring.c	2014-02-17 11:57:11.000000000 +0100
@@ -66,6 +66,7 @@
 
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
+#include <linux/dma-mapping.h>
 #include "etxhci.h"
 
 static int handle_cmd_in_cmd_wait_list(struct xhci_hcd *xhci,
@@ -592,8 +593,13 @@
 	    (trb->field[3] & cpu_to_le32(LINK_TOGGLE)))
 		state->new_cycle_state ^= 0x1;
 	next_trb(xhci, ep_ring, &state->new_deq_seg, &state->new_deq_ptr);
-	if (last_trb(xhci, ep_ring, state->new_deq_seg, state->new_deq_ptr))
+	if (last_trb(xhci, ep_ring, state->new_deq_seg, state->new_deq_ptr)) {
+		trb = &state->new_deq_ptr->generic;
+		if (TRB_TYPE_LINK_LE32(trb->field[3]) &&
+			(trb->field[3] & cpu_to_le32(LINK_TOGGLE)))
+			state->new_cycle_state ^= 0x1;
 		next_trb(xhci, ep_ring, &state->new_deq_seg, &state->new_deq_ptr);
+	}
 
 	/*
 	 * If there is only one segment in a ring, find_trb_seg()'s while loop
@@ -760,24 +766,29 @@
 	struct xhci_td *last_unlinked_td;
 
 	struct xhci_dequeue_state deq_state;
+#ifndef MY_ABC_HERE
+printk("%s\n", __func__);
+#endif
+	slot_id = TRB_TO_SLOT_ID(le32_to_cpu(trb->generic.field[3]));
+	virt_dev = xhci->devs[slot_id];
+	if (!virt_dev) {
+		xhci_warn(xhci, "Stop endpoint command "
+			"completion for disabled slot %u\n",
+			slot_id);
+		return;
+	}
 
-	if (unlikely(TRB_TO_SUSPEND_PORT(
-			     le32_to_cpu(xhci->cmd_ring->dequeue->generic.field[3])))) {
-		slot_id = TRB_TO_SLOT_ID(
-			le32_to_cpu(xhci->cmd_ring->dequeue->generic.field[3]));
-		virt_dev = xhci->devs[slot_id];
-		if (virt_dev)
-			handle_cmd_in_cmd_wait_list(xhci, virt_dev,
-				event);
-		else
-			xhci_warn(xhci, "Stop endpoint command "
-				"completion for disabled slot %u\n",
-				slot_id);
+	if (TRB_TO_SUSPEND_PORT(le32_to_cpu(trb->generic.field[3]))) {
+		handle_cmd_in_cmd_wait_list(xhci, virt_dev, event);
+		return;
+	}
+
+	if (!list_empty(&virt_dev->cmd_list) &&
+		handle_cmd_in_cmd_wait_list(xhci, virt_dev, event)) {
 		return;
 	}
 
 	memset(&deq_state, 0, sizeof(deq_state));
-	slot_id = TRB_TO_SLOT_ID(le32_to_cpu(trb->generic.field[3]));
 	ep_index = TRB_TO_EP_INDEX(le32_to_cpu(trb->generic.field[3]));
 	ep = &xhci->devs[slot_id]->eps[ep_index];
 
@@ -1183,6 +1194,9 @@
 static void xhci_complete_cmd_in_cmd_wait_list(struct xhci_hcd *xhci,
 		struct xhci_command *command, u32 status)
 {
+#ifndef MY_ABC_HERE
+printk("%s\n", __func__);
+#endif
 	command->status = status;
 	list_del(&command->cmd_list);
 	if (command->completion)
@@ -1202,6 +1216,9 @@
 {
 	struct xhci_command *command;
 
+#ifndef MY_ABC_HERE
+printk("%s\n", __func__);
+#endif
 	if (list_empty(&virt_dev->cmd_list))
 		return 0;
 
@@ -1236,6 +1253,17 @@
 	cur_seg = find_trb_seg(xhci->cmd_ring->first_seg,
 			xhci->cmd_ring->dequeue, &cycle_state);
 
+	if (!cur_seg) {
+		xhci_warn(xhci, "Command ring mismatch, dequeue = %p %llx (dma)\n",
+				xhci->cmd_ring->dequeue,
+				(unsigned long long)
+				etxhci_trb_virt_to_dma(xhci->cmd_ring->deq_seg,
+					xhci->cmd_ring->dequeue));
+		etxhci_debug_ring(xhci, xhci->cmd_ring);
+		etxhci_dbg_ring_ptrs(xhci, xhci->cmd_ring);
+		return;
+	}
+
 	/* find the command trb matched by cd from command ring */
 	for (cmd_trb = xhci->cmd_ring->dequeue;
 			cmd_trb != xhci->cmd_ring->enqueue;
@@ -1487,6 +1515,65 @@
 	inc_deq(xhci, xhci->cmd_ring);
 }
 
+bool xhci_is_mass_storage_device(struct xhci_hcd *xhci,
+		int slot_id)
+{
+	int i;
+	struct xhci_virt_device *virt_dev;
+	struct usb_device *udev;
+	struct usb_interface_cache *intfc;
+	struct usb_host_interface *alts;
+	bool status = false;
+
+	virt_dev = xhci->devs[slot_id];
+	if (!virt_dev || !virt_dev->udev)
+		goto err_done;
+
+	udev = virt_dev->udev;
+	if (!(udev->parent && !udev->parent->parent))
+		goto err_done;
+
+	if (!udev->config)
+		goto err_done;
+
+	intfc = udev->config[0].intf_cache[0];
+	for (i = 0; i < intfc->num_altsetting; i++) {
+		alts = &intfc->altsetting[i];
+		if (alts->desc.bInterfaceClass == USB_CLASS_MASS_STORAGE) {
+			status = true;
+			break;
+		}
+	}
+
+err_done:
+	return status;
+}
+
+static void xhci_giveback_error_urb(struct xhci_hcd *xhci,
+		int slot_id)
+{
+	struct xhci_virt_device *virt_dev;
+	int i;
+
+	virt_dev = xhci->devs[slot_id];
+	for (i = LAST_EP_INDEX; i > 0; i--) {
+		struct xhci_virt_ep *ep = &virt_dev->eps[i];
+		struct xhci_ring *ring = ep->ring;
+		if (!ring)
+			continue;
+
+		if (!list_empty(&ring->td_list)) {
+			struct xhci_td *cur_td = list_first_entry(&ring->td_list,
+				struct xhci_td,
+				td_list);
+			list_del_init(&cur_td->td_list);
+			if (!list_empty(&cur_td->cancelled_td_list))
+				list_del_init(&cur_td->cancelled_td_list);
+			xhci_giveback_urb_in_irq(xhci, cur_td, -EPROTO, "killed");
+		}
+	}
+}
+
 /* @port_id: the one-based port ID from the hardware (indexed from array of all
  * port registers -- USB 3.0 and USB 2.0).
  *
@@ -1550,7 +1637,7 @@
 {
 	struct usb_hcd *hcd;
 	u32 port_id;
-	u32 temp, temp1;
+	u32 temp, temp1, mask;
 	int max_ports;
 	int slot_id;
 	unsigned int faked_port_index;
@@ -1614,6 +1701,9 @@
 			port_id);
 
 	temp = xhci_readl(xhci, port_array[faked_port_index]);
+#ifndef MY_ABC_HERE
+printk("%s - port %p status %08x\n", __func__, port_array[faked_port_index], temp);
+#endif
 	if (hcd->state == HC_STATE_SUSPENDED) {
 		xhci_dbg(xhci, "resume root hub\n");
 		usb_hcd_resume_root_hub(hcd);
@@ -1653,6 +1743,40 @@
 				  bus_state->resume_done[faked_port_index]);
 			/* Do the rest in GetPortStatus */
 		}
+	} else if ((temp & PORT_PLC) && (temp & PORT_PLS_MASK) == XDEV_INACTIVE) {
+		slot_id = etxhci_find_slot_id_by_port(hcd, xhci,
+			faked_port_index + 1);
+		if (xhci_is_mass_storage_device(xhci, slot_id)) {
+			bogus_port_status = true;
+			temp = etxhci_port_state_to_neutral(temp);
+			temp |= (PORT_WR | PORT_PLC | PORT_CSC);
+			xhci_writel(xhci, temp, port_array[faked_port_index]);
+			goto cleanup;
+		}
+	}
+
+	if (temp & PORT_WRC) {
+		slot_id = etxhci_find_slot_id_by_port(hcd, xhci,
+			faked_port_index + 1);
+		if (xhci_is_mass_storage_device(xhci, slot_id)) {
+			bogus_port_status = true;
+			temp1 = etxhci_port_state_to_neutral(temp);
+			temp1 |= (PORT_WRC | PORT_RC | PORT_PLC | PORT_CSC);
+			xhci_writel(xhci, temp1, port_array[faked_port_index]);
+			mask = PORT_PE | PORT_CONNECT;
+			if ((temp & mask) == mask) {
+				bus_state->port_c_connection &= ~(1 << faked_port_index);
+				bus_state->resume_done[faked_port_index] = 0;
+				xhci_giveback_error_urb(xhci, slot_id);
+			} else {
+				bus_state->port_c_connection |= 1 << faked_port_index;
+				bus_state->resume_done[faked_port_index] = jiffies +
+					msecs_to_jiffies(5000);
+				mod_timer(&hcd->rh_timer,
+					bus_state->resume_done[faked_port_index]);
+			}
+			goto cleanup;
+		}
 	}
 
 	if ((temp & PORT_PLC) && (temp & PORT_PLS_MASK) == XDEV_U0 &&
@@ -1669,7 +1793,7 @@
 				faked_port_index + 1);
 		if (slot_id && xhci->devs[slot_id])
 			etxhci_ring_device(xhci, slot_id);
-		if (bus_state->port_remote_wakeup && (1 << faked_port_index)) {
+		if (bus_state->port_remote_wakeup & (1 << faked_port_index)) {
 			bus_state->port_remote_wakeup &=
 				~(1 << faked_port_index);
 			etxhci_test_and_clear_bit(xhci, port_array,
@@ -1681,9 +1805,16 @@
 		}
 	}
 
-	if (hcd->speed != HCD_USB3)
+	if (hcd->speed != HCD_USB3) {
+		bus_state = &xhci->bus_state[hcd_index(hcd->shared_hcd)];
+		if ((bus_state->downgraded_open & (1 << faked_port_index))) {
+			xhci_hub_power_port(hcd->shared_hcd, faked_port_index + 1, true);
+			bus_state->downgraded_ports &= ~(1 << faked_port_index);
+			bus_state->downgraded_open &= ~(1 << faked_port_index);
+		}
 		etxhci_test_and_clear_bit(xhci, port_array, faked_port_index,
 					PORT_PLC);
+	}
 
 cleanup:
 	/* Update event ring dequeue pointer before dropping the lock */
@@ -2474,6 +2605,16 @@
 		if (!event_seg) {
 			if (!ep->skip ||
 			    !usb_endpoint_xfer_isoc(&td->urb->ep->desc)) {
+				/* Some host controllers give a spurious
+				 * successful event after a short transfer.
+				 * Ignore it.
+				 */
+				if ((xhci->quirks & XHCI_SPURIOUS_SUCCESS) &&
+						ep_ring->last_td_was_short) {
+					ep_ring->last_td_was_short = false;
+					ret = 0;
+					goto cleanup;
+				}
 				/* HC is busted, give up! */
 				xhci_err(xhci,
 					"ERROR Transfer event TRB DMA ptr not "
@@ -2484,6 +2625,10 @@
 			ret = skip_isoc_td(xhci, td, event, ep, &status);
 			goto cleanup;
 		}
+		if (trb_comp_code == COMP_SHORT_TX)
+			ep_ring->last_td_was_short = true;
+		else
+			ep_ring->last_td_was_short = false;
 
 		if (ep->skip) {
 			xhci_dbg(xhci, "Found td. Clear skip flag.\n");
@@ -2538,6 +2683,8 @@
 				(trb_comp_code != COMP_STALL &&
 					trb_comp_code != COMP_BABBLE))
 				etxhci_urb_free_priv(xhci, urb_priv);
+			else
+				kfree(urb_priv);
 
 			usb_hcd_unlink_urb_from_ep(bus_to_hcd(urb->dev->bus), urb);
 			if ((urb->actual_length != urb->transfer_buffer_length &&
@@ -2596,6 +2743,15 @@
 		return 0;
 	}
 
+#ifndef MY_ABC_HERE
+if (((le32_to_cpu(event->generic.field[3]) & TRB_TYPE_BITMASK) == TRB_TYPE(TRB_COMPLETION)) ||
+	((GET_COMP_CODE(le32_to_cpu(event->generic.field[2])) != COMP_SUCCESS) &&
+	(GET_COMP_CODE(le32_to_cpu(event->generic.field[2])) != COMP_SHORT_TX))) {
+	printk("%s\n", __func__);
+	etxhci_print_trbs(xhci, xhci->event_ring->deq_seg, xhci->event_ring->dequeue, 1);
+}
+#endif
+
 	/*
 	 * Barrier between reading the TRB_CYCLE (valid) flag above and any
 	 * speculative reads of the event's flags/data below.
@@ -2622,6 +2778,15 @@
 		break;
 	default:
 		xhci->error_bitmask |= 1 << 3;
+		xhci_err(xhci, "Unknown event TRB:\n");
+		xhci_err(xhci, "@%016llx %08x %08x %08x %08x\n",
+			 (unsigned long long) etxhci_trb_virt_to_dma(xhci->event_ring->deq_seg, event),
+			 le32_to_cpu(event->generic.field[0]),
+			 le32_to_cpu(event->generic.field[1]),
+			 le32_to_cpu(event->generic.field[2]),
+			 le32_to_cpu(event->generic.field[3]));
+		xhci_dbg(xhci, "Event ring:\n");
+		etxhci_debug_segment(xhci, xhci->event_ring->deq_seg);
 	}
 	/* Any of the above functions may drop and re-acquire the lock, so check
 	 * to make sure a watchdog timer didn't mark the host as non-responsive.
@@ -2741,33 +2906,22 @@
 
 irqreturn_t etxhci_msi_irq(int irq, struct usb_hcd *hcd)
 {
-#if (LINUX_VERSION_CODE > KERNEL_VERSION(3,2,0))
-	irqreturn_t ret;
-	unsigned long flags;
-
-	local_irq_save(flags);
-
-	ret = etxhci_irq(hcd);
-
-	local_irq_restore(flags);
-	return ret;
-#else
 	irqreturn_t ret;
-	struct xhci_hcd *xhci;
 	unsigned long flags;
 
-	local_irq_save(flags);
-
-	xhci = hcd_to_xhci(hcd);
+#ifdef HCD_FLAG_SAW_IRQ
+	struct xhci_hcd *xhci = hcd_to_xhci(hcd);
 	set_bit(HCD_FLAG_SAW_IRQ, &hcd->flags);
 	if (xhci->shared_hcd)
 		set_bit(HCD_FLAG_SAW_IRQ, &xhci->shared_hcd->flags);
+#endif
+
+	local_irq_save(flags);
 
 	ret = etxhci_irq(hcd);
 
 	local_irq_restore(flags);
 	return ret;
-#endif
 }
 
 /****		Endpoint Ring Operations	****/
@@ -2996,6 +3150,24 @@
 				urb->transfer_buffer_length);
 }
 
+static void requeue_first_trb(struct xhci_hcd *xhci, int slot_id,
+		unsigned int ep_index, unsigned int stream_id,
+		struct xhci_generic_trb *start_trb,
+		struct xhci_generic_trb *temp_trb)
+{
+	/*
+	 * Pass all the TRBs to the hardware at once and make sure this write
+	 * isn't reordered.
+	 */
+	wmb();
+	start_trb->field[0] = temp_trb->field[0];
+	start_trb->field[1] = temp_trb->field[1];
+	start_trb->field[2] = temp_trb->field[2];
+	start_trb->field[3] = temp_trb->field[3];
+
+	etxhci_ring_ep_doorbell(xhci, slot_id, ep_index, stream_id);
+}
+
 static void giveback_first_trb(struct xhci_hcd *xhci, int slot_id,
 		unsigned int ep_index, unsigned int stream_id, int start_cycle,
 		struct xhci_generic_trb *start_trb)
@@ -3069,11 +3241,11 @@
 }
 
 /*
- * For xHCI 1.0 host controllers, TD size is the number of packets remaining in
- * the TD (*not* including this TRB).
+ * For xHCI 1.0 host controllers, TD size is the number of max packet sized
+ * packets remaining in the TD (*not* including this TRB).
  *
  * Total TD packet count = total_packet_count =
- *     roundup(TD size in bytes / wMaxPacketSize)
+ *     DIV_ROUND_UP(TD size in bytes, wMaxPacketSize)
  *
  * Packets transferred up to and including this TRB = packets_transferred =
  *     rounddown(total bytes transferred including this TRB / wMaxPacketSize)
@@ -3081,25 +3253,27 @@
  * TD size = total_packet_count - packets_transferred
  *
  * It must fit in bits 21:17, so it can't be bigger than 31.
+ * The last TRB in a TD must have the TD size set to zero.
  */
-
 static u32 xhci_v1_0_td_remainder(int running_total, int trb_buff_len,
-		unsigned int total_packet_count, struct urb *urb)
+		unsigned int total_packet_count, struct urb *urb,
+		unsigned int num_trbs_left)
 {
 	int packets_transferred;
-	u32 remainder;
 
 	/* One TRB with a zero-length data packet. */
-	if (running_total == 0 && trb_buff_len == 0)
+	if (num_trbs_left == 0 || (running_total == 0 && trb_buff_len == 0))
 		return 0;
 
 	/* All the TRB queueing functions don't count the current TRB in
 	 * running_total.
 	 */
 	packets_transferred = (running_total + trb_buff_len) /
-		usb_endpoint_maxp(&urb->ep->desc);
-	remainder = total_packet_count - packets_transferred;
-	return (remainder > 31) ? 31 << 17 : remainder << 17;
+		GET_MAX_PACKET(usb_endpoint_maxp(&urb->ep->desc));
+
+	if ((total_packet_count - packets_transferred) > 31)
+		return 31 << 17;
+	return (total_packet_count - packets_transferred) << 17;
 }
 
 static int queue_bulk_sg_tx(struct xhci_hcd *xhci, gfp_t mem_flags,
@@ -3118,7 +3292,7 @@
 	bool more_trbs_coming;
 
 	struct xhci_generic_trb *start_trb;
-	int start_cycle;
+	struct xhci_generic_trb temp_trb;
 
 	ep_ring = xhci_urb_to_transfer_ring(xhci, urb);
 	if (!ep_ring)
@@ -3148,7 +3322,6 @@
 	 * state may change as we enqueue the other TRBs, so save it too.
 	 */
 	start_trb = &ep_ring->enqueue->generic;
-	start_cycle = ep_ring->cycle_state;
 
 	running_total = 0;
 	/*
@@ -3175,13 +3348,7 @@
 		u32 length_field = 0;
 		u32 remainder = 0;
 
-		/* Don't change the cycle bit of the first TRB until later */
-		if (first_trb) {
-			first_trb = false;
-			if (start_cycle == 0)
-				field |= 0x1;
-		} else
-			field |= ep_ring->cycle_state;
+		field |= ep_ring->cycle_state;
 
 		/* Chain all the TRBs together; clear the chain bit in the last
 		 * TRB to indicate it's the last TRB in the chain.
@@ -3207,15 +3374,9 @@
 		}
 
 		/* Set the TRB length, TD size, and interrupter fields. */
-		if (xhci->hci_version < 0x100) {
-			remainder = xhci_td_remainder(
-					urb->transfer_buffer_length -
-					running_total);
-		} else {
-			if (num_trbs > 1)
 				remainder = xhci_v1_0_td_remainder(running_total,
-						trb_buff_len, total_packet_count, urb);
-		}
+				trb_buff_len, total_packet_count, urb,
+				num_trbs - 1);
 		length_field = TRB_LEN(trb_buff_len) |
 			remainder |
 			TRB_INTR_TARGET(0);
@@ -3224,11 +3385,20 @@
 			more_trbs_coming = true;
 		else
 			more_trbs_coming = false;
-		queue_trb(xhci, ep_ring, more_trbs_coming,
-				lower_32_bits(addr),
-				upper_32_bits(addr),
-				length_field,
-				field | TRB_TYPE(TRB_NORMAL));
+		if (first_trb) {
+			first_trb = false;
+			temp_trb.field[0] = cpu_to_le32(lower_32_bits(addr));
+			temp_trb.field[1] = cpu_to_le32(upper_32_bits(addr));
+			temp_trb.field[2] = cpu_to_le32(length_field);
+			temp_trb.field[3] = cpu_to_le32(field | TRB_TYPE(TRB_NORMAL));
+			inc_enq(xhci, ep_ring, more_trbs_coming);
+		} else {
+			queue_trb(xhci, ep_ring, more_trbs_coming,
+					lower_32_bits(addr),
+					upper_32_bits(addr),
+					length_field,
+					field | TRB_TYPE(TRB_NORMAL));
+		}
 		--num_trbs;
 		running_total += trb_buff_len;
 
@@ -3256,8 +3426,8 @@
 	} while (running_total < urb->transfer_buffer_length);
 
 	check_trb_math(urb, num_trbs, running_total);
-	giveback_first_trb(xhci, slot_id, ep_index, urb->stream_id,
-			start_cycle, start_trb);
+	requeue_first_trb(xhci, slot_id, ep_index, urb->stream_id,
+			start_trb, &temp_trb);
 	return 0;
 }
 
@@ -3270,9 +3440,9 @@
 	struct xhci_td *td;
 	int num_trbs;
 	struct xhci_generic_trb *start_trb;
+	struct xhci_generic_trb temp_trb;
 	bool first_trb;
 	bool more_trbs_coming;
-	int start_cycle;
 	u32 field, length_field;
 
 	int running_total, trb_buff_len, ret;
@@ -3319,7 +3489,6 @@
 	 * state may change as we enqueue the other TRBs, so save it too.
 	 */
 	start_trb = &ep_ring->enqueue->generic;
-	start_cycle = ep_ring->cycle_state;
 
 	running_total = 0;
 	total_packet_count = DIV_ROUND_UP(urb->transfer_buffer_length,
@@ -3338,13 +3507,7 @@
 		u32 remainder = 0;
 		field = 0;
 
-		/* Don't change the cycle bit of the first TRB until later */
-		if (first_trb) {
-			first_trb = false;
-			if (start_cycle == 0)
-				field |= 0x1;
-		} else
-			field |= ep_ring->cycle_state;
+		field |= ep_ring->cycle_state;
 
 		/* Chain all the TRBs together; clear the chain bit in the last
 		 * TRB to indicate it's the last TRB in the chain.
@@ -3362,15 +3525,9 @@
 			field |= TRB_ISP;
 
 		/* Set the TRB length, TD size, and interrupter fields. */
-		if (xhci->hci_version < 0x100) {
-			remainder = xhci_td_remainder(
-					urb->transfer_buffer_length -
-					running_total);
-		} else {
-			if (num_trbs > 1)
 				remainder = xhci_v1_0_td_remainder(running_total,
-						trb_buff_len, total_packet_count, urb);
-		}
+				trb_buff_len, total_packet_count, urb,
+				num_trbs - 1);
 		length_field = TRB_LEN(trb_buff_len) |
 			remainder |
 			TRB_INTR_TARGET(0);
@@ -3379,11 +3536,20 @@
 			more_trbs_coming = true;
 		else
 			more_trbs_coming = false;
-		queue_trb(xhci, ep_ring, more_trbs_coming,
-				lower_32_bits(addr),
-				upper_32_bits(addr),
-				length_field,
-				field | TRB_TYPE(TRB_NORMAL));
+		if (first_trb) {
+			first_trb = false;
+			temp_trb.field[0] = cpu_to_le32(lower_32_bits(addr));
+			temp_trb.field[1] = cpu_to_le32(upper_32_bits(addr));
+			temp_trb.field[2] = cpu_to_le32(length_field);
+			temp_trb.field[3] = cpu_to_le32(field | TRB_TYPE(TRB_NORMAL));
+			inc_enq(xhci, ep_ring, more_trbs_coming);
+		} else {
+			queue_trb(xhci, ep_ring, more_trbs_coming,
+					lower_32_bits(addr),
+					upper_32_bits(addr),
+					length_field,
+					field | TRB_TYPE(TRB_NORMAL));
+		}
 		--num_trbs;
 		running_total += trb_buff_len;
 
@@ -3395,8 +3561,8 @@
 	} while (running_total < urb->transfer_buffer_length);
 
 	check_trb_math(urb, num_trbs, running_total);
-	giveback_first_trb(xhci, slot_id, ep_index, urb->stream_id,
-			start_cycle, start_trb);
+	requeue_first_trb(xhci, slot_id, ep_index, urb->stream_id,
+			start_trb, &temp_trb);
 	return 0;
 }
 
@@ -3633,7 +3799,8 @@
 		td_len = urb->iso_frame_desc[i].length;
 		td_remain_len = td_len;
 		total_packet_count = DIV_ROUND_UP(td_len,
-				usb_endpoint_maxp(&urb->ep->desc));
+				GET_MAX_PACKET(
+					usb_endpoint_maxp(&urb->ep->desc)));
 		/* A zero-length transfer still involves at least one packet. */
 		if (total_packet_count == 0)
 			total_packet_count++;
@@ -3655,9 +3822,11 @@
 		td = urb_priv->td[i];
 		for (j = 0; j < trbs_per_td; j++) {
 			u32 remainder = 0;
-			field = TRB_TBC(burst_count) | TRB_TLBPC(residue);
+			field = 0;
 
 			if (first_trb) {
+				field = TRB_TBC(burst_count) |
+					TRB_TLBPC(residue);
 				/* Queue the isoc TRB */
 				field |= TRB_TYPE(TRB_ISOC);
 				/* Assume URB_ISO_ASAP is set */
@@ -3703,15 +3872,10 @@
 				trb_buff_len = td_remain_len;
 
 			/* Set the TRB length, TD size, & interrupter fields. */
-			if (xhci->hci_version < 0x100) {
-				remainder = xhci_td_remainder(
-						td_len - running_total);
-			} else {
-				if (j < trbs_per_td - 1)
 					remainder = xhci_v1_0_td_remainder(
 							running_total, trb_buff_len,
-							total_packet_count, urb);
-			}
+					total_packet_count, urb,
+					(trbs_per_td - j - 1));
 			length_field = TRB_LEN(trb_buff_len) |
 				remainder |
 				TRB_INTR_TARGET(0);
@@ -3848,9 +4012,16 @@
 static int queue_command(struct xhci_hcd *xhci, u32 field1, u32 field2,
 		u32 field3, u32 field4, bool command_must_succeed)
 {
+#ifndef MY_ABC_HERE
+union xhci_trb *dbg_trb;
+struct xhci_segment *dbg_seg;
+#endif
 	int reserved_trbs = xhci->cmd_ring_reserved_trbs;
 	int ret;
 
+#ifndef MY_ABC_HERE
+printk("%s\n", __func__);
+#endif
 	if (!command_must_succeed)
 		reserved_trbs++;
 
@@ -3863,8 +4034,17 @@
 					"unfailable commands failed.\n");
 		return ret;
 	}
+
+#ifndef MY_ABC_HERE
+dbg_trb = xhci->cmd_ring->enqueue;
+dbg_seg = xhci->cmd_ring->enq_seg;
+#endif
+
 	queue_trb(xhci, xhci->cmd_ring, false, field1, field2, field3,
 			field4 | xhci->cmd_ring->cycle_state);
+#ifndef MY_ABC_HERE
+etxhci_print_trbs(xhci, dbg_seg, dbg_trb, 1);
+#endif
 	return 0;
 }
 
diff -ur a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig
--- a/drivers/usb/host/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/usb/host/Kconfig	2014-01-21 09:37:20.000000000 +0100
@@ -54,6 +54,11 @@
 	  To compile this driver as a module, choose M here: the
 	  module will be called xhci-hcd.
 
+config USB_XHCI_PLATFORM
+	tristate
+	depends on USB_XHCI_HCD && SYNO_COMCERTO
+	default y
+
 config USB_XHCI_HCD_DEBUGGING
 	bool "Debugging for the xHCI host controller"
 	depends on USB_XHCI_HCD
@@ -639,3 +644,22 @@
 	bool  "Enable SYNO_XHCI_RING_EXPANSION patch"
 	depends on USB && PCI
 	default n
+
+config USB_MARVELL_ERRATA_FE_9049667
+	bool "Marvell High speed detection WA"
+	depends on (ARCH_ARMADA370 || ARCH_ARMADA_XP) && USB
+	default n
+	help
+	  In the Armada 370 and Armada XP USB UTMI PHY there is an erratum
+	  which causes errors in the detection of high-speed devices. For
+	  certain devices with low pull-up values the USB MAC does not
+	  detect the end of the device chirp K signal and therefore remains
+	  stuck in the reset state.
+	  The workaround solves this issue by modifying the UTMI PHY
+	  squelch threshold once a high-speed port reset error is detected.
+	  Modifying the squelch level enables the MAC to detect the end of
+	  the device chirp K signal and to come out of reset. Once the MAC
+	  comes out of reset, a consecutive reset attempt is made by the
+	  USB stack. That reset attempt succeeds due to the updated squelch
+	  level. Since the optimal squelch level is device dependent, the
+	  workaround toggles between two verified squelch levels, 0xA and 0xE.
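
The help text above describes the workaround only in prose. The user-space sketch below illustrates the toggle between the two squelch levels it mentions; the utmi_set_squelch() stub and the way the reset-error condition is signalled are assumptions made purely for illustration, since the PHY register write itself is not part of this patch.

#include <stdio.h>

/* Illustrative stub: the real workaround writes the UTMI PHY squelch
 * threshold register, which is not shown here. */
static unsigned int current_squelch = 0xA;

static void utmi_set_squelch(unsigned int level)
{
	current_squelch = level;
}

/* On a high-speed port reset error, toggle between the two verified
 * squelch levels (0xA and 0xE) and let the USB stack retry the reset. */
static void marvell_hs_wa_on_reset_error(void)
{
	utmi_set_squelch(current_squelch == 0xA ? 0xE : 0xA);
}

int main(void)
{
	marvell_hs_wa_on_reset_error();
	printf("squelch now 0x%X\n", current_squelch);	/* 0xE */
	marvell_hs_wa_on_reset_error();
	printf("squelch now 0x%X\n", current_squelch);	/* 0xA */
	return 0;
}
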
diff -ur a/drivers/usb/host/Makefile b/drivers/usb/host/Makefile
--- a/drivers/usb/host/Makefile	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/usb/host/Makefile	2014-01-21 09:37:20.000000000 +0100
@@ -18,6 +18,10 @@
 etxhci-hcd-y += etxhci-ring.o etxhci-hub.o etxhci-dbg.o
 etxhci-hcd-y += etxhci-ejxxx.o
 
+ifneq ($(CONFIG_USB_XHCI_PLATFORM), )
+	xhci-hcd-y		+= xhci-plat.o xhci-comcerto2000.o
+endif
+
 obj-$(CONFIG_USB_WHCI_HCD)	+= whci/
 
 obj-$(CONFIG_PCI)		+= pci-quirks.o
diff -ur a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c
--- a/drivers/usb/host/xhci.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/xhci.c	2014-02-17 11:57:11.000000000 +0100
@@ -550,13 +550,21 @@
 	int ret;
 
 	/* return if using legacy interrupt */
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (xhci_to_hcd(xhci)->irq > 0)
+#else
 	if (xhci_to_hcd(xhci)->irq >= 0)
+#endif
 		return;
 
 	ret = xhci_free_msi(xhci);
 	if (!ret)
 		return;
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (pdev->irq > 0)
+#else
 	if (pdev->irq >= 0)
+#endif
 		free_irq(pdev->irq, xhci_to_hcd(xhci));
 
 	return;
@@ -667,7 +675,11 @@
 	/* unregister the legacy interrupt */
 	if (hcd->irq)
 		free_irq(hcd->irq, hcd);
+#if defined(CONFIG_SYNO_COMCERTO)
+	hcd->irq = 0;
+#else
 	hcd->irq = -1;
+#endif
 
 	ret = xhci_setup_msix(xhci);
 	if (ret)
@@ -675,7 +687,11 @@
 		ret = xhci_setup_msi(xhci);
 
 	if (!ret)
+#if defined(CONFIG_SYNO_COMCERTO)
+		/* hcd->irq is 0, we have MSI */
+#else
 		/* hcd->irq is -1, we have MSI */
+#endif
 		return 0;
 
 	if (!pdev->irq) {
@@ -1240,6 +1256,9 @@
 	 */
 	ring->cycle_state = 1;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	ring->num_trbs_free = ring->num_segs * (TRBS_PER_SEGMENT - 1) - 1;
+#endif
 	/*
 	 * Reset the hardware dequeue pointer.
 	 * Yes, this will need to be re-written after resume, but we're paranoid
@@ -1873,9 +1892,11 @@
 		goto done;
 	}
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 	xhci_dbg(xhci, "Cancel URB %p\n", urb);
 	xhci_dbg(xhci, "Event ring:\n");
 	xhci_debug_ring(xhci, xhci->event_ring);
+#endif
 	ep_index = xhci_get_endpoint_index(&urb->ep->desc);
 	ep = &xhci->devs[urb->dev->slot_id]->eps[ep_index];
 	ep_ring = xhci_urb_to_transfer_ring(xhci, urb);
@@ -1884,12 +1905,29 @@
 		goto done;
 	}
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 	xhci_dbg(xhci, "Endpoint ring:\n");
 	xhci_debug_ring(xhci, ep_ring);
+#endif
 
 	urb_priv = urb->hcpriv;
+#if defined(CONFIG_SYNO_COMCERTO)
+	i = urb_priv->td_cnt;
+	if (i < urb_priv->length)
+		xhci_dbg(xhci, "Cancel URB %p, dev %s, ep 0x%x, "
+				"starting at offset 0x%llx\n",
+				urb, urb->dev->devpath,
+				urb->ep->desc.bEndpointAddress,
+				(unsigned long long) xhci_trb_virt_to_dma(
+					urb_priv->td[i]->start_seg,
+					urb_priv->td[i]->first_trb));
+#endif
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	for (; i < urb_priv->length; i++) {
+#else
 	for (i = urb_priv->td_cnt; i < urb_priv->length; i++) {
+#endif
 		td = urb_priv->td[i];
 		list_add_tail(&td->cancelled_td_list, &ep->cancelled_td_list);
 	}
@@ -4185,6 +4223,41 @@
 	3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000};
 
 /* Calculate HIRD/BESL for USB2 PORTPMSC*/
+#if defined(CONFIG_SYNO_COMCERTO)
+static int xhci_calculate_hird_besl(struct xhci_hcd *xhci,
+					struct usb_device *udev)
+{
+	int u2del, besl, besl_host;
+	int besl_device = 0;
+	u32 field;
+
+	u2del = HCS_U2_LATENCY(xhci->hcs_params3);
+	field = le32_to_cpu(udev->bos->ext_cap->bmAttributes);
+
+	if (field & USB_BESL_SUPPORT) {
+		for (besl_host = 0; besl_host < 16; besl_host++) {
+			if (xhci_besl_encoding[besl_host] >= u2del)
+				break;
+		}
+		/* Use baseline BESL value as default */
+		if (field & USB_BESL_BASELINE_VALID)
+			besl_device = USB_GET_BESL_BASELINE(field);
+		else if (field & USB_BESL_DEEP_VALID)
+			besl_device = USB_GET_BESL_DEEP(field);
+	} else {
+		if (u2del <= 50)
+			besl_host = 0;
+		else
+			besl_host = (u2del - 51) / 75 + 1;
+	}
+
+	besl = besl_host + besl_device;
+	if (besl > 15)
+		besl = 15;
+
+	return besl;
+}
+#else
 static int xhci_calculate_hird_besl(int u2del, bool use_besl)
 {
 	int hird;
@@ -4206,6 +4279,7 @@
 
 	return hird;
 }
+#endif
 
 static int xhci_usb2_software_lpm_test(struct usb_hcd *hcd,
 					struct usb_device *udev)
@@ -4217,7 +4291,11 @@
 	u32		temp, dev_id;
 	unsigned int	port_num;
 	unsigned long	flags;
+#if defined(CONFIG_SYNO_COMCERTO)
+	int		hird;
+#else
 	int		u2del, hird;
+#endif
 	int		ret;
 
 	if (hcd->speed == HCD_USB3 || !xhci->sw_lpm_support ||
@@ -4263,11 +4341,15 @@
 	 * HIRD or BESL shoule be used. See USB2.0 LPM errata.
 	 */
 	pm_addr = port_array[port_num] + 1;
+#if defined(CONFIG_SYNO_COMCERTO)
+	hird = xhci_calculate_hird_besl(xhci, udev);
+#else
 	u2del = HCS_U2_LATENCY(xhci->hcs_params3);
 	if (le32_to_cpu(udev->bos->ext_cap->bmAttributes) & (1 << 2))
 		hird = xhci_calculate_hird_besl(u2del, 1);
 	else
 		hird = xhci_calculate_hird_besl(u2del, 0);
+#endif
 
 	temp = PORT_L1DS(udev->slot_id) | PORT_HIRD(hird);
 	xhci_writel(xhci, temp, pm_addr);
@@ -4347,7 +4429,11 @@
 	u32		temp;
 	unsigned int	port_num;
 	unsigned long	flags;
+#if defined(CONFIG_SYNO_COMCERTO)
+	int		hird;
+#else
 	int		u2del, hird;
+#endif
 
 	if (hcd->speed == HCD_USB3 || !xhci->hw_lpm_support ||
 			!udev->lpm_capable)
@@ -4370,11 +4456,15 @@
 	xhci_dbg(xhci, "%s port %d USB2 hardware LPM\n",
 			enable ? "enable" : "disable", port_num);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	hird = xhci_calculate_hird_besl(xhci, udev);
+#else
 	u2del = HCS_U2_LATENCY(xhci->hcs_params3);
 	if (le32_to_cpu(udev->bos->ext_cap->bmAttributes) & (1 << 2))
 		hird = xhci_calculate_hird_besl(u2del, 1);
 	else
 		hird = xhci_calculate_hird_besl(u2del, 0);
+#endif
 
 	if (enable) {
 		temp &= ~PORT_HIRD_MASK;
@@ -4631,6 +4721,13 @@
 		printk(KERN_DEBUG "Problem registering PCI driver.");
 		return retval;
 	}
+#if defined(CONFIG_SYNO_COMCERTO)
+	retval = xhci_register_plat();
+	if (retval < 0) {
+		printk(KERN_DEBUG "Problem registering platform driver.");
+		goto unreg_pci;
+	}
+#endif
 	/*
 	 * Check the compiler generated sizes of structures that must be laid
 	 * out in specific ways for hardware access.
@@ -4650,11 +4747,19 @@
 	BUILD_BUG_ON(sizeof(struct xhci_run_regs) != (8+8*128)*32/8);
 	BUILD_BUG_ON(sizeof(struct xhci_doorbell_array) != 256*32/8);
 	return 0;
+#if defined(CONFIG_SYNO_COMCERTO)
+unreg_pci:
+	xhci_unregister_pci();
+	return retval;
+#endif
 }
 module_init(xhci_hcd_init);
 
 static void __exit xhci_hcd_cleanup(void)
 {
 	xhci_unregister_pci();
+#if defined(CONFIG_SYNO_COMCERTO)
+	xhci_unregister_plat();
+#endif
 }
 module_exit(xhci_hcd_cleanup);
Nur in b/drivers/usb/host: xhci-comcerto2000.c.
Nur in b/drivers/usb/host: xhci-comcerto2000.h.
diff -ur a/drivers/usb/host/xhci-dbg.c b/drivers/usb/host/xhci-dbg.c
--- a/drivers/usb/host/xhci-dbg.c	2013-08-24 11:36:50.000000000 +0200
+++ b/drivers/usb/host/xhci-dbg.c	2014-02-17 11:57:11.000000000 +0100
@@ -119,7 +119,11 @@
 	xhci_dbg(xhci, "  Event Interrupts %s\n",
 			(temp & CMD_EIE) ? "enabled " : "disabled");
 	xhci_dbg(xhci, "  Host System Error Interrupts %s\n",
+#if defined(CONFIG_SYNO_COMCERTO)
+			(temp & CMD_HSEIE) ? "enabled " : "disabled");
+#else
 			(temp & CMD_EIE) ? "enabled " : "disabled");
+#endif
 	xhci_dbg(xhci, "  HC has %sfinished light reset\n",
 			(temp & CMD_LRESET) ? "not " : "");
 }
diff -ur a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h
--- a/drivers/usb/host/xhci.h	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/xhci.h	2014-02-17 11:57:11.000000000 +0100
@@ -1234,10 +1234,14 @@
 /* Allow two commands + a link TRB, along with any reserved command TRBs */
 #define MAX_RSVD_CMD_TRBS	(TRBS_PER_SEGMENT - 3)
 #define SEGMENT_SIZE		(TRBS_PER_SEGMENT*16)
+#if defined(CONFIG_SYNO_COMCERTO)
+#define SEGMENT_SHIFT		(__ffs(SEGMENT_SIZE))
+#else
 /* SEGMENT_SHIFT should be log2(SEGMENT_SIZE).
  * Change this if you change TRBS_PER_SEGMENT!
  */
 #define SEGMENT_SHIFT		10
+#endif
 /* TRB buffer pointers can't cross 64KB boundaries */
 #define TRB_MAX_BUFF_SHIFT		16
 #define TRB_MAX_BUFF_SIZE	(1 << TRB_MAX_BUFF_SHIFT)
@@ -1380,6 +1384,9 @@
 	/* ports suspend status arrays - max 31 ports for USB2, 15 for USB3 */
 	u32			port_c_suspend;
 	u32			suspended_ports;
+#if defined(CONFIG_SYNO_COMCERTO)
+	u32			port_remote_wakeup;
+#endif
 	unsigned long		resume_done[USB_MAXCHILDREN];
 };
 
@@ -1391,7 +1398,11 @@
 		return 1;
 }
 
+#if defined(CONFIG_SYNO_COMCERTO)
+/* There is one xhci_hcd structure per controller */
+#else
 /* There is one ehci_hci structure per controller */
+#endif
 struct xhci_hcd {
 	struct usb_hcd *main_hcd;
 	struct usb_hcd *shared_hcd;
@@ -1700,6 +1711,18 @@
 static inline void xhci_unregister_pci(void) {}
 #endif
 
+#if defined(CONFIG_SYNO_COMCERTO)
+#if defined(CONFIG_USB_XHCI_PLATFORM) || defined(CONFIG_USB_XHCI_PLATFORM_MODULE)
+int xhci_register_plat(void);
+void xhci_unregister_plat(void);
+#else
+static inline int xhci_register_plat(void)
+{ return 0; }
+static inline void xhci_unregister_plat(void)
+{ }
+#endif
+#endif
+
 /* xHCI host controller glue */
 typedef void (*xhci_get_quirks_t)(struct device *, struct xhci_hcd *);
 int handshake(struct xhci_hcd *xhci, void __iomem *ptr,
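
The xhci.h hunk above redefines SEGMENT_SHIFT as __ffs(SEGMENT_SIZE) so that it tracks TRBS_PER_SEGMENT automatically instead of relying on the hard-coded value 10. For a power of two, the index of the lowest set bit equals log2 of the value, which a small user-space check can confirm; the TRBS_PER_SEGMENT value of 64 below is assumed only for the example.

#include <assert.h>

/* User-space stand-in for __ffs() on a power of two: __builtin_ctz() also
 * returns the index of the lowest set bit. */
#define TRBS_PER_SEGMENT	64
#define SEGMENT_SIZE		(TRBS_PER_SEGMENT * 16)

int main(void)
{
	/* 64 * 16 = 1024, and ctz(1024) = 10 = the old hard-coded SEGMENT_SHIFT */
	assert(__builtin_ctz(SEGMENT_SIZE) == 10);
	return 0;
}
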
diff -ur a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c
--- a/drivers/usb/host/xhci-hub.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/xhci-hub.c	2014-02-17 11:57:11.000000000 +0100
@@ -57,10 +57,16 @@
 	desc->bHubContrCurrent = 0;
 
 	desc->bNbrPorts = ports;
+#if !defined(CONFIG_SYNO_COMCERTO)
 	/* Ugh, these should be #defines, FIXME */
 	/* Using table 11-13 in USB 2.0 spec. */
+#endif
 	temp = 0;
+#if defined(CONFIG_SYNO_COMCERTO)
+	/* Bits 1:0 - support per-port power switching, or power always on */
+#else
 	/* Bits 1:0 - support port power switching, or power always on */
+#endif
 	if (HCC_PPC(xhci->hcc_params))
 		temp |= 0x0001;
 	else
@@ -86,9 +92,17 @@
 	ports = xhci->num_usb2_ports;
 
 	xhci_common_hub_descriptor(xhci, desc, ports);
+#if defined(CONFIG_SYNO_COMCERTO)
+	desc->bDescriptorType = USB_DT_HUB;
+#else
 	desc->bDescriptorType = 0x29;
+#endif
 	temp = 1 + (ports / 8);
+#if defined(CONFIG_SYNO_COMCERTO)
+	desc->bDescLength = USB_DT_HUB_NONVAR_SIZE + 2 * temp;
+#else
 	desc->bDescLength = 7 + 2 * temp;
+#endif
 
 	/* The Device Removable bits are reported on a byte granularity.
 	 * If the port doesn't exist within that byte, the bit is set to 0.
@@ -137,8 +151,13 @@
 
 	ports = xhci->num_usb3_ports;
 	xhci_common_hub_descriptor(xhci, desc, ports);
+#if defined(CONFIG_SYNO_COMCERTO)
+	desc->bDescriptorType = USB_DT_SS_HUB;
+	desc->bDescLength = USB_DT_SS_HUB_SIZE;
+#else
 	desc->bDescriptorType = 0x2a;
 	desc->bDescLength = 12;
+#endif
 
 	/* header decode latency should be zero for roothubs,
 	 * see section 4.23.5.2.
@@ -593,6 +612,9 @@
 	int slot_id;
 	struct xhci_bus_state *bus_state;
 	u16 link_state = 0;
+#if defined(CONFIG_SYNO_COMCERTO)
+	u16 wake_mask = 0;
+#endif
 
 #ifdef MY_DEF_HERE
 	xhci_dbg(xhci, "xhci_hub_control.type:0x%x.wvalue:%d.\n", typeReq, wValue);
@@ -769,6 +791,10 @@
 	case SetPortFeature:
 		if (wValue == USB_PORT_FEAT_LINK_STATE)
 			link_state = (wIndex & 0xff00) >> 3;
+#if defined(CONFIG_SYNO_COMCERTO)
+		if (wValue == USB_PORT_FEAT_REMOTE_WAKE_MASK)
+			wake_mask = wIndex & 0xff00;
+#endif
 		wIndex &= 0xff;
 		if (!wIndex || wIndex > max_ports)
 			goto error;
@@ -1193,6 +1219,12 @@
 			t2 |= PORT_LINK_STROBE | XDEV_U3;
 			set_bit(port_index, &bus_state->bus_suspended);
 		}
+#if defined(CONFIG_SYNO_COMCERTO)
+		/* USB core sets remote wake mask for USB 3.0 hubs,
+		 * including the USB 3.0 roothub, but only if CONFIG_USB_SUSPEND
+		 * is enabled, so also enable remote wake here.
+		 */
+#endif
 		if (hcd->self.root_hub->do_remote_wakeup) {
 			if (t1 & PORT_CONNECT) {
 				t2 |= PORT_WKOC_E | PORT_WKDISC_E;
diff -ur a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c
--- a/drivers/usb/host/xhci-mem.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/xhci-mem.c	2014-02-17 11:57:11.000000000 +0100
@@ -44,15 +44,19 @@
 	seg = kzalloc(sizeof *seg, flags);
 	if (!seg)
 		return NULL;
+#if !defined(CONFIG_SYNO_COMCERTO)
 	xhci_dbg(xhci, "Allocating priv segment structure at %p\n", seg);
+#endif
 
 	seg->trbs = dma_pool_alloc(xhci->segment_pool, flags, &dma);
 	if (!seg->trbs) {
 		kfree(seg);
 		return NULL;
 	}
+#if !defined(CONFIG_SYNO_COMCERTO)
 	xhci_dbg(xhci, "// Allocating segment at %p (virtual) 0x%llx (DMA)\n",
 			seg->trbs, (unsigned long long)dma);
+#endif
 
 	memset(seg->trbs, 0, SEGMENT_SIZE);
 	/* If the cycle state is 0, set the cycle bit to 1 for all the TRBs */
@@ -69,12 +73,16 @@
 static void xhci_segment_free(struct xhci_hcd *xhci, struct xhci_segment *seg)
 {
 	if (seg->trbs) {
+#if !defined(CONFIG_SYNO_COMCERTO)
 		xhci_dbg(xhci, "Freeing DMA segment at %p (virtual) 0x%llx (DMA)\n",
 				seg->trbs, (unsigned long long)seg->dma);
+#endif
 		dma_pool_free(xhci->segment_pool, seg->trbs, seg->dma);
 		seg->trbs = NULL;
 	}
+#if !defined(CONFIG_SYNO_COMCERTO)
 	xhci_dbg(xhci, "Freeing priv segment structure at %p\n", seg);
+#endif
 	kfree(seg);
 }
 
@@ -123,9 +131,11 @@
 			val |= TRB_CHAIN;
 		prev->trbs[TRBS_PER_SEGMENT-1].link.control = cpu_to_le32(val);
 	}
+#if !defined(CONFIG_SYNO_COMCERTO)
 	xhci_dbg(xhci, "Linking segment 0x%llx to segment 0x%llx (DMA)\n",
 			(unsigned long long)prev->dma,
 			(unsigned long long)next->dma);
+#endif
 }
 
 /*
@@ -248,7 +258,9 @@
 	int ret;
 
 	ring = kzalloc(sizeof *(ring), flags);
+#if !defined(CONFIG_SYNO_COMCERTO)
 	xhci_dbg(xhci, "Allocating ring at %p\n", ring);
+#endif
 	if (!ring)
 		return NULL;
 
@@ -2289,7 +2301,11 @@
 	unsigned int	val, val2;
 	u64		val_64;
 	struct xhci_segment	*seg;
+#if defined(CONFIG_SYNO_COMCERTO)
+	u32 page_size, temp;
+#else
 	u32 page_size;
+#endif
 	int i;
 
 	page_size = xhci_readl(xhci, &xhci->op_regs->page_size);
@@ -2473,6 +2489,17 @@
 
 	INIT_LIST_HEAD(&xhci->lpm_failed_devs);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	/* Enable USB 3.0 device notifications for function remote wake, which
+	 * is necessary for allowing USB 3.0 devices to do remote wakeup from
+	 * U3 (device suspend).
+	 */
+	temp = xhci_readl(xhci, &xhci->op_regs->dev_notification);
+	temp &= ~DEV_NOTE_MASK;
+	temp |= DEV_NOTE_FWAKE;
+	xhci_writel(xhci, temp, &xhci->op_regs->dev_notification);
+#endif
+
 	return 0;
 
 fail:
diff -ur a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
--- a/drivers/usb/host/xhci-pci.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/xhci-pci.c	2014-02-17 11:57:11.000000000 +0100
@@ -358,7 +358,11 @@
 	return pci_register_driver(&xhci_pci_driver);
 }
 
+#if defined(CONFIG_SYNO_COMCERTO)
+void xhci_unregister_pci(void)
+#else
 void __exit xhci_unregister_pci(void)
+#endif
 {
 	pci_unregister_driver(&xhci_pci_driver);
 }
Only in b/drivers/usb/host: xhci-plat.c.
diff -ur a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c
--- a/drivers/usb/host/xhci-ring.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/host/xhci-ring.c	2014-02-17 11:57:11.000000000 +0100
@@ -662,12 +662,18 @@
 					cpu_to_le32(TRB_CYCLE);
 			cur_trb->generic.field[3] |= cpu_to_le32(
 				TRB_TYPE(TRB_TR_NOOP));
+#if defined(CONFIG_SYNO_COMCERTO)
+			xhci_dbg(xhci, "TRB to noop at offset 0x%llx\n",
+					(unsigned long long)
+					xhci_trb_virt_to_dma(cur_seg, cur_trb));
+#else
 			xhci_dbg(xhci, "Cancel TRB %p (0x%llx dma) "
 					"in seg %p (0x%llx dma)\n",
 					cur_trb,
 					(unsigned long long)xhci_trb_virt_to_dma(cur_seg, cur_trb),
 					cur_seg,
 					(unsigned long long)cur_seg->dma);
+#endif
 		}
 		if (cur_trb == cur_td->last_trb)
 			break;
@@ -807,9 +813,15 @@
 	 */
 	list_for_each(entry, &ep->cancelled_td_list) {
 		cur_td = list_entry(entry, struct xhci_td, cancelled_td_list);
+#if defined(CONFIG_SYNO_COMCERTO)
+		xhci_dbg(xhci, "Removing canceled TD starting at 0x%llx (dma).\n",
+				(unsigned long long)xhci_trb_virt_to_dma(
+					cur_td->start_seg, cur_td->first_trb));
+#else
 		xhci_dbg(xhci, "Cancelling TD starting at %p, 0x%llx (dma).\n",
 				cur_td->first_trb,
 				(unsigned long long)xhci_trb_virt_to_dma(cur_td->start_seg, cur_td->first_trb));
+#endif
 		ep_ring = xhci_urb_to_transfer_ring(xhci, cur_td->urb);
 		if (!ep_ring) {
 			/* This shouldn't happen unless a driver is mucking
@@ -1652,7 +1664,18 @@
 		}
 
 		if (DEV_SUPERSPEED(temp)) {
+#if defined(CONFIG_SYNO_COMCERTO)
+			xhci_dbg(xhci, "remote wake SS port %d\n", port_id);
+			/* Set a flag to say the port signaled remote wakeup,
+			 * so we can tell the difference between the end of
+			 * device and host initiated resume.
+			 */
+			bus_state->port_remote_wakeup |= 1 << faked_port_index;
+			xhci_test_and_clear_bit(xhci, port_array,
+					faked_port_index, PORT_PLC);
+#else
 			xhci_dbg(xhci, "resume SS port %d\n", port_id);
+#endif
 			xhci_set_link_state(xhci, port_array, faked_port_index,
 						XDEV_U0);
 			slot_id = xhci_find_slot_id_by_port(hcd, xhci,
@@ -1957,7 +1980,9 @@
 	ep_ctx = xhci_get_ep_ctx(xhci, xdev->out_ctx, ep_index);
 	trb_comp_code = GET_COMP_CODE(le32_to_cpu(event->transfer_len));
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 	xhci_debug_trb(xhci, xhci->event_ring->dequeue);
+#endif
 	switch (trb_comp_code) {
 	case COMP_SUCCESS:
 		if (event_trb == ep_ring->dequeue) {
@@ -2283,6 +2308,18 @@
 	xdev = xhci->devs[slot_id];
 	if (!xdev) {
 		xhci_err(xhci, "ERROR Transfer event pointed to bad slot\n");
+#if defined(CONFIG_SYNO_COMCERTO)
+		xhci_err(xhci, "@%016llx %08x %08x %08x %08x\n",
+			 (unsigned long long) xhci_trb_virt_to_dma(
+				 xhci->event_ring->deq_seg,
+				 xhci->event_ring->dequeue),
+			 lower_32_bits(le64_to_cpu(event->buffer)),
+			 upper_32_bits(le64_to_cpu(event->buffer)),
+			 le32_to_cpu(event->transfer_len),
+			 le32_to_cpu(event->flags));
+		xhci_dbg(xhci, "Event ring:\n");
+		xhci_debug_segment(xhci, xhci->event_ring->deq_seg);
+#endif
 		return -ENODEV;
 	}
 
@@ -2296,6 +2333,18 @@
 	    EP_STATE_DISABLED) {
 		xhci_err(xhci, "ERROR Transfer event for disabled endpoint "
 				"or incorrect stream ring\n");
+#if defined(CONFIG_SYNO_COMCERTO)
+		xhci_err(xhci, "@%016llx %08x %08x %08x %08x\n",
+			 (unsigned long long) xhci_trb_virt_to_dma(
+				 xhci->event_ring->deq_seg,
+				 xhci->event_ring->dequeue),
+			 lower_32_bits(le64_to_cpu(event->buffer)),
+			 upper_32_bits(le64_to_cpu(event->buffer)),
+			 le32_to_cpu(event->transfer_len),
+			 le32_to_cpu(event->flags));
+		xhci_dbg(xhci, "Event ring:\n");
+		xhci_debug_segment(xhci, xhci->event_ring->deq_seg);
+#endif
 		return -ENODEV;
 	}
 
@@ -2688,7 +2737,11 @@
 	/* FIXME when MSI-X is supported and there are multiple vectors */
 	/* Clear the MSI-X event interrupt status */
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (hcd->irq) {
+#else
 	if (hcd->irq != -1) {
+#endif
 		u32 irq_pending;
 		/* Acknowledge the PCI interrupt */
 		irq_pending = xhci_readl(xhci, &xhci->ir_set->irq_pending);
@@ -2740,6 +2793,9 @@
 
 irqreturn_t xhci_msi_irq(int irq, struct usb_hcd *hcd)
 {
+#if defined(CONFIG_SYNO_COMCERTO)
+	return xhci_irq(hcd);
+#else
 	irqreturn_t ret;
 	struct xhci_hcd *xhci;
 
@@ -2751,6 +2807,7 @@
 	ret = xhci_irq(hcd);
 
 	return ret;
+#endif
 }
 
 /****		Endpoint Ring Operations	****/
@@ -2867,11 +2924,13 @@
 			/* Toggle the cycle bit after the last ring segment. */
 			if (last_trb_on_last_seg(xhci, ring, ring->enq_seg, next)) {
 				ring->cycle_state = (ring->cycle_state ? 0 : 1);
+#if !defined(CONFIG_SYNO_COMCERTO)
 				if (!in_interrupt()) {
 					xhci_dbg(xhci, "queue_trb: Toggle cycle "
 						"state for ring %p = %i\n",
 						ring, (unsigned int)ring->cycle_state);
 				}
+#endif
 			}
 			ring->enq_seg = ring->enq_seg->next;
 			ring->enqueue = ring->enq_seg->trbs;
@@ -2942,10 +3001,14 @@
 	num_sgs = urb->num_mapped_sgs;
 	temp = urb->transfer_buffer_length;
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 	xhci_dbg(xhci, "count sg list trbs: \n");
+#endif
 	num_trbs = 0;
 	for_each_sg(urb->sg, sg, num_sgs, i) {
+#if !defined(CONFIG_SYNO_COMCERTO)
 		unsigned int previous_total_trbs = num_trbs;
+#endif
 		unsigned int len = sg_dma_len(sg);
 
 		/* Scatter gather list entries may cross 64KB boundaries */
@@ -2960,15 +3023,18 @@
 			num_trbs++;
 			running_total += TRB_MAX_BUFF_SIZE;
 		}
+#if !defined(CONFIG_SYNO_COMCERTO)
 		xhci_dbg(xhci, " sg #%d: dma = %#llx, len = %#x (%d), num_trbs = %d\n",
 				i, (unsigned long long)sg_dma_address(sg),
 				len, len, num_trbs - previous_total_trbs);
+#endif
 
 		len = min_t(int, len, temp);
 		temp -= len;
 		if (temp == 0)
 			break;
 	}
+#if !defined(CONFIG_SYNO_COMCERTO)
 	xhci_dbg(xhci, "\n");
 	if (!in_interrupt())
 		xhci_dbg(xhci, "ep %#x - urb len = %d, sglist used, "
@@ -2976,6 +3042,7 @@
 				urb->ep->desc.bEndpointAddress,
 				urb->transfer_buffer_length,
 				num_trbs);
+#endif
 	return num_trbs;
 }
 
@@ -3049,7 +3116,11 @@
 				urb->dev->speed == USB_SPEED_FULL)
 			urb->interval /= 8;
 	}
+#if defined(CONFIG_SYNO_COMCERTO)
+	return xhci_queue_bulk_tx(xhci, mem_flags, urb, slot_id, ep_index);
+#else
 	return xhci_queue_bulk_tx(xhci, GFP_ATOMIC, urb, slot_id, ep_index);
+#endif
 }
 
 /*
@@ -3164,8 +3235,10 @@
 	trb_buff_len = min_t(int, trb_buff_len, this_sg_len);
 	if (trb_buff_len > urb->transfer_buffer_length)
 		trb_buff_len = urb->transfer_buffer_length;
+#if !defined(CONFIG_SYNO_COMCERTO)
 	xhci_dbg(xhci, "First length to xfer from 1st sglist entry = %u\n",
 			trb_buff_len);
+#endif
 
 	first_trb = true;
 	/* Queue the first TRB, even if it's zero-length */
@@ -3197,11 +3270,13 @@
 		if (usb_urb_dir_in(urb))
 			field |= TRB_ISP;
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 		xhci_dbg(xhci, " sg entry: dma = %#x, len = %#x (%d), "
 				"64KB boundary at %#x, end dma = %#x\n",
 				(unsigned int) addr, trb_buff_len, trb_buff_len,
 				(unsigned int) (addr + TRB_MAX_BUFF_SIZE) & ~(TRB_MAX_BUFF_SIZE - 1),
 				(unsigned int) addr + trb_buff_len);
+#endif
 		if (TRB_MAX_BUFF_SIZE -
 				(addr & (TRB_MAX_BUFF_SIZE - 1)) < trb_buff_len) {
 			xhci_warn(xhci, "WARN: sg dma xfer crosses 64KB boundaries!\n");
@@ -3308,6 +3383,7 @@
 	}
 	/* FIXME: this doesn't deal with URB_ZERO_PACKET - need one more */
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 	if (!in_interrupt())
 		xhci_dbg(xhci, "ep %#x - urb len = %#x (%d), "
 				"addr = %#llx, num_trbs = %d\n",
@@ -3316,6 +3392,7 @@
 				urb->transfer_buffer_length,
 				(unsigned long long)urb->transfer_dma,
 				num_trbs);
+#endif
 
 	ret = prepare_transfer(xhci, xhci->devs[slot_id],
 			ep_index, urb->stream_id,
@@ -3438,9 +3515,11 @@
 	if (!urb->setup_packet)
 		return -EINVAL;
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 	if (!in_interrupt())
 		xhci_dbg(xhci, "Queueing ctrl tx for slot id %d, ep %d\n",
 				slot_id, ep_index);
+#endif
 	/* 1 TRB for setup, 1 for status */
 	num_trbs = 2;
 	/*
@@ -3632,6 +3711,7 @@
 		return -EINVAL;
 	}
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 	if (!in_interrupt())
 		xhci_dbg(xhci, "ep %#x - urb len = %#x (%d),"
 				" addr = %#llx, num_tds = %d\n",
@@ -3640,6 +3720,7 @@
 				urb->transfer_buffer_length,
 				(unsigned long long)urb->transfer_dma,
 				num_tds);
+#endif
 
 	start_addr = (u64) urb->transfer_dma;
 	start_trb = &ep_ring->enqueue->generic;
diff -ur a/drivers/usb/Kconfig b/drivers/usb/Kconfig
--- a/drivers/usb/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/usb/Kconfig	2014-01-21 09:37:20.000000000 +0100
@@ -181,4 +181,6 @@
 
 source "drivers/usb/otg/Kconfig"
 
+source "drivers/usb/dwc_otg/Kconfig"
+
 endif # USB_SUPPORT
diff -ur a/drivers/usb/Makefile b/drivers/usb/Makefile
--- a/drivers/usb/Makefile	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/usb/Makefile	2014-01-21 09:37:20.000000000 +0100
@@ -49,6 +49,10 @@
 obj-$(CONFIG_USB_ATM)		+= atm/
 obj-$(CONFIG_USB_SPEEDTOUCH)	+= atm/
 
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+obj-$(CONFIG_DWC_OTG)		+= dwc_otg/
+endif
+
 obj-$(CONFIG_USB_MUSB_HDRC)	+= musb/
 obj-$(CONFIG_USB_RENESAS_USBHS)	+= renesas_usbhs/
 obj-$(CONFIG_USB_OTG_UTILS)	+= otg/
diff -ur a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c
--- a/drivers/usb/storage/usb.c	2013-08-24 11:36:51.000000000 +0200
+++ b/drivers/usb/storage/usb.c	2014-02-17 11:57:12.000000000 +0100
@@ -58,6 +58,8 @@
 #include <linux/kthread.h>
 #include <linux/mutex.h>
 #include <linux/utsname.h>
+#include <linux/usb.h>
+#include <linux/usb/hcd.h>
 
 #include <scsi/scsi.h>
 #include <scsi/scsi_cmnd.h>
@@ -86,12 +88,6 @@
 module_param_string(quirks, quirks, sizeof(quirks), S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(quirks, "supplemental list of device IDs and their quirks");
 
-#if defined(CONFIG_USB_UAS) || defined(CONFIG_USB_UAS_MODULE)
-static unsigned int uas_check = 1;
-module_param(uas_check, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(uas_check, "check whether new device supports UAS protocol");
-#endif
-
 /*
  * The entries in this table correspond, line for line,
  * with the entries in usb_storage_usb_ids[], defined in usual-tables.c.
@@ -1055,18 +1051,11 @@
 #if defined(CONFIG_USB_UAS) || defined(CONFIG_USB_UAS_MODULE)
 static int is_uas_device(struct usb_interface *intf)
 {
-	int i;
-
-	for (i = 0; i < intf->num_altsetting; i++) {
-		struct usb_host_interface *alt = &intf->altsetting[i];
+	struct usb_device *udev = interface_to_usbdev(intf);
 
-		if (alt->desc.bInterfaceClass == USB_CLASS_MASS_STORAGE &&
-			alt->desc.bInterfaceSubClass == USB_SC_SCSI &&
-			alt->desc.bInterfaceProtocol == USB_PR_UAS)
-			return 0;
-	}
+#define USB_QUIRK_UAS_MODE		0x80000000
 
-	return -ENODEV;
+	return !!(udev->quirks & USB_QUIRK_UAS_MODE);
 }
 #endif
 
@@ -1078,7 +1067,7 @@
 	int result;
 
 #if defined(CONFIG_USB_UAS) || defined(CONFIG_USB_UAS_MODULE)
-	if (uas_check && !is_uas_device(intf))
+	if (is_uas_device(intf))
 		return -ENODEV;
 #endif
 
diff -ur a/drivers/vhost/net.c b/drivers/vhost/net.c
--- a/drivers/vhost/net.c	2013-08-24 11:36:59.000000000 +0200
+++ b/drivers/vhost/net.c	2014-02-17 11:57:30.000000000 +0100
@@ -857,9 +857,9 @@
 };
 
 static struct miscdevice vhost_net_misc = {
-	MISC_DYNAMIC_MINOR,
-	"vhost-net",
-	&vhost_net_fops,
+	.minor = VHOST_NET_MINOR,
+	.name = "vhost-net",
+	.fops = &vhost_net_fops,
 };
 
 static int vhost_net_init(void)
@@ -880,3 +880,5 @@
 MODULE_LICENSE("GPL v2");
 MODULE_AUTHOR("Michael S. Tsirkin");
 MODULE_DESCRIPTION("Host kernel accelerator for virtio net");
+MODULE_ALIAS_MISCDEV(VHOST_NET_MINOR);
+MODULE_ALIAS("devname:vhost-net");
diff -ur a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig
--- a/drivers/virtio/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/drivers/virtio/Kconfig	2014-01-21 09:37:24.000000000 +0100
@@ -46,4 +46,15 @@
 
  	 If unsure, say N.
 
+config VIRTIO_MMIO_CMDLINE_DEVICES
+	bool "Memory mapped virtio devices parameter parsing"
+	depends on VIRTIO_MMIO
+	---help---
+	 Allow virtio-mmio devices instantiation via the kernel command line
+	 or module parameters. Be aware that using incorrect parameters (base
+	 address in particular) can crash your system - you have been warned.
+	 See Documentation/kernel-parameters.txt for details.
+
+	 If unsure, say 'N'.
+
 endmenu
diff -ur a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c
--- a/drivers/virtio/virtio_balloon.c	2013-08-24 11:36:59.000000000 +0200
+++ b/drivers/virtio/virtio_balloon.c	2014-02-17 11:57:30.000000000 +0100
@@ -87,7 +87,7 @@
 	init_completion(&vb->acked);
 
 	/* We should always be able to add one buffer to an empty queue. */
-	if (virtqueue_add_buf(vq, &sg, 1, 0, vb) < 0)
+	if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
 		BUG();
 	virtqueue_kick(vq);
 
@@ -220,7 +220,7 @@
 
 	vq = vb->stats_vq;
 	sg_init_one(&sg, vb->stats, sizeof(vb->stats));
-	if (virtqueue_add_buf(vq, &sg, 1, 0, vb) < 0)
+	if (virtqueue_add_buf(vq, &sg, 1, 0, vb, GFP_KERNEL) < 0)
 		BUG();
 	virtqueue_kick(vq);
 }
@@ -313,7 +313,8 @@
 		 * use it to signal us later.
 		 */
 		sg_init_one(&sg, vb->stats, sizeof vb->stats);
-		if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb) < 0)
+		if (virtqueue_add_buf(vb->stats_vq, &sg, 1, 0, vb, GFP_KERNEL)
+		    < 0)
 			BUG();
 		virtqueue_kick(vb->stats_vq);
 	}
diff -ur a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c
--- a/drivers/virtio/virtio.c	2013-08-24 11:36:59.000000000 +0200
+++ b/drivers/virtio/virtio.c	2014-02-17 11:57:30.000000000 +0100
@@ -140,8 +140,11 @@
 	err = drv->probe(dev);
 	if (err)
 		add_status(dev, VIRTIO_CONFIG_S_FAILED);
-	else
+	else {
 		add_status(dev, VIRTIO_CONFIG_S_DRIVER_OK);
+		if (drv->scan)
+			drv->scan(dev);
+	}
 
 	return err;
 }
diff -ur a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c
--- a/drivers/virtio/virtio_mmio.c	2013-08-24 11:36:59.000000000 +0200
+++ b/drivers/virtio/virtio_mmio.c	2014-02-17 11:57:30.000000000 +0100
@@ -6,6 +6,50 @@
  * This module allows virtio devices to be used over a virtual, memory mapped
  * platform device.
  *
+ * The guest device(s) may be instantiated in one of three equivalent ways:
+ *
+ * 1. Static platform device in board's code, eg.:
+ *
+ *	static struct platform_device v2m_virtio_device = {
+ *		.name = "virtio-mmio",
+ *		.id = -1,
+ *		.num_resources = 2,
+ *		.resource = (struct resource []) {
+ *			{
+ *				.start = 0x1001e000,
+ *				.end = 0x1001e0ff,
+ *				.flags = IORESOURCE_MEM,
+ *			}, {
+ *				.start = 42 + 32,
+ *				.end = 42 + 32,
+ *				.flags = IORESOURCE_IRQ,
+ *			},
+ *		}
+ *	};
+ *
+ * 2. Device Tree node, eg.:
+ *
+ *		virtio_block@1e000 {
+ *			compatible = "virtio,mmio";
+ *			reg = <0x1e000 0x100>;
+ *			interrupts = <42>;
+ *		}
+ *
+ * 3. Kernel module (or command line) parameter. Can be used more than once -
+ *    one device will be created for each one. Syntax:
+ *
+ *		[virtio_mmio.]device=<size>@<baseaddr>:<irq>[:<id>]
+ *    where:
+ *		<size>     := size (can use standard suffixes like K, M or G)
+ *		<baseaddr> := physical base address
+ *		<irq>      := interrupt number (as passed to request_irq())
+ *		<id>       := (optional) platform device id
+ *    eg.:
+ *		virtio_mmio.device=0x100@0x100b0000:48 \
+ *				virtio_mmio.device=1K@0x1001e000:74
+ *
+ *
+ *
  * Registers layout (all 32-bit wide):
  *
  * offset d. name             description
@@ -42,6 +86,8 @@
  * See the COPYING file in the top-level directory.
  */
 
+#define pr_fmt(fmt) "virtio-mmio: " fmt
+
 #include <linux/highmem.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
@@ -310,8 +356,8 @@
 			vm_dev->base + VIRTIO_MMIO_QUEUE_PFN);
 
 	/* Create the vring */
-	vq = vring_new_virtqueue(info->num, VIRTIO_MMIO_VRING_ALIGN,
-				 vdev, info->queue, vm_notify, callback, name);
+	vq = vring_new_virtqueue(info->num, VIRTIO_MMIO_VRING_ALIGN, vdev,
+				 true, info->queue, vm_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
 		goto error_new_virtqueue;
@@ -443,6 +489,130 @@
 
 
 
+/* Devices list parameter */
+
+#if defined(CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES)
+
+static struct device vm_cmdline_parent = {
+	.init_name = "virtio-mmio-cmdline",
+};
+
+static int vm_cmdline_parent_registered;
+static int vm_cmdline_id;
+
+static int vm_cmdline_set(const char *device,
+		const struct kernel_param *kp)
+{
+	int err;
+	struct resource resources[2] = {};
+	char *str;
+	long long int base, size;
+	unsigned int irq;
+	int processed, consumed = 0;
+	struct platform_device *pdev;
+
+	/* Consume "size" part of the command line parameter */
+	size = memparse(device, &str);
+
+	/* Get "@<base>:<irq>[:<id>]" chunks */
+	processed = sscanf(str, "@%lli:%u%n:%d%n",
+			&base, &irq, &consumed,
+			&vm_cmdline_id, &consumed);
+
+	/*
+	 * sscanf() must process at least 2 chunks; also there
+	 * must be no extra characters after the last chunk, so
+	 * str[consumed] must be '\0'
+	 */
+	if (processed < 2 || str[consumed])
+		return -EINVAL;
+
+	resources[0].flags = IORESOURCE_MEM;
+	resources[0].start = base;
+	resources[0].end = base + size - 1;
+
+	resources[1].flags = IORESOURCE_IRQ;
+	resources[1].start = resources[1].end = irq;
+
+	if (!vm_cmdline_parent_registered) {
+		err = device_register(&vm_cmdline_parent);
+		if (err) {
+			pr_err("Failed to register parent device!\n");
+			return err;
+		}
+		vm_cmdline_parent_registered = 1;
+	}
+
+	pr_info("Registering device virtio-mmio.%d at 0x%llx-0x%llx, IRQ %d.\n",
+		       vm_cmdline_id,
+		       (unsigned long long)resources[0].start,
+		       (unsigned long long)resources[0].end,
+		       (int)resources[1].start);
+
+	pdev = platform_device_register_resndata(&vm_cmdline_parent,
+			"virtio-mmio", vm_cmdline_id++,
+			resources, ARRAY_SIZE(resources), NULL, 0);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
+
+	return 0;
+}
+
+static int vm_cmdline_get_device(struct device *dev, void *data)
+{
+	char *buffer = data;
+	unsigned int len = strlen(buffer);
+	struct platform_device *pdev = to_platform_device(dev);
+
+	snprintf(buffer + len, PAGE_SIZE - len, "0x%llx@0x%llx:%llu:%d\n",
+			pdev->resource[0].end - pdev->resource[0].start + 1ULL,
+			(unsigned long long)pdev->resource[0].start,
+			(unsigned long long)pdev->resource[1].start,
+			pdev->id);
+	return 0;
+}
+
+static int vm_cmdline_get(char *buffer, const struct kernel_param *kp)
+{
+	buffer[0] = '\0';
+	device_for_each_child(&vm_cmdline_parent, buffer,
+			vm_cmdline_get_device);
+	return strlen(buffer) + 1;
+}
+
+static struct kernel_param_ops vm_cmdline_param_ops = {
+	.set = vm_cmdline_set,
+	.get = vm_cmdline_get,
+};
+
+device_param_cb(device, &vm_cmdline_param_ops, NULL, S_IRUSR);
+
+static int vm_unregister_cmdline_device(struct device *dev,
+		void *data)
+{
+	platform_device_unregister(to_platform_device(dev));
+
+	return 0;
+}
+
+static void vm_unregister_cmdline_devices(void)
+{
+	if (vm_cmdline_parent_registered) {
+		device_for_each_child(&vm_cmdline_parent, NULL,
+				vm_unregister_cmdline_device);
+		device_unregister(&vm_cmdline_parent);
+		vm_cmdline_parent_registered = 0;
+	}
+}
+
+#else
+
+static void vm_unregister_cmdline_devices(void)
+{
+}
+
+#endif
+
 /* Platform driver */
 
 static struct of_device_id virtio_mmio_match[] = {
@@ -469,6 +639,7 @@
 static void __exit virtio_mmio_exit(void)
 {
 	platform_driver_unregister(&virtio_mmio_driver);
+	vm_unregister_cmdline_devices();
 }
 
 module_init(virtio_mmio_init);
diff -ur a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
--- a/drivers/virtio/virtio_pci.c	2013-08-24 11:36:59.000000000 +0200
+++ b/drivers/virtio/virtio_pci.c	2014-02-17 11:57:30.000000000 +0100
@@ -55,6 +55,10 @@
 	unsigned msix_vectors;
 	/* Vectors allocated, excluding per-vq vectors if any */
 	unsigned msix_used_vectors;
+
+	/* Status saved during hibernate/restore */
+	u8 saved_status;
+
 	/* Whether we have vector per vq */
 	bool per_vq_vectors;
 };
@@ -414,8 +418,8 @@
 		  vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
 
 	/* create the vring */
-	vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN,
-				 vdev, info->queue, vp_notify, callback, name);
+	vq = vring_new_virtqueue(info->num, VIRTIO_PCI_VRING_ALIGN, vdev,
+				 true, info->queue, vp_notify, callback, name);
 	if (!vq) {
 		err = -ENOMEM;
 		goto out_activate_queue;
@@ -708,19 +712,114 @@
 }
 
 #ifdef CONFIG_PM
-static int virtio_pci_suspend(struct pci_dev *pci_dev, pm_message_t state)
+static int virtio_pci_suspend(struct device *dev)
 {
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+
 	pci_save_state(pci_dev);
 	pci_set_power_state(pci_dev, PCI_D3hot);
 	return 0;
 }
 
-static int virtio_pci_resume(struct pci_dev *pci_dev)
+static int virtio_pci_resume(struct device *dev)
 {
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+
 	pci_restore_state(pci_dev);
 	pci_set_power_state(pci_dev, PCI_D0);
 	return 0;
 }
+
+static int virtio_pci_freeze(struct device *dev)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
+	struct virtio_driver *drv;
+	int ret;
+
+	drv = container_of(vp_dev->vdev.dev.driver,
+			   struct virtio_driver, driver);
+
+	ret = 0;
+	vp_dev->saved_status = vp_get_status(&vp_dev->vdev);
+	if (drv && drv->freeze)
+		ret = drv->freeze(&vp_dev->vdev);
+
+	if (!ret)
+		pci_disable_device(pci_dev);
+	return ret;
+}
+
+static int restore_common(struct device *dev)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
+	int ret;
+
+	ret = pci_enable_device(pci_dev);
+	if (ret)
+		return ret;
+	pci_set_master(pci_dev);
+	vp_finalize_features(&vp_dev->vdev);
+
+	return ret;
+}
+
+static int virtio_pci_thaw(struct device *dev)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
+	struct virtio_driver *drv;
+	int ret;
+
+	ret = restore_common(dev);
+	if (ret)
+		return ret;
+
+	drv = container_of(vp_dev->vdev.dev.driver,
+			   struct virtio_driver, driver);
+
+	if (drv && drv->thaw)
+		ret = drv->thaw(&vp_dev->vdev);
+	else if (drv && drv->restore)
+		ret = drv->restore(&vp_dev->vdev);
+
+	/* Finally, tell the device we're all set */
+	if (!ret)
+		vp_set_status(&vp_dev->vdev, vp_dev->saved_status);
+
+	return ret;
+}
+
+static int virtio_pci_restore(struct device *dev)
+{
+	struct pci_dev *pci_dev = to_pci_dev(dev);
+	struct virtio_pci_device *vp_dev = pci_get_drvdata(pci_dev);
+	struct virtio_driver *drv;
+	int ret;
+
+	drv = container_of(vp_dev->vdev.dev.driver,
+			   struct virtio_driver, driver);
+
+	ret = restore_common(dev);
+	if (!ret && drv && drv->restore)
+		ret = drv->restore(&vp_dev->vdev);
+
+	/* Finally, tell the device we're all set */
+	if (!ret)
+		vp_set_status(&vp_dev->vdev, vp_dev->saved_status);
+
+	return ret;
+}
+
+static const struct dev_pm_ops virtio_pci_pm_ops = {
+	.suspend	= virtio_pci_suspend,
+	.resume		= virtio_pci_resume,
+	.freeze		= virtio_pci_freeze,
+	.thaw		= virtio_pci_thaw,
+	.restore	= virtio_pci_restore,
+	.poweroff	= virtio_pci_suspend,
+};
 #endif
 
 static struct pci_driver virtio_pci_driver = {
@@ -729,8 +828,7 @@
 	.probe		= virtio_pci_probe,
 	.remove		= __devexit_p(virtio_pci_remove),
 #ifdef CONFIG_PM
-	.suspend	= virtio_pci_suspend,
-	.resume		= virtio_pci_resume,
+	.driver.pm	= &virtio_pci_pm_ops,
 #endif
 };
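Commentary on the virtio-pci hunks above (not part of the patch): the legacy pci_driver suspend/resume hooks are replaced by a dev_pm_ops table, and the new freeze/thaw/restore paths save the device status byte and forward to the bound virtio driver's optional callbacks. A minimal sketch of how a virtio driver might wire those callbacks; the foo_* names are hypothetical, and only the .freeze/.restore members and the config-space ops (reset, del_vqs) come from the virtio API shown here or already in-tree:

	static int foo_freeze(struct virtio_device *vdev)
	{
		/* quiesce the device and tear down the virtqueues before the
		 * hibernation image is written; virtio_pci_freeze() above then
		 * disables the PCI device */
		vdev->config->reset(vdev);
		vdev->config->del_vqs(vdev);
		return 0;
	}

	static int foo_restore(struct virtio_device *vdev)
	{
		/* restore_common() has already re-enabled the PCI device and
		 * re-written the feature bits; re-create the virtqueues here
		 * (foo_init_vqs() is a placeholder for the driver's own setup) */
		return foo_init_vqs(vdev);
	}

	static struct virtio_driver foo_driver = {
		/* ... driver.name, id_table, probe, remove ... */
		.freeze		= foo_freeze,
		.restore	= foo_restore,
	};
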
 
diff -ur a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
--- a/drivers/virtio/virtio_ring.c	2013-08-24 11:36:59.000000000 +0200
+++ b/drivers/virtio/virtio_ring.c	2014-02-17 11:57:30.000000000 +0100
@@ -28,17 +28,20 @@
 #ifdef CONFIG_SMP
 /* Where possible, use SMP barriers which are more lightweight than mandatory
  * barriers, because mandatory barriers control MMIO effects on accesses
- * through relaxed memory I/O windows (which virtio does not use). */
-#define virtio_mb() smp_mb()
-#define virtio_rmb() smp_rmb()
-#define virtio_wmb() smp_wmb()
+ * through relaxed memory I/O windows (which virtio-pci does not use). */
+#define virtio_mb(vq) \
+	do { if ((vq)->weak_barriers) smp_mb(); else mb(); } while(0)
+#define virtio_rmb(vq) \
+	do { if ((vq)->weak_barriers) smp_rmb(); else rmb(); } while(0)
+#define virtio_wmb(vq) \
+	do { if ((vq)->weak_barriers) smp_wmb(); else wmb(); } while(0)
 #else
 /* We must force memory ordering even if guest is UP since host could be
  * running on another CPU, but SMP barriers are defined to barrier() in that
  * configuration. So fall back to mandatory barriers instead. */
-#define virtio_mb() mb()
-#define virtio_rmb() rmb()
-#define virtio_wmb() wmb()
+#define virtio_mb(vq) mb()
+#define virtio_rmb(vq) rmb()
+#define virtio_wmb(vq) wmb()
 #endif
 
 #ifdef DEBUG
@@ -77,6 +80,9 @@
 	/* Actual memory layout for this queue */
 	struct vring vring;
 
+	/* Can we use weak barriers? */
+	bool weak_barriers;
+
 	/* Other side has made a mess, don't try any more. */
 	bool broken;
 
@@ -167,12 +173,29 @@
 	return head;
 }
 
-int virtqueue_add_buf_gfp(struct virtqueue *_vq,
-			  struct scatterlist sg[],
-			  unsigned int out,
-			  unsigned int in,
-			  void *data,
-			  gfp_t gfp)
+/**
+ * virtqueue_add_buf - expose buffer to other end
+ * @vq: the struct virtqueue we're talking about.
+ * @sg: the description of the buffer(s).
+ * @out_num: the number of sg readable by other side
+ * @in_num: the number of sg which are writable (after readable ones)
+ * @data: the token identifying the buffer.
+ * @gfp: how to do memory allocations (if necessary).
+ *
+ * Caller must ensure we don't call this with other virtqueue operations
+ * at the same time (except where noted).
+ *
+ * Returns remaining capacity of queue or a negative error
+ * (ie. ENOSPC).  Note that it only really makes sense to treat all
+ * positive return values as "available": indirect buffers mean that
+ * we can put an entire sg[] array inside a single queue entry.
+ */
+int virtqueue_add_buf(struct virtqueue *_vq,
+		      struct scatterlist sg[],
+		      unsigned int out,
+		      unsigned int in,
+		      void *data,
+		      gfp_t gfp)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	unsigned int i, avail, uninitialized_var(prev);
@@ -243,31 +266,77 @@
 
 	return vq->num_free;
 }
-EXPORT_SYMBOL_GPL(virtqueue_add_buf_gfp);
+EXPORT_SYMBOL_GPL(virtqueue_add_buf);
 
-void virtqueue_kick(struct virtqueue *_vq)
+/**
+ * virtqueue_kick_prepare - first half of split virtqueue_kick call.
+ * @vq: the struct virtqueue
+ *
+ * Instead of virtqueue_kick(), you can do:
+ *	if (virtqueue_kick_prepare(vq))
+ *		virtqueue_notify(vq);
+ *
+ * This is sometimes useful because the virtqueue_kick_prepare() needs
+ * to be serialized, but the actual virtqueue_notify() call does not.
+ */
+bool virtqueue_kick_prepare(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
 	u16 new, old;
+	bool needs_kick;
+
 	START_USE(vq);
 	/* Descriptors and available array need to be set before we expose the
 	 * new available array entries. */
-	virtio_wmb();
+	virtio_wmb(vq);
 
 	old = vq->vring.avail->idx;
 	new = vq->vring.avail->idx = old + vq->num_added;
 	vq->num_added = 0;
 
 	/* Need to update avail index before checking if we should notify */
-	virtio_mb();
-
-	if (vq->event ?
-	    vring_need_event(vring_avail_event(&vq->vring), new, old) :
-	    !(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY))
-		/* Prod other side to tell it about changes. */
-		vq->notify(&vq->vq);
+	virtio_mb(vq);
 
+	if (vq->event) {
+		needs_kick = vring_need_event(vring_avail_event(&vq->vring),
+					      new, old);
+	} else {
+		needs_kick = !(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY);
+	}
 	END_USE(vq);
+	return needs_kick;
+}
+EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);
+
+/**
+ * virtqueue_notify - second half of split virtqueue_kick call.
+ * @vq: the struct virtqueue
+ *
+ * This does not need to be serialized.
+ */
+void virtqueue_notify(struct virtqueue *_vq)
+{
+	struct vring_virtqueue *vq = to_vvq(_vq);
+
+	/* Prod other side to tell it about changes. */
+	vq->notify(_vq);
+}
+EXPORT_SYMBOL_GPL(virtqueue_notify);
+
+/**
+ * virtqueue_kick - update after add_buf
+ * @vq: the struct virtqueue
+ *
+ * After one or more virtqueue_add_buf calls, invoke this to kick
+ * the other side.
+ *
+ * Caller must ensure we don't call this with other virtqueue
+ * operations at the same time (except where noted).
+ */
+void virtqueue_kick(struct virtqueue *vq)
+{
+	if (virtqueue_kick_prepare(vq))
+		virtqueue_notify(vq);
 }
 EXPORT_SYMBOL_GPL(virtqueue_kick);
 
@@ -301,6 +370,22 @@
 	return vq->last_used_idx != vq->vring.used->idx;
 }
 
+/**
+ * virtqueue_get_buf - get the next used buffer
+ * @vq: the struct virtqueue we're talking about.
+ * @len: the length written into the buffer
+ *
+ * If the driver wrote data into the buffer, @len will be set to the
+ * amount written.  This means you don't need to clear the buffer
+ * beforehand to ensure there's no data leakage in the case of short
+ * writes.
+ *
+ * Caller must ensure we don't call this with other virtqueue
+ * operations at the same time (except where noted).
+ *
+ * Returns NULL if there are no used buffers, or the "data" token
+ * handed to virtqueue_add_buf().
+ */
 void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
@@ -321,7 +406,7 @@
 	}
 
 	/* Only get used array entries after they have been exposed by host. */
-	virtio_rmb();
+	virtio_rmb(vq);
 
 	i = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].id;
 	*len = vq->vring.used->ring[vq->last_used_idx%vq->vring.num].len;
@@ -344,7 +429,7 @@
 	 * the read in the next get_buf call. */
 	if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
 		vring_used_event(&vq->vring) = vq->last_used_idx;
-		virtio_mb();
+		virtio_mb(vq);
 	}
 
 	END_USE(vq);
@@ -373,7 +458,7 @@
 	 * entry. Always do both to keep code simple. */
 	vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
 	vring_used_event(&vq->vring) = vq->last_used_idx;
-	virtio_mb();
+	virtio_mb(vq);
 	if (unlikely(more_used(vq))) {
 		END_USE(vq);
 		return false;
@@ -400,7 +485,7 @@
 	/* TODO: tune this threshold */
 	bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4;
 	vring_used_event(&vq->vring) = vq->last_used_idx + bufs;
-	virtio_mb();
+	virtio_mb(vq);
 	if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) {
 		END_USE(vq);
 		return false;
@@ -411,6 +496,14 @@
 }
 EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
 
+/**
+ * virtqueue_detach_unused_buf - detach first unused buffer
+ * @vq: the struct virtqueue we're talking about.
+ *
+ * Returns NULL or the "data" token handed to virtqueue_add_buf().
+ * This is not valid on an active queue; it is useful only for device
+ * shutdown.
+ */
 void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
 {
 	struct vring_virtqueue *vq = to_vvq(_vq);
@@ -460,6 +553,7 @@
 struct virtqueue *vring_new_virtqueue(unsigned int num,
 				      unsigned int vring_align,
 				      struct virtio_device *vdev,
+				      bool weak_barriers,
 				      void *pages,
 				      void (*notify)(struct virtqueue *),
 				      void (*callback)(struct virtqueue *),
@@ -483,6 +577,7 @@
 	vq->vq.vdev = vdev;
 	vq->vq.name = name;
 	vq->notify = notify;
+	vq->weak_barriers = weak_barriers;
 	vq->broken = false;
 	vq->last_used_idx = 0;
 	vq->num_added = 0;
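Commentary on the virtio_ring hunks above (not part of the patch): virtqueue_add_buf() now takes a gfp argument, the barrier macros honour the new weak_barriers flag, and virtqueue_kick() is split into virtqueue_kick_prepare(), which must be serialized with other virtqueue operations, and virtqueue_notify(), which need not be. A minimal usage sketch under those assumptions; the foo_* names and the lock are hypothetical, and error handling of the add_buf return value (e.g. -ENOSPC) is omitted:

	static void foo_submit(struct foo_dev *foo, struct scatterlist *sg,
			       unsigned int out, unsigned int in, void *token)
	{
		bool kick;

		spin_lock(&foo->lock);
		/* add_buf and kick_prepare touch the ring, so keep them under
		 * the driver's virtqueue lock */
		virtqueue_add_buf(foo->vq, sg, out, in, token, GFP_ATOMIC);
		kick = virtqueue_kick_prepare(foo->vq);
		spin_unlock(&foo->lock);

		/* the notify itself (a trap to the host) can happen outside
		 * the lock, shortening the critical section */
		if (kick)
			virtqueue_notify(foo->vq);
	}
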
diff -ur a/fs/9p/v9fs.c b/fs/9p/v9fs.c
--- a/fs/9p/v9fs.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/9p/v9fs.c	2014-02-17 11:57:00.000000000 +0100
@@ -559,6 +559,11 @@
  */
 static void v9fs_destroy_inode_cache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(v9fs_inode_cache);
 }
 
diff -ur a/fs/adfs/super.c b/fs/adfs/super.c
--- a/fs/adfs/super.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/adfs/super.c	2014-02-17 11:56:58.000000000 +0100
@@ -276,6 +276,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(adfs_inode_cachep);
 }
 
diff -ur a/fs/affs/super.c b/fs/affs/super.c
--- a/fs/affs/super.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/affs/super.c	2014-02-17 11:57:00.000000000 +0100
@@ -129,6 +129,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(affs_inode_cachep);
 }
 
diff -ur a/fs/afs/super.c b/fs/afs/super.c
--- a/fs/afs/super.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/afs/super.c	2014-02-17 11:56:56.000000000 +0100
@@ -123,6 +123,11 @@
 		BUG();
 	}
 
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(afs_inode_cachep);
 	_leave("");
 }
diff -ur a/fs/attr.c b/fs/attr.c
--- a/fs/attr.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/attr.c	2014-02-17 11:56:57.000000000 +0100
@@ -15,6 +15,10 @@
 #include <linux/security.h>
 #include <linux/evm.h>
 
+#ifdef CONFIG_FS_SYNO_ACL
+#include "synoacl_int.h"
+#endif
+
 /**
  * inode_change_ok - check if attribute changes to an inode are allowed
  * @inode:	inode to check
@@ -153,12 +157,6 @@
 	if (ia_valid & ATTR_CTIME)
 		inode->i_ctime = timespec_trunc(attr->ia_ctime,
 						inode->i_sb->s_time_gran);
-#ifdef MY_ABC_HERE
-	if (ia_valid & ATTR_CREATE_TIME) {
-		inode->i_CreateTime = timespec_trunc(attr->ia_ctime,
-						inode->i_sb->s_time_gran);
-	}
-#endif
 	if (ia_valid & ATTR_MODE) {
 		umode_t mode = attr->ia_mode;
 
@@ -176,6 +174,9 @@
 	int error;
 	struct timespec now;
 	unsigned int ia_valid = attr->ia_valid;
+#ifdef CONFIG_FS_SYNO_ACL
+	int isSYNOACL = 0;
+#endif
 
 	if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
 		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
@@ -244,12 +245,26 @@
 	if (error)
 		return error;
 
+#ifdef CONFIG_FS_SYNO_ACL
+	isSYNOACL = IS_SYNOACL(dentry);
+	if (isSYNOACL) {
+		error = synoacl_op_inode_chg_ok(dentry, attr);
+		if (error) {
+			return error;
+		}
+	}
+#endif
 	if (inode->i_op->setattr)
 		error = inode->i_op->setattr(dentry, attr);
 	else
 		error = simple_setattr(dentry, attr);
 
 	if (!error) {
+#ifdef CONFIG_FS_SYNO_ACL
+		if (isSYNOACL) {
+			synoacl_op_setattr_post(dentry, attr);
+		}
+#endif
 		fsnotify_change(dentry, ia_valid);
 		evm_inode_post_setattr(dentry, ia_valid);
 	}
diff -ur a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
--- a/fs/befs/linuxvfs.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/befs/linuxvfs.c	2014-02-17 11:57:01.000000000 +0100
@@ -454,6 +454,11 @@
 static void
 befs_destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(befs_inode_cachep);
 }
 
diff -ur a/fs/bfs/inode.c b/fs/bfs/inode.c
--- a/fs/bfs/inode.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/bfs/inode.c	2014-02-17 11:57:01.000000000 +0100
@@ -280,6 +280,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(bfs_inode_cachep);
 }
 
diff -ur a/fs/block_dev.c b/fs/block_dev.c
--- a/fs/block_dev.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/block_dev.c	2014-02-17 11:56:56.000000000 +0100
@@ -236,39 +236,6 @@
 }
 EXPORT_SYMBOL(fsync_bdev);
 
-#ifdef MY_ABC_HERE
-int sync_wait_fs_sync(struct super_block *sb)
-{
-	int retry = 0;
-	do {
-		int cnt;
-		struct inode *tmp;
-
-		/* fail-safe protection*/
-		if (retry++ > SYNO_EXT4_SYNC_DALLOC_RETRY) {
-			printk(KERN_ERR"freeze_bdev retry sync more than %d times\n", retry);
-			break;
-		}
-
-		cnt = 0;
-		list_for_each_entry(tmp, &(sb->s_bdi->wb.b_dirty), i_wb_list) {
-			if (tmp->i_sb == sb) {
-				cnt++;
-			}
-		}
-		if (0 == cnt) {
-			break;
-		}
-
-		printk(KERN_DEBUG"freeze_bdev still has %d dirty inode, sync again\n", cnt);
-		sync_filesystem(sb);
-	} while (1);
-
-	return 0;
-}
-EXPORT_SYMBOL(sync_wait_fs_sync);
-#endif
-
 /**
  * freeze_bdev  --  lock a filesystem and force it into a consistent state
  * @bdev:	blockdevice to lock
diff -ur a/fs/btrfs/acl.c b/fs/btrfs/acl.c
--- a/fs/btrfs/acl.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/acl.c	2014-02-17 11:56:58.000000000 +0100
@@ -121,6 +121,8 @@
 			ret = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (ret < 0)
 				return ret;
+			if (ret == 0)
+				acl = NULL;
 		}
 		ret = 0;
 		break;
@@ -227,7 +229,11 @@
 		if (ret > 0) {
 			/* we need an acl */
 			ret = btrfs_set_acl(trans, inode, acl, ACL_TYPE_ACCESS);
+		} else {
+			cache_no_acl(inode);
 		}
+	} else {
+		cache_no_acl(inode);
 	}
 failed:
 	posix_acl_release(acl);
diff -ur a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
--- a/fs/btrfs/async-thread.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/async-thread.c	2014-02-17 11:56:58.000000000 +0100
@@ -107,7 +107,8 @@
 		worker->idle = 1;
 
 		/* the list may be empty if the worker is just starting */
-		if (!list_empty(&worker->worker_list)) {
+		if (!list_empty(&worker->worker_list) &&
+		    !worker->workers->stopping) {
 			list_move(&worker->worker_list,
 				 &worker->workers->idle_list);
 		}
@@ -127,7 +128,8 @@
 		spin_lock_irqsave(&worker->workers->lock, flags);
 		worker->idle = 0;
 
-		if (!list_empty(&worker->worker_list)) {
+		if (!list_empty(&worker->worker_list) &&
+		    !worker->workers->stopping) {
 			list_move_tail(&worker->worker_list,
 				      &worker->workers->worker_list);
 		}
@@ -171,11 +173,11 @@
 	spin_unlock_irqrestore(&workers->lock, flags);
 }
 
-static noinline int run_ordered_completions(struct btrfs_workers *workers,
+static noinline void run_ordered_completions(struct btrfs_workers *workers,
 					    struct btrfs_work *work)
 {
 	if (!workers->ordered)
-		return 0;
+		return;
 
 	set_bit(WORK_DONE_BIT, &work->flags);
 
@@ -220,7 +222,6 @@
 	}
 
 	spin_unlock(&workers->order_lock);
-	return 0;
 }
 
 static void put_worker(struct btrfs_worker_thread *worker)
@@ -341,7 +342,7 @@
 		if (freezing(current)) {
 			worker->working = 0;
 			spin_unlock_irq(&worker->lock);
-			refrigerator();
+			try_to_freeze();
 		} else {
 			spin_unlock_irq(&worker->lock);
 			if (!kthread_should_stop()) {
@@ -406,13 +407,14 @@
 /*
  * this will wait for all the worker threads to shutdown
  */
-int btrfs_stop_workers(struct btrfs_workers *workers)
+void btrfs_stop_workers(struct btrfs_workers *workers)
 {
 	struct list_head *cur;
 	struct btrfs_worker_thread *worker;
 	int can_stop;
 
 	spin_lock_irq(&workers->lock);
+	workers->stopping = 1;
 	list_splice_init(&workers->idle_list, &workers->worker_list);
 	while (!list_empty(&workers->worker_list)) {
 		cur = workers->worker_list.next;
@@ -434,7 +436,6 @@
 		put_worker(worker);
 	}
 	spin_unlock_irq(&workers->lock);
-	return 0;
 }
 
 /*
@@ -457,6 +458,7 @@
 	workers->ordered = 0;
 	workers->atomic_start_pending = 0;
 	workers->atomic_worker_start = async_helper;
+	workers->stopping = 0;
 }
 
 /*
@@ -482,15 +484,19 @@
 	atomic_set(&worker->num_pending, 0);
 	atomic_set(&worker->refs, 1);
 	worker->workers = workers;
-	worker->task = kthread_run(worker_loop, worker,
-				   "btrfs-%s-%d", workers->name,
-				   workers->num_workers + 1);
+	worker->task = kthread_create(worker_loop, worker,
+				      "btrfs-%s-%d", workers->name,
+				      workers->num_workers + 1);
 	if (IS_ERR(worker->task)) {
 		ret = PTR_ERR(worker->task);
-		kfree(worker);
 		goto fail;
 	}
+
 	spin_lock_irq(&workers->lock);
+	if (workers->stopping) {
+		spin_unlock_irq(&workers->lock);
+		goto fail_kthread;
+	}
 	list_add_tail(&worker->worker_list, &workers->idle_list);
 	worker->idle = 1;
 	workers->num_workers++;
@@ -498,8 +504,13 @@
 	WARN_ON(workers->num_workers_starting < 0);
 	spin_unlock_irq(&workers->lock);
 
+	wake_up_process(worker->task);
 	return 0;
+
+fail_kthread:
+	kthread_stop(worker->task);
 fail:
+	kfree(worker);
 	spin_lock_irq(&workers->lock);
 	workers->num_workers_starting--;
 	spin_unlock_irq(&workers->lock);
@@ -622,14 +633,14 @@
  * it was taken from.  It is intended for use with long running work functions
  * that make some progress and want to give the cpu up for others.
  */
-int btrfs_requeue_work(struct btrfs_work *work)
+void btrfs_requeue_work(struct btrfs_work *work)
 {
 	struct btrfs_worker_thread *worker = work->worker;
 	unsigned long flags;
 	int wake = 0;
 
 	if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags))
-		goto out;
+		return;
 
 	spin_lock_irqsave(&worker->lock, flags);
 	if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags))
@@ -656,9 +667,6 @@
 	if (wake)
 		wake_up_process(worker->task);
 	spin_unlock_irqrestore(&worker->lock, flags);
-out:
-
-	return 0;
 }
 
 void btrfs_set_work_high_prio(struct btrfs_work *work)
diff -ur a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
--- a/fs/btrfs/async-thread.h	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/async-thread.h	2014-02-17 11:56:58.000000000 +0100
@@ -107,13 +107,15 @@
 
 	/* extra name for this worker, used for current->name */
 	char *name;
+
+	int stopping;
 };
 
 void btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work);
 int btrfs_start_workers(struct btrfs_workers *workers);
-int btrfs_stop_workers(struct btrfs_workers *workers);
+void btrfs_stop_workers(struct btrfs_workers *workers);
 void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max,
 			struct btrfs_workers *async_starter);
-int btrfs_requeue_work(struct btrfs_work *work);
+void btrfs_requeue_work(struct btrfs_work *work);
 void btrfs_set_work_high_prio(struct btrfs_work *work);
 #endif
diff -ur a/fs/btrfs/backref.c b/fs/btrfs/backref.c
--- a/fs/btrfs/backref.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/backref.c	2014-02-17 11:56:58.000000000 +0100
@@ -16,22 +16,1096 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/vmalloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
+#include "ulist.h"
+#include "transaction.h"
+#include "delayed-ref.h"
+#include "locking.h"
 
-struct __data_ref {
-	struct list_head list;
+struct extent_inode_elem {
 	u64 inum;
-	u64 root;
-	u64 extent_data_item_offset;
+	u64 offset;
+	struct extent_inode_elem *next;
 };
 
-struct __shared_ref {
-	struct list_head list;
+static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
+				struct btrfs_file_extent_item *fi,
+				u64 extent_item_pos,
+				struct extent_inode_elem **eie)
+{
+	u64 offset = 0;
+	struct extent_inode_elem *e;
+
+	if (!btrfs_file_extent_compression(eb, fi) &&
+	    !btrfs_file_extent_encryption(eb, fi) &&
+	    !btrfs_file_extent_other_encoding(eb, fi)) {
+		u64 data_offset;
+		u64 data_len;
+
+		data_offset = btrfs_file_extent_offset(eb, fi);
+		data_len = btrfs_file_extent_num_bytes(eb, fi);
+
+		if (extent_item_pos < data_offset ||
+		    extent_item_pos >= data_offset + data_len)
+			return 1;
+		offset = extent_item_pos - data_offset;
+	}
+
+	e = kmalloc(sizeof(*e), GFP_NOFS);
+	if (!e)
+		return -ENOMEM;
+
+	e->next = *eie;
+	e->inum = key->objectid;
+	e->offset = key->offset + offset;
+	*eie = e;
+
+	return 0;
+}
+
+static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
+				u64 extent_item_pos,
+				struct extent_inode_elem **eie)
+{
 	u64 disk_byte;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	int slot;
+	int nritems;
+	int extent_type;
+	int ret;
+
+	/*
+	 * from the shared data ref, we only have the leaf but we need
+	 * the key. thus, we must look into all items and see that we
+	 * find one (some) with a reference to our extent item.
+	 */
+	nritems = btrfs_header_nritems(eb);
+	for (slot = 0; slot < nritems; ++slot) {
+		btrfs_item_key_to_cpu(eb, &key, slot);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+		extent_type = btrfs_file_extent_type(eb, fi);
+		if (extent_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		/* don't skip BTRFS_FILE_EXTENT_PREALLOC, we can handle that */
+		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+		if (disk_byte != wanted_disk_byte)
+			continue;
+
+		ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
+		if (ret < 0)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * this structure records all encountered refs on the way up to the root
+ */
+struct __prelim_ref {
+	struct list_head list;
+	u64 root_id;
+	struct btrfs_key key_for_search;
+	int level;
+	int count;
+	struct extent_inode_elem *inode_list;
+	u64 parent;
+	u64 wanted_disk_byte;
 };
 
+static struct kmem_cache *btrfs_prelim_ref_cache;
+
+int __init btrfs_prelim_ref_init(void)
+{
+	btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
+					sizeof(struct __prelim_ref),
+					0,
+					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+					NULL);
+	if (!btrfs_prelim_ref_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void btrfs_prelim_ref_exit(void)
+{
+	if (btrfs_prelim_ref_cache)
+		kmem_cache_destroy(btrfs_prelim_ref_cache);
+}
+
+/*
+ * the rules for all callers of this function are:
+ * - obtaining the parent is the goal
+ * - if you add a key, you must know that it is a correct key
+ * - if you cannot add the parent or a correct key, then we will look into the
+ *   block later to set a correct key
+ *
+ * delayed refs
+ * ============
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    -   |     -
+ *      key to resolve |    -   |     y    |    y   |     y
+ *  tree block logical |    -   |     -    |    -   |     -
+ *  root for resolving |    y   |     y    |    y   |     y
+ *
+ * - column 1:       we've the parent -> done
+ * - column 2, 3, 4: we use the key to find the parent
+ *
+ * on disk refs (inline or keyed)
+ * ==============================
+ *        backref type | shared | indirect | shared | indirect
+ * information         |   tree |     tree |   data |     data
+ * --------------------+--------+----------+--------+----------
+ *      parent logical |    y   |     -    |    y   |     -
+ *      key to resolve |    -   |     -    |    -   |     y
+ *  tree block logical |    y   |     y    |    y   |     y
+ *  root for resolving |    -   |     y    |    y   |     y
+ *
+ * - column 1, 3: we've the parent -> done
+ * - column 2:    we take the first key from the block to find the parent
+ *                (see __add_missing_keys)
+ * - column 4:    we use the key to find the parent
+ *
+ * additional information that's available but not required to find the parent
+ * block might help in merging entries to gain some speed.
+ */
+
+static int __add_prelim_ref(struct list_head *head, u64 root_id,
+			    struct btrfs_key *key, int level,
+			    u64 parent, u64 wanted_disk_byte, int count,
+			    gfp_t gfp_mask)
+{
+	struct __prelim_ref *ref;
+
+	ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);
+	if (!ref)
+		return -ENOMEM;
+
+	ref->root_id = root_id;
+	if (key)
+		ref->key_for_search = *key;
+	else
+		memset(&ref->key_for_search, 0, sizeof(ref->key_for_search));
+
+	ref->inode_list = NULL;
+	ref->level = level;
+	ref->count = count;
+	ref->parent = parent;
+	ref->wanted_disk_byte = wanted_disk_byte;
+	list_add_tail(&ref->list, head);
+
+	return 0;
+}
+
+static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
+				struct ulist *parents, int level,
+				struct btrfs_key *key_for_search, u64 time_seq,
+				u64 wanted_disk_byte,
+				const u64 *extent_item_pos)
+{
+	int ret = 0;
+	int slot;
+	struct extent_buffer *eb;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	struct extent_inode_elem *eie = NULL, *old = NULL;
+	u64 disk_byte;
+
+	if (level != 0) {
+		eb = path->nodes[level];
+		ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
+		if (ret < 0)
+			return ret;
+		return 0;
+	}
+
+	/*
+	 * We normally enter this function with the path already pointing to
+	 * the first item to check. But sometimes, we may enter it with
+	 * slot==nritems. In that case, go to the next leaf before we continue.
+	 */
+	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
+		ret = btrfs_next_old_leaf(root, path, time_seq);
+
+	while (!ret) {
+		eb = path->nodes[0];
+		slot = path->slots[0];
+
+		btrfs_item_key_to_cpu(eb, &key, slot);
+
+		if (key.objectid != key_for_search->objectid ||
+		    key.type != BTRFS_EXTENT_DATA_KEY)
+			break;
+
+		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+
+		if (disk_byte == wanted_disk_byte) {
+			eie = NULL;
+			old = NULL;
+			if (extent_item_pos) {
+				ret = check_extent_in_eb(&key, eb, fi,
+						*extent_item_pos,
+						&eie);
+				if (ret < 0)
+					break;
+			}
+			if (ret > 0)
+				goto next;
+			ret = ulist_add_merge(parents, eb->start,
+					      (uintptr_t)eie,
+					      (u64 *)&old, GFP_NOFS);
+			if (ret < 0)
+				break;
+			if (!ret && extent_item_pos) {
+				while (old->next)
+					old = old->next;
+				old->next = eie;
+			}
+		}
+next:
+		ret = btrfs_next_old_item(root, path, time_seq);
+	}
+
+	if (ret > 0)
+		ret = 0;
+	return ret;
+}
+
+/*
+ * resolve an indirect backref in the form (root_id, key, level)
+ * to a logical address
+ */
+static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
+				  struct btrfs_path *path, u64 time_seq,
+				  struct __prelim_ref *ref,
+				  struct ulist *parents,
+				  const u64 *extent_item_pos)
+{
+	struct btrfs_root *root;
+	struct btrfs_key root_key;
+	struct extent_buffer *eb;
+	int ret = 0;
+	int root_level;
+	int level = ref->level;
+
+	root_key.objectid = ref->root_id;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = (u64)-1;
+	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+	if (IS_ERR(root)) {
+		ret = PTR_ERR(root);
+		goto out;
+	}
+
+	root_level = btrfs_old_root_level(root, time_seq);
+
+	if (root_level + 1 == level)
+		goto out;
+
+	path->lowest_level = level;
+	ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
+	pr_debug("search slot in root %llu (level %d, ref count %d) returned "
+		 "%d for key (%llu %u %llu)\n",
+		 ref->root_id, level, ref->count, ret,
+		 ref->key_for_search.objectid, ref->key_for_search.type,
+		 ref->key_for_search.offset);
+	if (ret < 0)
+		goto out;
+
+	eb = path->nodes[level];
+	while (!eb) {
+		if (!level) {
+			WARN_ON(1);
+			ret = 1;
+			goto out;
+		}
+		level--;
+		eb = path->nodes[level];
+	}
+
+	ret = add_all_parents(root, path, parents, level, &ref->key_for_search,
+				time_seq, ref->wanted_disk_byte,
+				extent_item_pos);
+out:
+	path->lowest_level = 0;
+	btrfs_release_path(path);
+	return ret;
+}
+
+/*
+ * resolve all indirect backrefs from the list
+ */
+static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
+				   struct btrfs_path *path, u64 time_seq,
+				   struct list_head *head,
+				   const u64 *extent_item_pos)
+{
+	int err;
+	int ret = 0;
+	struct __prelim_ref *ref;
+	struct __prelim_ref *ref_safe;
+	struct __prelim_ref *new_ref;
+	struct ulist *parents;
+	struct ulist_node *node;
+	struct ulist_iterator uiter;
+
+	parents = ulist_alloc(GFP_NOFS);
+	if (!parents)
+		return -ENOMEM;
+
+	/*
+	 * _safe allows us to insert directly after the current item without
+	 * iterating over the newly inserted items.
+	 * we're also allowed to re-assign ref during iteration.
+	 */
+	list_for_each_entry_safe(ref, ref_safe, head, list) {
+		if (ref->parent)	/* already direct */
+			continue;
+		if (ref->count == 0)
+			continue;
+		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
+					     parents, extent_item_pos);
+		if (err == -ENOMEM)
+			goto out;
+		if (err)
+			continue;
+
+		/* we put the first parent into the ref at hand */
+		ULIST_ITER_INIT(&uiter);
+		node = ulist_next(parents, &uiter);
+		ref->parent = node ? node->val : 0;
+		ref->inode_list = node ?
+			(struct extent_inode_elem *)(uintptr_t)node->aux : NULL;
+
+		/* additional parents require new refs being added here */
+		while ((node = ulist_next(parents, &uiter))) {
+			new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
+						   GFP_NOFS);
+			if (!new_ref) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			memcpy(new_ref, ref, sizeof(*ref));
+			new_ref->parent = node->val;
+			new_ref->inode_list = (struct extent_inode_elem *)
+							(uintptr_t)node->aux;
+			list_add(&new_ref->list, &ref->list);
+		}
+		ulist_reinit(parents);
+	}
+out:
+	ulist_free(parents);
+	return ret;
+}
+
+static inline int ref_for_same_block(struct __prelim_ref *ref1,
+				     struct __prelim_ref *ref2)
+{
+	if (ref1->level != ref2->level)
+		return 0;
+	if (ref1->root_id != ref2->root_id)
+		return 0;
+	if (ref1->key_for_search.type != ref2->key_for_search.type)
+		return 0;
+	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
+		return 0;
+	if (ref1->key_for_search.offset != ref2->key_for_search.offset)
+		return 0;
+	if (ref1->parent != ref2->parent)
+		return 0;
+
+	return 1;
+}
+
+/*
+ * read tree blocks and add keys where required.
+ */
+static int __add_missing_keys(struct btrfs_fs_info *fs_info,
+			      struct list_head *head)
+{
+	struct list_head *pos;
+	struct extent_buffer *eb;
+
+	list_for_each(pos, head) {
+		struct __prelim_ref *ref;
+		ref = list_entry(pos, struct __prelim_ref, list);
+
+		if (ref->parent)
+			continue;
+		if (ref->key_for_search.type)
+			continue;
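+		/*
+		 * indirect ref without a key (e.g. a tree block ref or a
+		 * delayed ref without an update_key op): read the referenced
+		 * block and use its first key for the later search.
+		 */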
+		BUG_ON(!ref->wanted_disk_byte);
+		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
+				     fs_info->tree_root->leafsize, 0);
+		if (!eb || !extent_buffer_uptodate(eb)) {
+			free_extent_buffer(eb);
+			return -EIO;
+		}
+		btrfs_tree_read_lock(eb);
+		if (btrfs_header_level(eb) == 0)
+			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
+		else
+			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
+		btrfs_tree_read_unlock(eb);
+		free_extent_buffer(eb);
+	}
+	return 0;
+}
+
+/*
+ * merge two lists of backrefs and adjust counts accordingly
+ *
+ * mode = 1: merge identical keys, if key is set
+ *    FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
+ *           additionally, we could even add a key range for the blocks we
+ *           looked into to merge even more (-> replace unresolved refs by those
+ *           having a parent).
+ * mode = 2: merge identical parents
+ */
+static void __merge_refs(struct list_head *head, int mode)
+{
+	struct list_head *pos1;
+
+	list_for_each(pos1, head) {
+		struct list_head *n2;
+		struct list_head *pos2;
+		struct __prelim_ref *ref1;
+
+		ref1 = list_entry(pos1, struct __prelim_ref, list);
+
+		for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
+		     pos2 = n2, n2 = pos2->next) {
+			struct __prelim_ref *ref2;
+			struct __prelim_ref *xchg;
+			struct extent_inode_elem *eie;
+
+			ref2 = list_entry(pos2, struct __prelim_ref, list);
+
+			if (mode == 1) {
+				if (!ref_for_same_block(ref1, ref2))
+					continue;
+				if (!ref1->parent && ref2->parent) {
+					xchg = ref1;
+					ref1 = ref2;
+					ref2 = xchg;
+				}
+			} else {
+				if (ref1->parent != ref2->parent)
+					continue;
+			}
+
+			eie = ref1->inode_list;
+			while (eie && eie->next)
+				eie = eie->next;
+			if (eie)
+				eie->next = ref2->inode_list;
+			else
+				ref1->inode_list = ref2->inode_list;
+			ref1->count += ref2->count;
+
+			list_del(&ref2->list);
+			kmem_cache_free(btrfs_prelim_ref_cache, ref2);
+		}
+
+	}
+}
+
+/*
+ * add all currently queued delayed refs from this head whose seq nr is
+ * smaller than or equal to seq to the list
+ */
+static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
+			      struct list_head *prefs)
+{
+	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
+	struct rb_node *n = &head->node.rb_node;
+	struct btrfs_key key;
+	struct btrfs_key op_key = {0};
+	int sgn;
+	int ret = 0;
+
+	if (extent_op && extent_op->update_key)
+		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
+
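+	/*
+	 * the delayed ref nodes for this bytenr sort directly before the
+	 * head node in the rbtree; walk them backwards and turn each into a
+	 * preliminary ref
+	 */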
+	while ((n = rb_prev(n))) {
+		struct btrfs_delayed_ref_node *node;
+		node = rb_entry(n, struct btrfs_delayed_ref_node,
+				rb_node);
+		if (node->bytenr != head->node.bytenr)
+			break;
+		WARN_ON(node->is_head);
+
+		if (node->seq > seq)
+			continue;
+
+		switch (node->action) {
+		case BTRFS_ADD_DELAYED_EXTENT:
+		case BTRFS_UPDATE_DELAYED_HEAD:
+			WARN_ON(1);
+			continue;
+		case BTRFS_ADD_DELAYED_REF:
+			sgn = 1;
+			break;
+		case BTRFS_DROP_DELAYED_REF:
+			sgn = -1;
+			break;
+		default:
+			BUG_ON(1);
+		}
+		switch (node->type) {
+		case BTRFS_TREE_BLOCK_REF_KEY: {
+			struct btrfs_delayed_tree_ref *ref;
+
+			ref = btrfs_delayed_node_to_tree_ref(node);
+			ret = __add_prelim_ref(prefs, ref->root, &op_key,
+					       ref->level + 1, 0, node->bytenr,
+					       node->ref_mod * sgn, GFP_ATOMIC);
+			break;
+		}
+		case BTRFS_SHARED_BLOCK_REF_KEY: {
+			struct btrfs_delayed_tree_ref *ref;
+
+			ref = btrfs_delayed_node_to_tree_ref(node);
+			ret = __add_prelim_ref(prefs, ref->root, NULL,
+					       ref->level + 1, ref->parent,
+					       node->bytenr,
+					       node->ref_mod * sgn, GFP_ATOMIC);
+			break;
+		}
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_delayed_data_ref *ref;
+			ref = btrfs_delayed_node_to_data_ref(node);
+
+			key.objectid = ref->objectid;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = ref->offset;
+			ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
+					       node->bytenr,
+					       node->ref_mod * sgn, GFP_ATOMIC);
+			break;
+		}
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_delayed_data_ref *ref;
+
+			ref = btrfs_delayed_node_to_data_ref(node);
+
+			key.objectid = ref->objectid;
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = ref->offset;
+			ret = __add_prelim_ref(prefs, ref->root, &key, 0,
+					       ref->parent, node->bytenr,
+					       node->ref_mod * sgn, GFP_ATOMIC);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * add all inline backrefs for bytenr to the list
+ */
+static int __add_inline_refs(struct btrfs_fs_info *fs_info,
+			     struct btrfs_path *path, u64 bytenr,
+			     int *info_level, struct list_head *prefs)
+{
+	int ret = 0;
+	int slot;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	unsigned long ptr;
+	unsigned long end;
+	struct btrfs_extent_item *ei;
+	u64 flags;
+	u64 item_size;
+
+	/*
+	 * enumerate all inline refs
+	 */
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+
+	item_size = btrfs_item_size_nr(leaf, slot);
+	BUG_ON(item_size < sizeof(*ei));
+
+	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+	flags = btrfs_extent_flags(leaf, ei);
+	btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+	ptr = (unsigned long)(ei + 1);
+	end = (unsigned long)ei + item_size;
+
+	if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
+	    flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		struct btrfs_tree_block_info *info;
+
+		info = (struct btrfs_tree_block_info *)ptr;
+		*info_level = btrfs_tree_block_level(leaf, info);
+		ptr += sizeof(struct btrfs_tree_block_info);
+		BUG_ON(ptr > end);
+	} else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
+		*info_level = found_key.offset;
+	} else {
+		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+	}
+
+	while (ptr < end) {
+		struct btrfs_extent_inline_ref *iref;
+		u64 offset;
+		int type;
+
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(leaf, iref);
+		offset = btrfs_extent_inline_ref_offset(leaf, iref);
+
+		switch (type) {
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, 0, NULL,
+						*info_level + 1, offset,
+						bytenr, 1, GFP_NOFS);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_shared_data_ref *sdref;
+			int count;
+
+			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
+			count = btrfs_shared_data_ref_count(leaf, sdref);
+			ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
+					       bytenr, count, GFP_NOFS);
+			break;
+		}
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, offset, NULL,
+					       *info_level + 1, 0,
+					       bytenr, 1, GFP_NOFS);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_extent_data_ref *dref;
+			int count;
+			u64 root;
+
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			count = btrfs_extent_data_ref_count(leaf, dref);
+			key.objectid = btrfs_extent_data_ref_objectid(leaf,
+								      dref);
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+			root = btrfs_extent_data_ref_root(leaf, dref);
+			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+					       bytenr, count, GFP_NOFS);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		if (ret)
+			return ret;
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+
+	return 0;
+}
+
+/*
+ * add all non-inline backrefs for bytenr to the list
+ */
+static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
+			    struct btrfs_path *path, u64 bytenr,
+			    int info_level, struct list_head *prefs)
+{
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	int ret;
+	int slot;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	while (1) {
+		ret = btrfs_next_item(extent_root, path);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = 0;
+			break;
+		}
+
+		slot = path->slots[0];
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.objectid != bytenr)
+			break;
+		if (key.type < BTRFS_TREE_BLOCK_REF_KEY)
+			continue;
+		if (key.type > BTRFS_SHARED_DATA_REF_KEY)
+			break;
+
+		switch (key.type) {
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, 0, NULL,
+						info_level + 1, key.offset,
+						bytenr, 1, GFP_NOFS);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY: {
+			struct btrfs_shared_data_ref *sdref;
+			int count;
+
+			sdref = btrfs_item_ptr(leaf, slot,
+					      struct btrfs_shared_data_ref);
+			count = btrfs_shared_data_ref_count(leaf, sdref);
+			ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
+						bytenr, count, GFP_NOFS);
+			break;
+		}
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			ret = __add_prelim_ref(prefs, key.offset, NULL,
+					       info_level + 1, 0,
+					       bytenr, 1, GFP_NOFS);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY: {
+			struct btrfs_extent_data_ref *dref;
+			int count;
+			u64 root;
+
+			dref = btrfs_item_ptr(leaf, slot,
+					      struct btrfs_extent_data_ref);
+			count = btrfs_extent_data_ref_count(leaf, dref);
+			key.objectid = btrfs_extent_data_ref_objectid(leaf,
+								      dref);
+			key.type = BTRFS_EXTENT_DATA_KEY;
+			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
+			root = btrfs_extent_data_ref_root(leaf, dref);
+			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
+					       bytenr, count, GFP_NOFS);
+			break;
+		}
+		default:
+			WARN_ON(1);
+		}
+		if (ret)
+			return ret;
+
+	}
+
+	return ret;
+}
+
+/*
+ * this adds all existing backrefs (inline backrefs, keyed backrefs and
+ * delayed refs) for the given bytenr to the refs list, merges duplicates
+ * and resolves indirect refs to their parent bytenr.
+ * When roots are found, they're added to the roots list.
+ *
+ * FIXME some caching might speed things up
+ */
+static int find_parent_nodes(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info, u64 bytenr,
+			     u64 time_seq, struct ulist *refs,
+			     struct ulist *roots, const u64 *extent_item_pos)
+{
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct btrfs_delayed_ref_root *delayed_refs = NULL;
+	struct btrfs_delayed_ref_head *head;
+	int info_level = 0;
+	int ret;
+	struct list_head prefs_delayed;
+	struct list_head prefs;
+	struct __prelim_ref *ref;
+
+	INIT_LIST_HEAD(&prefs);
+	INIT_LIST_HEAD(&prefs_delayed);
+
+	key.objectid = bytenr;
+	key.offset = (u64)-1;
+	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	if (!trans)
+		path->search_commit_root = 1;
+
+	/*
+	 * grab both a lock on the path and a lock on the delayed ref head.
+	 * We need both to get a consistent picture of how the refs look
+	 * at a specified point in time
+	 */
+again:
+	head = NULL;
+
+	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret == 0);
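+	/*
+	 * with key.offset == (u64)-1 an exact match is impossible, so the
+	 * search lands just behind the extent item and we step back below
+	 */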
+
+	if (trans) {
+		/*
+		 * look if there are updates for this ref queued and lock the
+		 * head
+		 */
+		delayed_refs = &trans->transaction->delayed_refs;
+		spin_lock(&delayed_refs->lock);
+		head = btrfs_find_delayed_ref_head(trans, bytenr);
+		if (head) {
+			if (!mutex_trylock(&head->mutex)) {
+				atomic_inc(&head->node.refs);
+				spin_unlock(&delayed_refs->lock);
+
+				btrfs_release_path(path);
+
+				/*
+				 * Mutex was contended, block until it's
+				 * released and try again
+				 */
+				mutex_lock(&head->mutex);
+				mutex_unlock(&head->mutex);
+				btrfs_put_delayed_ref(&head->node);
+				goto again;
+			}
+			ret = __add_delayed_refs(head, time_seq,
+						 &prefs_delayed);
+			mutex_unlock(&head->mutex);
+			if (ret) {
+				spin_unlock(&delayed_refs->lock);
+				goto out;
+			}
+		}
+		spin_unlock(&delayed_refs->lock);
+	}
+
+	if (path->slots[0]) {
+		struct extent_buffer *leaf;
+		int slot;
+
+		path->slots[0]--;
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid == bytenr &&
+		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
+		     key.type == BTRFS_METADATA_ITEM_KEY)) {
+			ret = __add_inline_refs(fs_info, path, bytenr,
+						&info_level, &prefs);
+			if (ret)
+				goto out;
+			ret = __add_keyed_refs(fs_info, path, bytenr,
+					       info_level, &prefs);
+			if (ret)
+				goto out;
+		}
+	}
+	btrfs_release_path(path);
+
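+	/* from here on, delayed refs and on-disk refs are treated alike */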
+	list_splice_init(&prefs_delayed, &prefs);
+
+	ret = __add_missing_keys(fs_info, &prefs);
+	if (ret)
+		goto out;
+
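+	/*
+	 * first merge pass: collapse refs that describe the same block via
+	 * the same root and key; after indirect refs have been resolved, a
+	 * second pass merges refs that share a parent
+	 */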
+	__merge_refs(&prefs, 1);
+
+	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
+				      extent_item_pos);
+	if (ret)
+		goto out;
+
+	__merge_refs(&prefs, 2);
+
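+	/*
+	 * all refs are direct now: roots (parent == 0) go into the roots
+	 * ulist, resolved parents into the refs ulist, along with their
+	 * inode lists when extent_item_pos was given
+	 */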
+	while (!list_empty(&prefs)) {
+		ref = list_first_entry(&prefs, struct __prelim_ref, list);
+		WARN_ON(ref->count < 0);
+		if (ref->count && ref->root_id && ref->parent == 0) {
+			/* no parent == root of tree */
+			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
+			if (ret < 0)
+				goto out;
+		}
+		if (ref->count && ref->parent) {
+			struct extent_inode_elem *eie = NULL;
+			if (extent_item_pos && !ref->inode_list) {
+				u32 bsz;
+				struct extent_buffer *eb;
+				bsz = btrfs_level_size(fs_info->extent_root,
+							info_level);
+				eb = read_tree_block(fs_info->extent_root,
+							   ref->parent, bsz, 0);
+				if (!eb || !extent_buffer_uptodate(eb)) {
+					free_extent_buffer(eb);
+					ret = -EIO;
+					goto out;
+				}
+				ret = find_extent_in_eb(eb, bytenr,
+							*extent_item_pos, &eie);
+				free_extent_buffer(eb);
+				if (ret < 0)
+					goto out;
+				ref->inode_list = eie;
+			}
+			ret = ulist_add_merge(refs, ref->parent,
+					      (uintptr_t)ref->inode_list,
+					      (u64 *)&eie, GFP_NOFS);
+			if (ret < 0)
+				goto out;
+			if (!ret && extent_item_pos) {
+				/*
+				 * we've recorded that parent, so we must extend
+				 * its inode list here
+				 */
+				BUG_ON(!eie);
+				while (eie->next)
+					eie = eie->next;
+				eie->next = ref->inode_list;
+			}
+		}
+		list_del(&ref->list);
+		kmem_cache_free(btrfs_prelim_ref_cache, ref);
+	}
+
+out:
+	btrfs_free_path(path);
+	while (!list_empty(&prefs)) {
+		ref = list_first_entry(&prefs, struct __prelim_ref, list);
+		list_del(&ref->list);
+		kmem_cache_free(btrfs_prelim_ref_cache, ref);
+	}
+	while (!list_empty(&prefs_delayed)) {
+		ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
+				       list);
+		list_del(&ref->list);
+		kmem_cache_free(btrfs_prelim_ref_cache, ref);
+	}
+
+	return ret;
+}
+
+static void free_leaf_list(struct ulist *blocks)
+{
+	struct ulist_node *node = NULL;
+	struct extent_inode_elem *eie;
+	struct extent_inode_elem *eie_next;
+	struct ulist_iterator uiter;
+
+	ULIST_ITER_INIT(&uiter);
+	while ((node = ulist_next(blocks, &uiter))) {
+		if (!node->aux)
+			continue;
+		eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
+		for (; eie; eie = eie_next) {
+			eie_next = eie->next;
+			kfree(eie);
+		}
+		node->aux = 0;
+	}
+
+	ulist_free(blocks);
+}
+
+/*
+ * Finds all leafs with a reference to the specified combination of bytenr and
+ * offset. The leafs will be stored in the leafs ulist, which must be freed
+ * with free_leaf_list() so that any inode lists attached to its nodes are
+ * released as well.
+ *
+ * returns 0 on success, <0 on error
+ */
+static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
+				struct btrfs_fs_info *fs_info, u64 bytenr,
+				u64 time_seq, struct ulist **leafs,
+				const u64 *extent_item_pos)
+{
+	struct ulist *tmp;
+	int ret;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+	*leafs = ulist_alloc(GFP_NOFS);
+	if (!*leafs) {
+		ulist_free(tmp);
+		return -ENOMEM;
+	}
+
+	ret = find_parent_nodes(trans, fs_info, bytenr,
+				time_seq, *leafs, tmp, extent_item_pos);
+	ulist_free(tmp);
+
+	if (ret < 0 && ret != -ENOENT) {
+		free_leaf_list(*leafs);
+		return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * walk all backrefs for a given extent to find all roots that reference this
+ * extent. Walking a backref means finding all extents that reference this
+ * extent and in turn walking the backrefs of those, too. Naturally this is a
+ * recursive process, but here it is implemented in an iterative fashion: We
+ * find all referencing extents for the extent in question and put them on a
+ * list. In turn, we find all referencing extents for those, further appending
+ * to the list. The way we iterate the list allows adding more elements after
+ * the current while iterating. The process stops when we reach the end of the
+ * list. Found roots are added to the roots list.
+ *
+ * returns 0 on success, < 0 on error.
+ */
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+				struct btrfs_fs_info *fs_info, u64 bytenr,
+				u64 time_seq, struct ulist **roots)
+{
+	struct ulist *tmp;
+	struct ulist_node *node = NULL;
+	struct ulist_iterator uiter;
+	int ret;
+
+	tmp = ulist_alloc(GFP_NOFS);
+	if (!tmp)
+		return -ENOMEM;
+	*roots = ulist_alloc(GFP_NOFS);
+	if (!*roots) {
+		ulist_free(tmp);
+		return -ENOMEM;
+	}
+
+	ULIST_ITER_INIT(&uiter);
+	while (1) {
+		ret = find_parent_nodes(trans, fs_info, bytenr,
+					time_seq, tmp, *roots, NULL);
+		if (ret < 0 && ret != -ENOENT) {
+			ulist_free(tmp);
+			ulist_free(*roots);
+			return ret;
+		}
+		node = ulist_next(tmp, &uiter);
+		if (!node)
+			break;
+		bytenr = node->val;
+	}
+
+	ulist_free(tmp);
+	return 0;
+}
+
+
 static int __inode_info(u64 inum, u64 ioff, u8 key_type,
 			struct btrfs_root *fs_root, struct btrfs_path *path,
 			struct btrfs_key *found_key)
@@ -82,9 +1156,77 @@
 				found_key);
 }
 
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off)
+{
+	int ret, slot;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	unsigned long ptr;
+
+	key.objectid = inode_objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.offset = start_off;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	while (1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			/*
+			 * If the item at offset is not found,
+			 * btrfs_search_slot will point us to the slot
+			 * where it should be inserted. In our case
+			 * that will be the slot directly before the
+			 * next INODE_EXTREF item. In the case
+			 * that we're pointing to the last slot in a
+			 * leaf, we must move one leaf over.
+			 */
+			ret = btrfs_next_leaf(root, path);
+			if (ret) {
+				if (ret >= 1)
+					ret = -ENOENT;
+				break;
+			}
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		/*
+		 * Check that we're still looking at an extended ref key for
+		 * this particular objectid. If we have a different
+		 * objectid or type then there are no more to be found
+		 * in the tree and we can exit.
+		 */
+		ret = -ENOENT;
+		if (found_key.objectid != inode_objectid)
+			break;
+		if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY)
+			break;
+
+		ret = 0;
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		extref = (struct btrfs_inode_extref *)ptr;
+		*ret_extref = extref;
+		if (found_off)
+			*found_off = found_key.offset;
+		break;
+	}
+
+	return ret;
+}
+
 /*
- * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
- * of the path are separated by '/' and the path is guaranteed to be
+ * this iterates to turn a name (from iref/extref) into a full filesystem path.
+ * Elements of the path are separated by '/' and the path is guaranteed to be
  * 0-terminated. the path is only given within the current file system.
  * Therefore, it never starts with a '/'. the caller is responsible to provide
  * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
@@ -96,33 +1238,39 @@
  * required for the path to fit into the buffer. in that case, the returned
  * value will be smaller than dest. callers must check this!
  */
-static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
-				struct btrfs_inode_ref *iref,
-				struct extent_buffer *eb_in, u64 parent,
-				char *dest, u32 size)
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+			u32 name_len, unsigned long name_off,
+			struct extent_buffer *eb_in, u64 parent,
+			char *dest, u32 size)
 {
-	u32 len;
 	int slot;
 	u64 next_inum;
 	int ret;
-	s64 bytes_left = size - 1;
+	s64 bytes_left = ((s64)size) - 1;
 	struct extent_buffer *eb = eb_in;
 	struct btrfs_key found_key;
+	int leave_spinning = path->leave_spinning;
+	struct btrfs_inode_ref *iref;
 
 	if (bytes_left >= 0)
 		dest[bytes_left] = '\0';
 
+	path->leave_spinning = 1;
 	while (1) {
-		len = btrfs_inode_ref_name_len(eb, iref);
-		bytes_left -= len;
+		bytes_left -= name_len;
 		if (bytes_left >= 0)
 			read_extent_buffer(eb, dest + bytes_left,
-						(unsigned long)(iref + 1), len);
-		if (eb != eb_in)
+					   name_off, name_len);
+		if (eb != eb_in) {
+			btrfs_tree_read_unlock_blocking(eb);
 			free_extent_buffer(eb);
+		}
 		ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
+		if (ret > 0)
+			ret = -ENOENT;
 		if (ret)
 			break;
+
 		next_inum = found_key.offset;
 
 		/* regular exit ahead */
@@ -132,11 +1280,17 @@
 		slot = path->slots[0];
 		eb = path->nodes[0];
 		/* make sure we can use eb after releasing the path */
-		if (eb != eb_in)
+		if (eb != eb_in) {
 			atomic_inc(&eb->refs);
+			btrfs_tree_read_lock(eb);
+			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		}
 		btrfs_release_path(path);
-
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+		name_len = btrfs_inode_ref_name_len(eb, iref);
+		name_off = (unsigned long)(iref + 1);
+
 		parent = next_inum;
 		--bytes_left;
 		if (bytes_left >= 0)
@@ -144,6 +1298,7 @@
 	}
 
 	btrfs_release_path(path);
+	path->leave_spinning = leave_spinning;
 
 	if (ret)
 		return ERR_PTR(ret);
@@ -157,16 +1312,21 @@
  * tree blocks and <0 on error.
  */
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-			struct btrfs_path *path, struct btrfs_key *found_key)
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags_ret)
 {
 	int ret;
 	u64 flags;
+	u64 size = 0;
 	u32 item_size;
 	struct extent_buffer *eb;
 	struct btrfs_extent_item *ei;
 	struct btrfs_key key;
 
-	key.type = BTRFS_EXTENT_ITEM_KEY;
+	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
+		key.type = BTRFS_METADATA_ITEM_KEY;
+	else
+		key.type = BTRFS_EXTENT_ITEM_KEY;
 	key.objectid = logical;
 	key.offset = (u64)-1;
 
@@ -179,10 +1339,18 @@
 		return ret;
 
 	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
-	if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
+	if (found_key->type == BTRFS_METADATA_ITEM_KEY)
+		size = fs_info->extent_root->leafsize;
+	else if (found_key->type == BTRFS_EXTENT_ITEM_KEY)
+		size = found_key->offset;
+
+	if ((found_key->type != BTRFS_EXTENT_ITEM_KEY &&
+	     found_key->type != BTRFS_METADATA_ITEM_KEY) ||
 	    found_key->objectid > logical ||
-	    found_key->objectid + found_key->offset <= logical)
+	    found_key->objectid + size <= logical) {
+		pr_debug("logical %llu is not within any extent\n", logical);
 		return -ENOENT;
+	}
 
 	eb = path->nodes[0];
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
@@ -191,10 +1359,21 @@
 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 	flags = btrfs_extent_flags(eb, ei);
 
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
-		return BTRFS_EXTENT_FLAG_TREE_BLOCK;
-	if (flags & BTRFS_EXTENT_FLAG_DATA)
-		return BTRFS_EXTENT_FLAG_DATA;
+	pr_debug("logical %llu is at position %llu within the extent (%llu "
+		 "EXTENT_ITEM %llu) flags %#llx size %u\n",
+		 logical, logical - found_key->objectid, found_key->objectid,
+		 found_key->offset, flags, item_size);
+
+	WARN_ON(!flags_ret);
+	if (flags_ret) {
+		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+			*flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+		else if (flags & BTRFS_EXTENT_FLAG_DATA)
+			*flags_ret = BTRFS_EXTENT_FLAG_DATA;
+		else
+			BUG_ON(1);
+		return 0;
+	}
 
 	return -EIO;
 }
@@ -287,295 +1466,89 @@
 	return 0;
 }
 
-static int __data_list_add(struct list_head *head, u64 inum,
-				u64 extent_data_item_offset, u64 root)
-{
-	struct __data_ref *ref;
-
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
-	if (!ref)
-		return -ENOMEM;
-
-	ref->inum = inum;
-	ref->extent_data_item_offset = extent_data_item_offset;
-	ref->root = root;
-	list_add_tail(&ref->list, head);
-
-	return 0;
-}
-
-static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
-				struct btrfs_extent_data_ref *dref)
-{
-	return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
-				btrfs_extent_data_ref_offset(eb, dref),
-				btrfs_extent_data_ref_root(eb, dref));
-}
-
-static int __shared_list_add(struct list_head *head, u64 disk_byte)
-{
-	struct __shared_ref *ref;
-
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
-	if (!ref)
-		return -ENOMEM;
-
-	ref->disk_byte = disk_byte;
-	list_add_tail(&ref->list, head);
-
-	return 0;
-}
-
-static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
-					   u64 logical, u64 inum,
-					   u64 extent_data_item_offset,
-					   u64 extent_offset,
-					   struct btrfs_path *path,
-					   struct list_head *data_refs,
-					   iterate_extent_inodes_t *iterate,
-					   void *ctx)
-{
-	u64 ref_root;
-	u32 item_size;
-	struct btrfs_key key;
-	struct extent_buffer *eb;
-	struct btrfs_extent_item *ei;
-	struct btrfs_extent_inline_ref *eiref;
-	struct __data_ref *ref;
-	int ret;
-	int type;
-	int last;
-	unsigned long ptr = 0;
-
-	WARN_ON(!list_empty(data_refs));
-	ret = extent_from_logical(fs_info, logical, path, &key);
-	if (ret & BTRFS_EXTENT_FLAG_DATA)
-		ret = -EIO;
-	if (ret < 0)
-		goto out;
-
-	eb = path->nodes[0];
-	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-	item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-	ret = 0;
-	ref_root = 0;
-	/*
-	 * as done in iterate_extent_inodes, we first build a list of refs to
-	 * iterate, then free the path and then iterate them to avoid deadlocks.
-	 */
-	do {
-		last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-						&eiref, &type);
-		if (last < 0) {
-			ret = last;
-			goto out;
-		}
-		if (type == BTRFS_TREE_BLOCK_REF_KEY ||
-		    type == BTRFS_SHARED_BLOCK_REF_KEY) {
-			ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
-			ret = __data_list_add(data_refs, inum,
-						extent_data_item_offset,
-						ref_root);
-		}
-	} while (!ret && !last);
-
-	btrfs_release_path(path);
-
-	if (ref_root == 0) {
-		printk(KERN_ERR "btrfs: failed to find tree block ref "
-			"for shared data backref %llu\n", logical);
-		WARN_ON(1);
-		ret = -EIO;
-	}
-
-out:
-	while (!list_empty(data_refs)) {
-		ref = list_first_entry(data_refs, struct __data_ref, list);
-		list_del(&ref->list);
-		if (!ret)
-			ret = iterate(ref->inum, extent_offset +
-					ref->extent_data_item_offset,
-					ref->root, ctx);
-		kfree(ref);
-	}
-
-	return ret;
-}
-
-static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
-				    u64 logical, u64 orig_extent_item_objectid,
-				    u64 extent_offset, struct btrfs_path *path,
-				    struct list_head *data_refs,
-				    iterate_extent_inodes_t *iterate,
-				    void *ctx)
+static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
+				u64 root, u64 extent_item_objectid,
+				iterate_extent_inodes_t *iterate, void *ctx)
 {
-	u64 disk_byte;
-	struct btrfs_key key;
-	struct btrfs_file_extent_item *fi;
-	struct extent_buffer *eb;
-	int slot;
-	int nritems;
-	int ret;
-	int found = 0;
+	struct extent_inode_elem *eie;
+	int ret = 0;
 
-	eb = read_tree_block(fs_info->tree_root, logical,
-				fs_info->tree_root->leafsize, 0);
-	if (!eb)
-		return -EIO;
-
-	/*
-	 * from the shared data ref, we only have the leaf but we need
-	 * the key. thus, we must look into all items and see that we
-	 * find one (some) with a reference to our extent item.
-	 */
-	nritems = btrfs_header_nritems(eb);
-	for (slot = 0; slot < nritems; ++slot) {
-		btrfs_item_key_to_cpu(eb, &key, slot);
-		if (key.type != BTRFS_EXTENT_DATA_KEY)
-			continue;
-		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
-		if (!fi) {
-			free_extent_buffer(eb);
-			return -EIO;
-		}
-		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
-		if (disk_byte != orig_extent_item_objectid) {
-			if (found)
-				break;
-			else
-				continue;
-		}
-		++found;
-		ret = __iter_shared_inline_ref_inodes(fs_info, logical,
-							key.objectid,
-							key.offset,
-							extent_offset, path,
-							data_refs,
-							iterate, ctx);
-		if (ret)
+	for (eie = inode_list; eie; eie = eie->next) {
+		pr_debug("ref for %llu resolved, key (%llu EXTENT_DATA %llu), "
+			 "root %llu\n", extent_item_objectid,
+			 eie->inum, eie->offset, root);
+		ret = iterate(eie->inum, eie->offset, root, ctx);
+		if (ret) {
+			pr_debug("stopping iteration for %llu due to ret=%d\n",
+				 extent_item_objectid, ret);
 			break;
+		}
 	}
 
-	if (!found) {
-		printk(KERN_ERR "btrfs: failed to follow shared data backref "
-			"to parent %llu\n", logical);
-		WARN_ON(1);
-		ret = -EIO;
-	}
-
-	free_extent_buffer(eb);
 	return ret;
 }
 
 /*
  * calls iterate() for every inode that references the extent identified by
- * the given parameters. will use the path given as a parameter and return it
- * released.
+ * the given parameters.
  * when the iterator function returns a non-zero value, iteration stops.
  */
 int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
-				struct btrfs_path *path,
-				u64 extent_item_objectid,
-				u64 extent_offset,
+				u64 extent_item_objectid, u64 extent_item_pos,
+				int search_commit_root,
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
-	unsigned long ptr = 0;
-	int last;
 	int ret;
-	int type;
-	u64 logical;
-	u32 item_size;
-	struct btrfs_extent_inline_ref *eiref;
-	struct btrfs_extent_data_ref *dref;
-	struct extent_buffer *eb;
-	struct btrfs_extent_item *ei;
-	struct btrfs_key key;
-	struct list_head data_refs = LIST_HEAD_INIT(data_refs);
-	struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
-	struct __data_ref *ref_d;
-	struct __shared_ref *ref_s;
-
-	eb = path->nodes[0];
-	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
-	item_size = btrfs_item_size_nr(eb, path->slots[0]);
-
-	/* first we iterate the inline refs, ... */
-	do {
-		last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
-						&eiref, &type);
-		if (last == -ENOENT) {
-			ret = 0;
-			break;
-		}
-		if (last < 0) {
-			ret = last;
-			break;
-		}
+	struct btrfs_trans_handle *trans = NULL;
+	struct ulist *refs = NULL;
+	struct ulist *roots = NULL;
+	struct ulist_node *ref_node = NULL;
+	struct ulist_node *root_node = NULL;
+	struct seq_list tree_mod_seq_elem = {};
+	struct ulist_iterator ref_uiter;
+	struct ulist_iterator root_uiter;
+
+	pr_debug("resolving all inodes for extent %llu\n",
+			extent_item_objectid);
+
+	if (!search_commit_root) {
+		trans = btrfs_join_transaction(fs_info->extent_root);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+	}
 
-		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
-			dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
-			ret = __data_list_add_eb(&data_refs, eb, dref);
-		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
-			logical = btrfs_extent_inline_ref_offset(eb, eiref);
-			ret = __shared_list_add(&shared_refs, logical);
-		}
-	} while (!ret && !last);
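+	/*
+	 * collect all leaves referencing the extent, then resolve the roots
+	 * for each leaf and call iterate() for every recorded inode
+	 */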
+	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
+				   tree_mod_seq_elem.seq, &refs,
+				   &extent_item_pos);
+	if (ret)
+		goto out;
 
-	/* ... then we proceed to in-tree references and ... */
-	while (!ret) {
-		++path->slots[0];
-		if (path->slots[0] > btrfs_header_nritems(eb)) {
-			ret = btrfs_next_leaf(fs_info->extent_root, path);
-			if (ret) {
-				if (ret == 1)
-					ret = 0; /* we're done */
-				break;
-			}
-			eb = path->nodes[0];
-		}
-		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
-		if (key.objectid != extent_item_objectid)
+	ULIST_ITER_INIT(&ref_uiter);
+	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
+		ret = btrfs_find_all_roots(trans, fs_info, ref_node->val,
+					   tree_mod_seq_elem.seq, &roots);
+		if (ret)
 			break;
-		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
-			dref = btrfs_item_ptr(eb, path->slots[0],
-						struct btrfs_extent_data_ref);
-			ret = __data_list_add_eb(&data_refs, eb, dref);
-		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
-			ret = __shared_list_add(&shared_refs, key.offset);
+		ULIST_ITER_INIT(&root_uiter);
+		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
+			pr_debug("root %llu references leaf %llu, data list "
+				 "%#llx\n", root_node->val, ref_node->val,
+				 ref_node->aux);
+			ret = iterate_leaf_refs((struct extent_inode_elem *)
+						(uintptr_t)ref_node->aux,
+						root_node->val,
+						extent_item_objectid,
+						iterate, ctx);
 		}
+		ulist_free(roots);
 	}
 
-	btrfs_release_path(path);
-
-	/*
-	 * ... only at the very end we can process the refs we found. this is
-	 * because the iterator function we call is allowed to make tree lookups
-	 * and we have to avoid deadlocks. additionally, we need more tree
-	 * lookups ourselves for shared data refs.
-	 */
-	while (!list_empty(&data_refs)) {
-		ref_d = list_first_entry(&data_refs, struct __data_ref, list);
-		list_del(&ref_d->list);
-		if (!ret)
-			ret = iterate(ref_d->inum, extent_offset +
-					ref_d->extent_data_item_offset,
-					ref_d->root, ctx);
-		kfree(ref_d);
-	}
-
-	while (!list_empty(&shared_refs)) {
-		ref_s = list_first_entry(&shared_refs, struct __shared_ref,
-					list);
-		list_del(&ref_s->list);
-		if (!ret)
-			ret = __iter_shared_inline_ref(fs_info,
-							ref_s->disk_byte,
-							extent_item_objectid,
-							extent_offset, path,
-							&data_refs,
-							iterate, ctx);
-		kfree(ref_s);
+	free_leaf_list(refs);
+out:
+	if (!search_commit_root) {
+		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+		btrfs_end_transaction(trans, fs_info->extent_root);
 	}
 
 	return ret;
@@ -586,28 +1559,34 @@
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
 	int ret;
-	u64 offset;
+	u64 extent_item_pos;
+	u64 flags = 0;
 	struct btrfs_key found_key;
+	int search_commit_root = path->search_commit_root;
 
-	ret = extent_from_logical(fs_info, logical, path,
-					&found_key);
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
-		ret = -EINVAL;
+	ret = extent_from_logical(fs_info, logical, path, &found_key, &flags);
+	btrfs_release_path(path);
 	if (ret < 0)
 		return ret;
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+		return -EINVAL;
 
-	offset = logical - found_key.objectid;
-	ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
-					offset, iterate, ctx);
+	extent_item_pos = logical - found_key.objectid;
+	ret = iterate_extent_inodes(fs_info, found_key.objectid,
+					extent_item_pos, search_commit_root,
+					iterate, ctx);
 
 	return ret;
 }
 
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
-				struct btrfs_path *path,
-				iterate_irefs_t *iterate, void *ctx)
+typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
+			      struct extent_buffer *eb, void *ctx);
+
+static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
+			      struct btrfs_path *path,
+			      iterate_irefs_t *iterate, void *ctx)
 {
-	int ret;
+	int ret = 0;
 	int slot;
 	u32 cur;
 	u32 len;
@@ -619,9 +1598,10 @@
 	struct btrfs_inode_ref *iref;
 	struct btrfs_key found_key;
 
-	while (1) {
+	while (!ret) {
+		path->leave_spinning = 1;
 		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
-					&found_key);
+				     &found_key);
 		if (ret < 0)
 			break;
 		if (ret) {
@@ -635,6 +1615,8 @@
 		eb = path->nodes[0];
 		/* make sure we can use eb after releasing the path */
 		atomic_inc(&eb->refs);
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		btrfs_release_path(path);
 
 		item = btrfs_item_nr(eb, slot);
@@ -643,14 +1625,17 @@
 		for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
 			name_len = btrfs_inode_ref_name_len(eb, iref);
 			/* path must be released before calling iterate()! */
-			ret = iterate(parent, iref, eb, ctx);
-			if (ret) {
-				free_extent_buffer(eb);
+			pr_debug("following ref at offset %u for inode %llu in "
+				 "tree %llu\n", cur, found_key.objectid,
+				 fs_root->objectid);
+			ret = iterate(parent, name_len,
+				      (unsigned long)(iref + 1), eb, ctx);
+			if (ret)
 				break;
-			}
 			len = sizeof(*iref) + name_len;
 			iref = (struct btrfs_inode_ref *)((char *)iref + len);
 		}
+		btrfs_tree_read_unlock_blocking(eb);
 		free_extent_buffer(eb);
 	}
 
@@ -659,12 +1644,98 @@
 	return ret;
 }
 
+static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
+				 struct btrfs_path *path,
+				 iterate_irefs_t *iterate, void *ctx)
+{
+	int ret;
+	int slot;
+	u64 offset = 0;
+	u64 parent;
+	int found = 0;
+	struct extent_buffer *eb;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	u32 item_size;
+	u32 cur_offset;
+	unsigned long ptr;
+
+	while (1) {
+		ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
+					    &offset);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = found ? 0 : -ENOENT;
+			break;
+		}
+		++found;
+
+		slot = path->slots[0];
+		eb = path->nodes[0];
+		/* make sure we can use eb after releasing the path */
+		atomic_inc(&eb->refs);
+
+		btrfs_tree_read_lock(eb);
+		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+		btrfs_release_path(path);
+
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		cur_offset = 0;
+
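+		/*
+		 * a single extref item may pack several (parent, name)
+		 * entries; walk all of them
+		 */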
+		while (cur_offset < item_size) {
+			u32 name_len;
+
+			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
+			parent = btrfs_inode_extref_parent(eb, extref);
+			name_len = btrfs_inode_extref_name_len(eb, extref);
+			ret = iterate(parent, name_len,
+				      (unsigned long)&extref->name, eb, ctx);
+			if (ret)
+				break;
+
+			cur_offset += btrfs_inode_extref_name_len(leaf, extref);
+			cur_offset += sizeof(*extref);
+		}
+		btrfs_tree_read_unlock_blocking(eb);
+		free_extent_buffer(eb);
+
+		offset++;
+	}
+
+	btrfs_release_path(path);
+
+	return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+			 struct btrfs_path *path, iterate_irefs_t *iterate,
+			 void *ctx)
+{
+	int ret;
+	int found_refs = 0;
+
+	ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
+	if (!ret)
+		++found_refs;
+	else if (ret != -ENOENT)
+		return ret;
+
+	ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
+	if (ret == -ENOENT && found_refs)
+		return 0;
+
+	return ret;
+}
+
 /*
  * returns 0 if the path could be dumped (probably truncated)
  * returns <0 in case of an error
  */
-static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
-				struct extent_buffer *eb, void *ctx)
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+			 struct extent_buffer *eb, void *ctx)
 {
 	struct inode_fs_paths *ipath = ctx;
 	char *fspath;
@@ -677,8 +1748,8 @@
 					ipath->fspath->bytes_left - s_ptr : 0;
 
 	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
-	fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
-				inum, fspath_min, bytes_left);
+	fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
+				   name_off, eb, inum, fspath_min, bytes_left);
 	if (IS_ERR(fspath))
 		return PTR_ERR(fspath);
 
@@ -708,22 +1779,16 @@
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
 {
 	return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
-				inode_to_path, ipath);
+			     inode_to_path, ipath);
 }
 
-/*
- * allocates space to return multiple file system paths for an inode.
- * total_bytes to allocate are passed, note that space usable for actual path
- * information will be total_bytes - sizeof(struct inode_fs_paths).
- * the returned pointer must be freed with free_ipath() in the end.
- */
 struct btrfs_data_container *init_data_container(u32 total_bytes)
 {
 	struct btrfs_data_container *data;
 	size_t alloc_bytes;
 
 	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
-	data = kmalloc(alloc_bytes, GFP_NOFS);
+	data = vmalloc(alloc_bytes);
 	if (!data)
 		return ERR_PTR(-ENOMEM);
 
@@ -772,5 +1837,8 @@
 
 void free_ipath(struct inode_fs_paths *ipath)
 {
+	if (!ipath)
+		return;
+	vfree(ipath->fspath);
 	kfree(ipath);
 }
diff -ur a/fs/btrfs/backref.h b/fs/btrfs/backref.h
--- a/fs/btrfs/backref.h	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/backref.h	2014-02-17 11:56:58.000000000 +0100
@@ -19,7 +19,9 @@
 #ifndef __BTRFS_BACKREF__
 #define __BTRFS_BACKREF__
 
-#include "ioctl.h"
+#include <linux/btrfs.h>
+#include "ulist.h"
+#include "extent_io.h"
 
 struct inode_fs_paths {
 	struct btrfs_path		*btrfs_path;
@@ -29,23 +31,21 @@
 
 typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
 		void *ctx);
-typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
-				struct extent_buffer *eb, void *ctx);
 
 int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
 			struct btrfs_path *path);
 
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
-			struct btrfs_path *path, struct btrfs_key *found_key);
+			struct btrfs_path *path, struct btrfs_key *found_key,
+			u64 *flags);
 
 int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
 				struct btrfs_extent_item *ei, u32 item_size,
 				u64 *out_root, u8 *out_level);
 
 int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
-				struct btrfs_path *path,
 				u64 extent_item_objectid,
-				u64 extent_offset,
+				u64 extent_offset, int search_commit_root,
 				iterate_extent_inodes_t *iterate, void *ctx);
 
 int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
@@ -54,9 +54,24 @@
 
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
 
+int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+				struct btrfs_fs_info *fs_info, u64 bytenr,
+				u64 time_seq, struct ulist **roots);
+char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+			u32 name_len, unsigned long name_off,
+			struct extent_buffer *eb_in, u64 parent,
+			char *dest, u32 size);
+
 struct btrfs_data_container *init_data_container(u32 total_bytes);
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 					struct btrfs_path *path);
 void free_ipath(struct inode_fs_paths *ipath);
 
+int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
+			  u64 start_off, struct btrfs_path *path,
+			  struct btrfs_inode_extref **ret_extref,
+			  u64 *found_off);
+
+int __init btrfs_prelim_ref_init(void);
+void btrfs_prelim_ref_exit(void);
 #endif
diff -ur a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
--- a/fs/btrfs/btrfs_inode.h	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/btrfs_inode.h	2014-02-17 11:56:58.000000000 +0100
@@ -24,6 +24,25 @@
 #include "ordered-data.h"
 #include "delayed-inode.h"
 
+/*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero.  When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ */
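+/*
+ * The values below are bit numbers for the runtime_flags word in struct
+ * btrfs_inode and replace the individual bitfields that used to live there.
+ */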
+#define BTRFS_INODE_ORDERED_DATA_CLOSE		0
+#define BTRFS_INODE_ORPHAN_META_RESERVED	1
+#define BTRFS_INODE_DUMMY			2
+#define BTRFS_INODE_IN_DEFRAG			3
+#define BTRFS_INODE_DELALLOC_META_RESERVED	4
+#define BTRFS_INODE_HAS_ORPHAN_ITEM		5
+#define BTRFS_INODE_HAS_ASYNC_EXTENT		6
+#define BTRFS_INODE_NEEDS_FULL_SYNC		7
+#define BTRFS_INODE_COPY_EVERYTHING		8
+#define BTRFS_INODE_IN_DELALLOC_LIST		9
+#define BTRFS_INODE_READDIO_NEED_LOCK		10
+
 /* in memory btrfs inode */
 struct btrfs_inode {
 	/* which subvolume this inode belongs to */
@@ -51,12 +70,12 @@
 	/* held while logging the inode in tree-log.c */
 	struct mutex log_mutex;
 
+	/* held while doing delalloc reservations */
+	struct mutex delalloc_mutex;
+
 	/* used to order data wrt metadata */
 	struct btrfs_ordered_inode_tree ordered_tree;
 
-	/* for keeping track of orphaned inodes */
-	struct list_head i_orphan;
-
 	/* list of all the delalloc inodes in the FS.  There are times we need
 	 * to write all the delalloc pages to disk, and this list is used
 	 * to walk them all.
@@ -72,17 +91,16 @@
 	/* node for the red-black tree that links inodes in subvolume root */
 	struct rb_node rb_node;
 
-	/* the space_info for where this inode's data allocations are done */
-	struct btrfs_space_info *space_info;
+	unsigned long runtime_flags;
+
+	/* Keep track of who's O_SYNC/fsyncing currently */
+	atomic_t sync_writers;
 
 	/* full 64 bit generation number, struct vfs_inode doesn't have a big
 	 * enough field for this.
 	 */
 	u64 generation;
 
-	/* sequence number for NFS changes */
-	u64 sequence;
-
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
@@ -132,6 +150,9 @@
 	/* flags field from the on disk inode */
 	u32 flags;
 
+	/* a local copy of root's last_log_commit */
+	unsigned long last_log_commit;
+
 	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such.  outstanding_extents is the number of extent
@@ -142,22 +163,9 @@
 	unsigned reserved_extents;
 
 	/*
-	 * ordered_data_close is set by truncate when a file that used
-	 * to have good data has been truncated to zero.  When it is set
-	 * the btrfs file release call will add this inode to the
-	 * ordered operations list so that we make sure to flush out any
-	 * new data the application may have written before commit.
-	 */
-	unsigned ordered_data_close:1;
-	unsigned orphan_meta_reserved:1;
-	unsigned dummy_inode:1;
-	unsigned in_defrag:1;
-	unsigned delalloc_meta_reserved:1;
-
-	/*
 	 * always compress this one file
 	 */
-	unsigned force_compress:4;
+	unsigned force_compress;
 
 	struct btrfs_delayed_node *delayed_node;
 
@@ -190,13 +198,66 @@
 	BTRFS_I(inode)->disk_i_size = size;
 }
 
-static inline bool btrfs_is_free_space_inode(struct btrfs_root *root,
-				       struct inode *inode)
+static inline bool btrfs_is_free_space_inode(struct inode *inode)
 {
-	if (root == root->fs_info->tree_root ||
-	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	if (root == root->fs_info->tree_root &&
+	    btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
+		return true;
+	if (BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
 		return true;
 	return false;
 }
 
+static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
+{
+	if (BTRFS_I(inode)->logged_trans == generation &&
+	    BTRFS_I(inode)->last_sub_trans <=
+	    BTRFS_I(inode)->last_log_commit &&
+	    BTRFS_I(inode)->last_sub_trans <=
+	    BTRFS_I(inode)->root->last_log_commit)
+		return 1;
+	return 0;
+}
+
+struct btrfs_dio_private {
+	struct inode *inode;
+	u64 logical_offset;
+	u64 disk_bytenr;
+	u64 bytes;
+	void *private;
+
+	/* number of bios pending for this dio */
+	atomic_t pending_bios;
+
+	/* IO errors */
+	int errors;
+
+	/* orig_bio is our btrfs_io_bio */
+	struct bio *orig_bio;
+
+	/* dio_bio came from fs/direct-io.c */
+	struct bio *dio_bio;
+	u8 csum[0];
+};
+
+/*
+ * Disable the DIO read nolock optimization, so new DIO readers are forced
+ * to grab i_mutex. This is used to avoid an endless truncate caused by a
+ * non-locked DIO read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+	smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+		  &BTRFS_I(inode)->runtime_flags);
+}
+
 #endif
Only in b/fs/btrfs: check-integrity.c.
Only in b/fs/btrfs: check-integrity.h.
diff -ur a/fs/btrfs/compression.c b/fs/btrfs/compression.c
--- a/fs/btrfs/compression.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/compression.c	2014-02-17 11:56:58.000000000 +0100
@@ -82,6 +82,10 @@
 	u32 sums;
 };
 
+static int btrfs_decompress_biovec(int type, struct page **pages_in,
+				   u64 disk_start, struct bio_vec *bvec,
+				   int vcnt, size_t srclen);
+
 static inline int compressed_bio_size(struct btrfs_root *root,
 				      unsigned long disk_size)
 {
@@ -106,7 +110,6 @@
 				 u64 disk_start)
 {
 	int ret;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page *page;
 	unsigned long i;
 	char *kaddr;
@@ -120,18 +123,17 @@
 		page = cb->compressed_pages[i];
 		csum = ~(u32)0;
 
-		kaddr = kmap_atomic(page, KM_USER0);
-		csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE);
+		kaddr = kmap_atomic(page);
+		csum = btrfs_csum_data(kaddr, csum, PAGE_CACHE_SIZE);
 		btrfs_csum_final(csum, (char *)&csum);
-		kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr);
 
 		if (csum != *cb_sum) {
 			printk(KERN_INFO "btrfs csum failed ino %llu "
 			       "extent %llu csum %u "
 			       "wanted %u mirror %d\n",
-			       (unsigned long long)btrfs_ino(inode),
-			       (unsigned long long)disk_start,
-			       csum, *cb_sum, cb->mirror_num);
+			       btrfs_ino(inode), disk_start, csum, *cb_sum,
+			       cb->mirror_num);
 			ret = -EIO;
 			goto fail;
 		}
@@ -226,8 +228,8 @@
  * Clear the writeback bits on all of the file
  * pages for a compressed write
  */
-static noinline int end_compressed_writeback(struct inode *inode, u64 start,
-					     unsigned long ram_size)
+static noinline void end_compressed_writeback(struct inode *inode, u64 start,
+					      unsigned long ram_size)
 {
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT;
@@ -253,7 +255,6 @@
 		index += ret;
 	}
 	/* the inode may be gone now */
-	return 0;
 }
 
 /*
@@ -373,7 +374,7 @@
 		page = compressed_pages[pg_index];
 		page->mapping = inode->i_mapping;
 		if (bio->bi_size)
-			ret = io_tree->ops->merge_bio_hook(page, 0,
+			ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
 							   PAGE_CACHE_SIZE,
 							   bio, 0);
 		else
@@ -392,20 +393,21 @@
 			 */
 			atomic_inc(&cb->pending_bios);
 			ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-			BUG_ON(ret);
+			BUG_ON(ret); /* -ENOMEM */
 
 			if (!skip_sum) {
 				ret = btrfs_csum_one_bio(root, inode, bio,
 							 start, 1);
-				BUG_ON(ret);
+				BUG_ON(ret); /* -ENOMEM */
 			}
 
 			ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
-			BUG_ON(ret);
+			BUG_ON(ret); /* -ENOMEM */
 
 			bio_put(bio);
 
 			bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+			BUG_ON(!bio);
 			bio->bi_private = cb;
 			bio->bi_end_io = end_compressed_bio_write;
 			bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
@@ -421,15 +423,15 @@
 	bio_get(bio);
 
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-	BUG_ON(ret);
+	BUG_ON(ret); /* -ENOMEM */
 
 	if (!skip_sum) {
 		ret = btrfs_csum_one_bio(root, inode, bio, start, 1);
-		BUG_ON(ret);
+		BUG_ON(ret); /* -ENOMEM */
 	}
 
 	ret = btrfs_map_bio(root, WRITE, bio, 0, 1);
-	BUG_ON(ret);
+	BUG_ON(ret); /* -ENOMEM */
 
 	bio_put(bio);
 	return 0;
@@ -497,7 +499,7 @@
 		 * sure they map to this compressed extent on disk.
 		 */
 		set_page_extent_mapped(page);
-		lock_extent(tree, last_offset, end, GFP_NOFS);
+		lock_extent(tree, last_offset, end);
 		read_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, last_offset,
 					   PAGE_CACHE_SIZE);
@@ -507,7 +509,7 @@
 		    (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) ||
 		    (em->block_start >> 9) != cb->orig_bio->bi_sector) {
 			free_extent_map(em);
-			unlock_extent(tree, last_offset, end, GFP_NOFS);
+			unlock_extent(tree, last_offset, end);
 			unlock_page(page);
 			page_cache_release(page);
 			break;
@@ -521,10 +523,10 @@
 			if (zero_offset) {
 				int zeros;
 				zeros = PAGE_CACHE_SIZE - zero_offset;
-				userpage = kmap_atomic(page, KM_USER0);
+				userpage = kmap_atomic(page);
 				memset(userpage + zero_offset, 0, zeros);
 				flush_dcache_page(page);
-				kunmap_atomic(userpage, KM_USER0);
+				kunmap_atomic(userpage);
 			}
 		}
 
@@ -535,7 +537,7 @@
 			nr_pages++;
 			page_cache_release(page);
 		} else {
-			unlock_extent(tree, last_offset, end, GFP_NOFS);
+			unlock_extent(tree, last_offset, end);
 			unlock_page(page);
 			page_cache_release(page);
 			break;
@@ -577,6 +579,7 @@
 	u64 em_start;
 	struct extent_map *em;
 	int ret = -ENOMEM;
+	int faili = 0;
 	u32 *sums;
 
 	tree = &BTRFS_I(inode)->io_tree;
@@ -588,6 +591,8 @@
 				   page_offset(bio->bi_io_vec->bv_page),
 				   PAGE_CACHE_SIZE);
 	read_unlock(&em_tree->lock);
+	if (!em)
+		return -EIO;
 
 	compressed_len = em->block_len;
 	cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
@@ -624,12 +629,20 @@
 	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
 		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
 							      __GFP_HIGHMEM);
-		if (!cb->compressed_pages[pg_index])
+		if (!cb->compressed_pages[pg_index]) {
+			faili = pg_index - 1;
+			ret = -ENOMEM;
 			goto fail2;
+		}
 	}
+	faili = nr_pages - 1;
 	cb->nr_pages = nr_pages;
 
-	add_ra_bio_pages(inode, em_start + em_len, cb);
+	/* In the parent-locked case, we only locked the range we are
+	 * interested in.  In all other cases, we can opportunistically
+	 * cache decompressed data that goes beyond the requested range. */
+	if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED))
+		add_ra_bio_pages(inode, em_start + em_len, cb);
 
 	/* include any pages we added in add_ra-bio_pages */
 	uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
@@ -648,7 +661,7 @@
 		page->index = em_start >> PAGE_CACHE_SHIFT;
 
 		if (comp_bio->bi_size)
-			ret = tree->ops->merge_bio_hook(page, 0,
+			ret = tree->ops->merge_bio_hook(READ, page, 0,
 							PAGE_CACHE_SIZE,
 							comp_bio, 0);
 		else
@@ -660,7 +673,7 @@
 			bio_get(comp_bio);
 
 			ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
-			BUG_ON(ret);
+			BUG_ON(ret); /* -ENOMEM */
 
 			/*
 			 * inc the count before we submit the bio so
@@ -673,19 +686,21 @@
 			if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
 				ret = btrfs_lookup_bio_sums(root, inode,
 							comp_bio, sums);
-				BUG_ON(ret);
+				BUG_ON(ret); /* -ENOMEM */
 			}
 			sums += (comp_bio->bi_size + root->sectorsize - 1) /
 				root->sectorsize;
 
 			ret = btrfs_map_bio(root, READ, comp_bio,
 					    mirror_num, 0);
-			BUG_ON(ret);
+			if (ret)
+				bio_endio(comp_bio, ret);
 
 			bio_put(comp_bio);
 
 			comp_bio = compressed_bio_alloc(bdev, cur_disk_byte,
 							GFP_NOFS);
+			BUG_ON(!comp_bio);
 			comp_bio->bi_private = cb;
 			comp_bio->bi_end_io = end_compressed_bio_read;
 
@@ -696,22 +711,25 @@
 	bio_get(comp_bio);
 
 	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
-	BUG_ON(ret);
+	BUG_ON(ret); /* -ENOMEM */
 
 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
 		ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
-		BUG_ON(ret);
+		BUG_ON(ret); /* -ENOMEM */
 	}
 
 	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
-	BUG_ON(ret);
+	if (ret)
+		bio_endio(comp_bio, ret);
 
 	bio_put(comp_bio);
 	return 0;
 
 fail2:
-	for (pg_index = 0; pg_index < nr_pages; pg_index++)
-		free_page((unsigned long)cb->compressed_pages[pg_index]);
+	while (faili >= 0) {
+		__free_page(cb->compressed_pages[faili]);
+		faili--;
+	}
 
 	kfree(cb->compressed_pages);
 fail1:
@@ -727,12 +745,12 @@
 static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
 static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
 
-struct btrfs_compress_op *btrfs_compress_op[] = {
+static struct btrfs_compress_op *btrfs_compress_op[] = {
 	&btrfs_zlib_compress,
 	&btrfs_lzo_compress,
 };
 
-int __init btrfs_init_compress(void)
+void __init btrfs_init_compress(void)
 {
 	int i;
 
@@ -742,7 +760,6 @@
 		atomic_set(&comp_alloc_workspace[i], 0);
 		init_waitqueue_head(&comp_workspace_wait[i]);
 	}
-	return 0;
 }
 
 /*
@@ -816,6 +833,7 @@
 	btrfs_compress_op[idx]->free_workspace(workspace);
 	atomic_dec(alloc_workspace);
 wake:
+	smp_mb();
 	if (waitqueue_active(workspace_wait))
 		wake_up(workspace_wait);
 }
@@ -898,8 +916,9 @@
  * be contiguous.  They all correspond to the range of bytes covered by
  * the compressed extent.
  */
-int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
-			    struct bio_vec *bvec, int vcnt, size_t srclen)
+static int btrfs_decompress_biovec(int type, struct page **pages_in,
+				   u64 disk_start, struct bio_vec *bvec,
+				   int vcnt, size_t srclen)
 {
 	struct list_head *workspace;
 	int ret;
@@ -991,9 +1010,9 @@
 		bytes = min(PAGE_CACHE_SIZE - *pg_offset,
 			    PAGE_CACHE_SIZE - buf_offset);
 		bytes = min(bytes, working_bytes);
-		kaddr = kmap_atomic(page_out, KM_USER0);
+		kaddr = kmap_atomic(page_out);
 		memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
-		kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr);
 		flush_dcache_page(page_out);
 
 		*pg_offset += bytes;
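
The fail2 unwind in the btrfs_submit_compressed_read() hunks above is the interesting part of this file: the old loop freed all nr_pages slots, including entries that were never allocated, and handed the struct page pointer to free_page() as if it were a virtual address; the reworked code remembers the last successfully allocated index in faili and releases only that many pages with __free_page(). A minimal user-space sketch of the same unwind pattern, with malloc()/free() standing in for alloc_page()/__free_page() (the function and variable names below are illustrative only, not kernel code):

#include <stdlib.h>

/*
 * Allocate n buffers; on failure, unwind only the entries that were
 * actually allocated, mirroring the faili bookkeeping in the hunk above.
 */
static void **alloc_buffers(size_t n, size_t size)
{
	void **bufs = malloc(n * sizeof(*bufs));	/* like the compressed_pages array */
	long faili = -1;				/* index of the last successful allocation */
	size_t i;

	if (!bufs)
		return NULL;

	for (i = 0; i < n; i++) {
		bufs[i] = malloc(size);
		if (!bufs[i]) {
			faili = (long)i - 1;
			goto fail;
		}
	}
	return bufs;

fail:
	while (faili >= 0)
		free(bufs[faili--]);
	free(bufs);
	return NULL;
}

The point of the pattern is that the cleanup bound comes from how far the allocation loop actually got, never from the requested count.
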
diff -ur a/fs/btrfs/compression.h b/fs/btrfs/compression.h
--- a/fs/btrfs/compression.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/compression.h	2014-02-17 11:56:58.000000000 +0100
@@ -19,7 +19,7 @@
 #ifndef __BTRFS_COMPRESSION_
 #define __BTRFS_COMPRESSION_
 
-int btrfs_init_compress(void);
+void btrfs_init_compress(void);
 void btrfs_exit_compress(void);
 
 int btrfs_compress_pages(int type, struct address_space *mapping,
@@ -30,8 +30,6 @@
 			 unsigned long *total_in,
 			 unsigned long *total_out,
 			 unsigned long max_out);
-int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
-			    struct bio_vec *bvec, int vcnt, size_t srclen);
 int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
 		     unsigned long start_byte, size_t srclen, size_t destlen);
 int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
diff -ur a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
--- a/fs/btrfs/ctree.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/ctree.c	2014-02-17 11:56:58.000000000 +0100
@@ -18,6 +18,7 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/rbtree.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -36,8 +37,11 @@
 			      struct btrfs_root *root,
 			      struct extent_buffer *dst_buf,
 			      struct extent_buffer *src_buf);
-static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		   struct btrfs_path *path, int level, int slot);
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+		    int level, int slot);
+static void tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+				 struct extent_buffer *eb);
+static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 
 struct btrfs_path *btrfs_alloc_path(void)
 {
@@ -156,10 +160,23 @@
 {
 	struct extent_buffer *eb;
 
-	rcu_read_lock();
-	eb = rcu_dereference(root->node);
-	extent_buffer_get(eb);
-	rcu_read_unlock();
+	while (1) {
+		rcu_read_lock();
+		eb = rcu_dereference(root->node);
+
+		/*
+		 * RCU really hurts here, we could free up the root node because
+		 * it was cow'ed but we may not get the new root node yet so do
+		 * the inc_not_zero dance and if it doesn't work then
+		 * synchronize_rcu and try again.
+		 */
+		if (atomic_inc_not_zero(&eb->refs)) {
+			rcu_read_unlock();
+			break;
+		}
+		rcu_read_unlock();
+		synchronize_rcu();
+	}
 	return eb;
 }
 
@@ -186,7 +203,7 @@
  * tree until you end up with a lock on the root.  A locked buffer
  * is returned, with a reference held.
  */
-struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
+static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
 {
 	struct extent_buffer *eb;
 
@@ -207,10 +224,12 @@
  */
 static void add_root_to_dirty_list(struct btrfs_root *root)
 {
+	spin_lock(&root->fs_info->trans_lock);
 	if (root->track_dirty && list_empty(&root->dirty_list)) {
 		list_add(&root->dirty_list,
 			 &root->fs_info->dirty_cowonly_roots);
 	}
+	spin_unlock(&root->fs_info->trans_lock);
 }
 
 /*
@@ -255,15 +274,14 @@
 	else
 		btrfs_set_header_owner(cow, new_root_objectid);
 
-	write_extent_buffer(cow, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(cow),
+	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
 			    BTRFS_FSID_SIZE);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
 	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
-		ret = btrfs_inc_ref(trans, root, cow, 1);
+		ret = btrfs_inc_ref(trans, root, cow, 1, 1);
 	else
-		ret = btrfs_inc_ref(trans, root, cow, 0);
+		ret = btrfs_inc_ref(trans, root, cow, 0, 1);
 
 	if (ret)
 		return ret;
@@ -273,6 +291,512 @@
 	return 0;
 }
 
+enum mod_log_op {
+	MOD_LOG_KEY_REPLACE,
+	MOD_LOG_KEY_ADD,
+	MOD_LOG_KEY_REMOVE,
+	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+	MOD_LOG_MOVE_KEYS,
+	MOD_LOG_ROOT_REPLACE,
+};
+
+struct tree_mod_move {
+	int dst_slot;
+	int nr_items;
+};
+
+struct tree_mod_root {
+	u64 logical;
+	u8 level;
+};
+
+struct tree_mod_elem {
+	struct rb_node node;
+	u64 index;		/* shifted logical */
+	u64 seq;
+	enum mod_log_op op;
+
+	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
+	int slot;
+
+	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
+	u64 generation;
+
+	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
+	struct btrfs_disk_key key;
+	u64 blockptr;
+
+	/* this is used for op == MOD_LOG_MOVE_KEYS */
+	struct tree_mod_move move;
+
+	/* this is used for op == MOD_LOG_ROOT_REPLACE */
+	struct tree_mod_root old_root;
+};
+
+static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
+{
+	read_lock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
+{
+	read_unlock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
+{
+	write_lock(&fs_info->tree_mod_log_lock);
+}
+
+static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
+{
+	write_unlock(&fs_info->tree_mod_log_lock);
+}
+
+/*
+ * Increment the upper half of tree_mod_seq, set lower half zero.
+ *
+ * Must be called with fs_info->tree_mod_seq_lock held.
+ */
+static inline u64 btrfs_inc_tree_mod_seq_major(struct btrfs_fs_info *fs_info)
+{
+	u64 seq = atomic64_read(&fs_info->tree_mod_seq);
+	seq &= 0xffffffff00000000ull;
+	seq += 1ull << 32;
+	atomic64_set(&fs_info->tree_mod_seq, seq);
+	return seq;
+}
+
+/*
+ * Increment the lower half of tree_mod_seq.
+ *
+ * Must be called with fs_info->tree_mod_seq_lock held. The way major numbers
+ * are generated should not technically require a spin lock here. (Rationale:
+ * a minor increment that races with a major increment, landing between the
+ * major's atomic64_read and atomic64_set calls, does not duplicate sequence
+ * numbers; it just hands out a unique number as usual.) We have decided to
+ * leave that requirement in here and rethink it once we notice it really
+ * imposes a problem on some workload.
+ */
+static inline u64 btrfs_inc_tree_mod_seq_minor(struct btrfs_fs_info *fs_info)
+{
+	return atomic64_inc_return(&fs_info->tree_mod_seq);
+}
+
+/*
+ * return the last minor in the previous major tree_mod_seq number
+ */
+u64 btrfs_tree_mod_seq_prev(u64 seq)
+{
+	return (seq & 0xffffffff00000000ull) - 1ull;
+}
+
+/*
+ * This adds a new blocker to the tree mod log's blocker list if the @elem
+ * passed does not already have a sequence number set. So when a caller expects
+ * to record tree modifications, it should ensure to set elem->seq to zero
+ * before calling btrfs_get_tree_mod_seq.
+ * Returns a fresh, unused tree log modification sequence number, even if no new
+ * blocker was added.
+ */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			   struct seq_list *elem)
+{
+	u64 seq;
+
+	tree_mod_log_write_lock(fs_info);
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (!elem->seq) {
+		elem->seq = btrfs_inc_tree_mod_seq_major(fs_info);
+		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+	}
+	seq = btrfs_inc_tree_mod_seq_minor(fs_info);
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+	tree_mod_log_write_unlock(fs_info);
+
+	return seq;
+}
+
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			    struct seq_list *elem)
+{
+	struct rb_root *tm_root;
+	struct rb_node *node;
+	struct rb_node *next;
+	struct seq_list *cur_elem;
+	struct tree_mod_elem *tm;
+	u64 min_seq = (u64)-1;
+	u64 seq_putting = elem->seq;
+
+	if (!seq_putting)
+		return;
+
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	list_del(&elem->list);
+	elem->seq = 0;
+
+	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
+		if (cur_elem->seq < min_seq) {
+			if (seq_putting > cur_elem->seq) {
+				/*
+				 * blocker with lower sequence number exists, we
+				 * cannot remove anything from the log
+				 */
+				spin_unlock(&fs_info->tree_mod_seq_lock);
+				return;
+			}
+			min_seq = cur_elem->seq;
+		}
+	}
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+
+	/*
+	 * anything that's lower than the lowest existing (read: blocked)
+	 * sequence number can be removed from the tree.
+	 */
+	tree_mod_log_write_lock(fs_info);
+	tm_root = &fs_info->tree_mod_log;
+	for (node = rb_first(tm_root); node; node = next) {
+		next = rb_next(node);
+		tm = container_of(node, struct tree_mod_elem, node);
+		if (tm->seq > min_seq)
+			continue;
+		rb_erase(node, tm_root);
+		kfree(tm);
+	}
+	tree_mod_log_write_unlock(fs_info);
+}
+
+/*
+ * key order of the log:
+ *       index -> sequence
+ *
+ * the index is the shifted logical of the *new* root node for root replace
+ * operations, or the shifted logical of the affected block for all other
+ * operations.
+ */
+static noinline int
+__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
+{
+	struct rb_root *tm_root;
+	struct rb_node **new;
+	struct rb_node *parent = NULL;
+	struct tree_mod_elem *cur;
+	int ret = 0;
+
+	BUG_ON(!tm);
+
+	tree_mod_log_write_lock(fs_info);
+	if (list_empty(&fs_info->tree_mod_seq_list)) {
+		tree_mod_log_write_unlock(fs_info);
+		/*
+		 * Ok we no longer care about logging modifications, free up tm
+		 * and return 0.  Any callers shouldn't be using tm after
+		 * calling tree_mod_log_insert, but if they do we can just
+		 * change this to return a special error code to let the callers
+		 * do their own thing.
+		 */
+		kfree(tm);
+		return 0;
+	}
+
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+
+	tm_root = &fs_info->tree_mod_log;
+	new = &tm_root->rb_node;
+	while (*new) {
+		cur = container_of(*new, struct tree_mod_elem, node);
+		parent = *new;
+		if (cur->index < tm->index)
+			new = &((*new)->rb_left);
+		else if (cur->index > tm->index)
+			new = &((*new)->rb_right);
+		else if (cur->seq < tm->seq)
+			new = &((*new)->rb_left);
+		else if (cur->seq > tm->seq)
+			new = &((*new)->rb_right);
+		else {
+			ret = -EEXIST;
+			kfree(tm);
+			goto out;
+		}
+	}
+
+	rb_link_node(&tm->node, parent, new);
+	rb_insert_color(&tm->node, tm_root);
+out:
+	tree_mod_log_write_unlock(fs_info);
+	return ret;
+}
+
+/*
+ * Determines if logging can be omitted. Returns 1 if it can, i.e. when no
+ * tree mod log user is registered (tree_mod_seq_list is empty) or when @eb
+ * is a leaf; nothing needs to be logged in those cases. Returns 0 when the
+ * modification must be recorded in the tree mod log.
+ */
+static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+				    struct extent_buffer *eb) {
+	smp_mb();
+	if (list_empty(&(fs_info)->tree_mod_seq_list))
+		return 1;
+	if (eb && btrfs_header_level(eb) == 0)
+		return 1;
+	return 0;
+}
+
+static inline int
+__tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
+			  struct extent_buffer *eb, int slot,
+			  enum mod_log_op op, gfp_t flags)
+{
+	struct tree_mod_elem *tm;
+
+	tm = kzalloc(sizeof(*tm), flags);
+	if (!tm)
+		return -ENOMEM;
+
+	tm->index = eb->start >> PAGE_CACHE_SHIFT;
+	if (op != MOD_LOG_KEY_ADD) {
+		btrfs_node_key(eb, &tm->key, slot);
+		tm->blockptr = btrfs_node_blockptr(eb, slot);
+	}
+	tm->op = op;
+	tm->slot = slot;
+	tm->generation = btrfs_node_ptr_generation(eb, slot);
+
+	return __tree_mod_log_insert(fs_info, tm);
+}
+
+static noinline int
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
+			struct extent_buffer *eb, int slot,
+			enum mod_log_op op, gfp_t flags)
+{
+	if (tree_mod_dont_log(fs_info, eb))
+		return 0;
+
+	return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
+}
+
+static noinline int
+tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
+			 struct extent_buffer *eb, int dst_slot, int src_slot,
+			 int nr_items, gfp_t flags)
+{
+	struct tree_mod_elem *tm;
+	int ret;
+	int i;
+
+	if (tree_mod_dont_log(fs_info, eb))
+		return 0;
+
+	/*
+	 * When we override something during the move, we log these removals.
+	 * This can only happen when we move towards the beginning of the
+	 * buffer, i.e. dst_slot < src_slot.
+	 */
+	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+		ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
+				MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
+		BUG_ON(ret < 0);
+	}
+
+	tm = kzalloc(sizeof(*tm), flags);
+	if (!tm)
+		return -ENOMEM;
+
+	tm->index = eb->start >> PAGE_CACHE_SHIFT;
+	tm->slot = src_slot;
+	tm->move.dst_slot = dst_slot;
+	tm->move.nr_items = nr_items;
+	tm->op = MOD_LOG_MOVE_KEYS;
+
+	return __tree_mod_log_insert(fs_info, tm);
+}
+
+static inline void
+__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+{
+	int i;
+	u32 nritems;
+	int ret;
+
+	if (btrfs_header_level(eb) == 0)
+		return;
+
+	nritems = btrfs_header_nritems(eb);
+	for (i = nritems - 1; i >= 0; i--) {
+		ret = __tree_mod_log_insert_key(fs_info, eb, i,
+				MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+		BUG_ON(ret < 0);
+	}
+}
+
+static noinline int
+tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
+			 struct extent_buffer *old_root,
+			 struct extent_buffer *new_root, gfp_t flags,
+			 int log_removal)
+{
+	struct tree_mod_elem *tm;
+
+	if (tree_mod_dont_log(fs_info, NULL))
+		return 0;
+
+	if (log_removal)
+		__tree_mod_log_free_eb(fs_info, old_root);
+
+	tm = kzalloc(sizeof(*tm), flags);
+	if (!tm)
+		return -ENOMEM;
+
+	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
+	tm->old_root.logical = old_root->start;
+	tm->old_root.level = btrfs_header_level(old_root);
+	tm->generation = btrfs_header_generation(old_root);
+	tm->op = MOD_LOG_ROOT_REPLACE;
+
+	return __tree_mod_log_insert(fs_info, tm);
+}
+
+static struct tree_mod_elem *
+__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
+		      int smallest)
+{
+	struct rb_root *tm_root;
+	struct rb_node *node;
+	struct tree_mod_elem *cur = NULL;
+	struct tree_mod_elem *found = NULL;
+	u64 index = start >> PAGE_CACHE_SHIFT;
+
+	tree_mod_log_read_lock(fs_info);
+	tm_root = &fs_info->tree_mod_log;
+	node = tm_root->rb_node;
+	while (node) {
+		cur = container_of(node, struct tree_mod_elem, node);
+		if (cur->index < index) {
+			node = node->rb_left;
+		} else if (cur->index > index) {
+			node = node->rb_right;
+		} else if (cur->seq < min_seq) {
+			node = node->rb_left;
+		} else if (!smallest) {
+			/* we want the node with the highest seq */
+			if (found)
+				BUG_ON(found->seq > cur->seq);
+			found = cur;
+			node = node->rb_left;
+		} else if (cur->seq > min_seq) {
+			/* we want the node with the smallest seq */
+			if (found)
+				BUG_ON(found->seq < cur->seq);
+			found = cur;
+			node = node->rb_right;
+		} else {
+			found = cur;
+			break;
+		}
+	}
+	tree_mod_log_read_unlock(fs_info);
+
+	return found;
+}
+
+/*
+ * this returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). any element with a time
+ * sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
+			   u64 min_seq)
+{
+	return __tree_mod_log_search(fs_info, start, min_seq, 1);
+}
+
+/*
+ * this returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). any element with
+ * a time sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *
+tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
+{
+	return __tree_mod_log_search(fs_info, start, min_seq, 0);
+}
+
+static noinline void
+tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+		     struct extent_buffer *src, unsigned long dst_offset,
+		     unsigned long src_offset, int nr_items)
+{
+	int ret;
+	int i;
+
+	if (tree_mod_dont_log(fs_info, NULL))
+		return;
+
+	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+		return;
+
+	for (i = 0; i < nr_items; i++) {
+		ret = __tree_mod_log_insert_key(fs_info, src,
+						i + src_offset,
+						MOD_LOG_KEY_REMOVE, GFP_NOFS);
+		BUG_ON(ret < 0);
+		ret = __tree_mod_log_insert_key(fs_info, dst,
+						     i + dst_offset,
+						     MOD_LOG_KEY_ADD,
+						     GFP_NOFS);
+		BUG_ON(ret < 0);
+	}
+}
+
+static inline void
+tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
+		     int dst_offset, int src_offset, int nr_items)
+{
+	int ret;
+	ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
+				       nr_items, GFP_NOFS);
+	BUG_ON(ret < 0);
+}
+
+static noinline void
+tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
+			  struct extent_buffer *eb, int slot, int atomic)
+{
+	int ret;
+
+	ret = __tree_mod_log_insert_key(fs_info, eb, slot,
+					MOD_LOG_KEY_REPLACE,
+					atomic ? GFP_ATOMIC : GFP_NOFS);
+	BUG_ON(ret < 0);
+}
+
+static noinline void
+tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
+{
+	if (tree_mod_dont_log(fs_info, eb))
+		return;
+	__tree_mod_log_free_eb(fs_info, eb);
+}
+
+static noinline void
+tree_mod_log_set_root_pointer(struct btrfs_root *root,
+			      struct extent_buffer *new_root_node,
+			      int log_removal)
+{
+	int ret;
+	ret = tree_mod_log_insert_root(root->fs_info, root->node,
+				       new_root_node, GFP_NOFS, log_removal);
+	BUG_ON(ret < 0);
+}
+
 /*
  * check if the tree block can be shared by multiple trees
  */
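
The large hunk above adds the tree modification log: every change to a non-leaf tree block is recorded as a tree_mod_elem in an rb-tree keyed by (index = logical >> PAGE_CACHE_SHIFT, seq), so that btrfs_search_old_slot() further down can reconstruct older tree versions. The 64-bit sequence counter is split in two: the upper 32 bits (the major half) are bumped whenever a new blocker registers through btrfs_get_tree_mod_seq(), the lower 32 bits (the minor half) are bumped for every logged operation, and btrfs_tree_mod_seq_prev() yields the last possible minor of the previous major. A small user-space model of just that arithmetic, with a plain variable instead of the atomic fs_info->tree_mod_seq and no locking (a sketch, not kernel code):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t tree_mod_seq;	/* stands in for fs_info->tree_mod_seq */

/* bump the upper 32 bits, reset the lower 32 bits (the "major" helper) */
static uint64_t inc_major(void)
{
	tree_mod_seq &= 0xffffffff00000000ull;
	tree_mod_seq += 1ull << 32;
	return tree_mod_seq;
}

/* bump the lower 32 bits (the "minor" helper) */
static uint64_t inc_minor(void)
{
	return ++tree_mod_seq;
}

/* last possible minor of the previous major (btrfs_tree_mod_seq_prev) */
static uint64_t seq_prev(uint64_t seq)
{
	return (seq & 0xffffffff00000000ull) - 1ull;
}

int main(void)
{
	uint64_t blocker, second;

	blocker = inc_major();		/* first blocker registers: 1 << 32 */
	inc_minor();			/* two tree modifications get logged */
	inc_minor();
	second = inc_major();		/* second blocker: 2 << 32, minor reset */

	/* seq_prev(second) carries the previous major number and the highest
	 * possible minor, so it is >= every seq handed out under 'blocker' */
	assert(seq_prev(second) >> 32 == blocker >> 32);
	assert(seq_prev(second) >= blocker + 2);

	printf("blocker=%#llx second=%#llx prev(second)=%#llx\n",
	       (unsigned long long)blocker, (unsigned long long)second,
	       (unsigned long long)seq_prev(second));
	return 0;
}

The split counter lets the log tell "which blocker epoch" (major) apart from "which modification within it" (minor) while still sorting as a single 64-bit number.
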
@@ -330,9 +854,15 @@
 
 	if (btrfs_block_can_be_shared(root, buf)) {
 		ret = btrfs_lookup_extent_info(trans, root, buf->start,
-					       buf->len, &refs, &flags);
-		BUG_ON(ret);
-		BUG_ON(refs == 0);
+					       btrfs_header_level(buf), 1,
+					       &refs, &flags);
+		if (ret)
+			return ret;
+		if (refs == 0) {
+			ret = -EROFS;
+			btrfs_std_error(root->fs_info, ret);
+			return ret;
+		}
 	} else {
 		refs = 1;
 		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
@@ -350,43 +880,46 @@
 		if ((owner == root->root_key.objectid ||
 		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
 		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
-			ret = btrfs_inc_ref(trans, root, buf, 1);
-			BUG_ON(ret);
+			ret = btrfs_inc_ref(trans, root, buf, 1, 1);
+			BUG_ON(ret); /* -ENOMEM */
 
 			if (root->root_key.objectid ==
 			    BTRFS_TREE_RELOC_OBJECTID) {
-				ret = btrfs_dec_ref(trans, root, buf, 0);
-				BUG_ON(ret);
-				ret = btrfs_inc_ref(trans, root, cow, 1);
-				BUG_ON(ret);
+				ret = btrfs_dec_ref(trans, root, buf, 0, 1);
+				BUG_ON(ret); /* -ENOMEM */
+				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
+				BUG_ON(ret); /* -ENOMEM */
 			}
 			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
 		} else {
 
 			if (root->root_key.objectid ==
 			    BTRFS_TREE_RELOC_OBJECTID)
-				ret = btrfs_inc_ref(trans, root, cow, 1);
+				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
 			else
-				ret = btrfs_inc_ref(trans, root, cow, 0);
-			BUG_ON(ret);
+				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+			BUG_ON(ret); /* -ENOMEM */
 		}
 		if (new_flags != 0) {
+			int level = btrfs_header_level(buf);
+
 			ret = btrfs_set_disk_extent_flags(trans, root,
 							  buf->start,
 							  buf->len,
-							  new_flags, 0);
-			BUG_ON(ret);
+							  new_flags, level, 0);
+			if (ret)
+				return ret;
 		}
 	} else {
 		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
 			if (root->root_key.objectid ==
 			    BTRFS_TREE_RELOC_OBJECTID)
-				ret = btrfs_inc_ref(trans, root, cow, 1);
+				ret = btrfs_inc_ref(trans, root, cow, 1, 1);
 			else
-				ret = btrfs_inc_ref(trans, root, cow, 0);
-			BUG_ON(ret);
-			ret = btrfs_dec_ref(trans, root, buf, 1);
-			BUG_ON(ret);
+				ret = btrfs_inc_ref(trans, root, cow, 0, 1);
+			BUG_ON(ret); /* -ENOMEM */
+			ret = btrfs_dec_ref(trans, root, buf, 1, 1);
+			BUG_ON(ret); /* -ENOMEM */
 		}
 		clean_tree_block(trans, root, buf);
 		*last_ref = 1;
@@ -415,7 +948,7 @@
 {
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *cow;
-	int level;
+	int level, ret;
 	int last_ref = 0;
 	int unlock_orig = 0;
 	u64 parent_start;
@@ -463,14 +996,20 @@
 	else
 		btrfs_set_header_owner(cow, root->root_key.objectid);
 
-	write_extent_buffer(cow, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(cow),
+	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
 			    BTRFS_FSID_SIZE);
 
-	update_ref_for_cow(trans, root, buf, cow, &last_ref);
+	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		return ret;
+	}
 
-	if (root->ref_cows)
-		btrfs_reloc_cow_block(trans, root, buf, cow);
+	if (root->ref_cows) {
+		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
+		if (ret)
+			return ret;
+	}
 
 	if (buf == root->node) {
 		WARN_ON(parent && parent != buf);
@@ -481,6 +1020,7 @@
 			parent_start = 0;
 
 		extent_buffer_get(cow);
+		tree_mod_log_set_root_pointer(root, cow, 1);
 		rcu_assign_pointer(root->node, cow);
 
 		btrfs_free_tree_block(trans, root, buf, parent_start,
@@ -494,22 +1034,314 @@
 			parent_start = 0;
 
 		WARN_ON(trans->transid != btrfs_header_generation(parent));
+		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
+					MOD_LOG_KEY_REPLACE, GFP_NOFS);
 		btrfs_set_node_blockptr(parent, parent_slot,
 					cow->start);
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
+		if (last_ref)
+			tree_mod_log_free_eb(root->fs_info, buf);
 		btrfs_free_tree_block(trans, root, buf, parent_start,
 				      last_ref);
 	}
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
-	free_extent_buffer(buf);
+	free_extent_buffer_stale(buf);
 	btrfs_mark_buffer_dirty(cow);
 	*cow_ret = cow;
 	return 0;
 }
 
+/*
+ * returns the logical address of the oldest predecessor of the given root.
+ * entries older than time_seq are ignored.
+ */
+static struct tree_mod_elem *
+__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
+			   struct extent_buffer *eb_root, u64 time_seq)
+{
+	struct tree_mod_elem *tm;
+	struct tree_mod_elem *found = NULL;
+	u64 root_logical = eb_root->start;
+	int looped = 0;
+
+	if (!time_seq)
+		return NULL;
+
+	/*
+	 * the very last operation that's logged for a root is the replacement
+	 * operation (if it is replaced at all). this has the index of the *new*
+	 * root, making it the very first operation that's logged for this root.
+	 */
+	while (1) {
+		tm = tree_mod_log_search_oldest(fs_info, root_logical,
+						time_seq);
+		if (!looped && !tm)
+			return NULL;
+		/*
+		 * if there are no tree operation for the oldest root, we simply
+		 * return it. this should only happen if that (old) root is at
+		 * level 0.
+		 */
+		if (!tm)
+			break;
+
+		/*
+		 * if there's an operation that's not a root replacement, we
+		 * found the oldest version of our root. normally, we'll find a
+		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
+		 */
+		if (tm->op != MOD_LOG_ROOT_REPLACE)
+			break;
+
+		found = tm;
+		root_logical = tm->old_root.logical;
+		looped = 1;
+	}
+
+	/* if there's no old root to return, return what we found instead */
+	if (!found)
+		found = tm;
+
+	return found;
+}
+
+/*
+ * tm is a pointer to the first operation to rewind within eb. then, all
+ * previous operations will be rewinded (until we reach something older than
+ * time_seq).
+ */
+static void
+__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
+		      u64 time_seq, struct tree_mod_elem *first_tm)
+{
+	u32 n;
+	struct rb_node *next;
+	struct tree_mod_elem *tm = first_tm;
+	unsigned long o_dst;
+	unsigned long o_src;
+	unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+	n = btrfs_header_nritems(eb);
+	tree_mod_log_read_lock(fs_info);
+	while (tm && tm->seq >= time_seq) {
+		/*
+		 * all the operations are recorded with the operator used for
+		 * the modification. as we're going backwards, we do the
+		 * opposite of each operation here.
+		 */
+		switch (tm->op) {
+		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+			BUG_ON(tm->slot < n);
+			/* Fallthrough */
+		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+		case MOD_LOG_KEY_REMOVE:
+			btrfs_set_node_key(eb, &tm->key, tm->slot);
+			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+			btrfs_set_node_ptr_generation(eb, tm->slot,
+						      tm->generation);
+			n++;
+			break;
+		case MOD_LOG_KEY_REPLACE:
+			BUG_ON(tm->slot >= n);
+			btrfs_set_node_key(eb, &tm->key, tm->slot);
+			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+			btrfs_set_node_ptr_generation(eb, tm->slot,
+						      tm->generation);
+			break;
+		case MOD_LOG_KEY_ADD:
+			/* if a move operation is needed it's in the log */
+			n--;
+			break;
+		case MOD_LOG_MOVE_KEYS:
+			o_dst = btrfs_node_key_ptr_offset(tm->slot);
+			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+			memmove_extent_buffer(eb, o_dst, o_src,
+					      tm->move.nr_items * p_size);
+			break;
+		case MOD_LOG_ROOT_REPLACE:
+			/*
+			 * this operation is special. for roots, this must be
+			 * handled explicitly before rewinding.
+			 * for non-roots, this operation may exist if the node
+			 * was a root: root A -> child B; then A gets empty and
+			 * B is promoted to the new root. in the mod log, we'll
+			 * have a root-replace operation for B, a tree block
+			 * that is no root. we simply ignore that operation.
+			 */
+			break;
+		}
+		next = rb_next(&tm->node);
+		if (!next)
+			break;
+		tm = container_of(next, struct tree_mod_elem, node);
+		if (tm->index != first_tm->index)
+			break;
+	}
+	tree_mod_log_read_unlock(fs_info);
+	btrfs_set_header_nritems(eb, n);
+}
+
+/*
+ * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
+ * is returned. If rewind operations happen, a fresh buffer is returned. The
+ * returned buffer is always read-locked. If the returned buffer is not the
+ * input buffer, the lock on the input buffer is released and the input buffer
+ * is freed (its refcount is decremented).
+ */
+static struct extent_buffer *
+tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+		    struct extent_buffer *eb, u64 time_seq)
+{
+	struct extent_buffer *eb_rewin;
+	struct tree_mod_elem *tm;
+
+	if (!time_seq)
+		return eb;
+
+	if (btrfs_header_level(eb) == 0)
+		return eb;
+
+	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
+	if (!tm)
+		return eb;
+
+	btrfs_set_path_blocking(path);
+	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+
+	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+		BUG_ON(tm->slot != 0);
+		eb_rewin = alloc_dummy_extent_buffer(eb->start,
+						fs_info->tree_root->nodesize);
+		if (!eb_rewin) {
+			btrfs_tree_read_unlock_blocking(eb);
+			free_extent_buffer(eb);
+			return NULL;
+		}
+		btrfs_set_header_bytenr(eb_rewin, eb->start);
+		btrfs_set_header_backref_rev(eb_rewin,
+					     btrfs_header_backref_rev(eb));
+		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
+		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
+	} else {
+		eb_rewin = btrfs_clone_extent_buffer(eb);
+		if (!eb_rewin) {
+			btrfs_tree_read_unlock_blocking(eb);
+			free_extent_buffer(eb);
+			return NULL;
+		}
+	}
+
+	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
+	btrfs_tree_read_unlock_blocking(eb);
+	free_extent_buffer(eb);
+
+	extent_buffer_get(eb_rewin);
+	btrfs_tree_read_lock(eb_rewin);
+	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
+	WARN_ON(btrfs_header_nritems(eb_rewin) >
+		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));
+
+	return eb_rewin;
+}
+
+/*
+ * get_old_root() rewinds the state of @root's root node to the given @time_seq
+ * value. If there are no changes, the current root->root_node is returned. If
+ * anything changed in between, there's a fresh buffer allocated on which the
+ * rewind operations are done. In any case, the returned buffer is read locked.
+ * Returns NULL on error (with no locks held).
+ */
+static inline struct extent_buffer *
+get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+	struct tree_mod_elem *tm;
+	struct extent_buffer *eb = NULL;
+	struct extent_buffer *eb_root;
+	struct extent_buffer *old;
+	struct tree_mod_root *old_root = NULL;
+	u64 old_generation = 0;
+	u64 logical;
+	u32 blocksize;
+
+	eb_root = btrfs_read_lock_root_node(root);
+	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
+	if (!tm)
+		return eb_root;
+
+	if (tm->op == MOD_LOG_ROOT_REPLACE) {
+		old_root = &tm->old_root;
+		old_generation = tm->generation;
+		logical = old_root->logical;
+	} else {
+		logical = eb_root->start;
+	}
+
+	tm = tree_mod_log_search(root->fs_info, logical, time_seq);
+	if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+		btrfs_tree_read_unlock(eb_root);
+		free_extent_buffer(eb_root);
+		blocksize = btrfs_level_size(root, old_root->level);
+		old = read_tree_block(root, logical, blocksize, 0);
+		if (!old || !extent_buffer_uptodate(old)) {
+			free_extent_buffer(old);
+			pr_warn("btrfs: failed to read tree block %llu from get_old_root\n",
+				logical);
+			WARN_ON(1);
+		} else {
+			eb = btrfs_clone_extent_buffer(old);
+			free_extent_buffer(old);
+		}
+	} else if (old_root) {
+		btrfs_tree_read_unlock(eb_root);
+		free_extent_buffer(eb_root);
+		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
+	} else {
+		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
+		eb = btrfs_clone_extent_buffer(eb_root);
+		btrfs_tree_read_unlock_blocking(eb_root);
+		free_extent_buffer(eb_root);
+	}
+
+	if (!eb)
+		return NULL;
+	extent_buffer_get(eb);
+	btrfs_tree_read_lock(eb);
+	if (old_root) {
+		btrfs_set_header_bytenr(eb, eb->start);
+		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+		btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
+		btrfs_set_header_level(eb, old_root->level);
+		btrfs_set_header_generation(eb, old_generation);
+	}
+	if (tm)
+		__tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
+	else
+		WARN_ON(btrfs_header_level(eb) != 0);
+	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
+
+	return eb;
+}
+
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
+{
+	struct tree_mod_elem *tm;
+	int level;
+	struct extent_buffer *eb_root = btrfs_root_node(root);
+
+	tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
+	if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
+		level = tm->old_root.level;
+	} else {
+		level = btrfs_header_level(eb_root);
+	}
+	free_extent_buffer(eb_root);
+
+	return level;
+}
+
 static inline int should_cow_block(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct extent_buffer *buf)
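
get_old_root() and __tree_mod_log_rewind() above rebuild an earlier version of a node by cloning the current buffer and then undoing, newest first, every logged operation whose sequence number is >= time_seq; each log entry stores enough of the old state (key, blockptr, generation) for its inverse to be applied. The toy below models the idea on a flat array of integer keys; struct mod, the three ops and the single-slot handling are simplifications for illustration (moves and root replacements are left out) and are not the kernel's tree_mod_elem:

#include <stdint.h>
#include <stdio.h>

enum op { OP_ADD, OP_REMOVE, OP_REPLACE };

struct mod {
	uint64_t seq;	/* when the change happened */
	enum op op;
	int slot;
	int old_key;	/* value before a REMOVE or REPLACE */
};

/*
 * Undo every logged change with seq >= time_seq, newest first, so that
 * keys[]/nritems end up looking the way they did when time_seq was taken.
 * log[] is ordered oldest to newest.
 */
static void rewind_node(int *keys, int *nritems, const struct mod *log,
			int nr_log, uint64_t time_seq)
{
	int i;

	for (i = nr_log - 1; i >= 0; i--) {
		if (log[i].seq < time_seq)
			break;
		switch (log[i].op) {
		case OP_ADD:			/* undo an add: drop the key */
			(*nritems)--;
			break;
		case OP_REMOVE:			/* undo a remove: put it back */
			keys[log[i].slot] = log[i].old_key;
			(*nritems)++;
			break;
		case OP_REPLACE:		/* undo a replace: restore it */
			keys[log[i].slot] = log[i].old_key;
			break;
		}
	}
}

int main(void)
{
	int keys[8] = { 10, 21 };	/* current state: seq 5 replaced 20 by 21, */
	int nritems = 2;		/* seq 6 removed the key 30 from slot 2   */
	struct mod log[] = {
		{ 5, OP_REPLACE, 1, 20 },
		{ 6, OP_REMOVE,  2, 30 },
	};

	rewind_node(keys, &nritems, log, 2, 5);	/* view as of seq 5 */
	printf("%d items: %d %d %d\n", nritems, keys[0], keys[1], keys[2]);
	/* prints: 3 items: 10 20 30 */
	return 0;
}
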
@@ -550,19 +1382,14 @@
 	u64 search_start;
 	int ret;
 
-	if (trans->transaction != root->fs_info->running_transaction) {
-		printk(KERN_CRIT "trans %llu running %llu\n",
-		       (unsigned long long)trans->transid,
-		       (unsigned long long)
+	if (trans->transaction != root->fs_info->running_transaction)
+		WARN(1, KERN_CRIT "trans %llu running %llu\n",
+		       trans->transid,
 		       root->fs_info->running_transaction->transid);
-		WARN_ON(1);
-	}
-	if (trans->transid != root->fs_info->generation) {
-		printk(KERN_CRIT "trans %llu running %llu\n",
-		       (unsigned long long)trans->transid,
-		       (unsigned long long)root->fs_info->generation);
-		WARN_ON(1);
-	}
+
+	if (trans->transid != root->fs_info->generation)
+		WARN(1, KERN_CRIT "trans %llu running %llu\n",
+		       trans->transid, root->fs_info->generation);
 
 	if (!should_cow_block(trans, root, buf)) {
 		*cow_ret = buf;
@@ -635,7 +1462,7 @@
  */
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress)
 {
 	struct extent_buffer *cur;
@@ -655,13 +1482,9 @@
 	struct btrfs_disk_key disk_key;
 
 	parent_level = btrfs_header_level(parent);
-	if (cache_only && parent_level != 1)
-		return 0;
 
-	if (trans->transaction != root->fs_info->running_transaction)
-		WARN_ON(1);
-	if (trans->transid != root->fs_info->generation)
-		WARN_ON(1);
+	WARN_ON(trans->transaction != root->fs_info->running_transaction);
+	WARN_ON(trans->transid != root->fs_info->generation);
 
 	parent_nritems = btrfs_header_nritems(parent);
 	blocksize = btrfs_level_size(root, parent_level - 1);
@@ -700,21 +1523,23 @@
 
 		cur = btrfs_find_tree_block(root, blocknr, blocksize);
 		if (cur)
-			uptodate = btrfs_buffer_uptodate(cur, gen);
+			uptodate = btrfs_buffer_uptodate(cur, gen, 0);
 		else
 			uptodate = 0;
 		if (!cur || !uptodate) {
-			if (cache_only) {
-				free_extent_buffer(cur);
-				continue;
-			}
 			if (!cur) {
 				cur = read_tree_block(root, blocknr,
 							 blocksize, gen);
-				if (!cur)
+				if (!cur || !extent_buffer_uptodate(cur)) {
+					free_extent_buffer(cur);
 					return -EIO;
+				}
 			} else if (!uptodate) {
-				btrfs_read_buffer(cur, gen);
+				err = btrfs_read_buffer(cur, gen);
+				if (err) {
+					free_extent_buffer(cur);
+					return err;
+				}
 			}
 		}
 		if (search_start == 0)
@@ -829,20 +1654,18 @@
 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		      int level, int *slot)
 {
-	if (level == 0) {
+	if (level == 0)
 		return generic_bin_search(eb,
 					  offsetof(struct btrfs_leaf, items),
 					  sizeof(struct btrfs_item),
 					  key, btrfs_header_nritems(eb),
 					  slot);
-	} else {
+	else
 		return generic_bin_search(eb,
 					  offsetof(struct btrfs_node, ptrs),
 					  sizeof(struct btrfs_key_ptr),
 					  key, btrfs_header_nritems(eb),
 					  slot);
-	}
-	return -1;
 }
 
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
@@ -875,6 +1698,8 @@
 				   struct extent_buffer *parent, int slot)
 {
 	int level = btrfs_header_level(parent);
+	struct extent_buffer *eb;
+
 	if (slot < 0)
 		return NULL;
 	if (slot >= btrfs_header_nritems(parent))
@@ -882,9 +1707,15 @@
 
 	BUG_ON(level == 0);
 
-	return read_tree_block(root, btrfs_node_blockptr(parent, slot),
-		       btrfs_level_size(root, level - 1),
-		       btrfs_node_ptr_generation(parent, slot));
+	eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
+			     btrfs_level_size(root, level - 1),
+			     btrfs_node_ptr_generation(parent, slot));
+	if (eb && !extent_buffer_uptodate(eb)) {
+		free_extent_buffer(eb);
+		eb = NULL;
+	}
+
+	return eb;
 }
 
 /*
@@ -934,7 +1765,12 @@
 
 		/* promote the child to a root */
 		child = read_node_slot(root, mid, 0);
-		BUG_ON(!child);
+		if (!child) {
+			ret = -EROFS;
+			btrfs_std_error(root->fs_info, ret);
+			goto enospc;
+		}
+
 		btrfs_tree_lock(child);
 		btrfs_set_lock_blocking(child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
@@ -944,6 +1780,7 @@
 			goto enospc;
 		}
 
+		tree_mod_log_set_root_pointer(root, child, 1);
 		rcu_assign_pointer(root->node, child);
 
 		add_root_to_dirty_list(root);
@@ -959,15 +1796,13 @@
 		root_sub_used(root, mid->len);
 		btrfs_free_tree_block(trans, root, mid, 0, 1);
 		/* once for the root ptr */
-		free_extent_buffer(mid);
+		free_extent_buffer_stale(mid);
 		return 0;
 	}
 	if (btrfs_header_nritems(mid) >
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
-	btrfs_header_nritems(mid);
-
 	left = read_node_slot(root, parent, pslot - 1);
 	if (left) {
 		btrfs_tree_lock(left);
@@ -997,7 +1832,6 @@
 		wret = push_node_left(trans, root, left, mid, 1);
 		if (wret < 0)
 			ret = wret;
-		btrfs_header_nritems(mid);
 	}
 
 	/*
@@ -1010,17 +1844,16 @@
 		if (btrfs_header_nritems(right) == 0) {
 			clean_tree_block(trans, root, right);
 			btrfs_tree_unlock(right);
-			wret = del_ptr(trans, root, path, level + 1, pslot +
-				       1);
-			if (wret)
-				ret = wret;
+			del_ptr(root, path, level + 1, pslot + 1);
 			root_sub_used(root, right->len);
 			btrfs_free_tree_block(trans, root, right, 0, 1);
-			free_extent_buffer(right);
+			free_extent_buffer_stale(right);
 			right = NULL;
 		} else {
 			struct btrfs_disk_key right_key;
 			btrfs_node_key(right, &right_key, 0);
+			tree_mod_log_set_node_key(root->fs_info, parent,
+						  pslot + 1, 0);
 			btrfs_set_node_key(parent, &right_key, pslot + 1);
 			btrfs_mark_buffer_dirty(parent);
 		}
@@ -1035,7 +1868,11 @@
 		 * otherwise we would have pulled some pointers from the
 		 * right
 		 */
-		BUG_ON(!left);
+		if (!left) {
+			ret = -EROFS;
+			btrfs_std_error(root->fs_info, ret);
+			goto enospc;
+		}
 		wret = balance_node_right(trans, root, mid, left);
 		if (wret < 0) {
 			ret = wret;
@@ -1051,17 +1888,17 @@
 	if (btrfs_header_nritems(mid) == 0) {
 		clean_tree_block(trans, root, mid);
 		btrfs_tree_unlock(mid);
-		wret = del_ptr(trans, root, path, level + 1, pslot);
-		if (wret)
-			ret = wret;
+		del_ptr(root, path, level + 1, pslot);
 		root_sub_used(root, mid->len);
 		btrfs_free_tree_block(trans, root, mid, 0, 1);
-		free_extent_buffer(mid);
+		free_extent_buffer_stale(mid);
 		mid = NULL;
 	} else {
 		/* update the parent key to reflect our changes */
 		struct btrfs_disk_key mid_key;
 		btrfs_node_key(mid, &mid_key, 0);
+		tree_mod_log_set_node_key(root->fs_info, parent,
+					  pslot, 0);
 		btrfs_set_node_key(parent, &mid_key, pslot);
 		btrfs_mark_buffer_dirty(parent);
 	}
@@ -1159,6 +1996,8 @@
 			struct btrfs_disk_key disk_key;
 			orig_slot += left_nr;
 			btrfs_node_key(mid, &disk_key, 0);
+			tree_mod_log_set_node_key(root->fs_info, parent,
+						  pslot, 0);
 			btrfs_set_node_key(parent, &disk_key, pslot);
 			btrfs_mark_buffer_dirty(parent);
 			if (btrfs_header_nritems(left) > orig_slot) {
@@ -1210,6 +2049,8 @@
 			struct btrfs_disk_key disk_key;
 
 			btrfs_node_key(right, &disk_key, 0);
+			tree_mod_log_set_node_key(root->fs_info, parent,
+						  pslot + 1, 0);
 			btrfs_set_node_key(parent, &disk_key, pslot + 1);
 			btrfs_mark_buffer_dirty(parent);
 
@@ -1302,12 +2143,8 @@
 	}
 }
 
-/*
- * returns -EAGAIN if it had to drop the path, or zero if everything was in
- * cache
- */
-static noinline int reada_for_balance(struct btrfs_root *root,
-				      struct btrfs_path *path, int level)
+static noinline void reada_for_balance(struct btrfs_root *root,
+				       struct btrfs_path *path, int level)
 {
 	int slot;
 	int nritems;
@@ -1316,12 +2153,11 @@
 	u64 gen;
 	u64 block1 = 0;
 	u64 block2 = 0;
-	int ret = 0;
 	int blocksize;
 
 	parent = path->nodes[level + 1];
 	if (!parent)
-		return 0;
+		return;
 
 	nritems = btrfs_header_nritems(parent);
 	slot = path->slots[level + 1];
@@ -1331,7 +2167,12 @@
 		block1 = btrfs_node_blockptr(parent, slot - 1);
 		gen = btrfs_node_ptr_generation(parent, slot - 1);
 		eb = btrfs_find_tree_block(root, block1, blocksize);
-		if (eb && btrfs_buffer_uptodate(eb, gen))
+		/*
+		 * if we get -eagain from btrfs_buffer_uptodate, we
+		 * don't want to return eagain here.  That will loop
+		 * forever
+		 */
+		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
 			block1 = 0;
 		free_extent_buffer(eb);
 	}
@@ -1339,32 +2180,15 @@
 		block2 = btrfs_node_blockptr(parent, slot + 1);
 		gen = btrfs_node_ptr_generation(parent, slot + 1);
 		eb = btrfs_find_tree_block(root, block2, blocksize);
-		if (eb && btrfs_buffer_uptodate(eb, gen))
+		if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
 			block2 = 0;
 		free_extent_buffer(eb);
 	}
-	if (block1 || block2) {
-		ret = -EAGAIN;
-
-		/* release the whole path */
-		btrfs_release_path(path);
-
-		/* read the blocks */
-		if (block1)
-			readahead_tree_block(root, block1, blocksize, 0);
-		if (block2)
-			readahead_tree_block(root, block2, blocksize, 0);
 
-		if (block1) {
-			eb = read_tree_block(root, block1, blocksize, 0);
-			free_extent_buffer(eb);
-		}
-		if (block2) {
-			eb = read_tree_block(root, block2, blocksize, 0);
-			free_extent_buffer(eb);
-		}
-	}
-	return ret;
+	if (block1)
+		readahead_tree_block(root, block1, blocksize, 0);
+	if (block2)
+		readahead_tree_block(root, block2, blocksize, 0);
 }
 
 
@@ -1382,7 +2206,8 @@
  * if lowest_unlock is 1, level 0 won't be unlocked
  */
 static noinline void unlock_up(struct btrfs_path *path, int level,
-			       int lowest_unlock)
+			       int lowest_unlock, int min_write_lock_level,
+			       int *write_lock_level)
 {
 	int i;
 	int skip_level = level;
@@ -1414,6 +2239,11 @@
 		if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
 			btrfs_tree_unlock_rw(t, path->locks[i]);
 			path->locks[i] = 0;
+			if (write_lock_level &&
+			    i > min_write_lock_level &&
+			    i <= *write_lock_level) {
+				*write_lock_level = i - 1;
+			}
 		}
 	}
 }
@@ -1456,7 +2286,7 @@
 read_block_for_search(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct btrfs_path *p,
 		       struct extent_buffer **eb_ret, int level, int slot,
-		       struct btrfs_key *key)
+		       struct btrfs_key *key, u64 time_seq)
 {
 	u64 blocknr;
 	u64 gen;
@@ -1471,34 +2301,29 @@
 
 	tmp = btrfs_find_tree_block(root, blocknr, blocksize);
 	if (tmp) {
-		if (btrfs_buffer_uptodate(tmp, 0)) {
-			if (btrfs_buffer_uptodate(tmp, gen)) {
-				/*
-				 * we found an up to date block without
-				 * sleeping, return
-				 * right away
-				 */
-				*eb_ret = tmp;
-				return 0;
-			}
-			/* the pages were up to date, but we failed
-			 * the generation number check.  Do a full
-			 * read for the generation number that is correct.
-			 * We must do this without dropping locks so
-			 * we can trust our generation number
-			 */
-			free_extent_buffer(tmp);
-			btrfs_set_path_blocking(p);
+		/* first we do an atomic uptodate check */
+		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
+			*eb_ret = tmp;
+			return 0;
+		}
 
-			tmp = read_tree_block(root, blocknr, blocksize, gen);
-			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
-				*eb_ret = tmp;
-				return 0;
-			}
-			free_extent_buffer(tmp);
-			btrfs_release_path(p);
-			return -EIO;
+		/* the pages were up to date, but we failed
+		 * the generation number check.  Do a full
+		 * read for the generation number that is correct.
+		 * We must do this without dropping locks so
+		 * we can trust our generation number
+		 */
+		btrfs_set_path_blocking(p);
+
+		/* now we're allowed to do a blocking uptodate check */
+		ret = btrfs_read_buffer(tmp, gen);
+		if (!ret) {
+			*eb_ret = tmp;
+			return 0;
 		}
+		free_extent_buffer(tmp);
+		btrfs_release_path(p);
+		return -EIO;
 	}
 
 	/*
@@ -1526,7 +2351,7 @@
 		 * and give up so that our caller doesn't loop forever
 		 * on our EAGAINs.
 		 */
-		if (!btrfs_buffer_uptodate(tmp, 0))
+		if (!btrfs_buffer_uptodate(tmp, 0, 0))
 			ret = -EIO;
 		free_extent_buffer(tmp);
 	}
@@ -1559,11 +2384,8 @@
 			goto again;
 		}
 
-		sret = reada_for_balance(root, p, level);
-		if (sret)
-			goto again;
-
 		btrfs_set_path_blocking(p);
+		reada_for_balance(root, p, level);
 		sret = split_node(trans, root, p, level);
 		btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -1583,11 +2405,8 @@
 			goto again;
 		}
 
-		sret = reada_for_balance(root, p, level);
-		if (sret)
-			goto again;
-
 		btrfs_set_path_blocking(p);
+		reada_for_balance(root, p, level);
 		sret = balance_level(trans, root, p, level);
 		btrfs_clear_path_blocking(p, NULL, 0);
 
@@ -1610,6 +2429,40 @@
 	return ret;
 }
 
+static void key_search_validate(struct extent_buffer *b,
+				struct btrfs_key *key,
+				int level)
+{
+#ifdef CONFIG_BTRFS_ASSERT
+	struct btrfs_disk_key disk_key;
+
+	btrfs_cpu_key_to_disk(&disk_key, key);
+
+	if (level == 0)
+		ASSERT(!memcmp_extent_buffer(b, &disk_key,
+		    offsetof(struct btrfs_leaf, items[0].key),
+		    sizeof(disk_key)));
+	else
+		ASSERT(!memcmp_extent_buffer(b, &disk_key,
+		    offsetof(struct btrfs_node, ptrs[0].key),
+		    sizeof(disk_key)));
+#endif
+}
+
+static int key_search(struct extent_buffer *b, struct btrfs_key *key,
+		      int level, int *prev_cmp, int *slot)
+{
+	if (*prev_cmp != 0) {
+		*prev_cmp = bin_search(b, key, level, slot);
+		return *prev_cmp;
+	}
+
+	key_search_validate(b, key, level);
+	*slot = 0;
+
+	return 0;
+}
+
 /*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
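
key_search() above caches the outcome of the previous level's binary search in prev_cmp: once some level hits an exact match, every level below it is guaranteed to hold the wanted key in slot 0, because the key recorded for a node pointer equals the first key of the child it points to, so the remaining bin_search() calls can be skipped (and are only re-checked under CONFIG_BTRFS_ASSERT by key_search_validate()). A tiny user-space sketch of that short-circuit over a two-level structure of sorted arrays; the layout, names and the linear search are made up for illustration:

#include <stdio.h>

struct leaf {
	int nr;
	int keys[8];
};

struct node {
	int nr;
	int keys[8];		/* keys[i] equals the first key of child[i] */
	struct leaf *child[8];
};

/* return 0 and *slot on an exact match, 1 and the covering slot otherwise */
static int bin_search(const int *keys, int nr, int key, int *slot)
{
	int i;

	for (i = nr - 1; i >= 0; i--) {
		if (keys[i] <= key) {
			*slot = i;
			return keys[i] == key ? 0 : 1;
		}
	}
	*slot = 0;
	return 1;
}

/* descend one level, skipping the lower search once an exact match was seen */
static int search(const struct node *n, int key, int *leaf_slot)
{
	int slot, prev_cmp;

	prev_cmp = bin_search(n->keys, n->nr, key, &slot);
	if (prev_cmp == 0) {	/* exact match above: key must be in slot 0 */
		*leaf_slot = 0;
		return 0;
	}
	return bin_search(n->child[slot]->keys, n->child[slot]->nr,
			  key, leaf_slot);
}

int main(void)
{
	struct leaf l0 = { 3, { 10, 12, 14 } };
	struct leaf l1 = { 3, { 20, 22, 24 } };
	struct node n = { 2, { 10, 20 }, { &l0, &l1 } };
	int slot, ret;

	ret = search(&n, 20, &slot);	/* exact hit at the node level */
	printf("ret=%d slot=%d key=%d\n", ret, slot, l1.keys[slot]);
	/* prints: ret=0 slot=0 key=20 -- the leaf-level search never ran */
	return 0;
}

In the kernel the invariant holds at every level, so a single exact match lets the whole remaining descent use slot 0.
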
@@ -1637,6 +2490,8 @@
 	/* everything at write_lock_level or lower must be write locked */
 	int write_lock_level = 0;
 	u8 lowest_level = 0;
+	int min_write_lock_level;
+	int prev_cmp;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
@@ -1664,7 +2519,10 @@
 	if (cow && (p->keep_locks || p->lowest_level))
 		write_lock_level = BTRFS_MAX_LEVEL;
 
+	min_write_lock_level = write_lock_level;
+
 again:
+	prev_cmp = -1;
 	/*
 	 * we try very hard to do read locks on the root
 	 */
@@ -1728,7 +2586,10 @@
 			 * must have write locks on this node and the
 			 * parent
 			 */
-			if (level + 1 > write_lock_level) {
+			if (level > write_lock_level ||
+			    (level + 1 > write_lock_level &&
+			    level + 1 < BTRFS_MAX_LEVEL &&
+			    p->nodes[level + 1])) {
 				write_lock_level = level + 1;
 				btrfs_release_path(p);
 				goto again;
@@ -1762,7 +2623,7 @@
 		if (!cow)
 			btrfs_unlock_up_safe(p, level + 1);
 
-		ret = bin_search(b, key, level, &slot);
+		ret = key_search(b, key, level, &prev_cmp, &slot);
 
 		if (level != 0) {
 			int dec = 0;
@@ -1795,7 +2656,8 @@
 				goto again;
 			}
 
-			unlock_up(p, level, lowest_unlock);
+			unlock_up(p, level, lowest_unlock,
+				  min_write_lock_level, &write_lock_level);
 
 			if (level == lowest_level) {
 				if (dec)
@@ -1804,7 +2666,7 @@
 			}
 
 			err = read_block_for_search(trans, root, p,
-						    &b, level, slot, key);
+						    &b, level, slot, key, 0);
 			if (err == -EAGAIN)
 				goto again;
 			if (err) {
@@ -1857,7 +2719,8 @@
 				}
 			}
 			if (!p->search_for_split)
-				unlock_up(p, level, lowest_unlock);
+				unlock_up(p, level, lowest_unlock,
+					  min_write_lock_level, &write_lock_level);
 			goto done;
 		}
 	}
@@ -1875,21 +2738,198 @@
 }
 
 /*
+ * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
+ * current state of the tree together with the operations recorded in the tree
+ * modification log to search for the key in a previous version of this tree, as
+ * denoted by the time_seq parameter.
+ *
+ * Naturally, there is no support for insert, delete or cow operations.
+ *
+ * The resulting path and return value will be set up as if we called
+ * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
+ */
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+			  struct btrfs_path *p, u64 time_seq)
+{
+	struct extent_buffer *b;
+	int slot;
+	int ret;
+	int err;
+	int level;
+	int lowest_unlock = 1;
+	u8 lowest_level = 0;
+	int prev_cmp;
+
+	lowest_level = p->lowest_level;
+	WARN_ON(p->nodes[0] != NULL);
+
+	if (p->search_commit_root) {
+		BUG_ON(time_seq);
+		return btrfs_search_slot(NULL, root, key, p, 0, 0);
+	}
+
+again:
+	prev_cmp = -1;
+	b = get_old_root(root, time_seq);
+	level = btrfs_header_level(b);
+	p->locks[level] = BTRFS_READ_LOCK;
+
+	while (b) {
+		level = btrfs_header_level(b);
+		p->nodes[level] = b;
+		btrfs_clear_path_blocking(p, NULL, 0);
+
+		/*
+		 * we have a lock on b and as long as we aren't changing
+		 * the tree, there is no way to for the items in b to change.
+		 * It is safe to drop the lock on our parent before we
+		 * go through the expensive btree search on b.
+		 */
+		btrfs_unlock_up_safe(p, level + 1);
+
+		ret = key_search(b, key, level, &prev_cmp, &slot);
+
+		if (level != 0) {
+			int dec = 0;
+			if (ret && slot > 0) {
+				dec = 1;
+				slot -= 1;
+			}
+			p->slots[level] = slot;
+			unlock_up(p, level, lowest_unlock, 0, NULL);
+
+			if (level == lowest_level) {
+				if (dec)
+					p->slots[level]++;
+				goto done;
+			}
+
+			err = read_block_for_search(NULL, root, p, &b, level,
+						    slot, key, time_seq);
+			if (err == -EAGAIN)
+				goto again;
+			if (err) {
+				ret = err;
+				goto done;
+			}
+
+			level = btrfs_header_level(b);
+			err = btrfs_try_tree_read_lock(b);
+			if (!err) {
+				btrfs_set_path_blocking(p);
+				btrfs_tree_read_lock(b);
+				btrfs_clear_path_blocking(p, b,
+							  BTRFS_READ_LOCK);
+			}
+			b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
+			if (!b) {
+				ret = -ENOMEM;
+				goto done;
+			}
+			p->locks[level] = BTRFS_READ_LOCK;
+			p->nodes[level] = b;
+		} else {
+			p->slots[level] = slot;
+			unlock_up(p, level, lowest_unlock, 0, NULL);
+			goto done;
+		}
+	}
+	ret = 1;
+done:
+	if (!p->leave_spinning)
+		btrfs_set_path_blocking(p);
+	if (ret < 0)
+		btrfs_release_path(p);
+
+	return ret;
+}
+
+/*
+ * helper to use instead of search slot if no exact match is needed but
+ * instead the next or previous item should be returned.
+ * When find_higher is true, the next higher item is returned, the next lower
+ * otherwise.
+ * When return_any and find_higher are both true, and no higher item is found,
+ * return the next lower instead.
+ * When return_any is true and find_higher is false, and no lower item is found,
+ * return the next higher instead.
+ * It returns 0 if any item is found, 1 if none is found (tree empty), and
+ * < 0 on error
+ */
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+			       struct btrfs_key *key, struct btrfs_path *p,
+			       int find_higher, int return_any)
+{
+	int ret;
+	struct extent_buffer *leaf;
+
+again:
+	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
+	if (ret <= 0)
+		return ret;
+	/*
+	 * a return value of 1 means the path is at the position where the
+	 * item should be inserted. Normally this is the next bigger item,
+	 * but in case the previous item is the last in a leaf, path points
+	 * to the first free slot in the previous leaf, i.e. at an invalid
+	 * item.
+	 */
+	leaf = p->nodes[0];
+
+	if (find_higher) {
+		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, p);
+			if (ret <= 0)
+				return ret;
+			if (!return_any)
+				return 1;
+			/*
+			 * no higher item found, return the next
+			 * lower instead
+			 */
+			return_any = 0;
+			find_higher = 0;
+			btrfs_release_path(p);
+			goto again;
+		}
+	} else {
+		if (p->slots[0] == 0) {
+			ret = btrfs_prev_leaf(root, p);
+			if (ret < 0)
+				return ret;
+			if (!ret) {
+				p->slots[0] = btrfs_header_nritems(leaf) - 1;
+				return 0;
+			}
+			if (!return_any)
+				return 1;
+			/*
+			 * no lower item found, return the next
+			 * higher instead
+			 */
+			return_any = 0;
+			find_higher = 1;
+			btrfs_release_path(p);
+			goto again;
+		} else {
+			--p->slots[0];
+		}
+	}
+	return 0;
+}
+
+/*
  * adjust the pointers going up the tree, starting at level
  * making sure the right key of each node is points to 'key'.
  * This is used after shifting pointers to the left, so it stops
  * fixing up pointers when a given leaf/node is not in slot 0 of the
  * higher levels
  *
- * If this fails to write a tree block, it returns -1, but continues
- * fixing up the blocks in ram so the tree is consistent.
  */
-static int fixup_low_keys(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, struct btrfs_path *path,
-			  struct btrfs_disk_key *key, int level)
+static void fixup_low_keys(struct btrfs_root *root, struct btrfs_path *path,
+			   struct btrfs_disk_key *key, int level)
 {
 	int i;
-	int ret = 0;
 	struct extent_buffer *t;
 
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
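
btrfs_search_slot_for_read() above wraps btrfs_search_slot() for callers that want the nearest item rather than an exact match: with find_higher set it returns the next higher item, otherwise the next lower one, and with return_any set it falls back to the opposite direction instead of reporting failure. The same contract on a sorted array, as a self-contained user-space sketch (names and the linear scan are illustrative only):

#include <stdio.h>

/*
 * Return 0 and *slot for 'key' if present; otherwise for the next higher
 * (find_higher=1) or next lower entry. With return_any set, fall back to
 * the other direction instead of failing. Returns 1 if nothing suitable
 * exists.
 */
static int search_for_read(const int *keys, int nr, int key,
			   int find_higher, int return_any, int *slot)
{
	int i;

	for (i = 0; i < nr; i++) {
		if (keys[i] == key) {
			*slot = i;
			return 0;
		}
		if (keys[i] > key)
			break;
	}
	/* i is now the position of the next higher entry (or nr if none) */
	if (find_higher) {
		if (i < nr) {
			*slot = i;
			return 0;
		}
		if (return_any && nr) {		/* no higher: take the next lower */
			*slot = nr - 1;
			return 0;
		}
	} else {
		if (i > 0) {
			*slot = i - 1;
			return 0;
		}
		if (return_any && nr) {		/* no lower: take the next higher */
			*slot = 0;
			return 0;
		}
	}
	return 1;
}

int main(void)
{
	const int keys[] = { 10, 20, 30 };
	int slot;

	if (!search_for_read(keys, 3, 25, 1, 0, &slot))
		printf("next higher than 25 is %d\n", keys[slot]);	/* 30 */
	if (!search_for_read(keys, 3, 35, 1, 1, &slot))
		printf("fallback for 35 is %d\n", keys[slot]);		/* 30 */
	return 0;
}
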
@@ -1897,12 +2937,12 @@
 		if (!path->nodes[i])
 			break;
 		t = path->nodes[i];
+		tree_mod_log_set_node_key(root->fs_info, t, tslot, 1);
 		btrfs_set_node_key(t, key, tslot);
 		btrfs_mark_buffer_dirty(path->nodes[i]);
 		if (tslot != 0)
 			break;
 	}
-	return ret;
 }
 
 /*
@@ -1911,9 +2951,8 @@
  * This function isn't completely safe. It's the caller's responsibility
  * that the new key won't break the order
  */
-int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, struct btrfs_path *path,
-			    struct btrfs_key *new_key)
+void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+			     struct btrfs_key *new_key)
 {
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *eb;
@@ -1923,21 +2962,18 @@
 	slot = path->slots[0];
 	if (slot > 0) {
 		btrfs_item_key(eb, &disk_key, slot - 1);
-		if (comp_keys(&disk_key, new_key) >= 0)
-			return -1;
+		BUG_ON(comp_keys(&disk_key, new_key) >= 0);
 	}
 	if (slot < btrfs_header_nritems(eb) - 1) {
 		btrfs_item_key(eb, &disk_key, slot + 1);
-		if (comp_keys(&disk_key, new_key) <= 0)
-			return -1;
+		BUG_ON(comp_keys(&disk_key, new_key) <= 0);
 	}
 
 	btrfs_cpu_key_to_disk(&disk_key, new_key);
 	btrfs_set_item_key(eb, &disk_key, slot);
 	btrfs_mark_buffer_dirty(eb);
 	if (slot == 0)
-		fixup_low_keys(trans, root, path, &disk_key, 1);
-	return 0;
+		fixup_low_keys(root, path, &disk_key, 1);
 }
 
 /*
@@ -1983,12 +3019,18 @@
 	} else
 		push_items = min(src_nritems - 8, push_items);
 
+	tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
+			     push_items);
 	copy_extent_buffer(dst, src,
 			   btrfs_node_key_ptr_offset(dst_nritems),
 			   btrfs_node_key_ptr_offset(0),
 			   push_items * sizeof(struct btrfs_key_ptr));
 
 	if (push_items < src_nritems) {
+		/*
+		 * don't call tree_mod_log_eb_move here, key removal was already
+		 * fully logged by tree_mod_log_eb_copy above.
+		 */
 		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
 				      btrfs_node_key_ptr_offset(push_items),
 				      (src_nritems - push_items) *
@@ -2042,11 +3084,14 @@
 	if (max_push < push_items)
 		push_items = max_push;
 
+	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
 	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
 				      btrfs_node_key_ptr_offset(0),
 				      (dst_nritems) *
 				      sizeof(struct btrfs_key_ptr));
 
+	tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
+			     src_nritems - push_items, push_items);
 	copy_extent_buffer(dst, src,
 			   btrfs_node_key_ptr_offset(0),
 			   btrfs_node_key_ptr_offset(src_nritems - push_items),
@@ -2103,13 +3148,11 @@
 	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(c, root->root_key.objectid);
 
-	write_extent_buffer(c, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(c),
+	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(c),
 			    BTRFS_FSID_SIZE);
 
 	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
-			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
-			    BTRFS_UUID_SIZE);
+			    btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
 
 	btrfs_set_node_key(c, &lower_key, 0);
 	btrfs_set_node_blockptr(c, 0, lower->start);
@@ -2121,6 +3164,7 @@
 	btrfs_mark_buffer_dirty(c);
 
 	old = root->node;
+	tree_mod_log_set_root_pointer(root, c, 0);
 	rcu_assign_pointer(root->node, c);
 
 	/* the super has an extra ref to root->node */
@@ -2140,36 +3184,42 @@
  *
  * slot and level indicate where you want the key to go, and
  * blocknr is the block the key points to.
- *
- * returns zero on success and < 0 on any error
  */
-static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct btrfs_path *path, struct btrfs_disk_key
-		      *key, u64 bytenr, int slot, int level)
+static void insert_ptr(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct btrfs_path *path,
+		       struct btrfs_disk_key *key, u64 bytenr,
+		       int slot, int level)
 {
 	struct extent_buffer *lower;
 	int nritems;
+	int ret;
 
 	BUG_ON(!path->nodes[level]);
 	btrfs_assert_tree_locked(path->nodes[level]);
 	lower = path->nodes[level];
 	nritems = btrfs_header_nritems(lower);
 	BUG_ON(slot > nritems);
-	if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root))
-		BUG();
+	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
 	if (slot != nritems) {
+		if (level)
+			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
+					     slot, nritems - slot);
 		memmove_extent_buffer(lower,
 			      btrfs_node_key_ptr_offset(slot + 1),
 			      btrfs_node_key_ptr_offset(slot),
 			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
 	}
+	if (level) {
+		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
+					      MOD_LOG_KEY_ADD, GFP_NOFS);
+		BUG_ON(ret < 0);
+	}
 	btrfs_set_node_key(lower, key, slot);
 	btrfs_set_node_blockptr(lower, slot, bytenr);
 	WARN_ON(trans->transid == 0);
 	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
 	btrfs_set_header_nritems(lower, nritems + 1);
 	btrfs_mark_buffer_dirty(lower);
-	return 0;
 }
 
 /*
@@ -2190,13 +3240,21 @@
 	struct btrfs_disk_key disk_key;
 	int mid;
 	int ret;
-	int wret;
 	u32 c_nritems;
 
 	c = path->nodes[level];
 	WARN_ON(btrfs_header_generation(c) != trans->transid);
 	if (c == root->node) {
-		/* trying to split the root, lets make a new one */
+		/*
+		 * trying to split the root, let's make a new one
+		 *
+		 * tree mod log: we don't log removal of the old root in
+		 * insert_new_root, because that root buffer will be kept as a
+		 * normal node. We are going to log removal of half of the
+		 * elements below with tree_mod_log_eb_copy. We're holding a
+		 * tree lock on the buffer, which is why we cannot race with
+		 * other tree_mod_log users.
+		 */
 		ret = insert_new_root(trans, root, path, level + 1);
 		if (ret)
 			return ret;
@@ -2229,13 +3287,12 @@
 	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(split, root->root_key.objectid);
 	write_extent_buffer(split, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(split),
-			    BTRFS_FSID_SIZE);
+			    btrfs_header_fsid(split), BTRFS_FSID_SIZE);
 	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
-			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
+			    btrfs_header_chunk_tree_uuid(split),
 			    BTRFS_UUID_SIZE);
 
-
+	tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
 	copy_extent_buffer(split, c,
 			   btrfs_node_key_ptr_offset(0),
 			   btrfs_node_key_ptr_offset(mid),
@@ -2247,11 +3304,8 @@
 	btrfs_mark_buffer_dirty(c);
 	btrfs_mark_buffer_dirty(split);
 
-	wret = insert_ptr(trans, root, path, &disk_key, split->start,
-			  path->slots[level + 1] + 1,
-			  level + 1);
-	if (wret)
-		ret = wret;
+	insert_ptr(trans, root, path, &disk_key, split->start,
+		   path->slots[level + 1] + 1, level + 1);
 
 	if (path->slots[level] >= mid) {
 		path->slots[level] -= mid;
@@ -2273,14 +3327,21 @@
  */
 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
 {
+	struct btrfs_item *start_item;
+	struct btrfs_item *end_item;
+	struct btrfs_map_token token;
 	int data_len;
 	int nritems = btrfs_header_nritems(l);
 	int end = min(nritems, start + nr) - 1;
 
 	if (!nr)
 		return 0;
-	data_len = btrfs_item_end_nr(l, start);
-	data_len = data_len - btrfs_item_offset_nr(l, end);
+	btrfs_init_map_token(&token);
+	start_item = btrfs_item_nr(l, start);
+	end_item = btrfs_item_nr(l, end);
+	data_len = btrfs_token_item_offset(l, start_item, &token) +
+		btrfs_token_item_size(l, start_item, &token);
+	data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
 	data_len += sizeof(struct btrfs_item) * nr;
 	WARN_ON(data_len < 0);
 	return data_len;
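
The token conversion above is the pattern used throughout this patch: a btrfs_map_token caches the currently mapped extent-buffer page, so a loop that reads many item offsets and sizes avoids a map/unmap round trip per field. A minimal sketch of the same idiom, assuming the accessors shown in this hunk (the helper name is illustrative, not part of the patch):

static u32 items_total_size(struct extent_buffer *l, int start, int nr)
{
	struct btrfs_map_token token;
	u32 total = 0;
	int i;

	/* one token for the whole loop keeps the page mapping cached */
	btrfs_init_map_token(&token);
	for (i = start; i < start + nr; i++) {
		struct btrfs_item *item = btrfs_item_nr(l, i);

		total += btrfs_token_item_size(l, item, &token);
	}
	return total;
}
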
@@ -2320,6 +3381,7 @@
 {
 	struct extent_buffer *left = path->nodes[0];
 	struct extent_buffer *upper = path->nodes[1];
+	struct btrfs_map_token token;
 	struct btrfs_disk_key disk_key;
 	int slot;
 	u32 i;
@@ -2331,6 +3393,8 @@
 	u32 data_end;
 	u32 this_item_size;
 
+	btrfs_init_map_token(&token);
+
 	if (empty)
 		nr = 0;
 	else
@@ -2371,8 +3435,7 @@
 	if (push_items == 0)
 		goto out_unlock;
 
-	if (!empty && push_items == left_nritems)
-		WARN_ON(1);
+	WARN_ON(!empty && push_items == left_nritems);
 
 	/* push left to right */
 	right_nritems = btrfs_header_nritems(right);
@@ -2408,8 +3471,8 @@
 	push_space = BTRFS_LEAF_DATA_SIZE(root);
 	for (i = 0; i < right_nritems; i++) {
 		item = btrfs_item_nr(right, i);
-		push_space -= btrfs_item_size(right, item);
-		btrfs_set_item_offset(right, item, push_space);
+		push_space -= btrfs_token_item_size(right, item, &token);
+		btrfs_set_token_item_offset(right, item, push_space, &token);
 	}
 
 	left_nritems -= push_items;
@@ -2537,9 +3600,11 @@
 	u32 old_left_nritems;
 	u32 nr;
 	int ret = 0;
-	int wret;
 	u32 this_item_size;
 	u32 old_left_item_size;
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
 
 	if (empty)
 		nr = min(right_nritems, max_slot);
@@ -2600,18 +3665,17 @@
 
 		item = btrfs_item_nr(left, i);
 
-		ioff = btrfs_item_offset(left, item);
-		btrfs_set_item_offset(left, item,
-		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size));
+		ioff = btrfs_token_item_offset(left, item, &token);
+		btrfs_set_token_item_offset(left, item,
+		      ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
+		      &token);
 	}
 	btrfs_set_header_nritems(left, old_left_nritems + push_items);
 
 	/* fixup right node */
-	if (push_items > right_nritems) {
-		printk(KERN_CRIT "push items %d nr %u\n", push_items,
+	if (push_items > right_nritems)
+		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
 		       right_nritems);
-		WARN_ON(1);
-	}
 
 	if (push_items < right_nritems) {
 		push_space = btrfs_item_offset_nr(right, push_items - 1) -
@@ -2632,8 +3696,9 @@
 	for (i = 0; i < right_nritems; i++) {
 		item = btrfs_item_nr(right, i);
 
-		push_space = push_space - btrfs_item_size(right, item);
-		btrfs_set_item_offset(right, item, push_space);
+		push_space = push_space - btrfs_token_item_size(right,
+								item, &token);
+		btrfs_set_token_item_offset(right, item, push_space, &token);
 	}
 
 	btrfs_mark_buffer_dirty(left);
@@ -2643,9 +3708,7 @@
 		clean_tree_block(trans, root, right);
 
 	btrfs_item_key(right, &disk_key, 0);
-	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
-	if (wret)
-		ret = wret;
+	fixup_low_keys(root, path, &disk_key, 1);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
@@ -2716,7 +3779,8 @@
 			      path->nodes[1], slot - 1, &left);
 	if (ret) {
 		/* we hit -ENOSPC, but it isn't fatal here */
-		ret = 1;
+		if (ret == -ENOSPC)
+			ret = 1;
 		goto out;
 	}
 
@@ -2738,22 +3802,21 @@
 /*
  * split the path's leaf in two, making sure there is at least data_size
  * available for the resulting leaf level of the path.
- *
- * returns 0 if all went well and < 0 on failure.
  */
-static noinline int copy_for_split(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       struct btrfs_path *path,
-			       struct extent_buffer *l,
-			       struct extent_buffer *right,
-			       int slot, int mid, int nritems)
+static noinline void copy_for_split(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    struct extent_buffer *l,
+				    struct extent_buffer *right,
+				    int slot, int mid, int nritems)
 {
 	int data_copy_size;
 	int rt_data_off;
 	int i;
-	int ret = 0;
-	int wret;
 	struct btrfs_disk_key disk_key;
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
 
 	nritems = nritems - mid;
 	btrfs_set_header_nritems(right, nritems);
@@ -2775,17 +3838,15 @@
 		struct btrfs_item *item = btrfs_item_nr(right, i);
 		u32 ioff;
 
-		ioff = btrfs_item_offset(right, item);
-		btrfs_set_item_offset(right, item, ioff + rt_data_off);
+		ioff = btrfs_token_item_offset(right, item, &token);
+		btrfs_set_token_item_offset(right, item,
+					    ioff + rt_data_off, &token);
 	}
 
 	btrfs_set_header_nritems(l, mid);
-	ret = 0;
 	btrfs_item_key(right, &disk_key, 0);
-	wret = insert_ptr(trans, root, path, &disk_key, right->start,
-			  path->slots[1] + 1, 1);
-	if (wret)
-		ret = wret;
+	insert_ptr(trans, root, path, &disk_key, right->start,
+		   path->slots[1] + 1, 1);
 
 	btrfs_mark_buffer_dirty(right);
 	btrfs_mark_buffer_dirty(l);
@@ -2803,8 +3864,6 @@
 	}
 
 	BUG_ON(path->slots[0] < 0);
-
-	return ret;
 }
 
 /*
@@ -2896,7 +3955,7 @@
 		return -EOVERFLOW;
 
 	/* first try to make some room by pushing left and right */
-	if (data_size) {
+	if (data_size && path->nodes[1]) {
 		wret = push_leaf_right(trans, root, path, data_size,
 				       data_size, 0, 0);
 		if (wret < 0)
@@ -2983,22 +4042,17 @@
 	btrfs_set_header_owner(right, root->root_key.objectid);
 	btrfs_set_header_level(right, 0);
 	write_extent_buffer(right, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(right),
-			    BTRFS_FSID_SIZE);
+			    btrfs_header_fsid(right), BTRFS_FSID_SIZE);
 
 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
-			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
+			    btrfs_header_chunk_tree_uuid(right),
 			    BTRFS_UUID_SIZE);
 
 	if (split == 0) {
 		if (mid <= slot) {
 			btrfs_set_header_nritems(right, 0);
-			wret = insert_ptr(trans, root, path,
-					  &disk_key, right->start,
-					  path->slots[1] + 1, 1);
-			if (wret)
-				ret = wret;
-
+			insert_ptr(trans, root, path, &disk_key, right->start,
+				   path->slots[1] + 1, 1);
 			btrfs_tree_unlock(path->nodes[0]);
 			free_extent_buffer(path->nodes[0]);
 			path->nodes[0] = right;
@@ -3006,29 +4060,20 @@
 			path->slots[1] += 1;
 		} else {
 			btrfs_set_header_nritems(right, 0);
-			wret = insert_ptr(trans, root, path,
-					  &disk_key,
-					  right->start,
+			insert_ptr(trans, root, path, &disk_key, right->start,
 					  path->slots[1], 1);
-			if (wret)
-				ret = wret;
 			btrfs_tree_unlock(path->nodes[0]);
 			free_extent_buffer(path->nodes[0]);
 			path->nodes[0] = right;
 			path->slots[0] = 0;
-			if (path->slots[1] == 0) {
-				wret = fixup_low_keys(trans, root,
-						path, &disk_key, 1);
-				if (wret)
-					ret = wret;
-			}
+			if (path->slots[1] == 0)
+				fixup_low_keys(root, path, &disk_key, 1);
 		}
 		btrfs_mark_buffer_dirty(right);
 		return ret;
 	}
 
-	ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
-	BUG_ON(ret);
+	copy_for_split(trans, root, path, l, right, slot, mid, nritems);
 
 	if (split == 2) {
 		BUG_ON(num_doubles != 0);
@@ -3036,7 +4081,7 @@
 		goto again;
 	}
 
-	return ret;
+	return 0;
 
 push_for_double:
 	push_for_double_split(trans, root, path, data_size);
@@ -3238,11 +4283,9 @@
 		return ret;
 
 	path->slots[0]++;
-	ret = setup_items_for_insert(trans, root, path, new_key, &item_size,
-				     item_size, item_size +
-				     sizeof(struct btrfs_item), 1);
-	BUG_ON(ret);
-
+	setup_items_for_insert(root, path, new_key, &item_size,
+			       item_size, item_size +
+			       sizeof(struct btrfs_item), 1);
 	leaf = path->nodes[0];
 	memcpy_extent_buffer(leaf,
 			     btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -3257,10 +4300,8 @@
  * off the end of the item or if we shift the item to chop bytes off
  * the front.
  */
-int btrfs_truncate_item(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
-			struct btrfs_path *path,
-			u32 new_size, int from_end)
+void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
+			 u32 new_size, int from_end)
 {
 	int slot;
 	struct extent_buffer *leaf;
@@ -3271,13 +4312,16 @@
 	unsigned int old_size;
 	unsigned int size_diff;
 	int i;
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
 
 	leaf = path->nodes[0];
 	slot = path->slots[0];
 
 	old_size = btrfs_item_size_nr(leaf, slot);
 	if (old_size == new_size)
-		return 0;
+		return;
 
 	nritems = btrfs_header_nritems(leaf);
 	data_end = leaf_data_end(root, leaf);
@@ -3297,8 +4341,9 @@
 		u32 ioff;
 		item = btrfs_item_nr(leaf, i);
 
-		ioff = btrfs_item_offset(leaf, item);
-		btrfs_set_item_offset(leaf, item, ioff + size_diff);
+		ioff = btrfs_token_item_offset(leaf, item, &token);
+		btrfs_set_token_item_offset(leaf, item,
+					    ioff + size_diff, &token);
 	}
 
 	/* shift the data */
@@ -3339,7 +4384,7 @@
 		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
 		btrfs_set_item_key(leaf, &disk_key, slot);
 		if (slot == 0)
-			fixup_low_keys(trans, root, path, &disk_key, 1);
+			fixup_low_keys(root, path, &disk_key, 1);
 	}
 
 	item = btrfs_item_nr(leaf, slot);
@@ -3350,15 +4395,13 @@
 		btrfs_print_leaf(root, leaf);
 		BUG();
 	}
-	return 0;
 }
 
 /*
- * make the item pointed to by the path bigger, data_size is the new size.
+ * make the item pointed to by the path bigger, data_size is the added size.
  */
-int btrfs_extend_item(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root, struct btrfs_path *path,
-		      u32 data_size)
+void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
+		       u32 data_size)
 {
 	int slot;
 	struct extent_buffer *leaf;
@@ -3368,6 +4411,9 @@
 	unsigned int old_data;
 	unsigned int old_size;
 	int i;
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
 
 	leaf = path->nodes[0];
 
@@ -3397,8 +4443,9 @@
 		u32 ioff;
 		item = btrfs_item_nr(leaf, i);
 
-		ioff = btrfs_item_offset(leaf, item);
-		btrfs_set_item_offset(leaf, item, ioff - data_size);
+		ioff = btrfs_token_item_offset(leaf, item, &token);
+		btrfs_set_token_item_offset(leaf, item,
+					    ioff - data_size, &token);
 	}
 
 	/* shift the data */
@@ -3416,145 +4463,6 @@
 		btrfs_print_leaf(root, leaf);
 		BUG();
 	}
-	return 0;
-}
-
-/*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
- * Returns the number of keys that were inserted.
- */
-int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    struct btrfs_path *path,
-			    struct btrfs_key *cpu_key, u32 *data_size,
-			    int nr)
-{
-	struct extent_buffer *leaf;
-	struct btrfs_item *item;
-	int ret = 0;
-	int slot;
-	int i;
-	u32 nritems;
-	u32 total_data = 0;
-	u32 total_size = 0;
-	unsigned int data_end;
-	struct btrfs_disk_key disk_key;
-	struct btrfs_key found_key;
-
-	for (i = 0; i < nr; i++) {
-		if (total_size + data_size[i] + sizeof(struct btrfs_item) >
-		    BTRFS_LEAF_DATA_SIZE(root)) {
-			break;
-			nr = i;
-		}
-		total_data += data_size[i];
-		total_size += data_size[i] + sizeof(struct btrfs_item);
-	}
-	BUG_ON(nr == 0);
-
-	ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
-	if (ret == 0)
-		return -EEXIST;
-	if (ret < 0)
-		goto out;
-
-	leaf = path->nodes[0];
-
-	nritems = btrfs_header_nritems(leaf);
-	data_end = leaf_data_end(root, leaf);
-
-	if (btrfs_leaf_free_space(root, leaf) < total_size) {
-		for (i = nr; i >= 0; i--) {
-			total_data -= data_size[i];
-			total_size -= data_size[i] + sizeof(struct btrfs_item);
-			if (total_size < btrfs_leaf_free_space(root, leaf))
-				break;
-		}
-		nr = i;
-	}
-
-	slot = path->slots[0];
-	BUG_ON(slot < 0);
-
-	if (slot != nritems) {
-		unsigned int old_data = btrfs_item_end_nr(leaf, slot);
-
-		item = btrfs_item_nr(leaf, slot);
-		btrfs_item_key_to_cpu(leaf, &found_key, slot);
-
-		/* figure out how many keys we can insert in here */
-		total_data = data_size[0];
-		for (i = 1; i < nr; i++) {
-			if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
-				break;
-			total_data += data_size[i];
-		}
-		nr = i;
-
-		if (old_data < data_end) {
-			btrfs_print_leaf(root, leaf);
-			printk(KERN_CRIT "slot %d old_data %d data_end %d\n",
-			       slot, old_data, data_end);
-			BUG_ON(1);
-		}
-		/*
-		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
-		 */
-		/* first correct the data pointers */
-		for (i = slot; i < nritems; i++) {
-			u32 ioff;
-
-			item = btrfs_item_nr(leaf, i);
-			ioff = btrfs_item_offset(leaf, item);
-			btrfs_set_item_offset(leaf, item, ioff - total_data);
-		}
-		/* shift the items */
-		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
-			      btrfs_item_nr_offset(slot),
-			      (nritems - slot) * sizeof(struct btrfs_item));
-
-		/* shift the data */
-		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
-			      data_end - total_data, btrfs_leaf_data(leaf) +
-			      data_end, old_data - data_end);
-		data_end = old_data;
-	} else {
-		/*
-		 * this sucks but it has to be done, if we are inserting at
-		 * the end of the leaf only insert 1 of the items, since we
-		 * have no way of knowing whats on the next leaf and we'd have
-		 * to drop our current locks to figure it out
-		 */
-		nr = 1;
-	}
-
-	/* setup the item for the new data */
-	for (i = 0; i < nr; i++) {
-		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
-		btrfs_set_item_key(leaf, &disk_key, slot + i);
-		item = btrfs_item_nr(leaf, slot + i);
-		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
-		data_end -= data_size[i];
-		btrfs_set_item_size(leaf, item, data_size[i]);
-	}
-	btrfs_set_header_nritems(leaf, nritems + nr);
-	btrfs_mark_buffer_dirty(leaf);
-
-	ret = 0;
-	if (slot == 0) {
-		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
-		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
-	}
-
-	if (btrfs_leaf_free_space(root, leaf) < 0) {
-		btrfs_print_leaf(root, leaf);
-		BUG();
-	}
-out:
-	if (!ret)
-		ret = nr;
-	return ret;
 }
 
 /*
@@ -3562,19 +4470,20 @@
  * to save stack depth by doing the bulk of the work in a function
  * that doesn't call btrfs_search_slot
  */
-int setup_items_for_insert(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct btrfs_path *path,
-			   struct btrfs_key *cpu_key, u32 *data_size,
-			   u32 total_data, u32 total_size, int nr)
+void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+			    struct btrfs_key *cpu_key, u32 *data_size,
+			    u32 total_data, u32 total_size, int nr)
 {
 	struct btrfs_item *item;
 	int i;
 	u32 nritems;
 	unsigned int data_end;
 	struct btrfs_disk_key disk_key;
-	int ret;
 	struct extent_buffer *leaf;
 	int slot;
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
 
 	leaf = path->nodes[0];
 	slot = path->slots[0];
@@ -3606,8 +4515,9 @@
 			u32 ioff;
 
 			item = btrfs_item_nr(leaf, i);
-			ioff = btrfs_item_offset(leaf, item);
-			btrfs_set_item_offset(leaf, item, ioff - total_data);
+			ioff = btrfs_token_item_offset(leaf, item, &token);
+			btrfs_set_token_item_offset(leaf, item,
+						    ioff - total_data, &token);
 		}
 		/* shift the items */
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
@@ -3626,17 +4536,17 @@
 		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
 		btrfs_set_item_key(leaf, &disk_key, slot + i);
 		item = btrfs_item_nr(leaf, slot + i);
-		btrfs_set_item_offset(leaf, item, data_end - data_size[i]);
+		btrfs_set_token_item_offset(leaf, item,
+					    data_end - data_size[i], &token);
 		data_end -= data_size[i];
-		btrfs_set_item_size(leaf, item, data_size[i]);
+		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
 	}
 
 	btrfs_set_header_nritems(leaf, nritems + nr);
 
-	ret = 0;
 	if (slot == 0) {
 		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
-		ret = fixup_low_keys(trans, root, path, &disk_key, 1);
+		fixup_low_keys(root, path, &disk_key, 1);
 	}
 	btrfs_unlock_up_safe(path, 1);
 	btrfs_mark_buffer_dirty(leaf);
@@ -3645,7 +4555,6 @@
 		btrfs_print_leaf(root, leaf);
 		BUG();
 	}
-	return ret;
 }
 
 /*
@@ -3672,16 +4581,14 @@
 	if (ret == 0)
 		return -EEXIST;
 	if (ret < 0)
-		goto out;
+		return ret;
 
 	slot = path->slots[0];
 	BUG_ON(slot < 0);
 
-	ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+	setup_items_for_insert(root, path, cpu_key, data_size,
 			       total_data, total_size, nr);
-
-out:
-	return ret;
+	return 0;
 }
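
With setup_items_for_insert() split out and made void, btrfs_insert_empty_items() is reduced to a search plus setup and only propagates search errors. A hedged caller sketch for the common single-item case (helper name, key and payload are illustrative):

static int insert_one_item(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root, struct btrfs_path *path,
			   struct btrfs_key *key, const void *data, u32 size)
{
	struct extent_buffer *leaf;
	int ret;

	ret = btrfs_insert_empty_item(trans, root, path, key, size);
	if (ret)
		return ret;	/* -EEXIST or a real error */

	leaf = path->nodes[0];
	write_extent_buffer(leaf, data,
			    btrfs_item_ptr_offset(leaf, path->slots[0]), size);
	btrfs_mark_buffer_dirty(leaf);
	return 0;
}
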
 
 /*
@@ -3717,22 +4624,29 @@
  * the tree should have been previously balanced so the deletion does not
  * empty a node.
  */
-static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		   struct btrfs_path *path, int level, int slot)
+static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
+		    int level, int slot)
 {
 	struct extent_buffer *parent = path->nodes[level];
 	u32 nritems;
-	int ret = 0;
-	int wret;
+	int ret;
 
 	nritems = btrfs_header_nritems(parent);
 	if (slot != nritems - 1) {
+		if (level)
+			tree_mod_log_eb_move(root->fs_info, parent, slot,
+					     slot + 1, nritems - slot - 1);
 		memmove_extent_buffer(parent,
 			      btrfs_node_key_ptr_offset(slot),
 			      btrfs_node_key_ptr_offset(slot + 1),
 			      sizeof(struct btrfs_key_ptr) *
 			      (nritems - slot - 1));
+	} else if (level) {
+		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
+					      MOD_LOG_KEY_REMOVE, GFP_NOFS);
+		BUG_ON(ret < 0);
 	}
+
 	nritems--;
 	btrfs_set_header_nritems(parent, nritems);
 	if (nritems == 0 && parent == root->node) {
@@ -3743,12 +4657,9 @@
 		struct btrfs_disk_key disk_key;
 
 		btrfs_node_key(parent, &disk_key, 0);
-		wret = fixup_low_keys(trans, root, path, &disk_key, level + 1);
-		if (wret)
-			ret = wret;
+		fixup_low_keys(root, path, &disk_key, level + 1);
 	}
 	btrfs_mark_buffer_dirty(parent);
-	return ret;
 }
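
del_ptr() and insert_ptr() now record node changes in the tree modification log before touching the key pointers: a range that slides is logged as a move, a single key added or dropped in place is logged as one key operation, and leaves (level 0) are not logged at all. A hedged sketch of that rule for the removal case (the helper is illustrative only):

static void log_node_key_removal(struct btrfs_fs_info *fs_info,
				 struct extent_buffer *node,
				 int slot, int nritems)
{
	int ret;

	if (slot != nritems - 1) {
		/* the keys after 'slot' slide down: log it as a move */
		tree_mod_log_eb_move(fs_info, node, slot, slot + 1,
				     nritems - slot - 1);
	} else {
		/* nothing moves: log the single key removal */
		ret = tree_mod_log_insert_key(fs_info, node, slot,
					      MOD_LOG_KEY_REMOVE, GFP_NOFS);
		BUG_ON(ret < 0);
	}
}
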
 
 /*
@@ -3761,17 +4672,13 @@
  * The path must have already been setup for deleting the leaf, including
  * all the proper balancing.  path->nodes[1] must be locked.
  */
-static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
-				   struct btrfs_root *root,
-				   struct btrfs_path *path,
-				   struct extent_buffer *leaf)
+static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    struct extent_buffer *leaf)
 {
-	int ret;
-
 	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
-	ret = del_ptr(trans, root, path, 1, path->slots[1]);
-	if (ret)
-		return ret;
+	del_ptr(root, path, 1, path->slots[1]);
 
 	/*
 	 * btrfs_free_extent is expensive, we want to make sure we
@@ -3781,8 +4688,9 @@
 
 	root_sub_used(root, leaf->len);
 
+	extent_buffer_get(leaf);
 	btrfs_free_tree_block(trans, root, leaf, 0, 1);
-	return 0;
+	free_extent_buffer_stale(leaf);
 }
 /*
  * delete the item at the leaf level in path.  If that empties
@@ -3799,6 +4707,9 @@
 	int wret;
 	int i;
 	u32 nritems;
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
 
 	leaf = path->nodes[0];
 	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
@@ -3820,8 +4731,9 @@
 			u32 ioff;
 
 			item = btrfs_item_nr(leaf, i);
-			ioff = btrfs_item_offset(leaf, item);
-			btrfs_set_item_offset(leaf, item, ioff + dsize);
+			ioff = btrfs_token_item_offset(leaf, item, &token);
+			btrfs_set_token_item_offset(leaf, item,
+						    ioff + dsize, &token);
 		}
 
 		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
@@ -3839,8 +4751,7 @@
 		} else {
 			btrfs_set_path_blocking(path);
 			clean_tree_block(trans, root, leaf);
-			ret = btrfs_del_leaf(trans, root, path, leaf);
-			BUG_ON(ret);
+			btrfs_del_leaf(trans, root, path, leaf);
 		}
 	} else {
 		int used = leaf_space_used(leaf, 0, nritems);
@@ -3848,10 +4759,7 @@
 			struct btrfs_disk_key disk_key;
 
 			btrfs_item_key(leaf, &disk_key, 0);
-			wret = fixup_low_keys(trans, root, path,
-					      &disk_key, 1);
-			if (wret)
-				ret = wret;
+			fixup_low_keys(root, path, &disk_key, 1);
 		}
 
 		/* delete the leaf if it is mostly empty */
@@ -3879,9 +4787,9 @@
 
 			if (btrfs_header_nritems(leaf) == 0) {
 				path->slots[1] = slot;
-				ret = btrfs_del_leaf(trans, root, path, leaf);
-				BUG_ON(ret);
+				btrfs_del_leaf(trans, root, path, leaf);
 				free_extent_buffer(leaf);
+				ret = 0;
 			} else {
 				/* if we're still in the path, make sure
 				 * we're dirty.  Otherwise, one of the
@@ -3907,7 +4815,7 @@
  * This may release the path, and so you may lose any locks held at the
  * time you call it.
  */
-int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
+static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
 	struct btrfs_key key;
 	struct btrfs_disk_key found_key;
@@ -3937,8 +4845,8 @@
 
 /*
  * A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are either in cache or have a minimum
- * transaction id.  This is used by the btree defrag code, and tree logging
+ * for nodes or leaves that are have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
  *
  * This does not cow, but it does stuff the starting key it finds back
  * into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -3959,7 +4867,7 @@
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans)
 {
 	struct extent_buffer *cur;
@@ -3999,15 +4907,12 @@
 		if (sret && slot > 0)
 			slot--;
 		/*
-		 * check this node pointer against the cache_only and
-		 * min_trans parameters.  If it isn't in cache or is too
-		 * old, skip to the next one.
+		 * check this node pointer against the min_trans parameter.
+		 * If it is too old, skip to the next one.
 		 */
 		while (slot < nritems) {
 			u64 blockptr;
 			u64 gen;
-			struct extent_buffer *tmp;
-			struct btrfs_disk_key disk_key;
 
 			blockptr = btrfs_node_blockptr(cur, slot);
 			gen = btrfs_node_ptr_generation(cur, slot);
@@ -4015,27 +4920,7 @@
 				slot++;
 				continue;
 			}
-			if (!cache_only)
-				break;
-
-			if (max_key) {
-				btrfs_node_key(cur, &disk_key, slot);
-				if (comp_keys(&disk_key, max_key) >= 0) {
-					ret = 1;
-					goto out;
-				}
-			}
-
-			tmp = btrfs_find_tree_block(root, blockptr,
-					    btrfs_level_size(root, level - 1));
-
-			if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
-				free_extent_buffer(tmp);
-				break;
-			}
-			if (tmp)
-				free_extent_buffer(tmp);
-			slot++;
+			break;
 		}
 find_next_key:
 		/*
@@ -4046,7 +4931,7 @@
 			path->slots[level] = slot;
 			btrfs_set_path_blocking(path);
 			sret = btrfs_find_next_key(root, path, min_key, level,
-						  cache_only, min_trans);
+						  min_trans);
 			if (sret == 0) {
 				btrfs_release_path(path);
 				goto again;
@@ -4059,18 +4944,18 @@
 		path->slots[level] = slot;
 		if (level == path->lowest_level) {
 			ret = 0;
-			unlock_up(path, level, 1);
+			unlock_up(path, level, 1, 0, NULL);
 			goto out;
 		}
 		btrfs_set_path_blocking(path);
 		cur = read_node_slot(root, cur, slot);
-		BUG_ON(!cur);
+		BUG_ON(!cur); /* -ENOMEM */
 
 		btrfs_tree_read_lock(cur);
 
 		path->locks[level - 1] = BTRFS_READ_LOCK;
 		path->nodes[level - 1] = cur;
-		unlock_up(path, level, 1);
+		unlock_up(path, level, 1, 0, NULL);
 		btrfs_clear_path_blocking(path, NULL, 0);
 	}
 out:
@@ -4080,11 +4965,439 @@
 	return ret;
 }
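
With cache_only gone, btrfs_search_forward() is keyed purely on min_trans: it returns the first key at or after min_key that lives in a subtree written by a newer transaction, stuffing that key back into min_key. A hedged caller sketch (helper name and the wide-open max_key are illustrative):

static int find_first_newer(struct btrfs_root *root, u64 min_trans,
			    struct btrfs_key *found)
{
	struct btrfs_path *path;
	struct btrfs_key min_key = {0};
	struct btrfs_key max_key = { .objectid = (u64)-1, .type = (u8)-1,
				     .offset = (u64)-1 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	ret = btrfs_search_forward(root, &min_key, &max_key, path, min_trans);
	if (ret == 0)
		*found = min_key;	/* updated to the key that was hit */
	btrfs_free_path(path);
	return ret;	/* 0 = found, 1 = nothing newer, < 0 = error */
}
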
 
+static void tree_move_down(struct btrfs_root *root,
+			   struct btrfs_path *path,
+			   int *level, int root_level)
+{
+	BUG_ON(*level == 0);
+	path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
+					path->slots[*level]);
+	path->slots[*level - 1] = 0;
+	(*level)--;
+}
+
+static int tree_move_next_or_upnext(struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    int *level, int root_level)
+{
+	int ret = 0;
+	int nritems;
+	nritems = btrfs_header_nritems(path->nodes[*level]);
+
+	path->slots[*level]++;
+
+	while (path->slots[*level] >= nritems) {
+		if (*level == root_level)
+			return -1;
+
+		/* move upnext */
+		path->slots[*level] = 0;
+		free_extent_buffer(path->nodes[*level]);
+		path->nodes[*level] = NULL;
+		(*level)++;
+		path->slots[*level]++;
+
+		nritems = btrfs_header_nritems(path->nodes[*level]);
+		ret = 1;
+	}
+	return ret;
+}
+
+/*
+ * Returns 1 if it had to move up and next. 0 is returned if it moved only next
+ * or down.
+ */
+static int tree_advance(struct btrfs_root *root,
+			struct btrfs_path *path,
+			int *level, int root_level,
+			int allow_down,
+			struct btrfs_key *key)
+{
+	int ret;
+
+	if (*level == 0 || !allow_down) {
+		ret = tree_move_next_or_upnext(root, path, level, root_level);
+	} else {
+		tree_move_down(root, path, level, root_level);
+		ret = 0;
+	}
+	if (ret >= 0) {
+		if (*level == 0)
+			btrfs_item_key_to_cpu(path->nodes[*level], key,
+					path->slots[*level]);
+		else
+			btrfs_node_key_to_cpu(path->nodes[*level], key,
+					path->slots[*level]);
+	}
+	return ret;
+}
+
+static int tree_compare_item(struct btrfs_root *left_root,
+			     struct btrfs_path *left_path,
+			     struct btrfs_path *right_path,
+			     char *tmp_buf)
+{
+	int cmp;
+	int len1, len2;
+	unsigned long off1, off2;
+
+	len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
+	len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
+	if (len1 != len2)
+		return 1;
+
+	off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
+	off2 = btrfs_item_ptr_offset(right_path->nodes[0],
+				right_path->slots[0]);
+
+	read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
+
+	cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
+	if (cmp)
+		return 1;
+	return 0;
+}
+
+#define ADVANCE 1
+#define ADVANCE_ONLY_NEXT -1
+
+/*
+ * This function compares two trees and calls the provided callback for
+ * every changed/new/deleted item it finds.
+ * If shared tree blocks are encountered, whole subtrees are skipped, making
+ * the compare pretty fast on snapshotted subvolumes.
+ *
+ * This currently works on commit roots only. As commit roots are read only,
+ * we don't do any locking. The commit roots are protected with transactions.
+ * Transactions are ended and rejoined when a commit is tried in between.
+ *
+ * This function checks for modifications done to the trees while comparing.
+ * If it detects a change, it aborts immediately.
+ */
+int btrfs_compare_trees(struct btrfs_root *left_root,
+			struct btrfs_root *right_root,
+			btrfs_changed_cb_t changed_cb, void *ctx)
+{
+	int ret;
+	int cmp;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_path *left_path = NULL;
+	struct btrfs_path *right_path = NULL;
+	struct btrfs_key left_key;
+	struct btrfs_key right_key;
+	char *tmp_buf = NULL;
+	int left_root_level;
+	int right_root_level;
+	int left_level;
+	int right_level;
+	int left_end_reached;
+	int right_end_reached;
+	int advance_left;
+	int advance_right;
+	u64 left_blockptr;
+	u64 right_blockptr;
+	u64 left_start_ctransid;
+	u64 right_start_ctransid;
+	u64 ctransid;
+
+	left_path = btrfs_alloc_path();
+	if (!left_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	right_path = btrfs_alloc_path();
+	if (!right_path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS);
+	if (!tmp_buf) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	left_path->search_commit_root = 1;
+	left_path->skip_locking = 1;
+	right_path->search_commit_root = 1;
+	right_path->skip_locking = 1;
+
+	spin_lock(&left_root->root_item_lock);
+	left_start_ctransid = btrfs_root_ctransid(&left_root->root_item);
+	spin_unlock(&left_root->root_item_lock);
+
+	spin_lock(&right_root->root_item_lock);
+	right_start_ctransid = btrfs_root_ctransid(&right_root->root_item);
+	spin_unlock(&right_root->root_item_lock);
+
+	trans = btrfs_join_transaction(left_root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		trans = NULL;
+		goto out;
+	}
+
+	/*
+	 * Strategy: Go to the first items of both trees. Then do
+	 *
+	 * If both trees are at level 0
+	 *   Compare keys of current items
+	 *     If left < right treat left item as new, advance left tree
+	 *       and repeat
+	 *     If left > right treat right item as deleted, advance right tree
+	 *       and repeat
+	 *     If left == right do deep compare of items, treat as changed if
+	 *       needed, advance both trees and repeat
+	 * If both trees are at the same level but not at level 0
+	 *   Compare keys of current nodes/leafs
+	 *     If left < right advance left tree and repeat
+	 *     If left > right advance right tree and repeat
+	 *     If left == right compare blockptrs of the next nodes/leafs
+	 *       If they match advance both trees but stay at the same level
+	 *         and repeat
+	 *       If they don't match advance both trees while allowing to go
+	 *         deeper and repeat
+	 * If tree levels are different
+	 *   Advance the tree that needs it and repeat
+	 *
+	 * Advancing a tree means:
+	 *   If we are at level 0, try to go to the next slot. If that's not
+	 *   possible, go one level up and repeat. Stop when we find a level
+	 *   where we can go to the next slot. We may at this point be on a
+	 *   node or a leaf.
+	 *
+	 *   If we are not at level 0 and not on shared tree blocks, go one
+	 *   level deeper.
+	 *
+	 *   If we are not at level 0 and on shared tree blocks, go one slot to
+	 *   the right if possible or go up and right.
+	 */
+
+	left_level = btrfs_header_level(left_root->commit_root);
+	left_root_level = left_level;
+	left_path->nodes[left_level] = left_root->commit_root;
+	extent_buffer_get(left_path->nodes[left_level]);
+
+	right_level = btrfs_header_level(right_root->commit_root);
+	right_root_level = right_level;
+	right_path->nodes[right_level] = right_root->commit_root;
+	extent_buffer_get(right_path->nodes[right_level]);
+
+	if (left_level == 0)
+		btrfs_item_key_to_cpu(left_path->nodes[left_level],
+				&left_key, left_path->slots[left_level]);
+	else
+		btrfs_node_key_to_cpu(left_path->nodes[left_level],
+				&left_key, left_path->slots[left_level]);
+	if (right_level == 0)
+		btrfs_item_key_to_cpu(right_path->nodes[right_level],
+				&right_key, right_path->slots[right_level]);
+	else
+		btrfs_node_key_to_cpu(right_path->nodes[right_level],
+				&right_key, right_path->slots[right_level]);
+
+	left_end_reached = right_end_reached = 0;
+	advance_left = advance_right = 0;
+
+	while (1) {
+		/*
+		 * We need to make sure the transaction does not get committed
+		 * while we do anything on commit roots. This means, we need to
+		 * join and leave transactions for every item that we process.
+		 */
+		if (trans && btrfs_should_end_transaction(trans, left_root)) {
+			btrfs_release_path(left_path);
+			btrfs_release_path(right_path);
+
+			ret = btrfs_end_transaction(trans, left_root);
+			trans = NULL;
+			if (ret < 0)
+				goto out;
+		}
+		/* now rejoin the transaction */
+		if (!trans) {
+			trans = btrfs_join_transaction(left_root);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				trans = NULL;
+				goto out;
+			}
+
+			spin_lock(&left_root->root_item_lock);
+			ctransid = btrfs_root_ctransid(&left_root->root_item);
+			spin_unlock(&left_root->root_item_lock);
+			if (ctransid != left_start_ctransid)
+				left_start_ctransid = 0;
+
+			spin_lock(&right_root->root_item_lock);
+			ctransid = btrfs_root_ctransid(&right_root->root_item);
+			spin_unlock(&right_root->root_item_lock);
+			if (ctransid != right_start_ctransid)
+				right_start_ctransid = 0;
+
+			if (!left_start_ctransid || !right_start_ctransid) {
+				WARN(1, KERN_WARNING
+					"btrfs: btrfs_compare_tree detected "
+					"a change in one of the trees while "
+					"iterating. This is probably a "
+					"bug.\n");
+				ret = -EIO;
+				goto out;
+			}
+
+			/*
+			 * the commit root may have changed, so start again
+			 * where we stopped
+			 */
+			left_path->lowest_level = left_level;
+			right_path->lowest_level = right_level;
+			ret = btrfs_search_slot(NULL, left_root,
+					&left_key, left_path, 0, 0);
+			if (ret < 0)
+				goto out;
+			ret = btrfs_search_slot(NULL, right_root,
+					&right_key, right_path, 0, 0);
+			if (ret < 0)
+				goto out;
+		}
+
+		if (advance_left && !left_end_reached) {
+			ret = tree_advance(left_root, left_path, &left_level,
+					left_root_level,
+					advance_left != ADVANCE_ONLY_NEXT,
+					&left_key);
+			if (ret < 0)
+				left_end_reached = ADVANCE;
+			advance_left = 0;
+		}
+		if (advance_right && !right_end_reached) {
+			ret = tree_advance(right_root, right_path, &right_level,
+					right_root_level,
+					advance_right != ADVANCE_ONLY_NEXT,
+					&right_key);
+			if (ret < 0)
+				right_end_reached = ADVANCE;
+			advance_right = 0;
+		}
+
+		if (left_end_reached && right_end_reached) {
+			ret = 0;
+			goto out;
+		} else if (left_end_reached) {
+			if (right_level == 0) {
+				ret = changed_cb(left_root, right_root,
+						left_path, right_path,
+						&right_key,
+						BTRFS_COMPARE_TREE_DELETED,
+						ctx);
+				if (ret < 0)
+					goto out;
+			}
+			advance_right = ADVANCE;
+			continue;
+		} else if (right_end_reached) {
+			if (left_level == 0) {
+				ret = changed_cb(left_root, right_root,
+						left_path, right_path,
+						&left_key,
+						BTRFS_COMPARE_TREE_NEW,
+						ctx);
+				if (ret < 0)
+					goto out;
+			}
+			advance_left = ADVANCE;
+			continue;
+		}
+
+		if (left_level == 0 && right_level == 0) {
+			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+			if (cmp < 0) {
+				ret = changed_cb(left_root, right_root,
+						left_path, right_path,
+						&left_key,
+						BTRFS_COMPARE_TREE_NEW,
+						ctx);
+				if (ret < 0)
+					goto out;
+				advance_left = ADVANCE;
+			} else if (cmp > 0) {
+				ret = changed_cb(left_root, right_root,
+						left_path, right_path,
+						&right_key,
+						BTRFS_COMPARE_TREE_DELETED,
+						ctx);
+				if (ret < 0)
+					goto out;
+				advance_right = ADVANCE;
+			} else {
+				enum btrfs_compare_tree_result cmp;
+
+				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
+				ret = tree_compare_item(left_root, left_path,
+						right_path, tmp_buf);
+				if (ret)
+					cmp = BTRFS_COMPARE_TREE_CHANGED;
+				else
+					cmp = BTRFS_COMPARE_TREE_SAME;
+				ret = changed_cb(left_root, right_root,
+						 left_path, right_path,
+						 &left_key, cmp, ctx);
+				if (ret < 0)
+					goto out;
+				advance_left = ADVANCE;
+				advance_right = ADVANCE;
+			}
+		} else if (left_level == right_level) {
+			cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+			if (cmp < 0) {
+				advance_left = ADVANCE;
+			} else if (cmp > 0) {
+				advance_right = ADVANCE;
+			} else {
+				left_blockptr = btrfs_node_blockptr(
+						left_path->nodes[left_level],
+						left_path->slots[left_level]);
+				right_blockptr = btrfs_node_blockptr(
+						right_path->nodes[right_level],
+						right_path->slots[right_level]);
+				if (left_blockptr == right_blockptr) {
+					/*
+					 * As we're on a shared block, don't
+					 * allow to go deeper.
+					 */
+					advance_left = ADVANCE_ONLY_NEXT;
+					advance_right = ADVANCE_ONLY_NEXT;
+				} else {
+					advance_left = ADVANCE;
+					advance_right = ADVANCE;
+				}
+			}
+		} else if (left_level < right_level) {
+			advance_right = ADVANCE;
+		} else {
+			advance_left = ADVANCE;
+		}
+	}
+
+out:
+	btrfs_free_path(left_path);
+	btrfs_free_path(right_path);
+	kfree(tmp_buf);
+
+	if (trans) {
+		if (!ret)
+			ret = btrfs_end_transaction(trans, left_root);
+		else
+			btrfs_end_transaction(trans, left_root);
+	}
+
+	return ret;
+}
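
btrfs_compare_trees() walks two committed roots in lockstep and reports every new, deleted, changed, or identical item through the callback; the send code uses it to compute the delta between a snapshot and its parent. A hedged sketch of a callback with the btrfs_changed_cb_t signature (callback and root names are illustrative, not part of the patch):

static int count_changes_cb(struct btrfs_root *left_root,
			    struct btrfs_root *right_root,
			    struct btrfs_path *left_path,
			    struct btrfs_path *right_path,
			    struct btrfs_key *key,
			    enum btrfs_compare_tree_result result,
			    void *ctx)
{
	u64 *counter = ctx;

	if (result != BTRFS_COMPARE_TREE_SAME)
		(*counter)++;
	return 0;	/* a negative return aborts the compare */
}

/*
 * usage: u64 n = 0;
 *        ret = btrfs_compare_trees(send_root, parent_root,
 *                                  count_changes_cb, &n);
 */
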
+
 /*
  * this is similar to btrfs_next_leaf, but does not try to preserve
  * and fixup the path.  It looks for and returns the next key in the
- * tree based on the current path and the cache_only and min_trans
- * parameters.
+ * tree based on the current path and the min_trans parameter.
  *
  * 0 is returned if another key is found, < 0 if there are any errors
  * and 1 is returned if there are no higher keys in the tree
@@ -4093,8 +5406,7 @@
  * calling this function.
  */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-			struct btrfs_key *key, int level,
-			int cache_only, u64 min_trans)
+			struct btrfs_key *key, int level, u64 min_trans)
 {
 	int slot;
 	struct extent_buffer *c;
@@ -4145,21 +5457,8 @@
 		if (level == 0)
 			btrfs_item_key_to_cpu(c, key, slot);
 		else {
-			u64 blockptr = btrfs_node_blockptr(c, slot);
 			u64 gen = btrfs_node_ptr_generation(c, slot);
 
-			if (cache_only) {
-				struct extent_buffer *cur;
-				cur = btrfs_find_tree_block(root, blockptr,
-					    btrfs_level_size(root, level - 1));
-				if (!cur || !btrfs_buffer_uptodate(cur, gen)) {
-					slot++;
-					if (cur)
-						free_extent_buffer(cur);
-					goto next;
-				}
-				free_extent_buffer(cur);
-			}
 			if (gen < min_trans) {
 				slot++;
 				goto next;
@@ -4178,6 +5477,12 @@
  */
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
+	return btrfs_next_old_leaf(root, path, 0);
+}
+
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+			u64 time_seq)
+{
 	int slot;
 	int level;
 	struct extent_buffer *c;
@@ -4202,7 +5507,10 @@
 	path->keep_locks = 1;
 	path->leave_spinning = 1;
 
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (time_seq)
+		ret = btrfs_search_old_slot(root, &key, path, time_seq);
+	else
+		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	path->keep_locks = 0;
 
 	if (ret < 0)
@@ -4247,7 +5555,7 @@
 		next = c;
 		next_rw_lock = path->locks[level];
 		ret = read_block_for_search(NULL, root, path, &next, level,
-					    slot, &key);
+					    slot, &key, 0);
 		if (ret == -EAGAIN)
 			goto again;
 
@@ -4258,6 +5566,19 @@
 
 		if (!path->skip_locking) {
 			ret = btrfs_try_tree_read_lock(next);
+			if (!ret && time_seq) {
+				/*
+				 * If we don't get the lock, we may be racing
+				 * with push_leaf_left, holding that lock while
+				 * itself waiting for the leaf we've currently
+				 * locked. To solve this situation, we give up
+				 * on our lock and cycle.
+				 */
+				free_extent_buffer(next);
+				btrfs_release_path(path);
+				cond_resched();
+				goto again;
+			}
 			if (!ret) {
 				btrfs_set_path_blocking(path);
 				btrfs_tree_read_lock(next);
@@ -4284,7 +5605,7 @@
 			break;
 
 		ret = read_block_for_search(NULL, root, path, &next, level,
-					    0, &key);
+					    0, &key, 0);
 		if (ret == -EAGAIN)
 			goto again;
 
@@ -4306,7 +5627,7 @@
 	}
 	ret = 0;
 done:
-	unlock_up(path, 0, 1);
+	unlock_up(path, 0, 1, 0, NULL);
 	path->leave_spinning = old_spinning;
 	if (!old_spinning)
 		btrfs_set_path_blocking(path);
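
btrfs_next_leaf() is now a wrapper around btrfs_next_old_leaf(); with a non-zero time_seq the step goes through btrfs_search_old_slot(), which replays the tree modification log so the walk sees the tree as it looked at that sequence point (this is what the backref walking code relies on). A hedged iteration sketch, assuming the path already points at a leaf for that time_seq (helper name is illustrative):

static int count_old_items(struct btrfs_root *root, struct btrfs_path *path,
			   u64 time_seq, u64 *nr_items)
{
	int ret = 0;

	*nr_items = 0;
	while (ret == 0) {
		*nr_items += btrfs_header_nritems(path->nodes[0]);
		ret = btrfs_next_old_leaf(root, path, time_seq);
	}
	return ret < 0 ? ret : 0;	/* ret == 1 just means end of tree */
}
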
diff -ur a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
--- a/fs/btrfs/ctree.h	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/ctree.h	2014-02-17 11:56:58.000000000 +0100
@@ -23,6 +23,7 @@
 #include <linux/highmem.h>
 #include <linux/fs.h>
 #include <linux/rwsem.h>
+#include <linux/semaphore.h>
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
@@ -31,10 +32,10 @@
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
 #include <linux/pagemap.h>
+#include <linux/btrfs.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
-#include "ioctl.h"
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -46,7 +47,9 @@
 extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;
 
-#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
+
+#define BTRFS_MAX_MIRRORS 3
 
 #define BTRFS_MAX_LEVEL 8
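
BTRFS_MAGIC switches from a string literal to a u64 constant: 0x4D5F53665248425F is the eight ASCII bytes "_BHRfS_M" read as a little-endian integer, so nothing changes on disk and the superblock check becomes an integer compare instead of a memcmp. A hedged sketch, assuming the usual btrfs_super_magic() accessor:

static inline bool super_magic_ok(struct btrfs_super_block *sb)
{
	/* equivalent to memcmp(&sb->magic, "_BHRfS_M", 8) == 0 */
	return btrfs_super_magic(sb) == BTRFS_MAGIC;
}
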
 
@@ -86,6 +89,15 @@
 /* holds checksums of all the data extents */
 #define BTRFS_CSUM_TREE_OBJECTID 7ULL
 
+/* holds quota configuration and tracking */
+#define BTRFS_QUOTA_TREE_OBJECTID 8ULL
+
+/* for storing items that use the BTRFS_UUID_KEY* types */
+#define BTRFS_UUID_TREE_OBJECTID 9ULL
+
+/* for storing balance parameters in the root tree */
+#define BTRFS_BALANCE_OBJECTID -4ULL
+
 /* orphan objectid for tracking unlinked/truncated files */
 #define BTRFS_ORPHAN_OBJECTID -5ULL
 
@@ -108,7 +120,7 @@
 #define BTRFS_FREE_SPACE_OBJECTID -11ULL
 
 /*
- * The inode number assigned to the special inode for sotring
+ * The inode number assigned to the special inode for storing
  * free ino cache
  */
 #define BTRFS_FREE_INO_OBJECTID -12ULL
@@ -134,12 +146,27 @@
 
 #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
 
+#define BTRFS_DEV_REPLACE_DEVID 0ULL
+
+/*
+ * the max metadata block size.  This limit is somewhat artificial,
+ * but the memmove costs go through the roof for larger blocks.
+ */
+#define BTRFS_MAX_METADATA_BLOCKSIZE 65536
+
 /*
  * we can actually store much bigger names, but lets not confuse the rest
  * of linux
  */
 #define BTRFS_NAME_LEN 255
 
+/*
+ * Theoretical limit is larger, but we keep this down to a sane
+ * value. That should limit greatly the possibility of collisions on
+ * inode ref items.
+ */
+#define BTRFS_LINK_MAX 65535U
+
 /* 32 bytes in various csum fields */
 #define BTRFS_CSUM_SIZE 32
 
@@ -151,6 +178,9 @@
 /* four bytes for CRC32 */
 #define BTRFS_EMPTY_DIR_SIZE 0
 
+/* specific to btrfs_map_block(), therefore not in include/linux/blk_types.h */
+#define REQ_GET_READ_MIRRORS	(1 << 30)
+
 #define BTRFS_FT_UNKNOWN	0
 #define BTRFS_FT_REG_FILE	1
 #define BTRFS_FT_DIR		2
@@ -162,6 +192,11 @@
 #define BTRFS_FT_XATTR		8
 #define BTRFS_FT_MAX		9
 
+/* ioprio of readahead is set to idle */
+#define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
+
+#define BTRFS_DIRTY_METADATA_THRESH	(32 * 1024 * 1024)
+
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
  * block layout.
@@ -307,7 +342,11 @@
 /*
  * File system states
  */
+#define BTRFS_FS_STATE_ERROR		0
+#define BTRFS_FS_STATE_REMOUNTING	1
+#define BTRFS_FS_STATE_TRANS_ABORTED	2
 
+/* Super block flags */
 /* Errors detected */
 #define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2)
 
@@ -389,7 +428,7 @@
 	__le64 bytes_used;
 	__le64 num_devices;
 	/* future */
-	__le64 unsed_64[4];
+	__le64 unused_64[4];
 
 	u8 tree_root_level;
 	u8 chunk_root_level;
@@ -443,11 +482,17 @@
 	char label[BTRFS_LABEL_SIZE];
 
 	__le64 cache_generation;
+	__le64 uuid_tree_generation;
 
 	/* future expansion */
-	__le64 reserved[31];
+	__le64 reserved[30];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
+
+#ifdef MY_ABC_HERE
+	u8 syno_reserved[561];
+	__le32 archive_version;
+#endif
 } __attribute__ ((__packed__));
 
 /*
@@ -458,6 +503,23 @@
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
 #define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO	(1ULL << 3)
+/*
+ * some patches floated around with a second compression method;
+ * let's save that incompat here for when they do get in.
+ * Note we don't actually support it, we're just reserving the
+ * number
+ */
+#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZOv2	(1ULL << 4)
+
+/*
+ * older kernels tried to do bigger metadata blocks, but the
+ * code was pretty buggy.  Let's not let them try anymore.
+ */
+#define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)
+
+#define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)
+#define BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA	(1ULL << 8)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
@@ -465,7 +527,11 @@
 	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |		\
 	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL |	\
 	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
-	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
+	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
+	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
+	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF |		\
+	 BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
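
The new incompat bits (BIG_METADATA, EXTENDED_IREF, RAID56, SKINNY_METADATA) are added to BTRFS_FEATURE_INCOMPAT_SUPP, the mask the mount path uses to refuse filesystems that need features this kernel does not implement. A hedged sketch of that gate (helper name is illustrative):

static int check_incompat_flags(struct btrfs_super_block *sb)
{
	u64 unsupported = btrfs_super_incompat_flags(sb) &
			  ~BTRFS_FEATURE_INCOMPAT_SUPP;

	if (unsupported) {
		printk(KERN_ERR "btrfs: unsupported optional features (%llx)\n",
		       (unsigned long long)unsupported);
		return -EINVAL;
	}
	return 0;
}
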
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -530,6 +596,10 @@
 	unsigned int skip_locking:1;
 	unsigned int leave_spinning:1;
 	unsigned int search_commit_root:1;
+#ifdef MY_ABC_HERE
+	unsigned int caseless_key:1;
+	unsigned int caseless_name:1;
+#endif
 };
 
 /*
@@ -612,6 +682,14 @@
 	/* name goes here */
 } __attribute__ ((__packed__));
 
+struct btrfs_inode_extref {
+	__le64 parent_objectid;
+	__le64 index;
+	__le16 name_len;
+	__u8   name[0];
+	/* name goes here */
+} __attribute__ ((__packed__));
+
 struct btrfs_timespec {
 	__le64 sec;
 	__le32 nsec;
@@ -681,6 +759,36 @@
 	struct btrfs_disk_key drop_progress;
 	u8 drop_level;
 	u8 level;
+
+	/*
+	 * The following fields appear after subvol_uuids+subvol_times
+	 * were introduced.
+	 */
+
+	/*
+	 * This generation number is used to test if the new fields are valid
+	 * and up to date while reading the root item. Everytime the root item
+	 * and up to date while reading the root item. Every time the root item
+	 * is written out, the "generation" field is copied into this field. If
+	 * anyone ever mounted the fs with an older kernel, we will have
+	 * mismatching generation values here and thus must invalidate the
+	 * new fields. See btrfs_update_root and btrfs_find_last_root for
+	 * details (the check is sketched after this struct).
+	 * The offset of generation_v2 is also used as the start for the memset
+	 * when invalidating the fields.
+	__le64 generation_v2;
+	u8 uuid[BTRFS_UUID_SIZE];
+	u8 parent_uuid[BTRFS_UUID_SIZE];
+	u8 received_uuid[BTRFS_UUID_SIZE];
+	__le64 ctransid; /* updated when an inode changes */
+	__le64 otransid; /* trans when created */
+	__le64 stransid; /* trans when sent. non-zero for received subvol */
+	__le64 rtransid; /* trans when received. non-zero for received subvol */
+	struct btrfs_timespec ctime;
+	struct btrfs_timespec otime;
+	struct btrfs_timespec stime;
+	struct btrfs_timespec rtime;
+	__le64 reserved[8]; /* for future */
 } __attribute__ ((__packed__));
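
As the comment in the struct says, generation_v2 doubles as a validity marker for everything added after the original fields: if it does not match generation, an old kernel wrote the item last and the new fields must be wiped starting at generation_v2. A hedged sketch of that check, assuming the usual stack setget accessors for the two generation fields (helper name is illustrative):

static void invalidate_stale_root_item(struct btrfs_root_item *item)
{
	if (btrfs_root_generation(item) == btrfs_root_generation_v2(item))
		return;	/* new fields were last written by an aware kernel */

	/* old kernel wrote it last: clear from generation_v2 to the end */
	memset(&item->generation_v2, 0,
	       sizeof(*item) - offsetof(struct btrfs_root_item,
					generation_v2));
}
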
 
 /*
@@ -692,6 +800,54 @@
 	__le16 name_len;
 } __attribute__ ((__packed__));
 
+struct btrfs_disk_balance_args {
+	/*
+	 * profiles to operate on, single is denoted by
+	 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+	 */
+	__le64 profiles;
+
+	/* usage filter */
+	__le64 usage;
+
+	/* devid filter */
+	__le64 devid;
+
+	/* devid subset filter [pstart..pend) */
+	__le64 pstart;
+	__le64 pend;
+
+	/* btrfs virtual address space subset filter [vstart..vend) */
+	__le64 vstart;
+	__le64 vend;
+
+	/*
+	 * profile to convert to, single is denoted by
+	 * BTRFS_AVAIL_ALLOC_BIT_SINGLE
+	 */
+	__le64 target;
+
+	/* BTRFS_BALANCE_ARGS_* */
+	__le64 flags;
+
+	__le64 unused[8];
+} __attribute__ ((__packed__));
+
+/*
+ * store balance parameters to disk so that balance can be properly
+ * resumed after crash or unmount
+ */
+struct btrfs_balance_item {
+	/* BTRFS_BALANCE_* */
+	__le64 flags;
+
+	struct btrfs_disk_balance_args data;
+	struct btrfs_disk_balance_args meta;
+	struct btrfs_disk_balance_args sys;
+
+	__le64 unused[4];
+} __attribute__ ((__packed__));
+
 #define BTRFS_FILE_EXTENT_INLINE 0
 #define BTRFS_FILE_EXTENT_REG 1
 #define BTRFS_FILE_EXTENT_PREALLOC 2
@@ -750,15 +906,123 @@
 	u8 csum;
 } __attribute__ ((__packed__));
 
+struct btrfs_dev_stats_item {
+	/*
+	 * grow this item struct at the end for future enhancements and keep
+	 * the existing values unchanged
+	 */
+	__le64 values[BTRFS_DEV_STAT_VALUES_MAX];
+} __attribute__ ((__packed__));
+
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_ALWAYS	0
+#define BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID	1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_NEVER_STARTED	0
+#define BTRFS_DEV_REPLACE_ITEM_STATE_STARTED		1
+#define BTRFS_DEV_REPLACE_ITEM_STATE_SUSPENDED		2
+#define BTRFS_DEV_REPLACE_ITEM_STATE_FINISHED		3
+#define BTRFS_DEV_REPLACE_ITEM_STATE_CANCELED		4
+
+struct btrfs_dev_replace {
+	u64 replace_state;	/* see #define above */
+	u64 time_started;	/* seconds since 1-Jan-1970 */
+	u64 time_stopped;	/* seconds since 1-Jan-1970 */
+	atomic64_t num_write_errors;
+	atomic64_t num_uncorrectable_read_errors;
+
+	u64 cursor_left;
+	u64 committed_cursor_left;
+	u64 cursor_left_last_write_of_item;
+	u64 cursor_right;
+
+	u64 cont_reading_from_srcdev_mode;	/* see #define above */
+
+	int is_valid;
+	int item_needs_writeback;
+	struct btrfs_device *srcdev;
+	struct btrfs_device *tgtdev;
+
+	pid_t lock_owner;
+	atomic_t nesting_level;
+	struct mutex lock_finishing_cancel_unmount;
+	struct mutex lock_management_lock;
+	struct mutex lock;
+
+	struct btrfs_scrub_progress scrub_progress;
+};
+
+struct btrfs_dev_replace_item {
+	/*
+	 * grow this item struct at the end for future enhancements and keep
+	 * the existing values unchanged
+	 */
+	__le64 src_devid;
+	__le64 cursor_left;
+	__le64 cursor_right;
+	__le64 cont_reading_from_srcdev_mode;
+
+	__le64 replace_state;
+	__le64 time_started;
+	__le64 time_stopped;
+	__le64 num_write_errors;
+	__le64 num_uncorrectable_read_errors;
+} __attribute__ ((__packed__));
+
 /* different types of block groups (and chunks) */
-#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
-#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
-#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
-#define BTRFS_BLOCK_GROUP_RAID0    (1 << 3)
-#define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
-#define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
-#define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
-#define BTRFS_NR_RAID_TYPES	   5
+#define BTRFS_BLOCK_GROUP_DATA		(1ULL << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM	(1ULL << 1)
+#define BTRFS_BLOCK_GROUP_METADATA	(1ULL << 2)
+#define BTRFS_BLOCK_GROUP_RAID0		(1ULL << 3)
+#define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
+#define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
+#define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5         (1ULL << 7)
+#define BTRFS_BLOCK_GROUP_RAID6         (1ULL << 8)
+#define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
+
+enum btrfs_raid_types {
+	BTRFS_RAID_RAID10,
+	BTRFS_RAID_RAID1,
+	BTRFS_RAID_DUP,
+	BTRFS_RAID_RAID0,
+	BTRFS_RAID_SINGLE,
+	BTRFS_RAID_RAID5,
+	BTRFS_RAID_RAID6,
+	BTRFS_NR_RAID_TYPES
+};
+
+#define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
+					 BTRFS_BLOCK_GROUP_SYSTEM |  \
+					 BTRFS_BLOCK_GROUP_METADATA)
+
+#define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
+					 BTRFS_BLOCK_GROUP_RAID1 |   \
+					 BTRFS_BLOCK_GROUP_RAID5 |   \
+					 BTRFS_BLOCK_GROUP_RAID6 |   \
+					 BTRFS_BLOCK_GROUP_DUP |     \
+					 BTRFS_BLOCK_GROUP_RAID10)
+/*
+ * We need a bit for restriper to be able to tell when chunks of type
+ * SINGLE are available.  This "extended" profile format is used in
+ * fs_info->avail_*_alloc_bits (in-memory) and balance item fields
+ * (on-disk).  The corresponding on-disk bit in chunk.type is reserved
+ * to avoid remappings between two formats in the future.
+ */
+#define BTRFS_AVAIL_ALLOC_BIT_SINGLE	(1ULL << 48)
+
+#define BTRFS_EXTENDED_PROFILE_MASK	(BTRFS_BLOCK_GROUP_PROFILE_MASK | \
+					 BTRFS_AVAIL_ALLOC_BIT_SINGLE)
+
+static inline u64 chunk_to_extended(u64 flags)
+{
+	if ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) == 0)
+		flags |= BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+
+	return flags;
+}
+static inline u64 extended_to_chunk(u64 flags)
+{
+	return flags & ~BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+}
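
chunk_to_extended()/extended_to_chunk() translate between the on-disk chunk type and the in-memory "extended" profile format that carries the synthetic SINGLE bit for the restriper. A hedged worked example (the synthetic bit only ever exists in memory and in balance items, never in chunk.type):

static inline void profile_roundtrip_example(void)
{
	u64 chunk_type = BTRFS_BLOCK_GROUP_DATA;	/* no profile bit: SINGLE */
	u64 extended = chunk_to_extended(chunk_type);

	WARN_ON(!(extended & BTRFS_AVAIL_ALLOC_BIT_SINGLE));
	WARN_ON(extended_to_chunk(extended) != chunk_type);
}
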
 
 struct btrfs_block_group_item {
 	__le64 used;
@@ -766,6 +1030,72 @@
 	__le64 flags;
 } __attribute__ ((__packed__));
 
+/*
+ * is subvolume quota turned on?
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_ON		(1ULL << 0)
+/*
+ * RESCAN is set during the initialization phase
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_RESCAN		(1ULL << 1)
+/*
+ * Some qgroup entries are known to be out of date,
+ * either because the configuration has changed in a way that
+ * makes a rescan necessary, or because the fs has been mounted
+ * with a non-qgroup-aware version.
+ * Turning quota off and on again makes it inconsistent, too.
+ */
+#define BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT	(1ULL << 2)
+
+#define BTRFS_QGROUP_STATUS_VERSION        1
+
+struct btrfs_qgroup_status_item {
+	__le64 version;
+	/*
+	 * the generation is updated during every commit. As older
+	 * versions of btrfs are not aware of qgroups, it will be
+	 * possible to detect inconsistencies by checking the
+	 * generation on mount time
+	 * generation at mount time
+	__le64 generation;
+
+	/* flag definitions see above */
+	__le64 flags;
+
+	/*
+	 * only used during scanning to record the progress
+	 * of the scan. It contains a logical address
+	 */
+	__le64 rescan;
+} __attribute__ ((__packed__));
+
+struct btrfs_qgroup_info_item {
+	__le64 generation;
+	__le64 rfer;
+	__le64 rfer_cmpr;
+	__le64 excl;
+	__le64 excl_cmpr;
+} __attribute__ ((__packed__));
+
+/* flags definition for qgroup limits */
+#define BTRFS_QGROUP_LIMIT_MAX_RFER	(1ULL << 0)
+#define BTRFS_QGROUP_LIMIT_MAX_EXCL	(1ULL << 1)
+#define BTRFS_QGROUP_LIMIT_RSV_RFER	(1ULL << 2)
+#define BTRFS_QGROUP_LIMIT_RSV_EXCL	(1ULL << 3)
+#define BTRFS_QGROUP_LIMIT_RFER_CMPR	(1ULL << 4)
+#define BTRFS_QGROUP_LIMIT_EXCL_CMPR	(1ULL << 5)
+
+struct btrfs_qgroup_limit_item {
+	/*
+	 * only updated when any of the other values change
+	 */
+	__le64 flags;
+	__le64 max_rfer;
+	__le64 max_excl;
+	__le64 rsv_rfer;
+	__le64 rsv_excl;
+} __attribute__ ((__packed__));
+
 struct btrfs_space_info {
 	u64 flags;
 
@@ -786,13 +1116,16 @@
 				   account */
 
 	/*
-	 * we bump reservation progress every time we decrement
-	 * bytes_reserved.  This way people waiting for reservations
-	 * know something good has happened and they can check
-	 * for progress.  The number here isn't to be trusted, it
-	 * just shows reclaim activity
+	 * bytes_pinned is kept in line with what is actually pinned, as in
+	 * we've called update_block_group and dropped the bytes_used counter
+	 * and increased the bytes_pinned counter.  However this means that
+	 * bytes_pinned does not reflect the bytes that will be pinned once the
+	 * delayed refs are flushed, so this counter is incremented every time
+	 * we call btrfs_free_extent; it is a real-time count of what will be
+	 * freed once the transaction is committed. It is zeroed every time the
+	 * transaction commits (a usage sketch follows this struct).
 	 */
-	unsigned long reservation_progress;
+	struct percpu_counter total_bytes_pinned;
 
 	unsigned int full:1;	/* indicates that we cannot allocate any more
 				   chunks for this space */
@@ -812,12 +1145,22 @@
 	wait_queue_head_t wait;
 };
 
+#define	BTRFS_BLOCK_RSV_GLOBAL		1
+#define	BTRFS_BLOCK_RSV_DELALLOC	2
+#define	BTRFS_BLOCK_RSV_TRANS		3
+#define	BTRFS_BLOCK_RSV_CHUNK		4
+#define	BTRFS_BLOCK_RSV_DELOPS		5
+#define	BTRFS_BLOCK_RSV_EMPTY		6
+#define	BTRFS_BLOCK_RSV_TEMP		7
+
 struct btrfs_block_rsv {
 	u64 size;
 	u64 reserved;
 	struct btrfs_space_info *space_info;
 	spinlock_t lock;
-	unsigned int full:1;
+	unsigned short full;
+	unsigned short type;
+	unsigned short failfast;
 };
 
 /*
@@ -850,6 +1193,7 @@
 	BTRFS_CACHE_STARTED	= 1,
 	BTRFS_CACHE_FAST	= 2,
 	BTRFS_CACHE_FINISHED	= 3,
+	BTRFS_CACHE_ERROR	= 4,
 };
 
 enum btrfs_disk_cache_state {
@@ -882,6 +1226,10 @@
 	u64 flags;
 	u64 sectorsize;
 	u64 cache_generation;
+
+	/* for raid56, this is a full stripe, without parity */
+	unsigned long full_stripe_len;
+
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -911,11 +1259,44 @@
 	 * Today it will only have one thing on it, but that may change
 	 */
 	struct list_head cluster_list;
+
+	/* For delayed block group creation */
+	struct list_head new_bg_list;
 };
 
+/* delayed seq elem */
+struct seq_list {
+	struct list_head list;
+	u64 seq;
+};
+
+enum btrfs_orphan_cleanup_state {
+	ORPHAN_CLEANUP_STARTED	= 1,
+	ORPHAN_CLEANUP_DONE	= 2,
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+	struct list_head hash_list;
+	wait_queue_head_t wait;
+	spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+	struct list_head stripe_cache;
+	spinlock_t cache_lock;
+	int cache_size;
+	struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
+/* fs_info */
 struct reloc_control;
 struct btrfs_device;
 struct btrfs_fs_devices;
+struct btrfs_balance_control;
 struct btrfs_delayed_root;
 struct btrfs_fs_info {
 	u8 fsid[BTRFS_FSID_SIZE];
@@ -926,6 +1307,8 @@
 	struct btrfs_root *dev_root;
 	struct btrfs_root *fs_root;
 	struct btrfs_root *csum_root;
+	struct btrfs_root *quota_root;
+	struct btrfs_root *uuid_root;
 
 	/* the log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
@@ -935,6 +1318,7 @@
 
 	/* block group cache stuff */
 	spinlock_t block_group_cache_lock;
+	u64 first_logical_byte;
 	struct rb_root block_group_cache_tree;
 
 	/* keep track of unallocated space */
@@ -971,9 +1355,26 @@
 	 * is required instead of the faster short fsync log commits
 	 */
 	u64 last_trans_log_full_commit;
-	unsigned long mount_opt:20;
+	unsigned long mount_opt;
 	unsigned long compress_type:4;
+	int commit_interval;
+	/*
+	 * It is a suggestive number; the read side is safe even if it gets a
+	 * wrong number because we will write out the data into a regular
+	 * extent. The write side (mount/remount) is under the ->s_umount lock,
+	 * so it is also safe.
+	 */
 	u64 max_inline;
+	/*
+	 * Protected by ->chunk_mutex and sb->s_umount.
+	 *
+	 * We use two locks because only mount and remount operations can
+	 * change it, and those operations are under sb->s_umount, while the
+	 * read side (chunk allocation) cannot take sb->s_umount without
+	 * risking a deadlock. On the write side we must acquire both locks,
+	 * and on the read side we only need to acquire one of them.
+	 */
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
@@ -981,6 +1382,17 @@
 	wait_queue_head_t transaction_blocked_wait;
 	wait_queue_head_t async_submit_wait;
 
+	/*
+	 * Used to protect the incompat_flags, compat_flags, compat_ro_flags
+	 * when they are updated.
+	 *
+	 * Because the flags are never cleared, we needn't take the lock
+	 * on the read side.
+	 *
+	 * We also needn't take the lock while mounting the fs, because
+	 * no other task can be updating the flags at that point.
+	 */
+	spinlock_t super_lock;
 	struct btrfs_super_block *super_copy;
 	struct btrfs_super_block *super_for_commit;
 	struct block_device *__bdev;
@@ -992,6 +1404,13 @@
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
+
+	/* this is used during read/modify/write to make sure
+	 * no two I/Os are trying to modify the same stripe at the same
+	 * time
+	 */
+	struct btrfs_stripe_hash_table *stripe_hash_table;
+
 	/*
 	 * this protects the ordered operations list only while we are
 	 * processing all of the entries on it.  This way we make
@@ -1000,6 +1419,13 @@
 	 * before jumping into the main commit.
 	 */
 	struct mutex ordered_operations_mutex;
+
+	/*
+	 * Same as ordered_operations_mutex except this is for ordered extents
+	 * and not the operations.
+	 */
+	struct mutex ordered_extent_flush_mutex;
+
 	struct rw_semaphore extent_commit_sem;
 
 	struct rw_semaphore cleanup_work_sem;
@@ -1015,13 +1441,22 @@
 	struct mutex reloc_mutex;
 
 	struct list_head trans_list;
-	struct list_head hashers;
 	struct list_head dead_roots;
 	struct list_head caching_block_groups;
 
 	spinlock_t delayed_iput_lock;
 	struct list_head delayed_iputs;
 
+	/* this protects tree_mod_seq_list */
+	spinlock_t tree_mod_seq_lock;
+	atomic64_t tree_mod_seq;
+	struct list_head tree_mod_seq_list;
+	struct seq_list tree_mod_seq_elem;
+
+	/* this protects tree_mod_log */
+	rwlock_t tree_mod_log_lock;
+	struct rb_root tree_mod_log;
+
 	atomic_t nr_async_submits;
 	atomic_t async_submit_draining;
 	atomic_t nr_async_bios;
@@ -1029,31 +1464,22 @@
 	atomic_t open_ioctl_trans;
 
 	/*
-	 * this is used by the balancing code to wait for all the pending
-	 * ordered extents
+	 * this is used to protect the following list -- ordered_roots.
 	 */
-	spinlock_t ordered_extent_lock;
+	spinlock_t ordered_root_lock;
 
 	/*
-	 * all of the data=ordered extents pending writeback
+	 * all fs/file tree roots in which there are data=ordered extents
+	 * pending writeback are added into this list.
+	 *
 	 * these can span multiple transactions and basically include
 	 * every dirty data page that isn't from nodatacow
 	 */
-	struct list_head ordered_extents;
+	struct list_head ordered_roots;
 
-	/*
-	 * all of the inodes that have delalloc bytes.  It is possible for
-	 * this list to be empty even when there is still dirty data=ordered
-	 * extents waiting to finish IO.
-	 */
-	struct list_head delalloc_inodes;
-
-	/*
-	 * special rename and truncate targets that must be on disk before
-	 * we're allowed to commit.  This is basically the ext3 style
-	 * data=ordered list.
-	 */
-	struct list_head ordered_operations;
+	spinlock_t delalloc_root_lock;
+	/* all fs/file tree roots that have delalloc inodes. */
+	struct list_head delalloc_roots;
 
 	/*
 	 * there is a pool of worker threads for checksumming during writes
@@ -1068,8 +1494,11 @@
 	struct btrfs_workers generic_worker;
 	struct btrfs_workers workers;
 	struct btrfs_workers delalloc_workers;
+	struct btrfs_workers flush_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
+	struct btrfs_workers endio_raid56_workers;
+	struct btrfs_workers rmw_workers;
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers endio_freespace_worker;
@@ -1093,15 +1522,15 @@
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
-	int enospc_unlink;
-	int trans_no_join;
 
 	u64 total_pinned;
 
-	/* protected by the delalloc lock, used to keep from writing
-	 * metadata until there is a nice batch
-	 */
-	u64 dirty_metadata_bytes;
+	/* used to keep from writing metadata until there is a nice batch */
+	struct percpu_counter dirty_metadata_bytes;
+	struct percpu_counter delalloc_bytes;
+	s32 dirty_metadata_batch;
+	s32 delalloc_batch;
+
 	struct list_head dirty_cowonly_roots;
 
 	struct btrfs_fs_devices *fs_devices;
@@ -1113,10 +1542,9 @@
 	 */
 	struct list_head space_info;
 
-	struct reloc_control *reloc_ctl;
+	struct btrfs_space_info *data_sinfo;
 
-	spinlock_t delalloc_lock;
-	u64 delalloc_bytes;
+	struct reloc_control *reloc_ctl;
 
 	/* data_alloc_cluster is only used in ssd mode */
 	struct btrfs_free_cluster data_alloc_cluster;
@@ -1129,15 +1557,25 @@
 	struct rb_root defrag_inodes;
 	atomic_t defrag_running;
 
-	spinlock_t ref_cache_lock;
-	u64 total_ref_cache_size;
-
+	/* Used to protect avail_{data, metadata, system}_alloc_bits */
+	seqlock_t profiles_lock;
+	/*
+	 * these three are in extended format (availability of single
+	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
+	 * types are denoted by corresponding BTRFS_BLOCK_GROUP_* bits)
+	 */
 	u64 avail_data_alloc_bits;
 	u64 avail_metadata_alloc_bits;
 	u64 avail_system_alloc_bits;
-	u64 data_alloc_profile;
-	u64 metadata_alloc_profile;
-	u64 system_alloc_profile;
+
+	/* restriper state */
+	spinlock_t balance_lock;
+	struct mutex balance_mutex;
+	atomic_t balance_running;
+	atomic_t balance_pause_req;
+	atomic_t balance_cancel_req;
+	struct btrfs_balance_control *balance_ctl;
+	wait_queue_head_t balance_wait_q;
 
 	unsigned data_chunk_allocations;
 	unsigned metadata_ratio;
@@ -1154,9 +1592,54 @@
 	struct rw_semaphore scrub_super_lock;
 	int scrub_workers_refcnt;
 	struct btrfs_workers scrub_workers;
+	struct btrfs_workers scrub_wr_completion_workers;
+	struct btrfs_workers scrub_nocow_workers;
+
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	u32 check_integrity_print_mask;
+#endif
+	/*
+	 * quota information
+	 */
+	unsigned int quota_enabled:1;
+
+	/*
+	 * quota_enabled only changes state after a commit. This holds the
+	 * next state.
+	 */
+	unsigned int pending_quota_state:1;
+
+	/* is qgroup tracking in a consistent state? */
+	u64 qgroup_flags;
+
+	/* holds configuration and tracking. Protected by qgroup_lock */
+	struct rb_root qgroup_tree;
+	spinlock_t qgroup_lock;
+
+	/*
+	 * used to avoid frequently calling ulist_alloc()/ulist_free()
+	 * when doing qgroup accounting; it must be protected by qgroup_lock.
+	 */
+	struct ulist *qgroup_ulist;
+
+	/* protect user change for quota operations */
+	struct mutex qgroup_ioctl_lock;
+
+	/* list of dirty qgroups to be written at next commit */
+	struct list_head dirty_qgroups;
+
+	/* used by btrfs_qgroup_record_ref for an efficient tree traversal */
+	u64 qgroup_seq;
+
+	/* qgroup rescan items */
+	struct mutex qgroup_rescan_lock; /* protects the progress item */
+	struct btrfs_key qgroup_rescan_progress;
+	struct btrfs_workers qgroup_rescan_workers;
+	struct completion qgroup_rescan_completion;
+	struct btrfs_work qgroup_rescan_work;
 
 	/* filesystem state */
-	u64 fs_state;
+	unsigned long fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
 
@@ -1166,6 +1649,16 @@
 
 	/* next backup root to be overwritten */
 	int backup_root_index;
+
+	int num_tolerated_disk_barrier_failures;
+
+	/* device replace state */
+	struct btrfs_dev_replace dev_replace;
+
+	atomic_t mutually_exclusive_operation_running;
+
+	struct semaphore uuid_tree_rescan_sem;
+	unsigned int update_uuid_tree_gen:1;
 };
 
 /*
@@ -1206,9 +1699,9 @@
 	wait_queue_head_t log_commit_wait[2];
 	atomic_t log_writers;
 	atomic_t log_commit[2];
+	atomic_t log_batch;
 	unsigned long log_transid;
 	unsigned long last_log_commit;
-	unsigned long log_batch;
 	pid_t log_start_pid;
 	bool log_multiple_pids;
 
@@ -1252,8 +1745,11 @@
 
 	struct list_head root_list;
 
+	spinlock_t log_extents_lock[2];
+	struct list_head logged_list[2];
+
 	spinlock_t orphan_lock;
-	struct list_head orphan_list;
+	atomic_t orphan_inodes;
 	struct btrfs_block_rsv *orphan_block_rsv;
 	int orphan_item_inserted;
 	int orphan_cleanup_state;
@@ -1274,6 +1770,33 @@
 	dev_t anon_dev;
 
 	int force_cow;
+
+	spinlock_t root_item_lock;
+	atomic_t refs;
+
+	spinlock_t delalloc_lock;
+	/*
+	 * all of the inodes that have delalloc bytes.  It is possible for
+	 * this list to be empty even when there is still dirty data=ordered
+	 * extents waiting to finish IO.
+	 */
+	struct list_head delalloc_inodes;
+	struct list_head delalloc_root;
+	u64 nr_delalloc_inodes;
+	/*
+	 * this is used by the balancing code to wait for all the pending
+	 * ordered extents
+	 */
+	spinlock_t ordered_extent_lock;
+
+	/*
+	 * all of the data=ordered extents pending writeback
+	 * these can span multiple transactions and basically include
+	 * every dirty data page that isn't from nodatacow
+	 */
+	struct list_head ordered_extents;
+	struct list_head ordered_root;
+	u64 nr_ordered_extents;
 };
 
 struct btrfs_ioctl_defrag_range_args {
@@ -1315,6 +1838,7 @@
  */
 #define BTRFS_INODE_ITEM_KEY		1
 #define BTRFS_INODE_REF_KEY		12
+#define BTRFS_INODE_EXTREF_KEY		13
 #define BTRFS_XATTR_ITEM_KEY		24
 #define BTRFS_ORPHAN_ITEM_KEY		48
 /* reserve 2-15 close to the inode for later flexibility */
@@ -1326,6 +1850,9 @@
 #define BTRFS_DIR_LOG_ITEM_KEY  60
 #define BTRFS_DIR_LOG_INDEX_KEY 72
 #define BTRFS_DIR_ITEM_KEY	84
+#ifdef MY_ABC_HERE
+#define BTRFS_DIR_ITEM_CASELESS_KEY 91
+#endif
 #define BTRFS_DIR_INDEX_KEY	96
 /*
  * extent data is for file data
@@ -1363,6 +1890,12 @@
  */
 #define BTRFS_EXTENT_ITEM_KEY	168
 
+/*
+ * The same as BTRFS_EXTENT_ITEM_KEY, except it is for metadata, where we
+ * already know the length, so we store the level in key->offset instead.
+ */
+#define BTRFS_METADATA_ITEM_KEY	169
+
 #define BTRFS_TREE_BLOCK_REF_KEY	176
 
 #define BTRFS_EXTENT_DATA_REF_KEY	178
@@ -1384,6 +1917,57 @@
 #define BTRFS_CHUNK_ITEM_KEY	228
 
 /*
+ * Records the overall state of the qgroups.
+ * There's only one instance of this key present,
+ * (0, BTRFS_QGROUP_STATUS_KEY, 0)
+ */
+#define BTRFS_QGROUP_STATUS_KEY         240
+/*
+ * Records the currently used space of the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_INFO_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_INFO_KEY           242
+/*
+ * Contains the user configured limits for the qgroup.
+ * One key per qgroup, (0, BTRFS_QGROUP_LIMIT_KEY, qgroupid).
+ */
+#define BTRFS_QGROUP_LIMIT_KEY          244
+/*
+ * Records the child-parent relationship of qgroups. For
+ * each relation, 2 keys are present:
+ * (childid, BTRFS_QGROUP_RELATION_KEY, parentid)
+ * (parentid, BTRFS_QGROUP_RELATION_KEY, childid)
+ */
+#define BTRFS_QGROUP_RELATION_KEY       246
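Editor's illustration (not part of the patch): because the relation is recorded in both directions, adding a single child/parent relation inserts two keys; the qgroupids below are made-up example values.

	/* hypothetical child qgroupid 256 assigned to parent qgroupid 257 */
	struct btrfs_key fwd = { .objectid = 256, .type = BTRFS_QGROUP_RELATION_KEY, .offset = 257 };
	struct btrfs_key rev = { .objectid = 257, .type = BTRFS_QGROUP_RELATION_KEY, .offset = 256 };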
+
+#define BTRFS_BALANCE_ITEM_KEY	248
+
+/*
+ * Persistently stores the I/O stats in the device tree.
+ * One key for all stats, (0, BTRFS_DEV_STATS_KEY, devid).
+ */
+#define BTRFS_DEV_STATS_KEY	249
+
+/*
+ * Persistently stores the device replace state in the device tree.
+ * The key is built like this: (0, BTRFS_DEV_REPLACE_KEY, 0).
+ */
+#define BTRFS_DEV_REPLACE_KEY	250
+
+/*
+ * Stores items that allow UUIDs to be quickly mapped to something else.
+ * These items are part of the filesystem UUID tree.
+ * The key is built like this:
+ * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits).
+ */
+#if BTRFS_UUID_SIZE != 16
+#error "UUID items require BTRFS_UUID_SIZE == 16!"
+#endif
+#define BTRFS_UUID_KEY_SUBVOL	251	/* for UUIDs assigned to subvols */
+#define BTRFS_UUID_KEY_RECEIVED_SUBVOL	252	/* for UUIDs assigned to
+						 * received subvols */
+
+/*
  * string items are for debugging.  They just store a short string of
  * data in the FS
  */
@@ -1413,9 +1997,21 @@
 #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
 #define BTRFS_MOUNT_RECOVERY		(1 << 18)
+#define BTRFS_MOUNT_SKIP_BALANCE	(1 << 19)
+#define BTRFS_MOUNT_CHECK_INTEGRITY	(1 << 20)
+#define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
+#define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	(1 << 22)
+#define BTRFS_MOUNT_RESCAN_UUID_TREE	(1 << 23)
+
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+#define BTRFS_MOUNT_SYNO_ACL		(1 << 31)
+#endif
+
+#define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)
 #define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
 					 BTRFS_MOUNT_##opt)
 /*
@@ -1436,6 +2032,17 @@
 
 #define BTRFS_INODE_ROOT_ITEM_INIT	(1 << 31)
 
+struct btrfs_map_token {
+	struct extent_buffer *eb;
+	char *kaddr;
+	unsigned long offset;
+};
+
+static inline void btrfs_init_map_token(struct btrfs_map_token *token)
+{
+	token->kaddr = NULL;
+}
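Editor's usage sketch (illustrative only, not part of the patch): a map token caches the mapped extent-buffer page so that several set/get calls on the same item avoid re-mapping it. The leaf/item variables below are hypothetical; the token setters are the ones generated by BTRFS_SETGET_FUNCS further down.

	struct btrfs_map_token token;

	btrfs_init_map_token(&token);
	btrfs_set_token_inode_size(leaf, item, i_size, &token);
	btrfs_set_token_inode_mode(leaf, item, mode, &token);
	btrfs_set_token_inode_nlink(leaf, item, nlink, &token);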
+
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
@@ -1456,23 +2063,66 @@
 			    offsetof(type, member),			\
 			   sizeof(((type *)0)->member)))
 
-#ifndef BTRFS_SETGET_FUNCS
+#define DECLARE_BTRFS_SETGET_BITS(bits)					\
+u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr,	\
+			       unsigned long off,			\
+                              struct btrfs_map_token *token);		\
+void btrfs_set_token_##bits(struct extent_buffer *eb, void *ptr,	\
+			    unsigned long off, u##bits val,		\
+			    struct btrfs_map_token *token);		\
+static inline u##bits btrfs_get_##bits(struct extent_buffer *eb, void *ptr, \
+				       unsigned long off)		\
+{									\
+	return btrfs_get_token_##bits(eb, ptr, off, NULL);		\
+}									\
+static inline void btrfs_set_##bits(struct extent_buffer *eb, void *ptr, \
+				    unsigned long off, u##bits val)	\
+{									\
+       btrfs_set_token_##bits(eb, ptr, off, val, NULL);			\
+}
+
+DECLARE_BTRFS_SETGET_BITS(8)
+DECLARE_BTRFS_SETGET_BITS(16)
+DECLARE_BTRFS_SETGET_BITS(32)
+DECLARE_BTRFS_SETGET_BITS(64)
+
 #define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
-u##bits btrfs_##name(struct extent_buffer *eb, type *s);		\
-void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);
-#endif
+static inline u##bits btrfs_##name(struct extent_buffer *eb, type *s)	\
+{									\
+	BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);	\
+	return btrfs_get_##bits(eb, s, offsetof(type, member));		\
+}									\
+static inline void btrfs_set_##name(struct extent_buffer *eb, type *s,	\
+				    u##bits val)			\
+{									\
+	BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);	\
+	btrfs_set_##bits(eb, s, offsetof(type, member), val);		\
+}									\
+static inline u##bits btrfs_token_##name(struct extent_buffer *eb, type *s, \
+					 struct btrfs_map_token *token)	\
+{									\
+	BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);	\
+	return btrfs_get_token_##bits(eb, s, offsetof(type, member), token); \
+}									\
+static inline void btrfs_set_token_##name(struct extent_buffer *eb,	\
+					  type *s, u##bits val,		\
+                                         struct btrfs_map_token *token)	\
+{									\
+	BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member);	\
+	btrfs_set_token_##bits(eb, s, offsetof(type, member), val, token); \
+}
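As a worked example (editor's note, illustrative only): an invocation such as BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32) now expands to four inline helpers instead of the old extern declarations:

	u32  btrfs_inode_mode(struct extent_buffer *eb, struct btrfs_inode_item *s);
	void btrfs_set_inode_mode(struct extent_buffer *eb, struct btrfs_inode_item *s, u32 val);
	u32  btrfs_token_inode_mode(struct extent_buffer *eb, struct btrfs_inode_item *s,
				    struct btrfs_map_token *token);
	void btrfs_set_token_inode_mode(struct extent_buffer *eb, struct btrfs_inode_item *s,
					u32 val, struct btrfs_map_token *token);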
 
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)		\
 static inline u##bits btrfs_##name(struct extent_buffer *eb)		\
 {									\
-	type *p = page_address(eb->first_page);				\
+	type *p = page_address(eb->pages[0]);				\
 	u##bits res = le##bits##_to_cpu(p->member);			\
 	return res;							\
 }									\
 static inline void btrfs_set_##name(struct extent_buffer *eb,		\
 				    u##bits val)			\
 {									\
-	type *p = page_address(eb->first_page);				\
+	type *p = page_address(eb->pages[0]);				\
 	p->member = cpu_to_le##bits(val);				\
 }
 
@@ -1521,14 +2171,14 @@
 BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
 			 generation, 64);
 
-static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
+static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d)
 {
-	return (char *)d + offsetof(struct btrfs_dev_item, uuid);
+	return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid);
 }
 
-static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
+static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d)
 {
-	return (char *)d + offsetof(struct btrfs_dev_item, fsid);
+	return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid);
 }
 
 BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
@@ -1611,6 +2261,13 @@
 BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
 BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
 
+/* struct btrfs_inode_extref */
+BTRFS_SETGET_FUNCS(inode_extref_parent, struct btrfs_inode_extref,
+		   parent_objectid, 64);
+BTRFS_SETGET_FUNCS(inode_extref_name_len, struct btrfs_inode_extref,
+		   name_len, 16);
+BTRFS_SETGET_FUNCS(inode_extref_index, struct btrfs_inode_extref, index, 64);
+
 /* struct btrfs_inode_item */
 BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64);
 BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64);
@@ -1624,6 +2281,23 @@
 BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
 BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
 BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
+			 generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
+			 sequence, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
+			 transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
+			 nbytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
+			 block_group, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
 
 static inline struct btrfs_timespec *
 btrfs_inode_atime(struct btrfs_inode_item *inode_item)
@@ -1651,6 +2325,8 @@
 
 BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
 BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
 
 /* struct btrfs_dev_extent */
 BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
@@ -1661,10 +2337,10 @@
 		   chunk_offset, 64);
 BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
 
-static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
+static inline unsigned long btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
 {
 	unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
-	return (u8 *)((unsigned long)dev + ptr);
+	return (unsigned long)dev + ptr;
 }
 
 BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
@@ -1732,6 +2408,10 @@
 /* struct btrfs_node */
 BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
 BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr,
+			 blockptr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr,
+			 generation, 64);
 
 static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
 {
@@ -1788,6 +2468,8 @@
 /* struct btrfs_item */
 BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
 BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
 
 static inline unsigned long btrfs_item_nr_offset(int nr)
 {
@@ -1850,6 +2532,13 @@
 BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
 BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
 BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item,
+			 data_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item,
+			 name_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item,
+			 transid, 64);
 
 static inline void btrfs_dir_item_key(struct extent_buffer *eb,
 				      struct btrfs_dir_item *item,
@@ -1952,6 +2641,12 @@
 BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
 BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
 BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header,
+			 generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header,
+			 nritems, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);
 
 static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
 {
@@ -1987,16 +2682,14 @@
 	btrfs_set_header_flags(eb, flags);
 }
 
-static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
+static inline unsigned long btrfs_header_fsid(struct extent_buffer *eb)
 {
-	unsigned long ptr = offsetof(struct btrfs_header, fsid);
-	return (u8 *)ptr;
+	return offsetof(struct btrfs_header, fsid);
 }
 
-static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
+static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
 {
-	unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
-	return (u8 *)ptr;
+	return offsetof(struct btrfs_header, chunk_tree_uuid);
 }
 
 static inline int btrfs_is_leaf(struct extent_buffer *eb)
@@ -2022,6 +2715,16 @@
 BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
 BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
 			 last_snapshot, 64);
+BTRFS_SETGET_STACK_FUNCS(root_generation_v2, struct btrfs_root_item,
+			 generation_v2, 64);
+BTRFS_SETGET_STACK_FUNCS(root_ctransid, struct btrfs_root_item,
+			 ctransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_otransid, struct btrfs_root_item,
+			 otransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_stransid, struct btrfs_root_item,
+			 stransid, 64);
+BTRFS_SETGET_STACK_FUNCS(root_rtransid, struct btrfs_root_item,
+			 rtransid, 64);
 
 static inline bool btrfs_root_readonly(struct btrfs_root *root)
 {
@@ -2077,8 +2780,86 @@
 BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
 		   num_devices, 64);
 
-/* struct btrfs_super_block */
+/* struct btrfs_balance_item */
+BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64);
+
+static inline void btrfs_balance_data(struct extent_buffer *eb,
+				      struct btrfs_balance_item *bi,
+				      struct btrfs_disk_balance_args *ba)
+{
+	read_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_set_balance_data(struct extent_buffer *eb,
+					  struct btrfs_balance_item *bi,
+					  struct btrfs_disk_balance_args *ba)
+{
+	write_eb_member(eb, bi, struct btrfs_balance_item, data, ba);
+}
+
+static inline void btrfs_balance_meta(struct extent_buffer *eb,
+				      struct btrfs_balance_item *bi,
+				      struct btrfs_disk_balance_args *ba)
+{
+	read_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
+
+static inline void btrfs_set_balance_meta(struct extent_buffer *eb,
+					  struct btrfs_balance_item *bi,
+					  struct btrfs_disk_balance_args *ba)
+{
+	write_eb_member(eb, bi, struct btrfs_balance_item, meta, ba);
+}
 
+static inline void btrfs_balance_sys(struct extent_buffer *eb,
+				     struct btrfs_balance_item *bi,
+				     struct btrfs_disk_balance_args *ba)
+{
+	read_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void btrfs_set_balance_sys(struct extent_buffer *eb,
+					 struct btrfs_balance_item *bi,
+					 struct btrfs_disk_balance_args *ba)
+{
+	write_eb_member(eb, bi, struct btrfs_balance_item, sys, ba);
+}
+
+static inline void
+btrfs_disk_balance_args_to_cpu(struct btrfs_balance_args *cpu,
+			       struct btrfs_disk_balance_args *disk)
+{
+	memset(cpu, 0, sizeof(*cpu));
+
+	cpu->profiles = le64_to_cpu(disk->profiles);
+	cpu->usage = le64_to_cpu(disk->usage);
+	cpu->devid = le64_to_cpu(disk->devid);
+	cpu->pstart = le64_to_cpu(disk->pstart);
+	cpu->pend = le64_to_cpu(disk->pend);
+	cpu->vstart = le64_to_cpu(disk->vstart);
+	cpu->vend = le64_to_cpu(disk->vend);
+	cpu->target = le64_to_cpu(disk->target);
+	cpu->flags = le64_to_cpu(disk->flags);
+}
+
+static inline void
+btrfs_cpu_balance_args_to_disk(struct btrfs_disk_balance_args *disk,
+			       struct btrfs_balance_args *cpu)
+{
+	memset(disk, 0, sizeof(*disk));
+
+	disk->profiles = cpu_to_le64(cpu->profiles);
+	disk->usage = cpu_to_le64(cpu->usage);
+	disk->devid = cpu_to_le64(cpu->devid);
+	disk->pstart = cpu_to_le64(cpu->pstart);
+	disk->pend = cpu_to_le64(cpu->pend);
+	disk->vstart = cpu_to_le64(cpu->vstart);
+	disk->vend = cpu_to_le64(cpu->vend);
+	disk->target = cpu_to_le64(cpu->target);
+	disk->flags = cpu_to_le64(cpu->flags);
+}
+
+/* struct btrfs_super_block */
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
 BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
@@ -2126,11 +2907,16 @@
 			 csum_type, 16);
 BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
 			 cache_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
+BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
+			 uuid_tree_generation, 64);
 
 static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
 {
-	int t = btrfs_super_csum_type(s);
-	BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes));
+	u16 t = btrfs_super_csum_type(s);
+	/*
+	 * csum type is validated at mount time
+	 */
 	return btrfs_csum_sizes[t];
 }
 
@@ -2141,6 +2927,14 @@
 
 /* struct btrfs_file_extent_item */
 BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
+			 struct btrfs_file_extent_item, disk_bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
+			 struct btrfs_file_extent_item, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
+			 struct btrfs_file_extent_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
+			 struct btrfs_file_extent_item, num_bytes, 64);
 
 static inline unsigned long
 btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
@@ -2196,7 +2990,117 @@
 	return btrfs_item_size(eb, e) - offset;
 }
 
-static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
+/* btrfs_dev_stats_item */
+static inline u64 btrfs_dev_stats_value(struct extent_buffer *eb,
+					struct btrfs_dev_stats_item *ptr,
+					int index)
+{
+	u64 val;
+
+	read_extent_buffer(eb, &val,
+			   offsetof(struct btrfs_dev_stats_item, values) +
+			    ((unsigned long)ptr) + (index * sizeof(u64)),
+			   sizeof(val));
+	return val;
+}
+
+static inline void btrfs_set_dev_stats_value(struct extent_buffer *eb,
+					     struct btrfs_dev_stats_item *ptr,
+					     int index, u64 val)
+{
+	write_extent_buffer(eb, &val,
+			    offsetof(struct btrfs_dev_stats_item, values) +
+			     ((unsigned long)ptr) + (index * sizeof(u64)),
+			    sizeof(val));
+}
+
+/* btrfs_qgroup_status_item */
+BTRFS_SETGET_FUNCS(qgroup_status_generation, struct btrfs_qgroup_status_item,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_version, struct btrfs_qgroup_status_item,
+		   version, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_flags, struct btrfs_qgroup_status_item,
+		   flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_status_rescan, struct btrfs_qgroup_status_item,
+		   rescan, 64);
+
+/* btrfs_qgroup_info_item */
+BTRFS_SETGET_FUNCS(qgroup_info_generation, struct btrfs_qgroup_info_item,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer, struct btrfs_qgroup_info_item, rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_rfer_cmpr, struct btrfs_qgroup_info_item,
+		   rfer_cmpr, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl, struct btrfs_qgroup_info_item, excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_info_excl_cmpr, struct btrfs_qgroup_info_item,
+		   excl_cmpr, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_generation,
+			 struct btrfs_qgroup_info_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer, struct btrfs_qgroup_info_item,
+			 rfer, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_rfer_cmpr,
+			 struct btrfs_qgroup_info_item, rfer_cmpr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl, struct btrfs_qgroup_info_item,
+			 excl, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_qgroup_info_excl_cmpr,
+			 struct btrfs_qgroup_info_item, excl_cmpr, 64);
+
+/* btrfs_qgroup_limit_item */
+BTRFS_SETGET_FUNCS(qgroup_limit_flags, struct btrfs_qgroup_limit_item,
+		   flags, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_rfer, struct btrfs_qgroup_limit_item,
+		   max_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_max_excl, struct btrfs_qgroup_limit_item,
+		   max_excl, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_rfer, struct btrfs_qgroup_limit_item,
+		   rsv_rfer, 64);
+BTRFS_SETGET_FUNCS(qgroup_limit_rsv_excl, struct btrfs_qgroup_limit_item,
+		   rsv_excl, 64);
+
+/* btrfs_dev_replace_item */
+BTRFS_SETGET_FUNCS(dev_replace_src_devid,
+		   struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cont_reading_from_srcdev_mode,
+		   struct btrfs_dev_replace_item, cont_reading_from_srcdev_mode,
+		   64);
+BTRFS_SETGET_FUNCS(dev_replace_replace_state, struct btrfs_dev_replace_item,
+		   replace_state, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_started, struct btrfs_dev_replace_item,
+		   time_started, 64);
+BTRFS_SETGET_FUNCS(dev_replace_time_stopped, struct btrfs_dev_replace_item,
+		   time_stopped, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_write_errors, struct btrfs_dev_replace_item,
+		   num_write_errors, 64);
+BTRFS_SETGET_FUNCS(dev_replace_num_uncorrectable_read_errors,
+		   struct btrfs_dev_replace_item, num_uncorrectable_read_errors,
+		   64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_left, struct btrfs_dev_replace_item,
+		   cursor_left, 64);
+BTRFS_SETGET_FUNCS(dev_replace_cursor_right, struct btrfs_dev_replace_item,
+		   cursor_right, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_src_devid,
+			 struct btrfs_dev_replace_item, src_devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cont_reading_from_srcdev_mode,
+			 struct btrfs_dev_replace_item,
+			 cont_reading_from_srcdev_mode, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_replace_state,
+			 struct btrfs_dev_replace_item, replace_state, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_started,
+			 struct btrfs_dev_replace_item, time_started, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_time_stopped,
+			 struct btrfs_dev_replace_item, time_stopped, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_write_errors,
+			 struct btrfs_dev_replace_item, num_write_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_num_uncorrectable_read_errors,
+			 struct btrfs_dev_replace_item,
+			 num_uncorrectable_read_errors, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_left,
+			 struct btrfs_dev_replace_item, cursor_left, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dev_replace_cursor_right,
+			 struct btrfs_dev_replace_item, cursor_right, 64);
+
+static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
 {
 	return sb->s_fs_info;
 }
@@ -2238,7 +3142,7 @@
 						 unsigned num_items)
 {
 	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
-		3 * num_items;
+		2 * num_items;
 }
 
 /*
@@ -2252,18 +3156,21 @@
 		num_items;
 }
 
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags);
+			     u64 offset, int metadata, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 				    u64 bytenr, u64 num_bytes);
+int btrfs_exclude_logged_extents(struct btrfs_root *root,
+				 struct extent_buffer *eb);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset, u64 bytenr);
@@ -2271,8 +3178,6 @@
 						 struct btrfs_fs_info *info,
 						 u64 bytenr);
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
-u64 btrfs_find_block_group(struct btrfs_root *root,
-			   u64 search_start, u64 search_hint, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root, u32 blocksize,
 					u64 parent, u64 root_objectid,
@@ -2282,10 +3187,6 @@
 			   struct btrfs_root *root,
 			   struct extent_buffer *buf,
 			   u64 parent, int last_ref);
-struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize,
-					    int level);
 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     u64 root_objectid, u64 owner,
@@ -2294,36 +3195,33 @@
 				   struct btrfs_root *root,
 				   u64 root_objectid, u64 owner, u64 offset,
 				   struct btrfs_key *ins);
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
-				  u64 num_bytes, u64 min_alloc_size,
-				  u64 empty_size, u64 hint_byte,
-				  u64 search_end, struct btrfs_key *ins,
-				  u64 data);
+int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
+			 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
+			 struct btrfs_key *ins, int is_data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref);
+		  struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref);
+		  struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 flags,
-				int is_data);
+				int level, int is_data);
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
-		      u64 bytenr, u64 num_bytes, u64 parent,
-		      u64 root_objectid, u64 owner, u64 offset);
+		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+		      u64 owner, u64 offset, int for_cow);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
 				       u64 start, u64 len);
-int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root);
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root);
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset);
+			 u64 root_objectid, u64 owner, u64 offset, int for_cow);
 
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
@@ -2337,10 +3235,22 @@
 			   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
-u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root);
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+
+enum btrfs_reserve_flush_enum {
+	/* If we are in the transaction, we can't flush anything. */
+	BTRFS_RESERVE_NO_FLUSH,
+	/*
+	 * Flushing delalloc may cause deadlock somewhere, in this
+	 * case, use FLUSH LIMIT
+	 */
+	BTRFS_RESERVE_FLUSH_LIMIT,
+	BTRFS_RESERVE_FLUSH_ALL,
+};
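Editor's sketch of the new flush enum in use (illustrative only, not part of the patch; trans, rsv and num_bytes are hypothetical): callers already inside a transaction must not trigger flushing, while others can allow the full flush path.

	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
				  trans ? BTRFS_RESERVE_NO_FLUSH
					: BTRFS_RESERVE_FLUSH_ALL);
	if (ret)
		return ret;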
+
 int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -2348,40 +3258,43 @@
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 				  struct inode *inode);
 void btrfs_orphan_release_metadata(struct inode *inode);
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending);
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+				     struct btrfs_block_rsv *rsv,
+				     int nitems,
+				     u64 *qgroup_reserved, bool use_global_rsv);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+				      struct btrfs_block_rsv *rsv,
+				      u64 qgroup_reserved);
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+					      unsigned short type);
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes);
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
-				struct btrfs_block_rsv *block_rsv,
-				u64 num_bytes);
+			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+			enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_check(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv, int min_factor);
 int btrfs_block_rsv_refill(struct btrfs_root *root,
-			  struct btrfs_block_rsv *block_rsv,
-			  u64 min_reserved);
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
-				   struct btrfs_block_rsv *block_rsv,
-				   u64 min_reserved);
+			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   enum btrfs_reserve_flush_enum flush);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *dest, u64 num_bytes,
+			     int min_factor);
 void btrfs_block_rsv_release(struct btrfs_root *root,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes);
 int btrfs_set_block_group_ro(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
-int btrfs_set_block_group_rw(struct btrfs_root *root,
-			     struct btrfs_block_group_cache *cache);
+void btrfs_set_block_group_rw(struct btrfs_root *root,
+			      struct btrfs_block_group_cache *cache);
 void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
 int btrfs_error_unpin_extent_range(struct btrfs_root *root,
@@ -2393,6 +3306,9 @@
 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
 
 int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info);
+int __get_raid_index(u64 flags);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2400,18 +3316,33 @@
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type);
-int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, struct btrfs_path *path,
-			    struct btrfs_key *new_key);
+void btrfs_set_item_key_safe(struct btrfs_root *root, struct btrfs_path *path,
+			     struct btrfs_key *new_key);
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 			struct btrfs_key *key, int lowest_level,
-			int cache_only, u64 min_trans);
+			u64 min_trans);
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans);
+enum btrfs_compare_tree_result {
+	BTRFS_COMPARE_TREE_NEW,
+	BTRFS_COMPARE_TREE_DELETED,
+	BTRFS_COMPARE_TREE_CHANGED,
+	BTRFS_COMPARE_TREE_SAME,
+};
+typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root,
+				  struct btrfs_root *right_root,
+				  struct btrfs_path *left_path,
+				  struct btrfs_path *right_path,
+				  struct btrfs_key *key,
+				  enum btrfs_compare_tree_result result,
+				  void *ctx);
+int btrfs_compare_trees(struct btrfs_root *left_root,
+			struct btrfs_root *right_root,
+			btrfs_changed_cb_t cb, void *ctx);
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
@@ -2422,12 +3353,10 @@
 		      struct extent_buffer **cow_ret, u64 new_root_objectid);
 int btrfs_block_can_be_shared(struct btrfs_root *root,
 			      struct extent_buffer *buf);
-int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct btrfs_path *path, u32 data_size);
-int btrfs_truncate_item(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
-			struct btrfs_path *path,
-			u32 new_size, int from_end);
+void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
+		       u32 data_size);
+void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
+			 u32 new_size, int from_end);
 int btrfs_split_item(struct btrfs_trans_handle *trans,
 		     struct btrfs_root *root,
 		     struct btrfs_path *path,
@@ -2440,9 +3369,14 @@
 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_path *p, int
 		      ins_len, int cow);
+int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
+			  struct btrfs_path *p, u64 time_seq);
+int btrfs_search_slot_for_read(struct btrfs_root *root,
+			       struct btrfs_key *key, struct btrfs_path *p,
+			       int find_higher, int return_any);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress);
 void btrfs_release_path(struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
@@ -2461,10 +3395,9 @@
 	return btrfs_del_items(trans, root, path, path->slots[0], 1);
 }
 
-int setup_items_for_insert(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct btrfs_path *path,
-			   struct btrfs_key *cpu_key, u32 *data_size,
-			   u32 total_data, u32 total_size, int nr);
+void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
+			    struct btrfs_key *cpu_key, u32 *data_size,
+			    u32 total_data, u32 total_size, int nr);
 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, void *data, u32 data_size);
 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
@@ -2482,10 +3415,24 @@
 }
 
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
-int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
+int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
+			u64 time_seq);
+static inline int btrfs_next_old_item(struct btrfs_root *root,
+				      struct btrfs_path *p, u64 time_seq)
+{
+	++p->slots[0];
+	if (p->slots[0] >= btrfs_header_nritems(p->nodes[0]))
+		return btrfs_next_old_leaf(root, p, time_seq);
+	return 0;
+}
+static inline int btrfs_next_item(struct btrfs_root *root, struct btrfs_path *p)
+{
+	return btrfs_next_old_item(root, p, 0);
+}
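Editor's iteration sketch (illustrative only, not part of the patch): btrfs_next_item() advances within the current leaf and only falls back to btrfs_next_old_leaf() when the slot runs past the leaf; path is assumed to already be positioned on a valid item.

	while (1) {
		struct btrfs_key found;

		btrfs_item_key_to_cpu(path->nodes[0], &found, path->slots[0]);
		/* ... examine the item at "found" ... */
		ret = btrfs_next_item(root, path);
		if (ret)	/* > 0: no more items, < 0: error */
			break;
	}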
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-void btrfs_drop_snapshot(struct btrfs_root *root,
-			 struct btrfs_block_rsv *block_rsv, int update_ref);
+int __must_check btrfs_drop_snapshot(struct btrfs_root *root,
+				     struct btrfs_block_rsv *block_rsv,
+				     int update_ref, int for_reloc);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct extent_buffer *node,
@@ -2498,19 +3445,42 @@
 	smp_mb();
 	return fs_info->closing;
 }
+
+/*
+ * If we remount the fs read-only or unmount the fs, the cleaner needn't do
+ * anything except sleep. This function is used to check the status of
+ * the fs.
+ */
+static inline int btrfs_need_cleaner_sleep(struct btrfs_root *root)
+{
+	return (root->fs_info->sb->s_flags & MS_RDONLY ||
+		btrfs_fs_closing(root->fs_info));
+}
+
 static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 {
+	kfree(fs_info->balance_ctl);
 	kfree(fs_info->delayed_root);
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
 	kfree(fs_info->chunk_root);
 	kfree(fs_info->dev_root);
 	kfree(fs_info->csum_root);
+	kfree(fs_info->quota_root);
+	kfree(fs_info->uuid_root);
 	kfree(fs_info->super_copy);
 	kfree(fs_info->super_for_commit);
 	kfree(fs_info);
 }
 
+/* tree mod log functions from ctree.c */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			   struct seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+			    struct seq_list *elem);
+u64 btrfs_tree_mod_seq_prev(u64 seq);
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
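Editor's sketch of the tree mod log helpers above (illustrative only, not part of the patch; fs_info is hypothetical): a reader registers a sequence point so that concurrent tree modifications past that point can be rewound while it walks the tree.

	struct seq_list elem = {};
	u64 seq;

	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
	/* ... resolve backrefs / search old roots against "seq" ... */
	btrfs_put_tree_mod_seq(fs_info, &elem);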
+
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
 			struct btrfs_path *path,
@@ -2528,18 +3498,34 @@
 int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_key *key, struct btrfs_root_item
 		      *item);
-int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct btrfs_key *key, struct btrfs_root_item
-		      *item);
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
-			 btrfs_root_item *item, struct btrfs_key *key);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_key *key,
+				   struct btrfs_root_item *item);
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+		    struct btrfs_path *path, struct btrfs_root_item *root_item,
+		    struct btrfs_key *root_key);
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
 void btrfs_set_root_node(struct btrfs_root_item *item,
 			 struct extent_buffer *node);
 void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root);
+
+/* uuid-tree.c */
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
+			struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+			u64 subid);
+int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
+			struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+			u64 subid);
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
+			    int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
+					      u64));
 
 /* dir-item.c */
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+			  const char *name, int name_len);
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, const char *name,
 			  int name_len, struct inode *dir,
@@ -2559,9 +3545,6 @@
 btrfs_search_dir_index_item(struct btrfs_root *root,
 			    struct btrfs_path *path, u64 dirid,
 			    const char *name, int name_len);
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
-			      struct btrfs_path *path,
-			      const char *name, int name_len);
 int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct btrfs_path *path,
@@ -2596,12 +3579,12 @@
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
 			   u64 inode_objectid, u64 ref_objectid, u64 *index);
-struct btrfs_inode_ref *
-btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
-			struct btrfs_path *path,
-			const char *name, int name_len,
-			u64 inode_objectid, u64 ref_objectid, int mod);
+int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct btrfs_path *path,
+			      const char *name, int name_len,
+			      u64 inode_objectid, u64 ref_objectid, int mod,
+			      u64 *ret_index);
 int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
@@ -2609,13 +3592,28 @@
 		       *root, struct btrfs_path *path,
 		       struct btrfs_key *location, int mod);
 
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_path *path,
+			  const char *name, int name_len,
+			  u64 inode_objectid, u64 ref_objectid, int ins_len,
+			  int cow);
+
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
+				   u64 ref_objectid, const char *name,
+				   int name_len,
+				   struct btrfs_inode_extref **extref_ret);
+
 /* file-item.c */
+struct btrfs_dio_private;
 int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, u64 bytenr, u64 len);
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio, u32 *dst);
 int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
-			      struct bio *bio, u64 logical_offset, u32 *dst);
+			      struct btrfs_dio_private *dip, struct bio *bio,
+			      u64 logical_offset);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
@@ -2631,19 +3629,31 @@
 			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		       struct bio *bio, u64 file_start, int contig);
-struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  struct btrfs_path *path,
-					  u64 bytenr, int cow);
 int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct btrfs_path *path,
 			u64 isize);
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list, int search_commit);
 /* inode.c */
+struct btrfs_delalloc_work {
+	struct inode *inode;
+	int wait;
+	int delay_iput;
+	struct completion completion;
+	struct list_head list;
+	struct btrfs_work work;
+};
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+						    int wait, int delay_iput);
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
+
 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
 					   size_t pg_offset, u64 start, u64 len,
 					   int create);
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+			      u64 *orig_start, u64 *orig_block_len,
+			      u64 *ram_bytes);
 
 /* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
 #if defined(ClearPageFsMisc) && !defined(ClearPageChecked)
@@ -2660,7 +3670,11 @@
 	page_cache_sync_readahead(mapping, ra, file, offset, req_size);
 }
 
+#ifdef MY_ABC_HERE
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry, int caseless);
+#else
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
+#endif
 int btrfs_set_inode_index(struct inode *dir, u64 *index);
 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
@@ -2673,27 +3687,27 @@
 			struct btrfs_root *root,
 			struct inode *dir, u64 objectid,
 			const char *name, int name_len);
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+			int front);
 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       struct inode *inode, u64 new_size,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+				    int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
-int btrfs_writepages(struct address_space *mapping,
-		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *new_root, u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio, unsigned long bio_flags);
-
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio,
+			 unsigned long bio_flags);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
-int btrfs_dirty_inode(struct inode *inode);
-int btrfs_update_time(struct file *file);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
 void btrfs_destroy_inode(struct inode *inode);
 int btrfs_drop_inode(struct inode *inode);
@@ -2708,13 +3722,14 @@
 int btrfs_update_inode(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      struct inode *inode);
+int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, struct inode *inode);
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
-int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_cleanup(struct btrfs_root *root);
 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
-int btrfs_invalidate_inodes(struct btrfs_root *root);
+void btrfs_invalidate_inodes(struct btrfs_root *root);
 void btrfs_add_delayed_iput(struct inode *inode);
 void btrfs_run_delayed_iputs(struct btrfs_root *root);
 int btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -2730,23 +3745,40 @@
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 void btrfs_update_iflags(struct inode *inode);
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
+int btrfs_is_empty_uuid(u8 *uuid);
 int btrfs_defrag_file(struct inode *inode, struct file *file,
 		      struct btrfs_ioctl_defrag_range_args *range,
 		      u64 newer_than, unsigned long max_pages);
+void btrfs_get_block_group_info(struct list_head *groups_list,
+				struct btrfs_ioctl_space_info *space);
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+			       struct btrfs_ioctl_balance_args *bargs);
+
+
 /* file.c */
+int btrfs_auto_defrag_init(void);
+void btrfs_auto_defrag_exit(void);
 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
 			   struct inode *inode);
 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
-			    int skip_pinned);
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+			     int skip_pinned);
+int btrfs_replace_extent_cache(struct inode *inode, struct extent_map *replace,
+			       u64 start, u64 end, int skip_pinned,
+			       int modified);
 extern const struct file_operations btrfs_file_operations;
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
-		       u64 start, u64 end, u64 *hint_byte, int drop_cache);
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root, struct inode *inode,
+			 struct btrfs_path *path, u64 start, u64 end,
+			 u64 *drop_end, int drop_cache);
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct inode *inode, u64 start,
+		       u64 end, int drop_cache);
 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			      struct inode *inode, u64 start, u64 end);
 int btrfs_release_file(struct inode *inode, struct file *file);
-void btrfs_drop_pages(struct page **pages, size_t num_pages);
 int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 		      struct page **pages, size_t num_pages,
 		      loff_t pos, size_t write_bytes,
@@ -2754,7 +3786,7 @@
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, int cache_only);
+			struct btrfs_root *root);
 
 /* sysfs.c */
 int btrfs_init_sysfs(void);
@@ -2766,13 +3798,129 @@
 /* super.c */
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
+
+#ifdef CONFIG_PRINTK
+__printf(2, 3)
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
+#else
+static inline __printf(2, 3)
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+}
+#endif
+
+#define btrfs_emerg(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
+#define btrfs_alert(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
+#define btrfs_crit(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
+#define btrfs_err(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_ERR fmt, ##args)
+#define btrfs_warn(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
+#define btrfs_notice(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
+#define btrfs_info(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_INFO fmt, ##args)
+#define btrfs_debug(fs_info, fmt, args...) \
+	btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
+
+#ifdef CONFIG_BTRFS_ASSERT
+
+static inline void assfail(char *expr, char *file, int line)
+{
+	printk(KERN_ERR "BTRFS assertion failed: %s, file: %s, line: %d",
+	       expr, file, line);
+	BUG();
+}
+
+#define ASSERT(expr)	\
+	(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+#else
+#define ASSERT(expr)	((void)0)
+#endif
+
+#define btrfs_assert()
+__printf(5, 6)
 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
-		     unsigned int line, int errno);
+		     unsigned int line, int errno, const char *fmt, ...);
+
+
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, const char *function,
+			       unsigned int line, int errno);
+
+#define btrfs_set_fs_incompat(__fs_info, opt) \
+	__btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
+					   u64 flag)
+{
+	struct btrfs_super_block *disk_super;
+	u64 features;
+
+	disk_super = fs_info->super_copy;
+	features = btrfs_super_incompat_flags(disk_super);
+	if (!(features & flag)) {
+		spin_lock(&fs_info->super_lock);
+		features = btrfs_super_incompat_flags(disk_super);
+		if (!(features & flag)) {
+			features |= flag;
+			btrfs_set_super_incompat_flags(disk_super, features);
+			printk(KERN_INFO "btrfs: setting %llu feature flag\n",
+					 flag);
+		}
+		spin_unlock(&fs_info->super_lock);
+	}
+}
+
+#define btrfs_fs_incompat(fs_info, opt) \
+	__btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+{
+	struct btrfs_super_block *disk_super;
+	disk_super = fs_info->super_copy;
+	return !!(btrfs_super_incompat_flags(disk_super) & flag);
+}
+
+/*
+ * Call btrfs_abort_transaction as early as possible when an error condition is
+ * detected, that way the exact line number is reported.
+ */
+
+#define btrfs_abort_transaction(trans, root, errno)		\
+do {								\
+	__btrfs_abort_transaction(trans, root, __func__,	\
+				  __LINE__, errno);		\
+} while (0)
 
 #define btrfs_std_error(fs_info, errno)				\
 do {								\
 	if ((errno))						\
-		__btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
+		__btrfs_std_error((fs_info), __func__,		\
+				   __LINE__, (errno), NULL);	\
+} while (0)
+
+#define btrfs_error(fs_info, errno, fmt, args...)		\
+do {								\
+	__btrfs_std_error((fs_info), __func__, __LINE__,	\
+			  (errno), fmt, ##args);		\
+} while (0)
+
+__printf(5, 6)
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+		   unsigned int line, int errno, const char *fmt, ...);
+
+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic().  Otherwise we BUG() here.
+ */
+#define btrfs_panic(fs_info, errno, fmt, args...)			\
+do {									\
+	__btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args);	\
+	BUG();								\
 } while (0)
 
 /* acl.c */
@@ -2802,25 +3950,26 @@
 			    struct btrfs_root *root);
 int btrfs_recover_relocation(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
-void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct extent_buffer *buf,
-			   struct extent_buffer *cow);
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct extent_buffer *buf,
+			  struct extent_buffer *cow);
 void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
 			      struct btrfs_pending_snapshot *pending,
 			      u64 *bytes_to_reserve);
-void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
 			      struct btrfs_pending_snapshot *pending);
 
 /* scrub.c */
-int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
-		    struct btrfs_scrub_progress *progress, int readonly);
-int btrfs_scrub_pause(struct btrfs_root *root);
-int btrfs_scrub_pause_super(struct btrfs_root *root);
-int btrfs_scrub_continue(struct btrfs_root *root);
-int btrfs_scrub_continue_super(struct btrfs_root *root);
-int btrfs_scrub_cancel(struct btrfs_root *root);
-int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
-int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+		    u64 end, struct btrfs_scrub_progress *progress,
+		    int readonly, int is_dev_replace);
+void btrfs_scrub_pause(struct btrfs_root *root);
+void btrfs_scrub_pause_super(struct btrfs_root *root);
+void btrfs_scrub_continue(struct btrfs_root *root);
+void btrfs_scrub_continue_super(struct btrfs_root *root);
+int btrfs_scrub_cancel(struct btrfs_fs_info *info);
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info,
+			   struct btrfs_device *dev);
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 			 struct btrfs_scrub_progress *progress);
 
@@ -2840,4 +3989,64 @@
 int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 			 u64 start, int err);
 
+/* qgroup.c */
+struct qgroup_update {
+	struct list_head list;
+	struct btrfs_delayed_ref_node *node;
+	struct btrfs_delayed_extent_op *extent_op;
+};
+
+int btrfs_quota_enable(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info);
+int btrfs_quota_disable(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
+void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info);
+int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 src, u64 dst);
+int btrfs_create_qgroup(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info, u64 qgroupid,
+			char *name);
+int btrfs_remove_qgroup(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info, u64 qgroupid);
+int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info, u64 qgroupid,
+		       struct btrfs_qgroup_limit *limit);
+int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
+void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
+struct btrfs_delayed_extent_op;
+int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans,
+			    struct btrfs_delayed_ref_node *node,
+			    struct btrfs_delayed_extent_op *extent_op);
+int btrfs_qgroup_account_ref(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info,
+			     struct btrfs_delayed_ref_node *node,
+			     struct btrfs_delayed_extent_op *extent_op);
+int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
+		      struct btrfs_fs_info *fs_info);
+int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans,
+			 struct btrfs_fs_info *fs_info, u64 srcid, u64 objectid,
+			 struct btrfs_qgroup_inherit *inherit);
+int btrfs_qgroup_reserve(struct btrfs_root *root, u64 num_bytes);
+void btrfs_qgroup_free(struct btrfs_root *root, u64 num_bytes);
+
+void assert_qgroups_uptodate(struct btrfs_trans_handle *trans);
+
+static inline int is_fstree(u64 rootid)
+{
+	if (rootid == BTRFS_FS_TREE_OBJECTID ||
+	    (s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+		return 1;
+	return 0;
+}
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+	return signal_pending(current);
+}
+
+
 #endif
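
The ctree.h hunks above add the btrfs_printk()/btrfs_err() message helpers, the ASSERT() macro, and the __btrfs_set_fs_incompat()/__btrfs_fs_incompat() feature-flag helpers. The incompat helper uses a double-checked pattern: test the flag without the lock, then re-test under fs_info->super_lock before writing, so the common already-set case stays lock-free. The stand-alone C sketch below models only that pattern, with a pthread mutex standing in for the kernel spinlock; set_fs_incompat() and the flag value are illustrative names and are not part of the patch.

/*
 * Stand-alone model of the double-checked pattern used by the new
 * __btrfs_set_fs_incompat() helper: check the flag locklessly, then
 * re-check under the lock before writing.  Build with: cc -pthread demo.c
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t incompat_flags;                 /* models the superblock flag word */
static pthread_mutex_t super_lock = PTHREAD_MUTEX_INITIALIZER;

static void set_fs_incompat(uint64_t flag)
{
	if (incompat_flags & flag)              /* fast path: flag already set, no lock */
		return;

	pthread_mutex_lock(&super_lock);
	if (!(incompat_flags & flag)) {         /* re-check under the lock */
		incompat_flags |= flag;
		printf("setting %llu feature flag\n", (unsigned long long)flag);
	}
	pthread_mutex_unlock(&super_lock);
}

int main(void)
{
	set_fs_incompat(1ULL << 3);
	set_fs_incompat(1ULL << 3);             /* second call is a silent no-op */
	return 0;
}
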
diff -ur a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
--- a/fs/btrfs/delayed-inode.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/delayed-inode.c	2014-02-17 11:56:58.000000000 +0100
@@ -21,15 +21,17 @@
 #include "delayed-inode.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "ctree.h"
 
-#define BTRFS_DELAYED_WRITEBACK		400
-#define BTRFS_DELAYED_BACKGROUND	100
+#define BTRFS_DELAYED_WRITEBACK		512
+#define BTRFS_DELAYED_BACKGROUND	128
+#define BTRFS_DELAYED_BATCH		16
 
 static struct kmem_cache *delayed_node_cache;
 
 int __init btrfs_delayed_inode_init(void)
 {
-	delayed_node_cache = kmem_cache_create("delayed_node",
+	delayed_node_cache = kmem_cache_create("btrfs_delayed_node",
 					sizeof(struct btrfs_delayed_node),
 					0,
 					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
@@ -62,6 +64,7 @@
 	INIT_LIST_HEAD(&delayed_node->n_list);
 	INIT_LIST_HEAD(&delayed_node->p_list);
 	delayed_node->bytes_reserved = 0;
+	memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
 }
 
 static inline int btrfs_is_continuous_delayed_item(
@@ -115,6 +118,7 @@
 	return NULL;
 }
 
+/* Will return either the node or PTR_ERR(-ENOMEM) */
 static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
 							struct inode *inode)
 {
@@ -199,7 +203,7 @@
 	spin_unlock(&root->lock);
 }
 
-struct btrfs_delayed_node *btrfs_first_delayed_node(
+static struct btrfs_delayed_node *btrfs_first_delayed_node(
 			struct btrfs_delayed_root *delayed_root)
 {
 	struct list_head *p;
@@ -218,7 +222,7 @@
 	return node;
 }
 
-struct btrfs_delayed_node *btrfs_next_delayed_node(
+static struct btrfs_delayed_node *btrfs_next_delayed_node(
 						struct btrfs_delayed_node *node)
 {
 	struct btrfs_delayed_root *delayed_root;
@@ -279,7 +283,7 @@
 	__btrfs_release_delayed_node(node, 0);
 }
 
-struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
+static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
 					struct btrfs_delayed_root *delayed_root)
 {
 	struct list_head *p;
@@ -305,7 +309,7 @@
 	__btrfs_release_delayed_node(node, 1);
 }
 
-struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
+static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
 {
 	struct btrfs_delayed_item *item;
 	item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
@@ -380,7 +384,7 @@
 	return NULL;
 }
 
-struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
+static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
 					struct btrfs_delayed_node *delayed_node,
 					struct btrfs_key *key)
 {
@@ -391,45 +395,6 @@
 	return item;
 }
 
-struct btrfs_delayed_item *__btrfs_lookup_delayed_deletion_item(
-					struct btrfs_delayed_node *delayed_node,
-					struct btrfs_key *key)
-{
-	struct btrfs_delayed_item *item;
-
-	item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
-					   NULL, NULL);
-	return item;
-}
-
-struct btrfs_delayed_item *__btrfs_search_delayed_insertion_item(
-					struct btrfs_delayed_node *delayed_node,
-					struct btrfs_key *key)
-{
-	struct btrfs_delayed_item *item, *next;
-
-	item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
-					   NULL, &next);
-	if (!item)
-		item = next;
-
-	return item;
-}
-
-struct btrfs_delayed_item *__btrfs_search_delayed_deletion_item(
-					struct btrfs_delayed_node *delayed_node,
-					struct btrfs_key *key)
-{
-	struct btrfs_delayed_item *item, *next;
-
-	item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
-					   NULL, &next);
-	if (!item)
-		item = next;
-
-	return item;
-}
-
 static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
 				    struct btrfs_delayed_item *ins,
 				    int action)
@@ -492,6 +457,15 @@
 					BTRFS_DELAYED_DELETION_ITEM);
 }
 
+static void finish_one_item(struct btrfs_delayed_root *delayed_root)
+{
+	int seq = atomic_inc_return(&delayed_root->items_seq);
+	if ((atomic_dec_return(&delayed_root->items) <
+	    BTRFS_DELAYED_BACKGROUND || seq % BTRFS_DELAYED_BATCH == 0) &&
+	    waitqueue_active(&delayed_root->wait))
+		wake_up(&delayed_root->wait);
+}
+
 static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
 {
 	struct rb_root *root;
@@ -510,10 +484,8 @@
 
 	rb_erase(&delayed_item->rb_node, root);
 	delayed_item->delayed_node->count--;
-	atomic_dec(&delayed_root->items);
-	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND &&
-	    waitqueue_active(&delayed_root->wait))
-		wake_up(&delayed_root->wait);
+
+	finish_one_item(delayed_root);
 }
 
 static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
@@ -525,7 +497,7 @@
 	}
 }
 
-struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
+static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
 					struct btrfs_delayed_node *delayed_node)
 {
 	struct rb_node *p;
@@ -538,7 +510,7 @@
 	return item;
 }
 
-struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
+static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
 					struct btrfs_delayed_node *delayed_node)
 {
 	struct rb_node *p;
@@ -551,7 +523,7 @@
 	return item;
 }
 
-struct btrfs_delayed_item *__btrfs_next_delayed_item(
+static struct btrfs_delayed_item *__btrfs_next_delayed_item(
 						struct btrfs_delayed_item *item)
 {
 	struct rb_node *p;
@@ -564,20 +536,6 @@
 	return next;
 }
 
-static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
-						   u64 root_id)
-{
-	struct btrfs_key root_key;
-
-	if (root->objectid == root_id)
-		return root;
-
-	root_key.objectid = root_id;
-	root_key.type = BTRFS_ROOT_ITEM_KEY;
-	root_key.offset = (u64)-1;
-	return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
-}
-
 static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 					       struct btrfs_root *root,
 					       struct btrfs_delayed_item *item)
@@ -595,8 +553,12 @@
 
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
-	if (!ret)
+	if (!ret) {
+		trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+					      item->key.objectid,
+					      num_bytes, 1);
 		item->bytes_reserved = num_bytes;
+	}
 
 	return ret;
 }
@@ -610,6 +572,9 @@
 		return;
 
 	rsv = &root->fs_info->delayed_block_rsv;
+	trace_btrfs_space_reservation(root->fs_info, "delayed_item",
+				      item->key.objectid, item->bytes_reserved,
+				      0);
 	btrfs_block_rsv_release(root, rsv,
 				item->bytes_reserved);
 }
@@ -624,7 +589,7 @@
 	struct btrfs_block_rsv *dst_rsv;
 	u64 num_bytes;
 	int ret;
-	int release = false;
+	bool release = false;
 
 	src_rsv = trans->block_rsv;
 	dst_rsv = &root->fs_info->delayed_block_rsv;
@@ -641,8 +606,9 @@
 	 * we're accounted for.
 	 */
 	if (!src_rsv || (!trans->bytes_reserved &&
-	    src_rsv != &root->fs_info->delalloc_block_rsv)) {
-		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+			 src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
+		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+					  BTRFS_RESERVE_NO_FLUSH);
 		/*
 		 * Since we're under a transaction reserve_metadata_bytes could
 		 * try to commit the transaction which will make it return
@@ -651,13 +617,18 @@
 		 */
 		if (ret == -EAGAIN)
 			ret = -ENOSPC;
-		if (!ret)
+		if (!ret) {
 			node->bytes_reserved = num_bytes;
+			trace_btrfs_space_reservation(root->fs_info,
+						      "delayed_inode",
+						      btrfs_ino(inode),
+						      num_bytes, 1);
+		}
 		return ret;
-	} else if (src_rsv == &root->fs_info->delalloc_block_rsv) {
+	} else if (src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
 		spin_lock(&BTRFS_I(inode)->lock);
-		if (BTRFS_I(inode)->delalloc_meta_reserved) {
-			BTRFS_I(inode)->delalloc_meta_reserved = 0;
+		if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+				       &BTRFS_I(inode)->runtime_flags)) {
 			spin_unlock(&BTRFS_I(inode)->lock);
 			release = true;
 			goto migrate;
@@ -672,7 +643,8 @@
 		 * reserve something strictly for us.  If not be a pain and try
 		 * to steal from the delalloc block rsv.
 		 */
-		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
+					  BTRFS_RESERVE_NO_FLUSH);
 		if (!ret)
 			goto out;
 
@@ -707,11 +679,17 @@
 	 * reservation here.  I think it may be time for a documentation page on
 	 * how block rsvs. work.
 	 */
-	if (!ret)
+	if (!ret) {
+		trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+					      btrfs_ino(inode), num_bytes, 1);
 		node->bytes_reserved = num_bytes;
+	}
 
-	if (release)
+	if (release) {
+		trace_btrfs_space_reservation(root->fs_info, "delalloc",
+					      btrfs_ino(inode), num_bytes, 0);
 		btrfs_block_rsv_release(root, src_rsv, num_bytes);
+	}
 
 	return ret;
 }
@@ -725,6 +703,8 @@
 		return;
 
 	rsv = &root->fs_info->delayed_block_rsv;
+	trace_btrfs_space_reservation(root->fs_info, "delayed_inode",
+				      node->inode_id, node->bytes_reserved, 0);
 	btrfs_block_rsv_release(root, rsv,
 				node->bytes_reserved);
 	node->bytes_reserved = 0;
@@ -734,10 +714,9 @@
  * This helper will insert some continuous items into the same leaf according
  * to the free space of the leaf.
  */
-static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_path *path,
-				struct btrfs_delayed_item *item)
+static int btrfs_batch_insert_items(struct btrfs_root *root,
+				    struct btrfs_path *path,
+				    struct btrfs_delayed_item *item)
 {
 	struct btrfs_delayed_item *curr, *next;
 	int free_space;
@@ -816,10 +795,8 @@
 	btrfs_clear_path_blocking(path, NULL, 0);
 
 	/* insert the keys of the items */
-	ret = setup_items_for_insert(trans, root, path, keys, data_size,
-				     total_data_size, total_size, nitems);
-	if (ret)
-		goto error;
+	setup_items_for_insert(root, path, keys, data_size,
+			       total_data_size, total_size, nitems);
 
 	/* insert the dir index items */
 	slot = path->slots[0];
@@ -853,7 +830,6 @@
 				     struct btrfs_delayed_item *delayed_item)
 {
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 	char *ptr;
 	int ret;
 
@@ -864,7 +840,6 @@
 
 	leaf = path->nodes[0];
 
-	item = btrfs_item_nr(leaf, path->slots[0]);
 	ptr = btrfs_item_ptr(leaf, path->slots[0], char);
 
 	write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
@@ -904,7 +879,7 @@
 	if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
 		/* insert the continuous items into the same leaf */
 		path->slots[0]++;
-		btrfs_batch_insert_items(trans, root, path, curr);
+		btrfs_batch_insert_items(root, path, curr);
 	}
 	btrfs_release_delayed_item(prev);
 	btrfs_mark_buffer_dirty(path->nodes[0]);
@@ -1008,9 +983,10 @@
 		btrfs_release_delayed_item(prev);
 		ret = 0;
 		btrfs_release_path(path);
-		if (curr)
+		if (curr) {
+			mutex_unlock(&node->mutex);
 			goto do_again;
-		else
+		} else
 			goto delete_fail;
 	}
 
@@ -1035,40 +1011,29 @@
 		delayed_node->count--;
 
 		delayed_root = delayed_node->root->fs_info->delayed_root;
-		atomic_dec(&delayed_root->items);
-		if (atomic_read(&delayed_root->items) <
-		    BTRFS_DELAYED_BACKGROUND &&
-		    waitqueue_active(&delayed_root->wait))
-			wake_up(&delayed_root->wait);
+		finish_one_item(delayed_root);
 	}
 }
 
-static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
-				      struct btrfs_path *path,
-				      struct btrfs_delayed_node *node)
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct btrfs_delayed_node *node)
 {
 	struct btrfs_key key;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
 	int ret;
 
-	mutex_lock(&node->mutex);
-	if (!node->inode_dirty) {
-		mutex_unlock(&node->mutex);
-		return 0;
-	}
-
 	key.objectid = node->inode_id;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
+
 	ret = btrfs_lookup_inode(trans, root, path, &key, 1);
 	if (ret > 0) {
 		btrfs_release_path(path);
-		mutex_unlock(&node->mutex);
 		return -ENOENT;
 	} else if (ret < 0) {
-		mutex_unlock(&node->mutex);
 		return ret;
 	}
 
@@ -1083,20 +1048,65 @@
 
 	btrfs_delayed_inode_release_metadata(root, node);
 	btrfs_release_delayed_inode(node);
-	mutex_unlock(&node->mutex);
 
 	return 0;
 }
 
-/* Called when committing the transaction. */
-int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root)
+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     struct btrfs_path *path,
+					     struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	mutex_lock(&node->mutex);
+	if (!node->inode_dirty) {
+		mutex_unlock(&node->mutex);
+		return 0;
+	}
+
+	ret = __btrfs_update_delayed_inode(trans, root, path, node);
+	mutex_unlock(&node->mutex);
+	return ret;
+}
+
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				   struct btrfs_path *path,
+				   struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+	return ret;
+}
+
+/*
+ * Called when committing the transaction.
+ * Returns 0 on success.
+ * Returns < 0 on error and returns with an aborted transaction with any
+ * outstanding delayed items cleaned up.
+ */
+static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root, int nr)
 {
 	struct btrfs_delayed_root *delayed_root;
 	struct btrfs_delayed_node *curr_node, *prev_node;
 	struct btrfs_path *path;
 	struct btrfs_block_rsv *block_rsv;
 	int ret = 0;
+	bool count = (nr > 0);
+
+	if (trans->aborted)
+		return -EIO;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1109,18 +1119,13 @@
 	delayed_root = btrfs_get_delayed_root(root);
 
 	curr_node = btrfs_first_delayed_node(delayed_root);
-	while (curr_node) {
-		root = curr_node->root;
-		ret = btrfs_insert_delayed_items(trans, path, root,
-						 curr_node);
-		if (!ret)
-			ret = btrfs_delete_delayed_items(trans, path, root,
-							 curr_node);
-		if (!ret)
-			ret = btrfs_update_delayed_inode(trans, root, path,
+	while (curr_node && (!count || (count && nr--))) {
+		ret = __btrfs_commit_inode_delayed_items(trans, path,
 							 curr_node);
 		if (ret) {
 			btrfs_release_delayed_node(curr_node);
+			curr_node = NULL;
+			btrfs_abort_transaction(trans, root, ret);
 			break;
 		}
 
@@ -1129,56 +1134,113 @@
 		btrfs_release_delayed_node(prev_node);
 	}
 
+	if (curr_node)
+		btrfs_release_delayed_node(curr_node);
 	btrfs_free_path(path);
 	trans->block_rsv = block_rsv;
+
 	return ret;
 }
 
-static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-					      struct btrfs_delayed_node *node)
+int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root)
 {
+	return __btrfs_run_delayed_items(trans, root, -1);
+}
+
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, int nr)
+{
+	return __btrfs_run_delayed_items(trans, root, nr);
+}
+
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				     struct inode *inode)
+{
+	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
 	struct btrfs_path *path;
 	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
+	if (!delayed_node)
+		return 0;
+
+	mutex_lock(&delayed_node->mutex);
+	if (!delayed_node->count) {
+		mutex_unlock(&delayed_node->mutex);
+		btrfs_release_delayed_node(delayed_node);
+		return 0;
+	}
+	mutex_unlock(&delayed_node->mutex);
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
 
-	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_update_delayed_inode(trans, node->root, path, node);
-	btrfs_free_path(path);
+	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 
+	btrfs_release_delayed_node(delayed_node);
+	btrfs_free_path(path);
 	trans->block_rsv = block_rsv;
+
 	return ret;
 }
 
-int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-				     struct inode *inode)
+int btrfs_commit_inode_delayed_inode(struct inode *inode)
 {
+	struct btrfs_trans_handle *trans;
 	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
 	if (!delayed_node)
 		return 0;
 
 	mutex_lock(&delayed_node->mutex);
-	if (!delayed_node->count) {
+	if (!delayed_node->inode_dirty) {
 		mutex_unlock(&delayed_node->mutex);
 		btrfs_release_delayed_node(delayed_node);
 		return 0;
 	}
 	mutex_unlock(&delayed_node->mutex);
 
-	ret = __btrfs_commit_inode_delayed_items(trans, delayed_node);
+	trans = btrfs_join_transaction(delayed_node->root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto trans_out;
+	}
+	path->leave_spinning = 1;
+
+	block_rsv = trans->block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+	mutex_lock(&delayed_node->mutex);
+	if (delayed_node->inode_dirty)
+		ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+						   path, delayed_node);
+	else
+		ret = 0;
+	mutex_unlock(&delayed_node->mutex);
+
+	btrfs_free_path(path);
+	trans->block_rsv = block_rsv;
+trans_out:
+	btrfs_end_transaction(trans, delayed_node->root);
+	btrfs_btree_balance_dirty(delayed_node->root);
+out:
 	btrfs_release_delayed_node(delayed_node);
+
 	return ret;
 }
 
@@ -1194,49 +1256,49 @@
 	btrfs_release_delayed_node(delayed_node);
 }
 
-struct btrfs_async_delayed_node {
-	struct btrfs_root *root;
-	struct btrfs_delayed_node *delayed_node;
+struct btrfs_async_delayed_work {
+	struct btrfs_delayed_root *delayed_root;
+	int nr;
 	struct btrfs_work work;
 };
 
-static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
+static void btrfs_async_run_delayed_root(struct btrfs_work *work)
 {
-	struct btrfs_async_delayed_node *async_node;
+	struct btrfs_async_delayed_work *async_work;
+	struct btrfs_delayed_root *delayed_root;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_path *path;
 	struct btrfs_delayed_node *delayed_node = NULL;
 	struct btrfs_root *root;
 	struct btrfs_block_rsv *block_rsv;
-	unsigned long nr = 0;
-	int need_requeue = 0;
-	int ret;
+	int total_done = 0;
 
-	async_node = container_of(work, struct btrfs_async_delayed_node, work);
+	async_work = container_of(work, struct btrfs_async_delayed_work, work);
+	delayed_root = async_work->delayed_root;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		goto out;
-	path->leave_spinning = 1;
 
-	delayed_node = async_node->delayed_node;
+again:
+	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND / 2)
+		goto free_path;
+
+	delayed_node = btrfs_first_prepared_delayed_node(delayed_root);
+	if (!delayed_node)
+		goto free_path;
+
+	path->leave_spinning = 1;
 	root = delayed_node->root;
 
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
-		goto free_path;
+		goto release_path;
 
 	block_rsv = trans->block_rsv;
 	trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
-	ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, root,
-						 delayed_node);
-
-	if (!ret)
-		btrfs_update_delayed_inode(trans, root, path, delayed_node);
-
+	__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 	/*
 	 * Maybe new delayed items have been inserted, so we need requeue
 	 * the work. Besides that, we must dequeue the empty delayed nodes
@@ -1262,59 +1324,47 @@
 	 * Task1 will sleep until the transaction is commited.
 	 */
 	mutex_lock(&delayed_node->mutex);
-	if (delayed_node->count)
-		need_requeue = 1;
-	else
-		btrfs_dequeue_delayed_node(root->fs_info->delayed_root,
-					   delayed_node);
+	btrfs_dequeue_delayed_node(root->fs_info->delayed_root, delayed_node);
 	mutex_unlock(&delayed_node->mutex);
 
-	nr = trans->blocks_used;
-
 	trans->block_rsv = block_rsv;
 	btrfs_end_transaction_dmeta(trans, root);
-	__btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty_nodelay(root);
+
+release_path:
+	btrfs_release_path(path);
+	total_done++;
+
+	btrfs_release_prepared_delayed_node(delayed_node);
+	if (async_work->nr == 0 || total_done < async_work->nr)
+		goto again;
+
 free_path:
 	btrfs_free_path(path);
 out:
-	if (need_requeue)
-		btrfs_requeue_work(&async_node->work);
-	else {
-		btrfs_release_prepared_delayed_node(delayed_node);
-		kfree(async_node);
-	}
+	wake_up(&delayed_root->wait);
+	kfree(async_work);
 }
 
+
 static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
-				     struct btrfs_root *root, int all)
+				     struct btrfs_root *root, int nr)
 {
-	struct btrfs_async_delayed_node *async_node;
-	struct btrfs_delayed_node *curr;
-	int count = 0;
+	struct btrfs_async_delayed_work *async_work;
 
-again:
-	curr = btrfs_first_prepared_delayed_node(delayed_root);
-	if (!curr)
+	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
 		return 0;
 
-	async_node = kmalloc(sizeof(*async_node), GFP_NOFS);
-	if (!async_node) {
-		btrfs_release_prepared_delayed_node(curr);
+	async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
+	if (!async_work)
 		return -ENOMEM;
-	}
-
-	async_node->root = root;
-	async_node->delayed_node = curr;
 
-	async_node->work.func = btrfs_async_run_delayed_node_done;
-	async_node->work.flags = 0;
-
-	btrfs_queue_worker(&root->fs_info->delayed_workers, &async_node->work);
-	count++;
-
-	if (all || count < 4)
-		goto again;
+	async_work->delayed_root = delayed_root;
+	async_work->work.func = btrfs_async_run_delayed_root;
+	async_work->work.flags = 0;
+	async_work->nr = nr;
 
+	btrfs_queue_worker(&root->fs_info->delayed_workers, &async_work->work);
 	return 0;
 }
 
@@ -1325,32 +1375,58 @@
 	WARN_ON(btrfs_first_delayed_node(delayed_root));
 }
 
+static int refs_newer(struct btrfs_delayed_root *delayed_root,
+		      int seq, int count)
+{
+	int val = atomic_read(&delayed_root->items_seq);
+
+	if (val < seq || val >= seq + count)
+		return 1;
+	return 0;
+}
+
 void btrfs_balance_delayed_items(struct btrfs_root *root)
 {
 	struct btrfs_delayed_root *delayed_root;
+	int seq;
 
 	delayed_root = btrfs_get_delayed_root(root);
 
 	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
 		return;
 
+	seq = atomic_read(&delayed_root->items_seq);
+
 	if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
 		int ret;
-		ret = btrfs_wq_run_delayed_node(delayed_root, root, 1);
+		DEFINE_WAIT(__wait);
+
+		ret = btrfs_wq_run_delayed_node(delayed_root, root, 0);
 		if (ret)
 			return;
 
-		wait_event_interruptible_timeout(
-				delayed_root->wait,
-				(atomic_read(&delayed_root->items) <
-				 BTRFS_DELAYED_BACKGROUND),
-				HZ);
-		return;
+		while (1) {
+			prepare_to_wait(&delayed_root->wait, &__wait,
+					TASK_INTERRUPTIBLE);
+
+			if (refs_newer(delayed_root, seq,
+				       BTRFS_DELAYED_BATCH) ||
+			    atomic_read(&delayed_root->items) <
+			    BTRFS_DELAYED_BACKGROUND) {
+				break;
+			}
+			if (!signal_pending(current))
+				schedule();
+			else
+				break;
+		}
+		finish_wait(&delayed_root->wait, &__wait);
 	}
 
-	btrfs_wq_run_delayed_node(delayed_root, root, 0);
+	btrfs_wq_run_delayed_node(delayed_root, root, BTRFS_DELAYED_BATCH);
 }
 
+/* Will return 0 or -ENOMEM */
 int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root, const char *name,
 				   int name_len, struct inode *dir,
@@ -1372,35 +1448,34 @@
 		goto release_node;
 	}
 
-	ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
-	/*
-	 * we have reserved enough space when we start a new transaction,
-	 * so reserving metadata failure is impossible
-	 */
-	BUG_ON(ret);
-
 	delayed_item->key.objectid = btrfs_ino(dir);
 	btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
 	delayed_item->key.offset = index;
 
 	dir_item = (struct btrfs_dir_item *)delayed_item->data;
 	dir_item->location = *disk_key;
-	dir_item->transid = cpu_to_le64(trans->transid);
-	dir_item->data_len = 0;
-	dir_item->name_len = cpu_to_le16(name_len);
-	dir_item->type = type;
+	btrfs_set_stack_dir_transid(dir_item, trans->transid);
+	btrfs_set_stack_dir_data_len(dir_item, 0);
+	btrfs_set_stack_dir_name_len(dir_item, name_len);
+	btrfs_set_stack_dir_type(dir_item, type);
 	memcpy((char *)(dir_item + 1), name, name_len);
 
+	ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
+	/*
+	 * we have reserved enough space when we start a new transaction,
+	 * so reserving metadata failure is impossible
+	 */
+	BUG_ON(ret);
+
+
 	mutex_lock(&delayed_node->mutex);
 	ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
 	if (unlikely(ret)) {
-		printk(KERN_ERR "err add delayed dir index item(name: %s) into "
-				"the insertion tree of the delayed node"
+		printk(KERN_ERR "err add delayed dir index item(name: %.*s) "
+				"into the insertion tree of the delayed node"
 				"(root id: %llu, inode id: %llu, errno: %d)\n",
-				name,
-				(unsigned long long)delayed_node->root->objectid,
-				(unsigned long long)delayed_node->inode_id,
-				ret);
+				name_len, name, delayed_node->root->objectid,
+				delayed_node->inode_id, ret);
 		BUG();
 	}
 	mutex_unlock(&delayed_node->mutex);
@@ -1471,9 +1546,7 @@
 		printk(KERN_ERR "err add delayed dir index item(index: %llu) "
 				"into the deletion tree of the delayed node"
 				"(root id: %llu, inode id: %llu, errno: %d)\n",
-				(unsigned long long)index,
-				(unsigned long long)node->root->objectid,
-				(unsigned long long)node->inode_id,
+				index, node->root->objectid, node->inode_id,
 				ret);
 		BUG();
 	}
@@ -1624,7 +1697,7 @@
 
 		di = (struct btrfs_dir_item *)curr->data;
 		name = (char *)(di + 1);
-		name_len = le16_to_cpu(di->name_len);
+		name_len = btrfs_stack_dir_name_len(di);
 
 		d_type = btrfs_filetype_table[di->type];
 		btrfs_disk_key_to_cpu(&location, &di->location);
@@ -1641,27 +1714,6 @@
 	return 0;
 }
 
-BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
-			 generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
-			 sequence, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
-			 transid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
-			 nbytes, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
-			 block_group, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
-
-BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
-
 static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
 				  struct btrfs_inode_item *inode_item,
 				  struct inode *inode)
@@ -1674,7 +1726,7 @@
 	btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
 	btrfs_set_stack_inode_generation(inode_item,
 					 BTRFS_I(inode)->generation);
-	btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence);
+	btrfs_set_stack_inode_sequence(inode_item, inode->i_version);
 	btrfs_set_stack_inode_transid(inode_item, trans->transid);
 	btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
 	btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
@@ -1722,7 +1774,7 @@
 	set_nlink(inode, btrfs_stack_inode_nlink(inode_item));
 	inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
 	BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
-	BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
+	inode->i_version = btrfs_stack_inode_sequence(inode_item);
 	inode->i_rdev = 0;
 	*rdev = btrfs_stack_inode_rdev(inode_item);
 	BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
@@ -1847,3 +1899,21 @@
 		}
 	}
 }
+
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root)
+{
+	struct btrfs_delayed_root *delayed_root;
+	struct btrfs_delayed_node *curr_node, *prev_node;
+
+	delayed_root = btrfs_get_delayed_root(root);
+
+	curr_node = btrfs_first_delayed_node(delayed_root);
+	while (curr_node) {
+		__btrfs_kill_delayed_node(curr_node);
+
+		prev_node = curr_node;
+		curr_node = btrfs_next_delayed_node(curr_node);
+		btrfs_release_delayed_node(prev_node);
+	}
+}
+
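
The delayed-inode.c hunks above replace the per-node async worker with btrfs_async_run_delayed_root(), which drains batches of prepared nodes, and throttle wake-ups through the new items_seq counter: finish_one_item() wakes waiters only when the outstanding item count falls below BTRFS_DELAYED_BACKGROUND or once every BTRFS_DELAYED_BATCH completions. The sketch below is a stand-alone user-space model of that wake-up policy only; the counters are plain ints rather than atomics, finish_one_item() here merely reports whether a wake-up would fire, and none of it is kernel code.

/*
 * Minimal sketch of the batched wake-up policy: a waiter is woken when
 * the item count drops below the background threshold, or every
 * DELAYED_BATCH completions (tracked via items_seq), never per item.
 * Thresholds mirror the new #defines in the patch.
 */
#include <stdio.h>

#define DELAYED_BACKGROUND	128
#define DELAYED_BATCH		16

static int items;	/* models delayed_root->items */
static int items_seq;	/* models delayed_root->items_seq */

static int finish_one_item(void)
{
	int seq = ++items_seq;

	/* wake a balancing waiter only at batch boundaries or when idle enough */
	return (--items < DELAYED_BACKGROUND || seq % DELAYED_BATCH == 0);
}

int main(void)
{
	int i, wakeups = 0;

	items = 600;			/* above the 512 writeback threshold */
	for (i = 0; i < 600; i++)
		wakeups += finish_one_item();

	/* far fewer wake-ups than completed items */
	printf("%d items finished, %d wakeups\n", i, wakeups);
	return 0;
}
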
diff -ur a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
--- a/fs/btrfs/delayed-inode.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/delayed-inode.h	2014-02-17 11:56:58.000000000 +0100
@@ -43,6 +43,7 @@
 	 */
 	struct list_head prepare_list;
 	atomic_t items;		/* for delayed items */
+	atomic_t items_seq;	/* for delayed items */
 	int nodes;		/* for delayed nodes */
 	wait_queue_head_t wait;
 };
@@ -86,6 +87,7 @@
 				struct btrfs_delayed_root *delayed_root)
 {
 	atomic_set(&delayed_root->items, 0);
+	atomic_set(&delayed_root->items_seq, 0);
 	delayed_root->nodes = 0;
 	spin_lock_init(&delayed_root->lock);
 	init_waitqueue_head(&delayed_root->wait);
@@ -107,6 +109,8 @@
 
 int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root);
+int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, int nr);
 
 void btrfs_balance_delayed_items(struct btrfs_root *root);
 
@@ -115,6 +119,7 @@
 /* Used for evicting the inode. */
 void btrfs_remove_delayed_node(struct inode *inode);
 void btrfs_kill_delayed_inode_items(struct inode *inode);
+int btrfs_commit_inode_delayed_inode(struct inode *inode);
 
 
 int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
@@ -124,6 +129,9 @@
 /* Used for drop dead root */
 void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
 
+/* Used for clean the transaction */
+void btrfs_destroy_delayed_inodes(struct btrfs_root *root);
+
 /* Used for readdir() */
 void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
 			     struct list_head *del_list);
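
The delayed-ref.c hunks that follow introduce ref merging: before a head is processed, btrfs_merge_delayed_refs() walks the refs queued against the same extent and lets merge_ref() fold them together, so an add and a drop of the same reference cancel before ever touching the extent tree (sequenced refs are left untouched). The stand-alone sketch below models only the cancel-out arithmetic; struct demo_ref and merge_pair() are illustrative names, and the real merge_ref() additionally swaps the pair so that the ref with the larger ref_mod survives.

/*
 * Stand-alone sketch of the merge rule: refs on the same extent with
 * the same action have their ref_mods added, opposite actions are
 * subtracted, and a ref whose ref_mod reaches zero disappears.
 */
#include <stdio.h>

struct demo_ref {
	int action;	/* +1 for an add ref, -1 for a drop ref */
	int ref_mod;	/* how many times this ref is applied */
};

/* Fold "next" into "ref"; returns the surviving ref_mod (0 == both gone). */
static int merge_pair(struct demo_ref *ref, const struct demo_ref *next)
{
	if (ref->action == next->action)
		ref->ref_mod += next->ref_mod;
	else
		ref->ref_mod -= next->ref_mod;
	return ref->ref_mod;
}

int main(void)
{
	struct demo_ref add  = { +1, 1 };
	struct demo_ref drop = { -1, 1 };

	if (merge_pair(&add, &drop) == 0)
		printf("add + drop on the same extent cancel out\n");
	return 0;
}
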
diff -ur a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
--- a/fs/btrfs/delayed-ref.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/delayed-ref.c	2014-02-17 11:56:58.000000000 +0100
@@ -23,6 +23,10 @@
 #include "delayed-ref.h"
 #include "transaction.h"
 
+struct kmem_cache *btrfs_delayed_ref_head_cachep;
+struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_extent_op_cachep;
 /*
  * delayed back reference update tracking.  For subvolume trees
  * we queue up extent allocations and backref maintenance for
@@ -36,9 +40,9 @@
  * compare two delayed tree backrefs with same bytenr and type
  */
 static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
-			  struct btrfs_delayed_tree_ref *ref1)
+			  struct btrfs_delayed_tree_ref *ref1, int type)
 {
-	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
+	if (type == BTRFS_TREE_BLOCK_REF_KEY) {
 		if (ref1->root < ref2->root)
 			return -1;
 		if (ref1->root > ref2->root)
@@ -85,7 +89,8 @@
  * type of the delayed backrefs and content of delayed backrefs.
  */
 static int comp_entry(struct btrfs_delayed_ref_node *ref2,
-		      struct btrfs_delayed_ref_node *ref1)
+		      struct btrfs_delayed_ref_node *ref1,
+		      bool compare_seq)
 {
 	if (ref1->bytenr < ref2->bytenr)
 		return -1;
@@ -101,10 +106,18 @@
 		return -1;
 	if (ref1->type > ref2->type)
 		return 1;
+	/* merging of sequenced refs is not allowed */
+	if (compare_seq) {
+		if (ref1->seq < ref2->seq)
+			return -1;
+		if (ref1->seq > ref2->seq)
+			return 1;
+	}
 	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
 	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
 		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
-				      btrfs_delayed_node_to_tree_ref(ref1));
+				      btrfs_delayed_node_to_tree_ref(ref1),
+				      ref1->type);
 	} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
 		   ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
 		return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
@@ -134,7 +147,7 @@
 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
 				 rb_node);
 
-		cmp = comp_entry(entry, ins);
+		cmp = comp_entry(entry, ins, 1);
 		if (cmp < 0)
 			p = &(*p)->rb_left;
 		else if (cmp > 0)
@@ -150,16 +163,22 @@
 
 /*
  * find an head entry based on bytenr. This returns the delayed ref
- * head if it was able to find one, or NULL if nothing was in that spot
+ * head if it was able to find one, or NULL if nothing was in that spot.
+ * If return_bigger is given, the next bigger entry is returned if no exact
+ * match is found.
  */
 static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
 				  u64 bytenr,
-				  struct btrfs_delayed_ref_node **last)
+				  struct btrfs_delayed_ref_node **last,
+				  int return_bigger)
 {
-	struct rb_node *n = root->rb_node;
+	struct rb_node *n;
 	struct btrfs_delayed_ref_node *entry;
-	int cmp;
+	int cmp = 0;
 
+again:
+	n = root->rb_node;
+	entry = NULL;
 	while (n) {
 		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
 		WARN_ON(!entry->in_tree);
@@ -182,6 +201,19 @@
 		else
 			return entry;
 	}
+	if (entry && return_bigger) {
+		if (cmp > 0) {
+			n = rb_next(&entry->rb_node);
+			if (!n)
+				n = rb_first(root);
+			entry = rb_entry(n, struct btrfs_delayed_ref_node,
+					 rb_node);
+			bytenr = entry->bytenr;
+			return_bigger = 0;
+			goto again;
+		}
+		return entry;
+	}
 	return NULL;
 }
 
@@ -209,6 +241,138 @@
 	return 0;
 }
 
+static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
+				    struct btrfs_delayed_ref_root *delayed_refs,
+				    struct btrfs_delayed_ref_node *ref)
+{
+	rb_erase(&ref->rb_node, &delayed_refs->root);
+	ref->in_tree = 0;
+	btrfs_put_delayed_ref(ref);
+	delayed_refs->num_entries--;
+	if (trans->delayed_ref_updates)
+		trans->delayed_ref_updates--;
+}
+
+static int merge_ref(struct btrfs_trans_handle *trans,
+		     struct btrfs_delayed_ref_root *delayed_refs,
+		     struct btrfs_delayed_ref_node *ref, u64 seq)
+{
+	struct rb_node *node;
+	int merged = 0;
+	int mod = 0;
+	int done = 0;
+
+	node = rb_prev(&ref->rb_node);
+	while (node) {
+		struct btrfs_delayed_ref_node *next;
+
+		next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		node = rb_prev(node);
+		if (next->bytenr != ref->bytenr)
+			break;
+		if (seq && next->seq >= seq)
+			break;
+		if (comp_entry(ref, next, 0))
+			continue;
+
+		if (ref->action == next->action) {
+			mod = next->ref_mod;
+		} else {
+			if (ref->ref_mod < next->ref_mod) {
+				struct btrfs_delayed_ref_node *tmp;
+
+				tmp = ref;
+				ref = next;
+				next = tmp;
+				done = 1;
+			}
+			mod = -next->ref_mod;
+		}
+
+		merged++;
+		drop_delayed_ref(trans, delayed_refs, next);
+		ref->ref_mod += mod;
+		if (ref->ref_mod == 0) {
+			drop_delayed_ref(trans, delayed_refs, ref);
+			break;
+		} else {
+			/*
+			 * You can't have multiples of the same ref on a tree
+			 * block.
+			 */
+			WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
+				ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
+		}
+
+		if (done)
+			break;
+		node = rb_prev(&ref->rb_node);
+	}
+
+	return merged;
+}
+
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_delayed_ref_root *delayed_refs,
+			      struct btrfs_delayed_ref_head *head)
+{
+	struct rb_node *node;
+	u64 seq = 0;
+
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (!list_empty(&fs_info->tree_mod_seq_list)) {
+		struct seq_list *elem;
+
+		elem = list_first_entry(&fs_info->tree_mod_seq_list,
+					struct seq_list, list);
+		seq = elem->seq;
+	}
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+
+	node = rb_prev(&head->node.rb_node);
+	while (node) {
+		struct btrfs_delayed_ref_node *ref;
+
+		ref = rb_entry(node, struct btrfs_delayed_ref_node,
+			       rb_node);
+		if (ref->bytenr != head->node.bytenr)
+			break;
+
+		/* We can't merge refs that are outside of our seq count */
+		if (seq && ref->seq >= seq)
+			break;
+		if (merge_ref(trans, delayed_refs, ref, seq))
+			node = rb_prev(&head->node.rb_node);
+		else
+			node = rb_prev(node);
+	}
+}
+
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+			    struct btrfs_delayed_ref_root *delayed_refs,
+			    u64 seq)
+{
+	struct seq_list *elem;
+	int ret = 0;
+
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	if (!list_empty(&fs_info->tree_mod_seq_list)) {
+		elem = list_first_entry(&fs_info->tree_mod_seq_list,
+					struct seq_list, list);
+		if (seq >= elem->seq) {
+			pr_debug("holding back delayed_ref %#x.%x, lowest is %#x.%x (%p)\n",
+				 (u32)(seq >> 32), (u32)seq,
+				 (u32)(elem->seq >> 32), (u32)elem->seq,
+				 delayed_refs);
+			ret = 1;
+		}
+	}
+
+	spin_unlock(&fs_info->tree_mod_seq_lock);
+	return ret;
+}
+
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 			   struct list_head *cluster, u64 start)
 {
@@ -223,20 +387,8 @@
 		node = rb_first(&delayed_refs->root);
 	} else {
 		ref = NULL;
-		find_ref_head(&delayed_refs->root, start, &ref);
+		find_ref_head(&delayed_refs->root, start + 1, &ref, 1);
 		if (ref) {
-			struct btrfs_delayed_ref_node *tmp;
-
-			node = rb_prev(&ref->rb_node);
-			while (node) {
-				tmp = rb_entry(node,
-					       struct btrfs_delayed_ref_node,
-					       rb_node);
-				if (tmp->bytenr < start)
-					break;
-				ref = tmp;
-				node = rb_prev(&ref->rb_node);
-			}
 			node = &ref->rb_node;
 		} else
 			node = rb_first(&delayed_refs->root);
@@ -280,6 +432,14 @@
 	return 1;
 }
 
+void btrfs_release_ref_cluster(struct list_head *cluster)
+{
+	struct list_head *pos, *q;
+
+	list_for_each_safe(pos, q, cluster)
+		list_del_init(pos);
+}
+
 /*
  * helper function to update an extent delayed ref in the
  * rbtree.  existing and update must both have the same
@@ -302,18 +462,11 @@
 		 * every changing the extent allocation tree.
 		 */
 		existing->ref_mod--;
-		if (existing->ref_mod == 0) {
-			rb_erase(&existing->rb_node,
-				 &delayed_refs->root);
-			existing->in_tree = 0;
-			btrfs_put_delayed_ref(existing);
-			delayed_refs->num_entries--;
-			if (trans->delayed_ref_updates)
-				trans->delayed_ref_updates--;
-		} else {
+		if (existing->ref_mod == 0)
+			drop_delayed_ref(trans, delayed_refs, existing);
+		else
 			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
 				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
-		}
 	} else {
 		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
 			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
@@ -376,7 +529,7 @@
 					ref->extent_op->flags_to_set;
 				existing_ref->extent_op->update_flags = 1;
 			}
-			kfree(ref->extent_op);
+			btrfs_free_delayed_extent_op(ref->extent_op);
 		}
 	}
 	/*
@@ -390,7 +543,8 @@
  * this does all the dirty work in terms of maintaining the correct
  * overall modification count.
  */
-static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
+static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
+					struct btrfs_trans_handle *trans,
 					struct btrfs_delayed_ref_node *ref,
 					u64 bytenr, u64 num_bytes,
 					int action, int is_data)
@@ -437,6 +591,7 @@
 	ref->action  = 0;
 	ref->is_head = 1;
 	ref->in_tree = 1;
+	ref->seq = 0;
 
 	head_ref = btrfs_delayed_node_to_head(ref);
 	head_ref->must_insert_reserved = must_insert_reserved;
@@ -445,7 +600,7 @@
 	INIT_LIST_HEAD(&head_ref->cluster);
 	mutex_init(&head_ref->mutex);
 
-	trace_btrfs_delayed_ref_head(ref, head_ref, action);
+	trace_add_delayed_ref_head(ref, head_ref, action);
 
 	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
 
@@ -455,27 +610,29 @@
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
-		kfree(ref);
+		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 	} else {
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
 	}
-	return 0;
 }
 
 /*
  * helper to insert a delayed tree ref into the rbtree.
  */
-static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+					 struct btrfs_trans_handle *trans,
 					 struct btrfs_delayed_ref_node *ref,
 					 u64 bytenr, u64 num_bytes, u64 parent,
-					 u64 ref_root, int level, int action)
+					 u64 ref_root, int level, int action,
+					 int for_cow)
 {
 	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_tree_ref *full_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
+	u64 seq = 0;
 
 	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
@@ -491,17 +648,20 @@
 	ref->is_head = 0;
 	ref->in_tree = 1;
 
+	if (need_ref_seq(for_cow, ref_root))
+		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+	ref->seq = seq;
+
 	full_ref = btrfs_delayed_node_to_tree_ref(ref);
-	if (parent) {
-		full_ref->parent = parent;
+	full_ref->parent = parent;
+	full_ref->root = ref_root;
+	if (parent)
 		ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
-	} else {
-		full_ref->root = ref_root;
+	else
 		ref->type = BTRFS_TREE_BLOCK_REF_KEY;
-	}
 	full_ref->level = level;
 
-	trace_btrfs_delayed_tree_ref(ref, full_ref, action);
+	trace_add_delayed_tree_ref(ref, full_ref, action);
 
 	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
 
@@ -511,26 +671,27 @@
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
-		kfree(ref);
+		kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
 	} else {
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
 	}
-	return 0;
 }
 
 /*
  * helper to insert a delayed data ref into the rbtree.
  */
-static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
+static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+					 struct btrfs_trans_handle *trans,
 					 struct btrfs_delayed_ref_node *ref,
 					 u64 bytenr, u64 num_bytes, u64 parent,
 					 u64 ref_root, u64 owner, u64 offset,
-					 int action)
+					 int action, int for_cow)
 {
 	struct btrfs_delayed_ref_node *existing;
 	struct btrfs_delayed_data_ref *full_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
+	u64 seq = 0;
 
 	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		action = BTRFS_ADD_DELAYED_REF;
@@ -546,18 +707,22 @@
 	ref->is_head = 0;
 	ref->in_tree = 1;
 
+	if (need_ref_seq(for_cow, ref_root))
+		seq = btrfs_get_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+	ref->seq = seq;
+
 	full_ref = btrfs_delayed_node_to_data_ref(ref);
-	if (parent) {
-		full_ref->parent = parent;
+	full_ref->parent = parent;
+	full_ref->root = ref_root;
+	if (parent)
 		ref->type = BTRFS_SHARED_DATA_REF_KEY;
-	} else {
-		full_ref->root = ref_root;
+	else
 		ref->type = BTRFS_EXTENT_DATA_REF_KEY;
-	}
+
 	full_ref->objectid = owner;
 	full_ref->offset = offset;
 
-	trace_btrfs_delayed_data_ref(ref, full_ref, action);
+	trace_add_delayed_data_ref(ref, full_ref, action);
 
 	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
 
@@ -567,12 +732,11 @@
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
-		kfree(ref);
+		kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
 	} else {
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
 	}
-	return 0;
 }
 
 /*
@@ -580,24 +744,25 @@
  * to make sure the delayed ref is eventually processed before this
  * transaction commits.
  */
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root,  int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op)
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int for_cow)
 {
 	struct btrfs_delayed_tree_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
-	int ret;
 
 	BUG_ON(extent_op && extent_op->is_data);
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;
 
-	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
 	if (!head_ref) {
-		kfree(ref);
+		kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
 		return -ENOMEM;
 	}
 
@@ -610,39 +775,42 @@
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-				   action, 0);
-	BUG_ON(ret);
-
-	ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
-				   parent, ref_root, level, action);
-	BUG_ON(ret);
+	add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+				   num_bytes, action, 0);
+
+	add_delayed_tree_ref(fs_info, trans, &ref->node, bytenr,
+				   num_bytes, parent, ref_root, level, action,
+				   for_cow);
 	spin_unlock(&delayed_refs->lock);
+	if (need_ref_seq(for_cow, ref_root))
+		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
+
 	return 0;
 }
 
 /*
  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
  */
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
 			       u64 owner, u64 offset, int action,
-			       struct btrfs_delayed_extent_op *extent_op)
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int for_cow)
 {
 	struct btrfs_delayed_data_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
-	int ret;
 
 	BUG_ON(extent_op && !extent_op->is_data);
-	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;
 
-	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
 	if (!head_ref) {
-		kfree(ref);
+		kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
 		return -ENOMEM;
 	}
 
@@ -655,26 +823,28 @@
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
-				   action, 1);
-	BUG_ON(ret);
-
-	ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
-				   parent, ref_root, owner, offset, action);
-	BUG_ON(ret);
+	add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
+				   num_bytes, action, 1);
+
+	add_delayed_data_ref(fs_info, trans, &ref->node, bytenr,
+				   num_bytes, parent, ref_root, owner, offset,
+				   action, for_cow);
 	spin_unlock(&delayed_refs->lock);
+	if (need_ref_seq(for_cow, ref_root))
+		btrfs_qgroup_record_ref(trans, &ref->node, extent_op);
+
 	return 0;
 }
 
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
 				struct btrfs_delayed_extent_op *extent_op)
 {
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
-	int ret;
 
-	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
 	if (!head_ref)
 		return -ENOMEM;
 
@@ -683,10 +853,9 @@
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 
-	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+	add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr,
 				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
 				   extent_op->is_data);
-	BUG_ON(ret);
 
 	spin_unlock(&delayed_refs->lock);
 	return 0;
@@ -704,8 +873,56 @@
 	struct btrfs_delayed_ref_root *delayed_refs;
 
 	delayed_refs = &trans->transaction->delayed_refs;
-	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
+	ref = find_ref_head(&delayed_refs->root, bytenr, NULL, 0);
 	if (ref)
 		return btrfs_delayed_node_to_head(ref);
 	return NULL;
 }
+
+void btrfs_delayed_ref_exit(void)
+{
+	if (btrfs_delayed_ref_head_cachep)
+		kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
+	if (btrfs_delayed_tree_ref_cachep)
+		kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
+	if (btrfs_delayed_data_ref_cachep)
+		kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
+	if (btrfs_delayed_extent_op_cachep)
+		kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
+}
+
+int btrfs_delayed_ref_init(void)
+{
+	btrfs_delayed_ref_head_cachep = kmem_cache_create(
+				"btrfs_delayed_ref_head",
+				sizeof(struct btrfs_delayed_ref_head), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_delayed_ref_head_cachep)
+		goto fail;
+
+	btrfs_delayed_tree_ref_cachep = kmem_cache_create(
+				"btrfs_delayed_tree_ref",
+				sizeof(struct btrfs_delayed_tree_ref), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_delayed_tree_ref_cachep)
+		goto fail;
+
+	btrfs_delayed_data_ref_cachep = kmem_cache_create(
+				"btrfs_delayed_data_ref",
+				sizeof(struct btrfs_delayed_data_ref), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_delayed_data_ref_cachep)
+		goto fail;
+
+	btrfs_delayed_extent_op_cachep = kmem_cache_create(
+				"btrfs_delayed_extent_op",
+				sizeof(struct btrfs_delayed_extent_op), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
+	if (!btrfs_delayed_extent_op_cachep)
+		goto fail;
+
+	return 0;
+fail:
+	btrfs_delayed_ref_exit();
+	return -ENOMEM;
+}
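
The caches created by btrfs_delayed_ref_init() are global and live for the lifetime of the module, so the init/exit pair above is meant to be called once from the filesystem's module entry points. A minimal sketch of that consumer side, assuming the usual init_btrfs_fs()/exit_btrfs_fs() entry points (the names and the surrounding registration steps are assumptions, not part of this patch):

#include <linux/init.h>
#include <linux/module.h>
#include "delayed-ref.h"

static int __init init_btrfs_fs(void)
{
	int err;

	/* creates the head/tree-ref/data-ref/extent-op slab caches */
	err = btrfs_delayed_ref_init();
	if (err)
		return err;

	/* ... register_filesystem() and the remaining cache setup ... */
	return 0;
}

static void __exit exit_btrfs_fs(void)
{
	/* destroys only the caches that were actually created */
	btrfs_delayed_ref_exit();
}

module_init(init_btrfs_fs);
module_exit(exit_btrfs_fs);

On a failed init, btrfs_delayed_ref_exit() is also safe to call directly, since it checks each cache pointer before destroying it.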
diff -ur a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
--- a/fs/btrfs/delayed-ref.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/delayed-ref.h	2014-02-17 11:56:58.000000000 +0100
@@ -18,7 +18,7 @@
 #ifndef __DELAYED_REF__
 #define __DELAYED_REF__
 
-/* these are the possible values of struct btrfs_delayed_ref->action */
+/* these are the possible values of struct btrfs_delayed_ref_node->action */
 #define BTRFS_ADD_DELAYED_REF    1 /* add one backref to the tree */
 #define BTRFS_DROP_DELAYED_REF   2 /* delete one backref from the tree */
 #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
@@ -33,6 +33,9 @@
 	/* the size of the extent */
 	u64 num_bytes;
 
+	/* seq number to keep track of insertion order */
+	u64 seq;
+
 	/* ref count on this data structure */
 	atomic_t refs;
 
@@ -57,6 +60,7 @@
 struct btrfs_delayed_extent_op {
 	struct btrfs_disk_key key;
 	u64 flags_to_set;
+	int level;
 	unsigned int update_key:1;
 	unsigned int update_flags:1;
 	unsigned int is_data:1;
@@ -98,19 +102,15 @@
 
 struct btrfs_delayed_tree_ref {
 	struct btrfs_delayed_ref_node node;
-	union {
-		u64 root;
-		u64 parent;
-	};
+	u64 root;
+	u64 parent;
 	int level;
 };
 
 struct btrfs_delayed_data_ref {
 	struct btrfs_delayed_ref_node node;
-	union {
-		u64 root;
-		u64 parent;
-	};
+	u64 root;
+	u64 parent;
 	u64 objectid;
 	u64 offset;
 };
@@ -133,6 +133,15 @@
 	unsigned long num_heads_ready;
 
 	/*
+	 * bumped when someone is making progress on the delayed
+	 * refs, so that other procs know they are just adding to
+	 * contention instead of helping
+	 */
+	atomic_t procs_running_refs;
+	atomic_t ref_seq;
+	wait_queue_head_t wait;
+
+	/*
 	 * set when the tree is flushing before a transaction commit,
 	 * used by the throttling code to decide if new updates need
 	 * to be run right away
@@ -142,34 +151,108 @@
 	u64 run_delayed_start;
 };
 
+extern struct kmem_cache *btrfs_delayed_ref_head_cachep;
+extern struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_data_ref_cachep;
+extern struct kmem_cache *btrfs_delayed_extent_op_cachep;
+
+int btrfs_delayed_ref_init(void);
+void btrfs_delayed_ref_exit(void);
+
+static inline struct btrfs_delayed_extent_op *
+btrfs_alloc_delayed_extent_op(void)
+{
+	return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS);
+}
+
+static inline void
+btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op)
+{
+	if (op)
+		kmem_cache_free(btrfs_delayed_extent_op_cachep, op);
+}
+
 static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
 {
 	WARN_ON(atomic_read(&ref->refs) == 0);
 	if (atomic_dec_and_test(&ref->refs)) {
 		WARN_ON(ref->in_tree);
-		kfree(ref);
+		switch (ref->type) {
+		case BTRFS_TREE_BLOCK_REF_KEY:
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY:
+		case BTRFS_SHARED_DATA_REF_KEY:
+			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
+			break;
+		case 0:
+			kmem_cache_free(btrfs_delayed_ref_head_cachep, ref);
+			break;
+		default:
+			BUG();
+		}
 	}
 }
 
-int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
+			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes, u64 parent,
 			       u64 ref_root, int level, int action,
-			       struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int for_cow);
+int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info,
+			       struct btrfs_trans_handle *trans,
 			       u64 bytenr, u64 num_bytes,
 			       u64 parent, u64 ref_root,
 			       u64 owner, u64 offset, int action,
-			       struct btrfs_delayed_extent_op *extent_op);
-int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int for_cow);
+int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
+				struct btrfs_trans_handle *trans,
 				u64 bytenr, u64 num_bytes,
 				struct btrfs_delayed_extent_op *extent_op);
+void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_delayed_ref_root *delayed_refs,
+			      struct btrfs_delayed_ref_head *head);
 
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 			   struct btrfs_delayed_ref_head *head);
+static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
+{
+	mutex_unlock(&head->mutex);
+}
+
 int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 			   struct list_head *cluster, u64 search_start);
+void btrfs_release_ref_cluster(struct list_head *cluster);
+
+int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info,
+			    struct btrfs_delayed_ref_root *delayed_refs,
+			    u64 seq);
+
+/*
+ * delayed refs with a ref_seq > 0 must be held back during backref walking.
+ * this only applies to items in one of the fs-trees. for_cow items never need
+ * to be held back, so they won't get a ref_seq number.
+ */
+static inline int need_ref_seq(int for_cow, u64 rootid)
+{
+	if (for_cow)
+		return 0;
+
+	if (rootid == BTRFS_FS_TREE_OBJECTID)
+		return 1;
+
+	if ((s64)rootid >= (s64)BTRFS_FIRST_FREE_OBJECTID)
+		return 1;
+
+	return 0;
+}
+
 /*
  * a node might live in a head or a regular ref, this lets you
  * test for the proper type to use.
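
Together with the header changes above, callers that used to kmalloc()/kfree() a struct btrfs_delayed_extent_op now go through btrfs_alloc_delayed_extent_op()/btrfs_free_delayed_extent_op(). A hedged sketch of such a caller (the function name is illustrative, not taken from this patch):

/* sketch only: queue a flag update for an extent via a delayed extent op */
static int example_set_extent_flags(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    u64 bytenr, u64 num_bytes, u64 flags)
{
	struct btrfs_delayed_extent_op *extent_op;
	int ret;

	extent_op = btrfs_alloc_delayed_extent_op();
	if (!extent_op)
		return -ENOMEM;

	extent_op->flags_to_set = flags;
	extent_op->update_flags = 1;
	extent_op->update_key = 0;
	extent_op->is_data = 0;
	extent_op->level = 0;

	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
					  num_bytes, extent_op);
	if (ret)	/* on failure the op was not queued, free it here */
		btrfs_free_delayed_extent_op(extent_op);
	return ret;
}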
Only in b/fs/btrfs: dev-replace.c
Only in b/fs/btrfs: dev-replace.h
diff -ur a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
--- a/fs/btrfs/dir-item.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/dir-item.c	2014-02-17 11:56:58.000000000 +0100
@@ -21,6 +21,15 @@
 #include "hash.h"
 #include "transaction.h"
 
+static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+			      struct btrfs_path *path,
+			      const char *name, int name_len);
+
+#ifdef MY_ABC_HERE
+extern unsigned char SYNOBtrfsGlobalBuf[UNICODE_UTF8_BUFSIZE];
+extern spinlock_t SYNOBtrfsGlobalLock;  /* init at btrfs_fill_super() */
+#endif
+
 /*
  * insert a name into a directory, doing overflow properly if there is a hash
  * collision.  data_size indicates how big the item inserted should be.  On
@@ -49,9 +58,8 @@
 		di = btrfs_match_dir_item_name(root, path, name, name_len);
 		if (di)
 			return ERR_PTR(-EEXIST);
-		ret = btrfs_extend_item(trans, root, path, data_size);
-	}
-	if (ret < 0)
+		btrfs_extend_item(root, path, data_size);
+	} else if (ret < 0)
 		return ERR_PTR(ret);
 	WARN_ON(ret > 0);
 	leaf = path->nodes[0];
@@ -110,12 +118,67 @@
 	return ret;
 }
 
+#ifdef MY_ABC_HERE
+static int btrfs_insert_dir_item_caseless(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+						const char *name, int name_len, struct inode *dir,
+						struct btrfs_disk_key *disk_key, u8 type)
+{
+	int ret;
+	int upperlen;
+	unsigned long name_ptr;
+	struct btrfs_path *path;
+	struct btrfs_dir_item *dir_item;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->leave_spinning = 1;
+
+	key.objectid = btrfs_ino(dir);
+	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_CASELESS_KEY);
+
+	spin_lock(&SYNOBtrfsGlobalLock);
+	upperlen = SYNOUnicodeUTF8toUpper(SYNOBtrfsGlobalBuf, name, UNICODE_UTF8_BUFSIZE-1, name_len, NULL);
+	key.offset = btrfs_name_hash(SYNOBtrfsGlobalBuf, upperlen);
+	spin_unlock(&SYNOBtrfsGlobalLock);
+
+	dir_item = insert_with_overflow(trans, root, path, &key, (sizeof(*dir_item) + name_len),
+					name, name_len);
+	if (IS_ERR(dir_item)) {
+		ret = PTR_ERR(dir_item);
+		if (ret == -EEXIST) {
+			ret = 0;
+		}
+		goto out_release;
+	}
+
+	leaf = path->nodes[0];
+	btrfs_set_dir_item_key(leaf, dir_item, disk_key);
+	btrfs_set_dir_type(leaf, dir_item, type);
+	btrfs_set_dir_data_len(leaf, dir_item, 0);
+	btrfs_set_dir_name_len(leaf, dir_item, name_len);
+	btrfs_set_dir_transid(leaf, dir_item, trans->transid);
+	name_ptr = (unsigned long)(dir_item + 1);
+
+	write_extent_buffer(leaf, name, name_ptr, name_len);
+	btrfs_mark_buffer_dirty(leaf);
+	ret = 0;
+
+out_release:
+	btrfs_free_path(path);
+	return ret;
+}
+#endif
+
 /*
  * insert a directory item in the tree, doing all the magic for
  * both indexes. 'dir' indicates which objectid to insert it into,
  * 'location' is the key to stuff into the directory item, 'type' is the
  * type of the inode we're pointing to, and 'index' is the sequence number
  * to use for the second index (if one is created).
+ * Will return 0 or -ENOMEM
  */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
 			  *root, const char *name, int name_len,
@@ -180,7 +243,11 @@
 		return ret;
 	if (ret2)
 		return ret2;
+#ifdef MY_ABC_HERE
+	return btrfs_insert_dir_item_caseless(trans, root, name, name_len, dir, &disk_key, type);
+#else
 	return 0;
+#endif
 }
 
 /*
@@ -195,14 +262,29 @@
 					     int mod)
 {
 	int ret;
+#ifdef MY_ABC_HERE
+	int upperlen = 0;
+#endif
 	struct btrfs_key key;
 	int ins_len = mod < 0 ? -1 : 0;
 	int cow = mod != 0;
 
 	key.objectid = dir;
+#ifdef MY_ABC_HERE
+	if (path->caseless_key) {
+		spin_lock(&SYNOBtrfsGlobalLock);
+		btrfs_set_key_type(&key, BTRFS_DIR_ITEM_CASELESS_KEY);
+		upperlen = SYNOUnicodeUTF8toUpper(SYNOBtrfsGlobalBuf, name, UNICODE_UTF8_BUFSIZE-1, name_len, NULL);
+		key.offset = btrfs_name_hash(SYNOBtrfsGlobalBuf, upperlen);
+		spin_unlock(&SYNOBtrfsGlobalLock);
+	} else {
+#endif
 	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
 
 	key.offset = btrfs_name_hash(name, name_len);
+#ifdef MY_ABC_HERE
+	}
+#endif
 
 	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
 	if (ret < 0)
@@ -213,6 +295,65 @@
 	return btrfs_match_dir_item_name(root, path, name, name_len);
 }
 
+int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
+				   const char *name, int name_len)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_dir_item *di;
+	int data_size;
+	struct extent_buffer *leaf;
+	int slot;
+	struct btrfs_path *path;
+
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = dir;
+	btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
+	key.offset = btrfs_name_hash(name, name_len);
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+
+	/* return back any errors */
+	if (ret < 0)
+		goto out;
+
+	/* nothing found, we're safe */
+	if (ret > 0) {
+		ret = 0;
+		goto out;
+	}
+
+	/* we found an item, look for our name in the item */
+	di = btrfs_match_dir_item_name(root, path, name, name_len);
+	if (di) {
+		/* our exact name was found */
+		ret = -EEXIST;
+		goto out;
+	}
+
+	/*
+	 * see if there is room in the item to insert this
+	 * name
+	 */
+	data_size = sizeof(*di) + name_len + sizeof(struct btrfs_item);
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	if (data_size + btrfs_item_size_nr(leaf, slot) +
+	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root)) {
+		ret = -EOVERFLOW;
+	} else {
+		/* plenty of insertion room */
+		ret = 0;
+	}
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
 /*
  * lookup a directory item based on index.  'dir' is the objectid
  * we're searching in, and 'mod' tells us if you plan on deleting the
@@ -320,7 +461,7 @@
  * this walks through all the entries in a dir item and finds one
  * for a specific name.
  */
-struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
+static struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
 			      struct btrfs_path *path,
 			      const char *name, int name_len)
 {
@@ -343,9 +484,19 @@
 			btrfs_dir_data_len(leaf, dir_item);
 		name_ptr = (unsigned long)(dir_item + 1);
 
+#ifdef MY_ABC_HERE
+		if (path->caseless_name) {
+			if (0 == memcmp_caseless_extent_buffer(leaf, name, name_ptr, name_len)) {
+				return dir_item;
+			}
+		} else {
+#endif
 		if (btrfs_dir_name_len(leaf, dir_item) == name_len &&
 		    memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
 			return dir_item;
+#ifdef MY_ABC_HERE
+		}
+#endif
 
 		cur += this_len;
 		dir_item = (struct btrfs_dir_item *)((char *)dir_item +
@@ -383,8 +534,7 @@
 		start = btrfs_item_ptr_offset(leaf, path->slots[0]);
 		memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
 			item_len - (ptr + sub_item_len - start));
-		ret = btrfs_truncate_item(trans, root, path,
-					  item_len - sub_item_len, 1);
+		btrfs_truncate_item(root, path, item_len - sub_item_len, 1);
 	}
 	return ret;
 }
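
The new btrfs_check_dir_item_collision() lets callers probe a directory before attempting an insert: it returns -EEXIST when the exact name is already present, -EOVERFLOW when the item for this name hash has no room left, and 0 when the insert can proceed. A hedged sketch of a caller doing such a pre-flight check (the function and variable names are illustrative only):

/* sketch only: pre-check the target directory before inserting a name */
static int example_precheck_dir(struct btrfs_root *root, struct inode *dir,
				struct dentry *dentry)
{
	int ret;

	ret = btrfs_check_dir_item_collision(root, btrfs_ino(dir),
					     dentry->d_name.name,
					     dentry->d_name.len);
	if (ret == -EEXIST)
		pr_debug("btrfs: name already exists in directory\n");
	else if (ret == -EOVERFLOW)
		pr_debug("btrfs: no room in the dir item for this name hash\n");

	return ret;	/* 0 means the later insert should be safe */
}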
diff -ur a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
--- a/fs/btrfs/disk-io.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/disk-io.c	2014-02-17 11:56:58.000000000 +0100
@@ -30,6 +30,8 @@
 #include <linux/slab.h>
 #include <linux/migrate.h>
 #include <linux/ratelimit.h>
+#include <linux/uuid.h>
+#include <linux/semaphore.h>
 #include <asm/unaligned.h>
 #include "compat.h"
 #include "ctree.h"
@@ -43,24 +45,34 @@
 #include "tree-log.h"
 #include "free-space-cache.h"
 #include "inode-map.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "raid56.h"
+
+#ifdef CONFIG_X86
+#include <asm/cpufeature.h>
+#endif
 
 static struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
-static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 				    int read_only);
-static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
-static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+					     struct btrfs_root *root);
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root);
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
 				      struct btrfs_root *root);
-static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
-static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
+static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t);
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
 static int btrfs_destroy_marked_extents(struct btrfs_root *root,
 					struct extent_io_tree *dirty_pages,
 					int mark);
 static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
 				       struct extent_io_tree *pinned_extents);
 static int btrfs_cleanup_transaction(struct btrfs_root *root);
+static void btrfs_error_commit_super(struct btrfs_root *root);
 
 /*
  * end_io_wq structs are used to do processing in task context when an IO is
@@ -98,6 +110,7 @@
 	 */
 	u64 bio_offset;
 	struct btrfs_work work;
+	int error;
 };
 
 /*
@@ -140,10 +153,11 @@
 	{ .id = BTRFS_DEV_TREE_OBJECTID,	.name_stem = "dev"	},
 	{ .id = BTRFS_FS_TREE_OBJECTID,		.name_stem = "fs"	},
 	{ .id = BTRFS_CSUM_TREE_OBJECTID,	.name_stem = "csum"	},
-	{ .id = BTRFS_ORPHAN_OBJECTID,		.name_stem = "orphan"	},
+	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	.name_stem = "quota"	},
 	{ .id = BTRFS_TREE_LOG_OBJECTID,	.name_stem = "log"	},
 	{ .id = BTRFS_TREE_RELOC_OBJECTID,	.name_stem = "treloc"	},
 	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	.name_stem = "dreloc"	},
+	{ .id = BTRFS_UUID_TREE_OBJECTID,	.name_stem = "uuid"	},
 	{ .id = 0,				.name_stem = "tree"	},
 };
 
@@ -213,33 +227,23 @@
 	em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em);
+	ret = add_extent_mapping(em_tree, em, 0);
 	if (ret == -EEXIST) {
-		u64 failed_start = em->start;
-		u64 failed_len = em->len;
-
 		free_extent_map(em);
 		em = lookup_extent_mapping(em_tree, start, len);
-		if (em) {
-			ret = 0;
-		} else {
-			em = lookup_extent_mapping(em_tree, failed_start,
-						   failed_len);
-			ret = -EIO;
-		}
+		if (!em)
+			em = ERR_PTR(-EIO);
 	} else if (ret) {
 		free_extent_map(em);
-		em = NULL;
+		em = ERR_PTR(ret);
 	}
 	write_unlock(&em_tree->lock);
 
-	if (ret)
-		em = ERR_PTR(ret);
 out:
 	return em;
 }
 
-u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
+u32 btrfs_csum_data(char *data, u32 seed, size_t len)
 {
 	return crc32c(seed, data, len);
 }
@@ -275,7 +279,7 @@
 		if (err)
 			return 1;
 		cur_len = min(len, map_len - (offset - map_start));
-		crc = btrfs_csum_data(root, kaddr + offset - map_start,
+		crc = btrfs_csum_data(kaddr + offset - map_start,
 				      crc, cur_len);
 		len -= cur_len;
 		offset += cur_len;
@@ -300,9 +304,8 @@
 			printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
 				       "failed on %llu wanted %X found %X "
 				       "level %d\n",
-				       root->fs_info->sb->s_id,
-				       (unsigned long long)buf->start, val, found,
-				       btrfs_header_level(buf));
+				       root->fs_info->sb->s_id, buf->start,
+				       val, found, btrfs_header_level(buf));
 			if (result != (char *)&inline_result)
 				kfree(result);
 			return 1;
@@ -322,7 +325,8 @@
  * in the wrong place.
  */
 static int verify_parent_transid(struct extent_io_tree *io_tree,
-				 struct extent_buffer *eb, u64 parent_transid)
+				 struct extent_buffer *eb, u64 parent_transid,
+				 int atomic)
 {
 	struct extent_state *cached_state = NULL;
 	int ret;
@@ -330,20 +334,21 @@
 	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
 		return 0;
 
+	if (atomic)
+		return -EAGAIN;
+
 	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
-			 0, &cached_state, GFP_NOFS);
-	if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
+			 0, &cached_state);
+	if (extent_buffer_uptodate(eb) &&
 	    btrfs_header_generation(eb) == parent_transid) {
 		ret = 0;
 		goto out;
 	}
 	printk_ratelimited("parent transid verify failed on %llu wanted %llu "
 		       "found %llu\n",
-		       (unsigned long long)eb->start,
-		       (unsigned long long)parent_transid,
-		       (unsigned long long)btrfs_header_generation(eb));
+		       eb->start, parent_transid, btrfs_header_generation(eb));
 	ret = 1;
-	clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
+	clear_extent_buffer_uptodate(eb);
 out:
 	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
 			     &cached_state, GFP_NOFS);
@@ -351,6 +356,49 @@
 }
 
 /*
+ * Return 0 if the superblock checksum type matches the checksum value of that
+ * algorithm. Pass the raw disk superblock data.
+ */
+static int btrfs_check_super_csum(char *raw_disk_sb)
+{
+	struct btrfs_super_block *disk_sb =
+		(struct btrfs_super_block *)raw_disk_sb;
+	u16 csum_type = btrfs_super_csum_type(disk_sb);
+	int ret = 0;
+
+	if (csum_type == BTRFS_CSUM_TYPE_CRC32) {
+		u32 crc = ~(u32)0;
+		const int csum_size = sizeof(crc);
+		char result[csum_size];
+
+		/*
+		 * The super_block structure does not span the whole
+		 * BTRFS_SUPER_INFO_SIZE range, we expect that the unused space
+		 * is filled with zeros and is included in the checksum.
+		 */
+		crc = btrfs_csum_data(raw_disk_sb + BTRFS_CSUM_SIZE,
+				crc, BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE);
+		btrfs_csum_final(crc, result);
+
+		if (memcmp(raw_disk_sb, result, csum_size))
+			ret = 1;
+
+		if (ret && btrfs_super_generation(disk_sb) < 10) {
+			printk(KERN_WARNING "btrfs: super block crcs don't match, older mkfs detected\n");
+			ret = 0;
+		}
+	}
+
+	if (csum_type >= ARRAY_SIZE(btrfs_csum_sizes)) {
+		printk(KERN_ERR "btrfs: unsupported checksum algorithm %u\n",
+				csum_type);
+		ret = 1;
+	}
+
+	return ret;
+}
+
+/*
  * helper to read a given tree block, doing retries as required when
  * the checksums don't match and we have alternate mirrors to try.
  */
@@ -359,9 +407,11 @@
 					  u64 start, u64 parent_transid)
 {
 	struct extent_io_tree *io_tree;
+	int failed = 0;
 	int ret;
 	int num_copies = 0;
 	int mirror_num = 0;
+	int failed_mirror = 0;
 
 	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
@@ -369,9 +419,13 @@
 		ret = read_extent_buffer_pages(io_tree, eb, start,
 					       WAIT_COMPLETE,
 					       btree_get_extent, mirror_num);
-		if (!ret &&
-		    !verify_parent_transid(io_tree, eb, parent_transid))
-			return ret;
+		if (!ret) {
+			if (!verify_parent_transid(io_tree, eb,
+						   parent_transid, 0))
+				break;
+			else
+				ret = -EIO;
+		}
 
 		/*
 		 * This buffer's crc is fine, but its contents are corrupted, so
@@ -379,18 +433,30 @@
 		 * any less wrong.
 		 */
 		if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
-			return ret;
+			break;
 
-		num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
+		num_copies = btrfs_num_copies(root->fs_info,
 					      eb->start, eb->len);
 		if (num_copies == 1)
-			return ret;
+			break;
+
+		if (!failed_mirror) {
+			failed = 1;
+			failed_mirror = eb->read_mirror;
+		}
 
 		mirror_num++;
+		if (mirror_num == failed_mirror)
+			mirror_num++;
+
 		if (mirror_num > num_copies)
-			return ret;
+			break;
 	}
-	return -EIO;
+
+	if (failed && !ret && failed_mirror)
+		repair_eb_io_failure(root, eb, failed_mirror);
+
+	return ret;
 }
 
 /*
@@ -401,52 +467,25 @@
 static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
 {
 	struct extent_io_tree *tree;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 start = page_offset(page);
 	u64 found_start;
-	unsigned long len;
 	struct extent_buffer *eb;
-	int ret;
 
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-	if (page->private == EXTENT_PAGE_PRIVATE) {
-		WARN_ON(1);
-		goto out;
-	}
-	if (!page->private) {
-		WARN_ON(1);
-		goto out;
-	}
-	len = page->private >> 2;
-	WARN_ON(len == 0);
-
-	eb = alloc_extent_buffer(tree, start, len, page);
-	if (eb == NULL) {
-		WARN_ON(1);
-		goto out;
-	}
-	ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
-					     btrfs_header_generation(eb));
-	BUG_ON(ret);
-	WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
-
+	eb = (struct extent_buffer *)page->private;
+	if (page != eb->pages[0])
+		return 0;
 	found_start = btrfs_header_bytenr(eb);
 	if (found_start != start) {
 		WARN_ON(1);
-		goto err;
-	}
-	if (eb->first_page != page) {
-		WARN_ON(1);
-		goto err;
+		return 0;
 	}
 	if (!PageUptodate(page)) {
 		WARN_ON(1);
-		goto err;
+		return 0;
 	}
 	csum_tree_block(root, eb, 0);
-err:
-	free_extent_buffer(eb);
-out:
 	return 0;
 }
 
@@ -457,8 +496,7 @@
 	u8 fsid[BTRFS_UUID_SIZE];
 	int ret = 1;
 
-	read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
-			   BTRFS_FSID_SIZE);
+	read_extent_buffer(eb, fsid, btrfs_header_fsid(eb), BTRFS_FSID_SIZE);
 	while (fs_devices) {
 		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
 			ret = 0;
@@ -472,8 +510,7 @@
 #define CORRUPT(reason, eb, root, slot)				\
 	printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu,"	\
 	       "root=%llu, slot=%d\n", reason,			\
-	       (unsigned long long)btrfs_header_bytenr(eb),	\
-	       (unsigned long long)root->objectid, slot)
+	       btrfs_header_bytenr(eb),	root->objectid, slot)
 
 static noinline int check_leaf(struct btrfs_root *root,
 			       struct extent_buffer *leaf)
@@ -536,55 +573,60 @@
 	return 0;
 }
 
-static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
-			       struct extent_state *state)
+static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
+				      u64 phy_offset, struct page *page,
+				      u64 start, u64 end, int mirror)
 {
 	struct extent_io_tree *tree;
 	u64 found_start;
 	int found_level;
-	unsigned long len;
 	struct extent_buffer *eb;
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 	int ret = 0;
+	int reads_done;
 
-	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	if (page->private == EXTENT_PAGE_PRIVATE)
-		goto out;
 	if (!page->private)
 		goto out;
 
-	len = page->private >> 2;
-	WARN_ON(len == 0);
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	eb = (struct extent_buffer *)page->private;
 
-	eb = alloc_extent_buffer(tree, start, len, page);
-	if (eb == NULL) {
+	/* the pending IO might have been the only thing that kept this buffer
+	 * in memory.  Make sure we have a ref for all these other checks
+	 */
+	extent_buffer_get(eb);
+
+	reads_done = atomic_dec_and_test(&eb->io_pages);
+	if (!reads_done)
+		goto err;
+
+	eb->read_mirror = mirror;
+	if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
 		ret = -EIO;
-		goto out;
+		goto err;
 	}
 
 	found_start = btrfs_header_bytenr(eb);
-	if (found_start != start) {
+	if (found_start != eb->start) {
 		printk_ratelimited(KERN_INFO "btrfs bad tree block start "
 			       "%llu %llu\n",
-			       (unsigned long long)found_start,
-			       (unsigned long long)eb->start);
-		ret = -EIO;
-		goto err;
-	}
-	if (eb->first_page != page) {
-		printk(KERN_INFO "btrfs bad first page %lu %lu\n",
-		       eb->first_page->index, page->index);
-		WARN_ON(1);
+			       found_start, eb->start);
 		ret = -EIO;
 		goto err;
 	}
 	if (check_tree_block_fsid(root, eb)) {
 		printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
-			       (unsigned long long)eb->start);
+			       eb->start);
 		ret = -EIO;
 		goto err;
 	}
 	found_level = btrfs_header_level(eb);
+	if (found_level >= BTRFS_MAX_LEVEL) {
+		btrfs_info(root->fs_info, "bad tree block level %d\n",
+			   (int)btrfs_header_level(eb));
+		ret = -EIO;
+		goto err;
+	}
 
 	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb),
 				       eb, found_level);
@@ -605,48 +647,38 @@
 		ret = -EIO;
 	}
 
-	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
-	end = eb->start + end - 1;
+	if (!ret)
+		set_extent_buffer_uptodate(eb);
 err:
-	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
-		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+	if (reads_done &&
+	    test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
 		btree_readahead_hook(root, eb, eb->start, ret);
-	}
 
+	if (ret) {
+		/*
+		 * our io error hook is going to dec the io pages
+		 * again, we have to make sure it has something
+		 * to decrement
+		 */
+		atomic_inc(&eb->io_pages);
+		clear_extent_buffer_uptodate(eb);
+	}
 	free_extent_buffer(eb);
 out:
 	return ret;
 }
 
-static int btree_io_failed_hook(struct bio *failed_bio,
-			 struct page *page, u64 start, u64 end,
-			 int mirror_num, struct extent_state *state)
+static int btree_io_failed_hook(struct page *page, int failed_mirror)
 {
-	struct extent_io_tree *tree;
-	unsigned long len;
 	struct extent_buffer *eb;
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
 
-	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	if (page->private == EXTENT_PAGE_PRIVATE)
-		goto out;
-	if (!page->private)
-		goto out;
-
-	len = page->private >> 2;
-	WARN_ON(len == 0);
-
-	eb = alloc_extent_buffer(tree, start, len, page);
-	if (eb == NULL)
-		goto out;
-
-	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
-		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+	eb = (struct extent_buffer *)page->private;
+	set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+	eb->read_mirror = failed_mirror;
+	atomic_dec(&eb->io_pages);
+	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
 		btree_readahead_hook(root, eb, eb->start, -EIO);
-	}
-	free_extent_buffer(eb);
-
-out:
 	return -EIO;	/* we fixed nothing */
 }
 
@@ -661,17 +693,23 @@
 	end_io_wq->work.flags = 0;
 
 	if (bio->bi_rw & REQ_WRITE) {
-		if (end_io_wq->metadata == 1)
+		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
 			btrfs_queue_worker(&fs_info->endio_meta_write_workers,
 					   &end_io_wq->work);
-		else if (end_io_wq->metadata == 2)
+		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
 			btrfs_queue_worker(&fs_info->endio_freespace_worker,
 					   &end_io_wq->work);
+		else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+			btrfs_queue_worker(&fs_info->endio_raid56_workers,
+					   &end_io_wq->work);
 		else
 			btrfs_queue_worker(&fs_info->endio_write_workers,
 					   &end_io_wq->work);
 	} else {
-		if (end_io_wq->metadata)
+		if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
+			btrfs_queue_worker(&fs_info->endio_raid56_workers,
+					   &end_io_wq->work);
+		else if (end_io_wq->metadata)
 			btrfs_queue_worker(&fs_info->endio_meta_workers,
 					   &end_io_wq->work);
 		else
@@ -686,6 +724,7 @@
  * 0 - if data
  * 1 - if normal metadata
  * 2 - if writing to the free space cache area
+ * 3 - raid parity work
  */
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata)
@@ -718,11 +757,14 @@
 static void run_one_async_start(struct btrfs_work *work)
 {
 	struct async_submit_bio *async;
+	int ret;
 
 	async = container_of(work, struct  async_submit_bio, work);
-	async->submit_bio_start(async->inode, async->rw, async->bio,
-			       async->mirror_num, async->bio_flags,
-			       async->bio_offset);
+	ret = async->submit_bio_start(async->inode, async->rw, async->bio,
+				      async->mirror_num, async->bio_flags,
+				      async->bio_offset);
+	if (ret)
+		async->error = ret;
 }
 
 static void run_one_async_done(struct btrfs_work *work)
@@ -737,12 +779,16 @@
 	limit = btrfs_async_submit_limit(fs_info);
 	limit = limit * 2 / 3;
 
-	atomic_dec(&fs_info->nr_async_submits);
-
-	if (atomic_read(&fs_info->nr_async_submits) < limit &&
+	if (atomic_dec_return(&fs_info->nr_async_submits) < limit &&
 	    waitqueue_active(&fs_info->async_submit_wait))
 		wake_up(&fs_info->async_submit_wait);
 
+	/* If an error occurred we just want to clean up the bio and move on */
+	if (async->error) {
+		bio_endio(async->bio, async->error);
+		return;
+	}
+
 	async->submit_bio_done(async->inode, async->rw, async->bio,
 			       async->mirror_num, async->bio_flags,
 			       async->bio_offset);
@@ -784,6 +830,8 @@
 	async->bio_flags = bio_flags;
 	async->bio_offset = bio_offset;
 
+	async->error = 0;
+
 	atomic_inc(&fs_info->nr_async_submits);
 
 	if (rw & REQ_SYNC)
@@ -805,15 +853,18 @@
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
 	struct btrfs_root *root;
+	int ret = 0;
 
 	WARN_ON(bio->bi_vcnt <= 0);
 	while (bio_index < bio->bi_vcnt) {
 		root = BTRFS_I(bvec->bv_page->mapping->host)->root;
-		csum_dirty_buffer(root, bvec->bv_page);
+		ret = csum_dirty_buffer(root, bvec->bv_page);
+		if (ret)
+			break;
 		bio_index++;
 		bvec++;
 	}
-	return 0;
+	return ret;
 }
 
 static int __btree_submit_bio_start(struct inode *inode, int rw,
@@ -825,49 +876,77 @@
 	 * when we're called for a write, we're already in the async
 	 * submission context.  Just jump into btrfs_map_bio
 	 */
-	btree_csum_one_bio(bio);
-	return 0;
+	return btree_csum_one_bio(bio);
 }
 
 static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
+	int ret;
+
 	/*
 	 * when we're called for a write, we're already in the async
 	 * submission context.  Just jump into btrfs_map_bio
 	 */
-	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+	ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1);
+	if (ret)
+		bio_endio(bio, ret);
+	return ret;
+}
+
+static int check_async_write(struct inode *inode, unsigned long bio_flags)
+{
+	if (bio_flags & EXTENT_BIO_TREE_LOG)
+		return 0;
+#ifdef CONFIG_X86
+	if (cpu_has_xmm4_2)
+		return 0;
+#endif
+	return 1;
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 				 int mirror_num, unsigned long bio_flags,
 				 u64 bio_offset)
 {
+	int async = check_async_write(inode, bio_flags);
 	int ret;
 
-	ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
-					  bio, 1);
-	BUG_ON(ret);
-
 	if (!(rw & REQ_WRITE)) {
 		/*
 		 * called for a read, do the setup so that checksum validation
 		 * can happen in the async kernel threads
 		 */
-		return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
-				     mirror_num, 0);
+		ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info,
+					  bio, 1);
+		if (ret)
+			goto out_w_error;
+		ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+				    mirror_num, 0);
+	} else if (!async) {
+		ret = btree_csum_one_bio(bio);
+		if (ret)
+			goto out_w_error;
+		ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio,
+				    mirror_num, 0);
+	} else {
+		/*
+		 * kthread helpers are used to submit writes so that
+		 * checksumming can happen in parallel across all CPUs
+		 */
+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+					  inode, rw, bio, mirror_num, 0,
+					  bio_offset,
+					  __btree_submit_bio_start,
+					  __btree_submit_bio_done);
 	}
 
-	/*
-	 * kthread helpers are used to submit writes so that checksumming
-	 * can happen in parallel across all CPUs
-	 */
-	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
-				   inode, rw, bio, mirror_num, 0,
-				   bio_offset,
-				   __btree_submit_bio_start,
-				   __btree_submit_bio_done);
+	if (ret) {
+out_w_error:
+		bio_endio(bio, ret);
+	}
+	return ret;
 }
 
 #ifdef CONFIG_MIGRATION
@@ -892,54 +971,28 @@
 }
 #endif
 
-static int btree_writepage(struct page *page, struct writeback_control *wbc)
-{
-	struct extent_io_tree *tree;
-	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	struct extent_buffer *eb;
-	int was_dirty;
-
-	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	if (!(current->flags & PF_MEMALLOC)) {
-		return extent_write_full_page(tree, page,
-					      btree_get_extent, wbc);
-	}
-
-	redirty_page_for_writepage(wbc, page);
-	eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
-	WARN_ON(!eb);
-
-	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
-	if (!was_dirty) {
-		spin_lock(&root->fs_info->delalloc_lock);
-		root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
-		spin_unlock(&root->fs_info->delalloc_lock);
-	}
-	free_extent_buffer(eb);
-
-	unlock_page(page);
-	return 0;
-}
 
 static int btree_writepages(struct address_space *mapping,
 			    struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
+	struct btrfs_fs_info *fs_info;
+	int ret;
+
 	tree = &BTRFS_I(mapping->host)->io_tree;
 	if (wbc->sync_mode == WB_SYNC_NONE) {
-		struct btrfs_root *root = BTRFS_I(mapping->host)->root;
-		u64 num_dirty;
-		unsigned long thresh = 32 * 1024 * 1024;
 
 		if (wbc->for_kupdate)
 			return 0;
 
+		fs_info = BTRFS_I(mapping->host)->root->fs_info;
 		/* this is a bit racy, but that's ok */
-		num_dirty = root->fs_info->dirty_metadata_bytes;
-		if (num_dirty < thresh)
+		ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes,
+					     BTRFS_DIRTY_METADATA_THRESH);
+		if (ret < 0)
 			return 0;
 	}
-	return extent_writepages(tree, mapping, btree_get_extent, wbc);
+	return btree_write_cache_pages(mapping, wbc);
 }
 
 static int btree_readpage(struct file *file, struct page *page)
@@ -951,28 +1004,10 @@
 
 static int btree_releasepage(struct page *page, gfp_t gfp_flags)
 {
-	struct extent_io_tree *tree;
-	struct extent_map_tree *map;
-	int ret;
-
 	if (PageWriteback(page) || PageDirty(page))
 		return 0;
 
-	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	map = &BTRFS_I(page->mapping->host)->extent_tree;
-
-	ret = try_release_extent_state(map, tree, page, gfp_flags);
-	if (!ret)
-		return 0;
-
-	ret = try_release_extent_buffer(tree, page);
-	if (ret == 1) {
-		ClearPagePrivate(page);
-		set_page_private(page, 0);
-		page_cache_release(page);
-	}
-
-	return ret;
+	return try_release_extent_buffer(page);
 }
 
 static void btree_invalidatepage(struct page *page, unsigned long offset)
@@ -990,15 +1025,30 @@
 	}
 }
 
+static int btree_set_page_dirty(struct page *page)
+{
+#ifdef DEBUG
+	struct extent_buffer *eb;
+
+	BUG_ON(!PagePrivate(page));
+	eb = (struct extent_buffer *)page->private;
+	BUG_ON(!eb);
+	BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+	BUG_ON(!atomic_read(&eb->refs));
+	btrfs_assert_tree_locked(eb);
+#endif
+	return __set_page_dirty_nobuffers(page);
+}
+
 static const struct address_space_operations btree_aops = {
 	.readpage	= btree_readpage,
-	.writepage	= btree_writepage,
 	.writepages	= btree_writepages,
 	.releasepage	= btree_releasepage,
 	.invalidatepage = btree_invalidatepage,
 #ifdef CONFIG_MIGRATION
 	.migratepage	= btree_migratepage,
 #endif
+	.set_page_dirty = btree_set_page_dirty,
 };
 
 int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -1041,7 +1091,7 @@
 	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
 		free_extent_buffer(buf);
 		return -EIO;
-	} else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+	} else if (extent_buffer_uptodate(buf)) {
 		*eb = buf;
 	} else {
 		free_extent_buffer(buf);
@@ -1066,20 +1116,20 @@
 	struct extent_buffer *eb;
 
 	eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
-				 bytenr, blocksize, NULL);
+				 bytenr, blocksize);
 	return eb;
 }
 
 
 int btrfs_write_tree_block(struct extent_buffer *buf)
 {
-	return filemap_fdatawrite_range(buf->first_page->mapping, buf->start,
+	return filemap_fdatawrite_range(buf->pages[0]->mapping, buf->start,
 					buf->start + buf->len - 1);
 }
 
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf)
 {
-	return filemap_fdatawait_range(buf->first_page->mapping,
+	return filemap_fdatawait_range(buf->pages[0]->mapping,
 				       buf->start, buf->start + buf->len - 1);
 }
 
@@ -1094,42 +1144,38 @@
 		return NULL;
 
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
-
-	if (ret == 0)
-		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
+	if (ret) {
+		free_extent_buffer(buf);
+		return NULL;
+	}
 	return buf;
 
 }
 
-int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		     struct extent_buffer *buf)
+void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		      struct extent_buffer *buf)
 {
-	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
 	if (btrfs_header_generation(buf) ==
-	    root->fs_info->running_transaction->transid) {
+	    fs_info->running_transaction->transid) {
 		btrfs_assert_tree_locked(buf);
 
 		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
-			spin_lock(&root->fs_info->delalloc_lock);
-			if (root->fs_info->dirty_metadata_bytes >= buf->len)
-				root->fs_info->dirty_metadata_bytes -= buf->len;
-			else
-				WARN_ON(1);
-			spin_unlock(&root->fs_info->delalloc_lock);
+			__percpu_counter_add(&fs_info->dirty_metadata_bytes,
+					     -buf->len,
+					     fs_info->dirty_metadata_batch);
+			/* ugh, clear_extent_buffer_dirty needs to lock the page */
+			btrfs_set_lock_blocking(buf);
+			clear_extent_buffer_dirty(buf);
 		}
-
-		/* ugh, clear_extent_buffer_dirty needs to lock the page */
-		btrfs_set_lock_blocking(buf);
-		clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
-					  buf);
 	}
-	return 0;
 }
 
-static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
-			u32 stripesize, struct btrfs_root *root,
-			struct btrfs_fs_info *fs_info,
-			u64 objectid)
+static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
+			 u32 stripesize, struct btrfs_root *root,
+			 struct btrfs_fs_info *fs_info,
+			 u64 objectid)
 {
 	root->node = NULL;
 	root->commit_root = NULL;
@@ -1143,10 +1189,11 @@
 	root->orphan_item_inserted = 0;
 	root->orphan_cleanup_state = 0;
 
-	root->fs_info = fs_info;
 	root->objectid = objectid;
 	root->last_trans = 0;
 	root->highest_objectid = 0;
+	root->nr_delalloc_inodes = 0;
+	root->nr_ordered_extents = 0;
 	root->name = NULL;
 	root->inode_tree = RB_ROOT;
 	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
@@ -1154,11 +1201,20 @@
 	root->orphan_block_rsv = NULL;
 
 	INIT_LIST_HEAD(&root->dirty_list);
-	INIT_LIST_HEAD(&root->orphan_list);
 	INIT_LIST_HEAD(&root->root_list);
+	INIT_LIST_HEAD(&root->delalloc_inodes);
+	INIT_LIST_HEAD(&root->delalloc_root);
+	INIT_LIST_HEAD(&root->ordered_extents);
+	INIT_LIST_HEAD(&root->ordered_root);
+	INIT_LIST_HEAD(&root->logged_list[0]);
+	INIT_LIST_HEAD(&root->logged_list[1]);
 	spin_lock_init(&root->orphan_lock);
 	spin_lock_init(&root->inode_lock);
+	spin_lock_init(&root->delalloc_lock);
+	spin_lock_init(&root->ordered_extent_lock);
 	spin_lock_init(&root->accounting_lock);
+	spin_lock_init(&root->log_extents_lock[0]);
+	spin_lock_init(&root->log_extents_lock[1]);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
 	init_waitqueue_head(&root->log_writer_wait);
@@ -1167,7 +1223,9 @@
 	atomic_set(&root->log_commit[0], 0);
 	atomic_set(&root->log_commit[1], 0);
 	atomic_set(&root->log_writers, 0);
-	root->log_batch = 0;
+	atomic_set(&root->log_batch, 0);
+	atomic_set(&root->orphan_inodes, 0);
+	atomic_set(&root->refs, 1);
 	root->log_transid = 0;
 	root->last_log_commit = 0;
 	extent_io_tree_init(&root->dirty_log_pages,
@@ -1182,39 +1240,100 @@
 	root->defrag_running = 0;
 	root->root_key.objectid = objectid;
 	root->anon_dev = 0;
-	return 0;
+
+	spin_lock_init(&root->root_item_lock);
 }
 
-static int find_and_setup_root(struct btrfs_root *tree_root,
-			       struct btrfs_fs_info *fs_info,
-			       u64 objectid,
-			       struct btrfs_root *root)
+static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info)
 {
-	int ret;
-	u32 blocksize;
-	u64 generation;
+	struct btrfs_root *root = kzalloc(sizeof(*root), GFP_NOFS);
+	if (root)
+		root->fs_info = fs_info;
+	return root;
+}
+
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+				     struct btrfs_fs_info *fs_info,
+				     u64 objectid)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	int ret = 0;
+	u64 bytenr;
+	uuid_le uuid;
+
+	root = btrfs_alloc_root(fs_info);
+	if (!root)
+		return ERR_PTR(-ENOMEM);
 
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
 		     root, fs_info, objectid);
-	ret = btrfs_find_last_root(tree_root, objectid,
-				   &root->root_item, &root->root_key);
-	if (ret > 0)
-		return -ENOENT;
-	BUG_ON(ret);
+	root->root_key.objectid = objectid;
+	root->root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root->root_key.offset = 0;
 
-	generation = btrfs_root_generation(&root->root_item);
-	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
-	root->commit_root = NULL;
-	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
-				     blocksize, generation);
-	if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
-		free_extent_buffer(root->node);
-		root->node = NULL;
-		return -EIO;
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      0, objectid, NULL, 0, 0, 0);
+	if (IS_ERR(leaf)) {
+		ret = PTR_ERR(leaf);
+		leaf = NULL;
+		goto fail;
 	}
+
+	bytenr = leaf->start;
+	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_bytenr(leaf, leaf->start);
+	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+	btrfs_set_header_owner(leaf, objectid);
+	root->node = leaf;
+
+	write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(leaf),
+			    BTRFS_FSID_SIZE);
+	write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
+			    btrfs_header_chunk_tree_uuid(leaf),
+			    BTRFS_UUID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
+
 	root->commit_root = btrfs_root_node(root);
-	return 0;
+	root->track_dirty = 1;
+
+
+	root->root_item.flags = 0;
+	root->root_item.byte_limit = 0;
+	btrfs_set_root_bytenr(&root->root_item, leaf->start);
+	btrfs_set_root_generation(&root->root_item, trans->transid);
+	btrfs_set_root_level(&root->root_item, 0);
+	btrfs_set_root_refs(&root->root_item, 1);
+	btrfs_set_root_used(&root->root_item, leaf->len);
+	btrfs_set_root_last_snapshot(&root->root_item, 0);
+	btrfs_set_root_dirid(&root->root_item, 0);
+	uuid_le_gen(&uuid);
+	memcpy(root->root_item.uuid, uuid.b, BTRFS_UUID_SIZE);
+	root->root_item.drop_level = 0;
+
+	key.objectid = objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = 0;
+	ret = btrfs_insert_root(trans, tree_root, &key, &root->root_item);
+	if (ret)
+		goto fail;
+
+	btrfs_tree_unlock(leaf);
+
+	return root;
+
+fail:
+	if (leaf) {
+		btrfs_tree_unlock(leaf);
+		free_extent_buffer(leaf);
+	}
+	kfree(root);
+
+	return ERR_PTR(ret);
 }
 
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
@@ -1224,7 +1343,7 @@
 	struct btrfs_root *tree_root = fs_info->tree_root;
 	struct extent_buffer *leaf;
 
-	root = kzalloc(sizeof(*root), GFP_NOFS);
+	root = btrfs_alloc_root(fs_info);
 	if (!root)
 		return ERR_PTR(-ENOMEM);
 
@@ -1244,7 +1363,8 @@
 	root->ref_cows = 0;
 
 	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-				      BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
+				      BTRFS_TREE_LOG_OBJECTID, NULL,
+				      0, 0, 0);
 	if (IS_ERR(leaf)) {
 		kfree(root);
 		return ERR_CAST(leaf);
@@ -1258,8 +1378,7 @@
 	root->node = leaf;
 
 	write_extent_buffer(root->node, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(root->node),
-			    BTRFS_FSID_SIZE);
+			    btrfs_header_fsid(root->node), BTRFS_FSID_SIZE);
 	btrfs_mark_buffer_dirty(root->node);
 	btrfs_tree_unlock(root->node);
 	return root;
@@ -1292,11 +1411,11 @@
 	log_root->root_key.offset = root->root_key.objectid;
 
 	inode_item = &log_root->root_item.inode;
-	inode_item->generation = cpu_to_le64(1);
-	inode_item->size = cpu_to_le64(3);
-	inode_item->nlink = cpu_to_le32(1);
-	inode_item->nbytes = cpu_to_le64(root->leafsize);
-	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+	btrfs_set_stack_inode_generation(inode_item, 1);
+	btrfs_set_stack_inode_size(inode_item, 3);
+	btrfs_set_stack_inode_nlink(inode_item, 1);
+	btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
 
 	btrfs_set_root_node(&log_root->root_item, log_root->node);
 
@@ -1307,63 +1426,73 @@
 	return 0;
 }
 
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
-					       struct btrfs_key *location)
+static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+					       struct btrfs_key *key)
 {
 	struct btrfs_root *root;
 	struct btrfs_fs_info *fs_info = tree_root->fs_info;
 	struct btrfs_path *path;
-	struct extent_buffer *l;
 	u64 generation;
 	u32 blocksize;
-	int ret = 0;
+	int ret;
 
-	root = kzalloc(sizeof(*root), GFP_NOFS);
-	if (!root)
+	path = btrfs_alloc_path();
+	if (!path)
 		return ERR_PTR(-ENOMEM);
-	if (location->offset == (u64)-1) {
-		ret = find_and_setup_root(tree_root, fs_info,
-					  location->objectid, root);
-		if (ret) {
-			kfree(root);
-			return ERR_PTR(ret);
-		}
-		goto out;
+
+	root = btrfs_alloc_root(fs_info);
+	if (!root) {
+		ret = -ENOMEM;
+		goto alloc_fail;
 	}
 
 	__setup_root(tree_root->nodesize, tree_root->leafsize,
 		     tree_root->sectorsize, tree_root->stripesize,
-		     root, fs_info, location->objectid);
+		     root, fs_info, key->objectid);
 
-	path = btrfs_alloc_path();
-	if (!path) {
-		kfree(root);
-		return ERR_PTR(-ENOMEM);
-	}
-	ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
-	if (ret == 0) {
-		l = path->nodes[0];
-		read_extent_buffer(l, &root->root_item,
-				btrfs_item_ptr_offset(l, path->slots[0]),
-				sizeof(root->root_item));
-		memcpy(&root->root_key, location, sizeof(*location));
-	}
-	btrfs_free_path(path);
+	ret = btrfs_find_root(tree_root, key, path,
+			      &root->root_item, &root->root_key);
 	if (ret) {
-		kfree(root);
 		if (ret > 0)
 			ret = -ENOENT;
-		return ERR_PTR(ret);
+		goto find_fail;
 	}
 
 	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
+	if (!root->node) {
+		ret = -ENOMEM;
+		goto find_fail;
+	} else if (!btrfs_buffer_uptodate(root->node, generation, 0)) {
+		ret = -EIO;
+		goto read_fail;
+	}
 	root->commit_root = btrfs_root_node(root);
-	BUG_ON(!root->node);
 out:
-	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
+	btrfs_free_path(path);
+	return root;
+
+read_fail:
+	free_extent_buffer(root->node);
+find_fail:
+	kfree(root);
+alloc_fail:
+	root = ERR_PTR(ret);
+	goto out;
+}
+
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+				      struct btrfs_key *location)
+{
+	struct btrfs_root *root;
+
+	root = btrfs_read_tree_root(tree_root, location);
+	if (IS_ERR(root))
+		return root;
+
+	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 		root->ref_cows = 1;
 		btrfs_check_and_init_root_item(&root->root_item);
 	}
@@ -1371,34 +1500,10 @@
 	return root;
 }
 
-struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
-					      struct btrfs_key *location)
+int btrfs_init_fs_root(struct btrfs_root *root)
 {
-	struct btrfs_root *root;
 	int ret;
 
-	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
-		return fs_info->tree_root;
-	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
-		return fs_info->extent_root;
-	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
-		return fs_info->chunk_root;
-	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
-		return fs_info->dev_root;
-	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
-		return fs_info->csum_root;
-again:
-	spin_lock(&fs_info->fs_roots_radix_lock);
-	root = radix_tree_lookup(&fs_info->fs_roots_radix,
-				 (unsigned long)location->objectid);
-	spin_unlock(&fs_info->fs_roots_radix_lock);
-	if (root)
-		return root;
-
-	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
-	if (IS_ERR(root))
-		return root;
-
 	root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
 	root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
 					GFP_NOFS);
@@ -1415,21 +1520,33 @@
 	ret = get_anon_bdev(&root->anon_dev);
 	if (ret)
 		goto fail;
+	return 0;
+fail:
+	kfree(root->free_ino_ctl);
+	kfree(root->free_ino_pinned);
+	return ret;
+}
 
-	if (btrfs_root_refs(&root->root_item) == 0) {
-		ret = -ENOENT;
-		goto fail;
-	}
+static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+					       u64 root_id)
+{
+	struct btrfs_root *root;
 
-	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
-	if (ret < 0)
-		goto fail;
-	if (ret == 0)
-		root->orphan_item_inserted = 1;
+	spin_lock(&fs_info->fs_roots_radix_lock);
+	root = radix_tree_lookup(&fs_info->fs_roots_radix,
+				 (unsigned long)root_id);
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+	return root;
+}
+
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+			 struct btrfs_root *root)
+{
+	int ret;
 
 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
 	if (ret)
-		goto fail;
+		return ret;
 
 	spin_lock(&fs_info->fs_roots_radix_lock);
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
@@ -1437,9 +1554,63 @@
 				root);
 	if (ret == 0)
 		root->in_radix = 1;
-
 	spin_unlock(&fs_info->fs_roots_radix_lock);
 	radix_tree_preload_end();
+
+	return ret;
+}
+
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+				     struct btrfs_key *location,
+				     bool check_ref)
+{
+	struct btrfs_root *root;
+	int ret;
+
+	if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
+		return fs_info->tree_root;
+	if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+		return fs_info->extent_root;
+	if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID)
+		return fs_info->chunk_root;
+	if (location->objectid == BTRFS_DEV_TREE_OBJECTID)
+		return fs_info->dev_root;
+	if (location->objectid == BTRFS_CSUM_TREE_OBJECTID)
+		return fs_info->csum_root;
+	if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
+		return fs_info->quota_root ? fs_info->quota_root :
+					     ERR_PTR(-ENOENT);
+	if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
+		return fs_info->uuid_root ? fs_info->uuid_root :
+					    ERR_PTR(-ENOENT);
+again:
+	root = btrfs_lookup_fs_root(fs_info, location->objectid);
+	if (root) {
+		if (check_ref && btrfs_root_refs(&root->root_item) == 0)
+			return ERR_PTR(-ENOENT);
+		return root;
+	}
+
+	root = btrfs_read_fs_root(fs_info->tree_root, location);
+	if (IS_ERR(root))
+		return root;
+
+	if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
+		ret = -ENOENT;
+		goto fail;
+	}
+
+	ret = btrfs_init_fs_root(root);
+	if (ret)
+		goto fail;
+
+	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+	if (ret < 0)
+		goto fail;
+	if (ret == 0)
+		root->orphan_item_inserted = 1;
+
+	ret = btrfs_insert_fs_root(fs_info, root);
 	if (ret) {
 		if (ret == -EEXIST) {
 			free_fs_root(root);
@@ -1447,10 +1618,6 @@
 		}
 		goto fail;
 	}
-
-	ret = btrfs_find_dead_roots(fs_info->tree_root,
-				    root->root_key.objectid);
-	WARN_ON(ret);
 	return root;
 fail:
 	free_fs_root(root);
@@ -1497,41 +1664,6 @@
 	return 0;
 }
 
-static int bio_ready_for_csum(struct bio *bio)
-{
-	u64 length = 0;
-	u64 buf_len = 0;
-	u64 start = 0;
-	struct page *page;
-	struct extent_io_tree *io_tree = NULL;
-	struct bio_vec *bvec;
-	int i;
-	int ret;
-
-	bio_for_each_segment(bvec, bio, i) {
-		page = bvec->bv_page;
-		if (page->private == EXTENT_PAGE_PRIVATE) {
-			length += bvec->bv_len;
-			continue;
-		}
-		if (!page->private) {
-			length += bvec->bv_len;
-			continue;
-		}
-		length = bvec->bv_len;
-		buf_len = page->private >> 2;
-		start = page_offset(page) + bvec->bv_offset;
-		io_tree = &BTRFS_I(page->mapping->host)->io_tree;
-	}
-	/* are we fully contained in this bio? */
-	if (buf_len <= length)
-		return 1;
-
-	ret = extent_range_uptodate(io_tree, start + length,
-				    start + buf_len - 1);
-	return ret;
-}
-
 /*
  * called by the kthread helper functions to finally call the bio end_io
  * functions.  This is where read checksum verification actually happens
@@ -1547,17 +1679,6 @@
 	bio = end_io_wq->bio;
 	fs_info = end_io_wq->info;
 
-	/* metadata bio reads are special because the whole tree block must
-	 * be checksummed at once.  This makes sure the entire block is in
-	 * ram and up to date before trying to verify things.  For
-	 * blocksize <= pagesize, it is basically a noop
-	 */
-	if (!(bio->bi_rw & REQ_WRITE) && end_io_wq->metadata &&
-	    !bio_ready_for_csum(bio)) {
-		btrfs_queue_worker(&fs_info->endio_meta_workers,
-				   &end_io_wq->work);
-		return;
-	}
 	error = end_io_wq->error;
 	bio->bi_private = end_io_wq->private;
 	bio->bi_end_io = end_io_wq->end_io;
@@ -1568,21 +1689,38 @@
 static int cleaner_kthread(void *arg)
 {
 	struct btrfs_root *root = arg;
+	int again;
 
 	do {
-		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+		again = 0;
+
+		/* Make the cleaner go to sleep early. */
+		if (btrfs_need_cleaner_sleep(root))
+			goto sleep;
+
+		if (!mutex_trylock(&root->fs_info->cleaner_mutex))
+			goto sleep;
 
-		if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
-		    mutex_trylock(&root->fs_info->cleaner_mutex)) {
-			btrfs_run_delayed_iputs(root);
-			btrfs_clean_old_snapshots(root);
+		/*
+		 * The fs state may have changed between the check above and the
+		 * trylock, so check again before doing any work.
+		 */
+		if (btrfs_need_cleaner_sleep(root)) {
 			mutex_unlock(&root->fs_info->cleaner_mutex);
-			btrfs_run_defrag_inodes(root->fs_info);
+			goto sleep;
 		}
 
-		if (freezing(current)) {
-			refrigerator();
-		} else {
+		btrfs_run_delayed_iputs(root);
+		again = btrfs_clean_one_deleted_snapshot(root);
+		mutex_unlock(&root->fs_info->cleaner_mutex);
+
+		/*
+		 * The defragger has dealt with the R/O remount and umount,
+		 * needn't do anything special here.
+		 */
+		btrfs_run_defrag_inodes(root->fs_info);
+sleep:
+		if (!try_to_freeze() && !again) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (!kthread_should_stop())
 				schedule();
@@ -1600,11 +1738,11 @@
 	u64 transid;
 	unsigned long now;
 	unsigned long delay;
-	int ret;
+	bool cannot_commit;
 
 	do {
-		delay = HZ * 30;
-		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
+		cannot_commit = false;
+		delay = HZ * root->fs_info->commit_interval;
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
 		spin_lock(&root->fs_info->trans_lock);
@@ -1615,8 +1753,9 @@
 		}
 
 		now = get_seconds();
-		if (!cur->blocked &&
-		    (now < cur->start_time || now - cur->start_time < 30)) {
+		if (cur->state < TRANS_STATE_BLOCKED &&
+		    (now < cur->start_time ||
+		     now - cur->start_time < root->fs_info->commit_interval)) {
 			spin_unlock(&root->fs_info->trans_lock);
 			delay = HZ * 5;
 			goto sleep;
@@ -1624,11 +1763,15 @@
 		transid = cur->transid;
 		spin_unlock(&root->fs_info->trans_lock);
 
-		trans = btrfs_join_transaction(root);
-		BUG_ON(IS_ERR(trans));
+		/* If the file system is aborted, this will always fail. */
+		trans = btrfs_attach_transaction(root);
+		if (IS_ERR(trans)) {
+			if (PTR_ERR(trans) != -ENOENT)
+				cannot_commit = true;
+			goto sleep;
+		}
 		if (transid == trans->transid) {
-			ret = btrfs_commit_transaction(trans, root);
-			BUG_ON(ret);
+			btrfs_commit_transaction(trans, root);
 		} else {
 			btrfs_end_transaction(trans, root);
 		}
@@ -1636,12 +1779,11 @@
 		wake_up_process(root->fs_info->cleaner_kthread);
 		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 
-		if (freezing(current)) {
-			refrigerator();
-		} else {
+		if (!try_to_freeze()) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			if (!kthread_should_stop() &&
-			    !btrfs_transaction_blocked(root->fs_info))
+			    (!btrfs_transaction_blocked(root->fs_info) ||
+			     cannot_commit))
 				schedule_timeout(delay);
 			__set_current_state(TASK_RUNNING);
 		}
@@ -1848,27 +1990,66 @@
 	return 0;
 }
 
+/* helper to cleanup workers */
+static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
+{
+	btrfs_stop_workers(&fs_info->generic_worker);
+	btrfs_stop_workers(&fs_info->fixup_workers);
+	btrfs_stop_workers(&fs_info->delalloc_workers);
+	btrfs_stop_workers(&fs_info->workers);
+	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_meta_workers);
+	btrfs_stop_workers(&fs_info->endio_raid56_workers);
+	btrfs_stop_workers(&fs_info->rmw_workers);
+	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
+	btrfs_stop_workers(&fs_info->endio_write_workers);
+	btrfs_stop_workers(&fs_info->endio_freespace_worker);
+	btrfs_stop_workers(&fs_info->submit_workers);
+	btrfs_stop_workers(&fs_info->delayed_workers);
+	btrfs_stop_workers(&fs_info->caching_workers);
+	btrfs_stop_workers(&fs_info->readahead_workers);
+	btrfs_stop_workers(&fs_info->flush_workers);
+	btrfs_stop_workers(&fs_info->qgroup_rescan_workers);
+}
+
 /* helper to cleanup tree roots */
 static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 {
 	free_extent_buffer(info->tree_root->node);
 	free_extent_buffer(info->tree_root->commit_root);
-	free_extent_buffer(info->dev_root->node);
-	free_extent_buffer(info->dev_root->commit_root);
-	free_extent_buffer(info->extent_root->node);
-	free_extent_buffer(info->extent_root->commit_root);
-	free_extent_buffer(info->csum_root->node);
-	free_extent_buffer(info->csum_root->commit_root);
-
 	info->tree_root->node = NULL;
 	info->tree_root->commit_root = NULL;
-	info->dev_root->node = NULL;
-	info->dev_root->commit_root = NULL;
-	info->extent_root->node = NULL;
-	info->extent_root->commit_root = NULL;
-	info->csum_root->node = NULL;
-	info->csum_root->commit_root = NULL;
 
+	if (info->dev_root) {
+		free_extent_buffer(info->dev_root->node);
+		free_extent_buffer(info->dev_root->commit_root);
+		info->dev_root->node = NULL;
+		info->dev_root->commit_root = NULL;
+	}
+	if (info->extent_root) {
+		free_extent_buffer(info->extent_root->node);
+		free_extent_buffer(info->extent_root->commit_root);
+		info->extent_root->node = NULL;
+		info->extent_root->commit_root = NULL;
+	}
+	if (info->csum_root) {
+		free_extent_buffer(info->csum_root->node);
+		free_extent_buffer(info->csum_root->commit_root);
+		info->csum_root->node = NULL;
+		info->csum_root->commit_root = NULL;
+	}
+	if (info->quota_root) {
+		free_extent_buffer(info->quota_root->node);
+		free_extent_buffer(info->quota_root->commit_root);
+		info->quota_root->node = NULL;
+		info->quota_root->commit_root = NULL;
+	}
+	if (info->uuid_root) {
+		free_extent_buffer(info->uuid_root->node);
+		free_extent_buffer(info->uuid_root->commit_root);
+		info->uuid_root->node = NULL;
+		info->uuid_root->commit_root = NULL;
+	}
 	if (chunk_root) {
 		free_extent_buffer(info->chunk_root->node);
 		free_extent_buffer(info->chunk_root->commit_root);
@@ -1877,10 +2058,40 @@
 	}
 }
 
+static void del_fs_roots(struct btrfs_fs_info *fs_info)
+{
+	int ret;
+	struct btrfs_root *gang[8];
+	int i;
+
+	while (!list_empty(&fs_info->dead_roots)) {
+		gang[0] = list_entry(fs_info->dead_roots.next,
+				     struct btrfs_root, root_list);
+		list_del(&gang[0]->root_list);
+
+		if (gang[0]->in_radix) {
+			btrfs_drop_and_free_fs_root(fs_info, gang[0]);
+		} else {
+			free_extent_buffer(gang[0]->node);
+			free_extent_buffer(gang[0]->commit_root);
+			btrfs_put_fs_root(gang[0]);
+		}
+	}
 
-struct btrfs_root *open_ctree(struct super_block *sb,
-			      struct btrfs_fs_devices *fs_devices,
-			      char *options)
+	while (1) {
+		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
+					     (void **)gang, 0,
+					     ARRAY_SIZE(gang));
+		if (!ret)
+			break;
+		for (i = 0; i < ret; i++)
+			btrfs_drop_and_free_fs_root(fs_info, gang[i]);
+	}
+}
+
+int open_ctree(struct super_block *sb,
+	       struct btrfs_fs_devices *fs_devices,
+	       char *options)
 {
 	u32 sectorsize;
 	u32 nodesize;
@@ -1892,28 +2103,25 @@
 	struct btrfs_key location;
 	struct buffer_head *bh;
 	struct btrfs_super_block *disk_super;
-	struct btrfs_root *tree_root = btrfs_sb(sb);
-	struct btrfs_fs_info *fs_info = tree_root->fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_root *tree_root;
 	struct btrfs_root *extent_root;
 	struct btrfs_root *csum_root;
 	struct btrfs_root *chunk_root;
 	struct btrfs_root *dev_root;
+	struct btrfs_root *quota_root;
+	struct btrfs_root *uuid_root;
 	struct btrfs_root *log_tree_root;
 	int ret;
 	int err = -EINVAL;
 	int num_backups_tried = 0;
 	int backup_index = 0;
+	bool create_uuid_tree;
+	bool check_uuid_tree;
 
-	extent_root = fs_info->extent_root =
-		kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-	csum_root = fs_info->csum_root =
-		kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-	chunk_root = fs_info->chunk_root =
-		kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-	dev_root = fs_info->dev_root =
-		kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-
-	if (!extent_root || !csum_root || !chunk_root || !dev_root) {
+	tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
+	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
+	if (!tree_root || !chunk_root) {
 		err = -ENOMEM;
 		goto fail;
 	}
@@ -1930,10 +2138,24 @@
 		goto fail_srcu;
 	}
 
+	ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
+	if (ret) {
+		err = ret;
+		goto fail_bdi;
+	}
+	fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
+					(1 + ilog2(nr_cpu_ids));
+
+	ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
+	if (ret) {
+		err = ret;
+		goto fail_dirty_metadata_bytes;
+	}
+
 	fs_info->btree_inode = new_inode(sb);
 	if (!fs_info->btree_inode) {
 		err = -ENOMEM;
-		goto fail_bdi;
+		goto fail_delalloc_bytes;
 	}
 
 	mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
@@ -1942,40 +2164,47 @@
 	INIT_LIST_HEAD(&fs_info->trans_list);
 	INIT_LIST_HEAD(&fs_info->dead_roots);
 	INIT_LIST_HEAD(&fs_info->delayed_iputs);
-	INIT_LIST_HEAD(&fs_info->hashers);
-	INIT_LIST_HEAD(&fs_info->delalloc_inodes);
-	INIT_LIST_HEAD(&fs_info->ordered_operations);
+	INIT_LIST_HEAD(&fs_info->delalloc_roots);
 	INIT_LIST_HEAD(&fs_info->caching_block_groups);
-	spin_lock_init(&fs_info->delalloc_lock);
+	spin_lock_init(&fs_info->delalloc_root_lock);
 	spin_lock_init(&fs_info->trans_lock);
-	spin_lock_init(&fs_info->ref_cache_lock);
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
 	spin_lock_init(&fs_info->free_chunk_lock);
+	spin_lock_init(&fs_info->tree_mod_seq_lock);
+	spin_lock_init(&fs_info->super_lock);
+	rwlock_init(&fs_info->tree_mod_log_lock);
 	mutex_init(&fs_info->reloc_mutex);
+	seqlock_init(&fs_info->profiles_lock);
 
 	init_completion(&fs_info->kobj_unregister);
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
+	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
 	btrfs_mapping_init(&fs_info->mapping_tree);
-	btrfs_init_block_rsv(&fs_info->global_block_rsv);
-	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
-	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
-	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
-	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
-	btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
+	btrfs_init_block_rsv(&fs_info->global_block_rsv,
+			     BTRFS_BLOCK_RSV_GLOBAL);
+	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
+			     BTRFS_BLOCK_RSV_DELALLOC);
+	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
+	btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
+	btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
+	btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
+			     BTRFS_BLOCK_RSV_DELOPS);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
 	atomic_set(&fs_info->defrag_running, 0);
+	atomic64_set(&fs_info->tree_mod_seq, 0);
 	fs_info->sb = sb;
 	fs_info->max_inline = 8192 * 1024;
 	fs_info->metadata_ratio = 0;
 	fs_info->defrag_inodes = RB_ROOT;
-	fs_info->trans_no_join = 0;
 	fs_info->free_chunk_space = 0;
+	fs_info->tree_mod_log = RB_ROOT;
+	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
 
 	/* readahead state */
 	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
@@ -1984,8 +2213,8 @@
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
 
-	INIT_LIST_HEAD(&fs_info->ordered_extents);
-	spin_lock_init(&fs_info->ordered_extent_lock);
+	INIT_LIST_HEAD(&fs_info->ordered_roots);
+	spin_lock_init(&fs_info->ordered_root_lock);
 	fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
 					GFP_NOFS);
 	if (!fs_info->delayed_root) {
@@ -2002,6 +2231,17 @@
 	init_waitqueue_head(&fs_info->scrub_pause_wait);
 	init_rwsem(&fs_info->scrub_super_lock);
 	fs_info->scrub_workers_refcnt = 0;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	fs_info->check_integrity_print_mask = 0;
+#endif
+
+	spin_lock_init(&fs_info->balance_lock);
+	mutex_init(&fs_info->balance_mutex);
+	atomic_set(&fs_info->balance_running, 0);
+	atomic_set(&fs_info->balance_pause_req, 0);
+	atomic_set(&fs_info->balance_cancel_req, 0);
+	fs_info->balance_ctl = NULL;
+	init_waitqueue_head(&fs_info->balance_wait_q);
 
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
@@ -2021,6 +2261,7 @@
 	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
 	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
 			     fs_info->btree_inode->i_mapping);
+	BTRFS_I(fs_info->btree_inode)->io_tree.track_uptodate = 0;
 	extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
 
 	BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
@@ -2028,11 +2269,13 @@
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
-	BTRFS_I(fs_info->btree_inode)->dummy_inode = 1;
+	set_bit(BTRFS_INODE_DUMMY,
+		&BTRFS_I(fs_info->btree_inode)->runtime_flags);
 	insert_inode_hash(fs_info->btree_inode);
 
 	spin_lock_init(&fs_info->block_group_cache_lock);
 	fs_info->block_group_cache_tree = RB_ROOT;
+	fs_info->first_logical_byte = (u64)-1;
 
 	extent_io_tree_init(&fs_info->freed_extents[0],
 			     fs_info->btree_inode->i_mapping);
@@ -2043,6 +2286,7 @@
 
 
 	mutex_init(&fs_info->ordered_operations_mutex);
+	mutex_init(&fs_info->ordered_extent_flush_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
@@ -2051,6 +2295,22 @@
 	init_rwsem(&fs_info->extent_commit_sem);
 	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);
+	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
+	fs_info->dev_replace.lock_owner = 0;
+	atomic_set(&fs_info->dev_replace.nesting_level, 0);
+	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
+	mutex_init(&fs_info->dev_replace.lock_management_lock);
+	mutex_init(&fs_info->dev_replace.lock);
+
+	spin_lock_init(&fs_info->qgroup_lock);
+	mutex_init(&fs_info->qgroup_ioctl_lock);
+	fs_info->qgroup_tree = RB_ROOT;
+	INIT_LIST_HEAD(&fs_info->dirty_qgroups);
+	fs_info->qgroup_seq = 1;
+	fs_info->quota_enabled = 0;
+	fs_info->pending_quota_state = 0;
+	fs_info->qgroup_ulist = NULL;
+	mutex_init(&fs_info->qgroup_rescan_lock);
 
 	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
 	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -2060,15 +2320,41 @@
 	init_waitqueue_head(&fs_info->transaction_blocked_wait);
 	init_waitqueue_head(&fs_info->async_submit_wait);
 
+	ret = btrfs_alloc_stripe_hash_table(fs_info);
+	if (ret) {
+		err = ret;
+		goto fail_alloc;
+	}
+
 	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
+	invalidate_bdev(fs_devices->latest_bdev);
+
+	/*
+	 * Read super block and check the signature bytes only
+	 */
 	bh = btrfs_read_dev_super(fs_devices->latest_bdev);
 	if (!bh) {
 		err = -EINVAL;
 		goto fail_alloc;
 	}
 
+	/*
+	 * We want to check the superblock checksum; the checksum type is stored
+	 * inside. Pass the whole disk block of size BTRFS_SUPER_INFO_SIZE (4k).
+	 */
+	if (btrfs_check_super_csum(bh->b_data)) {
+		printk(KERN_ERR "btrfs: superblock checksum mismatch\n");
+		err = -EINVAL;
+		goto fail_alloc;
+	}
+
+	/*
+	 * super_copy is zeroed at allocation time and we never touch the
+	 * following bytes up to INFO_SIZE, the checksum is calculated from
+	 * the whole block of INFO_SIZE
+	 */
 	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
 	memcpy(fs_info->super_for_commit, fs_info->super_copy,
 	       sizeof(*fs_info->super_for_commit));
@@ -2076,14 +2362,20 @@
 
 	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
 
+	ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+	if (ret) {
+		printk(KERN_ERR "btrfs: superblock contains fatal errors\n");
+		err = -EINVAL;
+		goto fail_alloc;
+	}
+
 	disk_super = fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
 		goto fail_alloc;
 
 	/* check FS state, whether FS is broken. */
-	fs_info->fs_state |= btrfs_super_flags(disk_super);
-
-	btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
+	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
+		set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
 
 	/*
 	 * run through our array of backup supers and setup
@@ -2109,15 +2401,69 @@
 	if (features) {
 		printk(KERN_ERR "BTRFS: couldn't mount because of "
 		       "unsupported optional features (%Lx).\n",
-		       (unsigned long long)features);
+		       features);
+		err = -EINVAL;
+		goto fail_alloc;
+	}
+
+	if (btrfs_super_leafsize(disk_super) !=
+	    btrfs_super_nodesize(disk_super)) {
+		printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+		       "blocksizes don't match.  node %d leaf %d\n",
+		       btrfs_super_nodesize(disk_super),
+		       btrfs_super_leafsize(disk_super));
+		err = -EINVAL;
+		goto fail_alloc;
+	}
+	if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
+		printk(KERN_ERR "BTRFS: couldn't mount because metadata "
+		       "blocksize (%d) was too large\n",
+		       btrfs_super_leafsize(disk_super));
 		err = -EINVAL;
 		goto fail_alloc;
 	}
 
 	features = btrfs_super_incompat_flags(disk_super);
 	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
-	if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
+	if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
 		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
+
+	if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
+		printk(KERN_ERR "btrfs: has skinny extents\n");
+
+	/*
+	 * flag our filesystem as having big metadata blocks if
+	 * they are bigger than the page size
+	 */
+	if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) {
+		if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+			printk(KERN_INFO "btrfs flagging fs with big metadata feature\n");
+		features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+	}
+
+	nodesize = btrfs_super_nodesize(disk_super);
+	leafsize = btrfs_super_leafsize(disk_super);
+	sectorsize = btrfs_super_sectorsize(disk_super);
+	stripesize = btrfs_super_stripesize(disk_super);
+	fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids));
+	fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
+
+	/*
+	 * mixed block groups end up with duplicate but slightly offset
+	 * extent buffers for the same range.  It leads to corruptions
+	 */
+	if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
+	    (sectorsize != leafsize)) {
+		printk(KERN_WARNING "btrfs: unequal leaf/node/sector sizes "
+				"are not allowed for mixed block groups on %s\n",
+				sb->s_id);
+		goto fail_alloc;
+	}
+
+	/*
+	 * No need to take the lock here because no other task will update
+	 * the flag at this point.
+	 */
 	btrfs_set_super_incompat_flags(disk_super, features);
 
 	features = btrfs_super_compat_ro_flags(disk_super) &
@@ -2125,7 +2471,7 @@
 	if (!(sb->s_flags & MS_RDONLY) && features) {
 		printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
 		       "unsupported option features (%Lx).\n",
-		       (unsigned long long)features);
+		       features);
 		err = -EINVAL;
 		goto fail_alloc;
 	}
@@ -2138,16 +2484,17 @@
 			   &fs_info->generic_worker);
 
 	btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
+			   fs_info->thread_pool_size, NULL);
+
+	btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
+			   fs_info->thread_pool_size, NULL);
 
 	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
-			   fs_info->thread_pool_size),
-			   &fs_info->generic_worker);
+			   fs_info->thread_pool_size), NULL);
 
 	btrfs_init_workers(&fs_info->caching_workers, "cache",
-			   2, &fs_info->generic_worker);
+			   fs_info->thread_pool_size, NULL);
 
 	/* a higher idle thresh on the submit workers makes it much more
 	 * likely that bios will be send down in a sane order to the
@@ -2172,6 +2519,12 @@
 	btrfs_init_workers(&fs_info->endio_meta_write_workers,
 			   "endio-meta-write", fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->endio_raid56_workers,
+			   "endio-raid56", fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->rmw_workers,
+			   "rmw", fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 	btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
@@ -2183,6 +2536,8 @@
 	btrfs_init_workers(&fs_info->readahead_workers, "readahead",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->qgroup_rescan_workers, "qgroup-rescan", 1,
+			   &fs_info->generic_worker);
 
 	/*
 	 * endios are largely parallel and should have a very
@@ -2190,6 +2545,8 @@
 	 */
 	fs_info->endio_workers.idle_thresh = 4;
 	fs_info->endio_meta_workers.idle_thresh = 4;
+	fs_info->endio_raid56_workers.idle_thresh = 4;
+	fs_info->rmw_workers.idle_thresh = 2;
 
 	fs_info->endio_write_workers.idle_thresh = 2;
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
@@ -2206,14 +2563,18 @@
 	ret |= btrfs_start_workers(&fs_info->fixup_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_meta_workers);
+	ret |= btrfs_start_workers(&fs_info->rmw_workers);
+	ret |= btrfs_start_workers(&fs_info->endio_raid56_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_write_workers);
 	ret |= btrfs_start_workers(&fs_info->endio_freespace_worker);
 	ret |= btrfs_start_workers(&fs_info->delayed_workers);
 	ret |= btrfs_start_workers(&fs_info->caching_workers);
 	ret |= btrfs_start_workers(&fs_info->readahead_workers);
+	ret |= btrfs_start_workers(&fs_info->flush_workers);
+	ret |= btrfs_start_workers(&fs_info->qgroup_rescan_workers);
 	if (ret) {
-		ret = -ENOMEM;
+		err = -ENOMEM;
 		goto fail_sb_buffer;
 	}
 
@@ -2221,10 +2582,6 @@
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
 				    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
 
-	nodesize = btrfs_super_nodesize(disk_super);
-	leafsize = btrfs_super_leafsize(disk_super);
-	sectorsize = btrfs_super_sectorsize(disk_super);
-	stripesize = btrfs_super_stripesize(disk_super);
 	tree_root->nodesize = nodesize;
 	tree_root->leafsize = leafsize;
 	tree_root->sectorsize = sectorsize;
@@ -2233,12 +2590,20 @@
 	sb->s_blocksize = sectorsize;
 	sb->s_blocksize_bits = blksize_bits(sectorsize);
 
-	if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC,
-		    sizeof(disk_super->magic))) {
+#ifdef MY_ABC_HERE
+	sb->s_archive_version = le32_to_cpu(disk_super->archive_version);
+#endif
+	if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
 		printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
 		goto fail_sb_buffer;
 	}
 
+	if (sectorsize != PAGE_SIZE) {
+		printk(KERN_WARNING "btrfs: Incompatible sector size (%lu) "
+		       "found on %s\n", (unsigned long)sectorsize, sb->s_id);
+		goto fail_sb_buffer;
+	}
+
 	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_sys_array(tree_root);
 	mutex_unlock(&fs_info->chunk_mutex);
@@ -2258,8 +2623,8 @@
 	chunk_root->node = read_tree_block(chunk_root,
 					   btrfs_super_chunk_root(disk_super),
 					   blocksize, generation);
-	BUG_ON(!chunk_root->node);
-	if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+	if (!chunk_root->node ||
+	    !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
 		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
 		       sb->s_id);
 		goto fail_tree_roots;
@@ -2268,19 +2633,26 @@
 	chunk_root->commit_root = btrfs_root_node(chunk_root);
 
 	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
-	   (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
-	   BTRFS_UUID_SIZE);
+	   btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
 
-	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_chunk_tree(chunk_root);
-	mutex_unlock(&fs_info->chunk_mutex);
 	if (ret) {
 		printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
 		       sb->s_id);
 		goto fail_tree_roots;
 	}
 
-	btrfs_close_extra_devices(fs_devices);
+	/*
+	 * keep the device that is marked to be the target device for the
+	 * dev_replace procedure
+	 */
+	btrfs_close_extra_devices(fs_info, fs_devices, 0);
+
+	if (!fs_devices->latest_bdev) {
+		printk(KERN_CRIT "btrfs: failed to read devices on %s\n",
+		       sb->s_id);
+		goto fail_tree_roots;
+	}
 
 retry_root_backup:
 	blocksize = btrfs_level_size(tree_root,
@@ -2301,30 +2673,85 @@
 	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
 	tree_root->commit_root = btrfs_root_node(tree_root);
 
-	ret = find_and_setup_root(tree_root, fs_info,
-				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
-	if (ret)
+	location.objectid = BTRFS_EXTENT_TREE_OBJECTID;
+	location.type = BTRFS_ROOT_ITEM_KEY;
+	location.offset = 0;
+
+	extent_root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(extent_root)) {
+		ret = PTR_ERR(extent_root);
 		goto recovery_tree_root;
+	}
 	extent_root->track_dirty = 1;
+	fs_info->extent_root = extent_root;
 
-	ret = find_and_setup_root(tree_root, fs_info,
-				  BTRFS_DEV_TREE_OBJECTID, dev_root);
-	if (ret)
+	location.objectid = BTRFS_DEV_TREE_OBJECTID;
+	dev_root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(dev_root)) {
+		ret = PTR_ERR(dev_root);
 		goto recovery_tree_root;
+	}
 	dev_root->track_dirty = 1;
+	fs_info->dev_root = dev_root;
+	btrfs_init_devices_late(fs_info);
 
-	ret = find_and_setup_root(tree_root, fs_info,
-				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
-	if (ret)
+	location.objectid = BTRFS_CSUM_TREE_OBJECTID;
+	csum_root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(csum_root)) {
+		ret = PTR_ERR(csum_root);
 		goto recovery_tree_root;
-
+	}
 	csum_root->track_dirty = 1;
+	fs_info->csum_root = csum_root;
+
+	location.objectid = BTRFS_QUOTA_TREE_OBJECTID;
+	quota_root = btrfs_read_tree_root(tree_root, &location);
+	if (!IS_ERR(quota_root)) {
+		quota_root->track_dirty = 1;
+		fs_info->quota_enabled = 1;
+		fs_info->pending_quota_state = 1;
+		fs_info->quota_root = quota_root;
+	}
+
+	location.objectid = BTRFS_UUID_TREE_OBJECTID;
+	uuid_root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(uuid_root)) {
+		ret = PTR_ERR(uuid_root);
+		if (ret != -ENOENT)
+			goto recovery_tree_root;
+		create_uuid_tree = true;
+		check_uuid_tree = false;
+	} else {
+		uuid_root->track_dirty = 1;
+		fs_info->uuid_root = uuid_root;
+		create_uuid_tree = false;
+		check_uuid_tree =
+		    generation != btrfs_super_uuid_tree_generation(disk_super);
+	}
 
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
-	fs_info->data_alloc_profile = (u64)-1;
-	fs_info->metadata_alloc_profile = (u64)-1;
-	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+
+	ret = btrfs_recover_balance(fs_info);
+	if (ret) {
+		printk(KERN_WARNING "btrfs: failed to recover balance\n");
+		goto fail_block_groups;
+	}
+
+	ret = btrfs_init_dev_stats(fs_info);
+	if (ret) {
+		printk(KERN_ERR "btrfs: failed to init dev_stats: %d\n",
+		       ret);
+		goto fail_block_groups;
+	}
+
+	ret = btrfs_init_dev_replace(fs_info);
+	if (ret) {
+		pr_err("btrfs: failed to init dev_replace: %d\n", ret);
+		goto fail_block_groups;
+	}
+
+	btrfs_close_extra_devices(fs_info, fs_devices, 1);
 
 	ret = btrfs_init_space_info(fs_info);
 	if (ret) {
@@ -2337,6 +2764,15 @@
 		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
 		goto fail_block_groups;
 	}
+	fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+	if (fs_info->fs_devices->missing_devices >
+	     fs_info->num_tolerated_disk_barrier_failures &&
+	    !(sb->s_flags & MS_RDONLY)) {
+		printk(KERN_WARNING
+		       "Btrfs: too many missing devices, writeable mount is not allowed\n");
+		goto fail_block_groups;
+	}
 
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
@@ -2357,25 +2793,40 @@
 		btrfs_set_opt(fs_info->mount_opt, SSD);
 	}
 
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	if (btrfs_test_opt(tree_root, CHECK_INTEGRITY)) {
+		ret = btrfsic_mount(tree_root, fs_devices,
+				    btrfs_test_opt(tree_root,
+					CHECK_INTEGRITY_INCLUDING_EXTENT_DATA) ?
+				    1 : 0,
+				    fs_info->check_integrity_print_mask);
+		if (ret)
+			printk(KERN_WARNING "btrfs: failed to initialize"
+			       " integrity check module %s\n", sb->s_id);
+	}
+#endif
+	ret = btrfs_read_qgroup_config(fs_info);
+	if (ret)
+		goto fail_trans_kthread;
+
 	/* do not make disk changes in broken FS */
-	if (btrfs_super_log_root(disk_super) != 0 &&
-	    !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
+	if (btrfs_super_log_root(disk_super) != 0) {
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
 		if (fs_devices->rw_devices == 0) {
 			printk(KERN_WARNING "Btrfs log replay required "
 			       "on RO media\n");
 			err = -EIO;
-			goto fail_trans_kthread;
+			goto fail_qgroup;
 		}
 		blocksize =
 		     btrfs_level_size(tree_root,
 				      btrfs_super_log_root_level(disk_super));
 
-		log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+		log_tree_root = btrfs_alloc_root(fs_info);
 		if (!log_tree_root) {
 			err = -ENOMEM;
-			goto fail_trans_kthread;
+			goto fail_qgroup;
 		}
 
 		__setup_root(nodesize, leafsize, sectorsize, stripesize,
@@ -2384,59 +2835,117 @@
 		log_tree_root->node = read_tree_block(tree_root, bytenr,
 						      blocksize,
 						      generation + 1);
+		if (!log_tree_root->node ||
+		    !extent_buffer_uptodate(log_tree_root->node)) {
+			printk(KERN_ERR "btrfs: failed to read log tree\n");
+			free_extent_buffer(log_tree_root->node);
+			kfree(log_tree_root);
+			goto fail_trans_kthread;
+		}
+		/* returns with log_tree_root freed on success */
 		ret = btrfs_recover_log_trees(log_tree_root);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_error(tree_root->fs_info, ret,
+				    "Failed to recover log tree");
+			free_extent_buffer(log_tree_root->node);
+			kfree(log_tree_root);
+			goto fail_trans_kthread;
+		}
 
 		if (sb->s_flags & MS_RDONLY) {
-			ret =  btrfs_commit_super(tree_root);
-			BUG_ON(ret);
+			ret = btrfs_commit_super(tree_root);
+			if (ret)
+				goto fail_trans_kthread;
 		}
 	}
 
 	ret = btrfs_find_orphan_roots(tree_root);
-	BUG_ON(ret);
+	if (ret)
+		goto fail_trans_kthread;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_cleanup_fs_roots(fs_info);
-		BUG_ON(ret);
+		if (ret)
+			goto fail_trans_kthread;
 
 		ret = btrfs_recover_relocation(tree_root);
 		if (ret < 0) {
 			printk(KERN_WARNING
 			       "btrfs: failed to recover relocation\n");
 			err = -EINVAL;
-			goto fail_trans_kthread;
+			goto fail_qgroup;
 		}
 	}
 
 	location.objectid = BTRFS_FS_TREE_OBJECTID;
 	location.type = BTRFS_ROOT_ITEM_KEY;
-	location.offset = (u64)-1;
+	location.offset = 0;
 
 	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
-	if (!fs_info->fs_root)
-		goto fail_trans_kthread;
 	if (IS_ERR(fs_info->fs_root)) {
 		err = PTR_ERR(fs_info->fs_root);
-		goto fail_trans_kthread;
+		goto fail_qgroup;
 	}
 
-	if (!(sb->s_flags & MS_RDONLY)) {
-		down_read(&fs_info->cleanup_work_sem);
-		err = btrfs_orphan_cleanup(fs_info->fs_root);
-		if (!err)
-			err = btrfs_orphan_cleanup(fs_info->tree_root);
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	down_read(&fs_info->cleanup_work_sem);
+	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
+	    (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
 		up_read(&fs_info->cleanup_work_sem);
-		if (err) {
+		close_ctree(tree_root);
+		return ret;
+	}
+	up_read(&fs_info->cleanup_work_sem);
+
+	ret = btrfs_resume_balance_async(fs_info);
+	if (ret) {
+		printk(KERN_WARNING "btrfs: failed to resume balance\n");
+		close_ctree(tree_root);
+		return ret;
+	}
+
+	ret = btrfs_resume_dev_replace_async(fs_info);
+	if (ret) {
+		pr_warn("btrfs: failed to resume dev_replace\n");
+		close_ctree(tree_root);
+		return ret;
+	}
+
+	btrfs_qgroup_rescan_resume(fs_info);
+
+	if (create_uuid_tree) {
+		pr_info("btrfs: creating UUID tree\n");
+		ret = btrfs_create_uuid_tree(fs_info);
+		if (ret) {
+			pr_warn("btrfs: failed to create the UUID tree %d\n",
+				ret);
+			close_ctree(tree_root);
+			return ret;
+		}
+	} else if (check_uuid_tree ||
+		   btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) {
+		pr_info("btrfs: checking UUID tree\n");
+		ret = btrfs_check_uuid_tree(fs_info);
+		if (ret) {
+			pr_warn("btrfs: failed to check the UUID tree %d\n",
+				ret);
 			close_ctree(tree_root);
-			return ERR_PTR(err);
+			return ret;
 		}
+	} else {
+		fs_info->update_uuid_tree_gen = 1;
 	}
 
-	return tree_root;
+	return 0;
 
+fail_qgroup:
+	btrfs_free_qgroup_config(fs_info);
 fail_trans_kthread:
 	kthread_stop(fs_info->transaction_kthread);
+	btrfs_cleanup_transaction(fs_info->tree_root);
+	del_fs_roots(fs_info);
 fail_cleaner:
 	kthread_stop(fs_info->cleaner_kthread);
 
@@ -2445,42 +2954,34 @@
 	 * kthreads
 	 */
 	filemap_write_and_wait(fs_info->btree_inode->i_mapping);
-	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
 fail_block_groups:
+	btrfs_put_block_group_cache(fs_info);
 	btrfs_free_block_groups(fs_info);
 
 fail_tree_roots:
 	free_root_pointers(fs_info, 1);
+	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 
 fail_sb_buffer:
-	btrfs_stop_workers(&fs_info->generic_worker);
-	btrfs_stop_workers(&fs_info->readahead_workers);
-	btrfs_stop_workers(&fs_info->fixup_workers);
-	btrfs_stop_workers(&fs_info->delalloc_workers);
-	btrfs_stop_workers(&fs_info->workers);
-	btrfs_stop_workers(&fs_info->endio_workers);
-	btrfs_stop_workers(&fs_info->endio_meta_workers);
-	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
-	btrfs_stop_workers(&fs_info->endio_write_workers);
-	btrfs_stop_workers(&fs_info->endio_freespace_worker);
-	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->delayed_workers);
-	btrfs_stop_workers(&fs_info->caching_workers);
+	btrfs_stop_all_workers(fs_info);
 fail_alloc:
 fail_iput:
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
-	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
+fail_delalloc_bytes:
+	percpu_counter_destroy(&fs_info->delalloc_bytes);
+fail_dirty_metadata_bytes:
+	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
 fail_bdi:
 	bdi_destroy(&fs_info->bdi);
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
+	btrfs_free_stripe_hash_table(fs_info);
 	btrfs_close_devices(fs_info->fs_devices);
-	free_fs_info(fs_info);
-	return ERR_PTR(err);
+	return err;
 
 recovery_tree_root:
 	if (!btrfs_test_opt(tree_root, RECOVERY))
@@ -2503,18 +3004,20 @@
 
 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
 {
-	char b[BDEVNAME_SIZE];
-
 	if (uptodate) {
 		set_buffer_uptodate(bh);
 	} else {
-		printk_ratelimited(KERN_WARNING "lost page write due to "
-					"I/O error on %s\n",
-				       bdevname(bh->b_bdev, b));
+		struct btrfs_device *device = (struct btrfs_device *)
+			bh->b_private;
+
+		printk_ratelimited_in_rcu(KERN_WARNING "lost page write due to "
+					  "I/O error on %s\n",
+					  rcu_str_deref(device->name));
 		/* note, we dont' set_buffer_write_io_error because we have
 		 * our own ways of dealing with the IO errors
 		 */
 		clear_buffer_uptodate(bh);
+		btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_WRITE_ERRS);
 	}
 	unlock_buffer(bh);
 	put_bh(bh);
@@ -2536,16 +3039,17 @@
 	 */
 	for (i = 0; i < 1; i++) {
 		bytenr = btrfs_sb_offset(i);
-		if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+					i_size_read(bdev->bd_inode))
 			break;
-		bh = __bread(bdev, bytenr / 4096, 4096);
+		bh = __bread(bdev, bytenr / 4096,
+					BTRFS_SUPER_INFO_SIZE);
 		if (!bh)
 			continue;
 
 		super = (struct btrfs_super_block *)bh->b_data;
 		if (btrfs_super_bytenr(super) != bytenr ||
-		    strncmp((char *)(&super->magic), BTRFS_MAGIC,
-			    sizeof(super->magic))) {
+		    btrfs_super_magic(super) != BTRFS_MAGIC) {
 			brelse(bh);
 			continue;
 		}
@@ -2594,7 +3098,10 @@
 		if (wait) {
 			bh = __find_get_block(device->bdev, bytenr / 4096,
 					      BTRFS_SUPER_INFO_SIZE);
-			BUG_ON(!bh);
+			if (!bh) {
+				errors++;
+				continue;
+			}
 			wait_on_buffer(bh);
 			if (!buffer_uptodate(bh))
 				errors++;
@@ -2609,7 +3116,7 @@
 			btrfs_set_super_bytenr(sb, bytenr);
 
 			crc = ~(u32)0;
-			crc = btrfs_csum_data(NULL, (char *)sb +
+			crc = btrfs_csum_data((char *)sb +
 					      BTRFS_CSUM_SIZE, crc,
 					      BTRFS_SUPER_INFO_SIZE -
 					      BTRFS_CSUM_SIZE);
@@ -2621,6 +3128,13 @@
 			 */
 			bh = __getblk(device->bdev, bytenr / 4096,
 				      BTRFS_SUPER_INFO_SIZE);
+			if (!bh) {
+				printk(KERN_ERR "btrfs: couldn't get super "
+				       "buffer head for bytenr %Lu\n", bytenr);
+				errors++;
+				continue;
+			}
+
 			memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
 
 			/* one reference for submit_bh */
@@ -2629,13 +3143,14 @@
 			set_buffer_uptodate(bh);
 			lock_buffer(bh);
 			bh->b_end_io = btrfs_end_buffer_write_sync;
+			bh->b_private = device;
 		}
 
 		/*
 		 * we fua the first super.  The others we allow
 		 * to go down lazy.
 		 */
-		ret = submit_bh(WRITE_FUA, bh);
+		ret = btrfsic_submit_bh(WRITE_FUA, bh);
 		if (ret)
 			errors++;
 	}
@@ -2681,12 +3196,13 @@
 		wait_for_completion(&device->flush_wait);
 
 		if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
-			printk("btrfs: disabling barriers on dev %s\n",
-			       device->name);
+			printk_in_rcu("btrfs: disabling barriers on dev %s\n",
+				      rcu_str_deref(device->name));
 			device->nobarriers = 1;
-		}
-		if (!bio_flagged(bio, BIO_UPTODATE)) {
+		} else if (!bio_flagged(bio, BIO_UPTODATE)) {
 			ret = -EIO;
+			btrfs_dev_stat_inc_and_print(device,
+				BTRFS_DEV_STAT_FLUSH_ERRS);
 		}
 
 		/* drop the reference from the wait == 0 run */
@@ -2700,8 +3216,8 @@
 	 * one reference for us, and we leave it for the
 	 * caller
 	 */
-	device->flush_bio = NULL;;
-	bio = bio_alloc(GFP_NOFS, 0);
+	device->flush_bio = NULL;
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
 	if (!bio)
 		return -ENOMEM;
 
@@ -2712,7 +3228,7 @@
 	device->flush_bio = bio;
 
 	bio_get(bio);
-	submit_bio(WRITE_FLUSH, bio);
+	btrfsic_submit_bio(WRITE_FLUSH, bio);
 
 	return 0;
 }
@@ -2725,14 +3241,15 @@
 {
 	struct list_head *head;
 	struct btrfs_device *dev;
-	int errors = 0;
+	int errors_send = 0;
+	int errors_wait = 0;
 	int ret;
 
 	/* send down all the barriers */
 	head = &info->fs_devices->devices;
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
-			errors++;
+			errors_send++;
 			continue;
 		}
 		if (!dev->in_fs_metadata || !dev->writeable)
@@ -2740,13 +3257,13 @@
 
 		ret = write_dev_flush(dev, 0);
 		if (ret)
-			errors++;
+			errors_send++;
 	}
 
 	/* wait for all the barriers */
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
-			errors++;
+			errors_wait++;
 			continue;
 		}
 		if (!dev->in_fs_metadata || !dev->writeable)
@@ -2754,14 +3271,93 @@
 
 		ret = write_dev_flush(dev, 1);
 		if (ret)
-			errors++;
+			errors_wait++;
 	}
-	if (errors)
+	if (errors_send > info->num_tolerated_disk_barrier_failures ||
+	    errors_wait > info->num_tolerated_disk_barrier_failures)
 		return -EIO;
 	return 0;
 }
 
-int write_all_supers(struct btrfs_root *root, int max_mirrors)
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+	struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_ioctl_space_info space;
+	struct btrfs_space_info *sinfo;
+	u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
+		       BTRFS_BLOCK_GROUP_SYSTEM,
+		       BTRFS_BLOCK_GROUP_METADATA,
+		       BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
+	int num_types = 4;
+	int i;
+	int c;
+	int num_tolerated_disk_barrier_failures =
+		(int)fs_info->fs_devices->num_devices;
+
+	for (i = 0; i < num_types; i++) {
+		struct btrfs_space_info *tmp;
+
+		sinfo = NULL;
+		rcu_read_lock();
+		list_for_each_entry_rcu(tmp, &fs_info->space_info, list) {
+			if (tmp->flags == types[i]) {
+				sinfo = tmp;
+				break;
+			}
+		}
+		rcu_read_unlock();
+
+		if (!sinfo)
+			continue;
+
+		down_read(&sinfo->groups_sem);
+		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
+			if (!list_empty(&sinfo->block_groups[c])) {
+				u64 flags;
+
+				btrfs_get_block_group_info(
+					&sinfo->block_groups[c], &space);
+				if (space.total_bytes == 0 ||
+				    space.used_bytes == 0)
+					continue;
+				flags = space.flags;
+				/*
+				 * return
+				 * 0: if dup, single or RAID0 is configured for
+				 *    any of metadata, system or data, else
+				 * 1: if RAID5 is configured, or if RAID1 or
+				 *    RAID10 is configured and only two mirrors
+				 *    are used, else
+				 * 2: if RAID6 is configured, else
+				 * num_mirrors - 1: if RAID1 or RAID10 is
+				 *                  configured and more than
+				 *                  2 mirrors are used.
+				 */
+				if (num_tolerated_disk_barrier_failures > 0 &&
+				    ((flags & (BTRFS_BLOCK_GROUP_DUP |
+					       BTRFS_BLOCK_GROUP_RAID0)) ||
+				     ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK)
+				      == 0)))
+					num_tolerated_disk_barrier_failures = 0;
+				else if (num_tolerated_disk_barrier_failures > 1) {
+					if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
+					    BTRFS_BLOCK_GROUP_RAID5 |
+					    BTRFS_BLOCK_GROUP_RAID10)) {
+						num_tolerated_disk_barrier_failures = 1;
+					} else if (flags &
+						   BTRFS_BLOCK_GROUP_RAID6) {
+						num_tolerated_disk_barrier_failures = 2;
+					}
+				}
+			}
+		}
+		up_read(&sinfo->groups_sem);
+	}
+
+	return num_tolerated_disk_barrier_failures;
+}
+
+static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
 	struct list_head *head;
 	struct btrfs_device *dev;
@@ -2773,7 +3369,6 @@
 	int total_errors = 0;
 	u64 flags;
 
-	max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
 	do_barriers = !btrfs_test_opt(root, NOBARRIER);
 	backup_super_roots(root->fs_info);
 
@@ -2782,9 +3377,18 @@
 
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	head = &root->fs_info->fs_devices->devices;
+	max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
 
-	if (do_barriers)
-		barrier_all_devices(root->fs_info);
+	if (do_barriers) {
+		ret = barrier_all_devices(root->fs_info);
+		if (ret) {
+			mutex_unlock(
+				&root->fs_info->fs_devices->device_list_mutex);
+			btrfs_error(root->fs_info, ret,
+				    "errors while submitting device barriers.");
+			return ret;
+		}
+	}
 
 	list_for_each_entry_rcu(dev, head, dev_list) {
 		if (!dev->bdev) {
@@ -2815,7 +3419,12 @@
 	if (total_errors > max_errors) {
 		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
 		       total_errors);
-		BUG();
+		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+		/* FUA is masked off if unsupported and can't be the reason */
+		btrfs_error(root->fs_info, -EIO,
+			    "%d errors while writing supers", total_errors);
+		return -EIO;
 	}
 
 	total_errors = 0;
@@ -2831,9 +3440,9 @@
 	}
 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 	if (total_errors > max_errors) {
-		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
-		       total_errors);
-		BUG();
+		btrfs_error(root->fs_info, -EIO,
+			    "%d errors while writing supers", total_errors);
+		return -EIO;
 	}
 	return 0;
 }
@@ -2847,7 +3456,9 @@
 	return ret;
 }
 
-int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
+/* Drop a fs root from the radix tree and free it. */
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+				  struct btrfs_root *root)
 {
 	spin_lock(&fs_info->fs_roots_radix_lock);
 	radix_tree_delete(&fs_info->fs_roots_radix,
@@ -2857,16 +3468,22 @@
 	if (btrfs_root_refs(&root->root_item) == 0)
 		synchronize_srcu(&fs_info->subvol_srcu);
 
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+		btrfs_free_log(NULL, root);
+		btrfs_free_log_root_tree(NULL, fs_info);
+	}
+
 	__btrfs_remove_free_space_cache(root->free_ino_pinned);
 	__btrfs_remove_free_space_cache(root->free_ino_ctl);
 	free_fs_root(root);
-	return 0;
 }
 
 static void free_fs_root(struct btrfs_root *root)
 {
 	iput(root->cache_inode);
 	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+	btrfs_free_block_rsv(root, root->orphan_block_rsv);
+	root->orphan_block_rsv = NULL;
 	if (root->anon_dev)
 		free_anon_bdev(root->anon_dev);
 	free_extent_buffer(root->node);
@@ -2874,39 +3491,12 @@
 	kfree(root->free_ino_ctl);
 	kfree(root->free_ino_pinned);
 	kfree(root->name);
-	kfree(root);
+	btrfs_put_fs_root(root);
 }
 
-static int del_fs_roots(struct btrfs_fs_info *fs_info)
+void btrfs_free_fs_root(struct btrfs_root *root)
 {
-	int ret;
-	struct btrfs_root *gang[8];
-	int i;
-
-	while (!list_empty(&fs_info->dead_roots)) {
-		gang[0] = list_entry(fs_info->dead_roots.next,
-				     struct btrfs_root, root_list);
-		list_del(&gang[0]->root_list);
-
-		if (gang[0]->in_radix) {
-			btrfs_free_fs_root(fs_info, gang[0]);
-		} else {
-			free_extent_buffer(gang[0]->node);
-			free_extent_buffer(gang[0]->commit_root);
-			kfree(gang[0]);
-		}
-	}
-
-	while (1) {
-		ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
-					     (void **)gang, 0,
-					     ARRAY_SIZE(gang));
-		if (!ret)
-			break;
-		for (i = 0; i < ret; i++)
-			btrfs_free_fs_root(fs_info, gang[i]);
-	}
-	return 0;
+	free_fs_root(root);
 }
 
 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2944,8 +3534,8 @@
 
 	mutex_lock(&root->fs_info->cleaner_mutex);
 	btrfs_run_delayed_iputs(root);
-	btrfs_clean_old_snapshots(root);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
+	wake_up_process(root->fs_info->cleaner_kthread);
 
 	/* wait until ongoing cleanup work done */
 	down_write(&root->fs_info->cleanup_work_sem);
@@ -2955,14 +3545,21 @@
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 	ret = btrfs_commit_transaction(trans, root);
-	BUG_ON(ret);
+	if (ret)
+		return ret;
 	/* run commit again to drop the original snapshot */
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
-	btrfs_commit_transaction(trans, root);
+	ret = btrfs_commit_transaction(trans, root);
+	if (ret)
+		return ret;
 	ret = btrfs_write_and_wait_transaction(NULL, root);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_error(root->fs_info, ret,
+			    "Failed to sync btree inode to disk.");
+		return ret;
+	}
 
 	ret = write_ctree_super(NULL, root, 0);
 	return ret;
@@ -2976,258 +3573,171 @@
 	fs_info->closing = 1;
 	smp_mb();
 
-	btrfs_scrub_cancel(root);
+	/* wait for the uuid_scan task to finish */
+	down(&fs_info->uuid_tree_rescan_sem);
+	/* avoid complaints from lockdep et al.; set sem back to initial state */
+	up(&fs_info->uuid_tree_rescan_sem);
+
+	/* pause restriper - we want to resume on mount */
+	btrfs_pause_balance(fs_info);
+
+	btrfs_dev_replace_suspend_for_unmount(fs_info);
+
+	btrfs_scrub_cancel(fs_info);
 
 	/* wait for any defraggers to finish */
 	wait_event(fs_info->transaction_wait,
 		   (atomic_read(&fs_info->defrag_running) == 0));
 
 	/* clear out the rbtree of defraggable inodes */
-	btrfs_run_defrag_inodes(root->fs_info);
+	btrfs_cleanup_defrag_inodes(fs_info);
 
-	/*
-	 * Here come 2 situations when btrfs is broken to flip readonly:
-	 *
-	 * 1. when btrfs flips readonly somewhere else before
-	 * btrfs_commit_super, sb->s_flags has MS_RDONLY flag,
-	 * and btrfs will skip to write sb directly to keep
-	 * ERROR state on disk.
-	 *
-	 * 2. when btrfs flips readonly just in btrfs_commit_super,
-	 * and in such case, btrfs cannot write sb via btrfs_commit_super,
-	 * and since fs_state has been set BTRFS_SUPER_FLAG_ERROR flag,
-	 * btrfs will cleanup all FS resources first and write sb then.
-	 */
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
 		if (ret)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
-	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
-		ret = btrfs_error_commit_super(root);
-		if (ret)
-			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
-	}
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
+		btrfs_error_commit_super(root);
 
 	btrfs_put_block_group_cache(fs_info);
 
-	kthread_stop(root->fs_info->transaction_kthread);
-	kthread_stop(root->fs_info->cleaner_kthread);
+	kthread_stop(fs_info->transaction_kthread);
+	kthread_stop(fs_info->cleaner_kthread);
 
 	fs_info->closing = 2;
 	smp_mb();
 
-	if (fs_info->delalloc_bytes) {
-		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
-		       (unsigned long long)fs_info->delalloc_bytes);
-	}
-	if (fs_info->total_ref_cache_size) {
-		printk(KERN_INFO "btrfs: at umount reference cache size %llu\n",
-		       (unsigned long long)fs_info->total_ref_cache_size);
-	}
-
-	free_extent_buffer(fs_info->extent_root->node);
-	free_extent_buffer(fs_info->extent_root->commit_root);
-	free_extent_buffer(fs_info->tree_root->node);
-	free_extent_buffer(fs_info->tree_root->commit_root);
-	free_extent_buffer(root->fs_info->chunk_root->node);
-	free_extent_buffer(root->fs_info->chunk_root->commit_root);
-	free_extent_buffer(root->fs_info->dev_root->node);
-	free_extent_buffer(root->fs_info->dev_root->commit_root);
-	free_extent_buffer(root->fs_info->csum_root->node);
-	free_extent_buffer(root->fs_info->csum_root->commit_root);
+	btrfs_free_qgroup_config(root->fs_info);
 
-	btrfs_free_block_groups(root->fs_info);
+	if (percpu_counter_sum(&fs_info->delalloc_bytes)) {
+		printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n",
+		       percpu_counter_sum(&fs_info->delalloc_bytes));
+	}
+
+	btrfs_free_block_groups(fs_info);
+
+	btrfs_stop_all_workers(fs_info);
 
 	del_fs_roots(fs_info);
 
+	free_root_pointers(fs_info, 1);
+
 	iput(fs_info->btree_inode);
 
-	btrfs_stop_workers(&fs_info->generic_worker);
-	btrfs_stop_workers(&fs_info->fixup_workers);
-	btrfs_stop_workers(&fs_info->delalloc_workers);
-	btrfs_stop_workers(&fs_info->workers);
-	btrfs_stop_workers(&fs_info->endio_workers);
-	btrfs_stop_workers(&fs_info->endio_meta_workers);
-	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
-	btrfs_stop_workers(&fs_info->endio_write_workers);
-	btrfs_stop_workers(&fs_info->endio_freespace_worker);
-	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->delayed_workers);
-	btrfs_stop_workers(&fs_info->caching_workers);
-	btrfs_stop_workers(&fs_info->readahead_workers);
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	if (btrfs_test_opt(root, CHECK_INTEGRITY))
+		btrfsic_unmount(root, fs_info->fs_devices);
+#endif
 
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
 
+	percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
+	percpu_counter_destroy(&fs_info->delalloc_bytes);
 	bdi_destroy(&fs_info->bdi);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
-	free_fs_info(fs_info);
+	btrfs_free_stripe_hash_table(fs_info);
+
+	btrfs_free_block_rsv(root, root->orphan_block_rsv);
+	root->orphan_block_rsv = NULL;
 
 	return 0;
 }
 
-int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+			  int atomic)
 {
 	int ret;
-	struct inode *btree_inode = buf->first_page->mapping->host;
+	struct inode *btree_inode = buf->pages[0]->mapping->host;
 
-	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
-				     NULL);
+	ret = extent_buffer_uptodate(buf);
 	if (!ret)
 		return ret;
 
 	ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf,
-				    parent_transid);
+				    parent_transid, atomic);
+	if (ret == -EAGAIN)
+		return ret;
 	return !ret;
 }
 
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
 {
-	struct inode *btree_inode = buf->first_page->mapping->host;
-	return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree,
-					  buf);
+	return set_extent_buffer_uptodate(buf);
 }
 
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 {
-	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
+	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
 	u64 transid = btrfs_header_generation(buf);
-	struct inode *btree_inode = root->fs_info->btree_inode;
 	int was_dirty;
 
 	btrfs_assert_tree_locked(buf);
-	if (transid != root->fs_info->generation) {
-		printk(KERN_CRIT "btrfs transid mismatch buffer %llu, "
+	if (transid != root->fs_info->generation)
+		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
 		       "found %llu running %llu\n",
-			(unsigned long long)buf->start,
-			(unsigned long long)transid,
-			(unsigned long long)root->fs_info->generation);
-		WARN_ON(1);
-	}
-	was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
-					    buf);
-	if (!was_dirty) {
-		spin_lock(&root->fs_info->delalloc_lock);
-		root->fs_info->dirty_metadata_bytes += buf->len;
-		spin_unlock(&root->fs_info->delalloc_lock);
-	}
+			buf->start, transid, root->fs_info->generation);
+	was_dirty = set_extent_buffer_dirty(buf);
+	if (!was_dirty)
+		__percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
+				     buf->len,
+				     root->fs_info->dirty_metadata_batch);
 }
 
-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
+					int flush_delayed)
 {
 	/*
 	 * looks as though older kernels can get into trouble with
 	 * this code, they end up stuck in balance_dirty_pages forever
 	 */
-	u64 num_dirty;
-	unsigned long thresh = 32 * 1024 * 1024;
+	int ret;
 
 	if (current->flags & PF_MEMALLOC)
 		return;
 
-	btrfs_balance_delayed_items(root);
+	if (flush_delayed)
+		btrfs_balance_delayed_items(root);
 
-	num_dirty = root->fs_info->dirty_metadata_bytes;
-
-	if (num_dirty > thresh) {
-		balance_dirty_pages_ratelimited_nr(
-				   root->fs_info->btree_inode->i_mapping, 1);
+	ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes,
+				     BTRFS_DIRTY_METADATA_THRESH);
+	if (ret > 0) {
+		balance_dirty_pages_ratelimited(
+				   root->fs_info->btree_inode->i_mapping);
 	}
 	return;
 }
 
-void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
+void btrfs_btree_balance_dirty(struct btrfs_root *root)
 {
-	/*
-	 * looks as though older kernels can get into trouble with
-	 * this code, they end up stuck in balance_dirty_pages forever
-	 */
-	u64 num_dirty;
-	unsigned long thresh = 32 * 1024 * 1024;
-
-	if (current->flags & PF_MEMALLOC)
-		return;
-
-	num_dirty = root->fs_info->dirty_metadata_bytes;
-
-	if (num_dirty > thresh) {
-		balance_dirty_pages_ratelimited_nr(
-				   root->fs_info->btree_inode->i_mapping, 1);
-	}
-	return;
+	__btrfs_btree_balance_dirty(root, 1);
 }
 
-int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root)
 {
-	struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
-	int ret;
-	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
-	if (ret == 0)
-		set_bit(EXTENT_BUFFER_UPTODATE, &buf->bflags);
-	return ret;
+	__btrfs_btree_balance_dirty(root, 0);
 }
 
-static int btree_lock_page_hook(struct page *page, void *data,
-				void (*flush_fn)(void *))
+int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 {
-	struct inode *inode = page->mapping->host;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct extent_buffer *eb;
-	unsigned long len;
-	u64 bytenr = page_offset(page);
-
-	if (page->private == EXTENT_PAGE_PRIVATE)
-		goto out;
-
-	len = page->private >> 2;
-	eb = find_extent_buffer(io_tree, bytenr, len);
-	if (!eb)
-		goto out;
-
-	if (!btrfs_try_tree_write_lock(eb)) {
-		flush_fn(data);
-		btrfs_tree_lock(eb);
-	}
-	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-
-	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-		spin_lock(&root->fs_info->delalloc_lock);
-		if (root->fs_info->dirty_metadata_bytes >= eb->len)
-			root->fs_info->dirty_metadata_bytes -= eb->len;
-		else
-			WARN_ON(1);
-		spin_unlock(&root->fs_info->delalloc_lock);
-	}
-
-	btrfs_tree_unlock(eb);
-	free_extent_buffer(eb);
-out:
-	if (!trylock_page(page)) {
-		flush_fn(data);
-		lock_page(page);
-	}
-	return 0;
+	struct btrfs_root *root = BTRFS_I(buf->pages[0]->mapping->host)->root;
+	return btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
 }
 
-static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
+static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 			      int read_only)
 {
-	if (read_only)
-		return;
-
-	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
-		printk(KERN_WARNING "warning: mount fs with errors, "
-		       "running btrfsck is recommended\n");
+	/*
+	 * Placeholder for checks
+	 */
+	return 0;
 }
 
-int btrfs_error_commit_super(struct btrfs_root *root)
+static void btrfs_error_commit_super(struct btrfs_root *root)
 {
-	int ret;
-
 	mutex_lock(&root->fs_info->cleaner_mutex);
 	btrfs_run_delayed_iputs(root);
 	mutex_unlock(&root->fs_info->cleaner_mutex);
@@ -3237,13 +3747,10 @@
 
 	/* cleanup FS via transaction */
 	btrfs_cleanup_transaction(root);
-
-	ret = write_ctree_super(NULL, root, 0);
-
-	return ret;
 }
 
-static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
+static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t,
+					     struct btrfs_root *root)
 {
 	struct btrfs_inode *btrfs_inode;
 	struct list_head splice;
@@ -3251,58 +3758,59 @@
 	INIT_LIST_HEAD(&splice);
 
 	mutex_lock(&root->fs_info->ordered_operations_mutex);
-	spin_lock(&root->fs_info->ordered_extent_lock);
+	spin_lock(&root->fs_info->ordered_root_lock);
 
-	list_splice_init(&root->fs_info->ordered_operations, &splice);
+	list_splice_init(&t->ordered_operations, &splice);
 	while (!list_empty(&splice)) {
 		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
 					 ordered_operations);
 
 		list_del_init(&btrfs_inode->ordered_operations);
+		spin_unlock(&root->fs_info->ordered_root_lock);
 
 		btrfs_invalidate_inodes(btrfs_inode->root);
+
+		spin_lock(&root->fs_info->ordered_root_lock);
 	}
 
-	spin_unlock(&root->fs_info->ordered_extent_lock);
+	spin_unlock(&root->fs_info->ordered_root_lock);
 	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+}
 
-	return 0;
+static void btrfs_destroy_ordered_extents(struct btrfs_root *root)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	spin_lock(&root->ordered_extent_lock);
+	/*
+	 * This will just short circuit the ordered completion stuff which will
+	 * make sure the ordered extent gets properly cleaned up.
+	 */
+	list_for_each_entry(ordered, &root->ordered_extents,
+			    root_extent_list)
+		set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
+	spin_unlock(&root->ordered_extent_lock);
 }
 
-static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
+static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
 {
+	struct btrfs_root *root;
 	struct list_head splice;
-	struct btrfs_ordered_extent *ordered;
-	struct inode *inode;
 
 	INIT_LIST_HEAD(&splice);
 
-	spin_lock(&root->fs_info->ordered_extent_lock);
-
-	list_splice_init(&root->fs_info->ordered_extents, &splice);
+	spin_lock(&fs_info->ordered_root_lock);
+	list_splice_init(&fs_info->ordered_roots, &splice);
 	while (!list_empty(&splice)) {
-		ordered = list_entry(splice.next, struct btrfs_ordered_extent,
-				     root_extent_list);
-
-		list_del_init(&ordered->root_extent_list);
-		atomic_inc(&ordered->refs);
-
-		/* the inode may be getting freed (in sys_unlink path). */
-		inode = igrab(ordered->inode);
-
-		spin_unlock(&root->fs_info->ordered_extent_lock);
-		if (inode)
-			iput(inode);
+		root = list_first_entry(&splice, struct btrfs_root,
+					ordered_root);
+		list_del_init(&root->ordered_root);
 
-		atomic_set(&ordered->refs, 1);
-		btrfs_put_ordered_extent(ordered);
+		btrfs_destroy_ordered_extents(root);
 
-		spin_lock(&root->fs_info->ordered_extent_lock);
+		cond_resched_lock(&fs_info->ordered_root_lock);
 	}
-
-	spin_unlock(&root->fs_info->ordered_extent_lock);
-
-	return 0;
+	spin_unlock(&fs_info->ordered_root_lock);
 }
 
 static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
@@ -3322,30 +3830,47 @@
 		return ret;
 	}
 
-	node = rb_first(&delayed_refs->root);
-	while (node) {
-		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-		node = rb_next(node);
-
-		ref->in_tree = 0;
-		rb_erase(&ref->rb_node, &delayed_refs->root);
-		delayed_refs->num_entries--;
+	while ((node = rb_first(&delayed_refs->root)) != NULL) {
+		struct btrfs_delayed_ref_head *head = NULL;
+		bool pin_bytes = false;
 
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 		atomic_set(&ref->refs, 1);
 		if (btrfs_delayed_ref_is_head(ref)) {
-			struct btrfs_delayed_ref_head *head;
 
 			head = btrfs_delayed_node_to_head(ref);
-			mutex_lock(&head->mutex);
-			kfree(head->extent_op);
+			if (!mutex_trylock(&head->mutex)) {
+				atomic_inc(&ref->refs);
+				spin_unlock(&delayed_refs->lock);
+
+				/* Need to wait for the delayed ref to run */
+				mutex_lock(&head->mutex);
+				mutex_unlock(&head->mutex);
+				btrfs_put_delayed_ref(ref);
+
+				spin_lock(&delayed_refs->lock);
+				continue;
+			}
+
+			if (head->must_insert_reserved)
+				pin_bytes = true;
+			btrfs_free_delayed_extent_op(head->extent_op);
 			delayed_refs->num_heads--;
 			if (list_empty(&head->cluster))
 				delayed_refs->num_heads_ready--;
 			list_del_init(&head->cluster);
-			mutex_unlock(&head->mutex);
 		}
 
+		ref->in_tree = 0;
+		rb_erase(&ref->rb_node, &delayed_refs->root);
+		delayed_refs->num_entries--;
 		spin_unlock(&delayed_refs->lock);
+		if (head) {
+			if (pin_bytes)
+				btrfs_pin_extent(root, ref->bytenr,
+						 ref->num_bytes, 1);
+			mutex_unlock(&head->mutex);
+		}
 		btrfs_put_delayed_ref(ref);
 
 		cond_resched();
@@ -3357,7 +3882,7 @@
 	return ret;
 }
 
-static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
+static void btrfs_evict_pending_snapshots(struct btrfs_transaction *t)
 {
 	struct btrfs_pending_snapshot *snapshot;
 	struct list_head splice;
@@ -3370,37 +3895,61 @@
 		snapshot = list_entry(splice.next,
 				      struct btrfs_pending_snapshot,
 				      list);
-
+		snapshot->error = -ECANCELED;
 		list_del_init(&snapshot->list);
-
-		kfree(snapshot);
 	}
-
-	return 0;
 }
 
-static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
+static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
 {
 	struct btrfs_inode *btrfs_inode;
 	struct list_head splice;
 
 	INIT_LIST_HEAD(&splice);
 
-	spin_lock(&root->fs_info->delalloc_lock);
-	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+	spin_lock(&root->delalloc_lock);
+	list_splice_init(&root->delalloc_inodes, &splice);
 
 	while (!list_empty(&splice)) {
-		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
-				    delalloc_inodes);
+		btrfs_inode = list_first_entry(&splice, struct btrfs_inode,
+					       delalloc_inodes);
 
 		list_del_init(&btrfs_inode->delalloc_inodes);
+		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			  &btrfs_inode->runtime_flags);
+		spin_unlock(&root->delalloc_lock);
 
 		btrfs_invalidate_inodes(btrfs_inode->root);
+
+		spin_lock(&root->delalloc_lock);
 	}
 
-	spin_unlock(&root->fs_info->delalloc_lock);
+	spin_unlock(&root->delalloc_lock);
+}
 
-	return 0;
+static void btrfs_destroy_all_delalloc_inodes(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *root;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&fs_info->delalloc_root_lock);
+	list_splice_init(&fs_info->delalloc_roots, &splice);
+	while (!list_empty(&splice)) {
+		root = list_first_entry(&splice, struct btrfs_root,
+					 delalloc_root);
+		list_del_init(&root->delalloc_root);
+		root = btrfs_grab_fs_root(root);
+		BUG_ON(!root);
+		spin_unlock(&fs_info->delalloc_root_lock);
+
+		btrfs_destroy_delalloc_inodes(root);
+		btrfs_put_fs_root(root);
+
+		spin_lock(&fs_info->delalloc_root_lock);
+	}
+	spin_unlock(&fs_info->delalloc_root_lock);
 }
 
 static int btrfs_destroy_marked_extents(struct btrfs_root *root,
@@ -3408,54 +3957,29 @@
 					int mark)
 {
 	int ret;
-	struct page *page;
-	struct inode *btree_inode = root->fs_info->btree_inode;
 	struct extent_buffer *eb;
 	u64 start = 0;
 	u64 end;
-	u64 offset;
-	unsigned long index;
 
 	while (1) {
 		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
-					    mark);
+					    mark, NULL);
 		if (ret)
 			break;
 
 		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
 		while (start <= end) {
-			index = start >> PAGE_CACHE_SHIFT;
-			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
-			page = find_get_page(btree_inode->i_mapping, index);
-			if (!page)
+			eb = btrfs_find_tree_block(root, start,
+						   root->leafsize);
+			start += root->leafsize;
+			if (!eb)
 				continue;
-			offset = page_offset(page);
-
-			spin_lock(&dirty_pages->buffer_lock);
-			eb = radix_tree_lookup(
-			     &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
-					       offset >> PAGE_CACHE_SHIFT);
-			spin_unlock(&dirty_pages->buffer_lock);
-			if (eb) {
-				ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
-							 &eb->bflags);
-				atomic_set(&eb->refs, 1);
-			}
-			if (PageWriteback(page))
-				end_page_writeback(page);
-
-			lock_page(page);
-			if (PageDirty(page)) {
-				clear_page_dirty_for_io(page);
-				spin_lock_irq(&page->mapping->tree_lock);
-				radix_tree_tag_clear(&page->mapping->page_tree,
-							page_index(page),
-							PAGECACHE_TAG_DIRTY);
-				spin_unlock_irq(&page->mapping->tree_lock);
-			}
+			wait_on_extent_buffer_writeback(eb);
 
-			page->mapping->a_ops->invalidatepage(page, 0);
-			unlock_page(page);
+			if (test_and_clear_bit(EXTENT_BUFFER_DIRTY,
+					       &eb->bflags))
+				clear_extent_buffer_dirty(eb);
+			free_extent_buffer_stale(eb);
 		}
 	}
 
@@ -3469,11 +3993,13 @@
 	u64 start;
 	u64 end;
 	int ret;
+	bool loop = true;
 
 	unpin = pinned_extents;
+again:
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 		if (ret)
 			break;
 
@@ -3488,59 +4014,92 @@
 		cond_resched();
 	}
 
+	if (loop) {
+		if (unpin == &root->fs_info->freed_extents[0])
+			unpin = &root->fs_info->freed_extents[1];
+		else
+			unpin = &root->fs_info->freed_extents[0];
+		loop = false;
+		goto again;
+	}
+
 	return 0;
 }
 
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
+				   struct btrfs_root *root)
+{
+	btrfs_destroy_delayed_refs(cur_trans, root);
+	btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
+				cur_trans->dirty_pages.dirty_bytes);
+
+	cur_trans->state = TRANS_STATE_COMMIT_START;
+	wake_up(&root->fs_info->transaction_blocked_wait);
+
+	btrfs_evict_pending_snapshots(cur_trans);
+
+	cur_trans->state = TRANS_STATE_UNBLOCKED;
+	wake_up(&root->fs_info->transaction_wait);
+
+	btrfs_destroy_delayed_inodes(root);
+	btrfs_assert_delayed_root_empty(root);
+
+	btrfs_destroy_marked_extents(root, &cur_trans->dirty_pages,
+				     EXTENT_DIRTY);
+	btrfs_destroy_pinned_extent(root,
+				    root->fs_info->pinned_extents);
+
+	cur_trans->state = TRANS_STATE_COMPLETED;
+	wake_up(&cur_trans->commit_wait);
+
+	/*
+	memset(cur_trans, 0, sizeof(*cur_trans));
+	kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+	*/
+}
+
 static int btrfs_cleanup_transaction(struct btrfs_root *root)
 {
 	struct btrfs_transaction *t;
 	LIST_HEAD(list);
 
-	WARN_ON(1);
-
 	mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
 	spin_lock(&root->fs_info->trans_lock);
 	list_splice_init(&root->fs_info->trans_list, &list);
-	root->fs_info->trans_no_join = 1;
+	root->fs_info->running_transaction = NULL;
 	spin_unlock(&root->fs_info->trans_lock);
 
 	while (!list_empty(&list)) {
 		t = list_entry(list.next, struct btrfs_transaction, list);
-		if (!t)
-			break;
 
-		btrfs_destroy_ordered_operations(root);
+		btrfs_destroy_ordered_operations(t, root);
 
-		btrfs_destroy_ordered_extents(root);
+		btrfs_destroy_all_ordered_extents(root->fs_info);
 
 		btrfs_destroy_delayed_refs(t, root);
 
-		btrfs_block_rsv_release(root,
-					&root->fs_info->trans_block_rsv,
-					t->dirty_pages.dirty_bytes);
-
-		/* FIXME: cleanup wait for commit */
-		t->in_commit = 1;
-		t->blocked = 1;
+		/*
+		 *  FIXME: cleanup wait for commit
+		 *  We needn't acquire the lock here, because we are in the
+		 *  middle of unmount and no other task will change it.
+		 */
+		t->state = TRANS_STATE_COMMIT_START;
+		smp_mb();
 		if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
 			wake_up(&root->fs_info->transaction_blocked_wait);
 
-		t->blocked = 0;
+		btrfs_evict_pending_snapshots(t);
+
+		t->state = TRANS_STATE_UNBLOCKED;
+		smp_mb();
 		if (waitqueue_active(&root->fs_info->transaction_wait))
 			wake_up(&root->fs_info->transaction_wait);
 
-		t->commit_done = 1;
-		if (waitqueue_active(&t->commit_wait))
-			wake_up(&t->commit_wait);
+		btrfs_destroy_delayed_inodes(root);
+		btrfs_assert_delayed_root_empty(root);
 
-		btrfs_destroy_pending_snapshots(t);
-
-		btrfs_destroy_delalloc_inodes(root);
-
-		spin_lock(&root->fs_info->trans_lock);
-		root->fs_info->running_transaction = NULL;
-		spin_unlock(&root->fs_info->trans_lock);
+		btrfs_destroy_all_delalloc_inodes(root->fs_info);
 
 		btrfs_destroy_marked_extents(root, &t->dirty_pages,
 					     EXTENT_DIRTY);
@@ -3548,22 +4107,23 @@
 		btrfs_destroy_pinned_extent(root,
 					    root->fs_info->pinned_extents);
 
+		t->state = TRANS_STATE_COMPLETED;
+		smp_mb();
+		if (waitqueue_active(&t->commit_wait))
+			wake_up(&t->commit_wait);
+
 		atomic_set(&t->use_count, 0);
 		list_del_init(&t->list);
 		memset(t, 0, sizeof(*t));
 		kmem_cache_free(btrfs_transaction_cachep, t);
 	}
 
-	spin_lock(&root->fs_info->trans_lock);
-	root->fs_info->trans_no_join = 0;
-	spin_unlock(&root->fs_info->trans_lock);
 	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 
 	return 0;
 }
 
 static struct extent_io_ops btree_extent_io_ops = {
-	.write_cache_pages_lock_hook = btree_lock_page_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
 	.readpage_io_failed_hook = btree_io_failed_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
diff -ur a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
--- a/fs/btrfs/disk-io.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/disk-io.h	2014-02-17 11:56:58.000000000 +0100
@@ -25,6 +25,13 @@
 #define BTRFS_SUPER_MIRROR_MAX	 3
 #define BTRFS_SUPER_MIRROR_SHIFT 12
 
+enum {
+	BTRFS_WQ_ENDIO_DATA = 0,
+	BTRFS_WQ_ENDIO_METADATA = 1,
+	BTRFS_WQ_ENDIO_FREE_SPACE = 2,
+	BTRFS_WQ_ENDIO_RAID56 = 3,
+};
+
 static inline u64 btrfs_sb_offset(int mirror)
 {
 	u64 start = 16 * 1024;
@@ -44,32 +51,67 @@
 			 int mirror_num, struct extent_buffer **eb);
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						   u64 bytenr, u32 blocksize);
-int clean_tree_block(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root, struct extent_buffer *buf);
-struct btrfs_root *open_ctree(struct super_block *sb,
-			      struct btrfs_fs_devices *fs_devices,
-			      char *options);
+void clean_tree_block(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *root, struct extent_buffer *buf);
+int open_ctree(struct super_block *sb,
+	       struct btrfs_fs_devices *fs_devices,
+	       char *options);
 int close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
 int btrfs_commit_super(struct btrfs_root *root);
-int btrfs_error_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
-struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
-					       struct btrfs_key *location);
-struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
-					      struct btrfs_key *location);
+struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root,
+				      struct btrfs_key *location);
+int btrfs_init_fs_root(struct btrfs_root *root);
+int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
+			 struct btrfs_root *root);
+
+struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
+				     struct btrfs_key *key,
+				     bool check_ref);
+static inline struct btrfs_root *
+btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
+			   struct btrfs_key *location)
+{
+	return btrfs_get_fs_root(fs_info, location, true);
+}
+
 int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
-void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
-void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
-int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
+void btrfs_btree_balance_dirty(struct btrfs_root *root);
+void btrfs_btree_balance_dirty_nodelay(struct btrfs_root *root);
+void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
+				 struct btrfs_root *root);
+void btrfs_free_fs_root(struct btrfs_root *root);
+
+/*
+ * This function is used to grab the root and keep it from being freed
+ * while we access it; it does not ensure that the tree is not dropped.
+ *
+ * If you want to ensure the whole tree is safe, you should use
+ * 	fs_info->subvol_srcu
+ */
+static inline struct btrfs_root *btrfs_grab_fs_root(struct btrfs_root *root)
+{
+	if (atomic_inc_not_zero(&root->refs))
+		return root;
+	return NULL;
+}
+
+static inline void btrfs_put_fs_root(struct btrfs_root *root)
+{
+	if (atomic_dec_and_test(&root->refs))
+		kfree(root);
+}
+
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
-int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
+int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
+			  int atomic);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
-u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
+u32 btrfs_csum_data(char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
 int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata);
@@ -85,6 +127,15 @@
 			     struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
+void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
+				  struct btrfs_root *root);
+struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
+				     struct btrfs_fs_info *fs_info,
+				     u64 objectid);
+int btree_lock_page_hook(struct page *page, void *data,
+				void (*flush_fn)(void *));
+int btrfs_calc_num_tolerated_disk_barrier_failures(
+	struct btrfs_fs_info *fs_info);
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
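Illustrative sketch (not part of this patch): btrfs_grab_fs_root()/btrfs_put_fs_root() above form a plain refcount guard around struct btrfs_root. Below is a hypothetical caller mirroring the pattern btrfs_destroy_all_delalloc_inodes() uses in the disk-io.c hunks above; demo_use_root() and its early return are invented, while the lock name is the one shown in those hunks.

static void demo_use_root(struct btrfs_fs_info *fs_info,
			  struct btrfs_root *root)
{
	/* take a reference while still holding the lock; NULL means the
	 * root is already on its way to being freed */
	root = btrfs_grab_fs_root(root);
	if (!root)
		return;
	spin_unlock(&fs_info->delalloc_root_lock);

	/* root cannot be kfree()d here, but the subvolume tree itself may
	 * still be dropped -- use fs_info->subvol_srcu if that matters */

	btrfs_put_fs_root(root);	/* the last put frees the root */
	spin_lock(&fs_info->delalloc_root_lock);
}
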
diff -ur a/fs/btrfs/export.c b/fs/btrfs/export.c
--- a/fs/btrfs/export.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/export.c	2014-02-17 11:56:58.000000000 +0100
@@ -67,7 +67,7 @@
 				       u64 root_objectid, u32 generation,
 				       int check_generation)
 {
-	struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_root *root;
 	struct inode *inode;
 	struct btrfs_key key;
@@ -89,11 +89,6 @@
 		goto fail;
 	}
 
-	if (btrfs_root_refs(&root->root_item) == 0) {
-		err = -ENOENT;
-		goto fail;
-	}
-
 	key.objectid = objectid;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
@@ -193,7 +188,7 @@
 	if (ret < 0)
 		goto fail;
 
-	BUG_ON(ret == 0);
+	BUG_ON(ret == 0); /* Key with offset of -1 found */
 	if (path->slots[0] == 0) {
 		ret = -ENOENT;
 		goto fail;
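Illustrative sketch (not part of this patch): the export.c hunk above relies on btrfs_sb() now resolving directly to the fs_info rather than to a tree root. The accessor is assumed to take the following shape after that change (it lives in ctree.h, which is not part of this excerpt).

static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
	return sb->s_fs_info;
}
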
diff -ur a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
--- a/fs/btrfs/extent_io.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/extent_io.c	2014-02-17 11:56:58.000000000 +0100
@@ -4,7 +4,6 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/page-flags.h>
-#include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/blkdev.h>
 #include <linux/swap.h>
@@ -18,16 +17,88 @@
 #include "ctree.h"
 #include "btrfs_inode.h"
 #include "volumes.h"
+#include "check-integrity.h"
+#include "locking.h"
+#include "rcu-string.h"
+
+#ifdef MY_ABC_HERE
+#include <linux/ratelimit.h>
+#endif
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
+static struct bio_set *btrfs_bioset;
 
+#ifdef CONFIG_BTRFS_DEBUG
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
-#define LEAK_DEBUG 0
-#if LEAK_DEBUG
 static DEFINE_SPINLOCK(leak_lock);
+
+static inline
+void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&leak_lock, flags);
+	list_add(new, head);
+	spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline
+void btrfs_leak_debug_del(struct list_head *entry)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&leak_lock, flags);
+	list_del(entry);
+	spin_unlock_irqrestore(&leak_lock, flags);
+}
+
+static inline
+void btrfs_leak_debug_check(void)
+{
+	struct extent_state *state;
+	struct extent_buffer *eb;
+
+	while (!list_empty(&states)) {
+		state = list_entry(states.next, struct extent_state, leak_list);
+		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
+		       "state %lu in tree %p refs %d\n",
+		       state->start, state->end, state->state, state->tree,
+		       atomic_read(&state->refs));
+		list_del(&state->leak_list);
+		kmem_cache_free(extent_state_cache, state);
+	}
+
+	while (!list_empty(&buffers)) {
+		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
+		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
+		       "refs %d\n",
+		       eb->start, eb->len, atomic_read(&eb->refs));
+		list_del(&eb->leak_list);
+		kmem_cache_free(extent_buffer_cache, eb);
+	}
+}
+
+#define btrfs_debug_check_extent_io_range(inode, start, end)		\
+	__btrfs_debug_check_extent_io_range(__func__, (inode), (start), (end))
+static inline void __btrfs_debug_check_extent_io_range(const char *caller,
+		struct inode *inode, u64 start, u64 end)
+{
+	u64 isize = i_size_read(inode);
+
+	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
+		printk_ratelimited(KERN_DEBUG
+		    "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
+				caller, btrfs_ino(inode), isize, start, end);
+	}
+}
+#else
+#define btrfs_leak_debug_add(new, head)	do {} while (0)
+#define btrfs_leak_debug_del(entry)	do {} while (0)
+#define btrfs_leak_debug_check()	do {} while (0)
+#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
 #endif
 
 #define BUFFER_LRU_MAX 64
@@ -42,6 +113,7 @@
 	struct bio *bio;
 	struct extent_io_tree *tree;
 	get_extent_t *get_extent;
+	unsigned long bio_flags;
 
 	/* tells writepage not to lock the state bits for this range
 	 * it still does the unlocking
@@ -52,55 +124,66 @@
 	unsigned int sync_io:1;
 };
 
+static noinline void flush_write_bio(void *data);
+static inline struct btrfs_fs_info *
+tree_fs_info(struct extent_io_tree *tree)
+{
+	return btrfs_sb(tree->mapping->host->i_sb);
+}
+
 int __init extent_io_init(void)
 {
-	extent_state_cache = kmem_cache_create("extent_state",
+	extent_state_cache = kmem_cache_create("btrfs_extent_state",
 			sizeof(struct extent_state), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_state_cache)
 		return -ENOMEM;
 
-	extent_buffer_cache = kmem_cache_create("extent_buffers",
+	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
 			sizeof(struct extent_buffer), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_buffer_cache)
 		goto free_state_cache;
+
+	btrfs_bioset = bioset_create(BIO_POOL_SIZE,
+				     offsetof(struct btrfs_io_bio, bio));
+	if (!btrfs_bioset)
+		goto free_buffer_cache;
+
+	if (bioset_integrity_create(btrfs_bioset, BIO_POOL_SIZE))
+		goto free_bioset;
+
 	return 0;
 
+free_bioset:
+	bioset_free(btrfs_bioset);
+	btrfs_bioset = NULL;
+
+free_buffer_cache:
+	kmem_cache_destroy(extent_buffer_cache);
+	extent_buffer_cache = NULL;
+
 free_state_cache:
 	kmem_cache_destroy(extent_state_cache);
+	extent_state_cache = NULL;
 	return -ENOMEM;
 }
 
 void extent_io_exit(void)
 {
-	struct extent_state *state;
-	struct extent_buffer *eb;
-
-	while (!list_empty(&states)) {
-		state = list_entry(states.next, struct extent_state, leak_list);
-		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
-		       "state %lu in tree %p refs %d\n",
-		       (unsigned long long)state->start,
-		       (unsigned long long)state->end,
-		       state->state, state->tree, atomic_read(&state->refs));
-		list_del(&state->leak_list);
-		kmem_cache_free(extent_state_cache, state);
-
-	}
+	btrfs_leak_debug_check();
 
-	while (!list_empty(&buffers)) {
-		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
-		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
-		       "refs %d\n", (unsigned long long)eb->start,
-		       eb->len, atomic_read(&eb->refs));
-		list_del(&eb->leak_list);
-		kmem_cache_free(extent_buffer_cache, eb);
-	}
+	/*
+	 * Make sure all delayed rcu free are flushed before we
+	 * destroy caches.
+	 */
+	rcu_barrier();
 	if (extent_state_cache)
 		kmem_cache_destroy(extent_state_cache);
 	if (extent_buffer_cache)
 		kmem_cache_destroy(extent_buffer_cache);
+	if (btrfs_bioset)
+		bioset_free(btrfs_bioset);
 }
 
 void extent_io_tree_init(struct extent_io_tree *tree,
@@ -118,9 +201,6 @@
 static struct extent_state *alloc_extent_state(gfp_t mask)
 {
 	struct extent_state *state;
-#if LEAK_DEBUG
-	unsigned long flags;
-#endif
 
 	state = kmem_cache_alloc(extent_state_cache, mask);
 	if (!state)
@@ -128,13 +208,10 @@
 	state->state = 0;
 	state->private = 0;
 	state->tree = NULL;
-#if LEAK_DEBUG
-	spin_lock_irqsave(&leak_lock, flags);
-	list_add(&state->leak_list, &states);
-	spin_unlock_irqrestore(&leak_lock, flags);
-#endif
+	btrfs_leak_debug_add(&state->leak_list, &states);
 	atomic_set(&state->refs, 1);
 	init_waitqueue_head(&state->wq);
+	trace_alloc_extent_state(state, mask, _RET_IP_);
 	return state;
 }
 
@@ -143,15 +220,9 @@
 	if (!state)
 		return;
 	if (atomic_dec_and_test(&state->refs)) {
-#if LEAK_DEBUG
-		unsigned long flags;
-#endif
 		WARN_ON(state->tree);
-#if LEAK_DEBUG
-		spin_lock_irqsave(&leak_lock, flags);
-		list_del(&state->leak_list);
-		spin_unlock_irqrestore(&leak_lock, flags);
-#endif
+		btrfs_leak_debug_del(&state->leak_list);
+		trace_free_extent_state(state, _RET_IP_);
 		kmem_cache_free(extent_state_cache, state);
 	}
 }
@@ -175,7 +246,6 @@
 			return parent;
 	}
 
-	entry = rb_entry(node, struct tree_entry, rb_node);
 	rb_link_node(node, parent, p);
 	rb_insert_color(node, root);
 	return NULL;
@@ -291,21 +361,21 @@
 }
 
 static void set_state_cb(struct extent_io_tree *tree,
-			 struct extent_state *state, int *bits)
+			 struct extent_state *state, unsigned long *bits)
 {
 	if (tree->ops && tree->ops->set_bit_hook)
 		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
 }
 
 static void clear_state_cb(struct extent_io_tree *tree,
-			   struct extent_state *state, int *bits)
+			   struct extent_state *state, unsigned long *bits)
 {
 	if (tree->ops && tree->ops->clear_bit_hook)
 		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
-			   struct extent_state *state, int *bits);
+			   struct extent_state *state, unsigned long *bits);
 
 /*
  * insert an extent_state struct into the tree.  'bits' are set on the
@@ -319,16 +389,13 @@
  */
 static int insert_state(struct extent_io_tree *tree,
 			struct extent_state *state, u64 start, u64 end,
-			int *bits)
+			unsigned long *bits)
 {
 	struct rb_node *node;
 
-	if (end < start) {
-		printk(KERN_ERR "btrfs end < start %llu %llu\n",
-		       (unsigned long long)end,
-		       (unsigned long long)start);
-		WARN_ON(1);
-	}
+	if (end < start)
+		WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
+		       end, start);
 	state->start = start;
 	state->end = end;
 
@@ -339,9 +406,8 @@
 		struct extent_state *found;
 		found = rb_entry(node, struct extent_state, rb_node);
 		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
-		       "%llu %llu\n", (unsigned long long)found->start,
-		       (unsigned long long)found->end,
-		       (unsigned long long)start, (unsigned long long)end);
+		       "%llu %llu\n",
+		       found->start, found->end, start, end);
 		return -EEXIST;
 	}
 	state->tree = tree;
@@ -391,20 +457,28 @@
 	return 0;
 }
 
+static struct extent_state *next_state(struct extent_state *state)
+{
+	struct rb_node *next = rb_next(&state->rb_node);
+	if (next)
+		return rb_entry(next, struct extent_state, rb_node);
+	else
+		return NULL;
+}
+
 /*
  * utility function to clear some bits in an extent state struct.
- * it will optionally wake up any one waiting on this state (wake == 1), or
- * forcibly remove the state from the tree (delete == 1).
+ * it will optionally wake up anyone waiting on this state (wake == 1).
  *
  * If no bits are set on the state struct after clearing things, the
  * struct is freed and removed from the tree
  */
-static int clear_state_bit(struct extent_io_tree *tree,
-			    struct extent_state *state,
-			    int *bits, int wake)
+static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
+					    struct extent_state *state,
+					    unsigned long *bits, int wake)
 {
-	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
-	int ret = state->state & bits_to_clear;
+	struct extent_state *next;
+	unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS;
 
 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
 		u64 range = state->end - state->start + 1;
@@ -416,6 +490,7 @@
 	if (wake)
 		wake_up(&state->wq);
 	if (state->state == 0) {
+		next = next_state(state);
 		if (state->tree) {
 			rb_erase(&state->rb_node, &tree->state);
 			state->tree = NULL;
@@ -425,8 +500,9 @@
 		}
 	} else {
 		merge_state(tree, state);
+		next = next_state(state);
 	}
-	return ret;
+	return next;
 }
 
 static struct extent_state *
@@ -438,6 +514,13 @@
 	return prealloc;
 }
 
+static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+{
+	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
+		    "Extent tree was modified by another "
+		    "thread while locked.");
+}
+
 /*
  * clear some bits on a range in the tree.  This may require splitting
  * or inserting elements in the tree, so the gfp mask is used to
@@ -448,24 +531,26 @@
  *
  * the range [start, end] is inclusive.
  *
- * This takes the tree lock, and returns < 0 on error, > 0 if any of the
- * bits were already set, or zero if none of the bits were already set.
+ * This takes the tree lock, and returns 0 on success and < 0 on error.
  */
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		     int bits, int wake, int delete,
+		     unsigned long bits, int wake, int delete,
 		     struct extent_state **cached_state,
 		     gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *cached;
 	struct extent_state *prealloc = NULL;
-	struct rb_node *next_node;
 	struct rb_node *node;
 	u64 last_end;
 	int err;
-	int set = 0;
 	int clear = 0;
 
+	btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
+	if (bits & EXTENT_DELALLOC)
+		bits |= EXTENT_NORESERVE;
+
 	if (delete)
 		bits |= ~EXTENT_CTLBITS;
 	bits |= EXTENT_FIRST_DELALLOC;
@@ -512,6 +597,12 @@
 	WARN_ON(state->end < start);
 	last_end = state->end;
 
+	/* the state doesn't have the wanted bits, go ahead */
+	if (!(state->state & bits)) {
+		state = next_state(state);
+		goto next;
+	}
+
 	/*
 	 *     | ---- desired range ---- |
 	 *  | state | or
@@ -532,15 +623,15 @@
 		prealloc = alloc_extent_state_atomic(prealloc);
 		BUG_ON(!prealloc);
 		err = split_state(tree, state, prealloc, start);
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
 		prealloc = NULL;
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			set |= clear_state_bit(tree, state, &bits, wake);
-			if (last_end == (u64)-1)
-				goto out;
-			start = last_end + 1;
+			state = clear_state_bit(tree, state, &bits, wake);
+			goto next;
 		}
 		goto search_again;
 	}
@@ -554,31 +645,25 @@
 		prealloc = alloc_extent_state_atomic(prealloc);
 		BUG_ON(!prealloc);
 		err = split_state(tree, state, prealloc, end + 1);
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
 		if (wake)
 			wake_up(&state->wq);
 
-		set |= clear_state_bit(tree, prealloc, &bits, wake);
+		clear_state_bit(tree, prealloc, &bits, wake);
 
 		prealloc = NULL;
 		goto out;
 	}
 
-	if (state->end < end && prealloc && !need_resched())
-		next_node = rb_next(&state->rb_node);
-	else
-		next_node = NULL;
-
-	set |= clear_state_bit(tree, state, &bits, wake);
+	state = clear_state_bit(tree, state, &bits, wake);
+next:
 	if (last_end == (u64)-1)
 		goto out;
 	start = last_end + 1;
-	if (start <= end && next_node) {
-		state = rb_entry(next_node, struct extent_state,
-				 rb_node);
-		if (state->start == start)
-			goto hit_next;
-	}
+	if (start <= end && state && !need_resched())
+		goto hit_next;
 	goto search_again;
 
 out:
@@ -586,7 +671,7 @@
 	if (prealloc)
 		free_extent_state(prealloc);
 
-	return set;
+	return 0;
 
 search_again:
 	if (start > end)
@@ -597,8 +682,8 @@
 	goto again;
 }
 
-static int wait_on_state(struct extent_io_tree *tree,
-			 struct extent_state *state)
+static void wait_on_state(struct extent_io_tree *tree,
+			  struct extent_state *state)
 		__releases(tree->lock)
 		__acquires(tree->lock)
 {
@@ -608,7 +693,6 @@
 	schedule();
 	spin_lock(&tree->lock);
 	finish_wait(&state->wq, &wait);
-	return 0;
 }
 
 /*
@@ -616,11 +700,14 @@
  * The range [start, end] is inclusive.
  * The tree lock is taken by this function
  */
-int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
+static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+			    unsigned long bits)
 {
 	struct extent_state *state;
 	struct rb_node *node;
 
+	btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
 	spin_lock(&tree->lock);
 again:
 	while (1) {
@@ -653,14 +740,13 @@
 	}
 out:
 	spin_unlock(&tree->lock);
-	return 0;
 }
 
 static void set_state_bits(struct extent_io_tree *tree,
 			   struct extent_state *state,
-			   int *bits)
+			   unsigned long *bits)
 {
-	int bits_to_set = *bits & ~EXTENT_CTLBITS;
+	unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS;
 
 	set_state_cb(tree, state, bits);
 	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
@@ -681,15 +767,6 @@
 	}
 }
 
-static void uncache_state(struct extent_state **cached_ptr)
-{
-	if (cached_ptr && (*cached_ptr)) {
-		struct extent_state *state = *cached_ptr;
-		*cached_ptr = NULL;
-		free_extent_state(state);
-	}
-}
-
 /*
  * set some bits on a range in the tree.  This may require allocations or
  * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -701,9 +778,11 @@
  * [start, end] is inclusive This takes the tree lock.
  */
 
-int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   int bits, int exclusive_bits, u64 *failed_start,
-		   struct extent_state **cached_state, gfp_t mask)
+static int __must_check
+__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		 unsigned long bits, unsigned long exclusive_bits,
+		 u64 *failed_start, struct extent_state **cached_state,
+		 gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
@@ -712,6 +791,8 @@
 	u64 last_start;
 	u64 last_end;
 
+	btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
 	bits |= EXTENT_FIRST_DELALLOC;
 again:
 	if (!prealloc && (mask & __GFP_WAIT)) {
@@ -737,8 +818,10 @@
 		prealloc = alloc_extent_state_atomic(prealloc);
 		BUG_ON(!prealloc);
 		err = insert_state(tree, prealloc, start, end, &bits);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
 		prealloc = NULL;
-		BUG_ON(err == -EEXIST);
 		goto out;
 	}
 	state = rb_entry(node, struct extent_state, rb_node);
@@ -753,7 +836,6 @@
 	 * Just lock what we found and keep going
 	 */
 	if (state->start == start && state->end <= end) {
-		struct rb_node *next_node;
 		if (state->state & exclusive_bits) {
 			*failed_start = state->start;
 			err = -EEXIST;
@@ -761,20 +843,15 @@
 		}
 
 		set_state_bits(tree, state, &bits);
-
 		cache_state(state, cached_state);
 		merge_state(tree, state);
 		if (last_end == (u64)-1)
 			goto out;
-
 		start = last_end + 1;
-		next_node = rb_next(&state->rb_node);
-		if (next_node && start < end && prealloc && !need_resched()) {
-			state = rb_entry(next_node, struct extent_state,
-					 rb_node);
-			if (state->start == start)
-				goto hit_next;
-		}
+		state = next_state(state);
+		if (start < end && state && state->start == start &&
+		    !need_resched())
+			goto hit_next;
 		goto search_again;
 	}
 
@@ -804,7 +881,9 @@
 		prealloc = alloc_extent_state_atomic(prealloc);
 		BUG_ON(!prealloc);
 		err = split_state(tree, state, prealloc, start);
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
+
 		prealloc = NULL;
 		if (err)
 			goto out;
@@ -815,6 +894,10 @@
 			if (last_end == (u64)-1)
 				goto out;
 			start = last_end + 1;
+			state = next_state(state);
+			if (start < end && state && state->start == start &&
+			    !need_resched())
+				goto hit_next;
 		}
 		goto search_again;
 	}
@@ -841,12 +924,9 @@
 		 */
 		err = insert_state(tree, prealloc, start, this_end,
 				   &bits);
-		BUG_ON(err == -EEXIST);
-		if (err) {
-			free_extent_state(prealloc);
-			prealloc = NULL;
-			goto out;
-		}
+		if (err)
+			extent_io_tree_panic(tree, err);
+
 		cache_state(prealloc, cached_state);
 		prealloc = NULL;
 		start = this_end + 1;
@@ -868,7 +948,8 @@
 		prealloc = alloc_extent_state_atomic(prealloc);
 		BUG_ON(!prealloc);
 		err = split_state(tree, state, prealloc, end + 1);
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
 
 		set_state_bits(tree, prealloc, &bits);
 		cache_state(prealloc, cached_state);
@@ -895,13 +976,24 @@
 	goto again;
 }
 
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   unsigned long bits, u64 * failed_start,
+		   struct extent_state **cached_state, gfp_t mask)
+{
+	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
+				cached_state, mask);
+}
+
+
 /**
- * convert_extent - convert all bits in a given range from one bit to another
+ * convert_extent_bit - convert all bits in a given range from one bit to
+ * 			another
  * @tree:	the io tree to search
  * @start:	the start offset in bytes
  * @end:	the end offset in bytes (inclusive)
  * @bits:	the bits to set in this range
  * @clear_bits:	the bits to clear in this range
+ * @cached_state:	state that we're going to cache
  * @mask:	the allocation mask
  *
  * This will go through and set bits for the given range.  If any states exist
@@ -911,7 +1003,8 @@
  * boundary bits like LOCK.
  */
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		       int bits, int clear_bits, gfp_t mask)
+		       unsigned long bits, unsigned long clear_bits,
+		       struct extent_state **cached_state, gfp_t mask)
 {
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
@@ -920,6 +1013,8 @@
 	u64 last_start;
 	u64 last_end;
 
+	btrfs_debug_check_extent_io_range(tree->mapping->host, start, end);
+
 again:
 	if (!prealloc && (mask & __GFP_WAIT)) {
 		prealloc = alloc_extent_state(mask);
@@ -928,6 +1023,15 @@
 	}
 
 	spin_lock(&tree->lock);
+	if (cached_state && *cached_state) {
+		state = *cached_state;
+		if (state->start <= start && state->end > start &&
+		    state->tree) {
+			node = &state->rb_node;
+			goto hit_next;
+		}
+	}
+
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
@@ -941,7 +1045,8 @@
 		}
 		err = insert_state(tree, prealloc, start, end, &bits);
 		prealloc = NULL;
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
 		goto out;
 	}
 	state = rb_entry(node, struct extent_state, rb_node);
@@ -956,23 +1061,15 @@
 	 * Just lock what we found and keep going
 	 */
 	if (state->start == start && state->end <= end) {
-		struct rb_node *next_node;
-
 		set_state_bits(tree, state, &bits);
-		clear_state_bit(tree, state, &clear_bits, 0);
-
-		merge_state(tree, state);
+		cache_state(state, cached_state);
+		state = clear_state_bit(tree, state, &clear_bits, 0);
 		if (last_end == (u64)-1)
 			goto out;
-
 		start = last_end + 1;
-		next_node = rb_next(&state->rb_node);
-		if (next_node && start < end && prealloc && !need_resched()) {
-			state = rb_entry(next_node, struct extent_state,
-					 rb_node);
-			if (state->start == start)
-				goto hit_next;
-		}
+		if (start < end && state && state->start == start &&
+		    !need_resched())
+			goto hit_next;
 		goto search_again;
 	}
 
@@ -999,17 +1096,21 @@
 			goto out;
 		}
 		err = split_state(tree, state, prealloc, start);
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
 		prealloc = NULL;
 		if (err)
 			goto out;
 		if (state->end <= end) {
 			set_state_bits(tree, state, &bits);
-			clear_state_bit(tree, state, &clear_bits, 0);
-			merge_state(tree, state);
+			cache_state(state, cached_state);
+			state = clear_state_bit(tree, state, &clear_bits, 0);
 			if (last_end == (u64)-1)
 				goto out;
 			start = last_end + 1;
+			if (start < end && state && state->start == start &&
+			    !need_resched())
+				goto hit_next;
 		}
 		goto search_again;
 	}
@@ -1039,12 +1140,9 @@
 		 */
 		err = insert_state(tree, prealloc, start, this_end,
 				   &bits);
-		BUG_ON(err == -EEXIST);
-		if (err) {
-			free_extent_state(prealloc);
-			prealloc = NULL;
-			goto out;
-		}
+		if (err)
+			extent_io_tree_panic(tree, err);
+		cache_state(prealloc, cached_state);
 		prealloc = NULL;
 		start = this_end + 1;
 		goto search_again;
@@ -1063,12 +1161,12 @@
 		}
 
 		err = split_state(tree, state, prealloc, end + 1);
-		BUG_ON(err == -EEXIST);
+		if (err)
+			extent_io_tree_panic(tree, err);
 
 		set_state_bits(tree, prealloc, &bits);
+		cache_state(prealloc, cached_state);
 		clear_state_bit(tree, prealloc, &clear_bits, 0);
-
-		merge_state(tree, prealloc);
 		prealloc = NULL;
 		goto out;
 	}
@@ -1095,19 +1193,19 @@
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
-	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
+	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
 			      NULL, mask);
 }
 
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		    int bits, gfp_t mask)
+		    unsigned long bits, gfp_t mask)
 {
-	return set_extent_bit(tree, start, end, bits, 0, NULL,
+	return set_extent_bit(tree, start, end, bits, NULL,
 			      NULL, mask);
 }
 
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		      int bits, gfp_t mask)
+		      unsigned long bits, gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
 }
@@ -1117,7 +1215,15 @@
 {
 	return set_extent_bit(tree, start, end,
 			      EXTENT_DELALLOC | EXTENT_UPTODATE,
-			      0, NULL, cached_state, mask);
+			      NULL, cached_state, mask);
+}
+
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+		      struct extent_state **cached_state, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end,
+			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
+			      NULL, cached_state, mask);
 }
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1131,20 +1237,19 @@
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
-	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
+	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
 			      NULL, mask);
 }
 
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask)
 {
-	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
-			      NULL, cached_state, mask);
+	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
+			      cached_state, mask);
 }
 
-static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
-				 u64 end, struct extent_state **cached_state,
-				 gfp_t mask)
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+			  struct extent_state **cached_state, gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
 				cached_state, mask);
@@ -1155,42 +1260,40 @@
  * us if waiting is desired.
  */
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		     int bits, struct extent_state **cached_state, gfp_t mask)
+		     unsigned long bits, struct extent_state **cached_state)
 {
 	int err;
 	u64 failed_start;
 	while (1) {
-		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
-				     EXTENT_LOCKED, &failed_start,
-				     cached_state, mask);
-		if (err == -EEXIST && (mask & __GFP_WAIT)) {
+		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+				       EXTENT_LOCKED, &failed_start,
+				       cached_state, GFP_NOFS);
+		if (err == -EEXIST) {
 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
 			start = failed_start;
-		} else {
+		} else
 			break;
-		}
 		WARN_ON(start > end);
 	}
 	return err;
 }
 
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 {
-	return lock_extent_bits(tree, start, end, 0, NULL, mask);
+	return lock_extent_bits(tree, start, end, 0, NULL);
 }
 
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
-		    gfp_t mask)
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	int err;
 	u64 failed_start;
 
-	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
-			     &failed_start, NULL, mask);
+	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
+			       &failed_start, NULL, GFP_NOFS);
 	if (err == -EEXIST) {
 		if (failed_start > start)
 			clear_extent_bit(tree, start, failed_start - 1,
-					 EXTENT_LOCKED, 1, 0, NULL, mask);
+					 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
 		return 0;
 	}
 	return 1;
@@ -1203,10 +1306,43 @@
 				mask);
 }
 
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
-				mask);
+				GFP_NOFS);
+}
+
+int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(inode->i_mapping, index);
+		BUG_ON(!page); /* Pages should be in the extent_io_tree */
+		clear_page_dirty_for_io(page);
+		page_cache_release(page);
+		index++;
+	}
+	return 0;
+}
+
+int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
+{
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	struct page *page;
+
+	while (index <= end_index) {
+		page = find_get_page(inode->i_mapping, index);
+		BUG_ON(!page); /* Pages should be in the extent_io_tree */
+		account_page_redirty(page);
+		__set_page_dirty_nobuffers(page);
+		page_cache_release(page);
+		index++;
+	}
+	return 0;
 }
 
 /*
@@ -1220,7 +1356,7 @@
 
 	while (index <= end_index) {
 		page = find_get_page(tree->mapping, index);
-		BUG_ON(!page);
+		BUG_ON(!page); /* Pages should be in the extent_io_tree */
 		set_page_writeback(page);
 		page_cache_release(page);
 		index++;
@@ -1232,8 +1368,9 @@
  * return it.  tree->lock must be held.  NULL will returned if
  * nothing was found after 'start'
  */
-struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
-						 u64 start, int bits)
+static struct extent_state *
+find_first_extent_bit_state(struct extent_io_tree *tree,
+			    u64 start, unsigned long bits)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1264,21 +1401,45 @@
  * returned if we find something, and *start_ret and *end_ret are
  * set to reflect the state struct that was found.
  *
- * If nothing was found, 1 is returned, < 0 on error
+ * If nothing was found, 1 is returned; if something was found, 0 is returned.
  */
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits)
+			  u64 *start_ret, u64 *end_ret, unsigned long bits,
+			  struct extent_state **cached_state)
 {
 	struct extent_state *state;
+	struct rb_node *n;
 	int ret = 1;
 
 	spin_lock(&tree->lock);
+	if (cached_state && *cached_state) {
+		state = *cached_state;
+		if (state->end == start - 1 && state->tree) {
+			n = rb_next(&state->rb_node);
+			while (n) {
+				state = rb_entry(n, struct extent_state,
+						 rb_node);
+				if (state->state & bits)
+					goto got_it;
+				n = rb_next(n);
+			}
+			free_extent_state(*cached_state);
+			*cached_state = NULL;
+			goto out;
+		}
+		free_extent_state(*cached_state);
+		*cached_state = NULL;
+	}
+
 	state = find_first_extent_bit_state(tree, start, bits);
+got_it:
 	if (state) {
+		cache_state(state, cached_state);
 		*start_ret = state->start;
 		*end_ret = state->end;
 		ret = 0;
 	}
+out:
 	spin_unlock(&tree->lock);
 	return ret;
 }
@@ -1332,20 +1493,20 @@
 		*end = state->end;
 		cur_start = state->end + 1;
 		node = rb_next(node);
-		if (!node)
-			break;
 		total_bytes += state->end - state->start + 1;
 		if (total_bytes >= max_bytes)
 			break;
+		if (!node)
+			break;
 	}
 out:
 	spin_unlock(&tree->lock);
 	return found;
 }
 
-static noinline int __unlock_for_delalloc(struct inode *inode,
-					  struct page *locked_page,
-					  u64 start, u64 end)
+static noinline void __unlock_for_delalloc(struct inode *inode,
+					   struct page *locked_page,
+					   u64 start, u64 end)
 {
 	int ret;
 	struct page *pages[16];
@@ -1355,7 +1516,7 @@
 	int i;
 
 	if (index == locked_page->index && end_index == index)
-		return 0;
+		return;
 
 	while (nr_pages > 0) {
 		ret = find_get_pages_contig(inode->i_mapping, index,
@@ -1370,7 +1531,6 @@
 		index += ret;
 		cond_resched();
 	}
-	return 0;
 }
 
 static noinline int lock_delalloc_pages(struct inode *inode,
@@ -1464,7 +1624,7 @@
 		*start = delalloc_start;
 		*end = delalloc_end;
 		free_extent_state(cached_state);
-		return found;
+		return 0;
 	}
 
 	/*
@@ -1477,10 +1637,9 @@
 
 	/*
 	 * make sure to limit the number of pages we try to lock down
-	 * if we're looping.
 	 */
-	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
-		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
+	if (delalloc_end + 1 - delalloc_start > max_bytes)
+		delalloc_end = delalloc_start + max_bytes - 1;
 
 	/* step two, lock all the pages after the page that has start */
 	ret = lock_delalloc_pages(inode, locked_page,
@@ -1491,8 +1650,7 @@
 		 */
 		free_extent_state(cached_state);
 		if (!loops) {
-			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
-			max_bytes = PAGE_CACHE_SIZE - offset;
+			max_bytes = PAGE_CACHE_SIZE;
 			loops = 1;
 			goto again;
 		} else {
@@ -1500,11 +1658,10 @@
 			goto out_failed;
 		}
 	}
-	BUG_ON(ret);
+	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
 
 	/* step three, lock the state bits for the whole range */
-	lock_extent_bits(tree, delalloc_start, delalloc_end,
-			 0, &cached_state, GFP_NOFS);
+	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
 
 	/* then test to make sure it is all still delalloc */
 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
@@ -1524,31 +1681,21 @@
 	return found;
 }
 
-int extent_clear_unlock_delalloc(struct inode *inode,
-				struct extent_io_tree *tree,
-				u64 start, u64 end, struct page *locked_page,
-				unsigned long op)
+int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+				 struct page *locked_page,
+				 unsigned long clear_bits,
+				 unsigned long page_ops)
 {
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	int ret;
 	struct page *pages[16];
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
 	unsigned long nr_pages = end_index - index + 1;
 	int i;
-	int clear_bits = 0;
-
-	if (op & EXTENT_CLEAR_UNLOCK)
-		clear_bits |= EXTENT_LOCKED;
-	if (op & EXTENT_CLEAR_DIRTY)
-		clear_bits |= EXTENT_DIRTY;
-
-	if (op & EXTENT_CLEAR_DELALLOC)
-		clear_bits |= EXTENT_DELALLOC;
 
 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
-	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
-		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
-		    EXTENT_SET_PRIVATE2)))
+	if (page_ops == 0)
 		return 0;
 
 	while (nr_pages > 0) {
@@ -1557,20 +1704,20 @@
 				     nr_pages, ARRAY_SIZE(pages)), pages);
 		for (i = 0; i < ret; i++) {
 
-			if (op & EXTENT_SET_PRIVATE2)
+			if (page_ops & PAGE_SET_PRIVATE2)
 				SetPagePrivate2(pages[i]);
 
 			if (pages[i] == locked_page) {
 				page_cache_release(pages[i]);
 				continue;
 			}
-			if (op & EXTENT_CLEAR_DIRTY)
+			if (page_ops & PAGE_CLEAR_DIRTY)
 				clear_page_dirty_for_io(pages[i]);
-			if (op & EXTENT_SET_WRITEBACK)
+			if (page_ops & PAGE_SET_WRITEBACK)
 				set_page_writeback(pages[i]);
-			if (op & EXTENT_END_WRITEBACK)
+			if (page_ops & PAGE_END_WRITEBACK)
 				end_page_writeback(pages[i]);
-			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
+			if (page_ops & PAGE_UNLOCK)
 				unlock_page(pages[i]);
 			page_cache_release(pages[i]);
 		}
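The reworked extent_clear_unlock_delalloc() above takes the extent bits to clear and the per-page operations as two separate bitmasks instead of decoding a single opaque op word, so each caller states exactly which page actions it wants. A small userspace sketch of the same flag-driven loop follows; the PAGE_* values and the fake_page struct are illustrative only, not the real definitions from extent_io.h.

#include <stdio.h>

/* Illustrative flag values; the real PAGE_* constants live in extent_io.h. */
#define PAGE_UNLOCK		(1 << 0)
#define PAGE_CLEAR_DIRTY	(1 << 1)
#define PAGE_SET_WRITEBACK	(1 << 2)
#define PAGE_END_WRITEBACK	(1 << 3)
#define PAGE_SET_PRIVATE2	(1 << 4)

struct fake_page {
	int dirty;
	int writeback;
	int locked;
	int private2;
};

/* Apply the requested per-page operations, mirroring the loop in the patch. */
static void apply_page_ops(struct fake_page *p, unsigned long page_ops)
{
	if (page_ops & PAGE_SET_PRIVATE2)
		p->private2 = 1;
	if (page_ops & PAGE_CLEAR_DIRTY)
		p->dirty = 0;
	if (page_ops & PAGE_SET_WRITEBACK)
		p->writeback = 1;
	if (page_ops & PAGE_END_WRITEBACK)
		p->writeback = 0;
	if (page_ops & PAGE_UNLOCK)
		p->locked = 0;
}

int main(void)
{
	struct fake_page p = { .dirty = 1, .writeback = 0, .locked = 1 };

	apply_page_ops(&p, PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK);
	printf("dirty=%d writeback=%d locked=%d\n", p.dirty, p.writeback, p.locked);

	apply_page_ops(&p, PAGE_END_WRITEBACK | PAGE_UNLOCK);
	printf("dirty=%d writeback=%d locked=%d\n", p.dirty, p.writeback, p.locked);
	return 0;
}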
@@ -1647,7 +1794,7 @@
  * set the private field for a given byte offset in the tree.  If there isn't
  * an extent_state there already, this does nothing.
  */
-int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
+static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1708,7 +1855,7 @@
  * range is found set.
  */
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   int bits, int filled, struct extent_state *cached)
+		   unsigned long bits, int filled, struct extent_state *cached)
 {
 	struct extent_state *state = NULL;
 	struct rb_node *node;
@@ -1761,39 +1908,12 @@
  * helper function to set a given page up to date if all the
  * extents in the tree for that page are up to date
  */
-static int check_page_uptodate(struct extent_io_tree *tree,
-			       struct page *page)
+static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
 {
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 start = page_offset(page);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
 		SetPageUptodate(page);
-	return 0;
-}
-
-/*
- * helper function to unlock a page if all the extents in the tree
- * for that page are unlocked
- */
-static int check_page_locked(struct extent_io_tree *tree,
-			     struct page *page)
-{
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
-		unlock_page(page);
-	return 0;
-}
-
-/*
- * helper function to end page writeback if all the extents
- * in the tree for that page are done with writeback
- */
-static int check_page_writeback(struct extent_io_tree *tree,
-			     struct page *page)
-{
-	end_page_writeback(page);
-	return 0;
 }
 
 /*
@@ -1829,13 +1949,11 @@
 	if (ret)
 		err = ret;
 
-	if (did_repair) {
-		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
-					rec->start + rec->len - 1,
-					EXTENT_DAMAGED, GFP_NOFS);
-		if (ret && !err)
-			err = ret;
-	}
+	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_DAMAGED, GFP_NOFS);
+	if (ret && !err)
+		err = ret;
 
 	kfree(rec);
 	return err;
@@ -1851,12 +1969,12 @@
  * the standard behavior is to write all copies in a raid setup. here we only
  * want to write the one bad copy. so we do the mapping for ourselves and issue
  * submit_bio directly.
- * to avoid any synchonization issues, wait for the data after writing, which
+ * to avoid any synchronization issues, wait for the data after writing, which
  * actually prevents the read that triggered the error from finishing.
  * currently, there can be no more than two copies of every data bit. thus,
  * exactly one rewrite is required.
  */
-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 			u64 length, u64 logical, struct page *page,
 			int mirror_num)
 {
@@ -1866,11 +1984,16 @@
 	u64 map_length = 0;
 	u64 sector;
 	struct btrfs_bio *bbio = NULL;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	int ret;
 
 	BUG_ON(!mirror_num);
 
-	bio = bio_alloc(GFP_NOFS, 1);
+	/* we can't repair anything in raid56 yet */
+	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
+		return 0;
+
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 	if (!bio)
 		return -EIO;
 	bio->bi_private = &compl;
@@ -1878,7 +2001,7 @@
 	bio->bi_size = 0;
 	map_length = length;
 
-	ret = btrfs_map_block(map_tree, WRITE, logical,
+	ret = btrfs_map_block(fs_info, WRITE, logical,
 			      &map_length, &bbio, mirror_num);
 	if (ret) {
 		bio_put(bio);
@@ -1894,24 +2017,44 @@
 		return -EIO;
 	}
 	bio->bi_bdev = dev->bdev;
-	bio_add_page(bio, page, length, start-page_offset(page));
-	submit_bio(WRITE_SYNC, bio);
+	bio_add_page(bio, page, length, start - page_offset(page));
+	btrfsic_submit_bio(WRITE_SYNC, bio);
 	wait_for_completion(&compl);
 
 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
 		/* try to remap that extent elsewhere? */
 		bio_put(bio);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
 		return -EIO;
 	}
 
-	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
-			"sector %llu)\n", page->mapping->host->i_ino, start,
-			dev->name, sector);
+	printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
+		      "(dev %s sector %llu)\n", page->mapping->host->i_ino,
+		      start, rcu_str_deref(dev->name), sector);
 
 	bio_put(bio);
 	return 0;
 }
 
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+			 int mirror_num)
+{
+	u64 start = eb->start;
+	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
+	int ret = 0;
+
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = extent_buffer_page(eb, i);
+		ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
+					start, p, mirror_num);
+		if (ret)
+			break;
+		start += PAGE_CACHE_SIZE;
+	}
+
+	return ret;
+}
+
 /*
  * each time an IO finishes, we do a fast check in the IO failure tree
  * to see if we need to process or clean up an io_failure_record
@@ -1921,7 +2064,7 @@
 	u64 private;
 	u64 private_failure;
 	struct io_failure_record *failrec;
-	struct btrfs_mapping_tree *map_tree;
+	struct btrfs_fs_info *fs_info;
 	struct extent_state *state;
 	int num_copies;
 	int did_repair = 0;
@@ -1956,16 +2099,18 @@
 					    EXTENT_LOCKED);
 	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
 
-	if (state && state->start == failrec->start) {
-		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
-		num_copies = btrfs_num_copies(map_tree, failrec->logical,
-						failrec->len);
+	if (state && state->start <= failrec->start &&
+	    state->end >= failrec->start + failrec->len - 1) {
+		fs_info = BTRFS_I(inode)->root->fs_info;
+		num_copies = btrfs_num_copies(fs_info, failrec->logical,
+					      failrec->len);
 		if (num_copies > 1)  {
-			ret = repair_io_failure(map_tree, start, failrec->len,
+			ret = repair_io_failure(fs_info, start, failrec->len,
 						failrec->logical, page,
 						failrec->failed_mirror);
 			did_repair = !ret;
 		}
+		ret = 0;
 	}
 
 out:
@@ -1983,9 +2128,9 @@
  * needed
  */
 
-static int bio_readpage_error(struct bio *failed_bio, struct page *page,
-				u64 start, u64 end, int failed_mirror,
-				struct extent_state *state)
+static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
+			      struct page *page, u64 start, u64 end,
+			      int failed_mirror)
 {
 	struct io_failure_record *failrec = NULL;
 	u64 private;
@@ -1995,6 +2140,8 @@
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct bio *bio;
+	struct btrfs_io_bio *btrfs_failed_bio;
+	struct btrfs_io_bio *btrfs_bio;
 	int num_copies;
 	int ret;
 	int read_mode;
@@ -2027,7 +2174,7 @@
 		}
 		read_unlock(&em_tree->lock);
 
-		if (!em || IS_ERR(em)) {
+		if (!em) {
 			kfree(failrec);
 			return -EIO;
 		}
@@ -2070,32 +2217,20 @@
 		 * clean_io_failure() clean all those errors at once.
 		 */
 	}
-	num_copies = btrfs_num_copies(
-			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
-			      failrec->logical, failrec->len);
+	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
+				      failrec->logical, failrec->len);
 	if (num_copies == 1) {
 		/*
 		 * we only have a single copy of the data, so don't bother with
 		 * all the retry and error correction code that follows. no
 		 * matter what the error is, it is very likely to persist.
 		 */
-		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
-			 "state=%p, num_copies=%d, next_mirror %d, "
-			 "failed_mirror %d\n", state, num_copies,
-			 failrec->this_mirror, failed_mirror);
+		pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+			 num_copies, failrec->this_mirror, failed_mirror);
 		free_io_failure(inode, failrec, 0);
 		return -EIO;
 	}
 
-	if (!state) {
-		spin_lock(&tree->lock);
-		state = find_first_extent_bit_state(tree, failrec->start,
-						    EXTENT_LOCKED);
-		if (state && state->start != failrec->start)
-			state = NULL;
-		spin_unlock(&tree->lock);
-	}
-
 	/*
 	 * there are two premises:
 	 *	a) deliver good data to the caller
@@ -2132,34 +2267,72 @@
 		read_mode = READ_SYNC;
 	}
 
-	if (!state || failrec->this_mirror > num_copies) {
-		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
-			 "next_mirror %d, failed_mirror %d\n", state,
+	if (failrec->this_mirror > num_copies) {
+		pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
 			 num_copies, failrec->this_mirror, failed_mirror);
 		free_io_failure(inode, failrec, 0);
 		return -EIO;
 	}
 
-	bio = bio_alloc(GFP_NOFS, 1);
-	bio->bi_private = state;
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+	if (!bio) {
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
 	bio->bi_end_io = failed_bio->bi_end_io;
 	bio->bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 	bio->bi_size = 0;
 
+	btrfs_failed_bio = btrfs_io_bio(failed_bio);
+	if (btrfs_failed_bio->csum) {
+		struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+		u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+
+		btrfs_bio = btrfs_io_bio(bio);
+		btrfs_bio->csum = btrfs_bio->csum_inline;
+		phy_offset >>= inode->i_sb->s_blocksize_bits;
+		phy_offset *= csum_size;
+		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset,
+		       csum_size);
+	}
+
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
 
 	pr_debug("bio_readpage_error: submitting new read[%#x] to "
 		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
 		 failrec->this_mirror, num_copies, failrec->in_validation);
 
-	tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
-					failrec->bio_flags, 0);
-	return 0;
+	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
+					 failrec->this_mirror,
+					 failrec->bio_flags, 0);
+	return ret;
 }
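When bio_readpage_error() above builds the one-page retry bio, it copies the checksum of the failed block out of the original bio's csum array; the index is the block number of the failure (phy_offset shifted down by the block-size bits) times the per-block checksum size. A worked userspace example of that offset arithmetic is below; the block size and checksum size are assumed values chosen for illustration.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	/* Assumed values: 4 KiB blocks, 4-byte checksums. */
	unsigned int blocksize_bits = 12;	/* 4096-byte blocks */
	uint16_t csum_size = 4;

	/* One checksum per block of the original (failed) bio. */
	uint8_t failed_csums[8 * 4];
	for (unsigned int i = 0; i < sizeof(failed_csums); i++)
		failed_csums[i] = (uint8_t)i;

	/* Byte offset of the failed block within the original bio. */
	uint64_t phy_offset = 3 * 4096;

	/* Same arithmetic as the patch: block index, then byte offset. */
	phy_offset >>= blocksize_bits;	/* -> block index 3 */
	phy_offset *= csum_size;	/* -> byte offset 12 into the array */

	uint8_t retry_csum[4];
	memcpy(retry_csum, failed_csums + phy_offset, csum_size);

	printf("csum byte offset = %llu, first csum byte = %u\n",
	       (unsigned long long)phy_offset, (unsigned)retry_csum[0]);
	return 0;
}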
 
 /* lots and lots of room for performance fixes in the end_bio funcs */
 
+int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
+{
+	int uptodate = (err == 0);
+	struct extent_io_tree *tree;
+	int ret;
+
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+
+	if (tree->ops && tree->ops->writepage_end_io_hook) {
+		ret = tree->ops->writepage_end_io_hook(page, start,
+					       end, NULL, uptodate);
+		if (ret)
+			uptodate = 0;
+	}
+
+	if (!uptodate) {
+		ClearPageUptodate(page);
+		SetPageError(page);
+	}
+	return 0;
+}
+
 /*
  * after a writepage IO is done, we need to:
  * clear the uptodate bits on error
@@ -2171,61 +2344,53 @@
  */
 static void end_bio_extent_writepage(struct bio *bio, int err)
 {
-	int uptodate = err == 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
-	int whole_page;
-	int ret;
 
 	do {
 		struct page *page = bvec->bv_page;
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
-		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-			 bvec->bv_offset;
-		end = start + bvec->bv_len - 1;
+		/* We always issue full-page writes, but if some block
+		 * in a page fails to write, blk_update_request() will
+		 * advance bv_offset and adjust bv_len to compensate.
+		 * Print a warning for nonzero offsets, and an error
+		 * if they don't add up to a full page.  */
+		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE)
+			printk("%s page write in btrfs with offset %u and length %u\n",
+			       bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE
+			       ? KERN_ERR "partial" : KERN_INFO "incomplete",
+			       bvec->bv_offset, bvec->bv_len);
 
-		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
-			whole_page = 1;
-		else
-			whole_page = 0;
+		start = page_offset(page);
+		end = start + bvec->bv_offset + bvec->bv_len - 1;
 
 		if (--bvec >= bio->bi_io_vec)
 			prefetchw(&bvec->bv_page->flags);
-		if (tree->ops && tree->ops->writepage_end_io_hook) {
-			ret = tree->ops->writepage_end_io_hook(page, start,
-						       end, NULL, uptodate);
-			if (ret)
-				uptodate = 0;
-		}
-
-		if (!uptodate && tree->ops &&
-		    tree->ops->writepage_io_failed_hook) {
-			ret = tree->ops->writepage_io_failed_hook(bio, page,
-							 start, end, NULL);
-			if (ret == 0) {
-				uptodate = (err == 0);
-				continue;
-			}
-		}
 
-		if (!uptodate) {
-			clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
-			ClearPageUptodate(page);
-			SetPageError(page);
-		}
+		if (end_extent_writepage(page, err, start, end))
+			continue;
 
-		if (whole_page)
-			end_page_writeback(page);
-		else
-			check_page_writeback(tree, page);
+		end_page_writeback(page);
 	} while (bvec >= bio->bi_io_vec);
 
 	bio_put(bio);
 }
 
+static void
+endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
+			      int uptodate)
+{
+	struct extent_state *cached = NULL;
+	u64 end = start + len - 1;
+
+	if (uptodate && tree->track_uptodate)
+		set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
+	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
+}
+
 /*
  * after a readpage IO is done, we need to:
  * clear the uptodate bits on error
@@ -2242,10 +2407,15 @@
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct bio_vec *bvec = bio->bi_io_vec;
+	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
 	struct extent_io_tree *tree;
+	u64 offset = 0;
 	u64 start;
 	u64 end;
-	int whole_page;
+	u64 len;
+	u64 extent_start = 0;
+	u64 extent_len = 0;
+	int mirror;
 	int ret;
 
 	if (err)
@@ -2253,48 +2423,52 @@
 
 	do {
 		struct page *page = bvec->bv_page;
-		struct extent_state *cached = NULL;
-		struct extent_state *state;
-
-		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
-			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
-			 (long int)bio->bi_bdev);
-		tree = &BTRFS_I(page->mapping->host)->io_tree;
-
-		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
-			bvec->bv_offset;
-		end = start + bvec->bv_len - 1;
+		struct inode *inode = page->mapping->host;
 
-		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
-			whole_page = 1;
-		else
-			whole_page = 0;
+		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
+			 "mirror=%lu\n", (u64)bio->bi_sector, err,
+			 io_bio->mirror_num);
+		tree = &BTRFS_I(inode)->io_tree;
+
+		/* We always issue full-page reads, but if some block
+		 * in a page fails to read, blk_update_request() will
+		 * advance bv_offset and adjust bv_len to compensate.
+		 * Print a warning for nonzero offsets, and an error
+		 * if they don't add up to a full page.  */
+		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE)
+			printk("%s page read in btrfs with offset %u and length %u\n",
+			       bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE
+			       ? KERN_ERR "partial" : KERN_INFO "incomplete",
+			       bvec->bv_offset, bvec->bv_len);
+
+		start = page_offset(page);
+		end = start + bvec->bv_offset + bvec->bv_len - 1;
+		len = bvec->bv_len;
 
 		if (++bvec <= bvec_end)
 			prefetchw(&bvec->bv_page->flags);
 
-		spin_lock(&tree->lock);
-		state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
-		if (state && state->start == start) {
-			/*
-			 * take a reference on the state, unlock will drop
-			 * the ref
-			 */
-			cache_state(state, &cached);
-		}
-		spin_unlock(&tree->lock);
-
-		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
-			ret = tree->ops->readpage_end_io_hook(page, start, end,
-							      state);
+		mirror = io_bio->mirror_num;
+		if (likely(uptodate && tree->ops &&
+			   tree->ops->readpage_end_io_hook)) {
+			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
+							      page, start, end,
+							      mirror);
 			if (ret)
 				uptodate = 0;
 			else
 				clean_io_failure(start, page);
 		}
-		if (!uptodate) {
-			int failed_mirror;
-			failed_mirror = (int)(unsigned long)bio->bi_bdev;
+
+		if (likely(uptodate))
+			goto readpage_ok;
+
+		if (tree->ops && tree->ops->readpage_io_failed_hook) {
+			ret = tree->ops->readpage_io_failed_hook(page, mirror);
+			if (!ret && !err &&
+			    test_bit(BIO_UPTODATE, &bio->bi_flags))
+				uptodate = 1;
+		} else {
 			/*
 			 * The generic bio_readpage_error handles errors the
 			 * following way: If possible, new read requests are
@@ -2305,77 +2479,157 @@
 			 * can't handle the error it will return -EIO and we
 			 * remain responsible for that page.
 			 */
-			ret = bio_readpage_error(bio, page, start, end,
-							failed_mirror, NULL);
+			ret = bio_readpage_error(bio, offset, page, start, end,
+						 mirror);
 			if (ret == 0) {
-error_handled:
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
 				if (err)
 					uptodate = 0;
-				uncache_state(&cached);
 				continue;
 			}
-			if (tree->ops && tree->ops->readpage_io_failed_hook) {
-				ret = tree->ops->readpage_io_failed_hook(
-							bio, page, start, end,
-							failed_mirror, state);
-				if (ret == 0)
-					goto error_handled;
-			}
 		}
+readpage_ok:
+		if (likely(uptodate)) {
+			loff_t i_size = i_size_read(inode);
+			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+			unsigned offset;
+
+			/* Zero out the end if this page straddles i_size */
+			offset = i_size & (PAGE_CACHE_SIZE-1);
+			if (page->index == end_index && offset)
+				zero_user_segment(page, offset, PAGE_CACHE_SIZE);
+			SetPageUptodate(page);
+		} else {
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+		unlock_page(page);
+		offset += len;
 
-		if (uptodate) {
-			set_extent_uptodate(tree, start, end, &cached,
-					    GFP_ATOMIC);
-		}
-		unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
-
-		if (whole_page) {
-			if (uptodate) {
-				SetPageUptodate(page);
-			} else {
-				ClearPageUptodate(page);
-				SetPageError(page);
+		if (unlikely(!uptodate)) {
+			if (extent_len) {
+				endio_readpage_release_extent(tree,
+							      extent_start,
+							      extent_len, 1);
+				extent_start = 0;
+				extent_len = 0;
 			}
-			unlock_page(page);
+			endio_readpage_release_extent(tree, start,
+						      end - start + 1, 0);
+		} else if (!extent_len) {
+			extent_start = start;
+			extent_len = end + 1 - start;
+		} else if (extent_start + extent_len == start) {
+			extent_len += end + 1 - start;
 		} else {
-			if (uptodate) {
-				check_page_uptodate(tree, page);
-			} else {
-				ClearPageUptodate(page);
-				SetPageError(page);
-			}
-			check_page_locked(tree, page);
+			endio_readpage_release_extent(tree, extent_start,
+						      extent_len, uptodate);
+			extent_start = start;
+			extent_len = end + 1 - start;
 		}
 	} while (bvec <= bvec_end);
 
+	if (extent_len)
+		endio_readpage_release_extent(tree, extent_start, extent_len,
+					      uptodate);
+	if (io_bio->end_io)
+		io_bio->end_io(io_bio, err);
 	bio_put(bio);
 }
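end_bio_extent_readpage() above now merges consecutive up-to-date page ranges into a single [extent_start, extent_start + extent_len) run and calls endio_readpage_release_extent() once per run rather than once per page. A stripped-down userspace sketch of that coalescing loop follows; release_range() is a hypothetical stand-in for the real release helper.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

/* Hypothetical stand-in for endio_readpage_release_extent(). */
static void release_range(uint64_t start, uint64_t len)
{
	printf("release [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)(start + len));
}

int main(void)
{
	/* Byte offsets of the pages completed by one bio, in order. */
	uint64_t starts[] = { 0, 4096, 8192, 20480, 24576 };
	uint64_t extent_start = 0, extent_len = 0;

	for (unsigned int i = 0; i < sizeof(starts) / sizeof(starts[0]); i++) {
		uint64_t start = starts[i];
		uint64_t end = start + PAGE_SIZE - 1;

		if (!extent_len) {
			extent_start = start;
			extent_len = end + 1 - start;
		} else if (extent_start + extent_len == start) {
			extent_len += end + 1 - start;	/* extend the run */
		} else {
			release_range(extent_start, extent_len);
			extent_start = start;
			extent_len = end + 1 - start;
		}
	}
	if (extent_len)
		release_range(extent_start, extent_len);	/* final run */
	return 0;
}

With the offsets above, the two contiguous groups are released as [0, 12288) and [20480, 28672), matching how the patch batches the unlock work.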
 
+#ifdef MY_ABC_HERE
+static void btrfs_bio_destructor(struct bio *bio)
+{
+	bio_free(bio, btrfs_bioset);
+}
+#endif
+
+/*
+ * this allocates from the btrfs_bioset.  We're returning a bio right now
+ * but you can call btrfs_io_bio for the appropriate container_of magic
+ */
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		gfp_t gfp_flags)
 {
+	struct btrfs_io_bio *btrfs_bio;
 	struct bio *bio;
 
-	bio = bio_alloc(gfp_flags, nr_vecs);
+	bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
 
 	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
-		while (!bio && (nr_vecs /= 2))
-			bio = bio_alloc(gfp_flags, nr_vecs);
+		while (!bio && (nr_vecs /= 2)) {
+			bio = bio_alloc_bioset(gfp_flags,
+					       nr_vecs, btrfs_bioset);
+		}
 	}
 
 	if (bio) {
+#ifdef MY_ABC_HERE
+		bio->bi_destructor = btrfs_bio_destructor;
+#endif
 		bio->bi_size = 0;
 		bio->bi_bdev = bdev;
 		bio->bi_sector = first_sector;
+		btrfs_bio = btrfs_io_bio(bio);
+		btrfs_bio->csum = NULL;
+		btrfs_bio->csum_allocated = NULL;
+		btrfs_bio->end_io = NULL;
 	}
 	return bio;
 }
 
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
-			  unsigned long bio_flags)
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
+{
+#ifdef MY_ABC_HERE
+	struct bio *b;
+
+	b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs, btrfs_bioset);
+	if (!b)
+		return NULL;
+
+	b->bi_destructor = btrfs_bio_destructor;
+	__bio_clone(b, bio);
+
+	if (bio_integrity(bio)) {
+		int ret;
+
+		ret = bio_integrity_clone(b, bio, gfp_mask, btrfs_bioset);
+		if (ret < 0) {
+			bio_put(b);
+			return NULL;
+		}
+	}
+	return b;
+#else
+	return bio_clone_bioset(bio, gfp_mask, btrfs_bioset);
+#endif
+}
+
+
+/* this also allocates from the btrfs_bioset */
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
+{
+	struct btrfs_io_bio *btrfs_bio;
+	struct bio *bio;
+
+	bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
+	if (bio) {
+#ifdef MY_ABC_HERE
+		bio->bi_destructor = btrfs_bio_destructor;
+#endif
+		btrfs_bio = btrfs_io_bio(bio);
+		btrfs_bio->csum = NULL;
+		btrfs_bio->csum_allocated = NULL;
+		btrfs_bio->end_io = NULL;
+	}
+	return bio;
+}
+
+
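The allocators above hand out bios from btrfs_bioset, which sets aside room in front of every struct bio for the larger struct btrfs_io_bio; btrfs_io_bio() then recovers the wrapper via container_of, as the comment above btrfs_bio_alloc() notes. A userspace sketch of that embedded-wrapper pattern follows; the struct layouts are heavily simplified.

#include <stdio.h>
#include <stddef.h>
#include <stdlib.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Simplified stand-ins: the real structs carry far more state. */
struct bio {
	unsigned long bi_size;
};

struct btrfs_io_bio {
	unsigned char *csum;	/* extra per-bio state the wrapper adds */
	unsigned int mirror_num;
	struct bio bio;		/* embedded bio, as in the real layout */
};

static struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
{
	return container_of(bio, struct btrfs_io_bio, bio);
}

/* Allocate the wrapper but hand back the embedded bio, as the patch does. */
static struct bio *io_bio_alloc(void)
{
	struct btrfs_io_bio *b = calloc(1, sizeof(*b));
	return b ? &b->bio : NULL;
}

int main(void)
{
	struct bio *bio = io_bio_alloc();
	struct btrfs_io_bio *wrapper = btrfs_io_bio(bio);

	wrapper->mirror_num = 2;
	printf("wrapper at %p, bio at %p, mirror_num=%u\n",
	       (void *)wrapper, (void *)bio, wrapper->mirror_num);

	free(wrapper);	/* the wrapper is the allocation we made */
	return 0;
}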
+static int __must_check submit_one_bio(int rw, struct bio *bio,
+				       int mirror_num, unsigned long bio_flags)
 {
 	int ret = 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
@@ -2383,7 +2637,7 @@
 	struct extent_io_tree *tree = bio->bi_private;
 	u64 start;
 
-	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
+	start = page_offset(page) + bvec->bv_offset;
 
 	bio->bi_private = NULL;
 
@@ -2393,7 +2647,7 @@
 		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
 					   mirror_num, bio_flags, start);
 	else
-		submit_bio(rw, bio);
+		btrfsic_submit_bio(rw, bio);
 
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
@@ -2401,6 +2655,19 @@
 	return ret;
 }
 
+static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
+		     unsigned long offset, size_t size, struct bio *bio,
+		     unsigned long bio_flags)
+{
+	int ret = 0;
+	if (tree->ops && tree->ops->merge_bio_hook)
+		ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
+						bio_flags);
+	BUG_ON(ret < 0);
+	return ret;
+
+}
+
 static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			      struct page *page, sector_t sector,
 			      size_t size, unsigned long offset,
@@ -2429,12 +2696,12 @@
 				sector;
 
 		if (prev_bio_flags != bio_flags || !contig ||
-		    (tree->ops && tree->ops->merge_bio_hook &&
-		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
-					       bio_flags)) ||
+		    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
 		    bio_add_page(bio, page, page_size, offset) < page_size) {
 			ret = submit_one_bio(rw, bio, mirror_num,
 					     prev_bio_flags);
+			if (ret < 0)
+				return ret;
 			bio = NULL;
 		} else {
 			return 0;
@@ -2461,6 +2728,18 @@
 	return ret;
 }
 
+static void attach_extent_buffer_page(struct extent_buffer *eb,
+				      struct page *page)
+{
+	if (!PagePrivate(page)) {
+		SetPagePrivate(page);
+		page_cache_get(page);
+		set_page_private(page, (unsigned long)eb);
+	} else {
+		WARN_ON(page->private != (unsigned long)eb);
+	}
+}
+
 void set_page_extent_mapped(struct page *page)
 {
 	if (!PagePrivate(page)) {
@@ -2470,25 +2749,48 @@
 	}
 }
 
-static void set_page_extent_head(struct page *page, unsigned long len)
+static struct extent_map *
+__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
+		 u64 start, u64 len, get_extent_t *get_extent,
+		 struct extent_map **em_cached)
 {
-	WARN_ON(!PagePrivate(page));
-	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
-}
+	struct extent_map *em;
+
+	if (em_cached && *em_cached) {
+		em = *em_cached;
+		if (em->in_tree && start >= em->start &&
+		    start < extent_map_end(em)) {
+			atomic_inc(&em->refs);
+			return em;
+		}
 
+		free_extent_map(em);
+		*em_cached = NULL;
+	}
+
+	em = get_extent(inode, page, pg_offset, start, len, 0);
+	if (em_cached && !IS_ERR_OR_NULL(em)) {
+		BUG_ON(*em_cached);
+		atomic_inc(&em->refs);
+		*em_cached = em;
+	}
+	return em;
+}
 /*
  * basic readpage implementation.  Locked extent state structs are inserted
  * into the tree that are removed when the IO is done (by the end_io
  * handlers)
+ * XXX JDM: This needs looking at to ensure proper page locking
  */
-static int __extent_read_full_page(struct extent_io_tree *tree,
-				   struct page *page,
-				   get_extent_t *get_extent,
-				   struct bio **bio, int mirror_num,
-				   unsigned long *bio_flags)
+static int __do_readpage(struct extent_io_tree *tree,
+			 struct page *page,
+			 get_extent_t *get_extent,
+			 struct extent_map **em_cached,
+			 struct bio **bio, int mirror_num,
+			 unsigned long *bio_flags, int rw)
 {
 	struct inode *inode = page->mapping->host;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 start = page_offset(page);
 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
 	u64 end;
 	u64 cur = start;
@@ -2499,68 +2801,64 @@
 	sector_t sector;
 	struct extent_map *em;
 	struct block_device *bdev;
-	struct btrfs_ordered_extent *ordered;
 	int ret;
 	int nr = 0;
+	int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
 	size_t pg_offset = 0;
 	size_t iosize;
 	size_t disk_io_size;
 	size_t blocksize = inode->i_sb->s_blocksize;
-	unsigned long this_bio_flag = 0;
+	unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
 
 	set_page_extent_mapped(page);
 
+	end = page_end;
 	if (!PageUptodate(page)) {
 		if (cleancache_get_page(page) == 0) {
 			BUG_ON(blocksize != PAGE_SIZE);
+			unlock_extent(tree, start, end);
 			goto out;
 		}
 	}
 
-	end = page_end;
-	while (1) {
-		lock_extent(tree, start, end, GFP_NOFS);
-		ordered = btrfs_lookup_ordered_extent(inode, start);
-		if (!ordered)
-			break;
-		unlock_extent(tree, start, end, GFP_NOFS);
-		btrfs_start_ordered_extent(inode, ordered, 1);
-		btrfs_put_ordered_extent(ordered);
-	}
-
 	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
 		char *userpage;
 		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
 
 		if (zero_offset) {
 			iosize = PAGE_CACHE_SIZE - zero_offset;
-			userpage = kmap_atomic(page, KM_USER0);
+			userpage = kmap_atomic(page);
 			memset(userpage + zero_offset, 0, iosize);
 			flush_dcache_page(page);
-			kunmap_atomic(userpage, KM_USER0);
+			kunmap_atomic(userpage);
 		}
 	}
 	while (cur <= end) {
+		unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+
 		if (cur >= last_byte) {
 			char *userpage;
 			struct extent_state *cached = NULL;
 
 			iosize = PAGE_CACHE_SIZE - pg_offset;
-			userpage = kmap_atomic(page, KM_USER0);
+			userpage = kmap_atomic(page);
 			memset(userpage + pg_offset, 0, iosize);
 			flush_dcache_page(page);
-			kunmap_atomic(userpage, KM_USER0);
+			kunmap_atomic(userpage);
 			set_extent_uptodate(tree, cur, cur + iosize - 1,
 					    &cached, GFP_NOFS);
-			unlock_extent_cached(tree, cur, cur + iosize - 1,
-					     &cached, GFP_NOFS);
+			if (!parent_locked)
+				unlock_extent_cached(tree, cur,
+						     cur + iosize - 1,
+						     &cached, GFP_NOFS);
 			break;
 		}
-		em = get_extent(inode, page, pg_offset, cur,
-				end - cur + 1, 0);
+		em = __get_extent_map(inode, page, pg_offset, cur,
+				      end - cur + 1, get_extent, em_cached);
 		if (IS_ERR_OR_NULL(em)) {
 			SetPageError(page);
-			unlock_extent(tree, cur, end, GFP_NOFS);
+			if (!parent_locked)
+				unlock_extent(tree, cur, end);
 			break;
 		}
 		extent_offset = cur - em->start;
@@ -2568,14 +2866,14 @@
 		BUG_ON(end < cur);
 
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
-			this_bio_flag = EXTENT_BIO_COMPRESSED;
+			this_bio_flag |= EXTENT_BIO_COMPRESSED;
 			extent_set_compress_type(&this_bio_flag,
 						 em->compress_type);
 		}
 
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
 		cur_end = min(extent_map_end(em) - 1, end);
-		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		iosize = ALIGN(iosize, blocksize);
 		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
 			disk_io_size = em->block_len;
 			sector = em->block_start >> 9;
@@ -2595,10 +2893,10 @@
 			char *userpage;
 			struct extent_state *cached = NULL;
 
-			userpage = kmap_atomic(page, KM_USER0);
+			userpage = kmap_atomic(page);
 			memset(userpage + pg_offset, 0, iosize);
 			flush_dcache_page(page);
-			kunmap_atomic(userpage, KM_USER0);
+			kunmap_atomic(userpage);
 
 			set_extent_uptodate(tree, cur, cur + iosize - 1,
 					    &cached, GFP_NOFS);
@@ -2612,7 +2910,8 @@
 		if (test_range_bit(tree, cur, cur_end,
 				   EXTENT_UPTODATE, 1, NULL)) {
 			check_page_uptodate(tree, page);
-			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			if (!parent_locked)
+				unlock_extent(tree, cur, cur + iosize - 1);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
@@ -2622,31 +2921,28 @@
 		 */
 		if (block_start == EXTENT_MAP_INLINE) {
 			SetPageError(page);
-			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
+			if (!parent_locked)
+				unlock_extent(tree, cur, cur + iosize - 1);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
 		}
 
-		ret = 0;
-		if (tree->ops && tree->ops->readpage_io_hook) {
-			ret = tree->ops->readpage_io_hook(page, cur,
-							  cur + iosize - 1);
-		}
-		if (!ret) {
-			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
-			pnr -= page->index;
-			ret = submit_extent_page(READ, tree, page,
+		pnr -= page->index;
+		ret = submit_extent_page(rw, tree, page,
 					 sector, disk_io_size, pg_offset,
 					 bdev, bio, pnr,
 					 end_bio_extent_readpage, mirror_num,
 					 *bio_flags,
 					 this_bio_flag);
+		if (!ret) {
 			nr++;
 			*bio_flags = this_bio_flag;
-		}
-		if (ret)
+		} else {
 			SetPageError(page);
+			if (!parent_locked)
+				unlock_extent(tree, cur, cur + iosize - 1);
+		}
 		cur = cur + iosize;
 		pg_offset += iosize;
 	}
@@ -2659,6 +2955,104 @@
 	return 0;
 }
 
+static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
+					     struct page *pages[], int nr_pages,
+					     u64 start, u64 end,
+					     get_extent_t *get_extent,
+					     struct extent_map **em_cached,
+					     struct bio **bio, int mirror_num,
+					     unsigned long *bio_flags, int rw)
+{
+	struct inode *inode;
+	struct btrfs_ordered_extent *ordered;
+	int index;
+
+	inode = pages[0]->mapping->host;
+	while (1) {
+		lock_extent(tree, start, end);
+		ordered = btrfs_lookup_ordered_range(inode, start,
+						     end - start + 1);
+		if (!ordered)
+			break;
+		unlock_extent(tree, start, end);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+	}
+
+	for (index = 0; index < nr_pages; index++) {
+		__do_readpage(tree, pages[index], get_extent, em_cached, bio,
+			      mirror_num, bio_flags, rw);
+		page_cache_release(pages[index]);
+	}
+}
+
+static void __extent_readpages(struct extent_io_tree *tree,
+			       struct page *pages[],
+			       int nr_pages, get_extent_t *get_extent,
+			       struct extent_map **em_cached,
+			       struct bio **bio, int mirror_num,
+			       unsigned long *bio_flags, int rw)
+{
+	u64 start = 0;
+	u64 end = 0;
+	u64 page_start;
+	int index;
+	int first_index = 0;
+
+	for (index = 0; index < nr_pages; index++) {
+		page_start = page_offset(pages[index]);
+		if (!end) {
+			start = page_start;
+			end = start + PAGE_CACHE_SIZE - 1;
+			first_index = index;
+		} else if (end + 1 == page_start) {
+			end += PAGE_CACHE_SIZE;
+		} else {
+			__do_contiguous_readpages(tree, &pages[first_index],
+						  index - first_index, start,
+						  end, get_extent, em_cached,
+						  bio, mirror_num, bio_flags,
+						  rw);
+			start = page_start;
+			end = start + PAGE_CACHE_SIZE - 1;
+			first_index = index;
+		}
+	}
+
+	if (end)
+		__do_contiguous_readpages(tree, &pages[first_index],
+					  index - first_index, start,
+					  end, get_extent, em_cached, bio,
+					  mirror_num, bio_flags, rw);
+}
+
+static int __extent_read_full_page(struct extent_io_tree *tree,
+				   struct page *page,
+				   get_extent_t *get_extent,
+				   struct bio **bio, int mirror_num,
+				   unsigned long *bio_flags, int rw)
+{
+	struct inode *inode = page->mapping->host;
+	struct btrfs_ordered_extent *ordered;
+	u64 start = page_offset(page);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	int ret;
+
+	while (1) {
+		lock_extent(tree, start, end);
+		ordered = btrfs_lookup_ordered_extent(inode, start);
+		if (!ordered)
+			break;
+		unlock_extent(tree, start, end);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+	}
+
+	ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
+			    bio_flags, rw);
+	return ret;
+}
+
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 			    get_extent_t *get_extent, int mirror_num)
 {
@@ -2667,7 +3061,21 @@
 	int ret;
 
 	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
-				      &bio_flags);
+				      &bio_flags, READ);
+	if (bio)
+		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
+	return ret;
+}
+
+int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
+				 get_extent_t *get_extent, int mirror_num)
+{
+	struct bio *bio = NULL;
+	unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
+	int ret;
+
+	ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
+				      &bio_flags, READ);
 	if (bio)
 		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
 	return ret;
@@ -2695,7 +3103,7 @@
 	struct inode *inode = page->mapping->host;
 	struct extent_page_data *epd = data;
 	struct extent_io_tree *tree = epd->tree;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 start = page_offset(page);
 	u64 delalloc_start;
 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
 	u64 end;
@@ -2744,10 +3152,10 @@
 	if (page->index == end_index) {
 		char *userpage;
 
-		userpage = kmap_atomic(page, KM_USER0);
+		userpage = kmap_atomic(page);
 		memset(userpage + pg_offset, 0,
 		       PAGE_CACHE_SIZE - pg_offset);
-		kunmap_atomic(userpage, KM_USER0);
+		kunmap_atomic(userpage);
 		flush_dcache_page(page);
 	}
 	pg_offset = 0;
@@ -2778,9 +3186,16 @@
 				delalloc_start = delalloc_end + 1;
 				continue;
 			}
-			tree->ops->fill_delalloc(inode, page, delalloc_start,
-						 delalloc_end, &page_started,
-						 &nr_written);
+			ret = tree->ops->fill_delalloc(inode, page,
+						       delalloc_start,
+						       delalloc_end,
+						       &page_started,
+						       &nr_written);
+			/* File system has been set read-only */
+			if (ret) {
+				SetPageError(page);
+				goto done;
+			}
 			/*
 			 * delalloc_end is already one less than the total
 			 * length, so we don't subtract one from
@@ -2817,8 +3232,12 @@
 	if (tree->ops && tree->ops->writepage_start_hook) {
 		ret = tree->ops->writepage_start_hook(page, start,
 						      page_end);
-		if (ret == -EAGAIN) {
-			redirty_page_for_writepage(wbc, page);
+		if (ret) {
+			/* Fixup worker will requeue */
+			if (ret == -EBUSY)
+				wbc->pages_skipped++;
+			else
+				redirty_page_for_writepage(wbc, page);
 			update_nr_written(page, wbc, nr_written);
 			unlock_page(page);
 			ret = 0;
@@ -2860,7 +3279,7 @@
 		BUG_ON(extent_map_end(em) <= cur);
 		BUG_ON(end < cur);
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
-		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
+		iosize = ALIGN(iosize, blocksize);
 		sector = (em->block_start + extent_offset) >> 9;
 		bdev = em->bdev;
 		block_start = em->block_start;
@@ -2918,8 +3337,7 @@
 			if (!PageWriteback(page)) {
 				printk(KERN_ERR "btrfs warning page %lu not "
 				       "writeback, cur %llu end %llu\n",
-				       page->index, (unsigned long long)cur,
-				       (unsigned long long)end);
+				       page->index, cur, end);
 			}
 
 			ret = submit_extent_page(write_flags, tree, page,
@@ -2949,6 +3367,302 @@
 	return 0;
 }
 
+static int eb_wait(void *word)
+{
+	io_schedule();
+	return 0;
+}
+
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
+{
+	wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
+		    TASK_UNINTERRUPTIBLE);
+}
+
+static int lock_extent_buffer_for_io(struct extent_buffer *eb,
+				     struct btrfs_fs_info *fs_info,
+				     struct extent_page_data *epd)
+{
+	unsigned long i, num_pages;
+	int flush = 0;
+	int ret = 0;
+
+	if (!btrfs_try_tree_write_lock(eb)) {
+		flush = 1;
+		flush_write_bio(epd);
+		btrfs_tree_lock(eb);
+	}
+
+	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
+		btrfs_tree_unlock(eb);
+		if (!epd->sync_io)
+			return 0;
+		if (!flush) {
+			flush_write_bio(epd);
+			flush = 1;
+		}
+		while (1) {
+			wait_on_extent_buffer_writeback(eb);
+			btrfs_tree_lock(eb);
+			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
+				break;
+			btrfs_tree_unlock(eb);
+		}
+	}
+
+	/*
+	 * We need to do this to prevent races with callers that check whether
+	 * the eb is under IO, since we can end up having no IO bits set for a
+	 * short period of time.
+	 */
+	spin_lock(&eb->refs_lock);
+	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+		spin_unlock(&eb->refs_lock);
+		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
+				     -eb->len,
+				     fs_info->dirty_metadata_batch);
+		ret = 1;
+	} else {
+		spin_unlock(&eb->refs_lock);
+	}
+
+	btrfs_tree_unlock(eb);
+
+	if (!ret)
+		return ret;
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = extent_buffer_page(eb, i);
+
+		if (!trylock_page(p)) {
+			if (!flush) {
+				flush_write_bio(epd);
+				flush = 1;
+			}
+			lock_page(p);
+		}
+	}
+
+	return ret;
+}
+
+static void end_extent_buffer_writeback(struct extent_buffer *eb)
+{
+	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
+}
+
+static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
+{
+	int uptodate = err == 0;
+	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct extent_buffer *eb;
+	int done;
+
+	do {
+		struct page *page = bvec->bv_page;
+
+		bvec--;
+		eb = (struct extent_buffer *)page->private;
+		BUG_ON(!eb);
+		done = atomic_dec_and_test(&eb->io_pages);
+
+		if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
+			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+			ClearPageUptodate(page);
+			SetPageError(page);
+		}
+
+		end_page_writeback(page);
+
+		if (!done)
+			continue;
+
+		end_extent_buffer_writeback(eb);
+	} while (bvec >= bio->bi_io_vec);
+
+	bio_put(bio);
+
+}
+
+static int write_one_eb(struct extent_buffer *eb,
+			struct btrfs_fs_info *fs_info,
+			struct writeback_control *wbc,
+			struct extent_page_data *epd)
+{
+	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
+	u64 offset = eb->start;
+	unsigned long i, num_pages;
+	unsigned long bio_flags = 0;
+	int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
+	int ret = 0;
+
+	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+	num_pages = num_extent_pages(eb->start, eb->len);
+	atomic_set(&eb->io_pages, num_pages);
+	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
+		bio_flags = EXTENT_BIO_TREE_LOG;
+
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = extent_buffer_page(eb, i);
+
+		clear_page_dirty_for_io(p);
+		set_page_writeback(p);
+		ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
+					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
+					 -1, end_bio_extent_buffer_writepage,
+					 0, epd->bio_flags, bio_flags);
+		epd->bio_flags = bio_flags;
+		if (ret) {
+			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+			SetPageError(p);
+			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
+				end_extent_buffer_writeback(eb);
+			ret = -EIO;
+			break;
+		}
+		offset += PAGE_CACHE_SIZE;
+		update_nr_written(p, wbc, 1);
+		unlock_page(p);
+	}
+
+	if (unlikely(ret)) {
+		for (; i < num_pages; i++) {
+			struct page *p = extent_buffer_page(eb, i);
+			unlock_page(p);
+		}
+	}
+
+	return ret;
+}
+
+int btree_write_cache_pages(struct address_space *mapping,
+				   struct writeback_control *wbc)
+{
+	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
+	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
+	struct extent_buffer *eb, *prev_eb = NULL;
+	struct extent_page_data epd = {
+		.bio = NULL,
+		.tree = tree,
+		.extent_locked = 0,
+		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
+	};
+	int ret = 0;
+	int done = 0;
+	int nr_to_write_done = 0;
+	struct pagevec pvec;
+	int nr_pages;
+	pgoff_t index;
+	pgoff_t end;		/* Inclusive */
+	int scanned = 0;
+	int tag;
+
+	pagevec_init(&pvec, 0);
+	if (wbc->range_cyclic) {
+		index = mapping->writeback_index; /* Start from prev offset */
+		end = -1;
+	} else {
+		index = wbc->range_start >> PAGE_CACHE_SHIFT;
+		end = wbc->range_end >> PAGE_CACHE_SHIFT;
+		scanned = 1;
+	}
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag = PAGECACHE_TAG_TOWRITE;
+	else
+		tag = PAGECACHE_TAG_DIRTY;
+retry:
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		tag_pages_for_writeback(mapping, index, end);
+	while (!done && !nr_to_write_done && (index <= end) &&
+	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
+		unsigned i;
+
+		scanned = 1;
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pvec.pages[i];
+
+			if (!PagePrivate(page))
+				continue;
+
+			if (!wbc->range_cyclic && page->index > end) {
+				done = 1;
+				break;
+			}
+
+			spin_lock(&mapping->private_lock);
+			if (!PagePrivate(page)) {
+				spin_unlock(&mapping->private_lock);
+				continue;
+			}
+
+			eb = (struct extent_buffer *)page->private;
+
+			/*
+			 * Shouldn't happen and normally this would be a BUG_ON
+			 * but no sense in crashing the user's box for something
+			 * we can survive anyway.
+			 */
+			if (!eb) {
+				spin_unlock(&mapping->private_lock);
+				WARN_ON(1);
+				continue;
+			}
+
+			if (eb == prev_eb) {
+				spin_unlock(&mapping->private_lock);
+				continue;
+			}
+
+			ret = atomic_inc_not_zero(&eb->refs);
+			spin_unlock(&mapping->private_lock);
+			if (!ret)
+				continue;
+
+			prev_eb = eb;
+			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
+			if (!ret) {
+				free_extent_buffer(eb);
+				continue;
+			}
+
+			ret = write_one_eb(eb, fs_info, wbc, &epd);
+			if (ret) {
+				done = 1;
+				free_extent_buffer(eb);
+				break;
+			}
+			free_extent_buffer(eb);
+
+			/*
+			 * the filesystem may choose to bump up nr_to_write.
+			 * We have to make sure to honor the new nr_to_write
+			 * at any time
+			 */
+			nr_to_write_done = wbc->nr_to_write <= 0;
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+	}
+	if (!scanned && !done) {
+		/*
+		 * We hit the last page and there is more work to be done: wrap
+		 * back to the start of the file
+		 */
+		scanned = 1;
+		index = 0;
+		goto retry;
+	}
+	flush_write_bio(&epd);
+	return ret;
+}
+
 /**
  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
  * @mapping: address space structure to write
@@ -2970,6 +3684,7 @@
 			     writepage_t writepage, void *data,
 			     void (*flush_fn)(void *))
 {
+	struct inode *inode = mapping->host;
 	int ret = 0;
 	int done = 0;
 	int nr_to_write_done = 0;
@@ -2980,6 +3695,18 @@
 	int scanned = 0;
 	int tag;
 
+	/*
+	 * We have to hold onto the inode so that ordered extents can do their
+	 * work when the IO finishes.  The alternative to this is failing to add
+	 * an ordered extent if the igrab() fails there and that is a huge pain
+	 * to deal with, so instead just hold onto the inode throughout the
+	 * writepages operation.  If it fails here we are freeing up the inode
+	 * anyway and we'd rather not waste our time writing out stuff that is
+	 * going to be truncated anyway.
+	 */
+	if (!igrab(inode))
+		return 0;
+
 	pagevec_init(&pvec, 0);
 	if (wbc->range_cyclic) {
 		index = mapping->writeback_index; /* Start from prev offset */
@@ -3012,15 +3739,9 @@
 			 * swizzled back from swapper_space to tmpfs file
 			 * mapping
 			 */
-			if (tree->ops &&
-			    tree->ops->write_cache_pages_lock_hook) {
-				tree->ops->write_cache_pages_lock_hook(page,
-							       data, flush_fn);
-			} else {
-				if (!trylock_page(page)) {
-					flush_fn(data);
-					lock_page(page);
-				}
+			if (!trylock_page(page)) {
+				flush_fn(data);
+				lock_page(page);
 			}
 
 			if (unlikely(page->mapping != mapping)) {
@@ -3074,16 +3795,21 @@
 		index = 0;
 		goto retry;
 	}
+	btrfs_add_delayed_iput(inode);
 	return ret;
 }
 
 static void flush_epd_write_bio(struct extent_page_data *epd)
 {
 	if (epd->bio) {
+		int rw = WRITE;
+		int ret;
+
 		if (epd->sync_io)
-			submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
-		else
-			submit_one_bio(WRITE, epd->bio, 0, 0);
+			rw = WRITE_SYNC;
+
+		ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
+		BUG_ON(ret < 0); /* -ENOMEM */
 		epd->bio = NULL;
 	}
 }
@@ -3105,6 +3831,7 @@
 		.get_extent = get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 
 	ret = __extent_writepage(page, wbc, &epd);
@@ -3129,6 +3856,7 @@
 		.get_extent = get_extent,
 		.extent_locked = 1,
 		.sync_io = mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 	struct writeback_control wbc_writepages = {
 		.sync_mode	= mode,
@@ -3168,6 +3896,7 @@
 		.get_extent = get_extent,
 		.extent_locked = 0,
 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
+		.bio_flags = 0,
 	};
 
 	ret = extent_write_cache_pages(tree, mapping, wbc,
@@ -3185,22 +3914,39 @@
 	struct bio *bio = NULL;
 	unsigned page_idx;
 	unsigned long bio_flags = 0;
+	struct page *pagepool[16];
+	struct page *page;
+	struct extent_map *em_cached = NULL;
+	int nr = 0;
 
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
-		struct page *page = list_entry(pages->prev, struct page, lru);
+		page = list_entry(pages->prev, struct page, lru);
 
 		prefetchw(&page->flags);
 		list_del(&page->lru);
-		if (!add_to_page_cache_lru(page, mapping,
+		if (add_to_page_cache_lru(page, mapping,
 					page->index, GFP_NOFS)) {
-			__extent_read_full_page(tree, page, get_extent,
-						&bio, 0, &bio_flags);
+			page_cache_release(page);
+			continue;
 		}
-		page_cache_release(page);
-	}
+
+		pagepool[nr++] = page;
+		if (nr < ARRAY_SIZE(pagepool))
+			continue;
+		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
+				   &bio, 0, &bio_flags, READ);
+		nr = 0;
+	}
+	if (nr)
+		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
+				   &bio, 0, &bio_flags, READ);
+
+	if (em_cached)
+		free_extent_map(em_cached);
+
 	BUG_ON(!list_empty(pages));
 	if (bio)
-		submit_one_bio(READ, bio, 0, bio_flags);
+		return submit_one_bio(READ, bio, 0, bio_flags);
 	return 0;
 }
 
@@ -3213,15 +3959,15 @@
 			  struct page *page, unsigned long offset)
 {
 	struct extent_state *cached_state = NULL;
-	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
+	u64 start = page_offset(page);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
 
-	start += (offset + blocksize - 1) & ~(blocksize - 1);
+	start += ALIGN(offset, blocksize);
 	if (start > end)
 		return 0;
 
-	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
+	lock_extent_bits(tree, start, end, 0, &cached_state);
 	wait_on_page_writeback(page);
 	clear_extent_bit(tree, start, end,
 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -3235,11 +3981,11 @@
  * are locked or under IO and drops the related state bits if it is safe
  * to drop the page.
  */
-int try_release_extent_state(struct extent_map_tree *map,
-			     struct extent_io_tree *tree, struct page *page,
-			     gfp_t mask)
+static int try_release_extent_state(struct extent_map_tree *map,
+				    struct extent_io_tree *tree,
+				    struct page *page, gfp_t mask)
 {
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 start = page_offset(page);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	int ret = 1;
 
@@ -3278,7 +4024,7 @@
 			       gfp_t mask)
 {
 	struct extent_map *em;
-	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
+	u64 start = page_offset(page);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 
 	if ((mask & __GFP_WAIT) &&
@@ -3288,7 +4034,7 @@
 			len = end - start + 1;
 			write_lock(&map->lock);
 			em = lookup_extent_mapping(map, start, len);
-			if (IS_ERR_OR_NULL(em)) {
+			if (!em) {
 				write_unlock(&map->lock);
 				break;
 			}
@@ -3336,7 +4082,7 @@
 		len = last - offset;
 		if (len == 0)
 			break;
-		len = (len + sectorsize - 1) & ~(sectorsize - 1);
+		len = ALIGN(len, sectorsize);
 		em = get_extent(inode, NULL, 0, offset, len, 0);
 		if (IS_ERR_OR_NULL(em))
 			return em;
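Several open-coded round-ups of the form (x + size - 1) & ~(size - 1) are replaced by ALIGN(x, size) in the hunks above and below; for a power-of-two size the two expressions are identical. A short userspace check of that equivalence follows; the ALIGN definition mirrors the usual kernel macro for power-of-two alignment.

#include <stdio.h>
#include <stdint.h>

#define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t sectorsize = 4096;
	uint64_t lens[] = { 1, 4095, 4096, 4097, 12288 };

	for (unsigned int i = 0; i < sizeof(lens) / sizeof(lens[0]); i++) {
		/* The open-coded form the patch removes. */
		uint64_t open_coded = (lens[i] + sectorsize - 1) &
				      ~(sectorsize - 1);
		printf("len=%llu  ALIGN=%llu  open-coded=%llu\n",
		       (unsigned long long)lens[i],
		       (unsigned long long)ALIGN(lens[i], sectorsize),
		       (unsigned long long)open_coded);
	}
	return 0;
}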
@@ -3434,8 +4180,8 @@
 		last_for_get_extent = isize;
 	}
 
-	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
-			 &cached_state, GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
+			 &cached_state);
 
 	em = get_extent_skip_holes(inode, start, last_for_get_extent,
 				   get_extent);
@@ -3447,7 +4193,7 @@
 	}
 
 	while (!end) {
-		u64 offset_in_extent;
+		u64 offset_in_extent = 0;
 
 		/* break if the extent we found is outside the range */
 		if (em->start >= max || extent_map_end(em) < off)
@@ -3463,9 +4209,12 @@
 
 		/*
 		 * record the offset from the start of the extent
-		 * for adjusting the disk offset below
+		 * for adjusting the disk offset below.  Only do this if the
+		 * extent isn't compressed since our in ram offset may be past
+		 * what we have actually allocated on disk.
 		 */
-		offset_in_extent = em_start - em->start;
+		if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			offset_in_extent = em_start - em->start;
 		em_end = extent_map_end(em);
 		em_len = em_end - em_start;
 		emflags = em->flags;
@@ -3521,40 +4270,85 @@
 out_free:
 	free_extent_map(em);
 out:
-	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
 			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
-inline struct page *extent_buffer_page(struct extent_buffer *eb,
-					      unsigned long i)
+static void __free_extent_buffer(struct extent_buffer *eb)
 {
-	struct page *p;
-	struct address_space *mapping;
+	btrfs_leak_debug_del(&eb->leak_list);
+	kmem_cache_free(extent_buffer_cache, eb);
+}
 
-	if (i == 0)
-		return eb->first_page;
-	i += eb->start >> PAGE_CACHE_SHIFT;
-	mapping = eb->first_page->mapping;
-	if (!mapping)
-		return NULL;
+static int extent_buffer_under_io(struct extent_buffer *eb)
+{
+	return (atomic_read(&eb->io_pages) ||
+		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+}
 
-	/*
-	 * extent_buffer_page is only called after pinning the page
-	 * by increasing the reference count.  So we know the page must
-	 * be in the radix tree.
-	 */
-	rcu_read_lock();
-	p = radix_tree_lookup(&mapping->page_tree, i);
-	rcu_read_unlock();
+/*
+ * Helper for releasing extent buffer page.
+ */
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
+						unsigned long start_idx)
+{
+	unsigned long index;
+	unsigned long num_pages;
+	struct page *page;
+	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+
+	BUG_ON(extent_buffer_under_io(eb));
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	index = start_idx + num_pages;
+	if (start_idx >= index)
+		return;
 
-	return p;
+	do {
+		index--;
+		page = extent_buffer_page(eb, index);
+		if (page && mapped) {
+			spin_lock(&page->mapping->private_lock);
+			/*
+			 * We do this since we'll remove the pages after we've
+			 * removed the eb from the radix tree, so we could race
+			 * and have this page now attached to the new eb.  So
+			 * only clear page_private if it's still connected to
+			 * this eb.
+			 */
+			if (PagePrivate(page) &&
+			    page->private == (unsigned long)eb) {
+				BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+				BUG_ON(PageDirty(page));
+				BUG_ON(PageWriteback(page));
+				/*
+				 * We need to make sure we haven't been attached
+				 * to a new eb.
+				 */
+				ClearPagePrivate(page);
+				set_page_private(page, 0);
+				/* One for the page private */
+				page_cache_release(page);
+			}
+			spin_unlock(&page->mapping->private_lock);
+
+		}
+		if (page) {
+			/* One for when we alloced the page */
+			page_cache_release(page);
+		}
+	} while (index != start_idx);
 }
 
-inline unsigned long num_extent_pages(u64 start, u64 len)
+/*
+ * Helper for releasing the extent buffer.
+ */
+static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
 {
-	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
-		(start >> PAGE_CACHE_SHIFT);
+	btrfs_release_extent_buffer_page(eb, 0);
+	__free_extent_buffer(eb);
 }
 
 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
@@ -3563,15 +4357,14 @@
 						   gfp_t mask)
 {
 	struct extent_buffer *eb = NULL;
-#if LEAK_DEBUG
-	unsigned long flags;
-#endif
 
 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
 	if (eb == NULL)
 		return NULL;
 	eb->start = start;
 	eb->len = len;
+	eb->tree = tree;
+	eb->bflags = 0;
 	rwlock_init(&eb->lock);
 	atomic_set(&eb->write_locks, 0);
 	atomic_set(&eb->read_locks, 0);
@@ -3579,66 +4372,131 @@
 	atomic_set(&eb->blocking_writers, 0);
 	atomic_set(&eb->spinning_readers, 0);
 	atomic_set(&eb->spinning_writers, 0);
+	eb->lock_nested = 0;
 	init_waitqueue_head(&eb->write_lock_wq);
 	init_waitqueue_head(&eb->read_lock_wq);
 
-#if LEAK_DEBUG
-	spin_lock_irqsave(&leak_lock, flags);
-	list_add(&eb->leak_list, &buffers);
-	spin_unlock_irqrestore(&leak_lock, flags);
-#endif
+	btrfs_leak_debug_add(&eb->leak_list, &buffers);
+
+	spin_lock_init(&eb->refs_lock);
 	atomic_set(&eb->refs, 1);
+	atomic_set(&eb->io_pages, 0);
+
+	/*
+	 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
+	 */
+	BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
+		> MAX_INLINE_EXTENT_BUFFER_SIZE);
+	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
 
 	return eb;
 }
 
-static void __free_extent_buffer(struct extent_buffer *eb)
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
 {
-#if LEAK_DEBUG
-	unsigned long flags;
-	spin_lock_irqsave(&leak_lock, flags);
-	list_del(&eb->leak_list);
-	spin_unlock_irqrestore(&leak_lock, flags);
-#endif
-	kmem_cache_free(extent_buffer_cache, eb);
+	unsigned long i;
+	struct page *p;
+	struct extent_buffer *new;
+	unsigned long num_pages = num_extent_pages(src->start, src->len);
+
+	new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS);
+	if (new == NULL)
+		return NULL;
+
+	for (i = 0; i < num_pages; i++) {
+		p = alloc_page(GFP_NOFS);
+		if (!p) {
+			btrfs_release_extent_buffer(new);
+			return NULL;
+		}
+		attach_extent_buffer_page(new, p);
+		WARN_ON(PageDirty(p));
+		SetPageUptodate(p);
+		new->pages[i] = p;
+	}
+
+	copy_extent_buffer(new, src, 0, 0, src->len);
+	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
+	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
+
+	return new;
 }
 
-/*
- * Helper for releasing extent buffer page.
- */
-static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
-						unsigned long start_idx)
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
 {
-	unsigned long index;
-	struct page *page;
+	struct extent_buffer *eb;
+	unsigned long num_pages = num_extent_pages(0, len);
+	unsigned long i;
 
-	if (!eb->first_page)
-		return;
+	eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS);
+	if (!eb)
+		return NULL;
 
-	index = num_extent_pages(eb->start, eb->len);
-	if (start_idx >= index)
+	for (i = 0; i < num_pages; i++) {
+		eb->pages[i] = alloc_page(GFP_NOFS);
+		if (!eb->pages[i])
+			goto err;
+	}
+	set_extent_buffer_uptodate(eb);
+	btrfs_set_header_nritems(eb, 0);
+	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+
+	return eb;
+err:
+	for (; i > 0; i--)
+		__free_page(eb->pages[i - 1]);
+	__free_extent_buffer(eb);
+	return NULL;
+}
+
+static void check_buffer_tree_ref(struct extent_buffer *eb)
+{
+	int refs;
+	/* the ref bit is tricky.  We have to make sure it is set
+	 * if we have the buffer dirty.   Otherwise the
+	 * code to free a buffer can end up dropping a dirty
+	 * page
+	 *
+	 * Once the ref bit is set, it won't go away while the
+	 * buffer is dirty or in writeback, and it also won't
+	 * go away while we have the reference count on the
+	 * eb bumped.
+	 *
+	 * We can't just set the ref bit without bumping the
+	 * ref on the eb because free_extent_buffer might
+	 * see the ref bit and try to clear it.  If this happens
+	 * free_extent_buffer might end up dropping our original
+	 * ref by mistake and freeing the page before we are able
+	 * to add one more ref.
+	 *
+	 * So bump the ref count first, then set the bit.  If someone
+	 * beat us to it, drop the ref we added.
+	 */
+	refs = atomic_read(&eb->refs);
+	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
 		return;
 
-	do {
-		index--;
-		page = extent_buffer_page(eb, index);
-		if (page)
-			page_cache_release(page);
-	} while (index != start_idx);
+	spin_lock(&eb->refs_lock);
+	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+		atomic_inc(&eb->refs);
+	spin_unlock(&eb->refs_lock);
 }
 
-/*
- * Helper for releasing the extent buffer.
- */
-static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
+static void mark_extent_buffer_accessed(struct extent_buffer *eb)
 {
-	btrfs_release_extent_buffer_page(eb, 0);
-	__free_extent_buffer(eb);
+	unsigned long num_pages, i;
+
+	check_buffer_tree_ref(eb);
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	for (i = 0; i < num_pages; i++) {
+		struct page *p = extent_buffer_page(eb, i);
+		mark_page_accessed(p);
+	}
 }
 
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
-					  u64 start, unsigned long len,
-					  struct page *page0)
+					  u64 start, unsigned long len)
 {
 	unsigned long num_pages = num_extent_pages(start, len);
 	unsigned long i;
@@ -3654,7 +4512,7 @@
 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
 	if (eb && atomic_inc_not_zero(&eb->refs)) {
 		rcu_read_unlock();
-		mark_page_accessed(eb->first_page);
+		mark_extent_buffer_accessed(eb);
 		return eb;
 	}
 	rcu_read_unlock();
@@ -3663,32 +4521,42 @@
 	if (!eb)
 		return NULL;
 
-	if (page0) {
-		eb->first_page = page0;
-		i = 1;
-		index++;
-		page_cache_get(page0);
-		mark_page_accessed(page0);
-		set_page_extent_mapped(page0);
-		set_page_extent_head(page0, len);
-		uptodate = PageUptodate(page0);
-	} else {
-		i = 0;
-	}
-	for (; i < num_pages; i++, index++) {
+	for (i = 0; i < num_pages; i++, index++) {
 		p = find_or_create_page(mapping, index, GFP_NOFS);
-		if (!p) {
-			WARN_ON(1);
+		if (!p)
 			goto free_eb;
-		}
-		set_page_extent_mapped(p);
+
+		spin_lock(&mapping->private_lock);
+		if (PagePrivate(p)) {
+			/*
+			 * We could have already allocated an eb for this page
+			 * and attached one so let's see if we can get a ref on
+			 * the existing eb, and if we can we know it's good and
+			 * we can just return that one, else we know we can just
+			 * overwrite page->private.
+			 */
+			exists = (struct extent_buffer *)p->private;
+			if (atomic_inc_not_zero(&exists->refs)) {
+				spin_unlock(&mapping->private_lock);
+				unlock_page(p);
+				page_cache_release(p);
+				mark_extent_buffer_accessed(exists);
+				goto free_eb;
+			}
+
+			/*
+			 * Do this so attach doesn't complain and we need to
+			 * drop the ref the old guy had.
+			 */
+			ClearPagePrivate(p);
+			WARN_ON(PageDirty(p));
+			page_cache_release(p);
+		}
+		attach_extent_buffer_page(eb, p);
+		spin_unlock(&mapping->private_lock);
+		WARN_ON(PageDirty(p));
 		mark_page_accessed(p);
-		if (i == 0) {
-			eb->first_page = p;
-			set_page_extent_head(p, len);
-		} else {
-			set_page_private(p, EXTENT_PAGE_PRIVATE);
-		}
+		eb->pages[i] = p;
 		if (!PageUptodate(p))
 			uptodate = 0;
 
@@ -3696,12 +4564,10 @@
 		 * see below about how we avoid a nasty race with release page
 		 * and why we unlock later
 		 */
-		if (i != 0)
-			unlock_page(p);
 	}
 	if (uptodate)
 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
+again:
 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
 	if (ret)
 		goto free_eb;
@@ -3711,14 +4577,19 @@
 	if (ret == -EEXIST) {
 		exists = radix_tree_lookup(&tree->buffer,
 						start >> PAGE_CACHE_SHIFT);
-		/* add one reference for the caller */
-		atomic_inc(&exists->refs);
+		if (!atomic_inc_not_zero(&exists->refs)) {
+			spin_unlock(&tree->buffer_lock);
+			radix_tree_preload_end();
+			exists = NULL;
+			goto again;
+		}
 		spin_unlock(&tree->buffer_lock);
 		radix_tree_preload_end();
+		mark_extent_buffer_accessed(exists);
 		goto free_eb;
 	}
 	/* add one reference for the tree */
-	atomic_inc(&eb->refs);
+	check_buffer_tree_ref(eb);
 	spin_unlock(&tree->buffer_lock);
 	radix_tree_preload_end();
 
@@ -3731,18 +4602,22 @@
 	 * after the extent buffer is in the radix tree so
 	 * it doesn't get lost
 	 */
-	set_page_extent_mapped(eb->first_page);
-	set_page_extent_head(eb->first_page, eb->len);
-	if (!page0)
-		unlock_page(eb->first_page);
+	SetPageChecked(eb->pages[0]);
+	for (i = 1; i < num_pages; i++) {
+		p = extent_buffer_page(eb, i);
+		ClearPageChecked(p);
+		unlock_page(p);
+	}
+	unlock_page(eb->pages[0]);
 	return eb;
 
 free_eb:
-	if (eb->first_page && !page0)
-		unlock_page(eb->first_page);
+	for (i = 0; i < num_pages; i++) {
+		if (eb->pages[i])
+			unlock_page(eb->pages[i]);
+	}
 
-	if (!atomic_dec_and_test(&eb->refs))
-		return exists;
+	WARN_ON(!atomic_dec_and_test(&eb->refs));
 	btrfs_release_extent_buffer(eb);
 	return exists;
 }
@@ -3756,7 +4631,7 @@
 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
 	if (eb && atomic_inc_not_zero(&eb->refs)) {
 		rcu_read_unlock();
-		mark_page_accessed(eb->first_page);
+		mark_extent_buffer_accessed(eb);
 		return eb;
 	}
 	rcu_read_unlock();
@@ -3764,19 +4639,91 @@
 	return NULL;
 }
 
+static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
+{
+	struct extent_buffer *eb =
+			container_of(head, struct extent_buffer, rcu_head);
+
+	__free_extent_buffer(eb);
+}
+
+/* Expects to have eb->eb_lock already held */
+static int release_extent_buffer(struct extent_buffer *eb)
+{
+	WARN_ON(atomic_read(&eb->refs) == 0);
+	if (atomic_dec_and_test(&eb->refs)) {
+		if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
+			spin_unlock(&eb->refs_lock);
+		} else {
+			struct extent_io_tree *tree = eb->tree;
+
+			spin_unlock(&eb->refs_lock);
+
+			spin_lock(&tree->buffer_lock);
+			radix_tree_delete(&tree->buffer,
+					  eb->start >> PAGE_CACHE_SHIFT);
+			spin_unlock(&tree->buffer_lock);
+		}
+
+		/* Should be safe to release our pages at this point */
+		btrfs_release_extent_buffer_page(eb, 0);
+		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
+		return 1;
+	}
+	spin_unlock(&eb->refs_lock);
+
+	return 0;
+}
+
 void free_extent_buffer(struct extent_buffer *eb)
 {
+	int refs;
+	int old;
 	if (!eb)
 		return;
 
-	if (!atomic_dec_and_test(&eb->refs))
+	while (1) {
+		refs = atomic_read(&eb->refs);
+		if (refs <= 3)
+			break;
+		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
+		if (old == refs)
+			return;
+	}
+
+	spin_lock(&eb->refs_lock);
+	if (atomic_read(&eb->refs) == 2 &&
+	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
+		atomic_dec(&eb->refs);
+
+	if (atomic_read(&eb->refs) == 2 &&
+	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
+	    !extent_buffer_under_io(eb) &&
+	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+		atomic_dec(&eb->refs);
+
+	/*
+	 * I know this is terrible, but it's temporary until we stop tracking
+	 * the uptodate bits and such for the extent buffers.
+	 */
+	release_extent_buffer(eb);
+}
+
+void free_extent_buffer_stale(struct extent_buffer *eb)
+{
+	if (!eb)
 		return;
 
-	WARN_ON(1);
+	spin_lock(&eb->refs_lock);
+	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
+
+	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
+	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
+		atomic_dec(&eb->refs);
+	release_extent_buffer(eb);
 }
 
-int clear_extent_buffer_dirty(struct extent_io_tree *tree,
-			      struct extent_buffer *eb)
+void clear_extent_buffer_dirty(struct extent_buffer *eb)
 {
 	unsigned long i;
 	unsigned long num_pages;
@@ -3792,10 +4739,6 @@
 		lock_page(page);
 		WARN_ON(!PagePrivate(page));
 
-		set_page_extent_mapped(page);
-		if (i == 0)
-			set_page_extent_head(page, eb->len);
-
 		clear_page_dirty_for_io(page);
 		spin_lock_irq(&page->mapping->tree_lock);
 		if (!PageDirty(page)) {
@@ -3807,54 +4750,36 @@
 		ClearPageError(page);
 		unlock_page(page);
 	}
-	return 0;
+	WARN_ON(atomic_read(&eb->refs) == 0);
 }
 
-int set_extent_buffer_dirty(struct extent_io_tree *tree,
-			     struct extent_buffer *eb)
+int set_extent_buffer_dirty(struct extent_buffer *eb)
 {
 	unsigned long i;
 	unsigned long num_pages;
 	int was_dirty = 0;
 
+	check_buffer_tree_ref(eb);
+
 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+
 	num_pages = num_extent_pages(eb->start, eb->len);
+	WARN_ON(atomic_read(&eb->refs) == 0);
+	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+
 	for (i = 0; i < num_pages; i++)
-		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
+		set_page_dirty(extent_buffer_page(eb, i));
 	return was_dirty;
 }
 
-static int __eb_straddles_pages(u64 start, u64 len)
-{
-	if (len < PAGE_CACHE_SIZE)
-		return 1;
-	if (start & (PAGE_CACHE_SIZE - 1))
-		return 1;
-	if ((start + len) & (PAGE_CACHE_SIZE - 1))
-		return 1;
-	return 0;
-}
-
-static int eb_straddles_pages(struct extent_buffer *eb)
-{
-	return __eb_straddles_pages(eb->start, eb->len);
-}
-
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				struct extent_buffer *eb,
-				struct extent_state **cached_state)
+int clear_extent_buffer_uptodate(struct extent_buffer *eb)
 {
 	unsigned long i;
 	struct page *page;
 	unsigned long num_pages;
 
-	num_pages = num_extent_pages(eb->start, eb->len);
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
-
-	if (eb_straddles_pages(eb)) {
-		clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-				      cached_state, GFP_NOFS);
-	}
+	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if (page)
@@ -3863,90 +4788,24 @@
 	return 0;
 }
 
-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
-				struct extent_buffer *eb)
+int set_extent_buffer_uptodate(struct extent_buffer *eb)
 {
 	unsigned long i;
 	struct page *page;
 	unsigned long num_pages;
 
+	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 	num_pages = num_extent_pages(eb->start, eb->len);
-
-	if (eb_straddles_pages(eb)) {
-		set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-				    NULL, GFP_NOFS);
-	}
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
-		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-		    ((i == num_pages - 1) &&
-		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-			check_page_uptodate(tree, page);
-			continue;
-		}
 		SetPageUptodate(page);
 	}
 	return 0;
 }
 
-int extent_range_uptodate(struct extent_io_tree *tree,
-			  u64 start, u64 end)
+int extent_buffer_uptodate(struct extent_buffer *eb)
 {
-	struct page *page;
-	int ret;
-	int pg_uptodate = 1;
-	int uptodate;
-	unsigned long index;
-
-	if (__eb_straddles_pages(start, end - start + 1)) {
-		ret = test_range_bit(tree, start, end,
-				     EXTENT_UPTODATE, 1, NULL);
-		if (ret)
-			return 1;
-	}
-	while (start <= end) {
-		index = start >> PAGE_CACHE_SHIFT;
-		page = find_get_page(tree->mapping, index);
-		uptodate = PageUptodate(page);
-		page_cache_release(page);
-		if (!uptodate) {
-			pg_uptodate = 0;
-			break;
-		}
-		start += PAGE_CACHE_SIZE;
-	}
-	return pg_uptodate;
-}
-
-int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb,
-			   struct extent_state *cached_state)
-{
-	int ret = 0;
-	unsigned long num_pages;
-	unsigned long i;
-	struct page *page;
-	int pg_uptodate = 1;
-
-	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
-		return 1;
-
-	if (eb_straddles_pages(eb)) {
-		ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-				   EXTENT_UPTODATE, 1, cached_state);
-		if (ret)
-			return ret;
-	}
-
-	num_pages = num_extent_pages(eb->start, eb->len);
-	for (i = 0; i < num_pages; i++) {
-		page = extent_buffer_page(eb, i);
-		if (!PageUptodate(page)) {
-			pg_uptodate = 0;
-			break;
-		}
-	}
-	return pg_uptodate;
+	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 }
 
 int read_extent_buffer_pages(struct extent_io_tree *tree,
@@ -3960,21 +4819,14 @@
 	int ret = 0;
 	int locked_pages = 0;
 	int all_uptodate = 1;
-	int inc_all_pages = 0;
 	unsigned long num_pages;
+	unsigned long num_reads = 0;
 	struct bio *bio = NULL;
 	unsigned long bio_flags = 0;
 
 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
 		return 0;
 
-	if (eb_straddles_pages(eb)) {
-		if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-				   EXTENT_UPTODATE, 1, NULL)) {
-			return 0;
-		}
-	}
-
 	if (start) {
 		WARN_ON(start < eb->start);
 		start_i = (start >> PAGE_CACHE_SHIFT) -
@@ -3993,8 +4845,10 @@
 			lock_page(page);
 		}
 		locked_pages++;
-		if (!PageUptodate(page))
+		if (!PageUptodate(page)) {
+			num_reads++;
 			all_uptodate = 0;
+		}
 	}
 	if (all_uptodate) {
 		if (start_i == 0)
@@ -4002,24 +4856,17 @@
 		goto unlock_exit;
 	}
 
+	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
+	eb->read_mirror = 0;
+	atomic_set(&eb->io_pages, num_reads);
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
-
-		WARN_ON(!PagePrivate(page));
-
-		set_page_extent_mapped(page);
-		if (i == 0)
-			set_page_extent_head(page, eb->len);
-
-		if (inc_all_pages)
-			page_cache_get(page);
 		if (!PageUptodate(page)) {
-			if (start_i == 0)
-				inc_all_pages = 1;
 			ClearPageError(page);
 			err = __extent_read_full_page(tree, page,
 						      get_extent, &bio,
-						      mirror_num, &bio_flags);
+						      mirror_num, &bio_flags,
+						      READ | REQ_META);
 			if (err)
 				ret = err;
 		} else {
@@ -4027,8 +4874,12 @@
 		}
 	}
 
-	if (bio)
-		submit_one_bio(READ, bio, mirror_num, bio_flags);
+	if (bio) {
+		err = submit_one_bio(READ | REQ_META, bio, mirror_num,
+				     bio_flags);
+		if (err)
+			return err;
+	}
 
 	if (ret || wait != WAIT_COMPLETE)
 		return ret;
@@ -4040,8 +4891,6 @@
 			ret = -EIO;
 	}
 
-	if (!ret)
-		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 	return ret;
 
 unlock_exit:
@@ -4070,7 +4919,7 @@
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
 
 	while (len > 0) {
 		page = extent_buffer_page(eb, i);
@@ -4111,10 +4960,9 @@
 	}
 
 	if (start + min_len > eb->len) {
-		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
-		       "wanted %lu %lu\n", (unsigned long long)eb->start,
-		       eb->len, start, min_len);
-		WARN_ON(1);
+		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
+		       "wanted %lu %lu\n",
+		       eb->start, eb->len, start, min_len);
 		return -EINVAL;
 	}
 
@@ -4125,7 +4973,9 @@
 	return 0;
 }
 
-int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+#ifdef MY_ABC_HERE
+/* Duplicated from memcmp_extent_buffer() so we don't add a caseless/exact check inside its loop. */
+int memcmp_caseless_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 			  unsigned long start,
 			  unsigned long len)
 {
@@ -4149,6 +4999,43 @@
 		cur = min(len, (PAGE_CACHE_SIZE - offset));
 
 		kaddr = page_address(page);
+		ret = SYNOUnicodeUTF8Strcmp(ptr, kaddr + offset, cur, cur, 0);
+		if (ret)
+			break;
+
+		ptr += cur;
+		len -= cur;
+		offset = 0;
+		i++;
+	}
+	return ret;
+}
+#endif
+
+int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+			  unsigned long start,
+			  unsigned long len)
+{
+	size_t cur;
+	size_t offset;
+	struct page *page;
+	char *kaddr;
+	char *ptr = (char *)ptrv;
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
+	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
+	int ret = 0;
+
+	WARN_ON(start > eb->len);
+	WARN_ON(start + len > eb->start + eb->len);
+
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
+
+	while (len > 0) {
+		page = extent_buffer_page(eb, i);
+
+		cur = min(len, (PAGE_CACHE_SIZE - offset));
+
+		kaddr = page_address(page);
 		ret = memcmp(ptr, kaddr + offset, cur);
 		if (ret)
 			break;
@@ -4175,7 +5062,7 @@
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
 
 	while (len > 0) {
 		page = extent_buffer_page(eb, i);
@@ -4205,7 +5092,7 @@
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
 
 	while (len > 0) {
 		page = extent_buffer_page(eb, i);
@@ -4236,7 +5123,7 @@
 	WARN_ON(src->len != dst_len);
 
 	offset = (start_offset + dst_offset) &
-		((unsigned long)PAGE_CACHE_SIZE - 1);
+		(PAGE_CACHE_SIZE - 1);
 
 	while (len > 0) {
 		page = extent_buffer_page(dst, i);
@@ -4283,15 +5170,20 @@
 {
 	char *dst_kaddr = page_address(dst_page);
 	char *src_kaddr;
+	int must_memmove = 0;
 
 	if (dst_page != src_page) {
 		src_kaddr = page_address(src_page);
 	} else {
 		src_kaddr = dst_kaddr;
-		BUG_ON(areas_overlap(src_off, dst_off, len));
+		if (areas_overlap(src_off, dst_off, len))
+			must_memmove = 1;
 	}
 
-	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
+	if (must_memmove)
+		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
+	else
+		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
 }
 
 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
@@ -4317,9 +5209,9 @@
 
 	while (len > 0) {
 		dst_off_in_page = (start_offset + dst_offset) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
+			(PAGE_CACHE_SIZE - 1);
 		src_off_in_page = (start_offset + src_offset) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
+			(PAGE_CACHE_SIZE - 1);
 
 		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
 		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
@@ -4361,7 +5253,7 @@
 		       "len %lu len %lu\n", dst_offset, len, dst->len);
 		BUG_ON(1);
 	}
-	if (!areas_overlap(src_offset, dst_offset, len)) {
+	if (dst_offset < src_offset) {
 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
 		return;
 	}
@@ -4370,9 +5262,9 @@
 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
 
 		dst_off_in_page = (start_offset + dst_end) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
+			(PAGE_CACHE_SIZE - 1);
 		src_off_in_page = (start_offset + src_end) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
+			(PAGE_CACHE_SIZE - 1);
 
 		cur = min_t(unsigned long, len, src_off_in_page + 1);
 		cur = min(cur, dst_off_in_page + 1);
@@ -4387,47 +5279,44 @@
 	}
 }
 
-static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
-{
-	struct extent_buffer *eb =
-			container_of(head, struct extent_buffer, rcu_head);
-
-	btrfs_release_extent_buffer(eb);
-}
-
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
+int try_release_extent_buffer(struct page *page)
 {
-	u64 start = page_offset(page);
 	struct extent_buffer *eb;
-	int ret = 1;
 
-	spin_lock(&tree->buffer_lock);
-	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-	if (!eb) {
-		spin_unlock(&tree->buffer_lock);
-		return ret;
+	/*
+	 * We need to make sure nobody is attaching this page to an eb right
+	 * now.
+	 */
+	spin_lock(&page->mapping->private_lock);
+	if (!PagePrivate(page)) {
+		spin_unlock(&page->mapping->private_lock);
+		return 1;
 	}
 
-	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
-		ret = 0;
-		goto out;
-	}
+	eb = (struct extent_buffer *)page->private;
+	BUG_ON(!eb);
 
 	/*
-	 * set @eb->refs to 0 if it is already 1, and then release the @eb.
-	 * Or go back.
+	 * This is a little awful but should be ok, we need to make sure that
+	 * the eb doesn't disappear out from under us while we're looking at
+	 * this page.
 	 */
-	if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
-		ret = 0;
-		goto out;
+	spin_lock(&eb->refs_lock);
+	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
+		spin_unlock(&eb->refs_lock);
+		spin_unlock(&page->mapping->private_lock);
+		return 0;
 	}
+	spin_unlock(&page->mapping->private_lock);
 
-	radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
-out:
-	spin_unlock(&tree->buffer_lock);
+	/*
+	 * If tree ref isn't set then we know the ref on this eb is a real ref,
+	 * so just return, this page will likely be freed soon anyway.
+	 */
+	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
+		spin_unlock(&eb->refs_lock);
+		return 0;
+	}
 
-	/* at this point we can safely release the extent buffer */
-	if (atomic_read(&eb->refs) == 0)
-		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
-	return ret;
+	return release_extent_buffer(eb);
 }
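
The extent_io.c hunks above rework extent-buffer lifetime: pages now live in the
embedded eb->pages[] array, uptodate/dirty state is tracked only through eb->bflags,
and freeing is driven by eb->refs_lock plus the EXTENT_BUFFER_TREE_REF bit instead of
the old EXTENT_UPTODATE range bits.  A minimal caller sketch against the prototypes in
extent_io.h below; read_eb_demo() is illustrative only and not part of the patch:

static struct extent_buffer *read_eb_demo(struct extent_io_tree *tree, u64 start,
					  unsigned long len, get_extent_t *get_extent)
{
	struct extent_buffer *eb;
	int ret;

	/* One ref for the caller; alloc_extent_buffer() also gives the radix tree its own ref. */
	eb = alloc_extent_buffer(tree, start, len);
	if (!eb)
		return NULL;

	/* Submit reads for any non-uptodate pages and wait for them to complete. */
	ret = read_extent_buffer_pages(tree, eb, 0, WAIT_COMPLETE, get_extent, 0);
	if (ret || !extent_buffer_uptodate(eb)) {
		/* Drop the caller ref; the tree-owned ref is dropped later, e.g. from try_release_extent_buffer(). */
		free_extent_buffer(eb);
		return NULL;
	}
	return eb;
}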
diff -ur a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
--- a/fs/btrfs/extent_io.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/extent_io.h	2014-02-17 11:56:58.000000000 +0100
@@ -19,6 +19,7 @@
 #define EXTENT_FIRST_DELALLOC (1 << 12)
 #define EXTENT_NEED_WAIT (1 << 13)
 #define EXTENT_DAMAGED (1 << 14)
+#define EXTENT_NORESERVE (1 << 15)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
@@ -27,6 +28,8 @@
  * type for this bio
  */
 #define EXTENT_BIO_COMPRESSED 1
+#define EXTENT_BIO_TREE_LOG 2
+#define EXTENT_BIO_PARENT_LOCKED 4
 #define EXTENT_BIO_FLAG_SHIFT 16
 
 /* these are bit numbers for test/set bit */
@@ -35,16 +38,18 @@
 #define EXTENT_BUFFER_DIRTY 2
 #define EXTENT_BUFFER_CORRUPT 3
 #define EXTENT_BUFFER_READAHEAD 4	/* this got triggered by readahead */
+#define EXTENT_BUFFER_TREE_REF 5
+#define EXTENT_BUFFER_STALE 6
+#define EXTENT_BUFFER_WRITEBACK 7
+#define EXTENT_BUFFER_IOERR 8
+#define EXTENT_BUFFER_DUMMY 9
 
 /* these are flags for extent_clear_unlock_delalloc */
-#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
-#define EXTENT_CLEAR_UNLOCK	 0x2
-#define EXTENT_CLEAR_DELALLOC	 0x4
-#define EXTENT_CLEAR_DIRTY	 0x8
-#define EXTENT_SET_WRITEBACK	 0x10
-#define EXTENT_END_WRITEBACK	 0x20
-#define EXTENT_SET_PRIVATE2	 0x40
-#define EXTENT_CLEAR_ACCOUNTING  0x80
+#define PAGE_UNLOCK		(1 << 0)
+#define PAGE_CLEAR_DIRTY	(1 << 1)
+#define PAGE_SET_WRITEBACK	(1 << 2)
+#define PAGE_END_WRITEBACK	(1 << 3)
+#define PAGE_SET_PRIVATE2	(1 << 4)
 
 /*
  * page->private values.  Every page that is controlled by the extent
@@ -54,6 +59,8 @@
 #define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3
 
 struct extent_state;
+struct btrfs_root;
+struct btrfs_io_bio;
 
 typedef	int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
 				       struct bio *bio, int mirror_num,
@@ -65,31 +72,24 @@
 	int (*writepage_start_hook)(struct page *page, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
 	extent_submit_bio_hook_t *submit_bio_hook;
-	int (*merge_bio_hook)(struct page *page, unsigned long offset,
+	int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset,
 			      size_t size, struct bio *bio,
 			      unsigned long bio_flags);
-	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
-	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
-				       u64 start, u64 end, int failed_mirror,
-				       struct extent_state *state);
-	int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
-					u64 start, u64 end,
-				       struct extent_state *state);
-	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
-				    struct extent_state *state);
+	int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
+	int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
+				    struct page *page, u64 start, u64 end,
+				    int mirror);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
 	void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
-			     int *bits);
+			     unsigned long *bits);
 	void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
-			       int *bits);
+			       unsigned long *bits);
 	void (*merge_extent_hook)(struct inode *inode,
 				  struct extent_state *new,
 				  struct extent_state *other);
 	void (*split_extent_hook)(struct inode *inode,
 				  struct extent_state *orig, u64 split);
-	int (*write_cache_pages_lock_hook)(struct page *page, void *data,
-					   void (*flush_fn)(void *));
 };
 
 struct extent_io_tree {
@@ -97,6 +97,7 @@
 	struct radix_tree_root buffer;
 	struct address_space *mapping;
 	u64 dirty_bytes;
+	int track_uptodate;
 	spinlock_t lock;
 	spinlock_t buffer_lock;
 	struct extent_io_ops *ops;
@@ -116,19 +117,26 @@
 	/* for use by the FS */
 	u64 private;
 
+#ifdef CONFIG_BTRFS_DEBUG
 	struct list_head leak_list;
+#endif
 };
 
+#define INLINE_EXTENT_BUFFER_PAGES 16
+#define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_CACHE_SIZE)
 struct extent_buffer {
 	u64 start;
 	unsigned long len;
 	unsigned long map_start;
 	unsigned long map_len;
-	struct page *first_page;
 	unsigned long bflags;
-	struct list_head leak_list;
-	struct rcu_head rcu_head;
+	struct extent_io_tree *tree;
+	spinlock_t refs_lock;
 	atomic_t refs;
+	atomic_t io_pages;
+	int read_mirror;
+	struct rcu_head rcu_head;
+	pid_t lock_owner;
 
 	/* count of read lock holders on the extent buffer */
 	atomic_t write_locks;
@@ -137,6 +145,7 @@
 	atomic_t blocking_readers;
 	atomic_t spinning_readers;
 	atomic_t spinning_writers;
+	int lock_nested;
 
 	/* protects write locks */
 	rwlock_t lock;
@@ -150,6 +159,11 @@
 	 * to unlock
 	 */
 	wait_queue_head_t read_lock_wq;
+	wait_queue_head_t lock_wq;
+	struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+#ifdef CONFIG_BTRFS_DEBUG
+	struct list_head leak_list;
+#endif
 };
 
 static inline void extent_set_compress_type(unsigned long *bio_flags,
@@ -176,20 +190,18 @@
 int try_release_extent_mapping(struct extent_map_tree *map,
 			       struct extent_io_tree *tree, struct page *page,
 			       gfp_t mask);
-int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page);
-int try_release_extent_state(struct extent_map_tree *map,
-			     struct extent_io_tree *tree, struct page *page,
-			     gfp_t mask);
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int try_release_extent_buffer(struct page *page);
+int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		     int bits, struct extent_state **cached, gfp_t mask);
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+		     unsigned long bits, struct extent_state **cached);
+int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
 			 struct extent_state **cached, gfp_t mask);
-int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
-		    gfp_t mask);
+int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent, int mirror_num);
+int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
+				 get_extent_t *get_extent, int mirror_num);
 int __init extent_io_init(void);
 void extent_io_exit(void);
 
@@ -199,19 +211,22 @@
 
 void free_extent_state(struct extent_state *state);
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   int bits, int filled, struct extent_state *cached_state);
+		   unsigned long bits, int filled,
+		   struct extent_state *cached_state);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		      int bits, gfp_t mask);
+		      unsigned long bits, gfp_t mask);
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		     int bits, int wake, int delete, struct extent_state **cached,
-		     gfp_t mask);
+		     unsigned long bits, int wake, int delete,
+		     struct extent_state **cached, gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		    int bits, gfp_t mask);
+		    unsigned long bits, gfp_t mask);
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		   int bits, int exclusive_bits, u64 *failed_start,
+		   unsigned long bits, u64 *failed_start,
 		   struct extent_state **cached_state, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask);
+int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
+			  struct extent_state **cached_state, gfp_t mask);
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		   gfp_t mask);
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -219,13 +234,15 @@
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-		       int bits, int clear_bits, gfp_t mask);
+		       unsigned long bits, unsigned long clear_bits,
+		       struct extent_state **cached_state, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask);
+int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
+		      struct extent_state **cached_state, gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-			  u64 *start_ret, u64 *end_ret, int bits);
-struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
-						 u64 start, int bits);
+			  u64 *start_ret, u64 *end_ret, unsigned long bits,
+			  struct extent_state **cached_state);
 int extent_invalidatepage(struct extent_io_tree *tree,
 			  struct page *page, unsigned long offset);
 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
@@ -238,30 +255,44 @@
 		      struct address_space *mapping,
 		      get_extent_t *get_extent,
 		      struct writeback_control *wbc);
+int btree_write_cache_pages(struct address_space *mapping,
+			    struct writeback_control *wbc);
 int extent_readpages(struct extent_io_tree *tree,
 		     struct address_space *mapping,
 		     struct list_head *pages, unsigned nr_pages,
 		     get_extent_t get_extent);
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len, get_extent_t *get_extent);
-int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
 void set_page_extent_mapped(struct page *page);
 
 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
-					  u64 start, unsigned long len,
-					  struct page *page0);
+					  u64 start, unsigned long len);
+struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len);
+struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 					 u64 start, unsigned long len);
 void free_extent_buffer(struct extent_buffer *eb);
+void free_extent_buffer_stale(struct extent_buffer *eb);
 #define WAIT_NONE	0
 #define WAIT_COMPLETE	1
 #define WAIT_PAGE_LOCK	2
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb, u64 start, int wait,
 			     get_extent_t *get_extent, int mirror_num);
-unsigned long num_extent_pages(u64 start, u64 len);
-struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
+void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
+
+static inline unsigned long num_extent_pages(u64 start, u64 len)
+{
+	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
+		(start >> PAGE_CACHE_SHIFT);
+}
+
+static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+					      unsigned long i)
+{
+	return eb->pages[i];
+}
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
 {
@@ -271,6 +302,11 @@
 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 			  unsigned long start,
 			  unsigned long len);
+#ifdef MY_ABC_HERE
+int memcmp_caseless_extent_buffer(struct extent_buffer *eb, const void *ptrv,
+			  unsigned long start,
+			  unsigned long len);
+#endif
 void read_extent_buffer(struct extent_buffer *eb, void *dst,
 			unsigned long start,
 			unsigned long len);
@@ -285,36 +321,33 @@
 			   unsigned long src_offset, unsigned long len);
 void memset_extent_buffer(struct extent_buffer *eb, char c,
 			  unsigned long start, unsigned long len);
-int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
-int clear_extent_buffer_dirty(struct extent_io_tree *tree,
-			      struct extent_buffer *eb);
-int set_extent_buffer_dirty(struct extent_io_tree *tree,
-			     struct extent_buffer *eb);
-int set_extent_buffer_uptodate(struct extent_io_tree *tree,
-			       struct extent_buffer *eb);
-int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				struct extent_buffer *eb,
-				struct extent_state **cached_state);
-int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb,
-			   struct extent_state *cached_state);
+void clear_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_dirty(struct extent_buffer *eb);
+int set_extent_buffer_uptodate(struct extent_buffer *eb);
+int clear_extent_buffer_uptodate(struct extent_buffer *eb);
+int extent_buffer_uptodate(struct extent_buffer *eb);
 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 		      unsigned long min_len, char **map,
 		      unsigned long *map_start,
 		      unsigned long *map_len);
-int extent_range_uptodate(struct extent_io_tree *tree,
-			  u64 start, u64 end);
-int extent_clear_unlock_delalloc(struct inode *inode,
-				struct extent_io_tree *tree,
-				u64 start, u64 end, struct page *locked_page,
-				unsigned long op);
+int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
+int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
+int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+				 struct page *locked_page,
+				 unsigned long bits_to_clear,
+				 unsigned long page_ops);
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		gfp_t gfp_flags);
+struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs);
+struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask);
 
-struct btrfs_mapping_tree;
+struct btrfs_fs_info;
 
-int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
 			u64 length, u64 logical, struct page *page,
 			int mirror_num);
+int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
+			 int mirror_num);
 #endif
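
With first_page gone, num_extent_pages() and extent_buffer_page() become static inlines
over the embedded eb->pages[] array (at most INLINE_EXTENT_BUFFER_PAGES entries).  A
short sketch of the usual iteration pattern, mirroring the loop in
mark_extent_buffer_accessed() above; eb_pages_uptodate_demo() is a hypothetical helper,
not part of the patch:

static int eb_pages_uptodate_demo(struct extent_buffer *eb)
{
	unsigned long num_pages = num_extent_pages(eb->start, eb->len);
	unsigned long i;

	/* Walk every page backing the buffer and check the per-page uptodate flag. */
	for (i = 0; i < num_pages; i++) {
		struct page *p = extent_buffer_page(eb, i);

		if (!p || !PageUptodate(p))
			return 0;
	}
	return 1;
}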
diff -ur a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
--- a/fs/btrfs/extent_map.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/extent_map.c	2014-02-17 11:56:58.000000000 +0100
@@ -1,6 +1,5 @@
 #include <linux/err.h>
 #include <linux/slab.h>
-#include <linux/module.h>
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
 #include "ctree.h"
@@ -11,7 +10,7 @@
 
 int __init extent_map_init(void)
 {
-	extent_map_cache = kmem_cache_create("extent_map",
+	extent_map_cache = kmem_cache_create("btrfs_extent_map",
 			sizeof(struct extent_map), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!extent_map_cache)
@@ -35,6 +34,7 @@
 void extent_map_tree_init(struct extent_map_tree *tree)
 {
 	tree->map = RB_ROOT;
+	INIT_LIST_HEAD(&tree->modified_extents);
 	rwlock_init(&tree->lock);
 }
 
@@ -48,13 +48,15 @@
 struct extent_map *alloc_extent_map(void)
 {
 	struct extent_map *em;
-	em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
+	em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS);
 	if (!em)
 		return NULL;
 	em->in_tree = 0;
 	em->flags = 0;
 	em->compress_type = BTRFS_COMPRESS_NONE;
+	em->generation = 0;
 	atomic_set(&em->refs, 1);
+	INIT_LIST_HEAD(&em->list);
 	return em;
 }
 
@@ -72,6 +74,7 @@
 	WARN_ON(atomic_read(&em->refs) == 0);
 	if (atomic_dec_and_test(&em->refs)) {
 		WARN_ON(em->in_tree);
+		WARN_ON(!list_empty(&em->list));
 		kmem_cache_free(extent_map_cache, em);
 	}
 }
@@ -167,6 +170,18 @@
 	if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
 		return 0;
 
+	if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
+	    test_bit(EXTENT_FLAG_LOGGING, &next->flags))
+		return 0;
+
+	/*
+	 * We don't want to merge stuff that hasn't been written to the log yet
+	 * since it may not reflect exactly what is on disk, and that would be
+	 * bad.
+	 */
+	if (!list_empty(&prev->list) || !list_empty(&next->list))
+		return 0;
+
 	if (extent_map_end(prev) == next->start &&
 	    prev->flags == next->flags &&
 	    prev->bdev == next->bdev &&
@@ -194,10 +209,15 @@
 			merge = rb_entry(rb, struct extent_map, rb_node);
 		if (rb && mergable_maps(merge, em)) {
 			em->start = merge->start;
+			em->orig_start = merge->orig_start;
 			em->len += merge->len;
 			em->block_len += merge->block_len;
 			em->block_start = merge->block_start;
 			merge->in_tree = 0;
+			em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
+			em->mod_start = merge->mod_start;
+			em->generation = max(em->generation, merge->generation);
+
 			rb_erase(&merge->rb_node, &tree->map);
 			free_extent_map(merge);
 		}
@@ -211,14 +231,29 @@
 		em->block_len += merge->len;
 		rb_erase(&merge->rb_node, &tree->map);
 		merge->in_tree = 0;
+		em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
+		em->generation = max(em->generation, merge->generation);
 		free_extent_map(merge);
 	}
 }
 
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len)
+/**
+ * unpin_extent_cache - unpin an extent from the cache
+ * @tree:	tree to unpin the extent in
+ * @start:	logical offset in the file
+ * @len:	length of the extent
+ * @gen:	generation that this extent has been modified in
+ *
+ * Called after an extent has been written to disk properly.  Set the generation
+ * to the generation that actually added the file item to the inode so we know
+ * we need to sync this extent when we call fsync().
+ */
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
+		       u64 gen)
 {
 	int ret = 0;
 	struct extent_map *em;
+	bool prealloc = false;
 
 	write_lock(&tree->lock);
 	em = lookup_extent_mapping(tree, start, len);
@@ -228,10 +263,25 @@
 	if (!em)
 		goto out;
 
+	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+		list_move(&em->list, &tree->modified_extents);
+	em->generation = gen;
 	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+	em->mod_start = em->start;
+	em->mod_len = em->len;
+
+	if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
+		prealloc = true;
+		clear_bit(EXTENT_FLAG_FILLING, &em->flags);
+	}
 
 	try_merge_map(tree, em);
 
+	if (prealloc) {
+		em->mod_start = em->start;
+		em->mod_len = em->len;
+	}
+
 	free_extent_map(em);
 out:
 	write_unlock(&tree->lock);
@@ -239,6 +289,13 @@
 
 }
 
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+{
+	clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+	if (em->in_tree)
+		try_merge_map(tree, em);
+}
+
 /**
  * add_extent_mapping - add new extent map to the extent tree
  * @tree:	tree to insert new map in
@@ -250,7 +307,7 @@
  * reference dropped if the merge attempt was successful.
  */
 int add_extent_mapping(struct extent_map_tree *tree,
-		       struct extent_map *em)
+		       struct extent_map *em, int modified)
 {
 	int ret = 0;
 	struct rb_node *rb;
@@ -269,7 +326,13 @@
 	}
 	atomic_inc(&em->refs);
 
-	try_merge_map(tree, em);
+	em->mod_start = em->start;
+	em->mod_len = em->len;
+
+	if (modified)
+		list_move(&em->list, &tree->modified_extents);
+	else
+		try_merge_map(tree, em);
 out:
 	return ret;
 }
@@ -282,8 +345,9 @@
 	return start + len;
 }
 
-struct extent_map *__lookup_extent_mapping(struct extent_map_tree *tree,
-					   u64 start, u64 len, int strict)
+static struct extent_map *
+__lookup_extent_mapping(struct extent_map_tree *tree,
+			u64 start, u64 len, int strict)
 {
 	struct extent_map *em;
 	struct rb_node *rb_node;
@@ -358,6 +422,8 @@
 
 	WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
 	rb_erase(&em->rb_node, &tree->map);
+	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+		list_del_init(&em->list);
 	em->in_tree = 0;
 	return ret;
 }
diff -ur a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
--- a/fs/btrfs/extent_map.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/extent_map.h	2014-02-17 11:56:58.000000000 +0100
@@ -13,6 +13,8 @@
 #define EXTENT_FLAG_COMPRESSED 1
 #define EXTENT_FLAG_VACANCY 2 /* no file extent item found */
 #define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */
+#define EXTENT_FLAG_LOGGING 4 /* Logging this extent */
+#define EXTENT_FLAG_FILLING 5 /* Filling in a preallocated extent */
 
 struct extent_map {
 	struct rb_node rb_node;
@@ -20,18 +22,25 @@
 	/* all of these are in bytes */
 	u64 start;
 	u64 len;
+	u64 mod_start;
+	u64 mod_len;
 	u64 orig_start;
+	u64 orig_block_len;
+	u64 ram_bytes;
 	u64 block_start;
 	u64 block_len;
+	u64 generation;
 	unsigned long flags;
 	struct block_device *bdev;
 	atomic_t refs;
-	unsigned int in_tree:1;
-	unsigned int compress_type:4;
+	unsigned int in_tree;
+	unsigned int compress_type;
+	struct list_head list;
 };
 
 struct extent_map_tree {
 	struct rb_root map;
+	struct list_head modified_extents;
 	rwlock_t lock;
 };
 
@@ -53,14 +62,15 @@
 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len);
 int add_extent_mapping(struct extent_map_tree *tree,
-		       struct extent_map *em);
+		       struct extent_map *em, int modified);
 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
 
 struct extent_map *alloc_extent_map(void);
 void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
 void extent_map_exit(void);
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len);
+int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len);
 #endif
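
The extent_map changes above add em->generation and a per-tree modified_extents list so
fsync can find extents that still need logging: add_extent_mapping() grows a "modified"
flag and unpin_extent_cache() records the generation once the extent is safely on disk.
A hedged sketch of inserting a mapping with the new signature; insert_demo_mapping() and
its field values are illustrative only:

static int insert_demo_mapping(struct extent_map_tree *tree, u64 start, u64 len,
			       u64 block_start, int modified)
{
	struct extent_map *em;
	int ret;

	em = alloc_extent_map();
	if (!em)
		return -ENOMEM;

	em->start = start;
	em->len = len;
	em->block_start = block_start;
	em->block_len = len;

	/* The map tree is protected by an rwlock; insertion needs the write side. */
	write_lock(&tree->lock);
	/* modified=1 queues the map on tree->modified_extents instead of merging it. */
	ret = add_extent_mapping(tree, em, modified);
	write_unlock(&tree->lock);

	/* add_extent_mapping() takes its own reference on success; drop the allocation ref. */
	free_extent_map(em);
	return ret;
}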
diff -ur a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
--- a/fs/btrfs/extent-tree.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/extent-tree.c	2014-02-17 11:56:58.000000000 +0100
@@ -24,6 +24,7 @@
 #include <linux/kthread.h>
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
+#include <linux/percpu_counter.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -31,26 +32,31 @@
 #include "print-tree.h"
 #include "transaction.h"
 #include "volumes.h"
+#include "raid56.h"
 #include "locking.h"
 #include "free-space-cache.h"
+#include "math.h"
 
-/* control flags for do_chunk_alloc's force field
+#undef SCRAMBLE_DELAYED_REFS
+
+/*
+ * control flags for do_chunk_alloc's force field
  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
  * if we really need one.
  *
- * CHUNK_ALLOC_FORCE means it must try to allocate one
- *
  * CHUNK_ALLOC_LIMITED means to only try and allocate one
  * if we have very few chunks already allocated.  This is
  * used as part of the clustering code to help make sure
  * we have a good pool of storage to cluster in, without
  * filling the FS with empty chunks
  *
+ * CHUNK_ALLOC_FORCE means it must try to allocate one
+ *
  */
 enum {
 	CHUNK_ALLOC_NO_FORCE = 0,
-	CHUNK_ALLOC_FORCE = 1,
-	CHUNK_ALLOC_LIMITED = 2,
+	CHUNK_ALLOC_LIMITED = 1,
+	CHUNK_ALLOC_FORCE = 2,
 };
 
 /*
@@ -68,8 +74,7 @@
 	RESERVE_ALLOC_NO_ACCOUNT = 2,
 };
 
-static int update_block_group(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
@@ -91,20 +96,25 @@
 				     u64 flags, struct btrfs_disk_key *key,
 				     int level, struct btrfs_key *ins);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *extent_root, u64 alloc_bytes,
-			  u64 flags, int force);
+			  struct btrfs_root *extent_root, u64 flags,
+			  int force);
 static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 			    int dump_block_groups);
 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
 				       u64 num_bytes, int reserve);
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+			       u64 num_bytes);
+int btrfs_pin_extent(struct btrfs_root *root,
+		     u64 bytenr, u64 num_bytes, int reserved);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
 {
 	smp_mb();
-	return cache->cached == BTRFS_CACHE_FINISHED;
+	return cache->cached == BTRFS_CACHE_FINISHED ||
+		cache->cached == BTRFS_CACHE_ERROR;
 }
 
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
@@ -158,6 +168,10 @@
 	rb_link_node(&block_group->cache_node, parent, p);
 	rb_insert_color(&block_group->cache_node,
 			&info->block_group_cache_tree);
+
+	if (info->first_logical_byte > block_group->key.objectid)
+		info->first_logical_byte = block_group->key.objectid;
+
 	spin_unlock(&info->block_group_cache_lock);
 
 	return 0;
@@ -199,8 +213,11 @@
 			break;
 		}
 	}
-	if (ret)
+	if (ret) {
 		btrfs_get_block_group(ret);
+		if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
+			info->first_logical_byte = ret->key.objectid;
+	}
 	spin_unlock(&info->block_group_cache_lock);
 
 	return ret;
@@ -244,7 +261,8 @@
 		cache->bytes_super += stripe_len;
 		ret = add_excluded_extent(root, cache->key.objectid,
 					  stripe_len);
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 	}
 
 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
@@ -252,13 +270,35 @@
 		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
 				       cache->key.objectid, bytenr,
 				       0, &logical, &nr, &stripe_len);
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 
 		while (nr--) {
-			cache->bytes_super += stripe_len;
-			ret = add_excluded_extent(root, logical[nr],
-						  stripe_len);
-			BUG_ON(ret);
+			u64 start, len;
+
+			if (logical[nr] > cache->key.objectid +
+			    cache->key.offset)
+				continue;
+
+			if (logical[nr] + stripe_len <= cache->key.objectid)
+				continue;
+
+			start = logical[nr];
+			if (start < cache->key.objectid) {
+				start = cache->key.objectid;
+				len = (logical[nr] + stripe_len) - start;
+			} else {
+				len = min_t(u64, stripe_len,
+					    cache->key.objectid +
+					    cache->key.offset - start);
+			}
+
+			cache->bytes_super += len;
+			ret = add_excluded_extent(root, start, len);
+			if (ret) {
+				kfree(logical);
+				return ret;
+			}
 		}
 
 		kfree(logical);
@@ -309,7 +349,8 @@
 	while (start < end) {
 		ret = find_first_extent_bit(info->pinned_extents, start,
 					    &extent_start, &extent_end,
-					    EXTENT_DIRTY | EXTENT_UPTODATE);
+					    EXTENT_DIRTY | EXTENT_UPTODATE,
+					    NULL);
 		if (ret)
 			break;
 
@@ -320,7 +361,7 @@
 			total_added += size;
 			ret = btrfs_add_free_space(block_group, start,
 						   size);
-			BUG_ON(ret);
+			BUG_ON(ret); /* -ENOMEM or logic error */
 			start = extent_end + 1;
 		} else {
 			break;
@@ -331,7 +372,7 @@
 		size = end - start;
 		total_added += size;
 		ret = btrfs_add_free_space(block_group, start, size);
-		BUG_ON(ret);
+		BUG_ON(ret); /* -ENOMEM or logic error */
 	}
 
 	return total_added;
@@ -349,7 +390,7 @@
 	u64 total_found = 0;
 	u64 last = 0;
 	u32 nritems;
-	int ret = 0;
+	int ret = -ENOMEM;
 
 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
 	block_group = caching_ctl->block_group;
@@ -380,6 +421,7 @@
 	/* need to make sure the commit_root doesn't disappear */
 	down_read(&fs_info->extent_commit_sem);
 
+next:
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
@@ -400,8 +442,7 @@
 			if (ret)
 				break;
 
-			if (need_resched() ||
-			    btrfs_next_leaf(extent_root, path)) {
+			if (need_resched()) {
 				caching_ctl->progress = last;
 				btrfs_release_path(path);
 				up_read(&fs_info->extent_commit_sem);
@@ -409,11 +450,27 @@
 				cond_resched();
 				goto again;
 			}
+
+			ret = btrfs_next_leaf(extent_root, path);
+			if (ret < 0)
+				goto err;
+			if (ret)
+				break;
 			leaf = path->nodes[0];
 			nritems = btrfs_header_nritems(leaf);
 			continue;
 		}
 
+		if (key.objectid < last) {
+			key.objectid = last;
+			key.offset = 0;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+
+			caching_ctl->progress = last;
+			btrfs_release_path(path);
+			goto next;
+		}
+
 		if (key.objectid < block_group->key.objectid) {
 			path->slots[0]++;
 			continue;
@@ -423,11 +480,16 @@
 		    block_group->key.offset)
 			break;
 
-		if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+		    key.type == BTRFS_METADATA_ITEM_KEY) {
 			total_found += add_new_free_space(block_group,
 							  fs_info, last,
 							  key.objectid);
-			last = key.objectid + key.offset;
+			if (key.type == BTRFS_METADATA_ITEM_KEY)
+				last = key.objectid +
+					fs_info->tree_root->leafsize;
+			else
+				last = key.objectid + key.offset;
 
 			if (total_found > (1024 * 1024 * 2)) {
 				total_found = 0;
@@ -456,6 +518,12 @@
 
 	mutex_unlock(&caching_ctl->mutex);
 out:
+	if (ret) {
+		spin_lock(&block_group->lock);
+		block_group->caching_ctl = NULL;
+		block_group->cached = BTRFS_CACHE_ERROR;
+		spin_unlock(&block_group->lock);
+	}
 	wake_up(&caching_ctl->wait);
 
 	put_caching_control(caching_ctl);
@@ -463,8 +531,6 @@
 }
 
 static int cache_block_group(struct btrfs_block_group_cache *cache,
-			     struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root,
 			     int load_cache_only)
 {
 	DEFINE_WAIT(wait);
@@ -473,7 +539,8 @@
 	int ret = 0;
 
 	caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
-	BUG_ON(!caching_ctl);
+	if (!caching_ctl)
+		return -ENOMEM;
 
 	INIT_LIST_HEAD(&caching_ctl->list);
 	mutex_init(&caching_ctl->mutex);
@@ -521,15 +588,7 @@
 	cache->cached = BTRFS_CACHE_FAST;
 	spin_unlock(&cache->lock);
 
-	/*
-	 * We can't do the read from on-disk cache during a commit since we need
-	 * to have the normal tree locking.  Also if we are currently trying to
-	 * allocate blocks for the tree root we can't do the fast caching since
-	 * we likely hold important locks.
-	 */
-	if (trans && (!trans->transaction->in_commit) &&
-	    (root && root != root->fs_info->tree_root) &&
-	    btrfs_test_opt(root, SPACE_CACHE)) {
+	if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
 		ret = load_free_space_cache(fs_info, cache);
 
 		spin_lock(&cache->lock);
@@ -618,8 +677,7 @@
 	struct list_head *head = &info->space_info;
 	struct btrfs_space_info *found;
 
-	flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
-		 BTRFS_BLOCK_GROUP_METADATA;
+	flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
@@ -647,73 +705,6 @@
 	rcu_read_unlock();
 }
 
-static u64 div_factor(u64 num, int factor)
-{
-	if (factor == 10)
-		return num;
-	num *= factor;
-	do_div(num, 10);
-	return num;
-}
-
-static u64 div_factor_fine(u64 num, int factor)
-{
-	if (factor == 100)
-		return num;
-	num *= factor;
-	do_div(num, 100);
-	return num;
-}
-
-u64 btrfs_find_block_group(struct btrfs_root *root,
-			   u64 search_start, u64 search_hint, int owner)
-{
-	struct btrfs_block_group_cache *cache;
-	u64 used;
-	u64 last = max(search_hint, search_start);
-	u64 group_start = 0;
-	int full_search = 0;
-	int factor = 9;
-	int wrapped = 0;
-again:
-	while (1) {
-		cache = btrfs_lookup_first_block_group(root->fs_info, last);
-		if (!cache)
-			break;
-
-		spin_lock(&cache->lock);
-		last = cache->key.objectid + cache->key.offset;
-		used = btrfs_block_group_used(&cache->item);
-
-		if ((full_search || !cache->ro) &&
-		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
-			if (used + cache->pinned + cache->reserved <
-			    div_factor(cache->key.offset, factor)) {
-				group_start = cache->key.objectid;
-				spin_unlock(&cache->lock);
-				btrfs_put_block_group(cache);
-				goto found;
-			}
-		}
-		spin_unlock(&cache->lock);
-		btrfs_put_block_group(cache);
-		cond_resched();
-	}
-	if (!wrapped) {
-		last = search_start;
-		wrapped = 1;
-		goto again;
-	}
-	if (!full_search && factor < 10) {
-		last = search_start;
-		full_search = 1;
-		factor = 10;
-		goto again;
-	}
-found:
-	return group_start;
-}
-
 /* simple helper to search for an existing extent at a given offset */
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 {
@@ -727,15 +718,21 @@
 
 	key.objectid = start;
 	key.offset = len;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+	key.type = BTRFS_EXTENT_ITEM_KEY;
 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
 				0, 0);
+	if (ret > 0) {
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (key.objectid == start &&
+		    key.type == BTRFS_METADATA_ITEM_KEY)
+			ret = 0;
+	}
 	btrfs_free_path(path);
 	return ret;
 }
 
 /*
- * helper function to lookup reference count and flags of extent.
+ * helper function to lookup reference count and flags of a tree block.
  *
  * the head node for delayed ref is used to store the sum of all the
  * reference count modifications queued up in the rbtree. the head
@@ -745,7 +742,7 @@
  */
 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags)
+			     u64 offset, int metadata, u64 *refs, u64 *flags)
 {
 	struct btrfs_delayed_ref_head *head;
 	struct btrfs_delayed_ref_root *delayed_refs;
@@ -758,13 +755,29 @@
 	u64 extent_flags;
 	int ret;
 
+	/*
+	 * If we don't have skinny metadata, don't bother doing anything
+	 * different
+	 */
+	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
+		offset = root->leafsize;
+		metadata = 0;
+	}
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = num_bytes;
+	if (metadata) {
+		key.objectid = bytenr;
+		key.type = BTRFS_METADATA_ITEM_KEY;
+		key.offset = offset;
+	} else {
+		key.objectid = bytenr;
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = offset;
+	}
+
 	if (!trans) {
 		path->skip_locking = 1;
 		path->search_commit_root = 1;
@@ -775,6 +788,26 @@
 	if (ret < 0)
 		goto out_free;
 
+	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
+		metadata = 0;
+		if (path->slots[0]) {
+			path->slots[0]--;
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0]);
+			if (key.objectid == bytenr &&
+			    key.type == BTRFS_EXTENT_ITEM_KEY &&
+			    key.offset == root->leafsize)
+				ret = 0;
+		}
+		if (ret) {
+			key.objectid = bytenr;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+			key.offset = root->leafsize;
+			btrfs_release_path(path);
+			goto again;
+		}
+	}
+
 	if (ret == 0) {
 		leaf = path->nodes[0];
 		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -982,7 +1015,7 @@
 				ret = btrfs_next_leaf(root, path);
 				if (ret < 0)
 					return ret;
-				BUG_ON(ret > 0);
+				BUG_ON(ret > 0); /* Corruption */
 				leaf = path->nodes[0];
 			}
 			btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1008,9 +1041,9 @@
 				new_size + extra_size, 1);
 	if (ret < 0)
 		return ret;
-	BUG_ON(ret);
+	BUG_ON(ret); /* Corruption */
 
-	ret = btrfs_extend_item(trans, root, path, new_size);
+	btrfs_extend_item(root, path, new_size);
 
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1462,6 +1495,8 @@
 	int want;
 	int ret;
 	int err = 0;
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
 
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -1473,12 +1508,54 @@
 		path->keep_locks = 1;
 	} else
 		extra_size = -1;
+
+	/*
+	 * Owner is our parent level, so we can just add one to get the level
+	 * for the block we are interested in.
+	 */
+	if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
+		key.type = BTRFS_METADATA_ITEM_KEY;
+		key.offset = owner;
+	}
+
+again:
 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
 	if (ret < 0) {
 		err = ret;
 		goto out;
 	}
-	BUG_ON(ret);
+
+	/*
+	 * We may be a newly converted file system which still has the old fat
+	 * extent entries for metadata, so try and see if we have one of those.
+	 */
+	if (ret > 0 && skinny_metadata) {
+		skinny_metadata = false;
+		if (path->slots[0]) {
+			path->slots[0]--;
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0]);
+			if (key.objectid == bytenr &&
+			    key.type == BTRFS_EXTENT_ITEM_KEY &&
+			    key.offset == num_bytes)
+				ret = 0;
+		}
+		if (ret) {
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+			key.offset = num_bytes;
+			btrfs_release_path(path);
+			goto again;
+		}
+	}
+
+	if (ret && !insert) {
+		err = -ENOENT;
+		goto out;
+	} else if (ret) {
+		err = -EIO;
+		WARN_ON(1);
+		goto out;
+	}
 
 	leaf = path->nodes[0];
 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
@@ -1506,11 +1583,9 @@
 	ptr = (unsigned long)(ei + 1);
 	end = (unsigned long)ei + item_size;
 
-	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
 		ptr += sizeof(struct btrfs_tree_block_info);
 		BUG_ON(ptr > end);
-	} else {
-		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
 	}
 
 	err = -ENOENT;
@@ -1592,13 +1667,12 @@
  * helper to add new inline back ref
  */
 static noinline_for_stack
-int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct btrfs_path *path,
-				struct btrfs_extent_inline_ref *iref,
-				u64 parent, u64 root_objectid,
-				u64 owner, u64 offset, int refs_to_add,
-				struct btrfs_delayed_extent_op *extent_op)
+void setup_inline_extent_backref(struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref *iref,
+				 u64 parent, u64 root_objectid,
+				 u64 owner, u64 offset, int refs_to_add,
+				 struct btrfs_delayed_extent_op *extent_op)
 {
 	struct extent_buffer *leaf;
 	struct btrfs_extent_item *ei;
@@ -1608,7 +1682,6 @@
 	u64 refs;
 	int size;
 	int type;
-	int ret;
 
 	leaf = path->nodes[0];
 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -1617,7 +1690,7 @@
 	type = extent_ref_type(parent, owner);
 	size = btrfs_extent_inline_ref_size(type);
 
-	ret = btrfs_extend_item(trans, root, path, size);
+	btrfs_extend_item(root, path, size);
 
 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	refs = btrfs_extent_refs(leaf, ei);
@@ -1652,7 +1725,6 @@
 		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
 	}
 	btrfs_mark_buffer_dirty(leaf);
-	return 0;
 }
 
 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
@@ -1687,12 +1759,11 @@
  * helper to update/remove inline back ref
  */
 static noinline_for_stack
-int update_inline_extent_backref(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
-				 struct btrfs_path *path,
-				 struct btrfs_extent_inline_ref *iref,
-				 int refs_to_mod,
-				 struct btrfs_delayed_extent_op *extent_op)
+void update_inline_extent_backref(struct btrfs_root *root,
+				  struct btrfs_path *path,
+				  struct btrfs_extent_inline_ref *iref,
+				  int refs_to_mod,
+				  struct btrfs_delayed_extent_op *extent_op)
 {
 	struct extent_buffer *leaf;
 	struct btrfs_extent_item *ei;
@@ -1703,7 +1774,6 @@
 	u32 item_size;
 	int size;
 	int type;
-	int ret;
 	u64 refs;
 
 	leaf = path->nodes[0];
@@ -1745,10 +1815,9 @@
 			memmove_extent_buffer(leaf, ptr, ptr + size,
 					      end - ptr - size);
 		item_size -= size;
-		ret = btrfs_truncate_item(trans, root, path, item_size, 1);
+		btrfs_truncate_item(root, path, item_size, 1);
 	}
 	btrfs_mark_buffer_dirty(leaf);
-	return 0;
 }
 
 static noinline_for_stack
@@ -1768,13 +1837,13 @@
 					   root_objectid, owner, offset, 1);
 	if (ret == 0) {
 		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
-		ret = update_inline_extent_backref(trans, root, path, iref,
-						   refs_to_add, extent_op);
+		update_inline_extent_backref(root, path, iref,
+					     refs_to_add, extent_op);
 	} else if (ret == -ENOENT) {
-		ret = setup_inline_extent_backref(trans, root, path, iref,
-						  parent, root_objectid,
-						  owner, offset, refs_to_add,
-						  extent_op);
+		setup_inline_extent_backref(root, path, iref, parent,
+					    root_objectid, owner, offset,
+					    refs_to_add, extent_op);
+		ret = 0;
 	}
 	return ret;
 }
@@ -1804,12 +1873,12 @@
 				 struct btrfs_extent_inline_ref *iref,
 				 int refs_to_drop, int is_data)
 {
-	int ret;
+	int ret = 0;
 
 	BUG_ON(!is_data && refs_to_drop != 1);
 	if (iref) {
-		ret = update_inline_extent_backref(trans, root, path, iref,
-						   -refs_to_drop, NULL);
+		update_inline_extent_backref(root, path, iref,
+					     -refs_to_drop, NULL);
 	} else if (is_data) {
 		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
 	} else {
@@ -1833,8 +1902,9 @@
 
 
 	/* Tell the block device(s) that the sectors can be discarded */
-	ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
+	ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
 			      bytenr, &num_bytes, &bbio, 0);
+	/* Error condition is -ENOMEM */
 	if (!ret) {
 		struct btrfs_bio_stripe *stripe = bbio->stripes;
 		int i;
@@ -1850,7 +1920,7 @@
 			if (!ret)
 				discarded_bytes += stripe->length;
 			else if (ret != -EOPNOTSUPP)
-				break;
+				break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
 
 			/*
 			 * Just in case we get back EOPNOTSUPP for some reason,
@@ -1866,26 +1936,33 @@
 		*actual_bytes = discarded_bytes;
 
 
+	if (ret == -EOPNOTSUPP)
+		ret = 0;
 	return ret;
 }
 
+/* Can return -ENOMEM */
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 owner, u64 offset)
+			 u64 root_objectid, u64 owner, u64 offset, int for_cow)
 {
 	int ret;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
 	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
 	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
 
 	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+					num_bytes,
 					parent, root_objectid, (int)owner,
-					BTRFS_ADD_DELAYED_REF, NULL);
+					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
 	} else {
-		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+					num_bytes,
 					parent, root_objectid, owner, offset,
-					BTRFS_ADD_DELAYED_REF, NULL);
+					BTRFS_ADD_DELAYED_REF, NULL, for_cow);
 	}
 	return ret;
 }
@@ -1940,7 +2017,8 @@
 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
 				    path, bytenr, parent, root_objectid,
 				    owner, offset, refs_to_add);
-	BUG_ON(ret);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
 out:
 	btrfs_free_path(path);
 	return err;
@@ -1964,16 +2042,16 @@
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
 
 	ref = btrfs_delayed_node_to_data_ref(node);
+	trace_run_delayed_data_ref(node, ref, node->action);
+
 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
 		parent = ref->parent;
 	else
 		ref_root = ref->root;
 
 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
-		if (extent_op) {
-			BUG_ON(extent_op->update_key);
+		if (extent_op)
 			flags |= extent_op->flags_to_set;
-		}
 		ret = alloc_reserved_file_extent(trans, root,
 						 parent, ref_root, flags,
 						 ref->objectid, ref->offset,
@@ -2026,15 +2104,29 @@
 	u32 item_size;
 	int ret;
 	int err = 0;
+	int metadata = !extent_op->is_data;
+
+	if (trans->aborted)
+		return 0;
+
+	if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+		metadata = 0;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
 	key.objectid = node->bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = node->num_bytes;
 
+	if (metadata) {
+		key.type = BTRFS_METADATA_ITEM_KEY;
+		key.offset = extent_op->level;
+	} else {
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = node->num_bytes;
+	}
+
+again:
 	path->reada = 1;
 	path->leave_spinning = 1;
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
@@ -2044,6 +2136,14 @@
 		goto out;
 	}
 	if (ret > 0) {
+		if (metadata) {
+			btrfs_release_path(path);
+			metadata = 0;
+
+			key.offset = node->num_bytes;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+			goto again;
+		}
 		err = -EIO;
 		goto out;
 	}
@@ -2083,21 +2183,29 @@
 	struct btrfs_key ins;
 	u64 parent = 0;
 	u64 ref_root = 0;
-
-	ins.objectid = node->bytenr;
-	ins.offset = node->num_bytes;
-	ins.type = BTRFS_EXTENT_ITEM_KEY;
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
 
 	ref = btrfs_delayed_node_to_tree_ref(node);
+	trace_run_delayed_tree_ref(node, ref, node->action);
+
 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
 		parent = ref->parent;
 	else
 		ref_root = ref->root;
 
+	ins.objectid = node->bytenr;
+	if (skinny_metadata) {
+		ins.offset = ref->level;
+		ins.type = BTRFS_METADATA_ITEM_KEY;
+	} else {
+		ins.offset = node->num_bytes;
+		ins.type = BTRFS_EXTENT_ITEM_KEY;
+	}
+
 	BUG_ON(node->ref_mod != 1);
 	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
-		BUG_ON(!extent_op || !extent_op->update_flags ||
-		       !extent_op->update_key);
+		BUG_ON(!extent_op || !extent_op->update_flags);
 		ret = alloc_reserved_tree_block(trans, root,
 						parent, ref_root,
 						extent_op->flags_to_set,
@@ -2124,7 +2232,11 @@
 			       struct btrfs_delayed_extent_op *extent_op,
 			       int insert_reserved)
 {
-	int ret;
+	int ret = 0;
+
+	if (trans->aborted)
+		return 0;
+
 	if (btrfs_delayed_ref_is_head(node)) {
 		struct btrfs_delayed_ref_head *head;
 		/*
@@ -2135,6 +2247,8 @@
 		 */
 		BUG_ON(extent_op);
 		head = btrfs_delayed_node_to_head(node);
+		trace_run_delayed_ref_head(node, head, node->action);
+
 		if (insert_reserved) {
 			btrfs_pin_extent(root, node->bytenr,
 					 node->num_bytes, 1);
@@ -2142,11 +2256,9 @@
 				ret = btrfs_del_csums(trans, root,
 						      node->bytenr,
 						      node->num_bytes);
-				BUG_ON(ret);
 			}
 		}
-		mutex_unlock(&head->mutex);
-		return 0;
+		return ret;
 	}
 
 	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
@@ -2193,6 +2305,10 @@
 	return NULL;
 }
 
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root,
 				       struct list_head *cluster)
@@ -2201,6 +2317,7 @@
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *locked_ref = NULL;
 	struct btrfs_delayed_extent_op *extent_op;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret;
 	int count = 0;
 	int must_insert_reserved = 0;
@@ -2233,6 +2350,38 @@
 		}
 
 		/*
+		 * We need to try and merge add/drops of the same ref since we
+		 * can run into issues with relocate dropping the implicit ref
+		 * and then it being added back again before the drop can
+		 * finish.  If we merged anything we need to re-loop so we can
+		 * get a good ref.
+		 */
+		btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
+					 locked_ref);
+
+		/*
+		 * locked_ref is the head node, so we have to go one
+		 * node back for any delayed ref updates
+		 */
+		ref = select_delayed_ref(locked_ref);
+
+		if (ref && ref->seq &&
+		    btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
+			/*
+			 * there are still refs with lower seq numbers in the
+			 * process of being added. Don't run this ref yet.
+			 */
+			list_del_init(&locked_ref->cluster);
+			btrfs_delayed_ref_unlock(locked_ref);
+			locked_ref = NULL;
+			delayed_refs->num_heads_ready++;
+			spin_unlock(&delayed_refs->lock);
+			cond_resched();
+			spin_lock(&delayed_refs->lock);
+			continue;
+		}
+
+		/*
 		 * record the must insert reserved flag before we
 		 * drop the spin lock.
 		 */
@@ -2242,11 +2391,6 @@
 		extent_op = locked_ref->extent_op;
 		locked_ref->extent_op = NULL;
 
-		/*
-		 * locked_ref is the head node, so we have to go one
-		 * node back for any delayed ref updates
-		 */
-		ref = select_delayed_ref(locked_ref);
 		if (!ref) {
 			/* All delayed refs have been processed, Go ahead
 			 * and send the head node to run_one_delayed_ref,
@@ -2255,7 +2399,7 @@
 			ref = &locked_ref->node;
 
 			if (extent_op && must_insert_reserved) {
-				kfree(extent_op);
+				btrfs_free_delayed_extent_op(extent_op);
 				extent_op = NULL;
 			}
 
@@ -2264,44 +2408,217 @@
 
 				ret = run_delayed_extent_op(trans, root,
 							    ref, extent_op);
-				BUG_ON(ret);
-				kfree(extent_op);
+				btrfs_free_delayed_extent_op(extent_op);
 
-				cond_resched();
-				spin_lock(&delayed_refs->lock);
-				continue;
-			}
+				if (ret) {
+					btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
+					spin_lock(&delayed_refs->lock);
+					btrfs_delayed_ref_unlock(locked_ref);
+					return ret;
+				}
 
-			list_del_init(&locked_ref->cluster);
-			locked_ref = NULL;
+				goto next;
+			}
 		}
 
 		ref->in_tree = 0;
 		rb_erase(&ref->rb_node, &delayed_refs->root);
 		delayed_refs->num_entries--;
-
+		if (!btrfs_delayed_ref_is_head(ref)) {
+			/*
+			 * when we play the delayed ref, also correct the
+			 * ref_mod on head
+			 */
+			switch (ref->action) {
+			case BTRFS_ADD_DELAYED_REF:
+			case BTRFS_ADD_DELAYED_EXTENT:
+				locked_ref->node.ref_mod -= ref->ref_mod;
+				break;
+			case BTRFS_DROP_DELAYED_REF:
+				locked_ref->node.ref_mod += ref->ref_mod;
+				break;
+			default:
+				WARN_ON(1);
+			}
+		} else {
+			list_del_init(&locked_ref->cluster);
+		}
 		spin_unlock(&delayed_refs->lock);
 
 		ret = run_one_delayed_ref(trans, root, ref, extent_op,
 					  must_insert_reserved);
-		BUG_ON(ret);
 
+		btrfs_free_delayed_extent_op(extent_op);
+		if (ret) {
+			btrfs_delayed_ref_unlock(locked_ref);
+			btrfs_put_delayed_ref(ref);
+			btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
+			spin_lock(&delayed_refs->lock);
+			return ret;
+		}
+
+		/*
+		 * If this node is a head, that means all the refs in this head
+		 * have been dealt with, and we will pick the next head to deal
+		 * with, so we must unlock the head and drop it from the cluster
+		 * list before we release it.
+		 */
+		if (btrfs_delayed_ref_is_head(ref)) {
+			btrfs_delayed_ref_unlock(locked_ref);
+			locked_ref = NULL;
+		}
 		btrfs_put_delayed_ref(ref);
-		kfree(extent_op);
 		count++;
-
+next:
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
 	}
 	return count;
 }
 
+#ifdef SCRAMBLE_DELAYED_REFS
+/*
+ * Normally delayed refs get processed in ascending bytenr order. This
+ * correlates in most cases to the order added. To expose dependencies on this
+ * order, we start to process the tree in the middle instead of the beginning
+ */
+static u64 find_middle(struct rb_root *root)
+{
+	struct rb_node *n = root->rb_node;
+	struct btrfs_delayed_ref_node *entry;
+	int alt = 1;
+	u64 middle;
+	u64 first = 0, last = 0;
+
+	n = rb_first(root);
+	if (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		first = entry->bytenr;
+	}
+	n = rb_last(root);
+	if (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		last = entry->bytenr;
+	}
+	n = root->rb_node;
+
+	while (n) {
+		entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+		WARN_ON(!entry->in_tree);
+
+		middle = entry->bytenr;
+
+		if (alt)
+			n = n->rb_left;
+		else
+			n = n->rb_right;
+
+		alt = 1 - alt;
+	}
+	return middle;
+}
+#endif
+
+int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
+					 struct btrfs_fs_info *fs_info)
+{
+	struct qgroup_update *qgroup_update;
+	int ret = 0;
+
+	if (list_empty(&trans->qgroup_ref_list) !=
+	    !trans->delayed_ref_elem.seq) {
+		/* list without seq or seq without list */
+		btrfs_err(fs_info,
+			"qgroup accounting update error, list is%s empty, seq is %#x.%x",
+			list_empty(&trans->qgroup_ref_list) ? "" : " not",
+			(u32)(trans->delayed_ref_elem.seq >> 32),
+			(u32)trans->delayed_ref_elem.seq);
+		BUG();
+	}
+
+	if (!trans->delayed_ref_elem.seq)
+		return 0;
+
+	while (!list_empty(&trans->qgroup_ref_list)) {
+		qgroup_update = list_first_entry(&trans->qgroup_ref_list,
+						 struct qgroup_update, list);
+		list_del(&qgroup_update->list);
+		if (!ret)
+			ret = btrfs_qgroup_account_ref(
+					trans, fs_info, qgroup_update->node,
+					qgroup_update->extent_op);
+		kfree(qgroup_update);
+	}
+
+	btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
+
+	return ret;
+}
+
+static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
+		      int count)
+{
+	int val = atomic_read(&delayed_refs->ref_seq);
+
+	if (val < seq || val >= seq + count)
+		return 1;
+	return 0;
+}
+
+static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads)
+{
+	u64 num_bytes;
+
+	num_bytes = heads * (sizeof(struct btrfs_extent_item) +
+			     sizeof(struct btrfs_extent_inline_ref));
+	if (!btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
+		num_bytes += heads * sizeof(struct btrfs_tree_block_info);
+
+	/*
+	 * We don't ever fill up leaves all the way so multiply by 2 just to be
+	 * closer to what we're really going to want to use.
+	 */
+	return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root));
+}
+
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root)
+{
+	struct btrfs_block_rsv *global_rsv;
+	u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
+	u64 num_bytes;
+	int ret = 0;
+
+	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+	num_heads = heads_to_leaves(root, num_heads);
+	if (num_heads > 1)
+		num_bytes += (num_heads - 1) * root->leafsize;
+	num_bytes <<= 1;
+	global_rsv = &root->fs_info->global_block_rsv;
+
+	/*
+	 * If we can't allocate any more chunks, let's make sure we have _lots_ of
+	 * wiggle room since running delayed refs can create more delayed refs.
+	 */
+	if (global_rsv->space_info->full)
+		num_bytes <<= 1;
+
+	spin_lock(&global_rsv->lock);
+	if (global_rsv->reserved <= num_bytes)
+		ret = 1;
+	spin_unlock(&global_rsv->lock);
+	return ret;
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
  * 0, which means to process everything in the tree at the start
  * of the run (but not newly added entries), or it can be some target
  * number you'd like to process.
+ *
+ * Returns 0 on success or if called with an aborted transaction
+ * Returns <0 on error and aborts the transaction
  */
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count)
@@ -2311,23 +2628,72 @@
 	struct btrfs_delayed_ref_node *ref;
 	struct list_head cluster;
 	int ret;
+	u64 delayed_start;
 	int run_all = count == (unsigned long)-1;
 	int run_most = 0;
+	int loops;
+
+	/* We'll clean this up in btrfs_cleanup_transaction */
+	if (trans->aborted)
+		return 0;
 
 	if (root == root->fs_info->extent_root)
 		root = root->fs_info->tree_root;
 
+	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+
 	delayed_refs = &trans->transaction->delayed_refs;
 	INIT_LIST_HEAD(&cluster);
-again:
-	spin_lock(&delayed_refs->lock);
 	if (count == 0) {
 		count = delayed_refs->num_entries * 2;
 		run_most = 1;
 	}
+
+	if (!run_all && !run_most) {
+		int old;
+		int seq = atomic_read(&delayed_refs->ref_seq);
+
+progress:
+		old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+		if (old) {
+			DEFINE_WAIT(__wait);
+			if (delayed_refs->flushing ||
+			    !btrfs_should_throttle_delayed_refs(trans, root))
+				return 0;
+
+			prepare_to_wait(&delayed_refs->wait, &__wait,
+					TASK_UNINTERRUPTIBLE);
+
+			old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+			if (old) {
+				schedule();
+				finish_wait(&delayed_refs->wait, &__wait);
+
+				if (!refs_newer(delayed_refs, seq, 256))
+					goto progress;
+				else
+					return 0;
+			} else {
+				finish_wait(&delayed_refs->wait, &__wait);
+				goto again;
+			}
+		}
+
+	} else {
+		atomic_inc(&delayed_refs->procs_running_refs);
+	}
+
+again:
+	loops = 0;
+	spin_lock(&delayed_refs->lock);
+
+#ifdef SCRAMBLE_DELAYED_REFS
+	delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
+#endif
+
 	while (1) {
 		if (!(run_all || run_most) &&
-		    delayed_refs->num_heads_ready < 64)
+		    !btrfs_should_throttle_delayed_refs(trans, root))
 			break;
 
 		/*
@@ -2336,21 +2702,59 @@
 		 * of refs to process starting at the first one we are able to
 		 * lock
 		 */
+		delayed_start = delayed_refs->run_delayed_start;
 		ret = btrfs_find_ref_cluster(trans, &cluster,
 					     delayed_refs->run_delayed_start);
 		if (ret)
 			break;
 
 		ret = run_clustered_refs(trans, root, &cluster);
-		BUG_ON(ret < 0);
+		if (ret < 0) {
+			btrfs_release_ref_cluster(&cluster);
+			spin_unlock(&delayed_refs->lock);
+			btrfs_abort_transaction(trans, root, ret);
+			atomic_dec(&delayed_refs->procs_running_refs);
+			wake_up(&delayed_refs->wait);
+			return ret;
+		}
+
+		atomic_add(ret, &delayed_refs->ref_seq);
 
 		count -= min_t(unsigned long, ret, count);
 
 		if (count == 0)
 			break;
+
+		if (delayed_start >= delayed_refs->run_delayed_start) {
+			if (loops == 0) {
+				/*
+				 * btrfs_find_ref_cluster looped. let's do one
+				 * more cycle. if we don't run any delayed ref
+				 * during that cycle (because we can't because
+				 * all of them are blocked), bail out.
+				 */
+				loops = 1;
+			} else {
+				/*
+				 * no runnable refs left, stop trying
+				 */
+				BUG_ON(run_all);
+				break;
+			}
+		}
+		if (ret) {
+			/* refs were run, let's reset staleness detection */
+			loops = 0;
+		}
 	}
 
 	if (run_all) {
+		if (!list_empty(&trans->new_bgs)) {
+			spin_unlock(&delayed_refs->lock);
+			btrfs_create_pending_block_groups(trans, root);
+			spin_lock(&delayed_refs->lock);
+		}
+
 		node = rb_first(&delayed_refs->root);
 		if (!node)
 			goto out;
@@ -2384,19 +2788,25 @@
 		goto again;
 	}
 out:
+	atomic_dec(&delayed_refs->procs_running_refs);
+	smp_mb();
+	if (waitqueue_active(&delayed_refs->wait))
+		wake_up(&delayed_refs->wait);
+
 	spin_unlock(&delayed_refs->lock);
+	assert_qgroups_uptodate(trans);
 	return 0;
 }
 
 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 flags,
-				int is_data)
+				int level, int is_data)
 {
 	struct btrfs_delayed_extent_op *extent_op;
 	int ret;
 
-	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+	extent_op = btrfs_alloc_delayed_extent_op();
 	if (!extent_op)
 		return -ENOMEM;
 
@@ -2404,10 +2814,12 @@
 	extent_op->update_flags = 1;
 	extent_op->update_key = 0;
 	extent_op->is_data = is_data ? 1 : 0;
+	extent_op->level = level;
 
-	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
+					  num_bytes, extent_op);
 	if (ret)
-		kfree(extent_op);
+		btrfs_free_delayed_extent_op(extent_op);
 	return ret;
 }
 
@@ -2463,8 +2875,10 @@
 
 	node = rb_prev(node);
 	if (node) {
+		int seq = ref->seq;
+
 		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-		if (ref->bytenr == bytenr)
+		if (ref->bytenr == bytenr && ref->seq == seq)
 			goto out_unlock;
 	}
 
@@ -2501,7 +2915,7 @@
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
-	BUG_ON(ret == 0);
+	BUG_ON(ret == 0); /* Corruption */
 
 	ret = -ENOENT;
 	if (path->slots[0] == 0)
@@ -2590,7 +3004,7 @@
 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   struct extent_buffer *buf,
-			   int full_backref, int inc)
+			   int full_backref, int inc, int for_cow)
 {
 	u64 bytenr;
 	u64 num_bytes;
@@ -2603,7 +3017,7 @@
 	int level;
 	int ret = 0;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-			    u64, u64, u64, u64, u64, u64);
+			    u64, u64, u64, u64, u64, u64, int);
 
 	ref_root = btrfs_header_owner(buf);
 	nritems = btrfs_header_nritems(buf);
@@ -2640,34 +3054,34 @@
 			key.offset -= btrfs_file_extent_offset(buf, fi);
 			ret = process_func(trans, root, bytenr, num_bytes,
 					   parent, ref_root, key.objectid,
-					   key.offset);
+					   key.offset, for_cow);
 			if (ret)
 				goto fail;
 		} else {
 			bytenr = btrfs_node_blockptr(buf, i);
 			num_bytes = btrfs_level_size(root, level - 1);
 			ret = process_func(trans, root, bytenr, num_bytes,
-					   parent, ref_root, level - 1, 0);
+					   parent, ref_root, level - 1, 0,
+					   for_cow);
 			if (ret)
 				goto fail;
 		}
 	}
 	return 0;
 fail:
-	BUG();
 	return ret;
 }
 
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref)
+		  struct extent_buffer *buf, int full_backref, int for_cow)
 {
-	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+	return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
 }
 
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *buf, int full_backref)
+		  struct extent_buffer *buf, int full_backref, int for_cow)
 {
-	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
+	return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
 }
 
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2683,7 +3097,7 @@
 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
 	if (ret < 0)
 		goto fail;
-	BUG_ON(ret);
+	BUG_ON(ret); /* Corruption */
 
 	leaf = path->nodes[0];
 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -2691,8 +3105,10 @@
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(path);
 fail:
-	if (ret)
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
 		return ret;
+	}
 	return 0;
 
 }
@@ -2776,6 +3192,11 @@
 	WARN_ON(ret);
 
 	if (i_size_read(inode) > 0) {
+		ret = btrfs_check_trunc_cache_free_space(root,
+					&root->fs_info->global_block_rsv);
+		if (ret)
+			goto out_put;
+
 		ret = btrfs_truncate_free_space_cache(root, trans, path,
 						      inode);
 		if (ret)
@@ -2783,25 +3204,29 @@
 	}
 
 	spin_lock(&block_group->lock);
-	if (block_group->cached != BTRFS_CACHE_FINISHED) {
-		/* We're not cached, don't bother trying to write stuff out */
+	if (block_group->cached != BTRFS_CACHE_FINISHED ||
+	    !btrfs_test_opt(root, SPACE_CACHE)) {
+		/*
+		 * don't bother trying to write stuff out _if_
+		 * a) we're not cached,
+		 * b) we're using the nospace_cache mount option.
+		 */
 		dcs = BTRFS_DC_WRITTEN;
 		spin_unlock(&block_group->lock);
 		goto out_put;
 	}
 	spin_unlock(&block_group->lock);
 
-	num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
+	/*
+	 * Try to preallocate enough space based on how big the block group is.
+	 * Keep in mind this has to include any pinned space which could end up
+	 * taking up quite a bit since it's not folded into the other space
+	 * cache.
+	 */
+	num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
 	if (!num_pages)
 		num_pages = 1;
 
-	/*
-	 * Just to make absolutely sure we have enough space, we're going to
-	 * preallocate 12 pages worth of space for each block group.  In
-	 * practice we ought to use at most 8, but we need extra space so we can
-	 * add our header and have a terminator between the extents and the
-	 * bitmaps.
-	 */
 	num_pages *= 16;
 	num_pages *= PAGE_CACHE_SIZE;
 
@@ -2865,7 +3290,8 @@
 		if (last == 0) {
 			err = btrfs_run_delayed_refs(trans, root,
 						     (unsigned long)-1);
-			BUG_ON(err);
+			if (err) /* File system offline */
+				goto out;
 		}
 
 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -2892,7 +3318,9 @@
 		last = cache->key.objectid + cache->key.offset;
 
 		err = write_one_cache_group(trans, root, path, cache);
-		BUG_ON(err);
+		if (err) /* File system offline */
+			goto out;
+
 		btrfs_put_block_group(cache);
 	}
 
@@ -2905,7 +3333,8 @@
 		if (last == 0) {
 			err = btrfs_run_delayed_refs(trans, root,
 						     (unsigned long)-1);
-			BUG_ON(err);
+			if (err) /* File system offline */
+				goto out;
 		}
 
 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
@@ -2930,20 +3359,21 @@
 			continue;
 		}
 
-		btrfs_write_out_cache(root, trans, cache, path);
+		err = btrfs_write_out_cache(root, trans, cache, path);
 
 		/*
 		 * If we didn't have an error then the cache state is still
 		 * NEED_WRITE, so we can set it to WRITTEN.
 		 */
-		if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
+		if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
 			cache->disk_cache_state = BTRFS_DC_WRITTEN;
 		last = cache->key.objectid + cache->key.offset;
 		btrfs_put_block_group(cache);
 	}
+out:
 
 	btrfs_free_path(path);
-	return 0;
+	return err;
 }
 
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
@@ -2966,6 +3396,7 @@
 	struct btrfs_space_info *found;
 	int i;
 	int factor;
+	int ret;
 
 	if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
 		     BTRFS_BLOCK_GROUP_RAID10))
@@ -2989,13 +3420,17 @@
 	if (!found)
 		return -ENOMEM;
 
+	ret = percpu_counter_init(&found->total_bytes_pinned, 0);
+	if (ret) {
+		kfree(found);
+		return ret;
+	}
+
 	for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
 		INIT_LIST_HEAD(&found->block_groups[i]);
 	init_rwsem(&found->groups_sem);
 	spin_lock_init(&found->lock);
-	found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
-				BTRFS_BLOCK_GROUP_SYSTEM |
-				BTRFS_BLOCK_GROUP_METADATA);
+	found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
 	found->total_bytes = total_bytes;
 	found->disk_total = total_bytes * factor;
 	found->bytes_used = bytes_used;
@@ -3011,26 +3446,62 @@
 	init_waitqueue_head(&found->wait);
 	*space_info = found;
 	list_add_rcu(&found->list, &info->space_info);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		info->data_sinfo = found;
 	return 0;
 }
 
 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 {
-	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
-				   BTRFS_BLOCK_GROUP_RAID1 |
-				   BTRFS_BLOCK_GROUP_RAID10 |
-				   BTRFS_BLOCK_GROUP_DUP);
-	if (extra_flags) {
-		if (flags & BTRFS_BLOCK_GROUP_DATA)
-			fs_info->avail_data_alloc_bits |= extra_flags;
-		if (flags & BTRFS_BLOCK_GROUP_METADATA)
-			fs_info->avail_metadata_alloc_bits |= extra_flags;
-		if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-			fs_info->avail_system_alloc_bits |= extra_flags;
+	u64 extra_flags = chunk_to_extended(flags) &
+				BTRFS_EXTENDED_PROFILE_MASK;
+
+	write_seqlock(&fs_info->profiles_lock);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		fs_info->avail_data_alloc_bits |= extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		fs_info->avail_metadata_alloc_bits |= extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		fs_info->avail_system_alloc_bits |= extra_flags;
+	write_sequnlock(&fs_info->profiles_lock);
+}
+
+/*
+ * returns target flags in extended format or 0 if restripe for this
+ * chunk_type is not in progress
+ *
+ * should be called with either volume_mutex or balance_lock held
+ */
+static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+	u64 target = 0;
+
+	if (!bctl)
+		return 0;
+
+	if (flags & BTRFS_BLOCK_GROUP_DATA &&
+	    bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
+	} else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
+		   bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
+	} else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
+		   bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
 	}
+
+	return target;
 }
 
-u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
+/*
+ * @flags: available profiles in extended format (see ctree.h)
+ *
+ * Returns reduced profile in chunk format.  If profile changing is in
+ * progress (either running or paused) picks the target profile (if it's
+ * already available), otherwise falls back to plain reducing.
+ */
+static u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
 	/*
 	 * we add in the count of missing devices because we want
@@ -3039,48 +3510,74 @@
 	 */
 	u64 num_devices = root->fs_info->fs_devices->rw_devices +
 		root->fs_info->fs_devices->missing_devices;
+	u64 target;
+	u64 tmp;
+
+	/*
+	 * see if restripe for this chunk_type is in progress, if so
+	 * try to reduce to the target profile
+	 */
+	spin_lock(&root->fs_info->balance_lock);
+	target = get_restripe_target(root->fs_info, flags);
+	if (target) {
+		/* pick target profile only if it's already available */
+		if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
+			spin_unlock(&root->fs_info->balance_lock);
+			return extended_to_chunk(target);
+		}
+	}
+	spin_unlock(&root->fs_info->balance_lock);
 
+	/* First, mask out the RAID levels which aren't possible */
 	if (num_devices == 1)
-		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+			   BTRFS_BLOCK_GROUP_RAID5);
+	if (num_devices < 3)
+		flags &= ~BTRFS_BLOCK_GROUP_RAID6;
 	if (num_devices < 4)
 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
 
-	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
-	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-		      BTRFS_BLOCK_GROUP_RAID10))) {
-		flags &= ~BTRFS_BLOCK_GROUP_DUP;
-	}
-
-	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
-	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
-		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
-	}
-
-	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
-	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
-	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
-	     (flags & BTRFS_BLOCK_GROUP_DUP)))
-		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
-	return flags;
+	tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+		       BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+	flags &= ~tmp;
+
+	if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+		tmp = BTRFS_BLOCK_GROUP_RAID6;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+		tmp = BTRFS_BLOCK_GROUP_RAID5;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+		tmp = BTRFS_BLOCK_GROUP_RAID10;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+		tmp = BTRFS_BLOCK_GROUP_RAID1;
+	else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+		tmp = BTRFS_BLOCK_GROUP_RAID0;
+
+	return extended_to_chunk(flags | tmp);
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 {
-	if (flags & BTRFS_BLOCK_GROUP_DATA)
-		flags |= root->fs_info->avail_data_alloc_bits &
-			 root->fs_info->data_alloc_profile;
-	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-		flags |= root->fs_info->avail_system_alloc_bits &
-			 root->fs_info->system_alloc_profile;
-	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-		flags |= root->fs_info->avail_metadata_alloc_bits &
-			 root->fs_info->metadata_alloc_profile;
+	unsigned seq;
+
+	do {
+		seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+		if (flags & BTRFS_BLOCK_GROUP_DATA)
+			flags |= root->fs_info->avail_data_alloc_bits;
+		else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+			flags |= root->fs_info->avail_system_alloc_bits;
+		else if (flags & BTRFS_BLOCK_GROUP_METADATA)
+			flags |= root->fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
+
 	return btrfs_reduce_alloc_profile(root, flags);
 }
 
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
 	u64 flags;
+	u64 ret;
 
 	if (data)
 		flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3089,13 +3586,8 @@
 	else
 		flags = BTRFS_BLOCK_GROUP_METADATA;
 
-	return get_alloc_profile(root, flags);
-}
-
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
-	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
-						       BTRFS_BLOCK_GROUP_DATA);
+	ret = get_alloc_profile(root, flags);
+	return ret;
 }
 
 /*
@@ -3106,11 +3598,12 @@
 {
 	struct btrfs_space_info *data_sinfo;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 used;
 	int ret = 0, committed = 0, alloc_chunk = 1;
 
 	/* make sure bytes are sectorsize aligned */
-	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+	bytes = ALIGN(bytes, root->sectorsize);
 
 	if (root == root->fs_info->tree_root ||
 	    BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
@@ -3118,7 +3611,7 @@
 		committed = 1;
 	}
 
-	data_sinfo = BTRFS_I(inode)->space_info;
+	data_sinfo = fs_info->data_sinfo;
 	if (!data_sinfo)
 		goto alloc;
 
@@ -3148,7 +3641,6 @@
 				return PTR_ERR(trans);
 
 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-					     bytes + 2 * 1024 * 1024,
 					     alloc_target,
 					     CHUNK_ALLOC_NO_FORCE);
 			btrfs_end_transaction(trans, root);
@@ -3159,18 +3651,18 @@
 					goto commit_trans;
 			}
 
-			if (!data_sinfo) {
-				btrfs_set_inode_space_info(root, inode);
-				data_sinfo = BTRFS_I(inode)->space_info;
-			}
+			if (!data_sinfo)
+				data_sinfo = fs_info->data_sinfo;
+
 			goto again;
 		}
 
 		/*
-		 * If we have less pinned bytes than we want to allocate then
-		 * don't bother committing the transaction, it won't help us.
+		 * If we don't have enough pinned space to deal with this
+		 * allocation don't bother committing the transaction.
 		 */
-		if (data_sinfo->bytes_pinned < bytes)
+		if (percpu_counter_compare(&data_sinfo->total_bytes_pinned,
+					   bytes) < 0)
 			committed = 1;
 		spin_unlock(&data_sinfo->lock);
 
@@ -3179,6 +3671,7 @@
 		if (!committed &&
 		    !atomic_read(&root->fs_info->open_ioctl_trans)) {
 			committed = 1;
+
 			trans = btrfs_join_transaction(root);
 			if (IS_ERR(trans))
 				return PTR_ERR(trans);
@@ -3191,6 +3684,8 @@
 		return -ENOSPC;
 	}
 	data_sinfo->bytes_may_use += bytes;
+	trace_btrfs_space_reservation(root->fs_info, "space_info",
+				      data_sinfo->flags, bytes, 1);
 	spin_unlock(&data_sinfo->lock);
 
 	return 0;
@@ -3205,11 +3700,14 @@
 	struct btrfs_space_info *data_sinfo;
 
 	/* make sure bytes are sectorsize aligned */
-	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+	bytes = ALIGN(bytes, root->sectorsize);
 
-	data_sinfo = BTRFS_I(inode)->space_info;
+	data_sinfo = root->fs_info->data_sinfo;
 	spin_lock(&data_sinfo->lock);
+	WARN_ON(data_sinfo->bytes_may_use < bytes);
 	data_sinfo->bytes_may_use -= bytes;
+	trace_btrfs_space_reservation(root->fs_info, "space_info",
+				      data_sinfo->flags, bytes, 0);
 	spin_unlock(&data_sinfo->lock);
 }
 
@@ -3226,9 +3724,13 @@
 	rcu_read_unlock();
 }
 
+static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
+{
+	return (global->size << 1);
+}
+
 static int should_alloc_chunk(struct btrfs_root *root,
-			      struct btrfs_space_info *sinfo, u64 alloc_bytes,
-			      int force)
+			      struct btrfs_space_info *sinfo, int force)
 {
 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
@@ -3243,7 +3745,8 @@
 	 * and purposes it's used space.  Don't worry about locking the
 	 * global_rsv, it doesn't change except when the transaction commits.
 	 */
-	num_allocated += global_rsv->size;
+	if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
+		num_allocated += calc_global_rsv_need_space(global_rsv);
 
 	/*
 	 * in limited mode, we want to have some free space up to
@@ -3258,59 +3761,91 @@
 			return 1;
 	}
 
-	/*
-	 * we have two similar checks here, one based on percentage
-	 * and once based on a hard number of 256MB.  The idea
-	 * is that if we have a good amount of free
-	 * room, don't allocate a chunk.  A good mount is
-	 * less than 80% utilized of the chunks we have allocated,
-	 * or more than 256MB free
-	 */
-	if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+	if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
 		return 0;
+	return 1;
+}
 
-	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
-		return 0;
+static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
+{
+	u64 num_dev;
+
+	if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+		    BTRFS_BLOCK_GROUP_RAID0 |
+		    BTRFS_BLOCK_GROUP_RAID5 |
+		    BTRFS_BLOCK_GROUP_RAID6))
+		num_dev = root->fs_info->fs_devices->rw_devices;
+	else if (type & BTRFS_BLOCK_GROUP_RAID1)
+		num_dev = 2;
+	else
+		num_dev = 1;	/* DUP or single */
 
-	thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
+	/* metadata for updating devices and chunk tree */
+	return btrfs_calc_trans_metadata_size(root, num_dev + 1);
+}
 
-	/* 256MB or 5% of the FS */
-	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
+static void check_system_chunk(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, u64 type)
+{
+	struct btrfs_space_info *info;
+	u64 left;
+	u64 thresh;
 
-	if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
-		return 0;
-	return 1;
+	info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+	spin_lock(&info->lock);
+	left = info->total_bytes - info->bytes_used - info->bytes_pinned -
+		info->bytes_reserved - info->bytes_readonly;
+	spin_unlock(&info->lock);
+
+	thresh = get_system_chunk_thresh(root, type);
+	if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
+		btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
+			left, thresh, type);
+		dump_space_info(info, 0, 0);
+	}
+
+	if (left < thresh) {
+		u64 flags;
+
+		flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
+		btrfs_alloc_chunk(trans, root, flags);
+	}
 }
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *extent_root, u64 alloc_bytes,
-			  u64 flags, int force)
+			  struct btrfs_root *extent_root, u64 flags, int force)
 {
 	struct btrfs_space_info *space_info;
 	struct btrfs_fs_info *fs_info = extent_root->fs_info;
 	int wait_for_alloc = 0;
 	int ret = 0;
 
-	flags = btrfs_reduce_alloc_profile(extent_root, flags);
+	/* Don't re-enter if we're already allocating a chunk */
+	if (trans->allocating_chunk)
+		return -ENOSPC;
 
 	space_info = __find_space_info(extent_root->fs_info, flags);
 	if (!space_info) {
 		ret = update_space_info(extent_root->fs_info, flags,
 					0, 0, &space_info);
-		BUG_ON(ret);
+		BUG_ON(ret); /* -ENOMEM */
 	}
-	BUG_ON(!space_info);
+	BUG_ON(!space_info); /* Logic error */
 
 again:
 	spin_lock(&space_info->lock);
-	if (space_info->force_alloc)
+	if (force < space_info->force_alloc)
 		force = space_info->force_alloc;
 	if (space_info->full) {
+		if (should_alloc_chunk(extent_root, space_info, force))
+			ret = -ENOSPC;
+		else
+			ret = 0;
 		spin_unlock(&space_info->lock);
-		return 0;
+		return ret;
 	}
 
-	if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
+	if (!should_alloc_chunk(extent_root, space_info, force)) {
 		spin_unlock(&space_info->lock);
 		return 0;
 	} else if (space_info->chunk_alloc) {
@@ -3335,6 +3870,8 @@
 		goto again;
 	}
 
+	trans->allocating_chunk = true;
+
 	/*
 	 * If we have mixed data/metadata chunks we want to make sure we keep
 	 * allocating mixed chunks instead of individual chunks.
@@ -3354,110 +3891,174 @@
 			force_metadata_allocation(fs_info);
 	}
 
+	/*
+	 * Check if we have enough space in SYSTEM chunk because we may need
+	 * to update devices.
+	 */
+	check_system_chunk(trans, extent_root, flags);
+
 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
-	if (ret < 0 && ret != -ENOSPC)
-		goto out;
+	trans->allocating_chunk = false;
 
 	spin_lock(&space_info->lock);
+	if (ret < 0 && ret != -ENOSPC)
+		goto out;
 	if (ret)
 		space_info->full = 1;
 	else
 		ret = 1;
 
 	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
+out:
 	space_info->chunk_alloc = 0;
 	spin_unlock(&space_info->lock);
-out:
-	mutex_unlock(&extent_root->fs_info->chunk_mutex);
+	mutex_unlock(&fs_info->chunk_mutex);
 	return ret;
 }
 
+static int can_overcommit(struct btrfs_root *root,
+			  struct btrfs_space_info *space_info, u64 bytes,
+			  enum btrfs_reserve_flush_enum flush)
+{
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+	u64 profile = btrfs_get_alloc_profile(root, 0);
+	u64 space_size;
+	u64 avail;
+	u64 used;
+
+	used = space_info->bytes_used + space_info->bytes_reserved +
+		space_info->bytes_pinned + space_info->bytes_readonly;
+
+	/*
+	 * We only want to allow over committing if we have lots of actual space
+	 * free, but if we don't have enough space to handle the global reserve
+	 * space then we could end up having a real enospc problem when trying
+	 * to allocate a chunk or some other such important allocation.
+	 */
+	spin_lock(&global_rsv->lock);
+	space_size = calc_global_rsv_need_space(global_rsv);
+	spin_unlock(&global_rsv->lock);
+	if (used + space_size >= space_info->total_bytes)
+		return 0;
+
+	used += space_info->bytes_may_use;
+
+	spin_lock(&root->fs_info->free_chunk_lock);
+	avail = root->fs_info->free_chunk_space;
+	spin_unlock(&root->fs_info->free_chunk_lock);
+
+	/*
+	 * If we have dup, raid1 or raid10 then only half of the free
+	 * space is actually useable.  For raid56, the space info used
+	 * doesn't include the parity drive, so we don't have to
+	 * change the math
+	 */
+	if (profile & (BTRFS_BLOCK_GROUP_DUP |
+		       BTRFS_BLOCK_GROUP_RAID1 |
+		       BTRFS_BLOCK_GROUP_RAID10))
+		avail >>= 1;
+
+	/*
+	 * If we aren't flushing all things, let us overcommit up to
+	 * 1/2 of the space. If we can flush, don't let us overcommit
+	 * too much, let it overcommit up to 1/8 of the space.
+	 */
+	if (flush == BTRFS_RESERVE_FLUSH_ALL)
+		avail >>= 3;
+	else
+		avail >>= 1;
+
+	if (used + bytes < space_info->total_bytes + avail)
+		return 1;
+	return 0;
+}
+
+static void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
+					 unsigned long nr_pages)
+{
+	struct super_block *sb = root->fs_info->sb;
+
+	if (down_read_trylock(&sb->s_umount)) {
+		writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE);
+		up_read(&sb->s_umount);
+	} else {
+		/*
+		 * We needn't worry about the filesystem going from r/w to r/o
+		 * even though we don't acquire the ->s_umount mutex, because
+		 * the filesystem should guarantee that the delalloc inode list
+		 * is empty once the filesystem is read-only (all dirty pages
+		 * are written to the disk).
+		 */
+		btrfs_start_all_delalloc_inodes(root->fs_info, 0);
+		if (!current->journal_info)
+			btrfs_wait_all_ordered_extents(root->fs_info);
+	}
+}
+
 /*
  * shrink metadata reservation for delalloc
  */
-static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
-			   bool wait_ordered)
+static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
+			    bool wait_ordered)
 {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_space_info *space_info;
 	struct btrfs_trans_handle *trans;
-	u64 reserved;
+	u64 delalloc_bytes;
 	u64 max_reclaim;
-	u64 reclaimed = 0;
 	long time_left;
 	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
 	int loops = 0;
-	unsigned long progress;
+	enum btrfs_reserve_flush_enum flush;
 
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
 
 	smp_mb();
-	reserved = space_info->bytes_may_use;
-	progress = space_info->reservation_progress;
-
-	if (reserved == 0)
-		return 0;
-
-	smp_mb();
-	if (root->fs_info->delalloc_bytes == 0) {
+	delalloc_bytes = percpu_counter_sum_positive(
+						&root->fs_info->delalloc_bytes);
+	if (delalloc_bytes == 0) {
 		if (trans)
-			return 0;
-		btrfs_wait_ordered_extents(root, 0, 0);
-		return 0;
+			return;
+		btrfs_wait_all_ordered_extents(root->fs_info);
+		return;
 	}
 
-	max_reclaim = min(reserved, to_reclaim);
-	nr_pages = max_t(unsigned long, nr_pages,
-			 max_reclaim >> PAGE_CACHE_SHIFT);
-	while (loops < 1024) {
-		/* have the flusher threads jump in and do some IO */
-		smp_mb();
-		nr_pages = min_t(unsigned long, nr_pages,
-		       root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
-		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages,
-						WB_REASON_FS_FREE_SPACE);
+	while (delalloc_bytes && loops < 3) {
+		max_reclaim = min(delalloc_bytes, to_reclaim);
+		nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
+		btrfs_writeback_inodes_sb_nr(root, nr_pages);
+		/*
+		 * We need to wait for the async pages to actually start before
+		 * we do anything.
+		 */
+		wait_event(root->fs_info->async_submit_wait,
+			   !atomic_read(&root->fs_info->async_delalloc_pages));
 
+		if (!trans)
+			flush = BTRFS_RESERVE_FLUSH_ALL;
+		else
+			flush = BTRFS_RESERVE_NO_FLUSH;
 		spin_lock(&space_info->lock);
-		if (reserved > space_info->bytes_may_use)
-			reclaimed += reserved - space_info->bytes_may_use;
-		reserved = space_info->bytes_may_use;
+		if (can_overcommit(root, space_info, orig, flush)) {
+			spin_unlock(&space_info->lock);
+			break;
+		}
 		spin_unlock(&space_info->lock);
 
 		loops++;
-
-		if (reserved == 0 || reclaimed >= max_reclaim)
-			break;
-
-		if (trans && trans->transaction->blocked)
-			return -EAGAIN;
-
 		if (wait_ordered && !trans) {
-			btrfs_wait_ordered_extents(root, 0, 0);
+			btrfs_wait_all_ordered_extents(root->fs_info);
 		} else {
-			time_left = schedule_timeout_interruptible(1);
-
-			/* We were interrupted, exit */
+			time_left = schedule_timeout_killable(1);
 			if (time_left)
 				break;
 		}
-
-		/* we've kicked the IO a few times, if anything has been freed,
-		 * exit.  There is no sense in looping here for a long time
-		 * when we really need to commit the transaction, or there are
-		 * just too many writers without enough free space
-		 */
-
-		if (loops > 3) {
-			smp_mb();
-			if (progress != space_info->reservation_progress)
-				break;
-		}
-
+		smp_mb();
+		delalloc_bytes = percpu_counter_sum_positive(
+						&root->fs_info->delalloc_bytes);
 	}
-
-	return reclaimed >= to_reclaim;
 }
 
 /**
@@ -3486,7 +4087,8 @@
 
 	/* See if there is enough pinned space to make this reservation */
 	spin_lock(&space_info->lock);
-	if (space_info->bytes_pinned >= bytes) {
+	if (percpu_counter_compare(&space_info->total_bytes_pinned,
+				   bytes) >= 0) {
 		spin_unlock(&space_info->lock);
 		goto commit;
 	}
@@ -3499,12 +4101,16 @@
 	if (space_info != delayed_rsv->space_info)
 		return -ENOSPC;
 
+	spin_lock(&space_info->lock);
 	spin_lock(&delayed_rsv->lock);
-	if (delayed_rsv->size < bytes) {
+	if (percpu_counter_compare(&space_info->total_bytes_pinned,
+				   bytes - delayed_rsv->size) >= 0) {
 		spin_unlock(&delayed_rsv->lock);
+		spin_unlock(&space_info->lock);
 		return -ENOSPC;
 	}
 	spin_unlock(&delayed_rsv->lock);
+	spin_unlock(&space_info->lock);
 
 commit:
 	trans = btrfs_join_transaction(root);
@@ -3514,6 +4120,72 @@
 	return btrfs_commit_transaction(trans, root);
 }
 
+enum flush_state {
+	FLUSH_DELAYED_ITEMS_NR	=	1,
+	FLUSH_DELAYED_ITEMS	=	2,
+	FLUSH_DELALLOC		=	3,
+	FLUSH_DELALLOC_WAIT	=	4,
+	ALLOC_CHUNK		=	5,
+	COMMIT_TRANS		=	6,
+};
+
+static int flush_space(struct btrfs_root *root,
+		       struct btrfs_space_info *space_info, u64 num_bytes,
+		       u64 orig_bytes, int state)
+{
+	struct btrfs_trans_handle *trans;
+	int nr;
+	int ret = 0;
+
+	switch (state) {
+	case FLUSH_DELAYED_ITEMS_NR:
+	case FLUSH_DELAYED_ITEMS:
+		if (state == FLUSH_DELAYED_ITEMS_NR) {
+			u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+			nr = (int)div64_u64(num_bytes, bytes);
+			if (!nr)
+				nr = 1;
+			nr *= 2;
+		} else {
+			nr = -1;
+		}
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+		ret = btrfs_run_delayed_items_nr(trans, root, nr);
+		btrfs_end_transaction(trans, root);
+		break;
+	case FLUSH_DELALLOC:
+	case FLUSH_DELALLOC_WAIT:
+		shrink_delalloc(root, num_bytes, orig_bytes,
+				state == FLUSH_DELALLOC_WAIT);
+		break;
+	case ALLOC_CHUNK:
+		trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			break;
+		}
+		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
+				     btrfs_get_alloc_profile(root, 0),
+				     CHUNK_ALLOC_NO_FORCE);
+		btrfs_end_transaction(trans, root);
+		if (ret == -ENOSPC)
+			ret = 0;
+		break;
+	case COMMIT_TRANS:
+		ret = may_commit_transaction(root, space_info, orig_bytes, 0);
+		break;
+	default:
+		ret = -ENOSPC;
+		break;
+	}
+
+	return ret;
+}
 /**
  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
  * @root - the root we're allocating for
@@ -3530,25 +4202,25 @@
  */
 static int reserve_metadata_bytes(struct btrfs_root *root,
 				  struct btrfs_block_rsv *block_rsv,
-				  u64 orig_bytes, int flush)
+				  u64 orig_bytes,
+				  enum btrfs_reserve_flush_enum flush)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
 	u64 used;
 	u64 num_bytes = orig_bytes;
-	int retries = 0;
+	int flush_state = FLUSH_DELAYED_ITEMS_NR;
 	int ret = 0;
-	bool committed = false;
 	bool flushing = false;
-	bool wait_ordered = false;
 
 again:
 	ret = 0;
 	spin_lock(&space_info->lock);
 	/*
-	 * We only want to wait if somebody other than us is flushing and we are
-	 * actually alloed to flush.
+	 * We only want to wait if somebody other than us is flushing and we
+	 * are actually allowed to flush all things.
 	 */
-	while (flush && !flushing && space_info->flush) {
+	while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
+	       space_info->flush) {
 		spin_unlock(&space_info->lock);
 		/*
 		 * If we have a trans handle we can't wait because the flusher
@@ -3558,9 +4230,8 @@
 		 */
 		if (current->journal_info)
 			return -EAGAIN;
-		ret = wait_event_interruptible(space_info->wait,
-					       !space_info->flush);
-		/* Must have been interrupted, return */
+		ret = wait_event_killable(space_info->wait, !space_info->flush);
+		/* Must have been killed, return */
 		if (ret)
 			return -EINTR;
 
@@ -3582,6 +4253,8 @@
 	if (used <= space_info->total_bytes) {
 		if (used + orig_bytes <= space_info->total_bytes) {
 			space_info->bytes_may_use += orig_bytes;
+			trace_btrfs_space_reservation(root->fs_info,
+				"space_info", space_info->flags, orig_bytes, 1);
 			ret = 0;
 		} else {
 			/*
@@ -3597,111 +4270,68 @@
 		 * amount plus the amount of bytes that we need for this
 		 * reservation.
 		 */
-		wait_ordered = true;
 		num_bytes = used - space_info->total_bytes +
-			(orig_bytes * (retries + 1));
+			(orig_bytes * 2);
 	}
 
-	if (ret) {
-		u64 profile = btrfs_get_alloc_profile(root, 0);
-		u64 avail;
-
-		/*
-		 * If we have a lot of space that's pinned, don't bother doing
-		 * the overcommit dance yet and just commit the transaction.
-		 */
-		avail = (space_info->total_bytes - space_info->bytes_used) * 8;
-		do_div(avail, 10);
-		if (space_info->bytes_pinned >= avail && flush && !committed) {
-			space_info->flush = 1;
-			flushing = true;
-			spin_unlock(&space_info->lock);
-			ret = may_commit_transaction(root, space_info,
-						     orig_bytes, 1);
-			if (ret)
-				goto out;
-			committed = true;
-			goto again;
-		}
-
-		spin_lock(&root->fs_info->free_chunk_lock);
-		avail = root->fs_info->free_chunk_space;
-
-		/*
-		 * If we have dup, raid1 or raid10 then only half of the free
-		 * space is actually useable.
-		 */
-		if (profile & (BTRFS_BLOCK_GROUP_DUP |
-			       BTRFS_BLOCK_GROUP_RAID1 |
-			       BTRFS_BLOCK_GROUP_RAID10))
-			avail >>= 1;
-
-		/*
-		 * If we aren't flushing don't let us overcommit too much, say
-		 * 1/8th of the space.  If we can flush, let it overcommit up to
-		 * 1/2 of the space.
-		 */
-		if (flush)
-			avail >>= 3;
-		else
-			avail >>= 1;
-		 spin_unlock(&root->fs_info->free_chunk_lock);
-
-		if (used + num_bytes < space_info->total_bytes + avail) {
-			space_info->bytes_may_use += orig_bytes;
-			ret = 0;
-		} else {
-			wait_ordered = true;
-		}
+	if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
+		space_info->bytes_may_use += orig_bytes;
+		trace_btrfs_space_reservation(root->fs_info, "space_info",
+					      space_info->flags, orig_bytes,
+					      1);
+		ret = 0;
 	}
 
 	/*
 	 * Couldn't make our reservation, save our place so while we're trying
 	 * to reclaim space we can actually use it instead of somebody else
 	 * stealing it from us.
+	 *
+	 * We make the other tasks wait for the flush only when we can flush
+	 * all things.
 	 */
-	if (ret && flush) {
+	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
 		flushing = true;
 		space_info->flush = 1;
 	}
 
 	spin_unlock(&space_info->lock);
 
-	if (!ret || !flush)
-		goto out;
-
-	/*
-	 * We do synchronous shrinking since we don't actually unreserve
-	 * metadata until after the IO is completed.
-	 */
-	ret = shrink_delalloc(root, num_bytes, wait_ordered);
-	if (ret < 0)
+	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
 		goto out;
 
-	ret = 0;
+	ret = flush_space(root, space_info, num_bytes, orig_bytes,
+			  flush_state);
+	flush_state++;
 
 	/*
-	 * So if we were overcommitted it's possible that somebody else flushed
-	 * out enough space and we simply didn't have enough space to reclaim,
-	 * so go back around and try again.
-	 */
-	if (retries < 2) {
-		wait_ordered = true;
-		retries++;
-		goto again;
-	}
-
-	ret = -ENOSPC;
-	if (committed)
-		goto out;
+	 * If we are FLUSH_LIMIT, we can not flush delalloc, or a deadlock
+	 * could occur. So skip the delalloc flush states.
+	 */
+	if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+	    (flush_state == FLUSH_DELALLOC ||
+	     flush_state == FLUSH_DELALLOC_WAIT))
+		flush_state = ALLOC_CHUNK;
 
-	ret = may_commit_transaction(root, space_info, orig_bytes, 0);
-	if (!ret) {
-		committed = true;
+	if (!ret)
+		goto again;
+	else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
+		 flush_state < COMMIT_TRANS)
+		goto again;
+	else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+		 flush_state <= COMMIT_TRANS)
 		goto again;
-	}
 
 out:
+	if (ret == -ENOSPC &&
+	    unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+		struct btrfs_block_rsv *global_rsv =
+			&root->fs_info->global_block_rsv;
+
+		if (block_rsv != global_rsv &&
+		    !block_rsv_use_bytes(global_rsv, orig_bytes))
+			ret = 0;
+	}
 	if (flushing) {
 		spin_lock(&space_info->lock);
 		space_info->flush = 0;
@@ -3711,12 +4341,19 @@
 	return ret;
 }
 
-static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
-					     struct btrfs_root *root)
+static struct btrfs_block_rsv *get_block_rsv(
+					const struct btrfs_trans_handle *trans,
+					const struct btrfs_root *root)
 {
 	struct btrfs_block_rsv *block_rsv = NULL;
 
-	if (root->ref_cows || root == root->fs_info->csum_root)
+	if (root->ref_cows)
+		block_rsv = trans->block_rsv;
+
+	if (root == root->fs_info->csum_root && trans->adding_csums)
+		block_rsv = trans->block_rsv;
+
+	if (root == root->fs_info->uuid_root)
 		block_rsv = trans->block_rsv;
 
 	if (!block_rsv)
@@ -3755,7 +4392,33 @@
 	spin_unlock(&block_rsv->lock);
 }
 
-static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_rsv *dest, u64 num_bytes,
+			     int min_factor)
+{
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+	u64 min_bytes;
+
+	if (global_rsv->space_info != dest->space_info)
+		return -ENOSPC;
+
+	spin_lock(&global_rsv->lock);
+	min_bytes = div_factor(global_rsv->size, min_factor);
+	if (global_rsv->reserved < min_bytes + num_bytes) {
+		spin_unlock(&global_rsv->lock);
+		return -ENOSPC;
+	}
+	global_rsv->reserved -= num_bytes;
+	if (global_rsv->reserved < global_rsv->size)
+		global_rsv->full = 0;
+	spin_unlock(&global_rsv->lock);
+
+	block_rsv_add_bytes(dest, num_bytes, 1);
+	return 0;
+}
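
A minimal userspace sketch of the gate btrfs_cond_migrate_bytes() applies before moving bytes out of the global reserve. It assumes div_factor() scales by factor/10, as it does elsewhere in btrfs, and it only models the check, not the accounting on the destination rsv.

#include <stdio.h>
#include <stdint.h>

static uint64_t div_factor(uint64_t num, int factor)
{
	return num * factor / 10;
}

/* Returns 0 when migration is allowed, -1 for the -ENOSPC case. */
static int cond_migrate(uint64_t global_size, uint64_t *global_reserved,
			uint64_t num_bytes, int min_factor)
{
	uint64_t min_bytes = div_factor(global_size, min_factor);

	if (*global_reserved < min_bytes + num_bytes)
		return -1;
	*global_reserved -= num_bytes;
	return 0;
}

int main(void)
{
	uint64_t size = 512ULL << 20;		/* hypothetical 512 MiB rsv */
	uint64_t reserved = 300ULL << 20;	/* 300 MiB currently reserved */

	/* min_factor 5 keeps 256 MiB back; 256 + 32 <= 300, so this succeeds. */
	printf("first 32M: %d\n", cond_migrate(size, &reserved, 32ULL << 20, 5));
	/* Only 268 MiB remain; 256 + 32 = 288 MiB no longer fits -> fails. */
	printf("second 32M: %d\n", cond_migrate(size, &reserved, 32ULL << 20, 5));
	return 0;
}
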
+
+static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
+				    struct btrfs_block_rsv *block_rsv,
 				    struct btrfs_block_rsv *dest, u64 num_bytes)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
@@ -3791,7 +4454,8 @@
 		if (num_bytes) {
 			spin_lock(&space_info->lock);
 			space_info->bytes_may_use -= num_bytes;
-			space_info->reservation_progress++;
+			trace_btrfs_space_reservation(fs_info, "space_info",
+					space_info->flags, num_bytes, 0);
 			spin_unlock(&space_info->lock);
 		}
 	}
@@ -3810,13 +4474,15 @@
 	return 0;
 }
 
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
 {
 	memset(rsv, 0, sizeof(*rsv));
 	spin_lock_init(&rsv->lock);
+	rsv->type = type;
 }
 
-struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
+					      unsigned short type)
 {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_fs_info *fs_info = root->fs_info;
@@ -3825,7 +4491,7 @@
 	if (!block_rsv)
 		return NULL;
 
-	btrfs_init_block_rsv(block_rsv);
+	btrfs_init_block_rsv(block_rsv, type);
 	block_rsv->space_info = __find_space_info(fs_info,
 						  BTRFS_BLOCK_GROUP_METADATA);
 	return block_rsv;
@@ -3834,13 +4500,15 @@
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv)
 {
+	if (!rsv)
+		return;
 	btrfs_block_rsv_release(root, rsv, (u64)-1);
 	kfree(rsv);
 }
 
-static inline int __block_rsv_add(struct btrfs_root *root,
-				  struct btrfs_block_rsv *block_rsv,
-				  u64 num_bytes, int flush)
+int btrfs_block_rsv_add(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv, u64 num_bytes,
+			enum btrfs_reserve_flush_enum flush)
 {
 	int ret;
 
@@ -3856,20 +4524,6 @@
 	return ret;
 }
 
-int btrfs_block_rsv_add(struct btrfs_root *root,
-			struct btrfs_block_rsv *block_rsv,
-			u64 num_bytes)
-{
-	return __block_rsv_add(root, block_rsv, num_bytes, 1);
-}
-
-int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
-				struct btrfs_block_rsv *block_rsv,
-				u64 num_bytes)
-{
-	return __block_rsv_add(root, block_rsv, num_bytes, 0);
-}
-
 int btrfs_block_rsv_check(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv, int min_factor)
 {
@@ -3888,9 +4542,9 @@
 	return ret;
 }
 
-static inline int __btrfs_block_rsv_refill(struct btrfs_root *root,
-					   struct btrfs_block_rsv *block_rsv,
-					   u64 min_reserved, int flush)
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+			   struct btrfs_block_rsv *block_rsv, u64 min_reserved,
+			   enum btrfs_reserve_flush_enum flush)
 {
 	u64 num_bytes = 0;
 	int ret = -ENOSPC;
@@ -3918,20 +4572,6 @@
 	return ret;
 }
 
-int btrfs_block_rsv_refill(struct btrfs_root *root,
-			   struct btrfs_block_rsv *block_rsv,
-			   u64 min_reserved)
-{
-	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 1);
-}
-
-int btrfs_block_rsv_refill_noflush(struct btrfs_root *root,
-				   struct btrfs_block_rsv *block_rsv,
-				   u64 min_reserved)
-{
-	return __btrfs_block_rsv_refill(root, block_rsv, min_reserved, 0);
-}
-
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes)
@@ -3947,7 +4587,8 @@
 	if (global_rsv->full || global_rsv == block_rsv ||
 	    block_rsv->space_info != global_rsv->space_info)
 		global_rsv = NULL;
-	block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
+	block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
+				num_bytes);
 }
 
 /*
@@ -3993,10 +4634,10 @@
 
 	num_bytes = calc_global_metadata_size(fs_info);
 
-	spin_lock(&block_rsv->lock);
 	spin_lock(&sinfo->lock);
+	spin_lock(&block_rsv->lock);
 
-	block_rsv->size = num_bytes;
+	block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
 
 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
 		    sinfo->bytes_reserved + sinfo->bytes_readonly +
@@ -4006,18 +4647,21 @@
 		num_bytes = sinfo->total_bytes - num_bytes;
 		block_rsv->reserved += num_bytes;
 		sinfo->bytes_may_use += num_bytes;
+		trace_btrfs_space_reservation(fs_info, "space_info",
+				      sinfo->flags, num_bytes, 1);
 	}
 
 	if (block_rsv->reserved >= block_rsv->size) {
 		num_bytes = block_rsv->reserved - block_rsv->size;
 		sinfo->bytes_may_use -= num_bytes;
-		sinfo->reservation_progress++;
+		trace_btrfs_space_reservation(fs_info, "space_info",
+				      sinfo->flags, num_bytes, 0);
 		block_rsv->reserved = block_rsv->size;
 		block_rsv->full = 1;
 	}
 
-	spin_unlock(&sinfo->lock);
 	spin_unlock(&block_rsv->lock);
+	spin_unlock(&sinfo->lock);
 }
 
 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
@@ -4038,6 +4682,8 @@
 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
+	if (fs_info->quota_root)
+		fs_info->quota_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
 
 	update_global_block_rsv(fs_info);
@@ -4045,7 +4691,8 @@
 
 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 {
-	block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+	block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
+				(u64)-1);
 	WARN_ON(fs_info->delalloc_block_rsv.size > 0);
 	WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
 	WARN_ON(fs_info->trans_block_rsv.size > 0);
@@ -4059,13 +4706,19 @@
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root)
 {
+	if (!trans->block_rsv)
+		return;
+
 	if (!trans->bytes_reserved)
 		return;
 
+	trace_btrfs_space_reservation(root->fs_info, "transaction",
+				      trans->transid, trans->bytes_reserved, 0);
 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
 	trans->bytes_reserved = 0;
 }
 
+/* Can only return 0 or -ENOSPC */
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 				  struct inode *inode)
 {
@@ -4079,6 +4732,8 @@
 	 * when we are truly done with the orphan item.
 	 */
 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+	trace_btrfs_space_reservation(root->fs_info, "orphan",
+				      btrfs_ino(inode), num_bytes, 1);
 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
 
@@ -4086,22 +4741,71 @@
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+	trace_btrfs_space_reservation(root->fs_info, "orphan",
+				      btrfs_ino(inode), num_bytes, 0);
 	btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
 }
 
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending)
+/*
+ * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
+ * root: the root of the parent directory
+ * rsv: block reservation
+ * items: the number of items that we need to reserve
+ * qgroup_reserved: used to return the reserved size in qgroup
+ *
+ * This function is used to reserve the space for snapshot/subvolume
+ * creation and deletion. Those operations are different from the
+ * common file/directory operations: they change two fs/file trees
+ * and the root tree, and the number of items that the qgroup reserves
+ * differs from the free space reservation. So we can not use
+ * the space reservation mechanism in start_transaction().
+ */
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+				     struct btrfs_block_rsv *rsv,
+				     int items,
+				     u64 *qgroup_reserved,
+				     bool use_global_rsv)
 {
-	struct btrfs_root *root = pending->root;
-	struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
-	struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
-	/*
-	 * two for root back/forward refs, two for directory entries
-	 * and one for root of the snapshot.
-	 */
-	u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
-	dst_rsv->space_info = src_rsv->space_info;
-	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+	u64 num_bytes;
+	int ret;
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
+
+	if (root->fs_info->quota_enabled) {
+		/* One for parent inode, two for dir entries */
+		num_bytes = 3 * root->leafsize;
+		ret = btrfs_qgroup_reserve(root, num_bytes);
+		if (ret)
+			return ret;
+	} else {
+		num_bytes = 0;
+	}
+
+	*qgroup_reserved = num_bytes;
+
+	num_bytes = btrfs_calc_trans_metadata_size(root, items);
+	rsv->space_info = __find_space_info(root->fs_info,
+					    BTRFS_BLOCK_GROUP_METADATA);
+	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
+				  BTRFS_RESERVE_FLUSH_ALL);
+
+	if (ret == -ENOSPC && use_global_rsv)
+		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
+
+	if (ret) {
+		if (*qgroup_reserved)
+			btrfs_qgroup_free(root, *qgroup_reserved);
+	}
+
+	return ret;
+}
+
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+				      struct btrfs_block_rsv *rsv,
+				      u64 qgroup_reserved)
+{
+	btrfs_block_rsv_release(root, rsv, (u64)-1);
+	if (qgroup_reserved)
+		btrfs_qgroup_free(root, qgroup_reserved);
 }
 
 /**
@@ -4122,10 +4826,9 @@
 	BTRFS_I(inode)->outstanding_extents--;
 
 	if (BTRFS_I(inode)->outstanding_extents == 0 &&
-	    BTRFS_I(inode)->delalloc_meta_reserved) {
+	    test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+			       &BTRFS_I(inode)->runtime_flags))
 		drop_inode_space = 1;
-		BTRFS_I(inode)->delalloc_meta_reserved = 0;
-	}
 
 	/*
 	 * If we have more or the same amount of outsanding extents than we have
@@ -4207,18 +4910,29 @@
 	u64 csum_bytes;
 	unsigned nr_extents = 0;
 	int extra_reserve = 0;
-	int flush = 1;
-	int ret;
+	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
+	int ret = 0;
+	bool delalloc_lock = true;
+	u64 to_free = 0;
+	unsigned dropped;
 
-	/* Need to be holding the i_mutex here if we aren't free space cache */
-	if (btrfs_is_free_space_inode(root, inode))
-		flush = 0;
-	else
-		WARN_ON(!mutex_is_locked(&inode->i_mutex));
+	/* If we are a free space inode we need to not flush since we will be in
+	 * the middle of a transaction commit.  We also don't need the delalloc
+	 * mutex since we won't race with anybody.  We need this mostly to make
+	 * lockdep shut its filthy mouth.
+	 */
+	if (btrfs_is_free_space_inode(inode)) {
+		flush = BTRFS_RESERVE_NO_FLUSH;
+		delalloc_lock = false;
+	}
 
-	if (flush && btrfs_transaction_in_commit(root->fs_info))
+	if (flush != BTRFS_RESERVE_NO_FLUSH &&
+	    btrfs_transaction_in_commit(root->fs_info))
 		schedule_timeout(1);
 
+	if (delalloc_lock)
+		mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
+
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
 
 	spin_lock(&BTRFS_I(inode)->lock);
@@ -4233,7 +4947,8 @@
 	 * Add an item to reserve for updating the inode when we complete the
 	 * delalloc io.
 	 */
-	if (!BTRFS_I(inode)->delalloc_meta_reserved) {
+	if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+		      &BTRFS_I(inode)->runtime_flags)) {
 		nr_extents++;
 		extra_reserve = 1;
 	}
@@ -4243,45 +4958,102 @@
 	csum_bytes = BTRFS_I(inode)->csum_bytes;
 	spin_unlock(&BTRFS_I(inode)->lock);
 
-	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
-	if (ret) {
-		u64 to_free = 0;
-		unsigned dropped;
-
-		spin_lock(&BTRFS_I(inode)->lock);
-		dropped = drop_outstanding_extent(inode);
-		/*
-		 * If the inodes csum_bytes is the same as the original
-		 * csum_bytes then we know we haven't raced with any free()ers
-		 * so we can just reduce our inodes csum bytes and carry on.
-		 * Otherwise we have to do the normal free thing to account for
-		 * the case that the free side didn't free up its reserve
-		 * because of this outstanding reservation.
-		 */
-		if (BTRFS_I(inode)->csum_bytes == csum_bytes)
-			calc_csum_metadata_size(inode, num_bytes, 0);
-		else
-			to_free = calc_csum_metadata_size(inode, num_bytes, 0);
-		spin_unlock(&BTRFS_I(inode)->lock);
-		if (dropped)
-			to_free += btrfs_calc_trans_metadata_size(root, dropped);
+	if (root->fs_info->quota_enabled) {
+		ret = btrfs_qgroup_reserve(root, num_bytes +
+					   nr_extents * root->leafsize);
+		if (ret)
+			goto out_fail;
+	}
 
-		if (to_free)
-			btrfs_block_rsv_release(root, block_rsv, to_free);
-		return ret;
+	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+	if (unlikely(ret)) {
+		if (root->fs_info->quota_enabled)
+			btrfs_qgroup_free(root, num_bytes +
+						nr_extents * root->leafsize);
+		goto out_fail;
 	}
 
 	spin_lock(&BTRFS_I(inode)->lock);
 	if (extra_reserve) {
-		BTRFS_I(inode)->delalloc_meta_reserved = 1;
+		set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
+			&BTRFS_I(inode)->runtime_flags);
 		nr_extents--;
 	}
 	BTRFS_I(inode)->reserved_extents += nr_extents;
 	spin_unlock(&BTRFS_I(inode)->lock);
 
+	if (delalloc_lock)
+		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+
+	if (to_reserve)
+		trace_btrfs_space_reservation(root->fs_info, "delalloc",
+					      btrfs_ino(inode), to_reserve, 1);
 	block_rsv_add_bytes(block_rsv, to_reserve, 1);
 
 	return 0;
+
+out_fail:
+	spin_lock(&BTRFS_I(inode)->lock);
+	dropped = drop_outstanding_extent(inode);
+	/*
+	 * If the inode's csum_bytes is the same as the original
+	 * csum_bytes then we know we haven't raced with any free()ers
+	 * so we can just reduce our inode's csum bytes and carry on.
+	 */
+	if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
+		calc_csum_metadata_size(inode, num_bytes, 0);
+	} else {
+		u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
+		u64 bytes;
+
+		/*
+		 * This is tricky, but first we need to figure out how much we
+		 * free'd from any free-ers that occurred during this
+		 * reservation, so we reset ->csum_bytes to the csum_bytes
+		 * before we dropped our lock, and then call the free for the
+		 * number of bytes that were freed while we were trying our
+		 * reservation.
+		 */
+		bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
+		BTRFS_I(inode)->csum_bytes = csum_bytes;
+		to_free = calc_csum_metadata_size(inode, bytes, 0);
+
+
+		/*
+		 * Now we need to see how much we would have freed had we not
+		 * been making this reservation and our ->csum_bytes were not
+		 * artificially inflated.
+		 */
+		BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
+		bytes = csum_bytes - orig_csum_bytes;
+		bytes = calc_csum_metadata_size(inode, bytes, 0);
+
+		/*
+		 * Now reset ->csum_bytes to what it should be.  If bytes is
+		 * more than to_free then we would have free'd more space had we
+		 * not had an artificially high ->csum_bytes, so we need to free
+		 * the remainder.  If bytes is the same or less then we don't
+		 * need to do anything, the other free-ers did the correct
+		 * thing.
+		 */
+		BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
+		if (bytes > to_free)
+			to_free = bytes - to_free;
+		else
+			to_free = 0;
+	}
+	spin_unlock(&BTRFS_I(inode)->lock);
+	if (dropped)
+		to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
+	if (to_free) {
+		btrfs_block_rsv_release(root, block_rsv, to_free);
+		trace_btrfs_space_reservation(root->fs_info, "delalloc",
+					      btrfs_ino(inode), to_free, 0);
+	}
+	if (delalloc_lock)
+		mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
+	return ret;
 }
 
 /**
@@ -4303,11 +5075,19 @@
 	spin_lock(&BTRFS_I(inode)->lock);
 	dropped = drop_outstanding_extent(inode);
 
-	to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+	if (num_bytes)
+		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
 	spin_unlock(&BTRFS_I(inode)->lock);
 	if (dropped > 0)
 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
+	trace_btrfs_space_reservation(root->fs_info, "delalloc",
+				      btrfs_ino(inode), to_free, 0);
+	if (root->fs_info->quota_enabled) {
+		btrfs_qgroup_free(root, num_bytes +
+					dropped * root->leafsize);
+	}
+
 	btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
 				to_free);
 }
@@ -4363,8 +5143,7 @@
 	btrfs_free_reserved_data_space(inode, num_bytes);
 }
 
-static int update_block_group(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
+static int update_block_group(struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc)
 {
 	struct btrfs_block_group_cache *cache = NULL;
@@ -4375,19 +5154,19 @@
 	int factor;
 
 	/* block accounting for super block */
-	spin_lock(&info->delalloc_lock);
+	spin_lock(&info->delalloc_root_lock);
 	old_val = btrfs_super_bytes_used(info->super_copy);
 	if (alloc)
 		old_val += num_bytes;
 	else
 		old_val -= num_bytes;
 	btrfs_set_super_bytes_used(info->super_copy, old_val);
-	spin_unlock(&info->delalloc_lock);
+	spin_unlock(&info->delalloc_root_lock);
 
 	while (total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
 		if (!cache)
-			return -1;
+			return -ENOENT;
 		if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
 				    BTRFS_BLOCK_GROUP_RAID1 |
 				    BTRFS_BLOCK_GROUP_RAID10))
@@ -4401,7 +5180,7 @@
 		 * space back to the block group, otherwise we will leak space.
 		 */
 		if (!alloc && cache->cached == BTRFS_CACHE_NO)
-			cache_block_group(cache, trans, NULL, 1);
+			cache_block_group(cache, 1);
 
 		byte_in_group = bytenr - cache->key.objectid;
 		WARN_ON(byte_in_group > cache->key.offset);
@@ -4451,6 +5230,13 @@
 	struct btrfs_block_group_cache *cache;
 	u64 bytenr;
 
+	spin_lock(&root->fs_info->block_group_cache_lock);
+	bytenr = root->fs_info->first_logical_byte;
+	spin_unlock(&root->fs_info->block_group_cache_lock);
+
+	if (bytenr < (u64)-1)
+		return bytenr;
+
 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
 	if (!cache)
 		return 0;
@@ -4490,7 +5276,7 @@
 	struct btrfs_block_group_cache *cache;
 
 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
-	BUG_ON(!cache);
+	BUG_ON(!cache); /* Logic error */
 
 	pin_down_extent(root, cache, bytenr, num_bytes, reserved);
 
@@ -4501,14 +5287,15 @@
 /*
  * this function must be called within transaction
  */
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 				    u64 bytenr, u64 num_bytes)
 {
 	struct btrfs_block_group_cache *cache;
+	int ret;
 
 	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
-	BUG_ON(!cache);
+	if (!cache)
+		return -EINVAL;
 
 	/*
 	 * pull in the free space cache (if any) so that our pin
@@ -4516,13 +5303,87 @@
 	 * to one because the slow code to read in the free extents does check
 	 * the pinned extents.
 	 */
-	cache_block_group(cache, trans, root, 1);
+	cache_block_group(cache, 1);
 
 	pin_down_extent(root, cache, bytenr, num_bytes, 0);
 
 	/* remove us from the free space cache (if we're there at all) */
-	btrfs_remove_free_space(cache, bytenr, num_bytes);
+	ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
 	btrfs_put_block_group(cache);
+	return ret;
+}
+
+static int __exclude_logged_extent(struct btrfs_root *root, u64 start, u64 num_bytes)
+{
+	int ret;
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_caching_control *caching_ctl;
+
+	block_group = btrfs_lookup_block_group(root->fs_info, start);
+	if (!block_group)
+		return -EINVAL;
+
+	cache_block_group(block_group, 0);
+	caching_ctl = get_caching_control(block_group);
+
+	if (!caching_ctl) {
+		/* Logic error */
+		BUG_ON(!block_group_cache_done(block_group));
+		ret = btrfs_remove_free_space(block_group, start, num_bytes);
+	} else {
+		mutex_lock(&caching_ctl->mutex);
+
+		if (start >= caching_ctl->progress) {
+			ret = add_excluded_extent(root, start, num_bytes);
+		} else if (start + num_bytes <= caching_ctl->progress) {
+			ret = btrfs_remove_free_space(block_group,
+						      start, num_bytes);
+		} else {
+			num_bytes = caching_ctl->progress - start;
+			ret = btrfs_remove_free_space(block_group,
+						      start, num_bytes);
+			if (ret)
+				goto out_lock;
+
+			num_bytes = (start + num_bytes) -
+				caching_ctl->progress;
+			start = caching_ctl->progress;
+			ret = add_excluded_extent(root, start, num_bytes);
+		}
+out_lock:
+		mutex_unlock(&caching_ctl->mutex);
+		put_caching_control(caching_ctl);
+	}
+	btrfs_put_block_group(block_group);
+	return ret;
+}
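
A small standalone sketch of how __exclude_logged_extent() splits a logged extent around caching_ctl->progress: the part the caching thread has already scanned is removed from the free-space cache, while the part it has not reached yet is only marked as excluded. The ranges below are made up for illustration.

#include <stdio.h>
#include <stdint.h>

static void split_at_progress(uint64_t start, uint64_t num_bytes,
			      uint64_t progress)
{
	if (start >= progress) {
		/* Caching has not reached this range yet: exclude all of it. */
		printf("exclude [%llu, %llu)\n", (unsigned long long)start,
		       (unsigned long long)(start + num_bytes));
	} else if (start + num_bytes <= progress) {
		/* Caching already passed it: remove it from free space. */
		printf("remove  [%llu, %llu)\n", (unsigned long long)start,
		       (unsigned long long)(start + num_bytes));
	} else {
		/* Straddles the progress point: split into both actions. */
		printf("remove  [%llu, %llu)\n", (unsigned long long)start,
		       (unsigned long long)progress);
		printf("exclude [%llu, %llu)\n", (unsigned long long)progress,
		       (unsigned long long)(start + num_bytes));
	}
}

int main(void)
{
	uint64_t progress = 1 << 20;	/* caching thread scanned up to 1 MiB */

	split_at_progress(2 << 20, 4096, progress);		/* fully ahead   */
	split_at_progress(4096, 8192, progress);		/* fully behind  */
	split_at_progress((1 << 20) - 4096, 8192, progress);	/* straddles it  */
	return 0;
}
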
+
+int btrfs_exclude_logged_extents(struct btrfs_root *log,
+				 struct extent_buffer *eb)
+{
+	struct btrfs_file_extent_item *item;
+	struct btrfs_key key;
+	int found_type;
+	int i;
+
+	if (!btrfs_fs_incompat(log->fs_info, MIXED_GROUPS))
+		return 0;
+
+	for (i = 0; i < btrfs_header_nritems(eb); i++) {
+		btrfs_item_key_to_cpu(eb, &key, i);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
+		found_type = btrfs_file_extent_type(eb, item);
+		if (found_type == BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+			continue;
+		key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
+		key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
+		__exclude_logged_extent(log, key.objectid, key.offset);
+	}
+
 	return 0;
 }
 
@@ -4553,6 +5414,7 @@
 {
 	struct btrfs_space_info *space_info = cache->space_info;
 	int ret = 0;
+
 	spin_lock(&space_info->lock);
 	spin_lock(&cache->lock);
 	if (reserve != RESERVE_FREE) {
@@ -4562,7 +5424,9 @@
 			cache->reserved += num_bytes;
 			space_info->bytes_reserved += num_bytes;
 			if (reserve == RESERVE_ALLOC) {
-				BUG_ON(space_info->bytes_may_use < num_bytes);
+				trace_btrfs_space_reservation(cache->fs_info,
+						"space_info", space_info->flags,
+						num_bytes, 0);
 				space_info->bytes_may_use -= num_bytes;
 			}
 		}
@@ -4571,20 +5435,20 @@
 			space_info->bytes_readonly += num_bytes;
 		cache->reserved -= num_bytes;
 		space_info->bytes_reserved -= num_bytes;
-		space_info->reservation_progress++;
 	}
 	spin_unlock(&cache->lock);
 	spin_unlock(&space_info->lock);
 	return ret;
 }
 
-int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
+void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_caching_control *next;
 	struct btrfs_caching_control *caching_ctl;
 	struct btrfs_block_group_cache *cache;
+	struct btrfs_space_info *space_info;
 
 	down_write(&fs_info->extent_commit_sem);
 
@@ -4607,23 +5471,29 @@
 
 	up_write(&fs_info->extent_commit_sem);
 
+	list_for_each_entry_rcu(space_info, &fs_info->space_info, list)
+		percpu_counter_set(&space_info->total_bytes_pinned, 0);
+
 	update_global_block_rsv(fs_info);
-	return 0;
 }
 
 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_block_group_cache *cache = NULL;
+	struct btrfs_space_info *space_info;
+	struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
 	u64 len;
+	bool readonly;
 
 	while (start <= end) {
+		readonly = false;
 		if (!cache ||
 		    start >= cache->key.objectid + cache->key.offset) {
 			if (cache)
 				btrfs_put_block_group(cache);
 			cache = btrfs_lookup_block_group(fs_info, start);
-			BUG_ON(!cache);
+			BUG_ON(!cache); /* Logic error */
 		}
 
 		len = cache->key.objectid + cache->key.offset - start;
@@ -4635,15 +5505,30 @@
 		}
 
 		start += len;
+		space_info = cache->space_info;
 
-		spin_lock(&cache->space_info->lock);
+		spin_lock(&space_info->lock);
 		spin_lock(&cache->lock);
 		cache->pinned -= len;
-		cache->space_info->bytes_pinned -= len;
-		if (cache->ro)
-			cache->space_info->bytes_readonly += len;
+		space_info->bytes_pinned -= len;
+		if (cache->ro) {
+			space_info->bytes_readonly += len;
+			readonly = true;
+		}
 		spin_unlock(&cache->lock);
-		spin_unlock(&cache->space_info->lock);
+		if (!readonly && global_rsv->space_info == space_info) {
+			spin_lock(&global_rsv->lock);
+			if (!global_rsv->full) {
+				len = min(len, global_rsv->size -
+					  global_rsv->reserved);
+				global_rsv->reserved += len;
+				space_info->bytes_may_use += len;
+				if (global_rsv->reserved >= global_rsv->size)
+					global_rsv->full = 1;
+			}
+			spin_unlock(&global_rsv->lock);
+		}
+		spin_unlock(&space_info->lock);
 	}
 
 	if (cache)
@@ -4660,6 +5545,9 @@
 	u64 end;
 	int ret;
 
+	if (trans->aborted)
+		return 0;
+
 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
 		unpin = &fs_info->freed_extents[1];
 	else
@@ -4667,7 +5555,7 @@
 
 	while (1) {
 		ret = find_first_extent_bit(unpin, 0, &start, &end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 		if (ret)
 			break;
 
@@ -4683,6 +5571,27 @@
 	return 0;
 }
 
+static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes,
+			     u64 owner, u64 root_objectid)
+{
+	struct btrfs_space_info *space_info;
+	u64 flags;
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		if (root_objectid == BTRFS_CHUNK_TREE_OBJECTID)
+			flags = BTRFS_BLOCK_GROUP_SYSTEM;
+		else
+			flags = BTRFS_BLOCK_GROUP_METADATA;
+	} else {
+		flags = BTRFS_BLOCK_GROUP_DATA;
+	}
+
+	space_info = __find_space_info(fs_info, flags);
+	BUG_ON(!space_info); /* Logic bug */
+	percpu_counter_add(&space_info->total_bytes_pinned, num_bytes);
+}
+
+
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				u64 bytenr, u64 num_bytes, u64 parent,
@@ -4704,6 +5613,8 @@
 	int num_to_del = 1;
 	u32 item_size;
 	u64 refs;
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -4715,6 +5626,9 @@
 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
 	BUG_ON(!is_data && refs_to_drop != 1);
 
+	if (is_data)
+		skinny_metadata = 0;
+
 	ret = lookup_extent_backref(trans, extent_root, path, &iref,
 				    bytenr, num_bytes, parent,
 				    root_objectid, owner_objectid,
@@ -4731,6 +5645,11 @@
 				found_extent = 1;
 				break;
 			}
+			if (key.type == BTRFS_METADATA_ITEM_KEY &&
+			    key.offset == owner_objectid) {
+				found_extent = 1;
+				break;
+			}
 			if (path->slots[0] - extent_slot > 5)
 				break;
 			extent_slot--;
@@ -4745,7 +5664,10 @@
 			ret = remove_extent_backref(trans, extent_root, path,
 						    NULL, refs_to_drop,
 						    is_data);
-			BUG_ON(ret);
+			if (ret) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
 			btrfs_release_path(path);
 			path->leave_spinning = 1;
 
@@ -4753,29 +5675,59 @@
 			key.type = BTRFS_EXTENT_ITEM_KEY;
 			key.offset = num_bytes;
 
+			if (!is_data && skinny_metadata) {
+				key.type = BTRFS_METADATA_ITEM_KEY;
+				key.offset = owner_objectid;
+			}
+
 			ret = btrfs_search_slot(trans, extent_root,
 						&key, path, -1, 1);
+			if (ret > 0 && skinny_metadata && path->slots[0]) {
+				/*
+				 * Couldn't find our skinny metadata item,
+				 * see if we have ye olde extent item.
+				 */
+				path->slots[0]--;
+				btrfs_item_key_to_cpu(path->nodes[0], &key,
+						      path->slots[0]);
+				if (key.objectid == bytenr &&
+				    key.type == BTRFS_EXTENT_ITEM_KEY &&
+				    key.offset == num_bytes)
+					ret = 0;
+			}
+
+			if (ret > 0 && skinny_metadata) {
+				skinny_metadata = false;
+				key.type = BTRFS_EXTENT_ITEM_KEY;
+				key.offset = num_bytes;
+				btrfs_release_path(path);
+				ret = btrfs_search_slot(trans, extent_root,
+							&key, path, -1, 1);
+			}
+
 			if (ret) {
-				printk(KERN_ERR "umm, got %d back from search"
-				       ", was looking for %llu\n", ret,
-				       (unsigned long long)bytenr);
+				btrfs_err(info, "umm, got %d back from search, was looking for %llu",
+					ret, bytenr);
 				if (ret > 0)
 					btrfs_print_leaf(extent_root,
 							 path->nodes[0]);
 			}
-			BUG_ON(ret);
+			if (ret < 0) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
 			extent_slot = path->slots[0];
 		}
-	} else {
+	} else if (ret == -ENOENT) {
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
-		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
-		       "parent %llu root %llu  owner %llu offset %llu\n",
-		       (unsigned long long)bytenr,
-		       (unsigned long long)parent,
-		       (unsigned long long)root_objectid,
-		       (unsigned long long)owner_objectid,
-		       (unsigned long long)owner_offset);
+		btrfs_err(info,
+			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
+			bytenr, parent, root_objectid, owner_objectid,
+			owner_offset);
+	} else {
+		btrfs_abort_transaction(trans, extent_root, ret);
+		goto out;
 	}
 
 	leaf = path->nodes[0];
@@ -4785,7 +5737,10 @@
 		BUG_ON(found_extent || extent_slot != path->slots[0]);
 		ret = convert_extent_item_v0(trans, extent_root, path,
 					     owner_objectid, 0);
-		BUG_ON(ret < 0);
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
 
 		btrfs_release_path(path);
 		path->leave_spinning = 1;
@@ -4797,12 +5752,15 @@
 		ret = btrfs_search_slot(trans, extent_root, &key, path,
 					-1, 1);
 		if (ret) {
-			printk(KERN_ERR "umm, got %d back from search"
-			       ", was looking for %llu\n", ret,
-			       (unsigned long long)bytenr);
+			btrfs_err(info, "umm, got %d back from search, was looking for %llu",
+				ret, bytenr);
 			btrfs_print_leaf(extent_root, path->nodes[0]);
 		}
-		BUG_ON(ret);
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
+
 		extent_slot = path->slots[0];
 		leaf = path->nodes[0];
 		item_size = btrfs_item_size_nr(leaf, extent_slot);
@@ -4811,7 +5769,8 @@
 	BUG_ON(item_size < sizeof(*ei));
 	ei = btrfs_item_ptr(leaf, extent_slot,
 			    struct btrfs_extent_item);
-	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
+	    key.type == BTRFS_EXTENT_ITEM_KEY) {
 		struct btrfs_tree_block_info *bi;
 		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
 		bi = (struct btrfs_tree_block_info *)(ei + 1);
@@ -4819,7 +5778,13 @@
 	}
 
 	refs = btrfs_extent_refs(leaf, ei);
-	BUG_ON(refs < refs_to_drop);
+	if (refs < refs_to_drop) {
+		btrfs_err(info, "trying to drop %d refs but we only have %Lu "
+			  "for bytenr %Lu\n", refs_to_drop, refs, bytenr);
+		ret = -EINVAL;
+		btrfs_abort_transaction(trans, extent_root, ret);
+		goto out;
+	}
 	refs -= refs_to_drop;
 
 	if (refs > 0) {
@@ -4839,8 +5804,13 @@
 			ret = remove_extent_backref(trans, extent_root, path,
 						    iref, refs_to_drop,
 						    is_data);
-			BUG_ON(ret);
+			if (ret) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
 		}
+		add_pinned_bytes(root->fs_info, -num_bytes, owner_objectid,
+				 root_objectid);
 	} else {
 		if (found_extent) {
 			BUG_ON(is_data && refs_to_drop !=
@@ -4856,21 +5826,27 @@
 
 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
 				      num_to_del);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
 		btrfs_release_path(path);
 
 		if (is_data) {
 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
-			BUG_ON(ret);
-		} else {
-			invalidate_mapping_pages(info->btree_inode->i_mapping,
-			     bytenr >> PAGE_CACHE_SHIFT,
-			     (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
+			if (ret) {
+				btrfs_abort_transaction(trans, extent_root, ret);
+				goto out;
+			}
 		}
 
-		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
-		BUG_ON(ret);
+		ret = update_block_group(root, bytenr, num_bytes, 0);
+		if (ret) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
 	}
+out:
 	btrfs_free_path(path);
 	return ret;
 }
@@ -4909,7 +5885,7 @@
 	if (head->extent_op) {
 		if (!head->must_insert_reserved)
 			goto out;
-		kfree(head->extent_op);
+		btrfs_free_delayed_extent_op(head->extent_op);
 		head->extent_op = NULL;
 	}
 
@@ -4958,14 +5934,16 @@
 			   u64 parent, int last_ref)
 {
 	struct btrfs_block_group_cache *cache = NULL;
+	int pin = 1;
 	int ret;
 
 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-		ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
-						parent, root->root_key.objectid,
-						btrfs_header_level(buf),
-						BTRFS_DROP_DELAYED_REF, NULL);
-		BUG_ON(ret);
+		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+					buf->start, buf->len,
+					parent, root->root_key.objectid,
+					btrfs_header_level(buf),
+					BTRFS_DROP_DELAYED_REF, NULL, 0);
+		BUG_ON(ret); /* -ENOMEM */
 	}
 
 	if (!last_ref)
@@ -4989,8 +5967,14 @@
 
 		btrfs_add_free_space(cache, buf->start, buf->len);
 		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
+		pin = 0;
 	}
 out:
+	if (pin)
+		add_pinned_bytes(root->fs_info, buf->len,
+				 btrfs_header_level(buf),
+				 root->root_key.objectid);
+
 	/*
 	 * Deleting the buffer, clear the corrupt flag since it doesn't matter
 	 * anymore.
@@ -4999,12 +5983,15 @@
 	btrfs_put_block_group(cache);
 }
 
-int btrfs_free_extent(struct btrfs_trans_handle *trans,
-		      struct btrfs_root *root,
-		      u64 bytenr, u64 num_bytes, u64 parent,
-		      u64 root_objectid, u64 owner, u64 offset)
+/* Can return -ENOMEM */
+int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		      u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
+		      u64 owner, u64 offset, int for_cow)
 {
 	int ret;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid);
 
 	/*
 	 * tree log blocks never actually go into the extent allocation
@@ -5016,23 +6003,25 @@
 		btrfs_pin_extent(root, bytenr, num_bytes, 1);
 		ret = 0;
 	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
-		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+		ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
+					num_bytes,
 					parent, root_objectid, (int)owner,
-					BTRFS_DROP_DELAYED_REF, NULL);
-		BUG_ON(ret);
+					BTRFS_DROP_DELAYED_REF, NULL, for_cow);
 	} else {
-		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
-					parent, root_objectid, owner,
-					offset, BTRFS_DROP_DELAYED_REF, NULL);
-		BUG_ON(ret);
+		ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
+						num_bytes,
+						parent, root_objectid, owner,
+						offset, BTRFS_DROP_DELAYED_REF,
+						NULL, for_cow);
 	}
 	return ret;
 }
 
-static u64 stripe_align(struct btrfs_root *root, u64 val)
+static u64 stripe_align(struct btrfs_root *root,
+			struct btrfs_block_group_cache *cache,
+			u64 val, u64 num_bytes)
 {
-	u64 mask = ((u64)root->stripesize - 1);
-	u64 ret = (val + mask) & ~mask;
+	u64 ret = ALIGN(val, root->stripesize);
 	return ret;
 }
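
A quick standalone check of what the reworked stripe_align() computes: ALIGN(val, stripesize) rounds val up to the next multiple of the stripe size and matches the old mask formula when stripesize is a power of two. The 4096-byte stripe size below is only an assumed typical value.

#include <stdio.h>
#include <stdint.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	uint64_t stripesize = 4096;
	uint64_t vals[] = { 0, 1, 4095, 4096, 12289 };

	for (unsigned i = 0; i < sizeof(vals) / sizeof(vals[0]); i++) {
		uint64_t mask = stripesize - 1;
		uint64_t old_way = (vals[i] + mask) & ~mask;	/* removed formula */

		printf("%llu -> %llu (old formula %llu)\n",
		       (unsigned long long)vals[i],
		       (unsigned long long)ALIGN(vals[i], stripesize),
		       (unsigned long long)old_way);
	}
	return 0;
}
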
 
@@ -5046,109 +6035,118 @@
  * for our min num_bytes.  Another option is to have it go ahead
  * and look in the rbtree for a free extent of a given size, but this
  * is a good start.
+ *
+ * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
+ * any of the information in this block group.
  */
-static noinline int
+static noinline void
 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
 				u64 num_bytes)
 {
 	struct btrfs_caching_control *caching_ctl;
-	DEFINE_WAIT(wait);
 
 	caching_ctl = get_caching_control(cache);
 	if (!caching_ctl)
-		return 0;
+		return;
 
 	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
 		   (cache->free_space_ctl->free_space >= num_bytes));
 
 	put_caching_control(caching_ctl);
-	return 0;
 }
 
 static noinline int
 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_caching_control *caching_ctl;
-	DEFINE_WAIT(wait);
+	int ret = 0;
 
 	caching_ctl = get_caching_control(cache);
 	if (!caching_ctl)
-		return 0;
+		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
 
 	wait_event(caching_ctl->wait, block_group_cache_done(cache));
-
+	if (cache->cached == BTRFS_CACHE_ERROR)
+		ret = -EIO;
 	put_caching_control(caching_ctl);
-	return 0;
+	return ret;
+}
+
+int __get_raid_index(u64 flags)
+{
+	if (flags & BTRFS_BLOCK_GROUP_RAID10)
+		return BTRFS_RAID_RAID10;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID1)
+		return BTRFS_RAID_RAID1;
+	else if (flags & BTRFS_BLOCK_GROUP_DUP)
+		return BTRFS_RAID_DUP;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID0)
+		return BTRFS_RAID_RAID0;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+		return BTRFS_RAID_RAID5;
+	else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+		return BTRFS_RAID_RAID6;
+
+	return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
 }
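
A userspace sketch of the new flags-to-index mapping used to pick a block group list; the profile bit values and the BTRFS_RAID_* ordering below are stand-ins chosen for the example, not the real on-disk definitions.

#include <stdio.h>
#include <stdint.h>

#define BG_RAID0	(1ULL << 3)	/* illustrative bit positions only */
#define BG_RAID1	(1ULL << 4)
#define BG_DUP		(1ULL << 5)
#define BG_RAID10	(1ULL << 6)
#define BG_RAID5	(1ULL << 7)
#define BG_RAID6	(1ULL << 8)

enum { RAID10, RAID1, DUP, RAID0, SINGLE, RAID5, RAID6, NR_RAID_TYPES };

static int get_raid_index(uint64_t flags)
{
	if (flags & BG_RAID10)
		return RAID10;
	else if (flags & BG_RAID1)
		return RAID1;
	else if (flags & BG_DUP)
		return DUP;
	else if (flags & BG_RAID0)
		return RAID0;
	else if (flags & BG_RAID5)
		return RAID5;
	else if (flags & BG_RAID6)
		return RAID6;
	return SINGLE;	/* no profile bit set: single copy */
}

int main(void)
{
	printf("raid1  -> index %d\n", get_raid_index(BG_RAID1));
	printf("raid6  -> index %d\n", get_raid_index(BG_RAID6));
	printf("single -> index %d\n", get_raid_index(0));
	return 0;
}
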
 
 static int get_block_group_index(struct btrfs_block_group_cache *cache)
 {
-	int index;
-	if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
-		index = 0;
-	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
-		index = 1;
-	else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
-		index = 2;
-	else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
-		index = 3;
-	else
-		index = 4;
-	return index;
+	return __get_raid_index(cache->flags);
 }
 
 enum btrfs_loop_type {
-	LOOP_FIND_IDEAL = 0,
-	LOOP_CACHING_NOWAIT = 1,
-	LOOP_CACHING_WAIT = 2,
-	LOOP_ALLOC_CHUNK = 3,
-	LOOP_NO_EMPTY_SIZE = 4,
+	LOOP_CACHING_NOWAIT = 0,
+	LOOP_CACHING_WAIT = 1,
+	LOOP_ALLOC_CHUNK = 2,
+	LOOP_NO_EMPTY_SIZE = 3,
 };
 
 /*
  * walks the btree of allocated extents and find a hole of a given size.
  * The key ins is changed to record the hole:
- * ins->objectid == block start
+ * ins->objectid == start position
  * ins->flags = BTRFS_EXTENT_ITEM_KEY
- * ins->offset == number of blocks
+ * ins->offset == the size of the hole.
  * Any available blocks before search_start are skipped.
+ *
+ * If there is no suitable free space, we will record the size of the
+ * largest free space extent we saw while searching.
  */
-static noinline int find_free_extent(struct btrfs_trans_handle *trans,
-				     struct btrfs_root *orig_root,
+static noinline int find_free_extent(struct btrfs_root *orig_root,
 				     u64 num_bytes, u64 empty_size,
-				     u64 search_start, u64 search_end,
 				     u64 hint_byte, struct btrfs_key *ins,
-				     u64 data)
+				     u64 flags)
 {
 	int ret = 0;
 	struct btrfs_root *root = orig_root->fs_info->extent_root;
 	struct btrfs_free_cluster *last_ptr = NULL;
 	struct btrfs_block_group_cache *block_group = NULL;
 	struct btrfs_block_group_cache *used_block_group;
+	u64 search_start = 0;
+	u64 max_extent_size = 0;
 	int empty_cluster = 2 * 1024 * 1024;
-	int allowed_chunk_alloc = 0;
-	int done_chunk_alloc = 0;
 	struct btrfs_space_info *space_info;
 	int loop = 0;
-	int index = 0;
-	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
+	int index = __get_raid_index(flags);
+	int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ?
 		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
 	bool found_uncached_bg = false;
 	bool failed_cluster_refill = false;
 	bool failed_alloc = false;
 	bool use_cluster = true;
 	bool have_caching_bg = false;
-	u64 ideal_cache_percent = 0;
-	u64 ideal_cache_offset = 0;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 	ins->objectid = 0;
 	ins->offset = 0;
 
-	space_info = __find_space_info(root->fs_info, data);
+	trace_find_free_extent(orig_root, num_bytes, empty_size, flags);
+
+	space_info = __find_space_info(root->fs_info, flags);
 	if (!space_info) {
-		printk(KERN_ERR "No space info for %llu\n", data);
+		btrfs_err(root->fs_info, "No space info for %llu", flags);
 		return -ENOSPC;
 	}
 
@@ -5159,16 +6157,13 @@
 	if (btrfs_mixed_space_info(space_info))
 		use_cluster = false;
 
-	if (orig_root->ref_cows || empty_size)
-		allowed_chunk_alloc = 1;
-
-	if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
+	if (flags & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
 		last_ptr = &root->fs_info->meta_alloc_cluster;
 		if (!btrfs_test_opt(root, SSD))
 			empty_cluster = 64 * 1024;
 	}
 
-	if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
+	if ((flags & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
 	    btrfs_test_opt(root, SSD)) {
 		last_ptr = &root->fs_info->data_alloc_cluster;
 	}
@@ -5187,7 +6182,6 @@
 		empty_cluster = 0;
 
 	if (search_start == hint_byte) {
-ideal_cache:
 		block_group = btrfs_lookup_block_group(root->fs_info,
 						       search_start);
 		used_block_group = block_group;
@@ -5198,9 +6192,8 @@
 		 * However if we are re-searching with an ideal block group
 		 * picked out then we don't care that the block group is cached.
 		 */
-		if (block_group && block_group_bits(block_group, data) &&
-		    (block_group->cached != BTRFS_CACHE_NO ||
-		     search_start == ideal_cache_offset)) {
+		if (block_group && block_group_bits(block_group, flags) &&
+		    block_group->cached != BTRFS_CACHE_NO) {
 			down_read(&space_info->groups_sem);
 			if (list_empty(&block_group->list) ||
 			    block_group->ro) {
@@ -5237,9 +6230,11 @@
 		 * raid types, but we want to make sure we only allocate
 		 * for the proper type.
 		 */
-		if (!block_group_bits(block_group, data)) {
+		if (!block_group_bits(block_group, flags)) {
 		    u64 extra = BTRFS_BLOCK_GROUP_DUP |
 				BTRFS_BLOCK_GROUP_RAID1 |
+				BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6 |
 				BTRFS_BLOCK_GROUP_RAID10;
 
 			/*
@@ -5247,68 +6242,30 @@
 			 * doesn't provide them, bail.  This does allow us to
 			 * fill raid0 from raid1.
 			 */
-			if ((data & extra) && !(block_group->flags & extra))
+			if ((flags & extra) && !(block_group->flags & extra))
 				goto loop;
 		}
 
 have_block_group:
 		cached = block_group_cache_done(block_group);
 		if (unlikely(!cached)) {
-			u64 free_percent;
-
 			found_uncached_bg = true;
-			ret = cache_block_group(block_group, trans,
-						orig_root, 1);
-			if (block_group->cached == BTRFS_CACHE_FINISHED)
-				goto alloc;
-
-			free_percent = btrfs_block_group_used(&block_group->item);
-			free_percent *= 100;
-			free_percent = div64_u64(free_percent,
-						 block_group->key.offset);
-			free_percent = 100 - free_percent;
-			if (free_percent > ideal_cache_percent &&
-			    likely(!block_group->ro)) {
-				ideal_cache_offset = block_group->key.objectid;
-				ideal_cache_percent = free_percent;
-			}
-
-			/*
-			 * The caching workers are limited to 2 threads, so we
-			 * can queue as much work as we care to.
-			 */
-			if (loop > LOOP_FIND_IDEAL) {
-				ret = cache_block_group(block_group, trans,
-							orig_root, 0);
-				BUG_ON(ret);
-			}
-
-			/*
-			 * If loop is set for cached only, try the next block
-			 * group.
-			 */
-			if (loop == LOOP_FIND_IDEAL)
-				goto loop;
+			ret = cache_block_group(block_group, 0);
+			BUG_ON(ret < 0);
+			ret = 0;
 		}
 
-alloc:
-		if (unlikely(block_group->ro))
+		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
 			goto loop;
-
-		spin_lock(&block_group->free_space_ctl->tree_lock);
-		if (cached &&
-		    block_group->free_space_ctl->free_space <
-		    num_bytes + empty_cluster + empty_size) {
-			spin_unlock(&block_group->free_space_ctl->tree_lock);
+		if (unlikely(block_group->ro))
 			goto loop;
-		}
-		spin_unlock(&block_group->free_space_ctl->tree_lock);
 
 		/*
 		 * Ok we want to try and use the cluster allocator, so
 		 * lets look there
 		 */
 		if (last_ptr) {
+			unsigned long aligned_cluster;
 			/*
 			 * the refill lock keeps out other
 			 * people trying to start a new cluster
@@ -5318,7 +6275,7 @@
 			if (used_block_group != block_group &&
 			    (!used_block_group ||
 			     used_block_group->ro ||
-			     !block_group_bits(used_block_group, data))) {
+			     !block_group_bits(used_block_group, flags))) {
 				used_block_group = block_group;
 				goto refill_cluster;
 			}
@@ -5327,10 +6284,15 @@
 				btrfs_get_block_group(used_block_group);
 
 			offset = btrfs_alloc_from_cluster(used_block_group,
-			  last_ptr, num_bytes, used_block_group->key.objectid);
+						last_ptr,
+						num_bytes,
+						used_block_group->key.objectid,
+						&max_extent_size);
 			if (offset) {
 				/* we have a block, we're done */
 				spin_unlock(&last_ptr->refill_lock);
+				trace_btrfs_reserve_extent_cluster(root,
+					block_group, search_start, num_bytes);
 				goto checks;
 			}
 
@@ -5349,8 +6311,15 @@
 			 * plenty of times and not have found
 			 * anything, so we are likely way too
 			 * fragmented for the clustering stuff to find
-			 * anything.  */
-			if (loop >= LOOP_NO_EMPTY_SIZE) {
+			 * anything.
+			 *
+			 * However, if the cluster is taken from the
+			 * current block group, release the cluster
+			 * first, so that we stand a better chance of
+			 * succeeding in the unclustered
+			 * allocation.  */
+			if (loop >= LOOP_NO_EMPTY_SIZE &&
+			    last_ptr->block_group != block_group) {
 				spin_unlock(&last_ptr->refill_lock);
 				goto unclustered_alloc;
 			}
@@ -5361,22 +6330,36 @@
 			 */
 			btrfs_return_cluster_to_free_space(NULL, last_ptr);
 
+			if (loop >= LOOP_NO_EMPTY_SIZE) {
+				spin_unlock(&last_ptr->refill_lock);
+				goto unclustered_alloc;
+			}
+
+			aligned_cluster = max_t(unsigned long,
+						empty_cluster + empty_size,
+					      block_group->full_stripe_len);
+
 			/* allocate a cluster in this block group */
-			ret = btrfs_find_space_cluster(trans, root,
-					       block_group, last_ptr,
-					       search_start, num_bytes,
-					       empty_cluster + empty_size);
+			ret = btrfs_find_space_cluster(root, block_group,
+						       last_ptr, search_start,
+						       num_bytes,
+						       aligned_cluster);
 			if (ret == 0) {
 				/*
 				 * now pull our allocation out of this
 				 * cluster
 				 */
 				offset = btrfs_alloc_from_cluster(block_group,
-						  last_ptr, num_bytes,
-						  search_start);
+							last_ptr,
+							num_bytes,
+							search_start,
+							&max_extent_size);
 				if (offset) {
 					/* we found one, proceed */
 					spin_unlock(&last_ptr->refill_lock);
+					trace_btrfs_reserve_extent_cluster(root,
+						block_group, search_start,
+						num_bytes);
 					goto checks;
 				}
 			} else if (!cached && loop > LOOP_CACHING_NOWAIT
@@ -5401,8 +6384,22 @@
 		}
 
 unclustered_alloc:
+		spin_lock(&block_group->free_space_ctl->tree_lock);
+		if (cached &&
+		    block_group->free_space_ctl->free_space <
+		    num_bytes + empty_cluster + empty_size) {
+			if (block_group->free_space_ctl->free_space >
+			    max_extent_size)
+				max_extent_size =
+					block_group->free_space_ctl->free_space;
+			spin_unlock(&block_group->free_space_ctl->tree_lock);
+			goto loop;
+		}
+		spin_unlock(&block_group->free_space_ctl->tree_lock);
+
 		offset = btrfs_find_space_for_alloc(block_group, search_start,
-						    num_bytes, empty_size);
+						    num_bytes, empty_size,
+						    &max_extent_size);
 		/*
 		 * If we didn't find a chunk, and we haven't failed on this
 		 * block group before, and this block group is in the middle of
@@ -5424,12 +6421,8 @@
 			goto loop;
 		}
 checks:
-		search_start = stripe_align(root, offset);
-		/* move on to the next group */
-		if (search_start + num_bytes >= search_end) {
-			btrfs_add_free_space(used_block_group, offset, num_bytes);
-			goto loop;
-		}
+		search_start = stripe_align(root, used_block_group,
+					    offset, num_bytes);
 
 		/* move on to the next group */
 		if (search_start + num_bytes >
@@ -5438,9 +6431,6 @@
 			goto loop;
 		}
 
-		ins->objectid = search_start;
-		ins->offset = num_bytes;
-
 		if (offset < search_start)
 			btrfs_add_free_space(used_block_group, offset,
 					     search_start - offset);
@@ -5457,10 +6447,8 @@
 		ins->objectid = search_start;
 		ins->offset = num_bytes;
 
-		if (offset < search_start)
-			btrfs_add_free_space(used_block_group, offset,
-					     search_start - offset);
-		BUG_ON(offset > search_start);
+		trace_btrfs_reserve_extent(orig_root, block_group,
+					   search_start, num_bytes);
 		if (used_block_group != block_group)
 			btrfs_put_block_group(used_block_group);
 		btrfs_put_block_group(block_group);
@@ -5481,9 +6469,7 @@
 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
 		goto search;
 
-	/* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
-	 *			for them to make caching progress.  Also
-	 *			determine the best possible bg to cache
+	/*
 	 * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
 	 *			caching kthreads as we move along
 	 * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
@@ -5493,65 +6479,30 @@
 	 */
 	if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
 		index = 0;
-		if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
-			found_uncached_bg = false;
-			loop++;
-			if (!ideal_cache_percent)
-				goto search;
+		loop++;
+		if (loop == LOOP_ALLOC_CHUNK) {
+			struct btrfs_trans_handle *trans;
 
+			trans = btrfs_join_transaction(root);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				goto out;
+			}
+
+			ret = do_chunk_alloc(trans, root, flags,
+					     CHUNK_ALLOC_FORCE);
 			/*
-			 * 1 of the following 2 things have happened so far
-			 *
-			 * 1) We found an ideal block group for caching that
-			 * is mostly full and will cache quickly, so we might
-			 * as well wait for it.
-			 *
-			 * 2) We searched for cached only and we didn't find
-			 * anything, and we didn't start any caching kthreads
-			 * either, so chances are we will loop through and
-			 * start a couple caching kthreads, and then come back
-			 * around and just wait for them.  This will be slower
-			 * because we will have 2 caching kthreads reading at
-			 * the same time when we could have just started one
-			 * and waited for it to get far enough to give us an
-			 * allocation, so go ahead and go to the wait caching
-			 * loop.
-			 */
-			loop = LOOP_CACHING_WAIT;
-			search_start = ideal_cache_offset;
-			ideal_cache_percent = 0;
-			goto ideal_cache;
-		} else if (loop == LOOP_FIND_IDEAL) {
-			/*
-			 * Didn't find a uncached bg, wait on anything we find
-			 * next.
+			 * Do not bail out on ENOSPC since we
+			 * can do more things.
 			 */
-			loop = LOOP_CACHING_WAIT;
-			goto search;
-		}
-
-		loop++;
-
-		if (loop == LOOP_ALLOC_CHUNK) {
-		       if (allowed_chunk_alloc) {
-				ret = do_chunk_alloc(trans, root, num_bytes +
-						     2 * 1024 * 1024, data,
-						     CHUNK_ALLOC_LIMITED);
-				allowed_chunk_alloc = 0;
-				if (ret == 1)
-					done_chunk_alloc = 1;
-			} else if (!done_chunk_alloc &&
-				   space_info->force_alloc ==
-				   CHUNK_ALLOC_NO_FORCE) {
-				space_info->force_alloc = CHUNK_ALLOC_LIMITED;
-			}
-
-		       /*
-			* We didn't allocate a chunk, go ahead and drop the
-			* empty size and loop again.
-			*/
-		       if (!done_chunk_alloc)
-			       loop = LOOP_NO_EMPTY_SIZE;
+			if (ret < 0 && ret != -ENOSPC)
+				btrfs_abort_transaction(trans,
+							root, ret);
+			else
+				ret = 0;
+			btrfs_end_transaction(trans, root);
+			if (ret)
+				goto out;
 		}
 
 		if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -5565,7 +6516,9 @@
 	} else if (ins->objectid) {
 		ret = 0;
 	}
-
+out:
+	if (ret == -ENOSPC)
+		ins->offset = max_extent_size;
 	return ret;
 }
 
@@ -5577,19 +6530,15 @@
 
 	spin_lock(&info->lock);
 	printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
-	       (unsigned long long)info->flags,
-	       (unsigned long long)(info->total_bytes - info->bytes_used -
-				    info->bytes_pinned - info->bytes_reserved -
-				    info->bytes_readonly),
+	       info->flags,
+	       info->total_bytes - info->bytes_used - info->bytes_pinned -
+	       info->bytes_reserved - info->bytes_readonly,
 	       (info->full) ? "" : "not ");
 	printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
 	       "reserved=%llu, may_use=%llu, readonly=%llu\n",
-	       (unsigned long long)info->total_bytes,
-	       (unsigned long long)info->bytes_used,
-	       (unsigned long long)info->bytes_pinned,
-	       (unsigned long long)info->bytes_reserved,
-	       (unsigned long long)info->bytes_may_use,
-	       (unsigned long long)info->bytes_readonly);
+	       info->total_bytes, info->bytes_used, info->bytes_pinned,
+	       info->bytes_reserved, info->bytes_may_use,
+	       info->bytes_readonly);
 	spin_unlock(&info->lock);
 
 	if (!dump_block_groups)
@@ -5599,13 +6548,10 @@
 again:
 	list_for_each_entry(cache, &info->block_groups[index], list) {
 		spin_lock(&cache->lock);
-		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
-		       "%llu pinned %llu reserved\n",
-		       (unsigned long long)cache->key.objectid,
-		       (unsigned long long)cache->key.offset,
-		       (unsigned long long)btrfs_block_group_used(&cache->item),
-		       (unsigned long long)cache->pinned,
-		       (unsigned long long)cache->reserved);
+		printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
+		       cache->key.objectid, cache->key.offset,
+		       btrfs_block_group_used(&cache->item), cache->pinned,
+		       cache->reserved, cache->ro ? "[readonly]" : "");
 		btrfs_dump_free_space(cache, bytes);
 		spin_unlock(&cache->lock);
 	}
@@ -5614,48 +6560,38 @@
 	up_read(&info->groups_sem);
 }
 
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
+int btrfs_reserve_extent(struct btrfs_root *root,
 			 u64 num_bytes, u64 min_alloc_size,
 			 u64 empty_size, u64 hint_byte,
-			 u64 search_end, struct btrfs_key *ins,
-			 u64 data)
+			 struct btrfs_key *ins, int is_data)
 {
+	bool final_tried = false;
+	u64 flags;
 	int ret;
-	u64 search_start = 0;
 
-	data = btrfs_get_alloc_profile(root, data);
+	flags = btrfs_get_alloc_profile(root, is_data);
 again:
-	/*
-	 * the only place that sets empty_size is btrfs_realloc_node, which
-	 * is not called recursively on allocations
-	 */
-	if (empty_size || root->ref_cows)
-		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-				     num_bytes + 2 * 1024 * 1024, data,
-				     CHUNK_ALLOC_NO_FORCE);
-
 	WARN_ON(num_bytes < root->sectorsize);
-	ret = find_free_extent(trans, root, num_bytes, empty_size,
-			       search_start, search_end, hint_byte,
-			       ins, data);
-
-	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
-		num_bytes = num_bytes >> 1;
-		num_bytes = num_bytes & ~(root->sectorsize - 1);
-		num_bytes = max(num_bytes, min_alloc_size);
-		do_chunk_alloc(trans, root->fs_info->extent_root,
-			       num_bytes, data, CHUNK_ALLOC_FORCE);
-		goto again;
-	}
-	if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
-		struct btrfs_space_info *sinfo;
+	ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
+			       flags);
 
-		sinfo = __find_space_info(root->fs_info, data);
-		printk(KERN_ERR "btrfs allocation failed flags %llu, "
-		       "wanted %llu\n", (unsigned long long)data,
-		       (unsigned long long)num_bytes);
-		dump_space_info(sinfo, num_bytes, 1);
+	if (ret == -ENOSPC) {
+		if (!final_tried && ins->offset) {
+			num_bytes = min(num_bytes >> 1, ins->offset);
+			num_bytes = round_down(num_bytes, root->sectorsize);
+			num_bytes = max(num_bytes, min_alloc_size);
+			if (num_bytes == min_alloc_size)
+				final_tried = true;
+			goto again;
+		} else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+			struct btrfs_space_info *sinfo;
+
+			sinfo = __find_space_info(root->fs_info, flags);
+			btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
+				flags, num_bytes);
+			if (sinfo)
+				dump_space_info(sinfo, num_bytes, 1);
+		}
 	}
 
 	trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
@@ -5671,8 +6607,8 @@
 
 	cache = btrfs_lookup_block_group(root->fs_info, start);
 	if (!cache) {
-		printk(KERN_ERR "Unable to find block group for %llu\n",
-		       (unsigned long long)start);
+		btrfs_err(root->fs_info, "Unable to find block group for %llu",
+			start);
 		return -ENOSPC;
 	}
 
@@ -5733,7 +6669,10 @@
 	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
 				      ins, size);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_free_path(path);
+		return ret;
+	}
 
 	leaf = path->nodes[0];
 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -5762,11 +6701,10 @@
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 
-	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
-	if (ret) {
-		printk(KERN_ERR "btrfs update block group failed for %llu "
-		       "%llu\n", (unsigned long long)ins->objectid,
-		       (unsigned long long)ins->offset);
+	ret = update_block_group(root, ins->objectid, ins->offset, 1);
+	if (ret) { /* -ENOENT, logic error */
+		btrfs_err(fs_info, "update block group failed for %llu %llu",
+			ins->objectid, ins->offset);
 		BUG();
 	}
 	return ret;
@@ -5785,7 +6723,12 @@
 	struct btrfs_extent_inline_ref *iref;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
-	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
+	u32 size = sizeof(*extent_item) + sizeof(*iref);
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
+
+	if (!skinny_metadata)
+		size += sizeof(*block_info);
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -5794,7 +6737,10 @@
 	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
 				      ins, size);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_free_path(path);
+		return ret;
+	}
 
 	leaf = path->nodes[0];
 	extent_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -5803,12 +6749,16 @@
 	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
 	btrfs_set_extent_flags(leaf, extent_item,
 			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
-	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
 
-	btrfs_set_tree_block_key(leaf, block_info, key);
-	btrfs_set_tree_block_level(leaf, block_info, level);
+	if (skinny_metadata) {
+		iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+	} else {
+		block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
+		btrfs_set_tree_block_key(leaf, block_info, key);
+		btrfs_set_tree_block_level(leaf, block_info, level);
+		iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+	}
 
-	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
 	if (parent > 0) {
 		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
 		btrfs_set_extent_inline_ref_type(leaf, iref,
@@ -5823,11 +6773,10 @@
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_free_path(path);
 
-	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
-	if (ret) {
-		printk(KERN_ERR "btrfs update block group failed for %llu "
-		       "%llu\n", (unsigned long long)ins->objectid,
-		       (unsigned long long)ins->offset);
+	ret = update_block_group(root, ins->objectid, root->leafsize, 1);
+	if (ret) { /* -ENOENT, logic error */
+		btrfs_err(fs_info, "update block group failed for %llu %llu",
+			ins->objectid, ins->offset);
 		BUG();
 	}
 	return ret;
@@ -5842,9 +6791,10 @@
 
 	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
 
-	ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
-					 0, root_objectid, owner, offset,
-					 BTRFS_ADD_DELAYED_EXTENT, NULL);
+	ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
+					 ins->offset, 0,
+					 root_objectid, owner, offset,
+					 BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
 	return ret;
 }
 
@@ -5860,58 +6810,33 @@
 {
 	int ret;
 	struct btrfs_block_group_cache *block_group;
-	struct btrfs_caching_control *caching_ctl;
-	u64 start = ins->objectid;
-	u64 num_bytes = ins->offset;
-
-	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
-	cache_block_group(block_group, trans, NULL, 0);
-	caching_ctl = get_caching_control(block_group);
-
-	if (!caching_ctl) {
-		BUG_ON(!block_group_cache_done(block_group));
-		ret = btrfs_remove_free_space(block_group, start, num_bytes);
-		BUG_ON(ret);
-	} else {
-		mutex_lock(&caching_ctl->mutex);
-
-		if (start >= caching_ctl->progress) {
-			ret = add_excluded_extent(root, start, num_bytes);
-			BUG_ON(ret);
-		} else if (start + num_bytes <= caching_ctl->progress) {
-			ret = btrfs_remove_free_space(block_group,
-						      start, num_bytes);
-			BUG_ON(ret);
-		} else {
-			num_bytes = caching_ctl->progress - start;
-			ret = btrfs_remove_free_space(block_group,
-						      start, num_bytes);
-			BUG_ON(ret);
 
-			start = caching_ctl->progress;
-			num_bytes = ins->objectid + ins->offset -
-				    caching_ctl->progress;
-			ret = add_excluded_extent(root, start, num_bytes);
-			BUG_ON(ret);
-		}
-
-		mutex_unlock(&caching_ctl->mutex);
-		put_caching_control(caching_ctl);
+	/*
+	 * Mixed block groups will exclude before processing the log so we only
+	 * need to do the exclude dance if this fs isn't mixed.
+	 */
+	if (!btrfs_fs_incompat(root->fs_info, MIXED_GROUPS)) {
+		ret = __exclude_logged_extent(root, ins->objectid, ins->offset);
+		if (ret)
+			return ret;
 	}
 
+	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
+	if (!block_group)
+		return -EINVAL;
+
 	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
 					  RESERVE_ALLOC_NO_ACCOUNT);
-	BUG_ON(ret);
-	btrfs_put_block_group(block_group);
+	BUG_ON(ret); /* logic error */
 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
 					 0, owner, offset, ins, 1);
+	btrfs_put_block_group(block_group);
 	return ret;
 }
 
-struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
-					    struct btrfs_root *root,
-					    u64 bytenr, u32 blocksize,
-					    int level)
+static struct extent_buffer *
+btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		      u64 bytenr, u32 blocksize, int level)
 {
 	struct extent_buffer *buf;
 
@@ -5922,6 +6847,7 @@
 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
 	btrfs_tree_lock(buf);
 	clean_tree_block(trans, root, buf);
+	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
 	btrfs_set_lock_blocking(buf);
 	btrfs_set_buffer_uptodate(buf);
@@ -5953,54 +6879,58 @@
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 	int ret;
+	bool global_updated = false;
 
 	block_rsv = get_block_rsv(trans, root);
 
-	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
-		/*
-		 * If we couldn't reserve metadata bytes try and use some from
-		 * the global reserve.
-		 */
-		if (ret && block_rsv != global_rsv) {
-			ret = block_rsv_use_bytes(global_rsv, blocksize);
-			if (!ret)
-				return global_rsv;
-			return ERR_PTR(ret);
-		} else if (ret) {
-			return ERR_PTR(ret);
-		}
+	if (unlikely(block_rsv->size == 0))
+		goto try_reserve;
+again:
+	ret = block_rsv_use_bytes(block_rsv, blocksize);
+	if (!ret)
 		return block_rsv;
+
+	if (block_rsv->failfast)
+		return ERR_PTR(ret);
+
+	if (block_rsv->type == BTRFS_BLOCK_RSV_GLOBAL && !global_updated) {
+		global_updated = true;
+		update_global_block_rsv(root->fs_info);
+		goto again;
 	}
 
-	ret = block_rsv_use_bytes(block_rsv, blocksize);
+	if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+		static DEFINE_RATELIMIT_STATE(_rs,
+				DEFAULT_RATELIMIT_INTERVAL * 10,
+				/*DEFAULT_RATELIMIT_BURST*/ 1);
+		if (__ratelimit(&_rs))
+			WARN(1, KERN_DEBUG
+				"btrfs: block rsv returned %d\n", ret);
+	}
+try_reserve:
+	ret = reserve_metadata_bytes(root, block_rsv, blocksize,
+				     BTRFS_RESERVE_NO_FLUSH);
 	if (!ret)
 		return block_rsv;
-	if (ret) {
-		static DEFINE_RATELIMIT_STATE(_rs,
-				DEFAULT_RATELIMIT_INTERVAL,
-				/*DEFAULT_RATELIMIT_BURST*/ 2);
-		if (__ratelimit(&_rs)) {
-			printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
-			WARN_ON(1);
-		}
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
-		if (!ret) {
-			return block_rsv;
-		} else if (ret && block_rsv != global_rsv) {
-			ret = block_rsv_use_bytes(global_rsv, blocksize);
-			if (!ret)
-				return global_rsv;
-		}
+	/*
+	 * If we couldn't reserve metadata bytes try and use some from
+	 * the global reserve if its space type is the same as the global
+	 * reservation.
+	 */
+	if (block_rsv->type != BTRFS_BLOCK_RSV_GLOBAL &&
+	    block_rsv->space_info == global_rsv->space_info) {
+		ret = block_rsv_use_bytes(global_rsv, blocksize);
+		if (!ret)
+			return global_rsv;
 	}
-
-	return ERR_PTR(-ENOSPC);
+	return ERR_PTR(ret);
 }
 
-static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
+			    struct btrfs_block_rsv *block_rsv, u32 blocksize)
 {
 	block_rsv_add_bytes(block_rsv, blocksize, 0);
-	block_rsv_release_bytes(block_rsv, NULL, 0);
+	block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
 }
 
 /*
@@ -6021,22 +6951,23 @@
 	struct extent_buffer *buf;
 	u64 flags = 0;
 	int ret;
-
+	bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
+						 SKINNY_METADATA);
 
 	block_rsv = use_block_rsv(trans, root, blocksize);
 	if (IS_ERR(block_rsv))
 		return ERR_CAST(block_rsv);
 
-	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
-				   empty_size, hint, (u64)-1, &ins, 0);
+	ret = btrfs_reserve_extent(root, blocksize, blocksize,
+				   empty_size, hint, &ins, 0);
 	if (ret) {
-		unuse_block_rsv(block_rsv, blocksize);
+		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
 		return ERR_PTR(ret);
 	}
 
 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
 				    blocksize, level);
-	BUG_ON(IS_ERR(buf));
+	BUG_ON(IS_ERR(buf)); /* -ENOMEM */
 
 	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
 		if (parent == 0)
@@ -6047,22 +6978,27 @@
 
 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
 		struct btrfs_delayed_extent_op *extent_op;
-		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-		BUG_ON(!extent_op);
+		extent_op = btrfs_alloc_delayed_extent_op();
+		BUG_ON(!extent_op); /* -ENOMEM */
 		if (key)
 			memcpy(&extent_op->key, key, sizeof(extent_op->key));
 		else
 			memset(&extent_op->key, 0, sizeof(extent_op->key));
 		extent_op->flags_to_set = flags;
-		extent_op->update_key = 1;
+		if (skinny_metadata)
+			extent_op->update_key = 0;
+		else
+			extent_op->update_key = 1;
 		extent_op->update_flags = 1;
 		extent_op->is_data = 0;
+		extent_op->level = level;
 
-		ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
+					ins.objectid,
 					ins.offset, parent, root_objectid,
 					level, BTRFS_ADD_DELAYED_EXTENT,
-					extent_op);
-		BUG_ON(ret);
+					extent_op, 0);
+		BUG_ON(ret); /* -ENOMEM */
 	}
 	return buf;
 }
@@ -6078,6 +7014,7 @@
 	int keep_locks;
 	int reada_slot;
 	int reada_count;
+	int for_reloc;
 };
 
 #define DROP_REFERENCE	1
@@ -6129,9 +7066,12 @@
 			continue;
 
 		/* We don't lock the tree block, it's OK to be racy here */
-		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
-					       &refs, &flags);
-		BUG_ON(ret);
+		ret = btrfs_lookup_extent_info(trans, root, bytenr,
+					       wc->level - 1, 1, &refs,
+					       &flags);
+		/* We don't care about errors in readahead. */
+		if (ret < 0)
+			continue;
 		BUG_ON(refs == 0);
 
 		if (wc->stage == DROP_REFERENCE) {
@@ -6165,7 +7105,7 @@
 }
 
 /*
- * hepler to process tree block while walking down the tree.
+ * helper to process tree block while walking down the tree.
  *
  * when wc->stage == UPDATE_BACKREF, this function updates
  * back refs for pointers in the block.
@@ -6195,10 +7135,12 @@
 	     (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
 		BUG_ON(!path->locks[level]);
 		ret = btrfs_lookup_extent_info(trans, root,
-					       eb->start, eb->len,
+					       eb->start, level, 1,
 					       &wc->refs[level],
 					       &wc->flags[level]);
-		BUG_ON(ret);
+		BUG_ON(ret == -ENOMEM);
+		if (ret)
+			return ret;
 		BUG_ON(wc->refs[level] == 0);
 	}
 
@@ -6216,13 +7158,14 @@
 	/* wc->stage == UPDATE_BACKREF */
 	if (!(wc->flags[level] & flag)) {
 		BUG_ON(!path->locks[level]);
-		ret = btrfs_inc_ref(trans, root, eb, 1);
-		BUG_ON(ret);
-		ret = btrfs_dec_ref(trans, root, eb, 0);
-		BUG_ON(ret);
+		ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
+		BUG_ON(ret); /* -ENOMEM */
+		ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
+		BUG_ON(ret); /* -ENOMEM */
 		ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
-						  eb->len, flag, 0);
-		BUG_ON(ret);
+						  eb->len, flag,
+						  btrfs_header_level(eb), 0);
+		BUG_ON(ret); /* -ENOMEM */
 		wc->flags[level] |= flag;
 	}
 
@@ -6238,7 +7181,7 @@
 }
 
 /*
- * hepler to process tree block pointer.
+ * helper to process tree block pointer.
  *
  * when wc->stage == DROP_REFERENCE, this function checks
  * reference count of the block pointed to. if the block
@@ -6286,16 +7229,25 @@
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
 		if (!next)
 			return -ENOMEM;
+		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
+					       level - 1);
 		reada = 1;
 	}
 	btrfs_tree_lock(next);
 	btrfs_set_lock_blocking(next);
 
-	ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+	ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
 				       &wc->refs[level - 1],
 				       &wc->flags[level - 1]);
-	BUG_ON(ret);
-	BUG_ON(wc->refs[level - 1] == 0);
+	if (ret < 0) {
+		btrfs_tree_unlock(next);
+		return ret;
+	}
+
+	if (unlikely(wc->refs[level - 1] == 0)) {
+		btrfs_err(root->fs_info, "Missing references.");
+		BUG();
+	}
 	*lookup_info = 0;
 
 	if (wc->stage == DROP_REFERENCE) {
@@ -6323,7 +7275,7 @@
 			goto skip;
 	}
 
-	if (!btrfs_buffer_uptodate(next, generation)) {
+	if (!btrfs_buffer_uptodate(next, generation, 0)) {
 		btrfs_tree_unlock(next);
 		free_extent_buffer(next);
 		next = NULL;
@@ -6334,8 +7286,10 @@
 		if (reada && level == 1)
 			reada_walk_down(trans, root, wc, path);
 		next = read_tree_block(root, bytenr, blocksize, generation);
-		if (!next)
+		if (!next || !extent_buffer_uptodate(next)) {
+			free_extent_buffer(next);
 			return -EIO;
+		}
 		btrfs_tree_lock(next);
 		btrfs_set_lock_blocking(next);
 	}
@@ -6362,8 +7316,8 @@
 		}
 
 		ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
-					root->root_key.objectid, level - 1, 0);
-		BUG_ON(ret);
+				root->root_key.objectid, level - 1, 0, 0);
+		BUG_ON(ret); /* -ENOMEM */
 	}
 	btrfs_tree_unlock(next);
 	free_extent_buffer(next);
@@ -6372,7 +7326,7 @@
 }
 
 /*
- * hepler to process tree block while walking up the tree.
+ * helper to process tree block while walking up the tree.
  *
  * when wc->stage == DROP_REFERENCE, this function drops
  * reference count on the block.
@@ -6418,13 +7372,18 @@
 			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
 			ret = btrfs_lookup_extent_info(trans, root,
-						       eb->start, eb->len,
+						       eb->start, level, 1,
 						       &wc->refs[level],
 						       &wc->flags[level]);
-			BUG_ON(ret);
+			if (ret < 0) {
+				btrfs_tree_unlock_rw(eb, path->locks[level]);
+				path->locks[level] = 0;
+				return ret;
+			}
 			BUG_ON(wc->refs[level] == 0);
 			if (wc->refs[level] == 1) {
 				btrfs_tree_unlock_rw(eb, path->locks[level]);
+				path->locks[level] = 0;
 				return 1;
 			}
 		}
@@ -6436,10 +7395,12 @@
 	if (wc->refs[level] == 1) {
 		if (level == 0) {
 			if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
-				ret = btrfs_dec_ref(trans, root, eb, 1);
+				ret = btrfs_dec_ref(trans, root, eb, 1,
+						    wc->for_reloc);
 			else
-				ret = btrfs_dec_ref(trans, root, eb, 0);
-			BUG_ON(ret);
+				ret = btrfs_dec_ref(trans, root, eb, 0,
+						    wc->for_reloc);
+			BUG_ON(ret); /* -ENOMEM */
 		}
 		/* make block locked assertion in clean_tree_block happy */
 		if (!path->locks[level] &&
@@ -6547,9 +7508,12 @@
  * reference count by one. if update_ref is true, this function
  * also make sure backrefs for the shared block and all lower level
  * blocks are properly updated.
+ *
+ * If called with for_reloc == 0, may exit early with -EAGAIN
  */
-void btrfs_drop_snapshot(struct btrfs_root *root,
-			 struct btrfs_block_rsv *block_rsv, int update_ref)
+int btrfs_drop_snapshot(struct btrfs_root *root,
+			 struct btrfs_block_rsv *block_rsv, int update_ref,
+			 int for_reloc)
 {
 	struct btrfs_path *path;
 	struct btrfs_trans_handle *trans;
@@ -6560,6 +7524,7 @@
 	int err = 0;
 	int ret;
 	int level;
+	bool root_dropped = false;
 
 	path = btrfs_alloc_path();
 	if (!path) {
@@ -6575,7 +7540,10 @@
 	}
 
 	trans = btrfs_start_transaction(tree_root, 0);
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_free;
+	}
 
 	if (block_rsv)
 		trans->block_rsv = block_rsv;
@@ -6600,7 +7568,7 @@
 		path->lowest_level = 0;
 		if (ret < 0) {
 			err = ret;
-			goto out_free;
+			goto out_end_trans;
 		}
 		WARN_ON(ret > 0);
 
@@ -6614,19 +7582,23 @@
 		while (1) {
 			btrfs_tree_lock(path->nodes[level]);
 			btrfs_set_lock_blocking(path->nodes[level]);
+			path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
 
 			ret = btrfs_lookup_extent_info(trans, root,
 						path->nodes[level]->start,
-						path->nodes[level]->len,
-						&wc->refs[level],
+						level, 1, &wc->refs[level],
 						&wc->flags[level]);
-			BUG_ON(ret);
+			if (ret < 0) {
+				err = ret;
+				goto out_end_trans;
+			}
 			BUG_ON(wc->refs[level] == 0);
 
 			if (level == root_item->drop_level)
 				break;
 
 			btrfs_tree_unlock(path->nodes[level]);
+			path->locks[level] = 0;
 			WARN_ON(wc->refs[level] != 1);
 			level--;
 		}
@@ -6637,9 +7609,11 @@
 	wc->stage = DROP_REFERENCE;
 	wc->update_ref = update_ref;
 	wc->keep_locks = 0;
+	wc->for_reloc = for_reloc;
 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
 	while (1) {
+
 		ret = walk_down_tree(trans, root, path, wc);
 		if (ret < 0) {
 			err = ret;
@@ -6666,30 +7640,51 @@
 		}
 
 		BUG_ON(wc->level == 0);
-		if (btrfs_should_end_transaction(trans, tree_root)) {
+		if (btrfs_should_end_transaction(trans, tree_root) ||
+		    (!for_reloc && btrfs_need_cleaner_sleep(root))) {
 			ret = btrfs_update_root(trans, tree_root,
 						&root->root_key,
 						root_item);
-			BUG_ON(ret);
+			if (ret) {
+				btrfs_abort_transaction(trans, tree_root, ret);
+				err = ret;
+				goto out_end_trans;
+			}
 
 			btrfs_end_transaction_throttle(trans, tree_root);
+			if (!for_reloc && btrfs_need_cleaner_sleep(root)) {
+				pr_debug("btrfs: drop snapshot early exit\n");
+				err = -EAGAIN;
+				goto out_free;
+			}
+
 			trans = btrfs_start_transaction(tree_root, 0);
-			BUG_ON(IS_ERR(trans));
+			if (IS_ERR(trans)) {
+				err = PTR_ERR(trans);
+				goto out_free;
+			}
 			if (block_rsv)
 				trans->block_rsv = block_rsv;
 		}
 	}
 	btrfs_release_path(path);
-	BUG_ON(err);
+	if (err)
+		goto out_end_trans;
 
 	ret = btrfs_del_root(trans, tree_root, &root->root_key);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_abort_transaction(trans, tree_root, ret);
+		goto out_end_trans;
+	}
 
 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
-		ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
-					   NULL, NULL);
-		BUG_ON(ret < 0);
-		if (ret > 0) {
+		ret = btrfs_find_root(tree_root, &root->root_key, path,
+				      NULL, NULL);
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, tree_root, ret);
+			err = ret;
+			goto out_end_trans;
+		} else if (ret > 0) {
 			/* if we fail to delete the orphan item this time
 			 * around, it'll get picked up the next time.
 			 *
@@ -6701,26 +7696,38 @@
 	}
 
 	if (root->in_radix) {
-		btrfs_free_fs_root(tree_root->fs_info, root);
+		btrfs_drop_and_free_fs_root(tree_root->fs_info, root);
 	} else {
 		free_extent_buffer(root->node);
 		free_extent_buffer(root->commit_root);
-		kfree(root);
+		btrfs_put_fs_root(root);
 	}
-out_free:
+	root_dropped = true;
+out_end_trans:
 	btrfs_end_transaction_throttle(trans, tree_root);
+out_free:
 	kfree(wc);
 	btrfs_free_path(path);
 out:
+	/*
+	 * So if we need to stop dropping the snapshot for whatever reason we
+	 * need to make sure to add it back to the dead root list so that we
+	 * keep trying to do the work later.  This also cleans up roots if we
+	 * don't have it in the radix (like when we recover after a power fail
+	 * or unmount) so we don't leak memory.
+	 */
+	if (!for_reloc && root_dropped == false)
+		btrfs_add_dead_root(root);
 	if (err)
 		btrfs_std_error(root->fs_info, err);
-	return;
+	return err;
 }
 
 /*
  * drop subtree rooted at tree block 'node'.
  *
  * NOTE: this function will unlock and release tree block 'node'
+ * only used by relocation code
  */
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
@@ -6765,6 +7772,7 @@
 	wc->stage = DROP_REFERENCE;
 	wc->update_ref = 0;
 	wc->keep_locks = 1;
+	wc->for_reloc = 1;
 	wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
 
 	while (1) {
@@ -6789,8 +7797,15 @@
 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 {
 	u64 num_devices;
-	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
-		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+	u64 stripped;
+
+	/*
+	 * if restripe for this chunk_type is on pick target profile and
+	 * return, otherwise do the usual balance
+	 */
+	stripped = get_restripe_target(root->fs_info, flags);
+	if (stripped)
+		return extended_to_chunk(stripped);
 
 	/*
 	 * we add in the count of missing devices because we want
@@ -6800,6 +7815,10 @@
 	num_devices = root->fs_info->fs_devices->rw_devices +
 		root->fs_info->fs_devices->missing_devices;
 
+	stripped = BTRFS_BLOCK_GROUP_RAID0 |
+		BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
+		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
+
 	if (num_devices == 1) {
 		stripped |= BTRFS_BLOCK_GROUP_DUP;
 		stripped = flags & ~stripped;
@@ -6812,7 +7831,6 @@
 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
 			     BTRFS_BLOCK_GROUP_RAID10))
 			return stripped | BTRFS_BLOCK_GROUP_DUP;
-		return flags;
 	} else {
 		/* they already had raid on here, just return */
 		if (flags & stripped)
@@ -6825,9 +7843,9 @@
 		if (flags & BTRFS_BLOCK_GROUP_DUP)
 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
 
-		/* turn single device chunks into raid0 */
-		return stripped | BTRFS_BLOCK_GROUP_RAID0;
+		/* this is drive concat, leave it alone */
 	}
+
 	return flags;
 }
 
@@ -6886,18 +7904,22 @@
 	BUG_ON(cache->ro);
 
 	trans = btrfs_join_transaction(root);
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
 	alloc_flags = update_block_group_flags(root, cache->flags);
-	if (alloc_flags != cache->flags)
-		do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
-			       CHUNK_ALLOC_FORCE);
+	if (alloc_flags != cache->flags) {
+		ret = do_chunk_alloc(trans, root, alloc_flags,
+				     CHUNK_ALLOC_FORCE);
+		if (ret < 0)
+			goto out;
+	}
 
 	ret = set_block_group_ro(cache, 0);
 	if (!ret)
 		goto out;
 	alloc_flags = get_alloc_profile(root, cache->space_info->flags);
-	ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+	ret = do_chunk_alloc(trans, root, alloc_flags,
 			     CHUNK_ALLOC_FORCE);
 	if (ret < 0)
 		goto out;
@@ -6911,7 +7933,7 @@
 			    struct btrfs_root *root, u64 type)
 {
 	u64 alloc_flags = get_alloc_profile(root, type);
-	return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
+	return do_chunk_alloc(trans, root, alloc_flags,
 			      CHUNK_ALLOC_FORCE);
 }
 
@@ -6971,7 +7993,7 @@
 	return free_bytes;
 }
 
-int btrfs_set_block_group_rw(struct btrfs_root *root,
+void btrfs_set_block_group_rw(struct btrfs_root *root,
 			      struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_space_info *sinfo = cache->space_info;
@@ -6987,7 +8009,6 @@
 	cache->ro = 0;
 	spin_unlock(&cache->lock);
 	spin_unlock(&sinfo->lock);
-	return 0;
 }
 
 /*
@@ -7002,9 +8023,11 @@
 	struct btrfs_space_info *space_info;
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 	struct btrfs_device *device;
+	struct btrfs_trans_handle *trans;
 	u64 min_free;
 	u64 dev_min = 1;
 	u64 dev_nr = 0;
+	u64 target;
 	int index;
 	int full = 0;
 	int ret = 0;
@@ -7045,13 +8068,11 @@
 	/*
 	 * ok we don't have enough space, but maybe we have free space on our
 	 * devices to allocate new chunks for relocation, so loop through our
-	 * alloc devices and guess if we have enough space.  However, if we
-	 * were marked as full, then we know there aren't enough chunks, and we
-	 * can just return.
+	 * alloc devices and guess if we have enough space.  if this block
+	 * group is going to be restriped, run checks against the target
+	 * profile instead of the current one.
 	 */
 	ret = -1;
-	if (full)
-		goto out;
 
 	/*
 	 * index:
@@ -7061,21 +8082,41 @@
 	 *      3: raid0
 	 *      4: single
 	 */
-	index = get_block_group_index(block_group);
-	if (index == 0) {
+	target = get_restripe_target(root->fs_info, block_group->flags);
+	if (target) {
+		index = __get_raid_index(extended_to_chunk(target));
+	} else {
+		/*
+		 * this is just a balance, so if we were marked as full
+		 * we know there is no space for a new chunk
+		 */
+		if (full)
+			goto out;
+
+		index = get_block_group_index(block_group);
+	}
+
+	if (index == BTRFS_RAID_RAID10) {
 		dev_min = 4;
 		/* Divide by 2 */
 		min_free >>= 1;
-	} else if (index == 1) {
+	} else if (index == BTRFS_RAID_RAID1) {
 		dev_min = 2;
-	} else if (index == 2) {
+	} else if (index == BTRFS_RAID_DUP) {
 		/* Multiply by 2 */
 		min_free <<= 1;
-	} else if (index == 3) {
+	} else if (index == BTRFS_RAID_RAID0) {
 		dev_min = fs_devices->rw_devices;
 		do_div(min_free, dev_min);
 	}
 
+	/* We need to do this so that we can look at pending chunks */
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
 	mutex_lock(&root->fs_info->chunk_mutex);
 	list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
 		u64 dev_offset;
@@ -7084,8 +8125,9 @@
 		 * check to make sure we can actually find a chunk with enough
 		 * space to fit our block group in.
 		 */
-		if (device->total_bytes > device->bytes_used + min_free) {
-			ret = find_free_dev_extent(NULL, device, min_free,
+		if (device->total_bytes > device->bytes_used + min_free &&
+		    !device->is_tgtdev_for_dev_replace) {
+			ret = find_free_dev_extent(trans, device, min_free,
 						   &dev_offset, NULL);
 			if (!ret)
 				dev_nr++;
@@ -7097,6 +8139,7 @@
 		}
 	}
 	mutex_unlock(&root->fs_info->chunk_mutex);
+	btrfs_end_transaction(trans, root);
 out:
 	btrfs_put_block_group(block_group);
 	return ret;
@@ -7207,7 +8250,8 @@
 		 * We haven't cached this block group, which means we could
 		 * possibly have excluded extents on this block group.
 		 */
-		if (block_group->cached == BTRFS_CACHE_NO)
+		if (block_group->cached == BTRFS_CACHE_NO ||
+		    block_group->cached == BTRFS_CACHE_ERROR)
 			free_excluded_extents(info->extent_root, block_group);
 
 		btrfs_remove_free_space_cache(block_group);
@@ -7231,12 +8275,15 @@
 		space_info = list_entry(info->space_info.next,
 					struct btrfs_space_info,
 					list);
-		if (space_info->bytes_pinned > 0 ||
-		    space_info->bytes_reserved > 0 ||
-		    space_info->bytes_may_use > 0) {
-			WARN_ON(1);
-			dump_space_info(space_info, 0, 0);
+		if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
+			if (space_info->bytes_pinned > 0 ||
+			    space_info->bytes_reserved > 0 ||
+			    space_info->bytes_may_use > 0) {
+				WARN_ON(1);
+				dump_space_info(space_info, 0, 0);
+			}
 		}
+		percpu_counter_destroy(&space_info->total_bytes_pinned);
 		list_del(&space_info->list);
 		kfree(space_info);
 	}
@@ -7309,8 +8356,21 @@
 		INIT_LIST_HEAD(&cache->list);
 		INIT_LIST_HEAD(&cache->cluster_list);
 
-		if (need_clear)
+		if (need_clear) {
+			/*
+			 * When we mount with old space cache, we need to
+			 * set BTRFS_DC_CLEAR and set dirty flag.
+			 *
+			 * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
+			 *    truncate the old free space cache inode and
+			 *    setup a new one.
+			 * b) Setting 'dirty flag' makes sure that we flush
+			 *    the new space cache info onto disk.
+			 */
 			cache->disk_cache_state = BTRFS_DC_CLEAR;
+			if (btrfs_test_opt(root, SPACE_CACHE))
+				cache->dirty = 1;
+		}
 
 		read_extent_buffer(leaf, &cache->item,
 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -7321,7 +8381,9 @@
 		btrfs_release_path(path);
 		cache->flags = btrfs_block_group_flags(&cache->item);
 		cache->sectorsize = root->sectorsize;
-
+		cache->full_stripe_len = btrfs_full_stripe_len(root,
+					       &root->fs_info->mapping_tree,
+					       found_key.objectid);
 		btrfs_init_free_space_ctl(cache);
 
 		/*
@@ -7329,7 +8391,17 @@
 		 * info has super bytes accounted for, otherwise we'll think
 		 * we have more space than we actually do.
 		 */
-		exclude_super_stripes(root, cache);
+		ret = exclude_super_stripes(root, cache);
+		if (ret) {
+			/*
+			 * We may have excluded something, so call this just in
+			 * case.
+			 */
+			free_excluded_extents(root, cache);
+			kfree(cache->free_space_ctl);
+			kfree(cache);
+			goto error;
+		}
 
 		/*
 		 * check for two cases, either we are full, and therefore
@@ -7352,10 +8424,26 @@
 			free_excluded_extents(root, cache);
 		}
 
+		ret = btrfs_add_block_group_cache(root->fs_info, cache);
+		if (ret) {
+			btrfs_remove_free_space_cache(cache);
+			btrfs_put_block_group(cache);
+			goto error;
+		}
+
 		ret = update_space_info(info, cache->flags, found_key.offset,
 					btrfs_block_group_used(&cache->item),
 					&space_info);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_remove_free_space_cache(cache);
+			spin_lock(&info->block_group_cache_lock);
+			rb_erase(&cache->cache_node,
+				 &info->block_group_cache_tree);
+			spin_unlock(&info->block_group_cache_lock);
+			btrfs_put_block_group(cache);
+			goto error;
+		}
+
 		cache->space_info = space_info;
 		spin_lock(&cache->space_info->lock);
 		cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7363,9 +8451,6 @@
 
 		__link_block_group(space_info, cache);
 
-		ret = btrfs_add_block_group_cache(root->fs_info, cache);
-		BUG_ON(ret);
-
 		set_avail_alloc_bits(root->fs_info, cache->flags);
 		if (btrfs_chunk_readonly(root, cache->key.objectid))
 			set_block_group_ro(cache, 1);
@@ -7375,15 +8460,21 @@
 		if (!(get_alloc_profile(root, space_info->flags) &
 		      (BTRFS_BLOCK_GROUP_RAID10 |
 		       BTRFS_BLOCK_GROUP_RAID1 |
+		       BTRFS_BLOCK_GROUP_RAID5 |
+		       BTRFS_BLOCK_GROUP_RAID6 |
 		       BTRFS_BLOCK_GROUP_DUP)))
 			continue;
 		/*
 		 * avoid allocating from un-mirrored block group if there are
 		 * mirrored block groups.
 		 */
-		list_for_each_entry(cache, &space_info->block_groups[3], list)
+		list_for_each_entry(cache,
+				&space_info->block_groups[BTRFS_RAID_RAID0],
+				list)
 			set_block_group_ro(cache, 1);
-		list_for_each_entry(cache, &space_info->block_groups[4], list)
+		list_for_each_entry(cache,
+				&space_info->block_groups[BTRFS_RAID_SINGLE],
+				list)
 			set_block_group_ro(cache, 1);
 	}
 
@@ -7394,6 +8485,38 @@
 	return ret;
 }
 
+void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root)
+{
+	struct btrfs_block_group_cache *block_group, *tmp;
+	struct btrfs_root *extent_root = root->fs_info->extent_root;
+	struct btrfs_block_group_item item;
+	struct btrfs_key key;
+	int ret = 0;
+
+	list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
+				 new_bg_list) {
+		list_del_init(&block_group->new_bg_list);
+
+		if (ret)
+			continue;
+
+		spin_lock(&block_group->lock);
+		memcpy(&item, &block_group->item, sizeof(item));
+		memcpy(&key, &block_group->key, sizeof(key));
+		spin_unlock(&block_group->lock);
+
+		ret = btrfs_insert_item(trans, extent_root, &key, &item,
+					sizeof(item));
+		if (ret)
+			btrfs_abort_transaction(trans, extent_root, ret);
+		ret = btrfs_finish_chunk_alloc(trans, extent_root,
+					       key.objectid, key.offset);
+		if (ret)
+			btrfs_abort_transaction(trans, extent_root, ret);
+	}
+}
+
 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, u64 bytes_used,
 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
@@ -7422,11 +8545,15 @@
 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
 	cache->sectorsize = root->sectorsize;
 	cache->fs_info = root->fs_info;
+	cache->full_stripe_len = btrfs_full_stripe_len(root,
+					       &root->fs_info->mapping_tree,
+					       chunk_offset);
 
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
 	INIT_LIST_HEAD(&cache->list);
 	INIT_LIST_HEAD(&cache->cluster_list);
+	INIT_LIST_HEAD(&cache->new_bg_list);
 
 	btrfs_init_free_space_ctl(cache);
 
@@ -7437,16 +8564,42 @@
 
 	cache->last_byte_to_unpin = (u64)-1;
 	cache->cached = BTRFS_CACHE_FINISHED;
-	exclude_super_stripes(root, cache);
+	ret = exclude_super_stripes(root, cache);
+	if (ret) {
+		/*
+		 * We may have excluded something, so call this just in
+		 * case.
+		 */
+		free_excluded_extents(root, cache);
+		kfree(cache->free_space_ctl);
+		kfree(cache);
+		return ret;
+	}
 
 	add_new_free_space(cache, root->fs_info, chunk_offset,
 			   chunk_offset + size);
 
 	free_excluded_extents(root, cache);
 
+	ret = btrfs_add_block_group_cache(root->fs_info, cache);
+	if (ret) {
+		btrfs_remove_free_space_cache(cache);
+		btrfs_put_block_group(cache);
+		return ret;
+	}
+
 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
 				&cache->space_info);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_remove_free_space_cache(cache);
+		spin_lock(&root->fs_info->block_group_cache_lock);
+		rb_erase(&cache->cache_node,
+			 &root->fs_info->block_group_cache_tree);
+		spin_unlock(&root->fs_info->block_group_cache_lock);
+		btrfs_put_block_group(cache);
+		return ret;
+	}
+	update_global_block_rsv(root->fs_info);
 
 	spin_lock(&cache->space_info->lock);
 	cache->space_info->bytes_readonly += cache->bytes_super;
@@ -7454,18 +8607,28 @@
 
 	__link_block_group(cache->space_info, cache);
 
-	ret = btrfs_add_block_group_cache(root->fs_info, cache);
-	BUG_ON(ret);
-
-	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
-				sizeof(cache->item));
-	BUG_ON(ret);
+	list_add_tail(&cache->new_bg_list, &trans->new_bgs);
 
 	set_avail_alloc_bits(extent_root->fs_info, type);
 
 	return 0;
 }
 
+static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
+{
+	u64 extra_flags = chunk_to_extended(flags) &
+				BTRFS_EXTENDED_PROFILE_MASK;
+
+	write_seqlock(&fs_info->profiles_lock);
+	if (flags & BTRFS_BLOCK_GROUP_DATA)
+		fs_info->avail_data_alloc_bits &= ~extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_METADATA)
+		fs_info->avail_metadata_alloc_bits &= ~extra_flags;
+	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
+		fs_info->avail_system_alloc_bits &= ~extra_flags;
+	write_sequnlock(&fs_info->profiles_lock);
+}
+
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start)
 {
@@ -7476,6 +8639,7 @@
 	struct btrfs_key key;
 	struct inode *inode;
 	int ret;
+	int index;
 	int factor;
 
 	root = root->fs_info->extent_root;
@@ -7491,6 +8655,7 @@
 	free_excluded_extents(root, block_group);
 
 	memcpy(&key, &block_group->key, sizeof(key));
+	index = get_block_group_index(block_group);
 	if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
 				  BTRFS_BLOCK_GROUP_RAID1 |
 				  BTRFS_BLOCK_GROUP_RAID10))
@@ -7522,7 +8687,10 @@
 	inode = lookup_free_space_inode(tree_root, block_group, path);
 	if (!IS_ERR(inode)) {
 		ret = btrfs_orphan_add(trans, inode);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_add_delayed_iput(inode);
+			goto out;
+		}
 		clear_nlink(inode);
 		/* One for the block groups ref */
 		spin_lock(&block_group->lock);
@@ -7557,6 +8725,9 @@
 	spin_lock(&root->fs_info->block_group_cache_lock);
 	rb_erase(&block_group->cache_node,
 		 &root->fs_info->block_group_cache_tree);
+
+	if (root->fs_info->first_logical_byte == block_group->key.objectid)
+		root->fs_info->first_logical_byte = (u64)-1;
 	spin_unlock(&root->fs_info->block_group_cache_lock);
 
 	down_write(&block_group->space_info->groups_sem);
@@ -7565,6 +8736,8 @@
 	 * are still on the list after taking the semaphore
 	 */
 	list_del_init(&block_group->list);
+	if (list_empty(&block_group->space_info->block_groups[index]))
+		clear_avail_alloc_bits(root->fs_info, block_group->flags);
 	up_write(&block_group->space_info->groups_sem);
 
 	if (block_group->cached == BTRFS_CACHE_STARTED)
@@ -7654,9 +8827,16 @@
 	u64 start;
 	u64 end;
 	u64 trimmed = 0;
+	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
 	int ret = 0;
 
-	cache = btrfs_lookup_block_group(fs_info, range->start);
+	/*
+	 * try to trim all FS space, our block group may start from non-zero.
+	 */
+	if (range->len == total_bytes)
+		cache = btrfs_lookup_first_block_group(fs_info, range->start);
+	else
+		cache = btrfs_lookup_block_group(fs_info, range->start);
 
 	while (cache) {
 		if (cache->key.objectid >= (range->start + range->len)) {
@@ -7670,9 +8850,16 @@
 
 		if (end - start >= range->minlen) {
 			if (!block_group_cache_done(cache)) {
-				ret = cache_block_group(cache, NULL, root, 0);
-				if (!ret)
-					wait_block_group_cache_done(cache);
+				ret = cache_block_group(cache, 0);
+				if (ret) {
+					btrfs_put_block_group(cache);
+					break;
+				}
+				ret = wait_block_group_cache_done(cache);
+				if (ret) {
+					btrfs_put_block_group(cache);
+					break;
+				}
 			}
 			ret = btrfs_trim_block_group(cache,
 						     &group_trimmed,
diff -ur a/fs/btrfs/file.c b/fs/btrfs/file.c
--- a/fs/btrfs/file.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/file.c	2014-02-17 11:56:59.000000000 +0100
@@ -30,16 +30,18 @@
 #include <linux/statfs.h>
 #include <linux/compat.h>
 #include <linux/slab.h>
+#include <linux/btrfs.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ioctl.h"
 #include "print-tree.h"
 #include "tree-log.h"
 #include "locking.h"
 #include "compat.h"
+#include "volumes.h"
 
+static struct kmem_cache *btrfs_inode_defrag_cachep;
 /*
  * when auto defrag is enabled we
  * queue up these defrag structs to remember which
@@ -65,6 +67,21 @@
 	int cycled;
 };
 
+static int __compare_inode_defrag(struct inode_defrag *defrag1,
+				  struct inode_defrag *defrag2)
+{
+	if (defrag1->root > defrag2->root)
+		return 1;
+	else if (defrag1->root < defrag2->root)
+		return -1;
+	else if (defrag1->ino > defrag2->ino)
+		return 1;
+	else if (defrag1->ino < defrag2->ino)
+		return -1;
+	else
+		return 0;
+}
+
 /* pop a record for an inode into the defrag tree.  The lock
  * must be held already
  *
@@ -74,22 +91,24 @@
  * If an existing record is found the defrag item you
  * pass in is freed
  */
-static void __btrfs_add_inode_defrag(struct inode *inode,
+static int __btrfs_add_inode_defrag(struct inode *inode,
 				    struct inode_defrag *defrag)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct inode_defrag *entry;
 	struct rb_node **p;
 	struct rb_node *parent = NULL;
+	int ret;
 
 	p = &root->fs_info->defrag_inodes.rb_node;
 	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct inode_defrag, rb_node);
 
-		if (defrag->ino < entry->ino)
+		ret = __compare_inode_defrag(defrag, entry);
+		if (ret < 0)
 			p = &parent->rb_left;
-		else if (defrag->ino > entry->ino)
+		else if (ret > 0)
 			p = &parent->rb_right;
 		else {
 			/* if we're reinserting an entry for
@@ -100,18 +119,24 @@
 				entry->transid = defrag->transid;
 			if (defrag->last_offset > entry->last_offset)
 				entry->last_offset = defrag->last_offset;
-			goto exists;
+			return -EEXIST;
 		}
 	}
-	BTRFS_I(inode)->in_defrag = 1;
+	set_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
 	rb_link_node(&defrag->rb_node, parent, p);
 	rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
-	return;
+	return 0;
+}
 
-exists:
-	kfree(defrag);
-	return;
+static inline int __need_auto_defrag(struct btrfs_root *root)
+{
+	if (!btrfs_test_opt(root, AUTO_DEFRAG))
+		return 0;
+
+	if (btrfs_fs_closing(root->fs_info))
+		return 0;
 
+	return 1;
 }
 
 /*
@@ -124,14 +149,12 @@
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct inode_defrag *defrag;
 	u64 transid;
+	int ret;
 
-	if (!btrfs_test_opt(root, AUTO_DEFRAG))
+	if (!__need_auto_defrag(root))
 		return 0;
 
-	if (btrfs_fs_closing(root->fs_info))
-		return 0;
-
-	if (BTRFS_I(inode)->in_defrag)
+	if (test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags))
 		return 0;
 
 	if (trans)
@@ -139,7 +162,7 @@
 	else
 		transid = BTRFS_I(inode)->root->last_trans;
 
-	defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
+	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
 	if (!defrag)
 		return -ENOMEM;
 
@@ -148,77 +171,219 @@
 	defrag->root = root->root_key.objectid;
 
 	spin_lock(&root->fs_info->defrag_inodes_lock);
-	if (!BTRFS_I(inode)->in_defrag)
-		__btrfs_add_inode_defrag(inode, defrag);
-	else
-		kfree(defrag);
+	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags)) {
+		/*
+		 * If we set IN_DEFRAG flag and evict the inode from memory,
+		 * and then re-read this inode, this new inode doesn't have
+		 * IN_DEFRAG flag. At the case, we may find the existed defrag.
+		 */
+		ret = __btrfs_add_inode_defrag(inode, defrag);
+		if (ret)
+			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	} else {
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	}
 	spin_unlock(&root->fs_info->defrag_inodes_lock);
 	return 0;
 }
 
 /*
- * must be called with the defrag_inodes lock held
+ * Requeue the defrag object. If there is a defrag object that points to
+ * the same inode in the tree, we will merge them together (by
+ * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
+ */
+static void btrfs_requeue_inode_defrag(struct inode *inode,
+				       struct inode_defrag *defrag)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	if (!__need_auto_defrag(root))
+		goto out;
+
+	/*
+	 * Here we don't check the IN_DEFRAG flag, because we need to merge
+	 * them together.
+	 */
+	spin_lock(&root->fs_info->defrag_inodes_lock);
+	ret = __btrfs_add_inode_defrag(inode, defrag);
+	spin_unlock(&root->fs_info->defrag_inodes_lock);
+	if (ret)
+		goto out;
+	return;
+out:
+	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+}
+
+/*
+ * pick the defragable inode that we want; if it doesn't exist, we will get
+ * the next one.
  */
-struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
-					     struct rb_node **next)
+static struct inode_defrag *
+btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
 {
 	struct inode_defrag *entry = NULL;
+	struct inode_defrag tmp;
 	struct rb_node *p;
 	struct rb_node *parent = NULL;
+	int ret;
+
+	tmp.ino = ino;
+	tmp.root = root;
 
-	p = info->defrag_inodes.rb_node;
+	spin_lock(&fs_info->defrag_inodes_lock);
+	p = fs_info->defrag_inodes.rb_node;
 	while (p) {
 		parent = p;
 		entry = rb_entry(parent, struct inode_defrag, rb_node);
 
-		if (ino < entry->ino)
+		ret = __compare_inode_defrag(&tmp, entry);
+		if (ret < 0)
 			p = parent->rb_left;
-		else if (ino > entry->ino)
+		else if (ret > 0)
 			p = parent->rb_right;
 		else
-			return entry;
+			goto out;
 	}
 
-	if (next) {
-		while (parent && ino > entry->ino) {
-			parent = rb_next(parent);
+	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
+		parent = rb_next(parent);
+		if (parent)
 			entry = rb_entry(parent, struct inode_defrag, rb_node);
-		}
-		*next = parent;
+		else
+			entry = NULL;
 	}
-	return NULL;
+out:
+	if (entry)
+		rb_erase(parent, &fs_info->defrag_inodes);
+	spin_unlock(&fs_info->defrag_inodes_lock);
+	return entry;
 }
 
-/*
- * run through the list of inodes in the FS that need
- * defragging
- */
-int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
 {
 	struct inode_defrag *defrag;
+	struct rb_node *node;
+
+	spin_lock(&fs_info->defrag_inodes_lock);
+	node = rb_first(&fs_info->defrag_inodes);
+	while (node) {
+		rb_erase(node, &fs_info->defrag_inodes);
+		defrag = rb_entry(node, struct inode_defrag, rb_node);
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+
+		if (need_resched()) {
+			spin_unlock(&fs_info->defrag_inodes_lock);
+			cond_resched();
+			spin_lock(&fs_info->defrag_inodes_lock);
+		}
+
+		node = rb_first(&fs_info->defrag_inodes);
+	}
+	spin_unlock(&fs_info->defrag_inodes_lock);
+}
+
+#define BTRFS_DEFRAG_BATCH	1024
+
+static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
+				    struct inode_defrag *defrag)
+{
 	struct btrfs_root *inode_root;
 	struct inode *inode;
-	struct rb_node *n;
 	struct btrfs_key key;
 	struct btrfs_ioctl_defrag_range_args range;
-	u64 first_ino = 0;
 	int num_defrag;
-	int defrag_batch = 1024;
+	int index;
+	int ret;
+
+	/* get the inode */
+	key.objectid = defrag->root;
+	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
+	key.offset = (u64)-1;
+
+	index = srcu_read_lock(&fs_info->subvol_srcu);
+
+	inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(inode_root)) {
+		ret = PTR_ERR(inode_root);
+		goto cleanup;
+	}
+
+	key.objectid = defrag->ino;
+	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
+	key.offset = 0;
+	inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
+	if (IS_ERR(inode)) {
+		ret = PTR_ERR(inode);
+		goto cleanup;
+	}
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
 
+	/* do a chunk of defrag */
+	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
 	memset(&range, 0, sizeof(range));
 	range.len = (u64)-1;
+	range.start = defrag->last_offset;
+
+	sb_start_write(fs_info->sb);
+	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
+				       BTRFS_DEFRAG_BATCH);
+	sb_end_write(fs_info->sb);
+	/*
+	 * if we filled the whole defrag batch, there
+	 * must be more work to do.  Queue this defrag
+	 * again
+	 */
+	if (num_defrag == BTRFS_DEFRAG_BATCH) {
+		defrag->last_offset = range.start;
+		btrfs_requeue_inode_defrag(inode, defrag);
+	} else if (defrag->last_offset && !defrag->cycled) {
+		/*
+		 * we didn't fill our defrag batch, but
+		 * we didn't start at zero.  Make sure we loop
+		 * around to the start of the file.
+		 */
+		defrag->last_offset = 0;
+		defrag->cycled = 1;
+		btrfs_requeue_inode_defrag(inode, defrag);
+	} else {
+		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	}
+
+	iput(inode);
+	return 0;
+cleanup:
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
+	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	return ret;
+}
+
+/*
+ * run through the list of inodes in the FS that need
+ * defragging
+ */
+int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
+{
+	struct inode_defrag *defrag;
+	u64 first_ino = 0;
+	u64 root_objectid = 0;
 
 	atomic_inc(&fs_info->defrag_running);
-	spin_lock(&fs_info->defrag_inodes_lock);
 	while(1) {
-		n = NULL;
+		/* Pause the auto defragger. */
+		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
+			     &fs_info->fs_state))
+			break;
+
+		if (!__need_auto_defrag(fs_info->tree_root))
+			break;
 
 		/* find an inode to defrag */
-		defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
+		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
+						 first_ino);
 		if (!defrag) {
-			if (n)
-				defrag = rb_entry(n, struct inode_defrag, rb_node);
-			else if (first_ino) {
+			if (root_objectid || first_ino) {
+				root_objectid = 0;
 				first_ino = 0;
 				continue;
 			} else {
@@ -226,69 +391,11 @@
 			}
 		}
 
-		/* remove it from the rbtree */
 		first_ino = defrag->ino + 1;
-		rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
-
-		if (btrfs_fs_closing(fs_info))
-			goto next_free;
-
-		spin_unlock(&fs_info->defrag_inodes_lock);
+		root_objectid = defrag->root;
 
-		/* get the inode */
-		key.objectid = defrag->root;
-		btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-		key.offset = (u64)-1;
-		inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
-		if (IS_ERR(inode_root))
-			goto next;
-
-		key.objectid = defrag->ino;
-		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
-		key.offset = 0;
-
-		inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
-		if (IS_ERR(inode))
-			goto next;
-
-		/* do a chunk of defrag */
-		BTRFS_I(inode)->in_defrag = 0;
-		range.start = defrag->last_offset;
-		num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
-					       defrag_batch);
-		/*
-		 * if we filled the whole defrag batch, there
-		 * must be more work to do.  Queue this defrag
-		 * again
-		 */
-		if (num_defrag == defrag_batch) {
-			defrag->last_offset = range.start;
-			__btrfs_add_inode_defrag(inode, defrag);
-			/*
-			 * we don't want to kfree defrag, we added it back to
-			 * the rbtree
-			 */
-			defrag = NULL;
-		} else if (defrag->last_offset && !defrag->cycled) {
-			/*
-			 * we didn't fill our defrag batch, but
-			 * we didn't start at zero.  Make sure we loop
-			 * around to the start of the file.
-			 */
-			defrag->last_offset = 0;
-			defrag->cycled = 1;
-			__btrfs_add_inode_defrag(inode, defrag);
-			defrag = NULL;
-		}
-
-		iput(inode);
-next:
-		spin_lock(&fs_info->defrag_inodes_lock);
-next_free:
-		kfree(defrag);
+		__btrfs_run_defrag_inode(fs_info, defrag);
 	}
-	spin_unlock(&fs_info->defrag_inodes_lock);
-
 	atomic_dec(&fs_info->defrag_running);
 
 	/*
@@ -362,7 +469,7 @@
 /*
  * unlocks pages after btrfs_file_write is done with them
  */
-void btrfs_drop_pages(struct page **pages, size_t num_pages)
+static void btrfs_drop_pages(struct page **pages, size_t num_pages)
 {
 	size_t i;
 	for (i = 0; i < num_pages; i++) {
@@ -386,9 +493,9 @@
  * doing real data extents, marking pages dirty and delalloc as required.
  */
 int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
-		      struct page **pages, size_t num_pages,
-		      loff_t pos, size_t write_bytes,
-		      struct extent_state **cached)
+			     struct page **pages, size_t num_pages,
+			     loff_t pos, size_t write_bytes,
+			     struct extent_state **cached)
 {
 	int err = 0;
 	int i;
@@ -399,8 +506,7 @@
 	loff_t isize = i_size_read(inode);
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
-	num_bytes = (write_bytes + pos - start_pos +
-		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
+	num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize);
 
 	end_of_last_block = start_pos + num_bytes - 1;
 	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
@@ -429,18 +535,20 @@
  * this drops all the extents in the cache that intersect the range
  * [start, end].  Existing extents are split as required.
  */
-int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
-			    int skip_pinned)
+void btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
+			     int skip_pinned)
 {
 	struct extent_map *em;
 	struct extent_map *split = NULL;
 	struct extent_map *split2 = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	u64 len = end - start + 1;
+	u64 gen;
 	int ret;
 	int testend = 1;
 	unsigned long flags;
 	int compressed = 0;
+	bool modified;
 
 	WARN_ON(end < start);
 	if (end == (u64)-1) {
@@ -448,11 +556,15 @@
 		testend = 0;
 	}
 	while (1) {
+		int no_splits = 0;
+
+		modified = false;
 		if (!split)
 			split = alloc_extent_map();
 		if (!split2)
 			split2 = alloc_extent_map();
-		BUG_ON(!split || !split2);
+		if (!split || !split2)
+			no_splits = 1;
 
 		write_lock(&em_tree->lock);
 		em = lookup_extent_mapping(em_tree, start, len);
@@ -461,6 +573,7 @@
 			break;
 		}
 		flags = em->flags;
+		gen = em->generation;
 		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
 			if (testend && em->start + em->len >= start + len) {
 				free_extent_map(em);
@@ -476,31 +589,46 @@
 		}
 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+		clear_bit(EXTENT_FLAG_LOGGING, &flags);
+		modified = !list_empty(&em->list);
 		remove_extent_mapping(em_tree, em);
+		if (no_splits)
+			goto next;
 
-		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
-		    em->start < start) {
+		if (em->start < start) {
 			split->start = em->start;
 			split->len = start - em->start;
-			split->orig_start = em->orig_start;
-			split->block_start = em->block_start;
 
-			if (compressed)
-				split->block_len = em->block_len;
-			else
-				split->block_len = split->len;
+			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+				split->orig_start = em->orig_start;
+				split->block_start = em->block_start;
+
+				if (compressed)
+					split->block_len = em->block_len;
+				else
+					split->block_len = split->len;
+				split->orig_block_len = max(split->block_len,
+						em->orig_block_len);
+				split->ram_bytes = em->ram_bytes;
+			} else {
+				split->orig_start = split->start;
+				split->block_len = 0;
+				split->block_start = em->block_start;
+				split->orig_block_len = 0;
+				split->ram_bytes = split->len;
+			}
 
+			split->generation = gen;
 			split->bdev = em->bdev;
 			split->flags = flags;
 			split->compress_type = em->compress_type;
-			ret = add_extent_mapping(em_tree, split);
-			BUG_ON(ret);
+			ret = add_extent_mapping(em_tree, split, modified);
+			BUG_ON(ret); /* Logic error */
 			free_extent_map(split);
 			split = split2;
 			split2 = NULL;
 		}
-		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
-		    testend && em->start + em->len > start + len) {
+		if (testend && em->start + em->len > start + len) {
 			u64 diff = start + len - em->start;
 
 			split->start = start + len;
@@ -508,22 +636,37 @@
 			split->bdev = em->bdev;
 			split->flags = flags;
 			split->compress_type = em->compress_type;
+			split->generation = gen;
 
-			if (compressed) {
-				split->block_len = em->block_len;
-				split->block_start = em->block_start;
-				split->orig_start = em->orig_start;
+			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+				split->orig_block_len = max(em->block_len,
+						    em->orig_block_len);
+
+				split->ram_bytes = em->ram_bytes;
+				if (compressed) {
+					split->block_len = em->block_len;
+					split->block_start = em->block_start;
+					split->orig_start = em->orig_start;
+				} else {
+					split->block_len = split->len;
+					split->block_start = em->block_start
+						+ diff;
+					split->orig_start = em->orig_start;
+				}
 			} else {
-				split->block_len = split->len;
-				split->block_start = em->block_start + diff;
+				split->ram_bytes = split->len;
 				split->orig_start = split->start;
+				split->block_len = 0;
+				split->block_start = em->block_start;
+				split->orig_block_len = 0;
 			}
 
-			ret = add_extent_mapping(em_tree, split);
-			BUG_ON(ret);
+			ret = add_extent_mapping(em_tree, split, modified);
+			BUG_ON(ret); /* Logic error */
 			free_extent_map(split);
 			split = NULL;
 		}
+next:
 		write_unlock(&em_tree->lock);
 
 		/* once for us */
@@ -535,7 +678,6 @@
 		free_extent_map(split);
 	if (split2)
 		free_extent_map(split2);
-	return 0;
 }
 
 /*
@@ -547,13 +689,13 @@
  * it is either truncated or split.  Anything entirely inside the range
  * is deleted from the tree.
  */
-int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
-		       u64 start, u64 end, u64 *hint_byte, int drop_cache)
+int __btrfs_drop_extents(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root, struct inode *inode,
+			 struct btrfs_path *path, u64 start, u64 end,
+			 u64 *drop_end, int drop_cache)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *fi;
-	struct btrfs_path *path;
 	struct btrfs_key key;
 	struct btrfs_key new_key;
 	u64 ino = btrfs_ino(inode);
@@ -567,18 +709,20 @@
 	int extent_type;
 	int recow;
 	int ret;
+	int modify_tree = -1;
+	int update_refs = (root->ref_cows || root == root->fs_info->tree_root);
+	int found = 0;
 
 	if (drop_cache)
 		btrfs_drop_extent_cache(inode, start, end - 1, 0);
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	if (start >= BTRFS_I(inode)->disk_i_size)
+		modify_tree = 0;
 
 	while (1) {
 		recow = 0;
 		ret = btrfs_lookup_file_extent(trans, root, path, ino,
-					       search_start, -1);
+					       search_start, modify_tree);
 		if (ret < 0)
 			break;
 		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
@@ -633,8 +777,10 @@
 			goto next_slot;
 		}
 
+		found = 1;
 		search_start = max(key.offset, start);
-		if (recow) {
+		if (recow || !modify_tree) {
+			modify_tree = -1;
 			btrfs_release_path(path);
 			continue;
 		}
@@ -673,14 +819,13 @@
 							extent_end - start);
 			btrfs_mark_buffer_dirty(leaf);
 
-			if (disk_bytenr > 0) {
+			if (update_refs && disk_bytenr > 0) {
 				ret = btrfs_inc_extent_ref(trans, root,
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
 						new_key.objectid,
-						start - extent_offset);
-				BUG_ON(ret);
-				*hint_byte = disk_bytenr;
+						start - extent_offset, 0);
+				BUG_ON(ret); /* -ENOMEM */
 			}
 			key.offset = start;
 		}
@@ -693,17 +838,15 @@
 
 			memcpy(&new_key, &key, sizeof(new_key));
 			new_key.offset = end;
-			btrfs_set_item_key_safe(trans, root, path, &new_key);
+			btrfs_set_item_key_safe(root, path, &new_key);
 
 			extent_offset += end - key.offset;
 			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_end - end);
 			btrfs_mark_buffer_dirty(leaf);
-			if (disk_bytenr > 0) {
+			if (update_refs && disk_bytenr > 0)
 				inode_sub_bytes(inode, end - key.offset);
-				*hint_byte = disk_bytenr;
-			}
 			break;
 		}
 
@@ -719,10 +862,8 @@
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							start - key.offset);
 			btrfs_mark_buffer_dirty(leaf);
-			if (disk_bytenr > 0) {
+			if (update_refs && disk_bytenr > 0)
 				inode_sub_bytes(inode, extent_end - start);
-				*hint_byte = disk_bytenr;
-			}
 			if (end == extent_end)
 				break;
 
@@ -743,21 +884,21 @@
 				del_nr++;
 			}
 
-			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
+			if (update_refs &&
+			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
 				inode_sub_bytes(inode,
 						extent_end - key.offset);
 				extent_end = ALIGN(extent_end,
 						   root->sectorsize);
-			} else if (disk_bytenr > 0) {
+			} else if (update_refs && disk_bytenr > 0) {
 				ret = btrfs_free_extent(trans, root,
 						disk_bytenr, num_bytes, 0,
 						root->root_key.objectid,
 						key.objectid, key.offset -
-						extent_offset);
-				BUG_ON(ret);
+						extent_offset, 0);
+				BUG_ON(ret); /* -ENOMEM */
 				inode_sub_bytes(inode,
 						extent_end - key.offset);
-				*hint_byte = disk_bytenr;
 			}
 
 			if (end == extent_end)
@@ -770,7 +911,10 @@
 
 			ret = btrfs_del_items(trans, root, path, del_slot,
 					      del_nr);
-			BUG_ON(ret);
+			if (ret) {
+				btrfs_abort_transaction(trans, root, ret);
+				break;
+			}
 
 			del_nr = 0;
 			del_slot = 0;
@@ -782,11 +926,30 @@
 		BUG_ON(1);
 	}
 
-	if (del_nr > 0) {
+	if (!ret && del_nr > 0) {
 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
-		BUG_ON(ret);
+		if (ret)
+			btrfs_abort_transaction(trans, root, ret);
 	}
 
+	if (drop_end)
+		*drop_end = found ? min(end, extent_end) : end;
+	btrfs_release_path(path);
+	return ret;
+}
+
+int btrfs_drop_extents(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root, struct inode *inode, u64 start,
+		       u64 end, int drop_cache)
+{
+	struct btrfs_path *path;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	ret = __btrfs_drop_extents(trans, root, inode, path, start, end, NULL,
+				   drop_cache);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -853,8 +1016,6 @@
 	int ret;
 	u64 ino = btrfs_ino(inode);
 
-	btrfs_drop_extent_cache(inode, start, end - 1, 0);
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -893,15 +1054,19 @@
 				     ino, bytenr, orig_offset,
 				     &other_start, &other_end)) {
 			new_key.offset = end;
-			btrfs_set_item_key_safe(trans, root, path, &new_key);
+			btrfs_set_item_key_safe(root, path, &new_key);
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_end - end);
 			btrfs_set_file_extent_offset(leaf, fi,
 						     end - orig_offset);
 			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							end - other_start);
 			btrfs_mark_buffer_dirty(leaf);
@@ -919,12 +1084,16 @@
 					    struct btrfs_file_extent_item);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							start - key.offset);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
 			path->slots[0]++;
 			new_key.offset = start;
-			btrfs_set_item_key_safe(trans, root, path, &new_key);
+			btrfs_set_item_key_safe(root, path, &new_key);
 
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
+			btrfs_set_file_extent_generation(leaf, fi,
+							 trans->transid);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							other_end - start);
 			btrfs_set_file_extent_offset(leaf, fi,
@@ -944,17 +1113,22 @@
 			btrfs_release_path(path);
 			goto again;
 		}
-		BUG_ON(ret < 0);
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
 
 		leaf = path->nodes[0];
 		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 				    struct btrfs_file_extent_item);
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 		btrfs_set_file_extent_num_bytes(leaf, fi,
 						split - key.offset);
 
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_file_extent_item);
 
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
 		btrfs_set_file_extent_num_bytes(leaf, fi,
 						extent_end - split);
@@ -962,8 +1136,8 @@
 
 		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
 					   root->root_key.objectid,
-					   ino, orig_offset);
-		BUG_ON(ret);
+					   ino, orig_offset, 0);
+		BUG_ON(ret); /* -ENOMEM */
 
 		if (split == start) {
 			key.offset = start;
@@ -989,8 +1163,8 @@
 		del_nr++;
 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 					0, root->root_key.objectid,
-					ino, orig_offset);
-		BUG_ON(ret);
+					ino, orig_offset, 0);
+		BUG_ON(ret); /* -ENOMEM */
 	}
 	other_start = 0;
 	other_end = start;
@@ -1006,26 +1180,31 @@
 		del_nr++;
 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 					0, root->root_key.objectid,
-					ino, orig_offset);
-		BUG_ON(ret);
+					ino, orig_offset, 0);
+		BUG_ON(ret); /* -ENOMEM */
 	}
 	if (del_nr == 0) {
 		fi = btrfs_item_ptr(leaf, path->slots[0],
 			   struct btrfs_file_extent_item);
 		btrfs_set_file_extent_type(leaf, fi,
 					   BTRFS_FILE_EXTENT_REG);
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 		btrfs_mark_buffer_dirty(leaf);
 	} else {
 		fi = btrfs_item_ptr(leaf, del_slot - 1,
 			   struct btrfs_file_extent_item);
 		btrfs_set_file_extent_type(leaf, fi,
 					   BTRFS_FILE_EXTENT_REG);
+		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 		btrfs_set_file_extent_num_bytes(leaf, fi,
 						extent_end - key.offset);
 		btrfs_mark_buffer_dirty(leaf);
 
 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
-		BUG_ON(ret);
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
 	}
 out:
 	btrfs_free_path(path);
@@ -1105,8 +1284,7 @@
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
 		lock_extent_bits(&BTRFS_I(inode)->io_tree,
-				 start_pos, last_pos - 1, 0, &cached_state,
-				 GFP_NOFS);
+				 start_pos, last_pos - 1, 0, &cached_state);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    last_pos - 1);
 		if (ordered &&
@@ -1129,14 +1307,15 @@
 
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
 				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-				  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
-				  GFP_NOFS);
+				  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
+				  0, 0, &cached_state, GFP_NOFS);
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 				     start_pos, last_pos - 1, &cached_state,
 				     GFP_NOFS);
 	}
 	for (i = 0; i < num_pages; i++) {
-		clear_page_dirty_for_io(pages[i]);
+		if (clear_page_dirty_for_io(pages[i]))
+			account_page_redirty(pages[i]);
 		set_page_extent_mapped(pages[i]);
 		WARN_ON(!PageLocked(pages[i]));
 	}
@@ -1151,6 +1330,252 @@
 
 }
 
+static noinline int check_can_nocow(struct inode *inode, loff_t pos,
+				    size_t *write_bytes)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ordered_extent *ordered;
+	u64 lockstart, lockend;
+	u64 num_bytes;
+	int ret;
+
+	lockstart = round_down(pos, root->sectorsize);
+	lockend = lockstart + round_up(*write_bytes, root->sectorsize) - 1;
+
+	while (1) {
+		lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+		if (!ordered) {
+			break;
+		}
+		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+	}
+
+	num_bytes = lockend - lockstart + 1;
+	ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
+	if (ret <= 0) {
+		ret = 0;
+	} else {
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 EXTENT_DIRTY | EXTENT_DELALLOC |
+				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+				 NULL, GFP_NOFS);
+		*write_bytes = min_t(size_t, *write_bytes, num_bytes);
+	}
+
+	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
+
+	return ret;
+}
+
+#ifdef MY_ABC_HERE
+#include <linux/tcp.h>
+#include <net/tcp.h>
+static noinline ssize_t btrfs_syno_recvfile(struct file *file, struct socket *sock,
+					loff_t *ppos, size_t count, size_t * rbytes, size_t * wbytes)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct msghdr   msg;
+	struct page *pages[MAX_PAGES_PER_RECVFILE + 1];
+	struct kvec iov[MAX_PAGES_PER_RECVFILE + 1];
+	u64 release_bytes = 0;
+	u64 start_pos = 0;
+	unsigned long first_index;
+	size_t num_written = 0;
+	int nrptrs;
+	int ret = 0, recv_meg_ret = 0;
+	int i;
+	loff_t pos = *ppos;
+	long rcvtimeo;
+	bool only_release_metadata = false;
+
+	start_pos = round_down(pos, root->sectorsize);
+	if (start_pos > i_size_read(inode)) {
+		ret = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
+		if (ret)
+			goto out;
+	}
+	nrptrs = min((count + PAGE_CACHE_SIZE - 1) /
+		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
+		     (sizeof(struct page *)));
+	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
+	nrptrs = max(nrptrs, MAX_PAGES_PER_RECVFILE);
+
+	first_index = pos >> PAGE_CACHE_SHIFT;
+	while (count > 0) {
+		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
+		size_t write_bytes = min(count,
+					 nrptrs * (size_t)PAGE_CACHE_SIZE -
+					 offset);
+		size_t num_pages = (write_bytes + offset +
+				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		size_t reserve_bytes;
+		size_t dirty_pages;
+		size_t copied = 0;
+
+		WARN_ON(num_pages > nrptrs);
+
+		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+		ret = btrfs_check_data_free_space(inode, reserve_bytes);
+		if (ret == -ENOSPC &&
+		    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+					      BTRFS_INODE_PREALLOC))) {
+			ret = check_can_nocow(inode, pos, &write_bytes);
+			if (ret > 0) {
+				only_release_metadata = true;
+				/*
+				 * our prealloc extent may be smaller than
+				 * write_bytes, so scale down.
+				 */
+				num_pages = (write_bytes + offset +
+					     PAGE_CACHE_SIZE - 1) >>
+					PAGE_CACHE_SHIFT;
+				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+				ret = 0;
+			} else {
+				ret = -ENOSPC;
+			}
+		}
+
+		if (ret)
+			break;
+
+		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
+		if (ret) {
+			if (!only_release_metadata)
+				btrfs_free_reserved_data_space(inode,
+							       reserve_bytes);
+			break;
+		}
+
+		release_bytes = reserve_bytes;
+		/*
+		 * This is going to setup the pages array with the number of
+		 * pages we want, so we don't really need to worry about the
+		 * contents of pages from loop to loop
+		 */
+		ret = prepare_pages(root, file, pages, num_pages,
+				    pos, first_index, write_bytes,
+				    false);
+		if (ret)
+			break;
+		iov[0].iov_base = kmap(pages[0]) + offset;
+		iov[0].iov_len = PAGE_CACHE_SIZE - offset;
+		for (i = 1; i < num_pages; i++) {
+			iov[i].iov_base = kmap(pages[i]);
+			iov[i].iov_len = PAGE_CACHE_SIZE;
+		}
+		if (0 != ((write_bytes - offset) & (PAGE_CACHE_SIZE - 1)) && 1 < num_pages) {
+			iov[num_pages-1].iov_len = (write_bytes - offset) & (PAGE_CACHE_SIZE - 1);
+		}
+		msg.msg_name = NULL;
+		msg.msg_namelen = 0;
+		msg.msg_iov = (struct iovec *) &iov[0];
+		msg.msg_iovlen = num_pages;
+		msg.msg_control = NULL;
+		msg.msg_controllen = 0;
+		msg.msg_flags = MSG_KERNSPACE;
+		rcvtimeo = sock->sk->sk_rcvtimeo;
+		sock->sk->sk_rcvtimeo = 64 * HZ;
+		recv_meg_ret = kernel_recvmsg(
+				sock, &msg, &iov[0], num_pages, write_bytes,
+				MSG_WAITALL | MSG_NOCATCHSIGNAL);
+		sock->sk->sk_rcvtimeo = rcvtimeo;
+		for (i = 0; i < num_pages; i++) {
+			kunmap(pages[i]);
+		}
+		if (0 > recv_meg_ret) {
+			btrfs_drop_pages(pages, num_pages);
+			ret = recv_meg_ret;
+			break;
+		}
+		copied = (size_t) recv_meg_ret;
+		*rbytes += copied;
+		if (write_bytes > recv_meg_ret) {
+			recv_meg_ret = -EPIPE;
+		}
+		dirty_pages = (copied + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		/*
+		 * If we had a short copy, we need to release the excess delalloc
+		 * bytes we reserved.  We need to increment outstanding_extents
+		 * because btrfs_delalloc_release_space will decrement it, but
+		 * we still have an outstanding extent for the chunk we actually
+		 * managed to copy.
+		 */
+		if (num_pages > dirty_pages) {
+			release_bytes = (num_pages - dirty_pages) <<
+				PAGE_CACHE_SHIFT;
+			if (copied > 0) {
+				spin_lock(&BTRFS_I(inode)->lock);
+				BTRFS_I(inode)->outstanding_extents++;
+				spin_unlock(&BTRFS_I(inode)->lock);
+			}
+			if (only_release_metadata)
+				btrfs_delalloc_release_metadata(inode,
+								release_bytes);
+			else
+				btrfs_delalloc_release_space(inode,
+							     release_bytes);
+		}
+
+		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
+		if (copied > 0) {
+			ret = btrfs_dirty_pages(root, inode, pages,
+						dirty_pages, pos, copied,
+						NULL);
+			if (ret) {
+				btrfs_drop_pages(pages, num_pages);
+				break;
+			}
+		}
+
+		release_bytes = 0;
+		btrfs_drop_pages(pages, num_pages);
+
+		if (only_release_metadata && copied > 0) {
+			u64 lockstart = round_down(pos, root->sectorsize);
+			u64 lockend = lockstart +
+				(dirty_pages << PAGE_CACHE_SHIFT) - 1;
+
+			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				       lockend, EXTENT_NORESERVE, NULL,
+				       NULL, GFP_NOFS);
+			only_release_metadata = false;
+		}
+		cond_resched();
+
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
+			btrfs_btree_balance_dirty(root);
+
+		pos += copied;
+		num_written += copied;
+		count -= copied;
+		if (recv_meg_ret == -EPIPE) {
+			ret = -EPIPE;
+			break;
+		}
+	}
+
+
+	if (release_bytes) {
+		if (only_release_metadata)
+			btrfs_delalloc_release_metadata(inode, release_bytes);
+		else
+			btrfs_delalloc_release_space(inode, release_bytes);
+	}
+	*wbytes = num_written;
+	*ppos = pos;
+out:
+
+	return ret ? ret : num_written;
+}
+#endif
+
 static noinline ssize_t __btrfs_buffered_write(struct file *file,
 					       struct iov_iter *i,
 					       loff_t pos)
@@ -1158,10 +1583,12 @@
 	struct inode *inode = fdentry(file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct page **pages = NULL;
+	u64 release_bytes = 0;
 	unsigned long first_index;
 	size_t num_written = 0;
 	int nrptrs;
 	int ret = 0;
+	bool only_release_metadata = false;
 	bool force_page_uptodate = false;
 
 	nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
@@ -1182,6 +1609,7 @@
 					 offset);
 		size_t num_pages = (write_bytes + offset +
 				    PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+		size_t reserve_bytes;
 		size_t dirty_pages;
 		size_t copied;
 
@@ -1196,11 +1624,41 @@
 			break;
 		}
 
-		ret = btrfs_delalloc_reserve_space(inode,
-					num_pages << PAGE_CACHE_SHIFT);
+		reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+		ret = btrfs_check_data_free_space(inode, reserve_bytes);
+		if (ret == -ENOSPC &&
+		    (BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+					      BTRFS_INODE_PREALLOC))) {
+			ret = check_can_nocow(inode, pos, &write_bytes);
+			if (ret > 0) {
+				only_release_metadata = true;
+				/*
+				 * our prealloc extent may be smaller than
+				 * write_bytes, so scale down.
+				 */
+				num_pages = (write_bytes + offset +
+					     PAGE_CACHE_SIZE - 1) >>
+					PAGE_CACHE_SHIFT;
+				reserve_bytes = num_pages << PAGE_CACHE_SHIFT;
+				ret = 0;
+			} else {
+				ret = -ENOSPC;
+			}
+		}
+
 		if (ret)
 			break;
 
+		ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes);
+		if (ret) {
+			if (!only_release_metadata)
+				btrfs_free_reserved_data_space(inode,
+							       reserve_bytes);
+			break;
+		}
+
+		release_bytes = reserve_bytes;
+
 		/*
 		 * This is going to setup the pages array with the number of
 		 * pages we want, so we don't really need to worry about the
@@ -1209,11 +1667,8 @@
 		ret = prepare_pages(root, file, pages, num_pages,
 				    pos, first_index, write_bytes,
 				    force_page_uptodate);
-		if (ret) {
-			btrfs_delalloc_release_space(inode,
-					num_pages << PAGE_CACHE_SHIFT);
+		if (ret)
 			break;
-		}
 
 		copied = btrfs_copy_from_user(pos, num_pages,
 					   write_bytes, pages, i);
@@ -1243,37 +1698,51 @@
 		 * managed to copy.
 		 */
 		if (num_pages > dirty_pages) {
+			release_bytes = (num_pages - dirty_pages) <<
+				PAGE_CACHE_SHIFT;
 			if (copied > 0) {
 				spin_lock(&BTRFS_I(inode)->lock);
 				BTRFS_I(inode)->outstanding_extents++;
 				spin_unlock(&BTRFS_I(inode)->lock);
 			}
-			btrfs_delalloc_release_space(inode,
-					(num_pages - dirty_pages) <<
-					PAGE_CACHE_SHIFT);
+			if (only_release_metadata)
+				btrfs_delalloc_release_metadata(inode,
+								release_bytes);
+			else
+				btrfs_delalloc_release_space(inode,
+							     release_bytes);
 		}
 
+		release_bytes = dirty_pages << PAGE_CACHE_SHIFT;
 		if (copied > 0) {
 			ret = btrfs_dirty_pages(root, inode, pages,
 						dirty_pages, pos, copied,
 						NULL);
 			if (ret) {
-				btrfs_delalloc_release_space(inode,
-					dirty_pages << PAGE_CACHE_SHIFT);
 				btrfs_drop_pages(pages, num_pages);
 				break;
 			}
 		}
 
+		release_bytes = 0;
 		btrfs_drop_pages(pages, num_pages);
 
+		if (only_release_metadata && copied > 0) {
+			u64 lockstart = round_down(pos, root->sectorsize);
+			u64 lockend = lockstart +
+				(dirty_pages << PAGE_CACHE_SHIFT) - 1;
+
+			set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				       lockend, EXTENT_NORESERVE, NULL,
+				       NULL, GFP_NOFS);
+			only_release_metadata = false;
+		}
+
 		cond_resched();
 
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-						   dirty_pages);
+		balance_dirty_pages_ratelimited(inode->i_mapping);
 		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
-			btrfs_btree_balance_dirty(root, 1);
-		btrfs_throttle(root);
+			btrfs_btree_balance_dirty(root);
 
 		pos += copied;
 		num_written += copied;
@@ -1281,6 +1750,13 @@
 
 	kfree(pages);
 
+	if (release_bytes) {
+		if (only_release_metadata)
+			btrfs_delalloc_release_metadata(inode, release_bytes);
+		else
+			btrfs_delalloc_release_space(inode, release_bytes);
+	}
+
 	return num_written ? num_written : ret;
 }
 
@@ -1290,7 +1766,6 @@
 				    loff_t *ppos, size_t count, size_t ocount)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = fdentry(file)->d_inode;
 	struct iov_iter i;
 	ssize_t written;
 	ssize_t written_buffered;
@@ -1300,18 +1775,6 @@
 	written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
 					    count, ocount);
 
-	/*
-	 * the generic O_DIRECT will update in-memory i_size after the
-	 * DIOs are done.  But our endio handlers that update the on
-	 * disk i_size never update past the in memory i_size.  So we
-	 * need one more update here to catch any additions to the
-	 * file
-	 */
-	if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
-		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
-		mark_inode_dirty(inode);
-	}
-
 	if (written < 0 || written == count)
 		return written;
 
@@ -1335,6 +1798,24 @@
 	return written ? written : err;
 }
 
+static void update_time_for_write(struct inode *inode)
+{
+	struct timespec now;
+
+	if (IS_NOCMTIME(inode))
+		return;
+
+	now = current_fs_time(inode->i_sb);
+	if (!timespec_equal(&inode->i_mtime, &now))
+		inode->i_mtime = now;
+
+	if (!timespec_equal(&inode->i_ctime, &now))
+		inode->i_ctime = now;
+
+	if (IS_I_VERSION(inode))
+		inode_inc_iversion(inode);
+}
+
 static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 				    const struct iovec *iov,
 				    unsigned long nr_segs, loff_t pos)
@@ -1347,8 +1828,9 @@
 	ssize_t num_written = 0;
 	ssize_t err = 0;
 	size_t count, ocount;
+	bool sync = (file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host);
 
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+	sb_start_write(inode->i_sb);
 
 	mutex_lock(&inode->i_mutex);
 
@@ -1383,18 +1865,19 @@
 	 * although we have opened a file as writable, we have
 	 * to stop this write operation to ensure FS consistency.
 	 */
-	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
 		mutex_unlock(&inode->i_mutex);
 		err = -EROFS;
 		goto out;
 	}
 
-	err = btrfs_update_time(file);
-	if (err) {
-		mutex_unlock(&inode->i_mutex);
-		goto out;
-	}
-	BTRFS_I(inode)->sequence++;
+	/*
+	 * We reserve space for updating the inode when we reserve space for the
+	 * extent we are going to write, so we will enospc out there.  We don't
+	 * need to start yet another transaction to update the inode as we will
+	 * update the inode when we finish writing whatever data we write.
+	 */
+	update_time_for_write(inode);
 
 	start_pos = round_down(pos, root->sectorsize);
 	if (start_pos > i_size_read(inode)) {
@@ -1405,6 +1888,9 @@
 		}
 	}
 
+	if (sync)
+		atomic_inc(&BTRFS_I(inode)->sync_writers);
+
 	if (unlikely(file->f_flags & O_DIRECT)) {
 		num_written = __btrfs_direct_write(iocb, iov, nr_segs,
 						   pos, ppos, count, ocount);
@@ -1431,14 +1917,23 @@
 	 * this will either be one more than the running transaction
 	 * or the generation used for the next transaction if there isn't
 	 * one running right now.
+	 *
+	 * We also have to set last_sub_trans to the current log transid,
+	 * otherwise subsequent syncs to a file that's been synced in this
+	 * transaction will appear to have already occurred.
 	 */
 	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+	BTRFS_I(inode)->last_sub_trans = root->log_transid;
 	if (num_written > 0 || num_written == -EIOCBQUEUED) {
 		err = generic_write_sync(file, pos, num_written);
 		if (err < 0 && num_written > 0)
 			num_written = err;
 	}
+
+	if (sync)
+		atomic_dec(&BTRFS_I(inode)->sync_writers);
 out:
+	sb_end_write(inode->i_sb);
 	current->backing_dev_info = NULL;
 	return num_written ? num_written : err;
 }
@@ -1451,9 +1946,22 @@
 	 * flush down new bytes that may have been written if the
 	 * application were using truncate to replace a file in place.
 	 */
-	if (BTRFS_I(inode)->ordered_data_close) {
-		BTRFS_I(inode)->ordered_data_close = 0;
-		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+	if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+			       &BTRFS_I(inode)->runtime_flags)) {
+		struct btrfs_trans_handle *trans;
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+
+		/*
+		 * We need to block on a committing transaction to keep us from
+		 * throwing an ordered operation onto the list and causing
+		 * something like sync to deadlock trying to flush out this
+		 * inode.
+		 */
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+		btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode);
+		btrfs_end_transaction(trans, root);
 		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
 			filemap_flush(inode->i_mapping);
 	}
@@ -1480,18 +1988,37 @@
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 	struct btrfs_trans_handle *trans;
+	bool full_sync = 0;
 
 	trace_btrfs_sync_file(file, datasync);
 
-	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
+	/*
+	 * We write the dirty pages in the range and wait until they complete
+	 * out of the ->i_mutex.  This way multiple tasks can flush the dirty
+	 * pages concurrently, which improves performance.  See
+	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
+	 */
+	atomic_inc(&BTRFS_I(inode)->sync_writers);
+	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			     &BTRFS_I(inode)->runtime_flags))
+		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
+	atomic_dec(&BTRFS_I(inode)->sync_writers);
 	if (ret)
 		return ret;
+
 	mutex_lock(&inode->i_mutex);
 
-	/* we wait first, since the writeback may change the inode */
-	root->log_batch++;
-	btrfs_wait_ordered_range(inode, 0, (u64)-1);
-	root->log_batch++;
+	/*
+	 * We flush the dirty pages again to make sure that no dirty pages in
+	 * the range are left behind.
+	 */
+	atomic_inc(&root->log_batch);
+	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			     &BTRFS_I(inode)->runtime_flags);
+	if (full_sync)
+		btrfs_wait_ordered_range(inode, start, end - start + 1);
+	atomic_inc(&root->log_batch);
 
 	/*
 	 * check the transaction that last modified this inode
@@ -1508,9 +2035,18 @@
 	 * syncing
 	 */
 	smp_mb();
-	if (BTRFS_I(inode)->last_trans <=
+	if (btrfs_inode_in_log(inode, root->fs_info->generation) ||
+	    BTRFS_I(inode)->last_trans <=
 	    root->fs_info->last_trans_committed) {
 		BTRFS_I(inode)->last_trans = 0;
+
+		/*
+		 * We've had everything committed since the last time we were
+		 * modified, so clear this flag in case it was set for whatever
+		 * reason; it's no longer relevant.
+		 */
+		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			  &BTRFS_I(inode)->runtime_flags);
 		mutex_unlock(&inode->i_mutex);
 		goto out;
 	}
@@ -1530,8 +2066,8 @@
 
 	ret = btrfs_log_dentry_safe(trans, root, dentry);
 	if (ret < 0) {
-		mutex_unlock(&inode->i_mutex);
-		goto out;
+		/* Fallthrough and commit/free transaction. */
+		ret = 1;
 	}
 
 	/* we've logged all the items and now have a consistent
@@ -1548,13 +2084,25 @@
 
 	if (ret != BTRFS_NO_LOG_SYNC) {
 		if (ret > 0) {
+			/*
+			 * If we didn't already wait for ordered extents we need
+			 * to do that now.
+			 */
+			if (!full_sync)
+				btrfs_wait_ordered_range(inode, start,
+							 end - start + 1);
 			ret = btrfs_commit_transaction(trans, root);
 		} else {
 			ret = btrfs_sync_log(trans, root);
-			if (ret == 0)
+			if (ret == 0) {
 				ret = btrfs_end_transaction(trans, root);
-			else
+			} else {
+				if (!full_sync)
+					btrfs_wait_ordered_range(inode, start,
+								 end -
+								 start + 1);
 				ret = btrfs_commit_transaction(trans, root);
+			}
 		}
 	} else {
 		ret = btrfs_end_transaction(trans, root);
@@ -1582,33 +2130,365 @@
 	return 0;
 }
 
+static int hole_mergeable(struct inode *inode, struct extent_buffer *leaf,
+			  int slot, u64 start, u64 end)
+{
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_key key;
+
+	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
+		return 0;
+
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+	if (key.objectid != btrfs_ino(inode) ||
+	    key.type != BTRFS_EXTENT_DATA_KEY)
+		return 0;
+
+	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+		return 0;
+
+	if (btrfs_file_extent_disk_bytenr(leaf, fi))
+		return 0;
+
+	if (key.offset == end)
+		return 1;
+	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
+		return 1;
+	return 0;
+}
+
+static int fill_holes(struct btrfs_trans_handle *trans, struct inode *inode,
+		      struct btrfs_path *path, u64 offset, u64 end)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_buffer *leaf;
+	struct btrfs_file_extent_item *fi;
+	struct extent_map *hole_em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = offset;
+
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret < 0)
+		return ret;
+	BUG_ON(!ret);
+
+	leaf = path->nodes[0];
+	if (hole_mergeable(inode, leaf, path->slots[0]-1, offset, end)) {
+		u64 num_bytes;
+
+		path->slots[0]--;
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
+			end - offset;
+		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_offset(leaf, fi, 0);
+		btrfs_mark_buffer_dirty(leaf);
+		goto out;
+	}
+
+	if (hole_mergeable(inode, leaf, path->slots[0]+1, offset, end)) {
+		u64 num_bytes;
+
+		path->slots[0]++;
+		key.offset = offset;
+		btrfs_set_item_key_safe(root, path, &key);
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
+			offset;
+		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
+		btrfs_set_file_extent_offset(leaf, fi, 0);
+		btrfs_mark_buffer_dirty(leaf);
+		goto out;
+	}
+	btrfs_release_path(path);
+
+	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode), offset,
+				       0, 0, end - offset, 0, end - offset,
+				       0, 0, 0);
+	if (ret)
+		return ret;
+
+out:
+	btrfs_release_path(path);
+
+	hole_em = alloc_extent_map();
+	if (!hole_em) {
+		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+	} else {
+		hole_em->start = offset;
+		hole_em->len = end - offset;
+		hole_em->ram_bytes = hole_em->len;
+		hole_em->orig_start = offset;
+
+		hole_em->block_start = EXTENT_MAP_HOLE;
+		hole_em->block_len = 0;
+		hole_em->orig_block_len = 0;
+		hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+		hole_em->compress_type = BTRFS_COMPRESS_NONE;
+		hole_em->generation = trans->transid;
+
+		do {
+			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
+			write_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, hole_em, 1);
+			write_unlock(&em_tree->lock);
+		} while (ret == -EEXIST);
+		free_extent_map(hole_em);
+		if (ret)
+			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				&BTRFS_I(inode)->runtime_flags);
+	}
+
+	return 0;
+}
+
+static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_state *cached_state = NULL;
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *rsv;
+	struct btrfs_trans_handle *trans;
+	u64 lockstart = round_up(offset, BTRFS_I(inode)->root->sectorsize);
+	u64 lockend = round_down(offset + len,
+				 BTRFS_I(inode)->root->sectorsize) - 1;
+	u64 cur_offset = lockstart;
+	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
+	u64 drop_end;
+	int ret = 0;
+	int err = 0;
+	bool same_page = ((offset >> PAGE_CACHE_SHIFT) ==
+			  ((offset + len - 1) >> PAGE_CACHE_SHIFT));
+
+	btrfs_wait_ordered_range(inode, offset, len);
+
+	mutex_lock(&inode->i_mutex);
+	/*
+	 * We needn't truncate any page which is beyond the end of the file
+	 * because we are sure there is no data there.
+	 */
+	/*
+	 * Only do this if we are in the same page and we aren't doing the
+	 * entire page.
+	 */
+	if (same_page && len < PAGE_CACHE_SIZE) {
+		if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE))
+			ret = btrfs_truncate_page(inode, offset, len, 0);
+		mutex_unlock(&inode->i_mutex);
+		return ret;
+	}
+
+	/* zero back part of the first page */
+	if (offset < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+		ret = btrfs_truncate_page(inode, offset, 0, 0);
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
+			return ret;
+		}
+	}
+
+	/* zero the front end of the last page */
+	if (offset + len < round_up(inode->i_size, PAGE_CACHE_SIZE)) {
+		ret = btrfs_truncate_page(inode, offset + len, 0, 1);
+		if (ret) {
+			mutex_unlock(&inode->i_mutex);
+			return ret;
+		}
+	}
+
+	if (lockend < lockstart) {
+		mutex_unlock(&inode->i_mutex);
+		return 0;
+	}
+
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+
+		truncate_pagecache_range(inode, lockstart, lockend);
+
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 0, &cached_state);
+		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
+
+		/*
+		 * We need to make sure we have no ordered extents in this range
+		 * and that nobody raced in and read a page in this range; if
+		 * either happened, we need to try again.
+		 */
+		if ((!ordered ||
+		    (ordered->file_offset + ordered->len < lockstart ||
+		     ordered->file_offset > lockend)) &&
+		     !test_range_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, EXTENT_UPTODATE, 0,
+				     cached_state)) {
+			if (ordered)
+				btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockend, &cached_state, GFP_NOFS);
+		btrfs_wait_ordered_range(inode, lockstart,
+					 lockend - lockstart + 1);
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
+	if (!rsv) {
+		ret = -ENOMEM;
+		goto out_free;
+	}
+	rsv->size = btrfs_calc_trunc_metadata_size(root, 1);
+	rsv->failfast = 1;
+
+	/*
+	 * 1 - update the inode
+	 * 1 - removing the extents in the range
+	 * 1 - adding the hole extent
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans)) {
+		err = PTR_ERR(trans);
+		goto out_free;
+	}
+
+	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+				      min_size);
+	BUG_ON(ret);
+	trans->block_rsv = rsv;
+
+	while (cur_offset < lockend) {
+		ret = __btrfs_drop_extents(trans, root, inode, path,
+					   cur_offset, lockend + 1,
+					   &drop_end, 1);
+		if (ret != -ENOSPC)
+			break;
+
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
+
+		ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+		if (ret) {
+			err = ret;
+			break;
+		}
+
+		cur_offset = drop_end;
+
+		ret = btrfs_update_inode(trans, root, inode);
+		if (ret) {
+			err = ret;
+			break;
+		}
+
+		btrfs_end_transaction(trans, root);
+		btrfs_btree_balance_dirty(root);
+
+		trans = btrfs_start_transaction(root, 3);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+			break;
+		}
+
+		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+					      rsv, min_size);
+		BUG_ON(ret);	/* shouldn't happen */
+		trans->block_rsv = rsv;
+	}
+
+	if (ret) {
+		err = ret;
+		goto out_trans;
+	}
+
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
+	ret = fill_holes(trans, inode, path, cur_offset, drop_end);
+	if (ret) {
+		err = ret;
+		goto out_trans;
+	}
+
+out_trans:
+	if (!trans)
+		goto out_free;
+
+	inode_inc_iversion(inode);
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
+	ret = btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root);
+out_free:
+	btrfs_free_path(path);
+	btrfs_free_block_rsv(root, rsv);
+out:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			     &cached_state, GFP_NOFS);
+	mutex_unlock(&inode->i_mutex);
+	if (ret && !err)
+		err = ret;
+	return err;
+}
+
 static long btrfs_fallocate(struct file *file, int mode,
 			    loff_t offset, loff_t len)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct extent_state *cached_state = NULL;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 cur_offset;
 	u64 last_byte;
 	u64 alloc_start;
 	u64 alloc_end;
 	u64 alloc_hint = 0;
 	u64 locked_end;
-	u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
 	struct extent_map *em;
+	int blocksize = BTRFS_I(inode)->root->sectorsize;
 	int ret;
 
-	alloc_start = offset & ~mask;
-	alloc_end =  (offset + len + mask) & ~mask;
+	alloc_start = round_down(offset, blocksize);
+	alloc_end = round_up(offset + len, blocksize);
 
-	/* We only support the FALLOC_FL_KEEP_SIZE mode */
-	if (mode & ~FALLOC_FL_KEEP_SIZE)
+	/* Make sure we aren't being given some crap mode */
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
 		return -EOPNOTSUPP;
 
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		return btrfs_punch_hole(inode, offset, len);
+
 	/*
-	 * wait for ordered IO before we have any locks.  We'll loop again
-	 * below with the locks held.
+	 * Make sure we have enough space before we do the
+	 * allocation.
 	 */
-	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
+	if (ret)
+		return ret;
+	if (root->fs_info->quota_enabled) {
+		ret = btrfs_qgroup_reserve(root, alloc_end - alloc_start);
+		if (ret)
+			goto out_reserve_fail;
+	}
 
 	mutex_lock(&inode->i_mutex);
 	ret = inode_newsize_ok(inode, alloc_end);
@@ -1620,8 +2500,23 @@
 					alloc_start);
 		if (ret)
 			goto out;
+	} else {
+		/*
+		 * If we are fallocating from the end of the file onward we
+		 * need to zero out the end of the page if i_size lands in the
+		 * middle of a page.
+		 */
+		ret = btrfs_truncate_page(inode, inode->i_size, 0, 0);
+		if (ret)
+			goto out;
 	}
 
+	/*
+	 * wait for ordered IO before we have any locks.  We'll loop again
+	 * below with the locks held.
+	 */
+	btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
+
 	locked_end = alloc_end - 1;
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
@@ -1630,7 +2525,7 @@
 		 * transaction
 		 */
 		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
-				 locked_end, 0, &cached_state, GFP_NOFS);
+				 locked_end, 0, &cached_state);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    alloc_end - 1);
 		if (ordered &&
@@ -1659,35 +2554,26 @@
 
 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
 				      alloc_end - cur_offset, 0);
-		BUG_ON(IS_ERR_OR_NULL(em));
+		if (IS_ERR_OR_NULL(em)) {
+			if (!em)
+				ret = -ENOMEM;
+			else
+				ret = PTR_ERR(em);
+			break;
+		}
 		last_byte = min(extent_map_end(em), alloc_end);
 		actual_end = min_t(u64, extent_map_end(em), offset + len);
-		last_byte = (last_byte + mask) & ~mask;
+		last_byte = ALIGN(last_byte, blocksize);
 
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-
-			/*
-			 * Make sure we have enough space before we do the
-			 * allocation.
-			 */
-			ret = btrfs_check_data_free_space(inode, last_byte -
-							  cur_offset);
-			if (ret) {
-				free_extent_map(em);
-				break;
-			}
-
 			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
 							last_byte - cur_offset,
 							1 << inode->i_blkbits,
 							offset + len,
 							&alloc_hint);
 
-			/* Let go of our reservation. */
-			btrfs_free_reserved_data_space(inode, last_byte -
-						       cur_offset);
 			if (ret < 0) {
 				free_extent_map(em);
 				break;
@@ -1715,6 +2601,11 @@
 			     &cached_state, GFP_NOFS);
 out:
 	mutex_unlock(&inode->i_mutex);
+	if (root->fs_info->quota_enabled)
+		btrfs_qgroup_free(root, alloc_end - alloc_start);
+out_reserve_fail:
+	/* Let go of our reservation. */
+	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 	return ret;
 }
 
@@ -1735,6 +2626,7 @@
 	if (lockend <= lockstart)
 		lockend = lockstart + root->sectorsize;
 
+	lockend--;
 	len = lockend - lockstart + 1;
 
 	len = max_t(u64, len, root->sectorsize);
@@ -1742,7 +2634,7 @@
 		return -ENXIO;
 
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
-			 &cached_state, GFP_NOFS);
+			 &cached_state);
 
 	/*
 	 * Delalloc is such a pain.  If we have a hole and we have pending
@@ -1761,7 +2653,7 @@
 						     start - root->sectorsize,
 						     root->sectorsize, 0);
 		if (IS_ERR(em)) {
-			ret = -ENXIO;
+			ret = PTR_ERR(em);
 			goto out;
 		}
 		last_end = em->start + em->len;
@@ -1773,7 +2665,7 @@
 	while (1) {
 		em = btrfs_get_extent_fiemap(inode, NULL, 0, start, len, 0);
 		if (IS_ERR(em)) {
-			ret = -ENXIO;
+			ret = PTR_ERR(em);
 			break;
 		}
 
@@ -1801,9 +2693,12 @@
 					}
 				}
 
-				*offset = start;
-				free_extent_map(em);
-				break;
+				if (!test_bit(EXTENT_FLAG_PREALLOC,
+					      &em->flags)) {
+					*offset = start;
+					free_extent_map(em);
+					break;
+				}
 			}
 		}
 
@@ -1886,7 +2781,28 @@
 	.fsync		= btrfs_sync_file,
 	.fallocate	= btrfs_fallocate,
 	.unlocked_ioctl	= btrfs_ioctl,
+#ifdef MY_ABC_HERE
+	.syno_recvfile  = btrfs_syno_recvfile,
+#endif
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_ioctl,
 #endif
 };
+
+void btrfs_auto_defrag_exit(void)
+{
+	if (btrfs_inode_defrag_cachep)
+		kmem_cache_destroy(btrfs_inode_defrag_cachep);
+}
+
+int btrfs_auto_defrag_init(void)
+{
+	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
+					sizeof(struct inode_defrag), 0,
+					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+					NULL);
+	if (!btrfs_inode_defrag_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
diff -ur a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
--- a/fs/btrfs/file-item.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/file-item.c	2014-02-17 11:56:58.000000000 +0100
@@ -23,16 +23,19 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "volumes.h"
 #include "print-tree.h"
 
-#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \
+#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
 				   sizeof(struct btrfs_item) * 2) / \
 				  size) - 1))
 
+#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
+				       PAGE_CACHE_SIZE))
+
 #define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \
 				   sizeof(struct btrfs_ordered_sum)) / \
-				   sizeof(struct btrfs_sector_sum) * \
-				   (r)->sectorsize - (r)->sectorsize)
+				   sizeof(u32) * (r)->sectorsize)
 
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
@@ -59,7 +62,7 @@
 				      sizeof(*item));
 	if (ret < 0)
 		goto out;
-	BUG_ON(ret);
+	BUG_ON(ret); /* Can't happen */
 	leaf = path->nodes[0];
 	item = btrfs_item_ptr(leaf, path->slots[0],
 			      struct btrfs_file_extent_item);
@@ -80,10 +83,11 @@
 	return ret;
 }
 
-struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  struct btrfs_path *path,
-					  u64 bytenr, int cow)
+static struct btrfs_csum_item *
+btrfs_lookup_csum(struct btrfs_trans_handle *trans,
+		  struct btrfs_root *root,
+		  struct btrfs_path *path,
+		  u64 bytenr, int cow)
 {
 	int ret;
 	struct btrfs_key file_key;
@@ -115,9 +119,11 @@
 		csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]);
 		csums_in_item /= csum_size;
 
-		if (csum_offset >= csums_in_item) {
+		if (csum_offset == csums_in_item) {
 			ret = -EFBIG;
 			goto fail;
+		} else if (csum_offset > csums_in_item) {
+			goto fail;
 		}
 	}
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
@@ -130,7 +136,6 @@
 	return ERR_PTR(ret);
 }
 
-
 int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid,
@@ -148,28 +153,54 @@
 	return ret;
 }
 
+static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err)
+{
+	kfree(bio->csum_allocated);
+}
 
 static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 				   struct inode *inode, struct bio *bio,
 				   u64 logical_offset, u32 *dst, int dio)
 {
-	u32 sum;
 	struct bio_vec *bvec = bio->bi_io_vec;
-	int bio_index = 0;
+	struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
+	struct btrfs_csum_item *item = NULL;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_path *path;
+	u8 *csum;
 	u64 offset = 0;
 	u64 item_start_offset = 0;
 	u64 item_last_offset = 0;
 	u64 disk_bytenr;
 	u32 diff;
+	int nblocks;
+	int bio_index = 0;
+	int count;
 	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
-	int ret;
-	struct btrfs_path *path;
-	struct btrfs_csum_item *item = NULL;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+
+	nblocks = bio->bi_size >> inode->i_sb->s_blocksize_bits;
+	if (!dst) {
+		if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
+			btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size,
+							    GFP_NOFS);
+			if (!btrfs_bio->csum_allocated) {
+				btrfs_free_path(path);
+				return -ENOMEM;
+			}
+			btrfs_bio->csum = btrfs_bio->csum_allocated;
+			btrfs_bio->end_io = btrfs_io_bio_endio_readpage;
+		} else {
+			btrfs_bio->csum = btrfs_bio->csum_inline;
+		}
+		csum = btrfs_bio->csum;
+	} else {
+		csum = (u8 *)dst;
+	}
+
 	if (bio->bi_size > PAGE_CACHE_SIZE * 8)
 		path->reada = 2;
 
@@ -181,7 +212,7 @@
 	 * read from the commit root and sidestep a nasty deadlock
 	 * between reading the free space cache and updating the csum tree.
 	 */
-	if (btrfs_is_free_space_inode(root, inode)) {
+	if (btrfs_is_free_space_inode(inode)) {
 		path->search_commit_root = 1;
 		path->skip_locking = 1;
 	}
@@ -192,8 +223,9 @@
 	while (bio_index < bio->bi_vcnt) {
 		if (!dio)
 			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
-		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
-		if (ret == 0)
+		count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
+					       (u32 *)csum, nblocks);
+		if (count)
 			goto found;
 
 		if (!item || disk_bytenr < item_start_offset ||
@@ -206,10 +238,8 @@
 			item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
 						 path, disk_bytenr, 0);
 			if (IS_ERR(item)) {
-				ret = PTR_ERR(item);
-				if (ret == -ENOENT || ret == -EFBIG)
-					ret = 0;
-				sum = 0;
+				count = 1;
+				memset(csum, 0, csum_size);
 				if (BTRFS_I(inode)->root->root_key.objectid ==
 				    BTRFS_DATA_RELOC_TREE_OBJECTID) {
 					set_extent_bits(io_tree, offset,
@@ -218,9 +248,7 @@
 				} else {
 					printk(KERN_INFO "btrfs no csum found "
 					       "for inode %llu start %llu\n",
-					       (unsigned long long)
-					       btrfs_ino(inode),
-					       (unsigned long long)offset);
+					       btrfs_ino(inode), offset);
 				}
 				item = NULL;
 				btrfs_release_path(path);
@@ -245,19 +273,20 @@
 		diff = disk_bytenr - item_start_offset;
 		diff = diff / root->sectorsize;
 		diff = diff * csum_size;
-
-		read_extent_buffer(path->nodes[0], &sum,
+		count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >>
+					    inode->i_sb->s_blocksize_bits);
+		read_extent_buffer(path->nodes[0], csum,
 				   ((unsigned long)item) + diff,
-				   csum_size);
+				   csum_size * count);
 found:
-		if (dst)
-			*dst++ = sum;
-		else
-			set_state_private(io_tree, offset, sum);
-		disk_bytenr += bvec->bv_len;
-		offset += bvec->bv_len;
-		bio_index++;
-		bvec++;
+		csum += count * csum_size;
+		nblocks -= count;
+		while (count--) {
+			disk_bytenr += bvec->bv_len;
+			offset += bvec->bv_len;
+			bio_index++;
+			bvec++;
+		}
 	}
 	btrfs_free_path(path);
 	return 0;
@@ -270,9 +299,19 @@
 }
 
 int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
-			      struct bio *bio, u64 offset, u32 *dst)
+			      struct btrfs_dio_private *dip, struct bio *bio,
+			      u64 offset)
 {
-	return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+	int len = (bio->bi_sector << 9) - dip->disk_bytenr;
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+	int ret;
+
+	len >>= inode->i_sb->s_blocksize_bits;
+	len *= csum_size;
+
+	ret = __btrfs_lookup_bio_sums(root, inode, bio, offset,
+				      (u32 *)(dip->csum + len), 1);
+	return ret;
 }
 
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
@@ -282,8 +321,8 @@
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
 	struct btrfs_csum_item *item;
+	LIST_HEAD(tmplist);
 	unsigned long offset;
 	int ret;
 	size_t size;
@@ -333,11 +372,8 @@
 
 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 		if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
-		    key.type != BTRFS_EXTENT_CSUM_KEY)
-			break;
-
-		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-		if (key.offset > end)
+		    key.type != BTRFS_EXTENT_CSUM_KEY ||
+		    key.offset > end)
 			break;
 
 		if (key.offset > start)
@@ -355,37 +391,41 @@
 				      struct btrfs_csum_item);
 		while (start < csum_end) {
 			size = min_t(size_t, csum_end - start,
-					MAX_ORDERED_SUM_BYTES(root));
+				     MAX_ORDERED_SUM_BYTES(root));
 			sums = kzalloc(btrfs_ordered_sum_size(root, size),
-					GFP_NOFS);
-			BUG_ON(!sums);
+				       GFP_NOFS);
+			if (!sums) {
+				ret = -ENOMEM;
+				goto fail;
+			}
 
-			sector_sum = sums->sums;
 			sums->bytenr = start;
-			sums->len = size;
+			sums->len = (int)size;
 
 			offset = (start - key.offset) >>
 				root->fs_info->sb->s_blocksize_bits;
 			offset *= csum_size;
+			size >>= root->fs_info->sb->s_blocksize_bits;
 
-			while (size > 0) {
-				read_extent_buffer(path->nodes[0],
-						&sector_sum->sum,
-						((unsigned long)item) +
-						offset, csum_size);
-				sector_sum->bytenr = start;
-
-				size -= root->sectorsize;
-				start += root->sectorsize;
-				offset += csum_size;
-				sector_sum++;
-			}
-			list_add_tail(&sums->list, list);
+			read_extent_buffer(path->nodes[0],
+					   sums->sums,
+					   ((unsigned long)item) + offset,
+					   csum_size * size);
+
+			start += root->sectorsize * size;
+			list_add_tail(&sums->list, &tmplist);
 		}
 		path->slots[0]++;
 	}
 	ret = 0;
 fail:
+	while (ret < 0 && !list_empty(&tmplist)) {
+		sums = list_first_entry(&tmplist, struct btrfs_ordered_sum, list);
+		list_del(&sums->list);
+		kfree(sums);
+	}
+	list_splice_tail(&tmplist, list);
+
 	btrfs_free_path(path);
 	return ret;
 }
@@ -394,23 +434,20 @@
 		       struct bio *bio, u64 file_start, int contig)
 {
 	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
 	struct btrfs_ordered_extent *ordered;
 	char *data;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
+	int index;
 	unsigned long total_bytes = 0;
 	unsigned long this_sum_bytes = 0;
 	u64 offset;
-	u64 disk_bytenr;
 
 	WARN_ON(bio->bi_vcnt <= 0);
 	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
 	if (!sums)
 		return -ENOMEM;
 
-	sector_sum = sums->sums;
-	disk_bytenr = (u64)bio->bi_sector << 9;
 	sums->len = bio->bi_size;
 	INIT_LIST_HEAD(&sums->list);
 
@@ -420,15 +457,16 @@
 		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 
 	ordered = btrfs_lookup_ordered_extent(inode, offset);
-	BUG_ON(!ordered);
-	sums->bytenr = ordered->start;
+	BUG_ON(!ordered); /* Logic error */
+	sums->bytenr = (u64)bio->bi_sector << 9;
+	index = 0;
 
 	while (bio_index < bio->bi_vcnt) {
 		if (!contig)
 			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 
-		if (!contig && (offset >= ordered->file_offset + ordered->len ||
-		    offset < ordered->file_offset)) {
+		if (offset >= ordered->file_offset + ordered->len ||
+		    offset < ordered->file_offset) {
 			unsigned long bytes_left;
 			sums->len = this_sum_bytes;
 			this_sum_bytes = 0;
@@ -439,30 +477,28 @@
 
 			sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left),
 				       GFP_NOFS);
-			BUG_ON(!sums);
-			sector_sum = sums->sums;
+			BUG_ON(!sums); /* -ENOMEM */
 			sums->len = bytes_left;
 			ordered = btrfs_lookup_ordered_extent(inode, offset);
-			BUG_ON(!ordered);
-			sums->bytenr = ordered->start;
+			BUG_ON(!ordered); /* Logic error */
+			sums->bytenr = ((u64)bio->bi_sector << 9) +
+				       total_bytes;
+			index = 0;
 		}
 
-		data = kmap_atomic(bvec->bv_page, KM_USER0);
-		sector_sum->sum = ~(u32)0;
-		sector_sum->sum = btrfs_csum_data(root,
-						  data + bvec->bv_offset,
-						  sector_sum->sum,
-						  bvec->bv_len);
-		kunmap_atomic(data, KM_USER0);
-		btrfs_csum_final(sector_sum->sum,
-				 (char *)&sector_sum->sum);
-		sector_sum->bytenr = disk_bytenr;
+		data = kmap_atomic(bvec->bv_page);
+		sums->sums[index] = ~(u32)0;
+		sums->sums[index] = btrfs_csum_data(data + bvec->bv_offset,
+						    sums->sums[index],
+						    bvec->bv_len);
+		kunmap_atomic(data);
+		btrfs_csum_final(sums->sums[index],
+				 (char *)(sums->sums + index));
 
-		sector_sum++;
 		bio_index++;
+		index++;
 		total_bytes += bvec->bv_len;
 		this_sum_bytes += bvec->bv_len;
-		disk_bytenr += bvec->bv_len;
 		offset += bvec->bv_len;
 		bvec++;
 	}
@@ -483,18 +519,16 @@
  * This calls btrfs_truncate_item with the correct args based on the
  * overlap, and fixes up the key as required.
  */
-static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
-				      struct btrfs_path *path,
-				      struct btrfs_key *key,
-				      u64 bytenr, u64 len)
+static noinline void truncate_one_csum(struct btrfs_root *root,
+				       struct btrfs_path *path,
+				       struct btrfs_key *key,
+				       u64 bytenr, u64 len)
 {
 	struct extent_buffer *leaf;
 	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	u64 csum_end;
 	u64 end_byte = bytenr + len;
 	u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
-	int ret;
 
 	leaf = path->nodes[0];
 	csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size;
@@ -510,7 +544,7 @@
 		 */
 		u32 new_size = (bytenr - key->offset) >> blocksize_bits;
 		new_size *= csum_size;
-		ret = btrfs_truncate_item(trans, root, path, new_size, 1);
+		btrfs_truncate_item(root, path, new_size, 1);
 	} else if (key->offset >= bytenr && csum_end > end_byte &&
 		   end_byte > key->offset) {
 		/*
@@ -522,15 +556,13 @@
 		u32 new_size = (csum_end - end_byte) >> blocksize_bits;
 		new_size *= csum_size;
 
-		ret = btrfs_truncate_item(trans, root, path, new_size, 0);
+		btrfs_truncate_item(root, path, new_size, 0);
 
 		key->offset = end_byte;
-		ret = btrfs_set_item_key_safe(trans, root, path, key);
-		BUG_ON(ret);
+		btrfs_set_item_key_safe(root, path, key);
 	} else {
 		BUG();
 	}
-	return 0;
 }
 
 /*
@@ -635,13 +667,14 @@
 			 * item changed size or key
 			 */
 			ret = btrfs_split_item(trans, root, path, &key, offset);
-			BUG_ON(ret && ret != -EAGAIN);
+			if (ret && ret != -EAGAIN) {
+				btrfs_abort_transaction(trans, root, ret);
+				goto out;
+			}
 
 			key.offset = end_byte - 1;
 		} else {
-			ret = truncate_one_csum(trans, root, path,
-						&key, bytenr, len);
-			BUG_ON(ret);
+			truncate_one_csum(root, path, &key, bytenr, len);
 			if (key.offset < bytenr)
 				break;
 		}
@@ -657,40 +690,42 @@
 			   struct btrfs_root *root,
 			   struct btrfs_ordered_sum *sums)
 {
-	u64 bytenr;
-	int ret;
 	struct btrfs_key file_key;
 	struct btrfs_key found_key;
-	u64 next_offset;
-	u64 total_bytes = 0;
-	int found_next;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item;
 	struct btrfs_csum_item *item_end;
 	struct extent_buffer *leaf = NULL;
+	u64 next_offset;
+	u64 total_bytes = 0;
 	u64 csum_offset;
-	struct btrfs_sector_sum *sector_sum;
+	u64 bytenr;
 	u32 nritems;
 	u32 ins_size;
+	int index = 0;
+	int found_next;
+	int ret;
 	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-
-	sector_sum = sums->sums;
 again:
 	next_offset = (u64)-1;
 	found_next = 0;
+	bytenr = sums->bytenr + total_bytes;
 	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
-	file_key.offset = sector_sum->bytenr;
-	bytenr = sector_sum->bytenr;
+	file_key.offset = bytenr;
 	btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY);
 
-	item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1);
+	item = btrfs_lookup_csum(trans, root, path, bytenr, 1);
 	if (!IS_ERR(item)) {
-		leaf = path->nodes[0];
 		ret = 0;
+		leaf = path->nodes[0];
+		item_end = btrfs_item_ptr(leaf, path->slots[0],
+					  struct btrfs_csum_item);
+		item_end = (struct btrfs_csum_item *)((char *)item_end +
+			   btrfs_item_size_nr(leaf, path->slots[0]));
 		goto found;
 	}
 	ret = PTR_ERR(item);
@@ -757,22 +792,34 @@
 		goto insert;
 	}
 
-	if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) /
+	if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) /
 	    csum_size) {
-		u32 diff = (csum_offset + 1) * csum_size;
+		int extend_nr;
+		u64 tmp;
+		u32 diff;
+		u32 free_space;
 
-		/*
-		 * is the item big enough already?  we dropped our lock
-		 * before and need to recheck
-		 */
-		if (diff < btrfs_item_size_nr(leaf, path->slots[0]))
-			goto csum;
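+		/*
+		 * If there isn't room to grow this item by at least one more
+		 * checksum, insert a new csum item instead.
+		 */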
+		if (btrfs_leaf_free_space(root, leaf) <
+				 sizeof(struct btrfs_item) + csum_size * 2)
+			goto insert;
+
+		free_space = btrfs_leaf_free_space(root, leaf) -
+					 sizeof(struct btrfs_item) - csum_size;
+		tmp = sums->len - total_bytes;
+		tmp >>= root->fs_info->sb->s_blocksize_bits;
+		WARN_ON(tmp < 1);
+
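+		/*
+		 * Extend the item by enough slots for the remaining checksums,
+		 * capped by the max csum item size and the leaf free space.
+		 */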
+		extend_nr = max_t(int, 1, (int)tmp);
+		diff = (csum_offset + extend_nr) * csum_size;
+		diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size);
 
 		diff = diff - btrfs_item_size_nr(leaf, path->slots[0]);
-		if (diff != csum_size)
-			goto insert;
+		diff = min(free_space, diff);
+		diff /= csum_size;
+		diff *= csum_size;
 
-		ret = btrfs_extend_item(trans, root, path, diff);
+		btrfs_extend_item(root, path, diff);
+		ret = 0;
 		goto csum;
 	}
 
@@ -780,19 +827,13 @@
 	btrfs_release_path(path);
 	csum_offset = 0;
 	if (found_next) {
-		u64 tmp = total_bytes + root->sectorsize;
-		u64 next_sector = sector_sum->bytenr;
-		struct btrfs_sector_sum *next = sector_sum + 1;
+		u64 tmp;
 
-		while (tmp < sums->len) {
-			if (next_sector + root->sectorsize != next->bytenr)
-				break;
-			tmp += root->sectorsize;
-			next_sector = next->bytenr;
-			next++;
-		}
-		tmp = min(tmp, next_offset - file_key.offset);
+		tmp = sums->len - total_bytes;
 		tmp >>= root->fs_info->sb->s_blocksize_bits;
+		tmp = min(tmp, (next_offset - file_key.offset) >>
+					 root->fs_info->sb->s_blocksize_bits);
+
 		tmp = max((u64)1, tmp);
 		tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size));
 		ins_size = csum_size * tmp;
@@ -809,31 +850,25 @@
 		WARN_ON(1);
 		goto fail_unlock;
 	}
-csum:
 	leaf = path->nodes[0];
+csum:
 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
-	ret = 0;
+	item_end = (struct btrfs_csum_item *)((unsigned char *)item +
+				      btrfs_item_size_nr(leaf, path->slots[0]));
 	item = (struct btrfs_csum_item *)((unsigned char *)item +
 					  csum_offset * csum_size);
 found:
-	item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item);
-	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
-				      btrfs_item_size_nr(leaf, path->slots[0]));
-next_sector:
-
-	write_extent_buffer(leaf, &sector_sum->sum, (unsigned long)item, csum_size);
-
-	total_bytes += root->sectorsize;
-	sector_sum++;
-	if (total_bytes < sums->len) {
-		item = (struct btrfs_csum_item *)((char *)item +
-						  csum_size);
-		if (item < item_end && bytenr + PAGE_CACHE_SIZE ==
-		    sector_sum->bytenr) {
-			bytenr = sector_sum->bytenr;
-			goto next_sector;
-		}
-	}
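+	/*
+	 * Copy as many of the remaining checksums as fit in this item with a
+	 * single write_extent_buffer call.
+	 */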
+	ins_size = (u32)(sums->len - total_bytes) >>
+		   root->fs_info->sb->s_blocksize_bits;
+	ins_size *= csum_size;
+	ins_size = min_t(u32, (unsigned long)item_end - (unsigned long)item,
+			      ins_size);
+	write_extent_buffer(leaf, sums->sums + index, (unsigned long)item,
+			    ins_size);
+
+	ins_size /= csum_size;
+	total_bytes += ins_size * root->sectorsize;
+	index += ins_size;
 
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	if (total_bytes < sums->len) {
diff -ur a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
--- a/fs/btrfs/free-space-cache.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/free-space-cache.c	2014-02-17 11:56:58.000000000 +0100
@@ -33,6 +33,8 @@
 
 static int link_free_space(struct btrfs_free_space_ctl *ctl,
 			   struct btrfs_free_space *info);
+static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
+			      struct btrfs_free_space *info);
 
 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
 					       struct btrfs_path *path,
@@ -75,7 +77,8 @@
 		return ERR_PTR(-ENOENT);
 	}
 
-	inode->i_mapping->flags &= ~__GFP_FS;
+	mapping_set_gfp_mask(inode->i_mapping,
+			mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
 
 	return inode;
 }
@@ -101,7 +104,8 @@
 
 	spin_lock(&block_group->lock);
 	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
-		printk(KERN_INFO "Old style space inode found, converting.\n");
+		btrfs_info(root->fs_info,
+			"Old style space inode found, converting.");
 		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
 			BTRFS_INODE_NODATACOW;
 		block_group->disk_cache_state = BTRFS_DC_CLEAR;
@@ -116,9 +120,10 @@
 	return inode;
 }
 
-int __create_free_space_inode(struct btrfs_root *root,
-			      struct btrfs_trans_handle *trans,
-			      struct btrfs_path *path, u64 ino, u64 offset)
+static int __create_free_space_inode(struct btrfs_root *root,
+				     struct btrfs_trans_handle *trans,
+				     struct btrfs_path *path,
+				     u64 ino, u64 offset)
 {
 	struct btrfs_key key;
 	struct btrfs_disk_key disk_key;
@@ -192,30 +197,32 @@
 					 block_group->key.objectid);
 }
 
-int btrfs_truncate_free_space_cache(struct btrfs_root *root,
-				    struct btrfs_trans_handle *trans,
-				    struct btrfs_path *path,
-				    struct inode *inode)
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+				       struct btrfs_block_rsv *rsv)
 {
-	struct btrfs_block_rsv *rsv;
 	u64 needed_bytes;
-	loff_t oldsize;
-	int ret = 0;
-
-	rsv = trans->block_rsv;
-	trans->block_rsv = &root->fs_info->global_block_rsv;
+	int ret;
 
 	/* 1 for slack space, 1 for updating the inode */
 	needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
 		btrfs_calc_trans_metadata_size(root, 1);
 
-	spin_lock(&trans->block_rsv->lock);
-	if (trans->block_rsv->reserved < needed_bytes) {
-		spin_unlock(&trans->block_rsv->lock);
-		trans->block_rsv = rsv;
-		return -ENOSPC;
-	}
-	spin_unlock(&trans->block_rsv->lock);
+	spin_lock(&rsv->lock);
+	if (rsv->reserved < needed_bytes)
+		ret = -ENOSPC;
+	else
+		ret = 0;
+	spin_unlock(&rsv->lock);
+	return ret;
+}
+
+int btrfs_truncate_free_space_cache(struct btrfs_root *root,
+				    struct btrfs_trans_handle *trans,
+				    struct btrfs_path *path,
+				    struct inode *inode)
+{
+	loff_t oldsize;
+	int ret = 0;
 
 	oldsize = i_size_read(inode);
 	btrfs_i_size_write(inode, 0);
@@ -227,15 +234,14 @@
 	 */
 	ret = btrfs_truncate_inode_items(trans, root, inode,
 					 0, BTRFS_EXTENT_DATA_KEY);
-
 	if (ret) {
-		trans->block_rsv = rsv;
-		WARN_ON(1);
+		btrfs_abort_transaction(trans, root, ret);
 		return ret;
 	}
 
 	ret = btrfs_update_inode(trans, root, inode);
-	trans->block_rsv = rsv;
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
 
 	return ret;
 }
@@ -302,8 +308,7 @@
 
 static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
 {
-	WARN_ON(io_ctl->cur);
-	BUG_ON(io_ctl->index >= io_ctl->num_pages);
+	ASSERT(io_ctl->index < io_ctl->num_pages);
 	io_ctl->page = io_ctl->pages[io_ctl->index++];
 	io_ctl->cur = kmap(io_ctl->page);
 	io_ctl->orig = io_ctl->cur;
@@ -319,9 +324,11 @@
 	io_ctl_unmap_page(io_ctl);
 
 	for (i = 0; i < io_ctl->num_pages; i++) {
-		ClearPageChecked(io_ctl->pages[i]);
-		unlock_page(io_ctl->pages[i]);
-		page_cache_release(io_ctl->pages[i]);
+		if (io_ctl->pages[i]) {
+			ClearPageChecked(io_ctl->pages[i]);
+			unlock_page(io_ctl->pages[i]);
+			page_cache_release(io_ctl->pages[i]);
+		}
 	}
 }
 
@@ -361,7 +368,7 @@
 
 static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
 {
-	u64 *val;
+	__le64 *val;
 
 	io_ctl_map_page(io_ctl, 1);
 
@@ -384,7 +391,7 @@
 
 static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
 {
-	u64 *gen;
+	__le64 *gen;
 
 	/*
 	 * Skip the crc area.  If we don't check crcs then we just have a 64bit
@@ -423,9 +430,9 @@
 	}
 
 	if (index == 0)
-		offset = sizeof(u32) * io_ctl->num_pages;;
+		offset = sizeof(u32) * io_ctl->num_pages;
 
-	crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
+	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
 			      PAGE_CACHE_SIZE - offset);
 	btrfs_csum_final(crc, (char *)&crc);
 	io_ctl_unmap_page(io_ctl);
@@ -455,7 +462,7 @@
 	kunmap(io_ctl->pages[0]);
 
 	io_ctl_map_page(io_ctl, 0);
-	crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
+	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
 			      PAGE_CACHE_SIZE - offset);
 	btrfs_csum_final(crc, (char *)&crc);
 	if (val != crc) {
@@ -580,9 +587,47 @@
 	return 0;
 }
 
-int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
-			    struct btrfs_free_space_ctl *ctl,
-			    struct btrfs_path *path, u64 offset)
+/*
+ * Since we attach pinned extents after the fact, we can have contiguous
+ * sections of free space that are split up into multiple entries.  This poses
+ * a problem for the tree logging code, which could have allocated across what
+ * appears to be two entries, because we would have merged those entries when
+ * adding the pinned extents back to the free space cache.  So run through the
+ * space cache that we just loaded and merge contiguous entries.  This keeps
+ * log replay from blowing up and makes for nicer allocator behavior.
+ */
+static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
+{
+	struct btrfs_free_space *e, *prev = NULL;
+	struct rb_node *n;
+
+again:
+	spin_lock(&ctl->tree_lock);
+	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
+		e = rb_entry(n, struct btrfs_free_space, offset_index);
+		if (!prev)
+			goto next;
+		if (e->bitmap || prev->bitmap)
+			goto next;
+		if (prev->offset + prev->bytes == e->offset) {
+			unlink_free_space(ctl, prev);
+			unlink_free_space(ctl, e);
+			prev->bytes += e->bytes;
+			kmem_cache_free(btrfs_free_space_cachep, e);
+			link_free_space(ctl, prev);
+			prev = NULL;
+			spin_unlock(&ctl->tree_lock);
+			goto again;
+		}
+next:
+		prev = e;
+	}
+	spin_unlock(&ctl->tree_lock);
+}
+
+static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
+				   struct btrfs_free_space_ctl *ctl,
+				   struct btrfs_path *path, u64 offset)
 {
 	struct btrfs_free_space_header *header;
 	struct extent_buffer *leaf;
@@ -625,17 +670,20 @@
 	btrfs_release_path(path);
 
 	if (BTRFS_I(inode)->generation != generation) {
-		printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
-		       " not match free space cache generation (%llu)\n",
-		       (unsigned long long)BTRFS_I(inode)->generation,
-		       (unsigned long long)generation);
+		btrfs_err(root->fs_info,
+			"free space inode generation (%llu) "
+			"did not match free space cache generation (%llu)",
+			BTRFS_I(inode)->generation, generation);
 		return 0;
 	}
 
 	if (!num_entries)
 		return 0;
 
-	io_ctl_init(&io_ctl, inode, root);
+	ret = io_ctl_init(&io_ctl, inode, root);
+	if (ret)
+		return ret;
+
 	ret = readahead_cache(inode);
 	if (ret)
 		goto out;
@@ -674,13 +722,13 @@
 			ret = link_free_space(ctl, e);
 			spin_unlock(&ctl->tree_lock);
 			if (ret) {
-				printk(KERN_ERR "Duplicate entries in "
-				       "free space cache, dumping\n");
+				btrfs_err(root->fs_info,
+					"Duplicate entries in free space cache, dumping");
 				kmem_cache_free(btrfs_free_space_cachep, e);
 				goto free_cache;
 			}
 		} else {
-			BUG_ON(!num_bitmaps);
+			ASSERT(num_bitmaps);
 			num_bitmaps--;
 			e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
 			if (!e->bitmap) {
@@ -694,8 +742,8 @@
 			ctl->op->recalc_thresholds(ctl);
 			spin_unlock(&ctl->tree_lock);
 			if (ret) {
-				printk(KERN_ERR "Duplicate entries in "
-				       "free space cache, dumping\n");
+				btrfs_err(root->fs_info,
+					"Duplicate entries in free space cache, dumping");
 				kmem_cache_free(btrfs_free_space_cachep, e);
 				goto free_cache;
 			}
@@ -719,6 +767,7 @@
 	}
 
 	io_ctl_drop_pages(&io_ctl);
+	merge_space_tree(ctl);
 	ret = 1;
 out:
 	io_ctl_free(&io_ctl);
@@ -741,13 +790,6 @@
 	u64 used = btrfs_block_group_used(&block_group->item);
 
 	/*
-	 * If we're unmounting then just return, since this does a search on the
-	 * normal root and not the commit root and we could deadlock.
-	 */
-	if (btrfs_fs_closing(fs_info))
-		return 0;
-
-	/*
 	 * If this block group has been marked to be cleared for one reason or
 	 * another then we can't trust the on disk cache, so just return.
 	 */
@@ -761,6 +803,8 @@
 	path = btrfs_alloc_path();
 	if (!path)
 		return 0;
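+	/*
+	 * Read from the commit root and skip tree locking so loading the
+	 * cache does not block on the currently running transaction.
+	 */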
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
 
 	inode = lookup_free_space_inode(root, block_group, path);
 	if (IS_ERR(inode)) {
@@ -772,6 +816,7 @@
 	spin_lock(&block_group->lock);
 	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
 		spin_unlock(&block_group->lock);
+		btrfs_free_path(path);
 		goto out;
 	}
 	spin_unlock(&block_group->lock);
@@ -789,8 +834,8 @@
 
 	if (!matched) {
 		__btrfs_remove_free_space_cache(ctl);
-		printk(KERN_ERR "block group %llu has an wrong amount of free "
-		       "space\n", block_group->key.objectid);
+		btrfs_err(fs_info, "block group %llu has wrong amount of free space",
+			block_group->key.objectid);
 		ret = -1;
 	}
 out:
@@ -801,8 +846,8 @@
 		spin_unlock(&block_group->lock);
 		ret = 0;
 
-		printk(KERN_ERR "btrfs: failed to load free space cache "
-		       "for block group %llu\n", block_group->key.objectid);
+		btrfs_err(fs_info, "failed to load free space cache for block group %llu",
+			block_group->key.objectid);
 	}
 
 	iput(inode);
@@ -822,11 +867,11 @@
  * on mount.  This will return 0 if it was successful in writing the cache out,
  * and -1 if it was not.
  */
-int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
-			    struct btrfs_free_space_ctl *ctl,
-			    struct btrfs_block_group_cache *block_group,
-			    struct btrfs_trans_handle *trans,
-			    struct btrfs_path *path, u64 offset)
+static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
+				   struct btrfs_free_space_ctl *ctl,
+				   struct btrfs_block_group_cache *block_group,
+				   struct btrfs_trans_handle *trans,
+				   struct btrfs_path *path, u64 offset)
 {
 	struct btrfs_free_space_header *header;
 	struct extent_buffer *leaf;
@@ -838,7 +883,7 @@
 	struct io_ctl io_ctl;
 	struct list_head bitmap_list;
 	struct btrfs_key key;
-	u64 start, end, len;
+	u64 start, extent_start, extent_end, len;
 	int entries = 0;
 	int bitmaps = 0;
 	int ret;
@@ -849,7 +894,9 @@
 	if (!i_size_read(inode))
 		return -1;
 
-	io_ctl_init(&io_ctl, inode, root);
+	ret = io_ctl_init(&io_ctl, inode, root);
+	if (ret)
+		return -1;
 
 	/* Get the cluster for this block_group if it exists */
 	if (block_group && !list_empty(&block_group->cluster_list))
@@ -857,24 +904,11 @@
 				     struct btrfs_free_cluster,
 				     block_group_list);
 
-	/*
-	 * We shouldn't have switched the pinned extents yet so this is the
-	 * right one
-	 */
-	unpin = root->fs_info->pinned_extents;
-
 	/* Lock all pages first so we can lock the extent safely. */
 	io_ctl_prepare_pages(&io_ctl, inode, 0);
 
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
-			 0, &cached_state, GFP_NOFS);
-
-	/*
-	 * When searching for pinned extents, we need to start at our start
-	 * offset.
-	 */
-	if (block_group)
-		start = block_group->key.objectid;
+			 0, &cached_state);
 
 	node = rb_first(&ctl->free_space_offset);
 	if (!node && cluster) {
@@ -884,10 +918,8 @@
 
 	/* Make sure we can fit our crcs into the first page */
 	if (io_ctl.check_crcs &&
-	    (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
-		WARN_ON(1);
+	    (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
 		goto out_nospc;
-	}
 
 	io_ctl_set_generation(&io_ctl, trans->transid);
 
@@ -918,30 +950,42 @@
 	 * We want to add any pinned extents to our free space cache
 	 * so we don't leak the space
 	 */
+
+	/*
+	 * We shouldn't have switched the pinned extents yet so this is the
+	 * right one
+	 */
+	unpin = root->fs_info->pinned_extents;
+
+	if (block_group)
+		start = block_group->key.objectid;
+
 	while (block_group && (start < block_group->key.objectid +
 			       block_group->key.offset)) {
-		ret = find_first_extent_bit(unpin, start, &start, &end,
-					    EXTENT_DIRTY);
+		ret = find_first_extent_bit(unpin, start,
+					    &extent_start, &extent_end,
+					    EXTENT_DIRTY, NULL);
 		if (ret) {
 			ret = 0;
 			break;
 		}
 
 		/* This pinned extent is out of our range */
-		if (start >= block_group->key.objectid +
+		if (extent_start >= block_group->key.objectid +
 		    block_group->key.offset)
 			break;
 
-		len = block_group->key.objectid +
-			block_group->key.offset - start;
-		len = min(len, end + 1 - start);
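+		/* Clamp the pinned extent to the block group and to our start offset */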
+		extent_start = max(extent_start, start);
+		extent_end = min(block_group->key.objectid +
+				 block_group->key.offset, extent_end + 1);
+		len = extent_end - extent_start;
 
 		entries++;
-		ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
+		ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
 		if (ret)
 			goto out_nospc;
 
-		start = end + 1;
+		start = extent_end;
 	}
 
 	/* Write out the bitmaps */
@@ -968,9 +1012,7 @@
 		goto out;
 
 
-	ret = filemap_write_and_wait(inode->i_mapping);
-	if (ret)
-		goto out;
+	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 	key.offset = offset;
@@ -986,7 +1028,7 @@
 	leaf = path->nodes[0];
 	if (ret > 0) {
 		struct btrfs_key found_key;
-		BUG_ON(!path->slots[0]);
+		ASSERT(path->slots[0]);
 		path->slots[0]--;
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
@@ -1061,8 +1103,9 @@
 		spin_unlock(&block_group->lock);
 		ret = 0;
 #ifdef DEBUG
-		printk(KERN_ERR "btrfs: failed to write free space cace "
-		       "for block group %llu\n", block_group->key.objectid);
+		btrfs_err(root->fs_info,
+			"failed to write free space cache for block group %llu",
+			block_group->key.objectid);
 #endif
 	}
 
@@ -1073,7 +1116,7 @@
 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
 					  u64 offset)
 {
-	BUG_ON(offset < bitmap_start);
+	ASSERT(offset >= bitmap_start);
 	offset -= bitmap_start;
 	return (unsigned long)(div_u64(offset, unit));
 }
@@ -1206,18 +1249,13 @@
 			 * if previous extent entry covers the offset,
 			 * we should return it instead of the bitmap entry
 			 */
-			n = &entry->offset_index;
-			while (1) {
-				n = rb_prev(n);
-				if (!n)
-					break;
+			n = rb_prev(&entry->offset_index);
+			if (n) {
 				prev = rb_entry(n, struct btrfs_free_space,
 						offset_index);
-				if (!prev->bitmap) {
-					if (prev->offset + prev->bytes > offset)
-						entry = prev;
-					break;
-				}
+				if (!prev->bitmap &&
+				    prev->offset + prev->bytes > offset)
+					entry = prev;
 			}
 		}
 		return entry;
@@ -1233,7 +1271,7 @@
 		if (n) {
 			entry = rb_entry(n, struct btrfs_free_space,
 					offset_index);
-			BUG_ON(entry->offset > offset);
+			ASSERT(entry->offset <= offset);
 		} else {
 			if (fuzzy)
 				return entry;
@@ -1243,18 +1281,13 @@
 	}
 
 	if (entry->bitmap) {
-		n = &entry->offset_index;
-		while (1) {
-			n = rb_prev(n);
-			if (!n)
-				break;
+		n = rb_prev(&entry->offset_index);
+		if (n) {
 			prev = rb_entry(n, struct btrfs_free_space,
 					offset_index);
-			if (!prev->bitmap) {
-				if (prev->offset + prev->bytes > offset)
-					return prev;
-				break;
-			}
+			if (!prev->bitmap &&
+			    prev->offset + prev->bytes > offset)
+				return prev;
 		}
 		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
 			return entry;
@@ -1302,7 +1335,7 @@
 {
 	int ret = 0;
 
-	BUG_ON(!info->bitmap && !info->bytes);
+	ASSERT(info->bytes || info->bitmap);
 	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
 				 &info->offset_index, (info->bitmap != NULL));
 	if (ret)
@@ -1320,10 +1353,12 @@
 	u64 bitmap_bytes;
 	u64 extent_bytes;
 	u64 size = block_group->key.offset;
-	u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
+	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
 	int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
 
-	BUG_ON(ctl->total_bitmaps > max_bitmaps);
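+	/* Always allow at least one bitmap, even for tiny block groups */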
+	max_bitmaps = max(max_bitmaps, 1);
+
+	ASSERT(ctl->total_bitmaps <= max_bitmaps);
 
 	/*
 	 * The goal is to keep the total amount of memory used per 1gb of space
@@ -1367,7 +1402,7 @@
 
 	start = offset_to_bit(info->offset, ctl->unit, offset);
 	count = bytes_to_bits(bytes, ctl->unit);
-	BUG_ON(start + count > BITS_PER_BITMAP);
+	ASSERT(start + count <= BITS_PER_BITMAP);
 
 	bitmap_clear(info->bitmap, start, count);
 
@@ -1390,7 +1425,7 @@
 
 	start = offset_to_bit(info->offset, ctl->unit, offset);
 	count = bytes_to_bits(bytes, ctl->unit);
-	BUG_ON(start + count > BITS_PER_BITMAP);
+	ASSERT(start + count <= BITS_PER_BITMAP);
 
 	bitmap_set(info->bitmap, start, count);
 
@@ -1398,13 +1433,19 @@
 	ctl->free_space += bytes;
 }
 
+/*
+ * If we cannot find a suitable extent, use bytes to record the size of
+ * the largest extent we did find.
+ */
 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
 			 struct btrfs_free_space *bitmap_info, u64 *offset,
 			 u64 *bytes)
 {
 	unsigned long found_bits = 0;
+	unsigned long max_bits = 0;
 	unsigned long bits, i;
 	unsigned long next_zero;
+	unsigned long extent_bits;
 
 	i = offset_to_bit(bitmap_info->offset, ctl->unit,
 			  max_t(u64, *offset, bitmap_info->offset));
@@ -1415,9 +1456,12 @@
 	     i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
 		next_zero = find_next_zero_bit(bitmap_info->bitmap,
 					       BITS_PER_BITMAP, i);
-		if ((next_zero - i) >= bits) {
-			found_bits = next_zero - i;
+		extent_bits = next_zero - i;
+		if (extent_bits >= bits) {
+			found_bits = extent_bits;
 			break;
+		} else if (extent_bits > max_bits) {
+			max_bits = extent_bits;
 		}
 		i = next_zero;
 	}
@@ -1428,40 +1472,74 @@
 		return 0;
 	}
 
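+	/*
+	 * No run was large enough; report the size of the largest run we
+	 * found so the caller can cache it.
+	 */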
+	*bytes = (u64)(max_bits) * ctl->unit;
 	return -1;
 }
 
+/* Cache the size of the max extent in bytes */
 static struct btrfs_free_space *
-find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
+find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
+		unsigned long align, u64 *max_extent_size)
 {
 	struct btrfs_free_space *entry;
 	struct rb_node *node;
+	u64 tmp;
+	u64 align_off;
 	int ret;
 
 	if (!ctl->free_space_offset.rb_node)
-		return NULL;
+		goto out;
 
 	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
 	if (!entry)
-		return NULL;
+		goto out;
 
 	for (node = &entry->offset_index; node; node = rb_next(node)) {
 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
-		if (entry->bytes < *bytes)
+		if (entry->bytes < *bytes) {
+			if (entry->bytes > *max_extent_size)
+				*max_extent_size = entry->bytes;
 			continue;
+		}
+
+		/*
+		 * Make sure the space returned is big enough to match our
+		 * requested alignment.
+		 */
+		if (*bytes >= align) {
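+			/*
+			 * Round the entry offset up to the next align
+			 * boundary, measured from ctl->start.
+			 */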
+			tmp = entry->offset - ctl->start + align - 1;
+			do_div(tmp, align);
+			tmp = tmp * align + ctl->start;
+			align_off = tmp - entry->offset;
+		} else {
+			align_off = 0;
+			tmp = entry->offset;
+		}
+
+		if (entry->bytes < *bytes + align_off) {
+			if (entry->bytes > *max_extent_size)
+				*max_extent_size = entry->bytes;
+			continue;
+		}
 
 		if (entry->bitmap) {
-			ret = search_bitmap(ctl, entry, offset, bytes);
-			if (!ret)
+			u64 size = *bytes;
+
+			ret = search_bitmap(ctl, entry, &tmp, &size);
+			if (!ret) {
+				*offset = tmp;
+				*bytes = size;
 				return entry;
+			} else if (size > *max_extent_size) {
+				*max_extent_size = size;
+			}
 			continue;
 		}
 
-		*offset = entry->offset;
-		*bytes = entry->bytes;
+		*offset = tmp;
+		*bytes = entry->bytes - align_off;
 		return entry;
 	}
-
+out:
 	return NULL;
 }
 
@@ -1499,29 +1577,27 @@
 	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
 
 	/*
-	 * XXX - this can go away after a few releases.
-	 *
-	 * since the only user of btrfs_remove_free_space is the tree logging
-	 * stuff, and the only way to test that is under crash conditions, we
-	 * want to have this debug stuff here just in case somethings not
-	 * working.  Search the bitmap for the space we are trying to use to
-	 * make sure its actually there.  If its not there then we need to stop
-	 * because something has gone wrong.
+	 * We need to search for bits in this bitmap.  We could only cover some
+	 * of the extent in this bitmap thanks to how we add space, so we need
+	 * to search for as much of it as we can and clear that amount, and then
+	 * go searching for the next bit.
 	 */
 	search_start = *offset;
-	search_bytes = *bytes;
+	search_bytes = ctl->unit;
 	search_bytes = min(search_bytes, end - search_start + 1);
 	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
-	BUG_ON(ret < 0 || search_start != *offset);
+	if (ret < 0 || search_start != *offset)
+		return -EINVAL;
 
-	if (*offset > bitmap_info->offset && *offset + *bytes > end) {
-		bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1);
-		*bytes -= end - *offset + 1;
-		*offset = end + 1;
-	} else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
-		bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes);
-		*bytes = 0;
-	}
+	/* We may have found more bits than what we need */
+	search_bytes = min(search_bytes, *bytes);
+
+	/* Cannot clear past the end of the bitmap */
+	search_bytes = min(search_bytes, end - search_start + 1);
+
+	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
+	*offset += search_bytes;
+	*bytes -= search_bytes;
 
 	if (*bytes) {
 		struct rb_node *next = rb_next(&bitmap_info->offset_index);
@@ -1552,7 +1628,7 @@
 		 * everything over again.
 		 */
 		search_start = *offset;
-		search_bytes = *bytes;
+		search_bytes = ctl->unit;
 		ret = search_bitmap(ctl, bitmap_info, &search_start,
 				    &search_bytes);
 		if (ret < 0 || search_start != *offset)
@@ -1608,11 +1684,14 @@
 	}
 
 	/*
-	 * some block groups are so tiny they can't be enveloped by a bitmap, so
-	 * don't even bother to create a bitmap for this
+	 * The original block groups from mkfs can be really small, like 8
+	 * megabytes, so don't bother with a bitmap for those entries.  However
+	 * some block groups can be smaller than what a bitmap would cover but
+	 * are still large enough that they could overflow the 32k memory limit,
+	 * so still allow those block groups to have a bitmap entry.
 	 */
-	if (BITS_PER_BITMAP * block_group->sectorsize >
-	    block_group->key.offset)
+	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
 		return false;
 
 	return true;
@@ -1684,7 +1763,7 @@
 	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
 					 1, 0);
 	if (!bitmap_info) {
-		BUG_ON(added);
+		ASSERT(added == 0);
 		goto new_bitmap;
 	}
 
@@ -1824,7 +1903,7 @@
 
 	if (ret) {
 		printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
-		BUG_ON(ret == -EEXIST);
+		ASSERT(ret != -EEXIST);
 	}
 
 	return ret;
@@ -1835,12 +1914,16 @@
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *info;
-	struct btrfs_free_space *next_info = NULL;
-	int ret = 0;
+	int ret;
+	bool re_search = false;
 
 	spin_lock(&ctl->tree_lock);
 
 again:
+	ret = 0;
+	if (!bytes)
+		goto out_lock;
+
 	info = tree_search_offset(ctl, offset, 0, 0);
 	if (!info) {
 		/*
@@ -1850,105 +1933,66 @@
 		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
 					  1, 0);
 		if (!info) {
-			/* the tree logging code might be calling us before we
-			 * have fully loaded the free space rbtree for this
-			 * block group.  So it is possible the entry won't
-			 * be in the rbtree yet at all.  The caching code
-			 * will make sure not to put it in the rbtree if
-			 * the logging code has pinned it.
+			/*
+			 * If we found a partial bit of our free space in a
+			 * bitmap but then couldn't find the other part, this may
+			 * be a problem, so WARN about it.
 			 */
+			WARN_ON(re_search);
 			goto out_lock;
 		}
 	}
 
-	if (info->bytes < bytes && rb_next(&info->offset_index)) {
-		u64 end;
-		next_info = rb_entry(rb_next(&info->offset_index),
-					     struct btrfs_free_space,
-					     offset_index);
-
-		if (next_info->bitmap)
-			end = next_info->offset +
-			      BITS_PER_BITMAP * ctl->unit - 1;
-		else
-			end = next_info->offset + next_info->bytes;
-
-		if (next_info->bytes < bytes ||
-		    next_info->offset > offset || offset > end) {
-			printk(KERN_CRIT "Found free space at %llu, size %llu,"
-			      " trying to use %llu\n",
-			      (unsigned long long)info->offset,
-			      (unsigned long long)info->bytes,
-			      (unsigned long long)bytes);
-			WARN_ON(1);
-			ret = -EINVAL;
-			goto out_lock;
-		}
-
-		info = next_info;
-	}
-
-	if (info->bytes == bytes) {
+	re_search = false;
+	if (!info->bitmap) {
 		unlink_free_space(ctl, info);
-		if (info->bitmap) {
-			kfree(info->bitmap);
-			ctl->total_bitmaps--;
-		}
-		kmem_cache_free(btrfs_free_space_cachep, info);
-		ret = 0;
-		goto out_lock;
-	}
+		if (offset == info->offset) {
+			u64 to_free = min(bytes, info->bytes);
 
-	if (!info->bitmap && info->offset == offset) {
-		unlink_free_space(ctl, info);
-		info->offset += bytes;
-		info->bytes -= bytes;
-		ret = link_free_space(ctl, info);
-		WARN_ON(ret);
-		goto out_lock;
-	}
+			info->bytes -= to_free;
+			info->offset += to_free;
+			if (info->bytes) {
+				ret = link_free_space(ctl, info);
+				WARN_ON(ret);
+			} else {
+				kmem_cache_free(btrfs_free_space_cachep, info);
+			}
 
-	if (!info->bitmap && info->offset <= offset &&
-	    info->offset + info->bytes >= offset + bytes) {
-		u64 old_start = info->offset;
-		/*
-		 * we're freeing space in the middle of the info,
-		 * this can happen during tree log replay
-		 *
-		 * first unlink the old info and then
-		 * insert it again after the hole we're creating
-		 */
-		unlink_free_space(ctl, info);
-		if (offset + bytes < info->offset + info->bytes) {
-			u64 old_end = info->offset + info->bytes;
+			offset += to_free;
+			bytes -= to_free;
+			goto again;
+		} else {
+			u64 old_end = info->bytes + info->offset;
 
-			info->offset = offset + bytes;
-			info->bytes = old_end - info->offset;
+			info->bytes = offset - info->offset;
 			ret = link_free_space(ctl, info);
 			WARN_ON(ret);
 			if (ret)
 				goto out_lock;
-		} else {
-			/* the hole we're creating ends at the end
-			 * of the info struct, just free the info
-			 */
-			kmem_cache_free(btrfs_free_space_cachep, info);
-		}
-		spin_unlock(&ctl->tree_lock);
 
-		/* step two, insert a new info struct to cover
-		 * anything before the hole
-		 */
-		ret = btrfs_add_free_space(block_group, old_start,
-					   offset - old_start);
-		WARN_ON(ret);
-		goto out;
+			/* Not enough bytes in this entry to satisfy us */
+			if (old_end < offset + bytes) {
+				bytes -= old_end - offset;
+				offset = old_end;
+				goto again;
+			} else if (old_end == offset + bytes) {
+				/* all done */
+				goto out_lock;
+			}
+			spin_unlock(&ctl->tree_lock);
+
+			ret = btrfs_add_free_space(block_group, offset + bytes,
+						   old_end - (offset + bytes));
+			WARN_ON(ret);
+			goto out;
+		}
 	}
 
 	ret = remove_from_bitmap(ctl, info, &offset, &bytes);
-	if (ret == -EAGAIN)
+	if (ret == -EAGAIN) {
+		re_search = true;
 		goto again;
-	BUG_ON(ret);
+	}
 out_lock:
 	spin_unlock(&ctl->tree_lock);
 out:
@@ -1965,11 +2009,10 @@
 
 	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
-		if (info->bytes >= bytes)
+		if (info->bytes >= bytes && !block_group->ro)
 			count++;
 		printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
-		       (unsigned long long)info->offset,
-		       (unsigned long long)info->bytes,
+		       info->offset, info->bytes,
 		       (info->bitmap) ? "yes" : "no");
 	}
 	printk(KERN_INFO "block group has cluster?: %s\n",
@@ -2042,7 +2085,8 @@
 	return 0;
 }
 
-void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
+static void __btrfs_remove_free_space_cache_locked(
+				struct btrfs_free_space_ctl *ctl)
 {
 	struct btrfs_free_space *info;
 	struct rb_node *node;
@@ -2096,15 +2140,19 @@
 }
 
 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
-			       u64 offset, u64 bytes, u64 empty_size)
+			       u64 offset, u64 bytes, u64 empty_size,
+			       u64 *max_extent_size)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *entry = NULL;
 	u64 bytes_search = bytes + empty_size;
 	u64 ret = 0;
+	u64 align_gap = 0;
+	u64 align_gap_len = 0;
 
 	spin_lock(&ctl->tree_lock);
-	entry = find_free_space(ctl, &offset, &bytes_search);
+	entry = find_free_space(ctl, &offset, &bytes_search,
+				block_group->full_stripe_len, max_extent_size);
 	if (!entry)
 		goto out;
 
@@ -2115,17 +2163,23 @@
 			free_bitmap(ctl, entry);
 	} else {
 		unlink_free_space(ctl, entry);
-		entry->offset += bytes;
-		entry->bytes -= bytes;
+		align_gap_len = offset - entry->offset;
+		align_gap = entry->offset;
+
+		entry->offset = offset + bytes;
+		WARN_ON(entry->bytes < bytes + align_gap_len);
+
+		entry->bytes -= bytes + align_gap_len;
 		if (!entry->bytes)
 			kmem_cache_free(btrfs_free_space_cachep, entry);
 		else
 			link_free_space(ctl, entry);
 	}
-
 out:
 	spin_unlock(&ctl->tree_lock);
 
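+	/* Give back any space skipped over to satisfy the alignment */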
+	if (align_gap_len)
+		__btrfs_add_free_space(ctl, align_gap, align_gap_len);
 	return ret;
 }
 
@@ -2175,7 +2229,8 @@
 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
 				   struct btrfs_free_cluster *cluster,
 				   struct btrfs_free_space *entry,
-				   u64 bytes, u64 min_start)
+				   u64 bytes, u64 min_start,
+				   u64 *max_extent_size)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	int err;
@@ -2187,8 +2242,11 @@
 	search_bytes = bytes;
 
 	err = search_bitmap(ctl, entry, &search_start, &search_bytes);
-	if (err)
+	if (err) {
+		if (search_bytes > *max_extent_size)
+			*max_extent_size = search_bytes;
 		return 0;
+	}
 
 	ret = search_start;
 	__bitmap_clear_bits(ctl, entry, ret, bytes);
@@ -2203,7 +2261,7 @@
  */
 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster, u64 bytes,
-			     u64 min_start)
+			     u64 min_start, u64 *max_extent_size)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *entry = NULL;
@@ -2223,6 +2281,9 @@
 
 	entry = rb_entry(node, struct btrfs_free_space, offset_index);
 	while(1) {
+		if (entry->bytes < bytes && entry->bytes > *max_extent_size)
+			*max_extent_size = entry->bytes;
+
 		if (entry->bytes < bytes ||
 		    (!entry->bitmap && entry->offset < min_start)) {
 			node = rb_next(&entry->offset_index);
@@ -2236,7 +2297,8 @@
 		if (entry->bitmap) {
 			ret = btrfs_alloc_from_bitmap(block_group,
 						      cluster, entry, bytes,
-						      min_start);
+						      cluster->window_start,
+						      max_extent_size);
 			if (ret == 0) {
 				node = rb_next(&entry->offset_index);
 				if (!node)
@@ -2245,6 +2307,7 @@
 						 offset_index);
 				continue;
 			}
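+			/* Advance window_start past the bytes just allocated */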
+			cluster->window_start += bytes;
 		} else {
 			ret = entry->offset;
 
@@ -2283,23 +2346,23 @@
 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
 				struct btrfs_free_space *entry,
 				struct btrfs_free_cluster *cluster,
-				u64 offset, u64 bytes, u64 min_bytes)
+				u64 offset, u64 bytes,
+				u64 cont1_bytes, u64 min_bytes)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	unsigned long next_zero;
 	unsigned long i;
-	unsigned long search_bits;
-	unsigned long total_bits;
+	unsigned long want_bits;
+	unsigned long min_bits;
 	unsigned long found_bits;
 	unsigned long start = 0;
 	unsigned long total_found = 0;
 	int ret;
-	bool found = false;
 
-	i = offset_to_bit(entry->offset, block_group->sectorsize,
+	i = offset_to_bit(entry->offset, ctl->unit,
 			  max_t(u64, offset, entry->offset));
-	search_bits = bytes_to_bits(bytes, block_group->sectorsize);
-	total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
+	want_bits = bytes_to_bits(bytes, ctl->unit);
+	min_bits = bytes_to_bits(min_bytes, ctl->unit);
 
 again:
 	found_bits = 0;
@@ -2308,7 +2371,7 @@
 	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
 		next_zero = find_next_zero_bit(entry->bitmap,
 					       BITS_PER_BITMAP, i);
-		if (next_zero - i >= search_bits) {
+		if (next_zero - i >= min_bits) {
 			found_bits = next_zero - i;
 			break;
 		}
@@ -2318,56 +2381,52 @@
 	if (!found_bits)
 		return -ENOSPC;
 
-	if (!found) {
+	if (!total_found) {
 		start = i;
 		cluster->max_size = 0;
-		found = true;
 	}
 
 	total_found += found_bits;
 
-	if (cluster->max_size < found_bits * block_group->sectorsize)
-		cluster->max_size = found_bits * block_group->sectorsize;
+	if (cluster->max_size < found_bits * ctl->unit)
+		cluster->max_size = found_bits * ctl->unit;
 
-	if (total_found < total_bits) {
-		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
-		if (i - start > total_bits * 2) {
-			total_found = 0;
-			cluster->max_size = 0;
-			found = false;
-		}
+	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
+		i = next_zero + 1;
 		goto again;
 	}
 
-	cluster->window_start = start * block_group->sectorsize +
-		entry->offset;
+	cluster->window_start = start * ctl->unit + entry->offset;
 	rb_erase(&entry->offset_index, &ctl->free_space_offset);
 	ret = tree_insert_offset(&cluster->root, entry->offset,
 				 &entry->offset_index, 1);
-	BUG_ON(ret);
+	ASSERT(!ret); /* -EEXIST; Logic error */
 
+	trace_btrfs_setup_cluster(block_group, cluster,
+				  total_found * ctl->unit, 1);
 	return 0;
 }
 
 /*
  * This searches the block group for just extents to fill the cluster with.
+ * Try to find a cluster with at least bytes total bytes, at least one
+ * extent of cont1_bytes, and other clusters of at least min_bytes.
  */
 static noinline int
 setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 			struct btrfs_free_cluster *cluster,
 			struct list_head *bitmaps, u64 offset, u64 bytes,
-			u64 min_bytes)
+			u64 cont1_bytes, u64 min_bytes)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *first = NULL;
 	struct btrfs_free_space *entry = NULL;
-	struct btrfs_free_space *prev = NULL;
 	struct btrfs_free_space *last;
 	struct rb_node *node;
 	u64 window_start;
 	u64 window_free;
 	u64 max_extent;
-	u64 max_gap = 128 * 1024;
+	u64 total_size = 0;
 
 	entry = tree_search_offset(ctl, offset, 0, 1);
 	if (!entry)
@@ -2377,8 +2436,8 @@
 	 * We don't want bitmaps, so just move along until we find a normal
 	 * extent entry.
 	 */
-	while (entry->bitmap) {
-		if (list_empty(&entry->list))
+	while (entry->bitmap || entry->bytes < min_bytes) {
+		if (entry->bitmap && list_empty(&entry->list))
 			list_add_tail(&entry->list, bitmaps);
 		node = rb_next(&entry->offset_index);
 		if (!node)
@@ -2391,12 +2450,9 @@
 	max_extent = entry->bytes;
 	first = entry;
 	last = entry;
-	prev = entry;
 
-	while (window_free <= min_bytes) {
-		node = rb_next(&entry->offset_index);
-		if (!node)
-			return -ENOSPC;
+	for (node = rb_next(&entry->offset_index); node;
+	     node = rb_next(&entry->offset_index)) {
 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
 
 		if (entry->bitmap) {
@@ -2405,26 +2461,18 @@
 			continue;
 		}
 
-		/*
-		 * we haven't filled the empty size and the window is
-		 * very large.  reset and try again
-		 */
-		if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
-		    entry->offset - window_start > (min_bytes * 2)) {
-			first = entry;
-			window_start = entry->offset;
-			window_free = entry->bytes;
-			last = entry;
+		if (entry->bytes < min_bytes)
+			continue;
+
+		last = entry;
+		window_free += entry->bytes;
+		if (entry->bytes > max_extent)
 			max_extent = entry->bytes;
-		} else {
-			last = entry;
-			window_free += entry->bytes;
-			if (entry->bytes > max_extent)
-				max_extent = entry->bytes;
-		}
-		prev = entry;
 	}
 
+	if (window_free < bytes || max_extent < cont1_bytes)
+		return -ENOSPC;
+
 	cluster->window_start = first->offset;
 
 	node = &first->offset_index;
@@ -2438,17 +2486,18 @@
 
 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
 		node = rb_next(&entry->offset_index);
-		if (entry->bitmap)
+		if (entry->bitmap || entry->bytes < min_bytes)
 			continue;
 
 		rb_erase(&entry->offset_index, &ctl->free_space_offset);
 		ret = tree_insert_offset(&cluster->root, entry->offset,
 					 &entry->offset_index, 0);
-		BUG_ON(ret);
+		total_size += entry->bytes;
+		ASSERT(!ret); /* -EEXIST; Logic error */
 	} while (node && entry != last);
 
 	cluster->max_size = max_extent;
-
+	trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
 	return 0;
 }
 
@@ -2460,7 +2509,7 @@
 setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
 		     struct btrfs_free_cluster *cluster,
 		     struct list_head *bitmaps, u64 offset, u64 bytes,
-		     u64 min_bytes)
+		     u64 cont1_bytes, u64 min_bytes)
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *entry;
@@ -2482,10 +2531,10 @@
 	}
 
 	list_for_each_entry(entry, bitmaps, list) {
-		if (entry->bytes < min_bytes)
+		if (entry->bytes < bytes)
 			continue;
 		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
-					   bytes, min_bytes);
+					   bytes, cont1_bytes, min_bytes);
 		if (!ret)
 			return 0;
 	}
@@ -2499,14 +2548,13 @@
 
 /*
  * here we try to find a cluster of blocks in a block group.  The goal
- * is to find at least bytes free and up to empty_size + bytes free.
+ * is to find at least bytes + empty_size of free space.
  * We might not find them all in one contiguous area.
  *
  * returns zero and sets up cluster if things worked out, otherwise
  * it returns -enospc
  */
-int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root,
+int btrfs_find_space_cluster(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster,
 			     u64 offset, u64 bytes, u64 empty_size)
@@ -2515,23 +2563,24 @@
 	struct btrfs_free_space *entry, *tmp;
 	LIST_HEAD(bitmaps);
 	u64 min_bytes;
+	u64 cont1_bytes;
 	int ret;
 
-	/* for metadata, allow allocates with more holes */
+	/*
+	 * Choose the minimum extent size we'll require for this
+	 * cluster.  For SSD_SPREAD, don't allow any fragmentation.
+	 * For metadata, allow allocations with smaller extents.  For
+	 * data, keep it dense.
+	 */
 	if (btrfs_test_opt(root, SSD_SPREAD)) {
-		min_bytes = bytes + empty_size;
+		cont1_bytes = min_bytes = bytes + empty_size;
 	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
-		/*
-		 * we want to do larger allocations when we are
-		 * flushing out the delayed refs, it helps prevent
-		 * making more work as we go along.
-		 */
-		if (trans->transaction->delayed_refs.flushing)
-			min_bytes = max(bytes, (bytes + empty_size) >> 1);
-		else
-			min_bytes = max(bytes, (bytes + empty_size) >> 4);
-	} else
-		min_bytes = max(bytes, (bytes + empty_size) >> 2);
+		cont1_bytes = bytes;
+		min_bytes = block_group->sectorsize;
+	} else {
+		cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
+		min_bytes = block_group->sectorsize;
+	}
 
 	spin_lock(&ctl->tree_lock);
 
@@ -2539,7 +2588,7 @@
 	 * If we know we don't have enough space to make a cluster don't even
 	 * bother doing all the work to try and find one.
 	 */
-	if (ctl->free_space < min_bytes) {
+	if (ctl->free_space < bytes) {
 		spin_unlock(&ctl->tree_lock);
 		return -ENOSPC;
 	}
@@ -2552,11 +2601,17 @@
 		goto out;
 	}
 
+	trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
+				 min_bytes);
+
+	INIT_LIST_HEAD(&bitmaps);
 	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
-				      bytes, min_bytes);
+				      bytes + empty_size,
+				      cont1_bytes, min_bytes);
 	if (ret)
 		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
-					   offset, bytes, min_bytes);
+					   offset, bytes + empty_size,
+					   cont1_bytes, min_bytes);
 
 	/* Clear our temporary list */
 	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
@@ -2567,6 +2622,8 @@
 		list_add_tail(&cluster->block_group_list,
 			      &block_group->cluster_list);
 		cluster->block_group = block_group;
+	} else {
+		trace_btrfs_failed_cluster_setup(block_group);
 	}
 out:
 	spin_unlock(&cluster->lock);
@@ -2588,17 +2645,57 @@
 	cluster->block_group = NULL;
 }
 
-int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
-			   u64 *trimmed, u64 start, u64 end, u64 minlen)
+static int do_trimming(struct btrfs_block_group_cache *block_group,
+		       u64 *total_trimmed, u64 start, u64 bytes,
+		       u64 reserved_start, u64 reserved_bytes)
 {
-	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
-	struct btrfs_free_space *entry = NULL;
+	struct btrfs_space_info *space_info = block_group->space_info;
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	u64 bytes = 0;
-	u64 actually_trimmed;
-	int ret = 0;
+	int ret;
+	int update = 0;
+	u64 trimmed = 0;
 
-	*trimmed = 0;
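+	/*
+	 * Account the range as reserved while we discard it; it is added back
+	 * to the free space tree once the discard finishes.
+	 */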
+	spin_lock(&space_info->lock);
+	spin_lock(&block_group->lock);
+	if (!block_group->ro) {
+		block_group->reserved += reserved_bytes;
+		space_info->bytes_reserved += reserved_bytes;
+		update = 1;
+	}
+	spin_unlock(&block_group->lock);
+	spin_unlock(&space_info->lock);
+
+	ret = btrfs_error_discard_extent(fs_info->extent_root,
+					 start, bytes, &trimmed);
+	if (!ret)
+		*total_trimmed += trimmed;
+
+	btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
+
+	if (update) {
+		spin_lock(&space_info->lock);
+		spin_lock(&block_group->lock);
+		if (block_group->ro)
+			space_info->bytes_readonly += reserved_bytes;
+		block_group->reserved -= reserved_bytes;
+		space_info->bytes_reserved -= reserved_bytes;
+		spin_unlock(&space_info->lock);
+		spin_unlock(&block_group->lock);
+	}
+
+	return ret;
+}
+
+static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
+			  u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry;
+	struct rb_node *node;
+	int ret = 0;
+	u64 extent_start;
+	u64 extent_bytes;
+	u64 bytes;
 
 	while (start < end) {
 		spin_lock(&ctl->tree_lock);
@@ -2609,81 +2706,118 @@
 		}
 
 		entry = tree_search_offset(ctl, start, 0, 1);
-		if (!entry)
-			entry = tree_search_offset(ctl,
-						   offset_to_bitmap(ctl, start),
-						   1, 1);
-
-		if (!entry || entry->offset >= end) {
+		if (!entry) {
 			spin_unlock(&ctl->tree_lock);
 			break;
 		}
 
-		if (entry->bitmap) {
-			ret = search_bitmap(ctl, entry, &start, &bytes);
-			if (!ret) {
-				if (start >= end) {
-					spin_unlock(&ctl->tree_lock);
-					break;
-				}
-				bytes = min(bytes, end - start);
-				bitmap_clear_bits(ctl, entry, start, bytes);
-				if (entry->bytes == 0)
-					free_bitmap(ctl, entry);
-			} else {
-				start = entry->offset + BITS_PER_BITMAP *
-					block_group->sectorsize;
+		/* skip bitmaps */
+		while (entry->bitmap) {
+			node = rb_next(&entry->offset_index);
+			if (!node) {
 				spin_unlock(&ctl->tree_lock);
-				ret = 0;
-				continue;
+				goto out;
 			}
-		} else {
-			start = entry->offset;
-			bytes = min(entry->bytes, end - start);
-			unlink_free_space(ctl, entry);
-			kmem_cache_free(btrfs_free_space_cachep, entry);
+			entry = rb_entry(node, struct btrfs_free_space,
+					 offset_index);
+		}
+
+		if (entry->offset >= end) {
+			spin_unlock(&ctl->tree_lock);
+			break;
 		}
 
+		extent_start = entry->offset;
+		extent_bytes = entry->bytes;
+		start = max(start, extent_start);
+		bytes = min(extent_start + extent_bytes, end) - start;
+		if (bytes < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			goto next;
+		}
+
+		unlink_free_space(ctl, entry);
+		kmem_cache_free(btrfs_free_space_cachep, entry);
+
 		spin_unlock(&ctl->tree_lock);
 
-		if (bytes >= minlen) {
-			struct btrfs_space_info *space_info;
-			int update = 0;
-
-			space_info = block_group->space_info;
-			spin_lock(&space_info->lock);
-			spin_lock(&block_group->lock);
-			if (!block_group->ro) {
-				block_group->reserved += bytes;
-				space_info->bytes_reserved += bytes;
-				update = 1;
-			}
-			spin_unlock(&block_group->lock);
-			spin_unlock(&space_info->lock);
+		ret = do_trimming(block_group, total_trimmed, start, bytes,
+				  extent_start, extent_bytes);
+		if (ret)
+			break;
+next:
+		start += bytes;
 
-			ret = btrfs_error_discard_extent(fs_info->extent_root,
-							 start,
-							 bytes,
-							 &actually_trimmed);
-
-			btrfs_add_free_space(block_group, start, bytes);
-			if (update) {
-				spin_lock(&space_info->lock);
-				spin_lock(&block_group->lock);
-				if (block_group->ro)
-					space_info->bytes_readonly += bytes;
-				block_group->reserved -= bytes;
-				space_info->bytes_reserved -= bytes;
-				spin_unlock(&space_info->lock);
-				spin_unlock(&block_group->lock);
-			}
+		if (fatal_signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
 
-			if (ret)
-				break;
-			*trimmed += actually_trimmed;
+		cond_resched();
+	}
+out:
+	return ret;
+}
+
+static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
+			u64 *total_trimmed, u64 start, u64 end, u64 minlen)
+{
+	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
+	struct btrfs_free_space *entry;
+	int ret = 0;
+	int ret2;
+	u64 bytes;
+	u64 offset = offset_to_bitmap(ctl, start);
+
+	while (offset < end) {
+		bool next_bitmap = false;
+
+		spin_lock(&ctl->tree_lock);
+
+		if (ctl->free_space < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			break;
+		}
+
+		entry = tree_search_offset(ctl, offset, 1, 0);
+		if (!entry) {
+			spin_unlock(&ctl->tree_lock);
+			next_bitmap = true;
+			goto next;
+		}
+
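+		/*
+		 * Look for a run of at least minlen free bytes in this
+		 * bitmap, starting at start.
+		 */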
+		bytes = minlen;
+		ret2 = search_bitmap(ctl, entry, &start, &bytes);
+		if (ret2 || start >= end) {
+			spin_unlock(&ctl->tree_lock);
+			next_bitmap = true;
+			goto next;
+		}
+
+		bytes = min(bytes, end - start);
+		if (bytes < minlen) {
+			spin_unlock(&ctl->tree_lock);
+			goto next;
+		}
+
+		bitmap_clear_bits(ctl, entry, start, bytes);
+		if (entry->bytes == 0)
+			free_bitmap(ctl, entry);
+
+		spin_unlock(&ctl->tree_lock);
+
+		ret = do_trimming(block_group, total_trimmed, start, bytes,
+				  start, bytes);
+		if (ret)
+			break;
+next:
+		if (next_bitmap) {
+			offset += BITS_PER_BITMAP * ctl->unit;
+		} else {
+			start += bytes;
+			if (start >= offset + BITS_PER_BITMAP * ctl->unit)
+				offset += BITS_PER_BITMAP * ctl->unit;
 		}
-		start += bytes;
-		bytes = 0;
 
 		if (fatal_signal_pending(current)) {
 			ret = -ERESTARTSYS;
@@ -2696,6 +2830,22 @@
 	return ret;
 }
 
+int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
+			   u64 *trimmed, u64 start, u64 end, u64 minlen)
+{
+	int ret;
+
+	*trimmed = 0;
+
+	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
+	if (ret)
+		return ret;
+
+	ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
+
+	return ret;
+}
+
 /*
  * Find the left-most item in the cache tree, and then return the
  * smallest inode number in the item.
@@ -2733,7 +2883,8 @@
 		int ret;
 
 		ret = search_bitmap(ctl, entry, &offset, &count);
-		BUG_ON(ret);
+		/* Logic error; should be empty if it can't find anything */
+		ASSERT(!ret);
 
 		ino = offset;
 		bitmap_clear_bits(ctl, entry, offset, 1);
@@ -2810,8 +2961,9 @@
 	ret = __load_free_space_cache(root, inode, ctl, path, 0);
 
 	if (ret < 0)
-		printk(KERN_ERR "btrfs: failed to load free ino cache for "
-		       "root %llu\n", root->root_key.objectid);
+		btrfs_err(fs_info,
+			"failed to load free ino cache for root %llu",
+			root->root_key.objectid);
 out_put:
 	iput(inode);
 out:
@@ -2838,11 +2990,162 @@
 	if (ret) {
 		btrfs_delalloc_release_metadata(inode, inode->i_size);
 #ifdef DEBUG
-		printk(KERN_ERR "btrfs: failed to write free ino cache "
-		       "for root %llu\n", root->root_key.objectid);
+		btrfs_err(root->fs_info,
+			"failed to write free ino cache for root %llu",
+			root->root_key.objectid);
 #endif
 	}
 
 	iput(inode);
 	return ret;
 }
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+/*
+ * Use this if you need to make a bitmap or extent entry specifically.  It
+ * doesn't do any of the merging that add_free_space does; this acts a lot
+ * like how the free space cache loading code works, so you can get really
+ * weird configurations.
+ */
+int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+			      u64 offset, u64 bytes, bool bitmap)
+{
+	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+	struct btrfs_free_space *info = NULL, *bitmap_info;
+	void *map = NULL;
+	u64 bytes_added;
+	int ret;
+
+again:
+	if (!info) {
+		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
+		if (!info)
+			return -ENOMEM;
+	}
+
+	if (!bitmap) {
+		spin_lock(&ctl->tree_lock);
+		info->offset = offset;
+		info->bytes = bytes;
+		ret = link_free_space(ctl, info);
+		spin_unlock(&ctl->tree_lock);
+		if (ret)
+			kmem_cache_free(btrfs_free_space_cachep, info);
+		return ret;
+	}
+
+	if (!map) {
+		map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+		if (!map) {
+			kmem_cache_free(btrfs_free_space_cachep, info);
+			return -ENOMEM;
+		}
+	}
+
+	spin_lock(&ctl->tree_lock);
+	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+					 1, 0);
+	if (!bitmap_info) {
+		info->bitmap = map;
+		map = NULL;
+		add_new_bitmap(ctl, info, offset);
+		bitmap_info = info;
+	}
+
+	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+	bytes -= bytes_added;
+	offset += bytes_added;
+	spin_unlock(&ctl->tree_lock);
+
+	if (bytes)
+		goto again;
+
+	if (map)
+		kfree(map);
+	return 0;
+}
+
+/*
+ * Checks to see if the given range is in the free space cache.  This is really
+ * just used to check the absence of space, so if there is free space in the
+ * range at all we will return 1.
+ */
+int test_check_exists(struct btrfs_block_group_cache *cache,
+		      u64 offset, u64 bytes)
+{
+	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+	struct btrfs_free_space *info;
+	int ret = 0;
+
+	spin_lock(&ctl->tree_lock);
+	info = tree_search_offset(ctl, offset, 0, 0);
+	if (!info) {
+		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+					  1, 0);
+		if (!info)
+			goto out;
+	}
+
+have_info:
+	if (info->bitmap) {
+		u64 bit_off, bit_bytes;
+		struct rb_node *n;
+		struct btrfs_free_space *tmp;
+
+		bit_off = offset;
+		bit_bytes = ctl->unit;
+		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
+		if (!ret) {
+			if (bit_off == offset) {
+				ret = 1;
+				goto out;
+			} else if (bit_off > offset &&
+				   offset + bytes > bit_off) {
+				ret = 1;
+				goto out;
+			}
+		}
+
+		n = rb_prev(&info->offset_index);
+		while (n) {
+			tmp = rb_entry(n, struct btrfs_free_space,
+				       offset_index);
+			if (tmp->offset + tmp->bytes < offset)
+				break;
+			if (offset + bytes < tmp->offset) {
+				n = rb_prev(&info->offset_index);
+				continue;
+			}
+			info = tmp;
+			goto have_info;
+		}
+
+		n = rb_next(&info->offset_index);
+		while (n) {
+			tmp = rb_entry(n, struct btrfs_free_space,
+				       offset_index);
+			if (offset + bytes < tmp->offset)
+				break;
+			if (tmp->offset + tmp->bytes < offset) {
+				n = rb_next(&info->offset_index);
+				continue;
+			}
+			info = tmp;
+			goto have_info;
+		}
+
+		goto out;
+	}
+
+	if (info->offset == offset) {
+		ret = 1;
+		goto out;
+	}
+
+	if (offset > info->offset && offset < info->offset + info->bytes)
+		ret = 1;
+out:
+	spin_unlock(&ctl->tree_lock);
+	return ret;
+}
+#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
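
As a rough, illustrative sketch only (not part of this patch): with CONFIG_BTRFS_FS_RUN_SANITY_TESTS enabled, a sanity test could seed a block group's free-space cache through test_add_free_space_entry() and then probe it with test_check_exists(). The block-group pointer and the harness around this function are assumptions for illustration, not code from this series.

static int sketch_free_space_sanity_test(struct btrfs_block_group_cache *cache)
{
	int ret;

	/* Seed an extent entry at offset 0, bypassing add_free_space() merging. */
	ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, false);
	if (ret)
		return ret;

	/* Seed a bitmap-backed entry immediately behind it. */
	ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1024 * 1024, true);
	if (ret)
		return ret;

	/* Both seeded ranges should now be reported as present ... */
	if (!test_check_exists(cache, 0, 4 * 1024 * 1024))
		return -EINVAL;

	/* ... while an untouched range past them should not. */
	if (test_check_exists(cache, 8 * 1024 * 1024, 4096))
		return -EINVAL;

	return 0;
}
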
diff -ur a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
--- a/fs/btrfs/free-space-cache.h	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/free-space-cache.h	2014-02-17 11:56:58.000000000 +0100
@@ -54,6 +54,8 @@
 			    struct btrfs_block_group_cache *block_group,
 			    struct btrfs_path *path);
 
+int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
+				       struct btrfs_block_rsv *rsv);
 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_trans_handle *trans,
 				    struct btrfs_path *path,
@@ -92,22 +94,31 @@
 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
 				     *block_group);
 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
-			       u64 offset, u64 bytes, u64 empty_size);
+			       u64 offset, u64 bytes, u64 empty_size,
+			       u64 *max_extent_size);
 u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 			   u64 bytes);
-int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root,
+int btrfs_find_space_cluster(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster,
 			     u64 offset, u64 bytes, u64 empty_size);
 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster);
 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster, u64 bytes,
-			     u64 min_start);
+			     u64 min_start, u64 *max_extent_size);
 int btrfs_return_cluster_to_free_space(
 			       struct btrfs_block_group_cache *block_group,
 			       struct btrfs_free_cluster *cluster);
 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 			   u64 *trimmed, u64 start, u64 end, u64 minlen);
+
+/* Support functions for running our sanity tests */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+			      u64 offset, u64 bytes, bool bitmap);
+int test_check_exists(struct btrfs_block_group_cache *cache,
+		      u64 offset, u64 bytes);
+#endif
+
 #endif
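
A recurring theme in the prototype changes above is the new max_extent_size out-parameter on btrfs_find_space_for_alloc() and btrfs_alloc_from_cluster(): the allocator fills it with the largest free extent it saw, so a caller whose allocation failed can learn how much contiguous space the block group could actually offer and skip it for later, larger requests. A minimal sketch of that calling convention follows; the wrapper function and its arguments are assumptions for illustration, not code from this patch.

/* Sketch only: how a caller might consume the new max_extent_size hint. */
static u64 sketch_alloc_from_group(struct btrfs_block_group_cache *block_group,
				   u64 search_start, u64 num_bytes,
				   u64 empty_size, u64 *largest_seen)
{
	u64 max_extent_size = 0;
	u64 offset;

	offset = btrfs_find_space_for_alloc(block_group, search_start,
					    num_bytes, empty_size,
					    &max_extent_size);
	if (!offset && max_extent_size > *largest_seen) {
		/*
		 * Nothing big enough in this block group; remember the
		 * largest contiguous chunk it reported so future searches
		 * for num_bytes or more can skip it outright.
		 */
		*largest_seen = max_extent_size;
	}
	return offset;
}
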
diff -ur a/fs/btrfs/hash.h b/fs/btrfs/hash.h
--- a/fs/btrfs/hash.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/hash.h	2014-02-17 11:56:58.000000000 +0100
@@ -24,4 +24,14 @@
 {
 	return crc32c((u32)~1, name, len);
 }
+
+/*
+ * Figure the key offset of an extended inode ref
+ */
+static inline u64 btrfs_extref_hash(u64 parent_objectid, const char *name,
+				    int len)
+{
+	return (u64) crc32c(parent_objectid, name, len);
+}
+
 #endif
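
btrfs_extref_hash() above mirrors btrfs_name_hash() but seeds the CRC with the parent inode number; the resulting value is what callers place in the key offset when looking up an extended inode ref item. A hedged sketch of building such a key follows; the surrounding search code and this particular helper are only assumed for illustration, not shown by this hunk.

/* Sketch only: build the search key for an extended inode ref. */
static void sketch_extref_key(struct btrfs_key *key, u64 ino, u64 parent_ino,
			      const char *name, int name_len)
{
	key->objectid = ino;			/* the child inode */
	key->type = BTRFS_INODE_EXTREF_KEY;	/* extended inode ref item */
	key->offset = btrfs_extref_hash(parent_ino, name, name_len);
}
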
diff -ur a/fs/btrfs/inode.c b/fs/btrfs/inode.c
--- a/fs/btrfs/inode.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/inode.c	2014-02-17 11:56:58.000000000 +0100
@@ -39,12 +39,14 @@
 #include <linux/slab.h>
 #include <linux/ratelimit.h>
 #include <linux/mount.h>
+#include <linux/btrfs.h>
+#include <linux/blkdev.h>
+#include <linux/posix_acl_xattr.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ioctl.h"
 #include "print-tree.h"
 #include "ordered-data.h"
 #include "xattr.h"
@@ -54,6 +56,15 @@
 #include "locking.h"
 #include "free-space-cache.h"
 #include "inode-map.h"
+#include "backref.h"
+#include "hash.h"
+
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+#include "syno_acl.h"
+#endif
+#ifdef MY_ABC_HERE
+#include <linux/namei.h>
+#endif
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -71,6 +82,7 @@
 static struct extent_io_ops btrfs_extent_io_ops;
 
 static struct kmem_cache *btrfs_inode_cachep;
+static struct kmem_cache *btrfs_delalloc_work_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
 struct kmem_cache *btrfs_transaction_cachep;
 struct kmem_cache *btrfs_path_cachep;
@@ -87,15 +99,143 @@
 	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
 };
 
-static int btrfs_setsize(struct inode *inode, loff_t newsize);
+static int btrfs_setsize(struct inode *inode, struct iattr *attr);
 static int btrfs_truncate(struct inode *inode);
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
 static noinline int cow_file_range(struct inode *inode,
 				   struct page *locked_page,
 				   u64 start, u64 end, int *page_started,
 				   unsigned long *nr_written, int unlock);
-static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, struct inode *inode);
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+					   u64 len, u64 orig_start,
+					   u64 block_start, u64 block_len,
+					   u64 orig_block_len, u64 ram_bytes,
+					   int type);
+
+static int btrfs_dirty_inode(struct inode *inode);
+
+#if defined(MY_ABC_HERE) || defined(MY_ABC_HERE)
+static int syno_btrfs_init_attr(struct btrfs_trans_handle *trans, struct inode *inode)
+{
+	int err = -EINVAL;
+#ifdef MY_ABC_HERE
+	struct btrfs_timespec crtime;
+#endif
+#ifdef MY_ABC_HERE
+	__le32 archive_bit;
+#endif
+
+#ifdef MY_ABC_HERE
+	inode->i_CreateTime = inode->i_mtime;
+	crtime.sec = cpu_to_le64(inode->i_CreateTime.tv_sec);
+	crtime.nsec = cpu_to_le32(inode->i_CreateTime.tv_nsec);
+	err = __btrfs_setxattr(trans, inode, XATTR_SYNO_PREFIX XATTR_SYNO_CREATE_TIME, &crtime, sizeof(crtime), XATTR_CREATE);
+	if (err)
+		goto out;
+#endif
+
+#ifdef MY_ABC_HERE
+	inode->i_mode2 = ALL_SYNO_ARCHIVE;
+	archive_bit = cpu_to_le32(inode->i_mode2);
+	err = __btrfs_setxattr(trans, inode, XATTR_SYNO_PREFIX XATTR_SYNO_ARCHIVE_BIT, &archive_bit, sizeof(archive_bit), XATTR_CREATE);
+#endif
+out:
+	return err;
+}
+#endif
+
+#ifdef MY_ABC_HERE
+static int syno_btrfs_set_crtime(struct dentry *dentry, struct timespec *time)
+{
+	int err = -EINVAL;
+	struct btrfs_timespec crtime;
+
+	if (!dentry->d_inode->i_op->setxattr) {
+		/* Example:
+		 *   btrfs subvolume create /volume1/abc
+		 *   btrfs subvolume create /volume1/abc/123
+		 *   btrfs subvolume snapshot /volume1/abc /volume1/111
+		 *   /volume1/111/123 ends up with i_op = btrfs_dir_ro_inode_operations */
+		printk(KERN_ERR "BTRFS: Can't set create time on dir_ro_inode %s\n", dentry->d_name.name);
+		goto out;
+	}
+	crtime.sec = cpu_to_le64(time->tv_sec);
+	crtime.nsec = cpu_to_le32(time->tv_nsec);
+	err = btrfs_setxattr(dentry, XATTR_SYNO_PREFIX XATTR_SYNO_CREATE_TIME, &crtime, sizeof(crtime), XATTR_REPLACE);
+	if (!err)
+		dentry->d_inode->i_CreateTime = *time;
+out:
+	return err;
+}
+#endif
+
+#ifdef MY_ABC_HERE
+static int syno_btrfs_set_archive_bit(struct dentry *dentry, u32 archive_bit)
+{
+	int err = -EINVAL;
+	__le32 archive_le32;
+
+	if (!dentry->d_inode->i_op->setxattr) {
+		printk(KERN_ERR "BTRFS: Can't set archive bit on dir_ro_inode %s\n", dentry->d_name.name);
+		goto out;
+	}
+	archive_le32 = cpu_to_le32(archive_bit);
+	err = btrfs_setxattr(dentry, XATTR_SYNO_PREFIX XATTR_SYNO_ARCHIVE_BIT, &archive_le32, sizeof(archive_le32), XATTR_REPLACE);
+	if (!err) {
+		dentry->d_inode->i_mode2 = archive_bit;
+	}
+out:
+	return err;
+}
+#endif
+
+#ifdef MY_ABC_HERE
+static int syno_btrfs_set_archive_ver(struct dentry *dentry, u32 version)
+{
+	struct inode *inode = dentry->d_inode;
+	struct syno_xattr_archive_version value;
+	int err;
+
+	if (!dentry->d_inode->i_op->setxattr) {
+		printk(KERN_ERR "BTRFS: Can't set archive ver on dir_ro_inode %s\n", dentry->d_name.name);
+		return -EINVAL;
+	}
+	value.v_magic = cpu_to_le16(0x2552);
+	value.v_struct_version = cpu_to_le16(1);
+	value.v_archive_version = cpu_to_le32(version);
+	err = btrfs_setxattr(dentry, XATTR_SYNO_PREFIX XATTR_SYNO_ARCHIVE_VERSION, &value, sizeof(value), 0);
+	if (!err) {
+		inode->i_archive_version = version;
+		inode->i_flags |= S_ARCHIVE_VERSION_CACHED;
+	}
+	return err;
+}
+
+static int syno_btrfs_get_archive_ver(struct dentry *dentry, u32 *version)
+{
+	struct inode *inode = dentry->d_inode;
+	struct syno_xattr_archive_version value;
+	int err;
+
+	if (IS_ARCHIVE_VERSION_CACHED(inode)) {
+		*version = inode->i_archive_version;
+		return 0;
+	}
+
+	err = __btrfs_getxattr(inode, XATTR_SYNO_PREFIX XATTR_SYNO_ARCHIVE_VERSION, &value, sizeof(value));
+	if (0 < err) {
+		inode->i_archive_version = le32_to_cpu(value.v_archive_version);
+	} else if (-ENODATA == err) {
+		inode->i_archive_version = 0;
+	} else {
+		*version = 0;
+		return err;
+	}
+	*version = inode->i_archive_version;
+	inode->i_flags |= S_ARCHIVE_VERSION_CACHED;
+	return 0;
+}
+#endif
 
 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
 				     struct inode *inode,  struct inode *dir,
@@ -150,7 +290,6 @@
 	inode_add_bytes(inode, size);
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      datasize);
-	BUG_ON(ret);
 	if (ret) {
 		err = ret;
 		goto fail;
@@ -173,9 +312,9 @@
 			cur_size = min_t(unsigned long, compressed_size,
 				       PAGE_CACHE_SIZE);
 
-			kaddr = kmap_atomic(cpage, KM_USER0);
+			kaddr = kmap_atomic(cpage);
 			write_extent_buffer(leaf, kaddr, ptr, cur_size);
-			kunmap_atomic(kaddr, KM_USER0);
+			kunmap_atomic(kaddr);
 
 			i++;
 			ptr += cur_size;
@@ -187,10 +326,10 @@
 		page = find_get_page(inode->i_mapping,
 				     start >> PAGE_CACHE_SHIFT);
 		btrfs_set_file_extent_compression(leaf, ei, 0);
-		kaddr = kmap_atomic(page, KM_USER0);
+		kaddr = kmap_atomic(page);
 		offset = start & (PAGE_CACHE_SIZE - 1);
 		write_extent_buffer(leaf, kaddr + offset, ptr, size);
-		kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr);
 		page_cache_release(page);
 	}
 	btrfs_mark_buffer_dirty(leaf);
@@ -206,9 +345,9 @@
 	 * could end up racing with unlink.
 	 */
 	BTRFS_I(inode)->disk_i_size = inode->i_size;
-	btrfs_update_inode(trans, root, inode);
+	ret = btrfs_update_inode(trans, root, inode);
 
-	return 0;
+	return ret;
 fail:
 	btrfs_free_path(path);
 	return err;
@@ -220,18 +359,17 @@
  * does the checks required to make sure the data is small enough
  * to fit as an inline extent.
  */
-static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
-				 struct inode *inode, u64 start, u64 end,
-				 size_t compressed_size, int compress_type,
-				 struct page **compressed_pages)
+static noinline int cow_file_range_inline(struct btrfs_root *root,
+					  struct inode *inode, u64 start,
+					  u64 end, size_t compressed_size,
+					  int compress_type,
+					  struct page **compressed_pages)
 {
+	struct btrfs_trans_handle *trans;
 	u64 isize = i_size_read(inode);
 	u64 actual_end = min(end + 1, isize);
 	u64 inline_len = actual_end - start;
-	u64 aligned_end = (end + root->sectorsize - 1) &
-			~((u64)root->sectorsize - 1);
-	u64 hint_byte;
+	u64 aligned_end = ALIGN(end, root->sectorsize);
 	u64 data_len = inline_len;
 	int ret;
 
@@ -248,19 +386,36 @@
 		return 1;
 	}
 
-	ret = btrfs_drop_extents(trans, inode, start, aligned_end,
-				 &hint_byte, 1);
-	BUG_ON(ret);
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
+	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
 
 	if (isize > actual_end)
 		inline_len = min_t(u64, isize, actual_end);
 	ret = insert_inline_extent(trans, root, inode, start,
 				   inline_len, compressed_size,
 				   compress_type, compressed_pages);
-	BUG_ON(ret);
+	if (ret && ret != -ENOSPC) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	} else if (ret == -ENOSPC) {
+		ret = 1;
+		goto out;
+	}
+
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
 	btrfs_delalloc_release_metadata(inode, end + 1 - start);
 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
-	return 0;
+out:
+	btrfs_end_transaction(trans, root);
+	return ret;
 }
 
 struct async_extent {
@@ -293,7 +448,7 @@
 	struct async_extent *async_extent;
 
 	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
-	BUG_ON(!async_extent);
+	BUG_ON(!async_extent); /* -ENOMEM */
 	async_extent->start = start;
 	async_extent->ram_size = ram_size;
 	async_extent->compressed_size = compressed_size;
@@ -318,7 +473,8 @@
  * If this code finds it can't get good compression, it puts an
  * entry onto the work queue to write the uncompressed bytes.  This
  * makes sure that both compressed inodes and uncompressed inodes
- * are written in the same order that pdflush sent them down.
+ * are written in the same order that the flusher thread sent them
+ * down.
  */
 static noinline int compress_file_range(struct inode *inode,
 					struct page *locked_page,
@@ -327,7 +483,6 @@
 					int *num_added)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	u64 num_bytes;
 	u64 blocksize = root->sectorsize;
 	u64 actual_end;
@@ -343,9 +498,11 @@
 	int i;
 	int will_compress;
 	int compress_type = root->fs_info->compress_type;
+	int redirty = 0;
 
-	/* if this is a small write inside eof, kick off a defragbot */
-	if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
+	/* if this is a small write inside eof, kick off a defrag */
+	if ((end - start + 1) < 16 * 1024 &&
+	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
 		btrfs_add_inode_defrag(NULL, inode);
 
 	actual_end = min_t(u64, isize, end + 1);
@@ -380,7 +537,7 @@
 	 * a compressed extent to 128k.
 	 */
 	total_compressed = min(total_compressed, max_uncompressed);
-	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+	num_bytes = ALIGN(end - start + 1, blocksize);
 	num_bytes = max(blocksize,  num_bytes);
 	total_in = 0;
 	ret = 0;
@@ -404,6 +561,17 @@
 		if (BTRFS_I(inode)->force_compress)
 			compress_type = BTRFS_I(inode)->force_compress;
 
+		/*
+		 * we need to call clear_page_dirty_for_io on each
+		 * page in the range.  Otherwise applications with the file
+		 * mmap'd can wander in and change the page contents while
+		 * we are compressing them.
+		 *
+		 * If the compression fails for any reason, we set the pages
+		 * dirty again later on.
+		 */
+		extent_range_clear_dirty_for_io(inode, start, end);
+		redirty = 1;
 		ret = btrfs_compress_pages(compress_type,
 					   inode->i_mapping, start,
 					   total_compressed, pages,
@@ -422,51 +590,46 @@
 			 * sending it down to disk
 			 */
 			if (offset) {
-				kaddr = kmap_atomic(page, KM_USER0);
+				kaddr = kmap_atomic(page);
 				memset(kaddr + offset, 0,
 				       PAGE_CACHE_SIZE - offset);
-				kunmap_atomic(kaddr, KM_USER0);
+				kunmap_atomic(kaddr);
 			}
 			will_compress = 1;
 		}
 	}
 cont:
 	if (start == 0) {
-		trans = btrfs_join_transaction(root);
-		BUG_ON(IS_ERR(trans));
-		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-
 		/* lets try to make an inline extent */
 		if (ret || total_in < (actual_end - start)) {
 			/* we didn't compress the entire range, try
 			 * to make an uncompressed inline extent.
 			 */
-			ret = cow_file_range_inline(trans, root, inode,
-						    start, end, 0, 0, NULL);
+			ret = cow_file_range_inline(root, inode, start, end,
+						    0, 0, NULL);
 		} else {
 			/* try making a compressed inline extent */
-			ret = cow_file_range_inline(trans, root, inode,
-						    start, end,
+			ret = cow_file_range_inline(root, inode, start, end,
 						    total_compressed,
 						    compress_type, pages);
 		}
-		if (ret == 0) {
+		if (ret <= 0) {
+			unsigned long clear_flags = EXTENT_DELALLOC |
+				EXTENT_DEFRAG;
+			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
+
 			/*
-			 * inline extent creation worked, we don't need
-			 * to create any more async work items.  Unlock
-			 * and free up our temp pages.
+			 * inline extent creation worked or returned error,
+			 * we don't need to create any more async work items.
+			 * Unlock and free up our temp pages.
 			 */
-			extent_clear_unlock_delalloc(inode,
-			     &BTRFS_I(inode)->io_tree,
-			     start, end, NULL,
-			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
-			     EXTENT_CLEAR_DELALLOC |
-			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
-
-			btrfs_end_transaction(trans, root);
+			extent_clear_unlock_delalloc(inode, start, end, NULL,
+						     clear_flags, PAGE_UNLOCK |
+						     PAGE_CLEAR_DIRTY |
+						     PAGE_SET_WRITEBACK |
+						     PAGE_END_WRITEBACK);
 			goto free_pages_out;
 		}
-		btrfs_end_transaction(trans, root);
 	}
 
 	if (will_compress) {
@@ -475,15 +638,13 @@
 		 * up to a block size boundary so the allocator does sane
 		 * things
 		 */
-		total_compressed = (total_compressed + blocksize - 1) &
-			~(blocksize - 1);
+		total_compressed = ALIGN(total_compressed, blocksize);
 
 		/*
 		 * one last check to make sure the compression is really a
 		 * win, compare the page count read with the blocks on disk
 		 */
-		total_in = (total_in + PAGE_CACHE_SIZE - 1) &
-			~(PAGE_CACHE_SIZE - 1);
+		total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
 		if (total_compressed >= total_in) {
 			will_compress = 0;
 		} else {
@@ -541,13 +702,15 @@
 			__set_page_dirty_nobuffers(locked_page);
 			/* unlocked later on in the async handlers */
 		}
+		if (redirty)
+			extent_range_redirty_for_io(inode, start, end);
 		add_async_extent(async_cow, start, end - start + 1,
 				 0, NULL, 0, BTRFS_COMPRESS_NONE);
 		*num_added += 1;
 	}
 
 out:
-	return 0;
+	return ret;
 
 free_pages_out:
 	for (i = 0; i < nr_pages_ret; i++) {
@@ -570,7 +733,6 @@
 {
 	struct async_extent *async_extent;
 	u64 alloc_hint = 0;
-	struct btrfs_trans_handle *trans;
 	struct btrfs_key ins;
 	struct extent_map *em;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -581,7 +743,7 @@
 	if (list_empty(&async_cow->extents))
 		return 0;
 
-
+again:
 	while (!list_empty(&async_cow->extents)) {
 		async_extent = list_entry(async_cow->extents.next,
 					  struct async_extent, list);
@@ -597,7 +759,7 @@
 
 			lock_extent(io_tree, async_extent->start,
 					 async_extent->start +
-					 async_extent->ram_size - 1, GFP_NOFS);
+					 async_extent->ram_size - 1);
 
 			/* allocate blocks */
 			ret = cow_file_range(inode, async_cow->locked_page,
@@ -606,6 +768,8 @@
 					     async_extent->ram_size - 1,
 					     &page_started, &nr_written, 0);
 
+			/* JDM XXX */
+
 			/*
 			 * if page_started, cow_file_range inserted an
 			 * inline extent and took care of all the unlocking
@@ -619,27 +783,23 @@
 						  async_extent->ram_size - 1,
 						  btrfs_get_extent,
 						  WB_SYNC_ALL);
+			else if (ret)
+				unlock_page(async_cow->locked_page);
 			kfree(async_extent);
 			cond_resched();
 			continue;
 		}
 
 		lock_extent(io_tree, async_extent->start,
-			    async_extent->start + async_extent->ram_size - 1,
-			    GFP_NOFS);
+			    async_extent->start + async_extent->ram_size - 1);
 
-		trans = btrfs_join_transaction(root);
-		BUG_ON(IS_ERR(trans));
-		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-		ret = btrfs_reserve_extent(trans, root,
+		ret = btrfs_reserve_extent(root,
 					   async_extent->compressed_size,
 					   async_extent->compressed_size,
-					   0, alloc_hint,
-					   (u64)-1, &ins, 1);
-		btrfs_end_transaction(trans, root);
-
+					   0, alloc_hint, &ins, 1);
 		if (ret) {
 			int i;
+
 			for (i = 0; i < async_extent->nr_pages; i++) {
 				WARN_ON(async_extent->pages[i]->mapping);
 				page_cache_release(async_extent->pages[i]);
@@ -647,10 +807,14 @@
 			kfree(async_extent->pages);
 			async_extent->nr_pages = 0;
 			async_extent->pages = NULL;
-			unlock_extent(io_tree, async_extent->start,
-				      async_extent->start +
-				      async_extent->ram_size - 1, GFP_NOFS);
-			goto retry;
+
+			if (ret == -ENOSPC) {
+				unlock_extent(io_tree, async_extent->start,
+					      async_extent->start +
+					      async_extent->ram_size - 1);
+				goto retry;
+			}
+			goto out_free;
 		}
 
 		/*
@@ -662,21 +826,29 @@
 					async_extent->ram_size - 1, 0);
 
 		em = alloc_extent_map();
-		BUG_ON(!em);
+		if (!em) {
+			ret = -ENOMEM;
+			goto out_free_reserve;
+		}
 		em->start = async_extent->start;
 		em->len = async_extent->ram_size;
 		em->orig_start = em->start;
+		em->mod_start = em->start;
+		em->mod_len = em->len;
 
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
+		em->ram_bytes = async_extent->ram_size;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		em->compress_type = async_extent->compress_type;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+		em->generation = -1;
 
 		while (1) {
 			write_lock(&em_tree->lock);
-			ret = add_extent_mapping(em_tree, em);
+			ret = add_extent_mapping(em_tree, em, 1);
 			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
@@ -687,6 +859,9 @@
 						async_extent->ram_size - 1, 0);
 		}
 
+		if (ret)
+			goto out_free_reserve;
+
 		ret = btrfs_add_ordered_extent_compress(inode,
 						async_extent->start,
 						ins.objectid,
@@ -694,35 +869,45 @@
 						ins.offset,
 						BTRFS_ORDERED_COMPRESSED,
 						async_extent->compress_type);
-		BUG_ON(ret);
+		if (ret)
+			goto out_free_reserve;
 
 		/*
 		 * clear dirty, set writeback and unlock the pages.
 		 */
-		extent_clear_unlock_delalloc(inode,
-				&BTRFS_I(inode)->io_tree,
-				async_extent->start,
+		extent_clear_unlock_delalloc(inode, async_extent->start,
 				async_extent->start +
 				async_extent->ram_size - 1,
-				NULL, EXTENT_CLEAR_UNLOCK_PAGE |
-				EXTENT_CLEAR_UNLOCK |
-				EXTENT_CLEAR_DELALLOC |
-				EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
-
+				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+				PAGE_SET_WRITEBACK);
 		ret = btrfs_submit_compressed_write(inode,
 				    async_extent->start,
 				    async_extent->ram_size,
 				    ins.objectid,
 				    ins.offset, async_extent->pages,
 				    async_extent->nr_pages);
-
-		BUG_ON(ret);
 		alloc_hint = ins.objectid + ins.offset;
 		kfree(async_extent);
+		if (ret)
+			goto out;
 		cond_resched();
 	}
-
-	return 0;
+	ret = 0;
+out:
+	return ret;
+out_free_reserve:
+	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+out_free:
+	extent_clear_unlock_delalloc(inode, async_extent->start,
+				     async_extent->start +
+				     async_extent->ram_size - 1,
+				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
+				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+	kfree(async_extent);
+	goto again;
 }
 
 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
@@ -777,7 +962,6 @@
 				   int unlock)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	u64 alloc_hint = 0;
 	u64 num_bytes;
 	unsigned long ram_size;
@@ -789,40 +973,34 @@
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	int ret = 0;
 
-	BUG_ON(btrfs_is_free_space_inode(root, inode));
-	trans = btrfs_join_transaction(root);
-	BUG_ON(IS_ERR(trans));
-	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+	BUG_ON(btrfs_is_free_space_inode(inode));
 
-	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
+	num_bytes = ALIGN(end - start + 1, blocksize);
 	num_bytes = max(blocksize,  num_bytes);
 	disk_num_bytes = num_bytes;
-	ret = 0;
 
 	/* if this is a small write inside eof, kick off defrag */
-	if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
-		btrfs_add_inode_defrag(trans, inode);
+	if (num_bytes < 64 * 1024 &&
+	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
+		btrfs_add_inode_defrag(NULL, inode);
 
 	if (start == 0) {
 		/* lets try to make an inline extent */
-		ret = cow_file_range_inline(trans, root, inode,
-					    start, end, 0, 0, NULL);
+		ret = cow_file_range_inline(root, inode, start, end, 0, 0,
+					    NULL);
 		if (ret == 0) {
-			extent_clear_unlock_delalloc(inode,
-				     &BTRFS_I(inode)->io_tree,
-				     start, end, NULL,
-				     EXTENT_CLEAR_UNLOCK_PAGE |
-				     EXTENT_CLEAR_UNLOCK |
-				     EXTENT_CLEAR_DELALLOC |
-				     EXTENT_CLEAR_DIRTY |
-				     EXTENT_SET_WRITEBACK |
-				     EXTENT_END_WRITEBACK);
+			extent_clear_unlock_delalloc(inode, start, end, NULL,
+				     EXTENT_LOCKED | EXTENT_DELALLOC |
+				     EXTENT_DEFRAG, PAGE_UNLOCK |
+				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
+				     PAGE_END_WRITEBACK);
 
 			*nr_written = *nr_written +
 			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
 			*page_started = 1;
-			ret = 0;
 			goto out;
+		} else if (ret < 0) {
+			goto out_unlock;
 		}
 	}
 
@@ -836,26 +1014,35 @@
 		unsigned long op;
 
 		cur_alloc_size = disk_num_bytes;
-		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
+		ret = btrfs_reserve_extent(root, cur_alloc_size,
 					   root->sectorsize, 0, alloc_hint,
-					   (u64)-1, &ins, 1);
-		BUG_ON(ret);
+					   &ins, 1);
+		if (ret < 0)
+			goto out_unlock;
 
 		em = alloc_extent_map();
-		BUG_ON(!em);
+		if (!em) {
+			ret = -ENOMEM;
+			goto out_reserve;
+		}
 		em->start = start;
 		em->orig_start = em->start;
 		ram_size = ins.offset;
 		em->len = ins.offset;
+		em->mod_start = em->start;
+		em->mod_len = em->len;
 
 		em->block_start = ins.objectid;
 		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
+		em->ram_bytes = ram_size;
 		em->bdev = root->fs_info->fs_devices->latest_bdev;
 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
+		em->generation = -1;
 
 		while (1) {
 			write_lock(&em_tree->lock);
-			ret = add_extent_mapping(em_tree, em);
+			ret = add_extent_mapping(em_tree, em, 1);
 			write_unlock(&em_tree->lock);
 			if (ret != -EEXIST) {
 				free_extent_map(em);
@@ -864,17 +1051,21 @@
 			btrfs_drop_extent_cache(inode, start,
 						start + ram_size - 1, 0);
 		}
+		if (ret)
+			goto out_reserve;
 
 		cur_alloc_size = ins.offset;
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
 					       ram_size, cur_alloc_size, 0);
-		BUG_ON(ret);
+		if (ret)
+			goto out_reserve;
 
 		if (root->root_key.objectid ==
 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
 			ret = btrfs_reloc_clone_csums(inode, start,
 						      cur_alloc_size);
-			BUG_ON(ret);
+			if (ret)
+				goto out_reserve;
 		}
 
 		if (disk_num_bytes < cur_alloc_size)
@@ -887,23 +1078,30 @@
 		 * Do set the Private2 bit so we know this page was properly
 		 * setup for writepage
 		 */
-		op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
-		op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
-			EXTENT_SET_PRIVATE2;
-
-		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
-					     start, start + ram_size - 1,
-					     locked_page, op);
+		op = unlock ? PAGE_UNLOCK : 0;
+		op |= PAGE_SET_PRIVATE2;
+
+		extent_clear_unlock_delalloc(inode, start,
+					     start + ram_size - 1, locked_page,
+					     EXTENT_LOCKED | EXTENT_DELALLOC,
+					     op);
 		disk_num_bytes -= cur_alloc_size;
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
 out:
-	ret = 0;
-	btrfs_end_transaction(trans, root);
-
 	return ret;
+
+out_reserve:
+	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+out_unlock:
+	extent_clear_unlock_delalloc(inode, start, end, locked_page,
+				     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+				     EXTENT_DELALLOC | EXTENT_DEFRAG,
+				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
+	goto out;
 }
 
 /*
@@ -918,8 +1116,10 @@
 	compress_file_range(async_cow->inode, async_cow->locked_page,
 			    async_cow->start, async_cow->end, async_cow,
 			    &num_added);
-	if (num_added == 0)
+	if (num_added == 0) {
+		btrfs_add_delayed_iput(async_cow->inode);
 		async_cow->inode = NULL;
+	}
 }
 
 /*
@@ -937,10 +1137,8 @@
 	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
 		PAGE_CACHE_SHIFT;
 
-	atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages);
-
-	if (atomic_read(&root->fs_info->async_delalloc_pages) <
-	    5 * 1042 * 1024 &&
+	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
+	    5 * 1024 * 1024 &&
 	    waitqueue_active(&root->fs_info->async_submit_wait))
 		wake_up(&root->fs_info->async_submit_wait);
 
@@ -952,6 +1150,8 @@
 {
 	struct async_cow *async_cow;
 	async_cow = container_of(work, struct async_cow, work);
+	if (async_cow->inode)
+		btrfs_add_delayed_iput(async_cow->inode);
 	kfree(async_cow);
 }
 
@@ -963,14 +1163,14 @@
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	unsigned long nr_pages;
 	u64 cur_end;
-	int limit = 10 * 1024 * 1042;
+	int limit = 10 * 1024 * 1024;
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
 			 1, 0, NULL, GFP_NOFS);
 	while (start < end) {
 		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
-		BUG_ON(!async_cow);
-		async_cow->inode = inode;
+		BUG_ON(!async_cow); /* -ENOMEM */
+		async_cow->inode = igrab(inode);
 		async_cow->root = root;
 		async_cow->locked_page = locked_page;
 		async_cow->start = start;
@@ -1059,8 +1259,10 @@
 	u64 extent_offset;
 	u64 disk_bytenr;
 	u64 num_bytes;
+	u64 disk_num_bytes;
+	u64 ram_bytes;
 	int extent_type;
-	int ret;
+	int ret, err;
 	int type;
 	int nocow;
 	int check_prev = 1;
@@ -1068,17 +1270,36 @@
 	u64 ino = btrfs_ino(inode);
 
 	path = btrfs_alloc_path();
-	if (!path)
+	if (!path) {
+		extent_clear_unlock_delalloc(inode, start, end, locked_page,
+					     EXTENT_LOCKED | EXTENT_DELALLOC |
+					     EXTENT_DO_ACCOUNTING |
+					     EXTENT_DEFRAG, PAGE_UNLOCK |
+					     PAGE_CLEAR_DIRTY |
+					     PAGE_SET_WRITEBACK |
+					     PAGE_END_WRITEBACK);
 		return -ENOMEM;
+	}
 
-	nolock = btrfs_is_free_space_inode(root, inode);
+	nolock = btrfs_is_free_space_inode(inode);
 
 	if (nolock)
 		trans = btrfs_join_transaction_nolock(root);
 	else
 		trans = btrfs_join_transaction(root);
 
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans)) {
+		extent_clear_unlock_delalloc(inode, start, end, locked_page,
+					     EXTENT_LOCKED | EXTENT_DELALLOC |
+					     EXTENT_DO_ACCOUNTING |
+					     EXTENT_DEFRAG, PAGE_UNLOCK |
+					     PAGE_CLEAR_DIRTY |
+					     PAGE_SET_WRITEBACK |
+					     PAGE_END_WRITEBACK);
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
+
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	cow_start = (u64)-1;
@@ -1086,7 +1307,10 @@
 	while (1) {
 		ret = btrfs_lookup_file_extent(trans, root, path, ino,
 					       cur_offset, 0);
-		BUG_ON(ret < 0);
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto error;
+		}
 		if (ret > 0 && path->slots[0] > 0 && check_prev) {
 			leaf = path->nodes[0];
 			btrfs_item_key_to_cpu(leaf, &found_key,
@@ -1100,8 +1324,10 @@
 		leaf = path->nodes[0];
 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 			ret = btrfs_next_leaf(root, path);
-			if (ret < 0)
-				BUG_ON(1);
+			if (ret < 0) {
+				btrfs_abort_transaction(trans, root, ret);
+				goto error;
+			}
 			if (ret > 0)
 				break;
 			leaf = path->nodes[0];
@@ -1127,12 +1353,15 @@
 				    struct btrfs_file_extent_item);
 		extent_type = btrfs_file_extent_type(leaf, fi);
 
+		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
 		if (extent_type == BTRFS_FILE_EXTENT_REG ||
 		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 			extent_offset = btrfs_file_extent_offset(leaf, fi);
 			extent_end = found_key.offset +
 				btrfs_file_extent_num_bytes(leaf, fi);
+			disk_num_bytes =
+				btrfs_file_extent_disk_num_bytes(leaf, fi);
 			if (extent_end <= start) {
 				path->slots[0]++;
 				goto next_slot;
@@ -1186,10 +1415,13 @@
 
 		btrfs_release_path(path);
 		if (cow_start != (u64)-1) {
-			ret = cow_file_range(inode, locked_page, cow_start,
-					found_key.offset - 1, page_started,
-					nr_written, 1);
-			BUG_ON(ret);
+			ret = cow_file_range(inode, locked_page,
+					     cow_start, found_key.offset - 1,
+					     page_started, nr_written, 1);
+			if (ret) {
+				btrfs_abort_transaction(trans, root, ret);
+				goto error;
+			}
 			cow_start = (u64)-1;
 		}
 
@@ -1198,17 +1430,23 @@
 			struct extent_map_tree *em_tree;
 			em_tree = &BTRFS_I(inode)->extent_tree;
 			em = alloc_extent_map();
-			BUG_ON(!em);
+			BUG_ON(!em); /* -ENOMEM */
 			em->start = cur_offset;
-			em->orig_start = em->start;
+			em->orig_start = found_key.offset - extent_offset;
 			em->len = num_bytes;
 			em->block_len = num_bytes;
 			em->block_start = disk_bytenr;
+			em->orig_block_len = disk_num_bytes;
+			em->ram_bytes = ram_bytes;
 			em->bdev = root->fs_info->fs_devices->latest_bdev;
+			em->mod_start = em->start;
+			em->mod_len = em->len;
 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
+			set_bit(EXTENT_FLAG_FILLING, &em->flags);
+			em->generation = -1;
 			while (1) {
 				write_lock(&em_tree->lock);
-				ret = add_extent_mapping(em_tree, em);
+				ret = add_extent_mapping(em_tree, em, 1);
 				write_unlock(&em_tree->lock);
 				if (ret != -EEXIST) {
 					free_extent_map(em);
@@ -1224,43 +1462,58 @@
 
 		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
 					       num_bytes, num_bytes, type);
-		BUG_ON(ret);
+		BUG_ON(ret); /* -ENOMEM */
 
 		if (root->root_key.objectid ==
 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
 			ret = btrfs_reloc_clone_csums(inode, cur_offset,
 						      num_bytes);
-			BUG_ON(ret);
+			if (ret) {
+				btrfs_abort_transaction(trans, root, ret);
+				goto error;
+			}
 		}
 
-		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
-				cur_offset, cur_offset + num_bytes - 1,
-				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
-				EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
-				EXTENT_SET_PRIVATE2);
+		extent_clear_unlock_delalloc(inode, cur_offset,
+					     cur_offset + num_bytes - 1,
+					     locked_page, EXTENT_LOCKED |
+					     EXTENT_DELALLOC, PAGE_UNLOCK |
+					     PAGE_SET_PRIVATE2);
 		cur_offset = extent_end;
 		if (cur_offset > end)
 			break;
 	}
 	btrfs_release_path(path);
 
-	if (cur_offset <= end && cow_start == (u64)-1)
+	if (cur_offset <= end && cow_start == (u64)-1) {
 		cow_start = cur_offset;
+		cur_offset = end;
+	}
+
 	if (cow_start != (u64)-1) {
 		ret = cow_file_range(inode, locked_page, cow_start, end,
 				     page_started, nr_written, 1);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto error;
+		}
 	}
 
-	if (nolock) {
-		ret = btrfs_end_transaction_nolock(trans, root);
-		BUG_ON(ret);
-	} else {
-		ret = btrfs_end_transaction(trans, root);
-		BUG_ON(ret);
-	}
+error:
+	err = btrfs_end_transaction(trans, root);
+	if (!ret)
+		ret = err;
+
+	if (ret && cur_offset < end)
+		extent_clear_unlock_delalloc(inode, cur_offset, end,
+					     locked_page, EXTENT_LOCKED |
+					     EXTENT_DELALLOC | EXTENT_DEFRAG |
+					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
+					     PAGE_CLEAR_DIRTY |
+					     PAGE_SET_WRITEBACK |
+					     PAGE_END_WRITEBACK);
 	btrfs_free_path(path);
-	return 0;
+	return ret;
 }
 
 /*
@@ -1273,20 +1526,23 @@
 	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 
-	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 1, nr_written);
-	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
+	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
-	else if (!btrfs_test_opt(root, COMPRESS) &&
-		 !(BTRFS_I(inode)->force_compress) &&
-		 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
+	} else if (!btrfs_test_opt(root, COMPRESS) &&
+		   !(BTRFS_I(inode)->force_compress) &&
+		   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
 		ret = cow_file_range(inode, locked_page, start, end,
 				      page_started, nr_written, 1);
-	else
+	} else {
+		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+			&BTRFS_I(inode)->runtime_flags);
 		ret = cow_file_range_async(inode, locked_page, start, end,
 					   page_started, nr_written);
+	}
 	return ret;
 }
 
@@ -1321,13 +1577,53 @@
 	spin_unlock(&BTRFS_I(inode)->lock);
 }
 
+static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
+				      struct inode *inode)
+{
+	spin_lock(&root->delalloc_lock);
+	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
+			      &root->delalloc_inodes);
+		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			&BTRFS_I(inode)->runtime_flags);
+		root->nr_delalloc_inodes++;
+		if (root->nr_delalloc_inodes == 1) {
+			spin_lock(&root->fs_info->delalloc_root_lock);
+			BUG_ON(!list_empty(&root->delalloc_root));
+			list_add_tail(&root->delalloc_root,
+				      &root->fs_info->delalloc_roots);
+			spin_unlock(&root->fs_info->delalloc_root_lock);
+		}
+	}
+	spin_unlock(&root->delalloc_lock);
+}
+
+static void btrfs_del_delalloc_inode(struct btrfs_root *root,
+				     struct inode *inode)
+{
+	spin_lock(&root->delalloc_lock);
+	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
+		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
+		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			  &BTRFS_I(inode)->runtime_flags);
+		root->nr_delalloc_inodes--;
+		if (!root->nr_delalloc_inodes) {
+			spin_lock(&root->fs_info->delalloc_root_lock);
+			BUG_ON(list_empty(&root->delalloc_root));
+			list_del_init(&root->delalloc_root);
+			spin_unlock(&root->fs_info->delalloc_root_lock);
+		}
+	}
+	spin_unlock(&root->delalloc_lock);
+}
+
 /*
  * extent_io.c set_bit_hook, used to track delayed allocation
  * bytes in this file, and to maintain the list of inodes that
  * have pending delalloc work to be done.
  */
 static void btrfs_set_bit_hook(struct inode *inode,
-			       struct extent_state *state, int *bits)
+			       struct extent_state *state, unsigned long *bits)
 {
 
 	/*
@@ -1338,7 +1634,7 @@
 	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 		u64 len = state->end + 1 - state->start;
-		bool do_list = !btrfs_is_free_space_inode(root, inode);
+		bool do_list = !btrfs_is_free_space_inode(inode);
 
 		if (*bits & EXTENT_FIRST_DELALLOC) {
 			*bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1348,14 +1644,14 @@
 			spin_unlock(&BTRFS_I(inode)->lock);
 		}
 
-		spin_lock(&root->fs_info->delalloc_lock);
+		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
+				     root->fs_info->delalloc_batch);
+		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->delalloc_bytes += len;
-		root->fs_info->delalloc_bytes += len;
-		if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
-				      &root->fs_info->delalloc_inodes);
-		}
-		spin_unlock(&root->fs_info->delalloc_lock);
+		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+					 &BTRFS_I(inode)->runtime_flags))
+			btrfs_add_delalloc_inodes(root, inode);
+		spin_unlock(&BTRFS_I(inode)->lock);
 	}
 }
 
@@ -1363,7 +1659,8 @@
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
 static void btrfs_clear_bit_hook(struct inode *inode,
-				 struct extent_state *state, int *bits)
+				 struct extent_state *state,
+				 unsigned long *bits)
 {
 	/*
 	 * set_bit and clear bit hooks normally require _irqsave/restore
@@ -1373,7 +1670,7 @@
 	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
 		u64 len = state->end + 1 - state->start;
-		bool do_list = !btrfs_is_free_space_inode(root, inode);
+		bool do_list = !btrfs_is_free_space_inode(inode);
 
 		if (*bits & EXTENT_FIRST_DELALLOC) {
 			*bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1387,18 +1684,18 @@
 			btrfs_delalloc_release_metadata(inode, len);
 
 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
-		    && do_list)
+		    && do_list && !(state->state & EXTENT_NORESERVE))
 			btrfs_free_reserved_data_space(inode, len);
 
-		spin_lock(&root->fs_info->delalloc_lock);
-		root->fs_info->delalloc_bytes -= len;
+		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
+				     root->fs_info->delalloc_batch);
+		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->delalloc_bytes -= len;
-
 		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
-		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
-			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
-		}
-		spin_unlock(&root->fs_info->delalloc_lock);
+		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
+			     &BTRFS_I(inode)->runtime_flags))
+			btrfs_del_delalloc_inode(root, inode);
+		spin_unlock(&BTRFS_I(inode)->lock);
 	}
 }
 
@@ -1406,12 +1703,11 @@
  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
  * we don't create bios that span stripes or chunks
  */
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
 			 size_t size, struct bio *bio,
 			 unsigned long bio_flags)
 {
 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
-	struct btrfs_mapping_tree *map_tree;
 	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
@@ -1421,14 +1717,14 @@
 		return 0;
 
 	length = bio->bi_size;
-	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
-	ret = btrfs_map_block(map_tree, READ, logical,
+	ret = btrfs_map_block(root->fs_info, rw, logical,
 			      &map_length, NULL, 0);
-
+	/* Will always return 0 with map_multi == NULL */
+	BUG_ON(ret < 0);
 	if (map_length < length + size)
 		return 1;
-	return ret;
+	return 0;
 }
 
 /*
@@ -1448,7 +1744,7 @@
 	int ret = 0;
 
 	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
-	BUG_ON(ret);
+	BUG_ON(ret); /* -ENOMEM */
 	return 0;
 }
 
@@ -1465,7 +1761,12 @@
 			  u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
+	int ret;
+
+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
+	if (ret)
+		bio_endio(bio, ret);
+	return ret;
 }
 
 /*
@@ -1479,39 +1780,54 @@
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 	int skip_sum;
+	int metadata = 0;
+	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
-	if (btrfs_is_free_space_inode(root, inode))
-		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
-	else
-		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-	BUG_ON(ret);
+	if (btrfs_is_free_space_inode(inode))
+		metadata = 2;
 
 	if (!(rw & REQ_WRITE)) {
+		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
+		if (ret)
+			goto out;
+
 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
-			return btrfs_submit_compressed_read(inode, bio,
-						    mirror_num, bio_flags);
+			ret = btrfs_submit_compressed_read(inode, bio,
+							   mirror_num,
+							   bio_flags);
+			goto out;
 		} else if (!skip_sum) {
 			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
 			if (ret)
-				return ret;
+				goto out;
 		}
 		goto mapit;
-	} else if (!skip_sum) {
+	} else if (async && !skip_sum) {
 		/* csum items have already been cloned */
 		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
 			goto mapit;
 		/* we're doing a write, do the async checksumming */
-		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
 				   bio_flags, bio_offset,
 				   __btrfs_submit_bio_start,
 				   __btrfs_submit_bio_done);
+		goto out;
+	} else if (!skip_sum) {
+		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
+		if (ret)
+			goto out;
 	}
 
 mapit:
-	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
+	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
+
+out:
+	if (ret < 0)
+		bio_endio(bio, ret);
+	return ret;
 }
 
 /*
@@ -1525,8 +1841,10 @@
 	struct btrfs_ordered_sum *sum;
 
 	list_for_each_entry(sum, list, list) {
+		trans->adding_csums = 1;
 		btrfs_csum_file_blocks(trans,
 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
+		trans->adding_csums = 0;
 	}
 	return 0;
 }
@@ -1534,8 +1852,7 @@
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state)
 {
-	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
-		WARN_ON(1);
+	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
 				   cached_state, GFP_NOFS);
 }
@@ -1555,6 +1872,7 @@
 	struct inode *inode;
 	u64 page_start;
 	u64 page_end;
+	int ret;
 
 	fixup = container_of(work, struct btrfs_writepage_fixup, work);
 	page = fixup->page;
@@ -1570,7 +1888,7 @@
 	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
 
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
-			 &cached_state, GFP_NOFS);
+			 &cached_state);
 
 	/* already ordered? We're done */
 	if (PagePrivate2(page))
@@ -1582,12 +1900,21 @@
 				     page_end, &cached_state, GFP_NOFS);
 		unlock_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
 		goto again;
 	}
 
-	BUG();
+	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
+	if (ret) {
+		mapping_set_error(page->mapping, ret);
+		end_extent_writepage(page, ret, page_start, page_end);
+		ClearPageChecked(page);
+		goto out;
+	}
+
 	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
 	ClearPageChecked(page);
+	set_page_dirty(page);
 out:
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
 			     &cached_state, GFP_NOFS);
@@ -1630,7 +1957,7 @@
 	fixup->work.func = btrfs_writepage_fixup_worker;
 	fixup->page = page;
 	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
-	return -EAGAIN;
+	return -EBUSY;
 }
 
 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
@@ -1645,7 +1972,6 @@
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_key ins;
-	u64 hint;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -1663,15 +1989,17 @@
 	 * the caller is expected to unpin it and allow it to be merged
 	 * with the others.
 	 */
-	ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes,
-				 &hint, 0);
-	BUG_ON(ret);
+	ret = btrfs_drop_extents(trans, root, inode, file_pos,
+				 file_pos + num_bytes, 0);
+	if (ret)
+		goto out;
 
 	ins.objectid = btrfs_ino(inode);
 	ins.offset = file_pos;
 	ins.type = BTRFS_EXTENT_DATA_KEY;
 	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
-	BUG_ON(ret);
+	if (ret)
+		goto out;
 	leaf = path->nodes[0];
 	fi = btrfs_item_ptr(leaf, path->slots[0],
 			    struct btrfs_file_extent_item);
@@ -1686,10 +2014,8 @@
 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
 
-	btrfs_unlock_up_safe(path, 1);
-	btrfs_set_lock_blocking(leaf);
-
 	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
 
 	inode_add_bytes(inode, num_bytes);
 
@@ -1699,12 +2025,653 @@
 	ret = btrfs_alloc_reserved_file_extent(trans, root,
 					root->root_key.objectid,
 					btrfs_ino(inode), file_pos, &ins);
-	BUG_ON(ret);
+out:
 	btrfs_free_path(path);
 
+	return ret;
+}
+
+/* snapshot-aware defrag */
+struct sa_defrag_extent_backref {
+	struct rb_node node;
+	struct old_sa_defrag_extent *old;
+	u64 root_id;
+	u64 inum;
+	u64 file_pos;
+	u64 extent_offset;
+	u64 num_bytes;
+	u64 generation;
+};
+
+struct old_sa_defrag_extent {
+	struct list_head list;
+	struct new_sa_defrag_extent *new;
+
+	u64 extent_offset;
+	u64 bytenr;
+	u64 offset;
+	u64 len;
+	int count;
+};
+
+struct new_sa_defrag_extent {
+	struct rb_root root;
+	struct list_head head;
+	struct btrfs_path *path;
+	struct inode *inode;
+	u64 file_pos;
+	u64 len;
+	u64 bytenr;
+	u64 disk_len;
+	u8 compress_type;
+};
+
+static int backref_comp(struct sa_defrag_extent_backref *b1,
+			struct sa_defrag_extent_backref *b2)
+{
+	if (b1->root_id < b2->root_id)
+		return -1;
+	else if (b1->root_id > b2->root_id)
+		return 1;
+
+	if (b1->inum < b2->inum)
+		return -1;
+	else if (b1->inum > b2->inum)
+		return 1;
+
+	if (b1->file_pos < b2->file_pos)
+		return -1;
+	else if (b1->file_pos > b2->file_pos)
+		return 1;
+
+	/*
+	 * [------------------------------] ===> (a range of space)
+	 *     |<--->|   |<---->| =============> (fs/file tree A)
+	 * |<---------------------------->| ===> (fs/file tree B)
+	 *
+	 * A range of space can refer to two file extents in one tree while
+	 * refer to only one file extent in another tree.
+	 *
+	 * So we may process the same disk offset more than once (two extents
+	 * in A) yet land on the same extent (one extent in B), and therefore
+	 * insert two identical backrefs (both referring to the extent in B).
+	 */
 	return 0;
 }
 
+static void backref_insert(struct rb_root *root,
+			   struct sa_defrag_extent_backref *backref)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct sa_defrag_extent_backref *entry;
+	int ret;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
+
+		ret = backref_comp(backref, entry);
+		if (ret < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&backref->node, parent, p);
+	rb_insert_color(&backref->node, root);
+}
+
+/*
+ * Note the backref might have changed, in which case we just return 0.
+ */
+static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
+				       void *ctx)
+{
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_fs_info *fs_info;
+	struct old_sa_defrag_extent *old = ctx;
+	struct new_sa_defrag_extent *new = old->new;
+	struct btrfs_path *path = new->path;
+	struct btrfs_key key;
+	struct btrfs_root *root;
+	struct sa_defrag_extent_backref *backref;
+	struct extent_buffer *leaf;
+	struct inode *inode = new->inode;
+	int slot;
+	int ret;
+	u64 extent_offset;
+	u64 num_bytes;
+
+	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
+	    inum == btrfs_ino(inode))
+		return 0;
+
+	key.objectid = root_id;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	fs_info = BTRFS_I(inode)->root->fs_info;
+	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(root)) {
+		if (PTR_ERR(root) == -ENOENT)
+			return 0;
+		WARN_ON(1);
+		pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
+			 inum, offset, root_id);
+		return PTR_ERR(root);
+	}
+
+	key.objectid = inum;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	if (offset > (u64)-1 << 32)
+		key.offset = 0;
+	else
+		key.offset = offset;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		WARN_ON(1);
+		return ret;
+	}
+	ret = 0;
+
+	while (1) {
+		cond_resched();
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				goto out;
+			} else if (ret > 0) {
+				ret = 0;
+				goto out;
+			}
+			continue;
+		}
+
+		path->slots[0]++;
+
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.objectid > inum)
+			goto out;
+
+		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+
+		extent = btrfs_item_ptr(leaf, slot,
+					struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
+			continue;
+
+		/*
+		 * 'offset' refers to the exact key.offset,
+		 * NOT the 'offset' field in btrfs_extent_data_ref, i.e.
+		 * (key.offset - extent_offset).
+		 */
+		if (key.offset != offset)
+			continue;
+
+		extent_offset = btrfs_file_extent_offset(leaf, extent);
+		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
+
+		if (extent_offset >= old->extent_offset + old->offset +
+		    old->len || extent_offset + num_bytes <=
+		    old->extent_offset + old->offset)
+			continue;
+		break;
+	}
+
+	backref = kmalloc(sizeof(*backref), GFP_NOFS);
+	if (!backref) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	backref->root_id = root_id;
+	backref->inum = inum;
+	backref->file_pos = offset;
+	backref->num_bytes = num_bytes;
+	backref->extent_offset = extent_offset;
+	backref->generation = btrfs_file_extent_generation(leaf, extent);
+	backref->old = old;
+	backref_insert(&new->root, backref);
+	old->count++;
+out:
+	btrfs_release_path(path);
+	WARN_ON(ret);
+	return ret;
+}
+
+static noinline bool record_extent_backrefs(struct btrfs_path *path,
+				   struct new_sa_defrag_extent *new)
+{
+	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
+	struct old_sa_defrag_extent *old, *tmp;
+	int ret;
+
+	new->path = path;
+
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		ret = iterate_inodes_from_logical(old->bytenr +
+						  old->extent_offset, fs_info,
+						  path, record_one_backref,
+						  old);
+		BUG_ON(ret < 0 && ret != -ENOENT);
+
+		/* no backref to be processed for this extent */
+		if (!old->count) {
+			list_del(&old->list);
+			kfree(old);
+		}
+	}
+
+	if (list_empty(&new->head))
+		return false;
+
+	return true;
+}
+
+static int relink_is_mergable(struct extent_buffer *leaf,
+			      struct btrfs_file_extent_item *fi,
+			      struct new_sa_defrag_extent *new)
+{
+	if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
+		return 0;
+
+	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
+		return 0;
+
+	if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
+		return 0;
+
+	if (btrfs_file_extent_encryption(leaf, fi) ||
+	    btrfs_file_extent_other_encoding(leaf, fi))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * Note the backref might have changed, in which case we just return 0.
+ */
+static noinline int relink_extent_backref(struct btrfs_path *path,
+				 struct sa_defrag_extent_backref *prev,
+				 struct sa_defrag_extent_backref *backref)
+{
+	struct btrfs_file_extent_item *extent;
+	struct btrfs_file_extent_item *item;
+	struct btrfs_ordered_extent *ordered;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct old_sa_defrag_extent *old = backref->old;
+	struct new_sa_defrag_extent *new = old->new;
+	struct inode *src_inode = new->inode;
+	struct inode *inode;
+	struct extent_state *cached = NULL;
+	int ret = 0;
+	u64 start;
+	u64 len;
+	u64 lock_start;
+	u64 lock_end;
+	bool merge = false;
+	int index;
+
+	if (prev && prev->root_id == backref->root_id &&
+	    prev->inum == backref->inum &&
+	    prev->file_pos + prev->num_bytes == backref->file_pos)
+		merge = true;
+
+	/* step 1: get root */
+	key.objectid = backref->root_id;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	fs_info = BTRFS_I(src_inode)->root->fs_info;
+	index = srcu_read_lock(&fs_info->subvol_srcu);
+
+	root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(root)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+		if (PTR_ERR(root) == -ENOENT)
+			return 0;
+		return PTR_ERR(root);
+	}
+
+	/* step 2: get inode */
+	key.objectid = backref->inum;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+
+	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
+	if (IS_ERR(inode)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, index);
+		return 0;
+	}
+
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
+
+	/* step 3: relink backref */
+	lock_start = backref->file_pos;
+	lock_end = backref->file_pos + backref->num_bytes - 1;
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+			 0, &cached);
+
+	ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
+	if (ordered) {
+		btrfs_put_ordered_extent(ordered);
+		goto out_unlock;
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_unlock;
+	}
+
+	key.objectid = backref->inum;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = backref->file_pos;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		goto out_free_path;
+	} else if (ret > 0) {
+		ret = 0;
+		goto out_free_path;
+	}
+
+	extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				struct btrfs_file_extent_item);
+
+	if (btrfs_file_extent_generation(path->nodes[0], extent) !=
+	    backref->generation)
+		goto out_free_path;
+
+	btrfs_release_path(path);
+
+	start = backref->file_pos;
+	if (backref->extent_offset < old->extent_offset + old->offset)
+		start += old->extent_offset + old->offset -
+			 backref->extent_offset;
+
+	len = min(backref->extent_offset + backref->num_bytes,
+		  old->extent_offset + old->offset + old->len);
+	len -= max(backref->extent_offset, old->extent_offset + old->offset);
+
+	ret = btrfs_drop_extents(trans, root, inode, start,
+				 start + len, 1);
+	if (ret)
+		goto out_free_path;
+again:
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = start;
+
+	path->leave_spinning = 1;
+	if (merge) {
+		struct btrfs_file_extent_item *fi;
+		u64 extent_len;
+		struct btrfs_key found_key;
+
+		ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+		if (ret < 0)
+			goto out_free_path;
+
+		path->slots[0]--;
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
+
+		if (extent_len + found_key.offset == start &&
+		    relink_is_mergable(leaf, fi, new)) {
+			btrfs_set_file_extent_num_bytes(leaf, fi,
+							extent_len + len);
+			btrfs_mark_buffer_dirty(leaf);
+			inode_add_bytes(inode, len);
+
+			ret = 1;
+			goto out_free_path;
+		} else {
+			merge = false;
+			btrfs_release_path(path);
+			goto again;
+		}
+	}
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+					sizeof(*extent));
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_free_path;
+	}
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_file_extent_item);
+	btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
+	btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
+	btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
+	btrfs_set_file_extent_num_bytes(leaf, item, len);
+	btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
+	btrfs_set_file_extent_generation(leaf, item, trans->transid);
+	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
+	btrfs_set_file_extent_compression(leaf, item, new->compress_type);
+	btrfs_set_file_extent_encryption(leaf, item, 0);
+	btrfs_set_file_extent_other_encoding(leaf, item, 0);
+
+	btrfs_mark_buffer_dirty(leaf);
+	inode_add_bytes(inode, len);
+	btrfs_release_path(path);
+
+	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
+			new->disk_len, 0,
+			backref->root_id, backref->inum,
+			new->file_pos, 0);	/* start - extent_offset */
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_free_path;
+	}
+
+	ret = 1;
+out_free_path:
+	btrfs_release_path(path);
+	path->leave_spinning = 0;
+	btrfs_end_transaction(trans, root);
+out_unlock:
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
+			     &cached, GFP_NOFS);
+	iput(inode);
+	return ret;
+}
+
+static void relink_file_extents(struct new_sa_defrag_extent *new)
+{
+	struct btrfs_path *path;
+	struct old_sa_defrag_extent *old, *tmp;
+	struct sa_defrag_extent_backref *backref;
+	struct sa_defrag_extent_backref *prev = NULL;
+	struct inode *inode;
+	struct btrfs_root *root;
+	struct rb_node *node;
+	int ret;
+
+	inode = new->inode;
+	root = BTRFS_I(inode)->root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return;
+
+	if (!record_extent_backrefs(path, new)) {
+		btrfs_free_path(path);
+		goto out;
+	}
+	btrfs_release_path(path);
+
+	while (1) {
+		node = rb_first(&new->root);
+		if (!node)
+			break;
+		rb_erase(node, &new->root);
+
+		backref = rb_entry(node, struct sa_defrag_extent_backref, node);
+
+		ret = relink_extent_backref(path, prev, backref);
+		WARN_ON(ret < 0);
+
+		kfree(prev);
+
+		if (ret == 1)
+			prev = backref;
+		else
+			prev = NULL;
+		cond_resched();
+	}
+	kfree(prev);
+
+	btrfs_free_path(path);
+
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		list_del(&old->list);
+		kfree(old);
+	}
+out:
+	atomic_dec(&root->fs_info->defrag_running);
+	wake_up(&root->fs_info->transaction_wait);
+
+	kfree(new);
+}
+
+static struct new_sa_defrag_extent *
+record_old_file_extents(struct inode *inode,
+			struct btrfs_ordered_extent *ordered)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct old_sa_defrag_extent *old, *tmp;
+	struct new_sa_defrag_extent *new;
+	int ret;
+
+	new = kmalloc(sizeof(*new), GFP_NOFS);
+	if (!new)
+		return NULL;
+
+	new->inode = inode;
+	new->file_pos = ordered->file_offset;
+	new->len = ordered->len;
+	new->bytenr = ordered->start;
+	new->disk_len = ordered->disk_len;
+	new->compress_type = ordered->compress_type;
+	new->root = RB_ROOT;
+	INIT_LIST_HEAD(&new->head);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		goto out_kfree;
+
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = new->file_pos;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out_free_path;
+	if (ret > 0 && path->slots[0] > 0)
+		path->slots[0]--;
+
+	/* find out all the old extents for the file range */
+	while (1) {
+		struct btrfs_file_extent_item *extent;
+		struct extent_buffer *l;
+		int slot;
+		u64 num_bytes;
+		u64 offset;
+		u64 end;
+		u64 disk_bytenr;
+		u64 extent_offset;
+
+		l = path->nodes[0];
+		slot = path->slots[0];
+
+		if (slot >= btrfs_header_nritems(l)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				goto out_free_list;
+			else if (ret > 0)
+				break;
+			continue;
+		}
+
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		if (key.objectid != btrfs_ino(inode))
+			break;
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			break;
+		if (key.offset >= new->file_pos + new->len)
+			break;
+
+		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
+
+		num_bytes = btrfs_file_extent_num_bytes(l, extent);
+		if (key.offset + num_bytes < new->file_pos)
+			goto next;
+
+		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
+		if (!disk_bytenr)
+			goto next;
+
+		extent_offset = btrfs_file_extent_offset(l, extent);
+
+		old = kmalloc(sizeof(*old), GFP_NOFS);
+		if (!old)
+			goto out_free_list;
+
+		offset = max(new->file_pos, key.offset);
+		end = min(new->file_pos + new->len, key.offset + num_bytes);
+
+		old->bytenr = disk_bytenr;
+		old->extent_offset = extent_offset;
+		old->offset = offset - key.offset;
+		old->len = end - offset;
+		old->new = new;
+		old->count = 0;
+		list_add_tail(&old->list, &new->head);
+next:
+		path->slots[0]++;
+		cond_resched();
+	}
+
+	btrfs_free_path(path);
+	atomic_inc(&root->fs_info->defrag_running);
+
+	return new;
+
+out_free_list:
+	list_for_each_entry_safe(old, tmp, &new->head, list) {
+		list_del(&old->list);
+		kfree(old);
+	}
+out_free_path:
+	btrfs_free_path(path);
+out_kfree:
+	kfree(new);
+	return NULL;
+}
+
 /*
  * helper function for btrfs_finish_ordered_io, this
  * just reads in some of the csum leaves to prime them into ram
@@ -1715,50 +2682,81 @@
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
  */
-static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
+static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 {
+	struct inode *inode = ordered_extent->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans = NULL;
-	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_state *cached_state = NULL;
+	struct new_sa_defrag_extent *new = NULL;
 	int compress_type = 0;
-	int ret;
+	int ret = 0;
+	u64 logical_len = ordered_extent->len;
 	bool nolock;
+	bool truncated = false;
 
-	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
-					     end - start + 1);
-	if (!ret)
-		return 0;
-	BUG_ON(!ordered_extent);
+	nolock = btrfs_is_free_space_inode(inode);
 
-	nolock = btrfs_is_free_space_inode(root, inode);
+	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
+		ret = -EIO;
+		goto out;
+	}
+
+	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
+		truncated = true;
+		logical_len = ordered_extent->truncated_len;
+		/* Truncated the entire extent, don't bother adding */
+		if (!logical_len)
+			goto out;
+	}
 
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
-		BUG_ON(!list_empty(&ordered_extent->list));
-		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-		if (!ret) {
-			if (nolock)
-				trans = btrfs_join_transaction_nolock(root);
-			else
-				trans = btrfs_join_transaction(root);
-			BUG_ON(IS_ERR(trans));
-			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-			ret = btrfs_update_inode_fallback(trans, root, inode);
-			BUG_ON(ret);
+		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
+		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+		if (nolock)
+			trans = btrfs_join_transaction_nolock(root);
+		else
+			trans = btrfs_join_transaction(root);
+		if (IS_ERR(trans)) {
+			ret = PTR_ERR(trans);
+			trans = NULL;
+			goto out;
 		}
+		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+		ret = btrfs_update_inode_fallback(trans, root, inode);
+		if (ret) /* -ENOMEM or corruption */
+			btrfs_abort_transaction(trans, root, ret);
 		goto out;
 	}
 
 	lock_extent_bits(io_tree, ordered_extent->file_offset,
 			 ordered_extent->file_offset + ordered_extent->len - 1,
-			 0, &cached_state, GFP_NOFS);
+			 0, &cached_state);
+
+	ret = test_range_bit(io_tree, ordered_extent->file_offset,
+			ordered_extent->file_offset + ordered_extent->len - 1,
+			EXTENT_DEFRAG, 1, cached_state);
+	if (ret) {
+		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+		if (last_snapshot >= BTRFS_I(inode)->generation)
+			/* the inode is shared */
+			new = record_old_file_extents(inode, ordered_extent);
+
+		clear_extent_bit(io_tree, ordered_extent->file_offset,
+			ordered_extent->file_offset + ordered_extent->len - 1,
+			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
+	}
 
 	if (nolock)
 		trans = btrfs_join_transaction_nolock(root);
 	else
 		trans = btrfs_join_transaction(root);
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		trans = NULL;
+		goto out_unlock;
+	}
 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
@@ -1768,61 +2766,122 @@
 		ret = btrfs_mark_extent_written(trans, inode,
 						ordered_extent->file_offset,
 						ordered_extent->file_offset +
-						ordered_extent->len);
-		BUG_ON(ret);
+						logical_len);
 	} else {
 		BUG_ON(root == root->fs_info->tree_root);
 		ret = insert_reserved_file_extent(trans, inode,
 						ordered_extent->file_offset,
 						ordered_extent->start,
 						ordered_extent->disk_len,
-						ordered_extent->len,
-						ordered_extent->len,
+						logical_len, logical_len,
 						compress_type, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
-		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
-				   ordered_extent->file_offset,
-				   ordered_extent->len);
-		BUG_ON(ret);
 	}
-	unlock_extent_cached(io_tree, ordered_extent->file_offset,
-			     ordered_extent->file_offset +
-			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
+	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+			   ordered_extent->file_offset, ordered_extent->len,
+			   trans->transid);
+	if (ret < 0) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_unlock;
+	}
 
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);
 
-	ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
-	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
-		ret = btrfs_update_inode_fallback(trans, root, inode);
-		BUG_ON(ret);
+	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
+	ret = btrfs_update_inode_fallback(trans, root, inode);
+	if (ret) { /* -ENOMEM or corruption */
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_unlock;
 	}
 	ret = 0;
+out_unlock:
+	unlock_extent_cached(io_tree, ordered_extent->file_offset,
+			     ordered_extent->file_offset +
+			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
 out:
 	if (root != root->fs_info->tree_root)
 		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
-	if (trans) {
-		if (nolock)
-			btrfs_end_transaction_nolock(trans, root);
+	if (trans)
+		btrfs_end_transaction(trans, root);
+
+	if (ret || truncated) {
+		u64 start, end;
+
+		if (truncated)
+			start = ordered_extent->file_offset + logical_len;
 		else
-			btrfs_end_transaction(trans, root);
+			start = ordered_extent->file_offset;
+		end = ordered_extent->file_offset + ordered_extent->len - 1;
+		clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
+
+		/* Drop the cache for the part of the extent we didn't write. */
+		btrfs_drop_extent_cache(inode, start, end, 0);
+
+		/*
+		 * If the ordered extent had an IOERR or something else went
+		 * wrong, we need to return the space for this ordered extent
+		 * back to the allocator.  We only free the extent in the
+		 * truncated case if we didn't write out the extent at all.
+		 */
+		if ((ret || !logical_len) &&
+		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
+			btrfs_free_reserved_extent(root, ordered_extent->start,
+						   ordered_extent->disk_len);
 	}
 
+
+	/*
+	 * This needs to be done to make sure anybody waiting knows we are done
+	 * updating everything for this ordered extent.
+	 */
+	btrfs_remove_ordered_extent(inode, ordered_extent);
+
+	/* for snapshot-aware defrag */
+	if (new)
+		relink_file_extents(new);
+
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
 	/* once for the tree */
 	btrfs_put_ordered_extent(ordered_extent);
 
-	return 0;
+	return ret;
+}
+
+static void finish_ordered_fn(struct btrfs_work *work)
+{
+	struct btrfs_ordered_extent *ordered_extent;
+	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
+	btrfs_finish_ordered_io(ordered_extent);
 }
 
 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 				struct extent_state *state, int uptodate)
 {
+	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_ordered_extent *ordered_extent = NULL;
+	struct btrfs_workers *workers;
+
 	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
 
 	ClearPagePrivate2(page);
-	return btrfs_finish_ordered_io(page->mapping->host, start, end);
+	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+					    end - start + 1, uptodate))
+		return 0;
+
+	ordered_extent->work.func = finish_ordered_fn;
+	ordered_extent->work.flags = 0;
+
+	if (btrfs_is_free_space_inode(inode))
+		workers = &root->fs_info->endio_freespace_worker;
+	else
+		workers = &root->fs_info->endio_write_workers;
+	btrfs_queue_worker(workers, &ordered_extent->work);
+
+	return 0;
 }
 
 /*
@@ -1830,17 +2889,19 @@
  * if there's a match, we allow the bio to finish.  If not, the code in
  * extent_io.c will try to find good copies for us.
  */
-static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
-			       struct extent_state *state)
+static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
+				      u64 phy_offset, struct page *page,
+				      u64 start, u64 end, int mirror)
 {
-	size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT);
+	size_t offset = start - page_offset(page);
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	char *kaddr;
-	u64 private = ~(u32)0;
-	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u32 csum_expected;
 	u32 csum = ~(u32)0;
+	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+	                              DEFAULT_RATELIMIT_BURST);
 
 	if (PageChecked(page)) {
 		ClearPageChecked(page);
@@ -1857,35 +2918,27 @@
 		return 0;
 	}
 
-	if (state && state->start == start) {
-		private = state->private;
-		ret = 0;
-	} else {
-		ret = get_state_private(io_tree, start, &private);
-	}
-	kaddr = kmap_atomic(page, KM_USER0);
-	if (ret)
-		goto zeroit;
+	phy_offset >>= inode->i_sb->s_blocksize_bits;
+	csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
 
-	csum = btrfs_csum_data(root, kaddr + offset, csum,  end - start + 1);
+	kaddr = kmap_atomic(page);
+	csum = btrfs_csum_data(kaddr + offset, csum,  end - start + 1);
 	btrfs_csum_final(csum, (char *)&csum);
-	if (csum != private)
+	if (csum != csum_expected)
 		goto zeroit;
 
-	kunmap_atomic(kaddr, KM_USER0);
+	kunmap_atomic(kaddr);
 good:
 	return 0;
 
 zeroit:
-	printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u "
-		       "private %llu\n",
-		       (unsigned long long)btrfs_ino(page->mapping->host),
-		       (unsigned long long)start, csum,
-		       (unsigned long long)private);
+	if (__ratelimit(&_rs))
+		btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
+			btrfs_ino(page->mapping->host), start, csum, csum_expected);
 	memset(kaddr + offset, 1, end - start + 1);
 	flush_dcache_page(page);
-	kunmap_atomic(kaddr, KM_USER0);
-	if (private == 0)
+	kunmap_atomic(kaddr);
+	if (csum_expected == 0)
 		return 0;
 	return -EIO;
 }
@@ -1895,6 +2948,8 @@
 	struct inode *inode;
 };
 
+/* JDM: If this is fs-wide, why can't we add a pointer to
+ * btrfs_inode instead and avoid the allocation? */
 void btrfs_add_delayed_iput(struct inode *inode)
 {
 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@@ -1924,7 +2979,6 @@
 	if (empty)
 		return;
 
-	down_read(&root->fs_info->cleanup_work_sem);
 	spin_lock(&fs_info->delayed_iput_lock);
 	list_splice_init(&fs_info->delayed_iputs, &list);
 	spin_unlock(&fs_info->delayed_iput_lock);
@@ -1935,14 +2989,8 @@
 		iput(delayed->inode);
 		kfree(delayed);
 	}
-	up_read(&root->fs_info->cleanup_work_sem);
 }
 
-enum btrfs_orphan_cleanup_state {
-	ORPHAN_CLEANUP_STARTED	= 1,
-	ORPHAN_CLEANUP_DONE	= 2,
-};
-
 /*
  * This is called at transaction commit time. If there are no orphan
  * files in the subvolume, it removes the orphan item and frees the block_rsv
@@ -1951,24 +2999,41 @@
 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root)
 {
+	struct btrfs_block_rsv *block_rsv;
 	int ret;
 
-	if (!list_empty(&root->orphan_list) ||
+	if (atomic_read(&root->orphan_inodes) ||
 	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
 		return;
 
+	spin_lock(&root->orphan_lock);
+	if (atomic_read(&root->orphan_inodes)) {
+		spin_unlock(&root->orphan_lock);
+		return;
+	}
+
+	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
+		spin_unlock(&root->orphan_lock);
+		return;
+	}
+
+	block_rsv = root->orphan_block_rsv;
+	root->orphan_block_rsv = NULL;
+	spin_unlock(&root->orphan_lock);
+
 	if (root->orphan_item_inserted &&
 	    btrfs_root_refs(&root->root_item) > 0) {
 		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
 					    root->root_key.objectid);
-		BUG_ON(ret);
-		root->orphan_item_inserted = 0;
+		if (ret)
+			btrfs_abort_transaction(trans, root, ret);
+		else
+			root->orphan_item_inserted = 0;
 	}
 
-	if (root->orphan_block_rsv) {
-		WARN_ON(root->orphan_block_rsv->size > 0);
-		btrfs_free_block_rsv(root, root->orphan_block_rsv);
-		root->orphan_block_rsv = NULL;
+	if (block_rsv) {
+		WARN_ON(block_rsv->size > 0);
+		btrfs_free_block_rsv(root, block_rsv);
 	}
 }
 
@@ -1988,7 +3053,7 @@
 	int ret;
 
 	if (!root->orphan_block_rsv) {
-		block_rsv = btrfs_alloc_block_rsv(root);
+		block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 		if (!block_rsv)
 			return -ENOMEM;
 	}
@@ -2001,8 +3066,8 @@
 		block_rsv = NULL;
 	}
 
-	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+			      &BTRFS_I(inode)->runtime_flags)) {
 #if 0
 		/*
 		 * For proper ENOSPC handling, we should do orphan
@@ -2015,31 +3080,47 @@
 			insert = 1;
 #endif
 		insert = 1;
+		atomic_inc(&root->orphan_inodes);
 	}
 
-	if (!BTRFS_I(inode)->orphan_meta_reserved) {
-		BTRFS_I(inode)->orphan_meta_reserved = 1;
+	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+			      &BTRFS_I(inode)->runtime_flags))
 		reserve = 1;
-	}
 	spin_unlock(&root->orphan_lock);
 
 	/* grab metadata reservation from transaction handle */
 	if (reserve) {
 		ret = btrfs_orphan_reserve_metadata(trans, inode);
-		BUG_ON(ret);
+		BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
 	}
 
 	/* insert an orphan item to track this unlinked/truncated file */
 	if (insert >= 1) {
 		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
-		BUG_ON(ret && ret != -EEXIST);
+		if (ret) {
+			if (reserve) {
+				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+					  &BTRFS_I(inode)->runtime_flags);
+				btrfs_orphan_release_metadata(inode);
+			}
+			if (ret != -EEXIST) {
+				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+					  &BTRFS_I(inode)->runtime_flags);
+				btrfs_abort_transaction(trans, root, ret);
+				return ret;
+			}
+		}
+		ret = 0;
 	}
 
 	/* insert an orphan item to track subvolume contains orphan files */
 	if (insert >= 2) {
 		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
 					       root->root_key.objectid);
-		BUG_ON(ret);
+		if (ret && ret != -EEXIST) {
+			btrfs_abort_transaction(trans, root, ret);
+			return ret;
+		}
 	}
 	return 0;
 }
@@ -2048,7 +3129,8 @@
  * We have done the truncate/delete so we can go ahead and remove the orphan
  * item for this particular inode.
  */
-int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
+static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
+			    struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int delete_item = 0;
@@ -2056,26 +3138,24 @@
 	int ret = 0;
 
 	spin_lock(&root->orphan_lock);
-	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-		list_del_init(&BTRFS_I(inode)->i_orphan);
+	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+			       &BTRFS_I(inode)->runtime_flags))
 		delete_item = 1;
-	}
 
-	if (BTRFS_I(inode)->orphan_meta_reserved) {
-		BTRFS_I(inode)->orphan_meta_reserved = 0;
+	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+			       &BTRFS_I(inode)->runtime_flags))
 		release_rsv = 1;
-	}
 	spin_unlock(&root->orphan_lock);
 
-	if (trans && delete_item) {
+	if (trans && delete_item)
 		ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
-		BUG_ON(ret);
-	}
 
-	if (release_rsv)
+	if (release_rsv) {
 		btrfs_orphan_release_metadata(inode);
+		atomic_dec(&root->orphan_inodes);
+	}
 
-	return 0;
+	return ret;
 }
 
 /*
@@ -2143,8 +3223,8 @@
 		 */
 
 		if (found_key.offset == last_objectid) {
-			printk(KERN_ERR "btrfs: Error removing orphan entry, "
-			       "stopping orphan cleanup\n");
+			btrfs_err(root->fs_info,
+				"Error removing orphan entry, stopping orphan cleanup");
 			ret = -EINVAL;
 			goto out;
 		}
@@ -2201,10 +3281,13 @@
 				ret = PTR_ERR(trans);
 				goto out;
 			}
+			btrfs_debug(root->fs_info, "auto deleting %Lu",
+				found_key.objectid);
 			ret = btrfs_del_orphan_item(trans, root,
 						    found_key.objectid);
-			BUG_ON(ret);
 			btrfs_end_transaction(trans, root);
+			if (ret)
+				goto out;
 			continue;
 		}
 
@@ -2212,9 +3295,9 @@
 		 * add this inode to the orphan list so btrfs_orphan_del does
 		 * the proper thing when we hit it
 		 */
-		spin_lock(&root->orphan_lock);
-		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-		spin_unlock(&root->orphan_lock);
+		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+			&BTRFS_I(inode)->runtime_flags);
+		atomic_inc(&root->orphan_inodes);
 
 		/* if we have links, this was a truncate, lets do that */
 		if (inode->i_nlink) {
@@ -2224,14 +3307,24 @@
 				continue;
 			}
 			nr_truncate++;
-			/*
-			 * Need to hold the imutex for reservation purposes, not
-			 * a huge deal here but I have a WARN_ON in
-			 * btrfs_delalloc_reserve_space to catch offenders.
-			 */
-			mutex_lock(&inode->i_mutex);
+
+			/* 1 for the orphan item deletion. */
+			trans = btrfs_start_transaction(root, 1);
+			if (IS_ERR(trans)) {
+				iput(inode);
+				ret = PTR_ERR(trans);
+				goto out;
+			}
+			ret = btrfs_orphan_add(trans, inode);
+			btrfs_end_transaction(trans, root);
+			if (ret) {
+				iput(inode);
+				goto out;
+			}
+
 			ret = btrfs_truncate(inode);
-			mutex_unlock(&inode->i_mutex);
+			if (ret)
+				btrfs_orphan_del(NULL, inode);
 		} else {
 			nr_unlink++;
 		}
@@ -2257,13 +3350,14 @@
 	}
 
 	if (nr_unlink)
-		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
+		btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
 	if (nr_truncate)
-		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
+		btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
 
 out:
 	if (ret)
-		printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
+		btrfs_crit(root->fs_info,
+			"could not do orphan cleanup %d", ret);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -2279,8 +3373,17 @@
 {
 	u32 nritems = btrfs_header_nritems(leaf);
 	struct btrfs_key found_key;
+	static u64 xattr_access = 0;
+	static u64 xattr_default = 0;
 	int scanned = 0;
 
+	if (!xattr_access) {
+		xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
+					strlen(POSIX_ACL_XATTR_ACCESS));
+		xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
+					strlen(POSIX_ACL_XATTR_DEFAULT));
+	}
+
 	slot++;
 	while (slot < nritems) {
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
@@ -2290,8 +3393,11 @@
 			return 0;
 
 		/* we found an xattr, assume we've got an acl */
-		if (found_key.type == BTRFS_XATTR_ITEM_KEY)
-			return 1;
+		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
+			if (found_key.offset == xattr_access ||
+			    found_key.offset == xattr_default)
+				return 1;
+		}
 
 		/*
 		 * we found a key greater than an xattr key, there can't
@@ -2377,7 +3483,19 @@
 
 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
-	BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item);
+	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
+
+	/*
+	 * If we were modified in the current generation and evicted from memory
+	 * and then re-read we need to do a full sync since we don't have any
+	 * idea about which extents were modified before we were evicted from
+	 * cache.
+	 */
+	if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
+		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+			&BTRFS_I(inode)->runtime_flags);
+
+	inode->i_version = btrfs_inode_sequence(leaf, inode_item);
 	inode->i_generation = BTRFS_I(inode)->generation;
 	inode->i_rdev = 0;
 	rdev = btrfs_inode_rdev(leaf, inode_item);
@@ -2438,34 +3556,41 @@
 			    struct btrfs_inode_item *item,
 			    struct inode *inode)
 {
-	btrfs_set_inode_uid(leaf, item, inode->i_uid);
-	btrfs_set_inode_gid(leaf, item, inode->i_gid);
-	btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
-	btrfs_set_inode_mode(leaf, item, inode->i_mode);
-	btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
-
-	btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item),
-			       inode->i_atime.tv_sec);
-	btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item),
-				inode->i_atime.tv_nsec);
-
-	btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item),
-			       inode->i_mtime.tv_sec);
-	btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item),
-				inode->i_mtime.tv_nsec);
-
-	btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item),
-			       inode->i_ctime.tv_sec);
-	btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item),
-				inode->i_ctime.tv_nsec);
-
-	btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
-	btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
-	btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence);
-	btrfs_set_inode_transid(leaf, item, trans->transid);
-	btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
-	btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
-	btrfs_set_inode_block_group(leaf, item, 0);
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
+
+	btrfs_set_token_inode_uid(leaf, item, inode->i_uid, &token);
+	btrfs_set_token_inode_gid(leaf, item, inode->i_gid, &token);
+	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
+				   &token);
+	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
+
+	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+				     inode->i_atime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+				      inode->i_atime.tv_nsec, &token);
+
+	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+				     inode->i_mtime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+				      inode->i_mtime.tv_nsec, &token);
+
+	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+				     inode->i_ctime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+				      inode->i_ctime.tv_nsec, &token);
+
+	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+				     &token);
+	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
+					 &token);
+	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
 }
 
 /*
@@ -2521,8 +3646,10 @@
 	 * The data relocation inode should also be directly updated
 	 * without delay
 	 */
-	if (!btrfs_is_free_space_inode(root, inode)
+	if (!btrfs_is_free_space_inode(inode)
 	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
+		btrfs_update_root_times(trans, root);
+
 		ret = btrfs_delayed_update_inode(trans, root, inode);
 		if (!ret)
 			btrfs_set_inode_last_trans(trans, inode);
@@ -2532,8 +3659,9 @@
 	return btrfs_update_inode_item(trans, root, inode);
 }
 
-static noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, struct inode *inode)
+noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 struct inode *inode)
 {
 	int ret;
 
@@ -2543,6 +3671,47 @@
 	return ret;
 }
 
+#ifdef MY_ABC_HERE
+static int btrfs_unlink_dir_item_caseless(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct inode *dir, struct inode *inode,
+					const char *name, int name_len)
+{
+	struct btrfs_path *path;
+	int ret = 0;
+	struct extent_buffer *leaf;
+	struct btrfs_dir_item *di;
+	struct btrfs_key key;
+	u64 dir_ino = btrfs_ino(dir);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		return -ENOMEM;
+	}
+	path->caseless_key = 1;
+	path->leave_spinning = 1;
+
+	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
+				    name, name_len, -1);
+	if (IS_ERR(di)) {
+		ret = PTR_ERR(di);
+		goto err;
+	}
+	if (!di) {
+		goto err;
+	}
+	leaf = path->nodes[0];
+	btrfs_dir_item_key_to_cpu(leaf, di, &key);
+	ret = btrfs_delete_one_dir_name(trans, root, path, di);
+	if (ret)
+		goto err;
+err:
+	btrfs_free_path(path);
+	return ret;
+
+}
+#endif
+
 /*
  * unlink helper that gets used here in inode.c and in the tree logging
  * recovery code.  It removes a link in a directory with a given name, and
@@ -2586,35 +3755,51 @@
 		goto err;
 	btrfs_release_path(path);
 
+#ifdef MY_ABC_HERE
+	ret = btrfs_unlink_dir_item_caseless(trans, root, dir, inode, name, name_len);
+	if (ret)
+		goto err;
+#endif
+
 	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
 				  dir_ino, &index);
 	if (ret) {
-		printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
-		       "inode %llu parent %llu\n", name_len, name,
-		       (unsigned long long)ino, (unsigned long long)dir_ino);
+		btrfs_info(root->fs_info,
+			"failed to delete reference to %.*s, inode %llu parent %llu",
+			name_len, name, ino, dir_ino);
+		btrfs_abort_transaction(trans, root, ret);
 		goto err;
 	}
 
 	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
-	if (ret)
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
 		goto err;
+	}
 
 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
 					 inode, dir_ino);
-	BUG_ON(ret != 0 && ret != -ENOENT);
+	if (ret != 0 && ret != -ENOENT) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto err;
+	}
 
 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
 					   dir, index);
 	if (ret == -ENOENT)
 		ret = 0;
+	else if (ret)
+		btrfs_abort_transaction(trans, root, ret);
 err:
 	btrfs_free_path(path);
 	if (ret)
 		goto out;
 
 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+	inode_inc_iversion(inode);
+	inode_inc_iversion(dir);
 	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-	btrfs_update_inode(trans, root, dir);
+	ret = btrfs_update_inode(trans, root, dir);
 out:
 	return ret;
 }
@@ -2632,220 +3817,49 @@
 	}
 	return ret;
 }
-		
-
-/* helper to check if there is any shared block in the path */
-static int check_path_shared(struct btrfs_root *root,
-			     struct btrfs_path *path)
-{
-	struct extent_buffer *eb;
-	int level;
-	u64 refs = 1;
-
-	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
-		int ret;
-
-		if (!path->nodes[level])
-			break;
-		eb = path->nodes[level];
-		if (!btrfs_block_can_be_shared(root, eb))
-			continue;
-		ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
-					       &refs, NULL);
-		if (refs > 1)
-			return 1;
-	}
-	return 0;
-}
 
 /*
  * helper to start transaction for unlink and rmdir.
  *
- * unlink and rmdir are special in btrfs, they do not always free space.
- * so in enospc case, we should make sure they will free space before
- * allowing them to use the global metadata reservation.
+ * unlink and rmdir are special in btrfs, they do not always free space, so
+ * if we cannot make our reservations the normal way try and see if there is
+ * plenty of slack room in the global reserve to migrate, otherwise we cannot
+ * allow the unlink to occur.
  */
-static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
-						       struct dentry *dentry)
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
-	struct btrfs_path *path;
-	struct btrfs_inode_ref *ref;
-	struct btrfs_dir_item *di;
-	struct inode *inode = dentry->d_inode;
-	u64 index;
-	int check_link = 1;
-	int err = -ENOSPC;
 	int ret;
-	u64 ino = btrfs_ino(inode);
-	u64 dir_ino = btrfs_ino(dir);
 
 	/*
 	 * 1 for the possible orphan item
 	 * 1 for the dir item
 	 * 1 for the dir index
 	 * 1 for the inode ref
-	 * 1 for the inode ref in the tree log
-	 * 2 for the dir entries in the log
 	 * 1 for the inode
 	 */
-	trans = btrfs_start_transaction(root, 8);
+	trans = btrfs_start_transaction(root, 5);
 	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
 		return trans;
 
-	if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
-		return ERR_PTR(-ENOSPC);
-
-	/* check if there is someone else holds reference */
-	if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
-		return ERR_PTR(-ENOSPC);
-
-	if (atomic_read(&inode->i_count) > 2)
-		return ERR_PTR(-ENOSPC);
-
-	if (xchg(&root->fs_info->enospc_unlink, 1))
-		return ERR_PTR(-ENOSPC);
-
-	path = btrfs_alloc_path();
-	if (!path) {
-		root->fs_info->enospc_unlink = 0;
-		return ERR_PTR(-ENOMEM);
-	}
-
-	/* 1 for the orphan item */
-	trans = btrfs_start_transaction(root, 1);
-	if (IS_ERR(trans)) {
-		btrfs_free_path(path);
-		root->fs_info->enospc_unlink = 0;
-		return trans;
-	}
-
-	path->skip_locking = 1;
-	path->search_commit_root = 1;
-
-	ret = btrfs_lookup_inode(trans, root, path,
-				&BTRFS_I(dir)->location, 0);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-	if (ret == 0) {
-		if (check_path_shared(root, path))
-			goto out;
-	} else {
-		check_link = 0;
-	}
-	btrfs_release_path(path);
-
-	ret = btrfs_lookup_inode(trans, root, path,
-				&BTRFS_I(inode)->location, 0);
-	if (ret < 0) {
-		err = ret;
-		goto out;
-	}
-	if (ret == 0) {
-		if (check_path_shared(root, path))
-			goto out;
-	} else {
-		check_link = 0;
-	}
-	btrfs_release_path(path);
+	if (PTR_ERR(trans) == -ENOSPC) {
+		u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
 
-	if (ret == 0 && S_ISREG(inode->i_mode)) {
-		ret = btrfs_lookup_file_extent(trans, root, path,
-					       ino, (u64)-1, 0);
-		if (ret < 0) {
-			err = ret;
-			goto out;
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans))
+			return trans;
+		ret = btrfs_cond_migrate_bytes(root->fs_info,
+					       &root->fs_info->trans_block_rsv,
+					       num_bytes, 5);
+		if (ret) {
+			btrfs_end_transaction(trans, root);
+			return ERR_PTR(ret);
 		}
-		BUG_ON(ret == 0);
-		if (check_path_shared(root, path))
-			goto out;
-		btrfs_release_path(path);
-	}
-
-	if (!check_link) {
-		err = 0;
-		goto out;
-	}
-
-	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
-				dentry->d_name.name, dentry->d_name.len, 0);
-	if (IS_ERR(di)) {
-		err = PTR_ERR(di);
-		goto out;
-	}
-	if (di) {
-		if (check_path_shared(root, path))
-			goto out;
-	} else {
-		err = 0;
-		goto out;
-	}
-	btrfs_release_path(path);
-
-	ref = btrfs_lookup_inode_ref(trans, root, path,
-				dentry->d_name.name, dentry->d_name.len,
-				ino, dir_ino, 0);
-	if (IS_ERR(ref)) {
-		err = PTR_ERR(ref);
-		goto out;
-	}
-	BUG_ON(!ref);
-	if (check_path_shared(root, path))
-		goto out;
-	index = btrfs_inode_ref_index(path->nodes[0], ref);
-	btrfs_release_path(path);
-
-	/*
-	 * This is a commit root search, if we can lookup inode item and other
-	 * relative items in the commit root, it means the transaction of
-	 * dir/file creation has been committed, and the dir index item that we
-	 * delay to insert has also been inserted into the commit root. So
-	 * we needn't worry about the delayed insertion of the dir index item
-	 * here.
-	 */
-	di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
-				dentry->d_name.name, dentry->d_name.len, 0);
-	if (IS_ERR(di)) {
-		err = PTR_ERR(di);
-		goto out;
-	}
-	BUG_ON(ret == -ENOENT);
-	if (check_path_shared(root, path))
-		goto out;
-
-	err = 0;
-out:
-	btrfs_free_path(path);
-	/* Migrate the orphan reservation over */
-	if (!err)
-		err = btrfs_block_rsv_migrate(trans->block_rsv,
-				&root->fs_info->global_block_rsv,
-				trans->bytes_reserved);
-
-	if (err) {
-		btrfs_end_transaction(trans, root);
-		root->fs_info->enospc_unlink = 0;
-		return ERR_PTR(err);
-	}
-
-	trans->block_rsv = &root->fs_info->global_block_rsv;
-	return trans;
-}
-
-static void __unlink_end_trans(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root)
-{
-	if (trans->block_rsv == &root->fs_info->global_block_rsv) {
-		btrfs_block_rsv_release(root, trans->block_rsv,
-					trans->bytes_reserved);
 		trans->block_rsv = &root->fs_info->trans_block_rsv;
-		BUG_ON(!root->fs_info->enospc_unlink);
-		root->fs_info->enospc_unlink = 0;
+		trans->bytes_reserved = num_bytes;
 	}
-	btrfs_end_transaction_throttle(trans, root);
+	return trans;
 }
 
 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -2854,9 +3868,8 @@
 	struct btrfs_trans_handle *trans;
 	struct inode *inode = dentry->d_inode;
 	int ret;
-	unsigned long nr = 0;
 
-	trans = __unlink_start_trans(dir, dentry);
+	trans = __unlink_start_trans(dir);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
@@ -2874,9 +3887,8 @@
 	}
 
 out:
-	nr = trans->blocks_used;
-	__unlink_end_trans(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root);
 	return ret;
 }
 
@@ -2899,23 +3911,42 @@
 
 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
 				   name, name_len, -1);
-	BUG_ON(IS_ERR_OR_NULL(di));
+	if (IS_ERR_OR_NULL(di)) {
+		if (!di)
+			ret = -ENOENT;
+		else
+			ret = PTR_ERR(di);
+		goto out;
+	}
 
 	leaf = path->nodes[0];
 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
 	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
 	btrfs_release_path(path);
 
 	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
 				 objectid, root->root_key.objectid,
 				 dir_ino, &index, name, name_len);
 	if (ret < 0) {
-		BUG_ON(ret != -ENOENT);
+		if (ret != -ENOENT) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
 		di = btrfs_search_dir_index_item(root, path, dir_ino,
 						 name, name_len);
-		BUG_ON(IS_ERR_OR_NULL(di));
+		if (IS_ERR_OR_NULL(di)) {
+			if (!di)
+				ret = -ENOENT;
+			else
+				ret = PTR_ERR(di);
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
 
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
@@ -2925,15 +3956,20 @@
 	btrfs_release_path(path);
 
 	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
 
 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
+	inode_inc_iversion(dir);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
-	ret = btrfs_update_inode(trans, root, dir);
-	BUG_ON(ret);
-
+	ret = btrfs_update_inode_fallback(trans, root, dir);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+out:
 	btrfs_free_path(path);
-	return 0;
+	return ret;
 }
 
 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
@@ -2942,13 +3978,13 @@
 	int err = 0;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_trans_handle *trans;
-	unsigned long nr = 0;
 
-	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
-	    btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
+	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
+		return -EPERM;
 
-	trans = __unlink_start_trans(dir, dentry);
+	trans = __unlink_start_trans(dir);
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
@@ -2970,9 +4006,8 @@
 	if (!err)
 		btrfs_i_size_write(inode, 0);
 out:
-	nr = trans->blocks_used;
-	__unlink_end_trans(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root);
 
 	return err;
 }
@@ -3002,14 +4037,13 @@
 	u64 extent_num_bytes = 0;
 	u64 extent_offset = 0;
 	u64 item_end = 0;
-	u64 mask = root->sectorsize - 1;
+	u64 last_size = (u64)-1;
 	u32 found_type = (u8)-1;
 	int found_extent;
 	int del_item;
 	int pending_del_nr = 0;
 	int pending_del_slot = 0;
 	int extent_type = -1;
-	int encoding;
 	int ret;
 	int err = 0;
 	u64 ino = btrfs_ino(inode);
@@ -3021,8 +4055,14 @@
 		return -ENOMEM;
 	path->reada = -1;
 
+	/*
+	 * We want to drop from the next block forward in case this new size is
+	 * not block aligned since we will be keeping the last block of the
+	 * extent just the way it is.
+	 */
 	if (root->ref_cows || root == root->fs_info->tree_root)
-		btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
+		btrfs_drop_extent_cache(inode, ALIGN(new_size,
+					root->sectorsize), (u64)-1, 0);
 
 	/*
 	 * This function is also used to drop the items in the log tree before
@@ -3059,7 +4099,6 @@
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		found_type = btrfs_key_type(&found_key);
-		encoding = 0;
 
 		if (found_key.objectid != ino)
 			break;
@@ -3072,10 +4111,6 @@
 			fi = btrfs_item_ptr(leaf, path->slots[0],
 					    struct btrfs_file_extent_item);
 			extent_type = btrfs_file_extent_type(leaf, fi);
-			encoding = btrfs_file_extent_compression(leaf, fi);
-			encoding |= btrfs_file_extent_encryption(leaf, fi);
-			encoding |= btrfs_file_extent_other_encoding(leaf, fi);
-
 			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 				item_end +=
 				    btrfs_file_extent_num_bytes(leaf, fi);
@@ -3100,16 +4135,20 @@
 		if (found_type != BTRFS_EXTENT_DATA_KEY)
 			goto delete;
 
+		if (del_item)
+			last_size = found_key.offset;
+		else
+			last_size = new_size;
+
 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
-			if (!del_item && !encoding) {
+			if (!del_item) {
 				u64 orig_num_bytes =
 					btrfs_file_extent_num_bytes(leaf, fi);
-				extent_num_bytes = new_size -
-					found_key.offset + root->sectorsize - 1;
-				extent_num_bytes = extent_num_bytes &
-					~((u64)root->sectorsize - 1);
+				extent_num_bytes = ALIGN(new_size -
+						found_key.offset,
+						root->sectorsize);
 				btrfs_set_file_extent_num_bytes(leaf, fi,
 							 extent_num_bytes);
 				num_dec = (orig_num_bytes -
@@ -3149,8 +4188,7 @@
 				}
 				size =
 				    btrfs_file_extent_calc_inline_size(size);
-				ret = btrfs_truncate_item(trans, root, path,
-							  size, 1);
+				btrfs_truncate_item(root, path, size, 1);
 			} else if (root->ref_cows) {
 				inode_sub_bytes(inode, item_end + 1 -
 						found_key.offset);
@@ -3179,7 +4217,7 @@
 			ret = btrfs_free_extent(trans, root, extent_start,
 						extent_num_bytes, 0,
 						btrfs_header_owner(leaf),
-						ino, extent_offset);
+						ino, extent_offset, 0);
 			BUG_ON(ret);
 		}
 
@@ -3188,17 +4226,15 @@
 
 		if (path->slots[0] == 0 ||
 		    path->slots[0] != pending_del_slot) {
-			if (root->ref_cows &&
-			    BTRFS_I(inode)->location.objectid !=
-						BTRFS_FREE_INO_OBJECTID) {
-				err = -EAGAIN;
-				goto out;
-			}
 			if (pending_del_nr) {
 				ret = btrfs_del_items(trans, root, path,
 						pending_del_slot,
 						pending_del_nr);
-				BUG_ON(ret);
+				if (ret) {
+					btrfs_abort_transaction(trans,
+								root, ret);
+					goto error;
+				}
 				pending_del_nr = 0;
 			}
 			btrfs_release_path(path);
@@ -3211,19 +4247,31 @@
 	if (pending_del_nr) {
 		ret = btrfs_del_items(trans, root, path, pending_del_slot,
 				      pending_del_nr);
-		BUG_ON(ret);
+		if (ret)
+			btrfs_abort_transaction(trans, root, ret);
 	}
+error:
+	if (last_size != (u64)-1)
+		btrfs_ordered_update_i_size(inode, last_size, NULL);
 	btrfs_free_path(path);
 	return err;
 }
 
 /*
- * taken from block_truncate_page, but does cow as it zeros out
- * any bytes left in the last page in the file.
+ * btrfs_truncate_page - read, zero a chunk and write a page
+ * @inode - inode that we're zeroing
+ * @from - the offset to start zeroing
+ * @len - the length to zero, 0 to zero the entire range relative to the
+ *	offset
+ * @front - zero up to the offset instead of from the offset on
+ *
+ * This will find the page for the "from" offset and cow the page and zero the
+ * part we want to zero.  This is used with truncate and hole punching.
  */
-static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
+int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
+			int front)
 {
-	struct inode *inode = mapping->host;
+	struct address_space *mapping = inode->i_mapping;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
@@ -3238,17 +4286,18 @@
 	u64 page_start;
 	u64 page_end;
 
-	if ((offset & (blocksize - 1)) == 0)
+	if ((offset & (blocksize - 1)) == 0 &&
+	    (!len || ((len & (blocksize - 1)) == 0)))
 		goto out;
 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
 	if (ret)
 		goto out;
 
-	ret = -ENOMEM;
 again:
 	page = find_or_create_page(mapping, index, mask);
 	if (!page) {
 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+		ret = -ENOMEM;
 		goto out;
 	}
 
@@ -3270,8 +4319,7 @@
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
-			 GFP_NOFS);
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
 	set_page_extent_mapped(page);
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
@@ -3286,7 +4334,8 @@
 	}
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+			  EXTENT_DIRTY | EXTENT_DELALLOC |
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 			  0, 0, &cached_state, GFP_NOFS);
 
 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -3297,10 +4346,14 @@
 		goto out_unlock;
 	}
 
-	ret = 0;
 	if (offset != PAGE_CACHE_SIZE) {
+		if (!len)
+			len = PAGE_CACHE_SIZE - offset;
 		kaddr = kmap(page);
-		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+		if (front)
+			memset(kaddr, 0, offset);
+		else
+			memset(kaddr + offset, 0, len);
 		flush_dcache_page(page);
 		kunmap(page);
 	}
@@ -3331,14 +4384,23 @@
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em = NULL;
 	struct extent_state *cached_state = NULL;
-	u64 mask = root->sectorsize - 1;
-	u64 hole_start = (oldsize + mask) & ~mask;
-	u64 block_end = (size + mask) & ~mask;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	u64 hole_start = ALIGN(oldsize, root->sectorsize);
+	u64 block_end = ALIGN(size, root->sectorsize);
 	u64 last_byte;
 	u64 cur_offset;
 	u64 hole_size;
 	int err = 0;
 
+	/*
+	 * If our size started in the middle of a page we need to zero out the
+	 * rest of the page before we expand the i_size, otherwise we could
+	 * expose stale data.
+	 */
+	err = btrfs_truncate_page(inode, oldsize, 0, 0);
+	if (err)
+		return err;
+
 	if (size <= hole_start)
 		return 0;
 
@@ -3347,7 +4409,7 @@
 		btrfs_wait_ordered_range(inode, hole_start,
 					 block_end - hole_start);
 		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
-				 &cached_state, GFP_NOFS);
+				 &cached_state);
 		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
 		if (!ordered)
 			break;
@@ -3360,11 +4422,15 @@
 	while (1) {
 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
 				block_end - cur_offset, 0);
-		BUG_ON(IS_ERR_OR_NULL(em));
+		if (IS_ERR(em)) {
+			err = PTR_ERR(em);
+			em = NULL;
+			break;
+		}
 		last_byte = min(extent_map_end(em), block_end);
-		last_byte = (last_byte + mask) & ~mask;
+		last_byte = ALIGN(last_byte, root->sectorsize);
 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
-			u64 hint_byte = 0;
+			struct extent_map *hole_em;
 			hole_size = last_byte - cur_offset;
 
 			trans = btrfs_start_transaction(root, 3);
@@ -3373,11 +4439,11 @@
 				break;
 			}
 
-			err = btrfs_drop_extents(trans, inode, cur_offset,
-						 cur_offset + hole_size,
-						 &hint_byte, 1);
+			err = btrfs_drop_extents(trans, root, inode,
+						 cur_offset,
+						 cur_offset + hole_size, 1);
 			if (err) {
-				btrfs_update_inode(trans, root, inode);
+				btrfs_abort_transaction(trans, root, err);
 				btrfs_end_transaction(trans, root);
 				break;
 			}
@@ -3387,14 +4453,43 @@
 					0, hole_size, 0, hole_size,
 					0, 0, 0);
 			if (err) {
-				btrfs_update_inode(trans, root, inode);
+				btrfs_abort_transaction(trans, root, err);
 				btrfs_end_transaction(trans, root);
 				break;
 			}
 
-			btrfs_drop_extent_cache(inode, hole_start,
-					last_byte - 1, 0);
+			btrfs_drop_extent_cache(inode, cur_offset,
+						cur_offset + hole_size - 1, 0);
+			hole_em = alloc_extent_map();
+			if (!hole_em) {
+				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+					&BTRFS_I(inode)->runtime_flags);
+				goto next;
+			}
+			hole_em->start = cur_offset;
+			hole_em->len = hole_size;
+			hole_em->orig_start = cur_offset;
+
+			hole_em->block_start = EXTENT_MAP_HOLE;
+			hole_em->block_len = 0;
+			hole_em->orig_block_len = 0;
+			hole_em->ram_bytes = hole_size;
+			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
+			hole_em->compress_type = BTRFS_COMPRESS_NONE;
+			hole_em->generation = trans->transid;
 
+			while (1) {
+				write_lock(&em_tree->lock);
+				err = add_extent_mapping(em_tree, hole_em, 1);
+				write_unlock(&em_tree->lock);
+				if (err != -EEXIST)
+					break;
+				btrfs_drop_extent_cache(inode, cur_offset,
+							cur_offset +
+							hole_size - 1, 0);
+			}
+			free_extent_map(hole_em);
+next:
 			btrfs_update_inode(trans, root, inode);
 			btrfs_end_transaction(trans, root);
 		}
@@ -3411,15 +4506,23 @@
 	return err;
 }
 
-static int btrfs_setsize(struct inode *inode, loff_t newsize)
+static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	loff_t oldsize = i_size_read(inode);
+	loff_t newsize = attr->ia_size;
+	int mask = attr->ia_valid;
 	int ret;
 
-	if (newsize == oldsize)
-		return 0;
+	/*
+	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+	 * special case where we need to update the times despite not having
+	 * these flags set.  For all other operations the VFS set these flags
+	 * explicitly if it wants a timestamp update.
+	 */
+	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
+		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 
 	if (newsize > oldsize) {
 		truncate_pagecache(inode, oldsize, newsize);
@@ -3434,7 +4537,7 @@
 		i_size_write(inode, newsize);
 		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
 		ret = btrfs_update_inode(trans, root, inode);
-		btrfs_end_transaction_throttle(trans, root);
+		btrfs_end_transaction(trans, root);
 	} else {
 
 		/*
@@ -3443,11 +4546,61 @@
 		 * any new writes get down to disk quickly.
 		 */
 		if (newsize == 0)
-			BTRFS_I(inode)->ordered_data_close = 1;
+			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+				&BTRFS_I(inode)->runtime_flags);
+
+		/*
+		 * 1 for the orphan item we're going to add
+		 * 1 for the orphan item deletion.
+		 */
+		trans = btrfs_start_transaction(root, 2);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+
+		/*
+		 * We need to do this in case we fail at _any_ point during the
+		 * actual truncate.  Once we do the truncate_setsize we could
+		 * invalidate pages which forces any outstanding ordered io to
+		 * be instantly completed which will give us extents that need
+		 * to be truncated.  If we fail to get an orphan inode down we
+		 * could have left over extents that were never meant to live,
+		 * so we need to guarantee from this point on that everything
+		 * will be consistent.
+		 */
+		ret = btrfs_orphan_add(trans, inode);
+		btrfs_end_transaction(trans, root);
+		if (ret)
+			return ret;
 
 		/* we don't support swapfiles, so vmtruncate shouldn't fail */
 		truncate_setsize(inode, newsize);
+
+		/* Disable nonlocked read DIO to avoid the endless truncate */
+		btrfs_inode_block_unlocked_dio(inode);
+		inode_dio_wait(inode);
+		btrfs_inode_resume_unlocked_dio(inode);
+
 		ret = btrfs_truncate(inode);
+		if (ret && inode->i_nlink) {
+			int err;
+
+			/*
+			 * failed to truncate, disk_i_size is only adjusted down
+			 * as we remove extents, so it should represent the true
+			 * size of the inode, so reset the in memory size and
+			 * delete our orphan entry.
+			 */
+			trans = btrfs_join_transaction(root);
+			if (IS_ERR(trans)) {
+				btrfs_orphan_del(NULL, inode);
+				return ret;
+			}
+			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
+			err = btrfs_orphan_del(trans, inode);
+			if (err)
+				btrfs_abort_transaction(trans, root, err);
+			btrfs_end_transaction(trans, root);
+		}
 	}
 
 	return ret;
@@ -3462,18 +4615,26 @@
 	if (btrfs_root_readonly(root))
 		return -EROFS;
 
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+	if (IS_SYNOACL(dentry))
+		goto no_check_perm;
+#endif
 	err = inode_change_ok(inode, attr);
 	if (err)
 		return err;
 
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+no_check_perm:
+#endif
 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
-		err = btrfs_setsize(inode, attr->ia_size);
+		err = btrfs_setsize(inode, attr);
 		if (err)
 			return err;
 	}
 
 	if (attr->ia_valid) {
 		setattr_copy(inode, attr);
+		inode_inc_iversion(inode);
 		err = btrfs_dirty_inode(inode);
 
 		if (!err && attr->ia_valid & ATTR_MODE)
@@ -3489,14 +4650,13 @@
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv, *global_rsv;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
-	unsigned long nr;
 	int ret;
 
 	trace_btrfs_inode_evict(inode);
 
 	truncate_inode_pages(&inode->i_data, 0);
 	if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
-			       btrfs_is_free_space_inode(root, inode)))
+			       btrfs_is_free_space_inode(inode)))
 		goto no_delete;
 
 	if (is_bad_inode(inode)) {
@@ -3507,7 +4667,8 @@
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 
 	if (root->fs_info->log_root_recovering) {
-		BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan));
+		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+				 &BTRFS_I(inode)->runtime_flags));
 		goto no_delete;
 	}
 
@@ -3516,29 +4677,32 @@
 		goto no_delete;
 	}
 
-	rsv = btrfs_alloc_block_rsv(root);
+	ret = btrfs_commit_inode_delayed_inode(inode);
+	if (ret) {
+		btrfs_orphan_del(NULL, inode);
+		goto no_delete;
+	}
+
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv) {
 		btrfs_orphan_del(NULL, inode);
 		goto no_delete;
 	}
 	rsv->size = min_size;
+	rsv->failfast = 1;
 	global_rsv = &root->fs_info->global_block_rsv;
 
 	btrfs_i_size_write(inode, 0);
 
 	/*
-	 * This is a bit simpler than btrfs_truncate since
-	 *
-	 * 1) We've already reserved our space for our orphan item in the
-	 *    unlink.
-	 * 2) We're going to delete the inode item, so we don't need to update
-	 *    it at all.
-	 *
-	 * So we just need to reserve some slack space in case we add bytes when
-	 * doing the truncate.
+	 * This is a bit simpler than btrfs_truncate since we've already
+	 * reserved our space for our orphan item in the unlink, so we just
+	 * need to reserve some slack space in case we add bytes and update
+	 * inode item when doing the truncate.
 	 */
 	while (1) {
-		ret = btrfs_block_rsv_refill_noflush(root, rsv, min_size);
+		ret = btrfs_block_rsv_refill(root, rsv, min_size,
+					     BTRFS_RESERVE_FLUSH_LIMIT);
 
 		/*
 		 * Try and steal from the global reserve since we will
@@ -3549,14 +4713,15 @@
 			ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
 
 		if (ret) {
-			printk(KERN_WARNING "Could not get space for a "
-			       "delete, will truncate on mount %d\n", ret);
+			btrfs_warn(root->fs_info,
+				"Could not get space for a delete, will truncate on mount %d",
+				ret);
 			btrfs_orphan_del(NULL, inode);
 			btrfs_free_block_rsv(root, rsv);
 			goto no_delete;
 		}
 
-		trans = btrfs_start_transaction(root, 0);
+		trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans)) {
 			btrfs_orphan_del(NULL, inode);
 			btrfs_free_block_rsv(root, rsv);
@@ -3566,21 +4731,26 @@
 		trans->block_rsv = rsv;
 
 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
-		if (ret != -EAGAIN)
+		if (ret != -ENOSPC)
 			break;
 
-		nr = trans->blocks_used;
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
 		btrfs_end_transaction(trans, root);
 		trans = NULL;
-		btrfs_btree_balance_dirty(root, nr);
+		btrfs_btree_balance_dirty(root);
 	}
 
 	btrfs_free_block_rsv(root, rsv);
 
+	/*
+	 * Errors here aren't a big deal, it just means we leave orphan items
+	 * in the tree.  They will be cleaned up on the next mount.
+	 */
 	if (ret == 0) {
 		trans->block_rsv = root->orphan_block_rsv;
-		ret = btrfs_orphan_del(trans, inode);
-		BUG_ON(ret);
+		btrfs_orphan_del(trans, inode);
+	} else {
+		btrfs_orphan_del(NULL, inode);
 	}
 
 	trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3588,10 +4758,10 @@
 	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
 		btrfs_return_ino(root, btrfs_ino(inode));
 
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 no_delete:
+	btrfs_remove_delayed_node(inode);
 	end_writeback(inode);
 	return;
 }
@@ -3600,8 +4770,13 @@
  * this returns the key found in the dir entry in the location pointer.
  * If no dir entries were found, location->objectid is 0.
  */
+#ifdef MY_ABC_HERE
+static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
+			       struct btrfs_key *location, int caseless)
+#else
 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
 			       struct btrfs_key *location)
+#endif
 {
 	const char *name = dentry->d_name.name;
 	int namelen = dentry->d_name.len;
@@ -3614,6 +4789,12 @@
 	if (!path)
 		return -ENOMEM;
 
+#ifdef MY_ABC_HERE
+	if (caseless) {
+		path->caseless_key = 1;
+		path->caseless_name = 1;
+	}
+#endif
 	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
 				    namelen, 0);
 	if (IS_ERR(di))
@@ -3622,6 +4803,32 @@
 	if (IS_ERR_OR_NULL(di))
 		goto out_err;
 
+#ifdef MY_ABC_HERE
+	if (caseless) {
+		struct extent_buffer *leaf = path->nodes[0];
+		int real_name_len = btrfs_dir_name_len(leaf, di);
+		if (real_name_len > (DNAME_INLINE_LEN - 1) && real_name_len > dentry->d_name.len) {
+			char *old_name = NULL;
+			char *new_name = kmalloc(real_name_len + 1, GFP_KERNEL);
+			if (!new_name) {
+				goto out_err;
+			}
+			if (dentry->d_name.len > (DNAME_INLINE_LEN - 1)) {
+				old_name = (char *)dentry->d_name.name;
+			}
+			read_extent_buffer(leaf, (void *) new_name, (unsigned long)(di + 1), real_name_len);
+			new_name[real_name_len] = 0;
+			dentry->d_name.name = new_name;
+			dentry->d_name.len = real_name_len;
+			if (old_name)
+				kfree(old_name);
+		} else {
+			read_extent_buffer(leaf, (void *) dentry->d_name.name, (unsigned long)(di + 1), real_name_len);
+			((char *)dentry->d_name.name)[real_name_len] = 0;
+			dentry->d_name.len = real_name_len;
+		}
+	}
+#endif
 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
 out:
 	btrfs_free_path(path);
@@ -3685,11 +4892,6 @@
 		goto out;
 	}
 
-	if (btrfs_root_refs(&new_root->root_item) == 0) {
-		err = -ENOENT;
-		goto out;
-	}
-
 	*sub_root = new_root;
 	location->objectid = btrfs_root_dirid(&new_root->root_item);
 	location->type = BTRFS_INODE_ITEM_KEY;
@@ -3706,15 +4908,14 @@
 	struct btrfs_inode *entry;
 	struct rb_node **p;
 	struct rb_node *parent;
+	struct rb_node *new = &BTRFS_I(inode)->rb_node;
 	u64 ino = btrfs_ino(inode);
-again:
-	p = &root->inode_tree.rb_node;
-	parent = NULL;
 
 	if (inode_unhashed(inode))
 		return;
-
+	parent = NULL;
 	spin_lock(&root->inode_lock);
+	p = &root->inode_tree.rb_node;
 	while (*p) {
 		parent = *p;
 		entry = rb_entry(parent, struct btrfs_inode, rb_node);
@@ -3726,14 +4927,14 @@
 		else {
 			WARN_ON(!(entry->vfs_inode.i_state &
 				  (I_WILL_FREE | I_FREEING)));
-			rb_erase(parent, &root->inode_tree);
+			rb_replace_node(parent, new, &root->inode_tree);
 			RB_CLEAR_NODE(parent);
 			spin_unlock(&root->inode_lock);
-			goto again;
+			return;
 		}
 	}
-	rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
-	rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
+	rb_link_node(new, parent, p);
+	rb_insert_color(new, &root->inode_tree);
 	spin_unlock(&root->inode_lock);
 }
 
@@ -3767,7 +4968,7 @@
 	}
 }
 
-int btrfs_invalidate_inodes(struct btrfs_root *root)
+void btrfs_invalidate_inodes(struct btrfs_root *root)
 {
 	struct rb_node *node;
 	struct rb_node *prev;
@@ -3827,7 +5028,6 @@
 		node = rb_next(node);
 	}
 	spin_unlock(&root->inode_lock);
-	return 0;
 }
 
 static int btrfs_init_locked_inode(struct inode *inode, void *p)
@@ -3835,7 +5035,6 @@
 	struct btrfs_iget_args *args = p;
 	inode->i_ino = args->ino;
 	BTRFS_I(inode)->root = args->root;
-	btrfs_set_inode_space_info(args->root, inode);
 	return 0;
 }
 
@@ -3878,10 +5077,23 @@
 		memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
 		btrfs_read_locked_inode(inode);
 		if (!is_bad_inode(inode)) {
+#ifdef MY_ABC_HERE
+			int retval;
+			__le32 archive_bit;
+
+#endif
 			inode_tree_add(inode);
 			unlock_new_inode(inode);
 			if (new)
 				*new = 1;
+#ifdef MY_ABC_HERE
+			retval = __btrfs_getxattr(inode, XATTR_SYNO_PREFIX XATTR_SYNO_ARCHIVE_BIT, &archive_bit, sizeof(archive_bit));
+			if (0 < retval) {
+				inode->i_mode2 = le32_to_cpu(archive_bit);
+			} else {
+				inode->i_mode2 = 0;
+			}
+#endif
 		} else {
 			unlock_new_inode(inode);
 			iput(inode);
@@ -3903,10 +5115,10 @@
 
 	BTRFS_I(inode)->root = root;
 	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
-	BTRFS_I(inode)->dummy_inode = 1;
+	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
 
 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
-	inode->i_op = &simple_dir_inode_operations;
+	inode->i_op = &btrfs_dir_ro_inode_operations;
 	inode->i_fop = &simple_dir_operations;
 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
@@ -3914,7 +5126,11 @@
 	return inode;
 }
 
+#ifdef MY_ABC_HERE
+struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry, int caseless)
+#else
 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
+#endif
 {
 	struct inode *inode;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
@@ -3933,7 +5149,11 @@
 		/* This thing is hashed, drop it for now */
 		d_drop(dentry);
 	} else {
+#ifdef MY_ABC_HERE
+		ret = btrfs_inode_by_name(dir, dentry, &location, caseless);
+#else
 		ret = btrfs_inode_by_name(dir, dentry, &location);
+#endif
 	}
 
 	if (ret < 0)
@@ -3967,8 +5187,10 @@
 		if (!(inode->i_sb->s_flags & MS_RDONLY))
 			ret = btrfs_orphan_cleanup(sub_root);
 		up_read(&root->fs_info->cleanup_work_sem);
-		if (ret)
+		if (ret) {
+			iput(inode);
 			inode = ERR_PTR(ret);
+		}
 	}
 
 	return inode;
@@ -3977,14 +5199,18 @@
 static int btrfs_dentry_delete(const struct dentry *dentry)
 {
 	struct btrfs_root *root;
+	struct inode *inode = dentry->d_inode;
 
-	if (!dentry->d_inode && !IS_ROOT(dentry))
-		dentry = dentry->d_parent;
+	if (!inode && !IS_ROOT(dentry))
+		inode = dentry->d_parent->d_inode;
 
-	if (dentry->d_inode) {
-		root = BTRFS_I(dentry->d_inode)->root;
+	if (inode) {
+		root = BTRFS_I(inode)->root;
 		if (btrfs_root_refs(&root->root_item) == 0)
 			return 1;
+
+		if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
+			return 1;
 	}
 	return 0;
 }
@@ -4000,7 +5226,11 @@
 {
 	struct dentry *ret;
 
+#ifdef MY_ABC_HERE
+	ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry, (nd && nd->flags & LOOKUP_CASELESS_COMPARE)?1:0), dentry);
+#else
 	ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
+#endif
 	if (unlikely(d_need_lookup(dentry))) {
 		spin_lock(&dentry->d_lock);
 		dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
@@ -4025,7 +5255,6 @@
 	struct btrfs_path *path;
 	struct list_head ins_list;
 	struct list_head del_list;
-	struct qstr q;
 	int ret;
 	struct extent_buffer *leaf;
 	int slot;
@@ -4116,7 +5345,6 @@
 
 		while (di_cur < di_total) {
 			struct btrfs_key location;
-			struct dentry *tmp;
 
 			if (verify_dir_item(root, leaf, di))
 				break;
@@ -4137,35 +5365,15 @@
 			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
 			btrfs_dir_item_key_to_cpu(leaf, di, &location);
 
-			q.name = name_ptr;
-			q.len = name_len;
-			q.hash = full_name_hash(q.name, q.len);
-			tmp = d_lookup(filp->f_dentry, &q);
-			if (!tmp) {
-				struct btrfs_key *newkey;
-
-				newkey = kzalloc(sizeof(struct btrfs_key),
-						 GFP_NOFS);
-				if (!newkey)
-					goto no_dentry;
-				tmp = d_alloc(filp->f_dentry, &q);
-				if (!tmp) {
-					kfree(newkey);
-					dput(tmp);
-					goto no_dentry;
-				}
-				memcpy(newkey, &location,
-				       sizeof(struct btrfs_key));
-				tmp->d_fsdata = newkey;
-				tmp->d_flags |= DCACHE_NEED_LOOKUP;
-				d_rehash(tmp);
-				dput(tmp);
-			} else {
-				dput(tmp);
-			}
-no_dentry:
+
 			/* is this a reference to our own snapshot? If so
-			 * skip it
+			 * skip it.
+			 *
+			 * In contrast to old kernels, we insert the snapshot's
+			 * dir item and dir index after it has been created, so
+			 * we won't find a reference to our own snapshot. We
+			 * still keep the following code for backward
+			 * compatibility.
 			 */
 			if (location.type == BTRFS_ROOT_ITEM_KEY &&
 			    location.objectid == root->root_key.objectid) {
@@ -4201,14 +5409,31 @@
 	}
 
 	/* Reached end of directory/root. Bump pos past the last item. */
-	if (key_type == BTRFS_DIR_INDEX_KEY)
-		/*
-		 * 32-bit glibc will use getdents64, but then strtol -
-		 * so the last number we can serve is this.
-		 */
-		filp->f_pos = 0x7fffffff;
-	else
-		filp->f_pos++;
+	filp->f_pos++;
+
+	/*
+	 * Stop new entries from being returned after we return the last
+	 * entry.
+	 *
+	 * New directory entries are assigned a strictly increasing
+	 * offset.  This means that new entries created during readdir
+	 * are *guaranteed* to be seen in the future by that readdir.
+	 * This has broken buggy programs which operate on names as
+	 * they're returned by readdir.  Until we re-use freed offsets
+	 * we have this hack to stop new entries from being returned
+	 * under the assumption that they'll never reach this huge
+	 * offset.
+	 *
+	 * This is being careful not to overflow 32bit loff_t unless the
+	 * last entry requires it because doing so has broken 32bit apps
+	 * in the past.
+	 */
+	if (key_type == BTRFS_DIR_INDEX_KEY) {
+		if (filp->f_pos >= INT_MAX)
+			filp->f_pos = LLONG_MAX;
+		else
+			filp->f_pos = INT_MAX;
+	}
 nopos:
 	ret = 0;
 err:
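/*
 * A minimal standalone sketch of the readdir f_pos capping rule added in the
 * hunk above, assuming a hypothetical helper name (cap_readdir_pos) that is
 * not part of this patch; INT_MAX and LLONG_MAX come from <limits.h>.
 */
#include <limits.h>

typedef long long loff_t;	/* stand-in for the kernel's 64-bit loff_t */

static loff_t cap_readdir_pos(loff_t pos)
{
	/*
	 * Keep the reported offset within 32 bits for as long as possible;
	 * only when the last real entry already needed a larger offset do we
	 * jump to the 64-bit cap, matching the DIR_INDEX handling above.
	 */
	return (pos >= INT_MAX) ? LLONG_MAX : INT_MAX;
}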
@@ -4225,10 +5450,10 @@
 	int ret = 0;
 	bool nolock = false;
 
-	if (BTRFS_I(inode)->dummy_inode)
+	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
 		return 0;
 
-	if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(root, inode))
+	if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
 		nolock = true;
 
 	if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -4238,10 +5463,7 @@
 			trans = btrfs_join_transaction(root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
-		if (nolock)
-			ret = btrfs_end_transaction_nolock(trans, root);
-		else
-			ret = btrfs_commit_transaction(trans, root);
+		ret = btrfs_commit_transaction(trans, root);
 	}
 	return ret;
 }
@@ -4252,13 +5474,13 @@
  * FIXME, needs more benchmarking...there are no reasons other than performance
  * to keep or drop this code.
  */
-int btrfs_dirty_inode(struct inode *inode)
+static int btrfs_dirty_inode(struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret;
 
-	if (BTRFS_I(inode)->dummy_inode)
+	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
 		return 0;
 
 	trans = btrfs_join_transaction(root);
@@ -4286,46 +5508,23 @@
  * This is a copy of file_update_time.  We need this so we can return error on
  * ENOSPC for updating the inode in the case of file write and mmap writes.
  */
-int btrfs_update_time(struct file *file)
+static int btrfs_update_time(struct inode *inode, struct timespec *now,
+			     int flags)
 {
-	struct inode *inode = file->f_path.dentry->d_inode;
-	struct timespec now;
-	int ret;
-	enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
-
-	/* First try to exhaust all avenues to not sync */
-	if (IS_NOCMTIME(inode))
-		return 0;
-
-	now = current_fs_time(inode->i_sb);
-	if (!timespec_equal(&inode->i_mtime, &now))
-		sync_it = S_MTIME;
-
-	if (!timespec_equal(&inode->i_ctime, &now))
-		sync_it |= S_CTIME;
-
-	if (IS_I_VERSION(inode))
-		sync_it |= S_VERSION;
-
-	if (!sync_it)
-		return 0;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 
-	/* Finally allowed to write? Takes lock. */
-	if (mnt_want_write_file(file))
-		return 0;
+	if (btrfs_root_readonly(root))
+		return -EROFS;
 
-	/* Only change inode inside the lock region */
-	if (sync_it & S_VERSION)
+	if (flags & S_VERSION)
 		inode_inc_iversion(inode);
-	if (sync_it & S_CTIME)
-		inode->i_ctime = now;
-	if (sync_it & S_MTIME)
-		inode->i_mtime = now;
-	ret = btrfs_dirty_inode(inode);
-	if (!ret)
-		mark_inode_dirty_sync(inode);
-	mnt_drop_write(file->f_path.mnt);
-	return ret;
+	if (flags & S_CTIME)
+		inode->i_ctime = *now;
+	if (flags & S_MTIME)
+		inode->i_mtime = *now;
+	if (flags & S_ATIME)
+		inode->i_atime = *now;
+	return btrfs_dirty_inode(inode);
 }
 
 /*
@@ -4412,8 +5611,8 @@
 				     struct btrfs_root *root,
 				     struct inode *dir,
 				     const char *name, int name_len,
-				     u64 ref_objectid, u64 objectid, int mode,
-				     u64 *index)
+				     u64 ref_objectid, u64 objectid,
+				     umode_t mode, u64 *index)
 {
 	struct inode *inode;
 	struct btrfs_inode_item *inode_item;
@@ -4461,7 +5660,14 @@
 	BTRFS_I(inode)->root = root;
 	BTRFS_I(inode)->generation = trans->transid;
 	inode->i_generation = BTRFS_I(inode)->generation;
-	btrfs_set_inode_space_info(root, inode);
+
+	/*
+	 * We could have gotten an inode number from somebody who was fsynced
+	 * and then removed in this same transaction, so let's just set full
+	 * sync since it will be a full sync anyway and this will blow away the
+	 * old info in the log.
+	 */
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
 
 	if (S_ISDIR(mode))
 		owner = 0;
@@ -4472,6 +5678,12 @@
 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
 	key[0].offset = 0;
 
+	/*
+	 * Start new inodes with an inode_ref. This is slightly more
+	 * efficient for small numbers of hard links since they will
+	 * be packed into one item. Extended refs will kick in if we
+	 * add more hard links than can fit in the ref item.
+	 */
 	key[1].objectid = objectid;
 	btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
 	key[1].offset = ref_objectid;
@@ -4489,6 +5701,8 @@
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				  struct btrfs_inode_item);
+	memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
+			     sizeof(*inode_item));
 	fill_inode_item(trans, path->nodes[0], inode_item, inode);
 
 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
@@ -4511,9 +5725,9 @@
 	if (S_ISREG(mode)) {
 		if (btrfs_test_opt(root, NODATASUM))
 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
-		if (btrfs_test_opt(root, NODATACOW) ||
-		    (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
-			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+		if (btrfs_test_opt(root, NODATACOW))
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
+				BTRFS_INODE_NODATASUM;
 	}
 
 	insert_inode_hash(inode);
@@ -4522,6 +5736,8 @@
 	trace_btrfs_inode_new(inode);
 	btrfs_set_inode_last_trans(trans, inode);
 
+	btrfs_update_root_times(trans, root);
+
 	return inode;
 fail:
 	if (dir)
@@ -4569,16 +5785,43 @@
 					     parent_ino, index);
 	}
 
-	if (ret == 0) {
-		ret = btrfs_insert_dir_item(trans, root, name, name_len,
-					    parent_inode, &key,
-					    btrfs_inode_type(inode), index);
-		BUG_ON(ret);
+	/* Nothing to clean up yet */
+	if (ret)
+		return ret;
+
+	ret = btrfs_insert_dir_item(trans, root, name, name_len,
+				    parent_inode, &key,
+				    btrfs_inode_type(inode), index);
+	if (ret == -EEXIST || ret == -EOVERFLOW)
+		goto fail_dir_item;
+	else if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		return ret;
+	}
+
+	btrfs_i_size_write(parent_inode, parent_inode->i_size +
+			   name_len * 2);
+	inode_inc_iversion(parent_inode);
+	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+	ret = btrfs_update_inode(trans, root, parent_inode);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+	return ret;
+
+fail_dir_item:
+	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
+		u64 local_index;
+		int err;
+		err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
+				 key.objectid, root->root_key.objectid,
+				 parent_ino, &local_index, name, name_len);
 
-		btrfs_i_size_write(parent_inode, parent_inode->i_size +
-				   name_len * 2);
-		parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
-		ret = btrfs_update_inode(trans, root, parent_inode);
+	} else if (add_backref) {
+		u64 local_index;
+		int err;
+
+		err = btrfs_del_inode_ref(trans, root, name, name_len,
+					  ino, parent_ino, &local_index);
 	}
 	return ret;
 }
@@ -4604,18 +5847,26 @@
 	int err;
 	int drop_inode = 0;
 	u64 objectid;
-	unsigned long nr = 0;
 	u64 index = 0;
 
 	if (!new_valid_dev(rdev))
 		return -EINVAL;
 
+#ifdef MY_ABC_HERE
+	/*
+	 * 2 for inode item and ref
+	 * 3 for dir items
+	 * 1 for xattr if selinux is on
+	 */
+	trans = btrfs_start_transaction(root, 6);
+#else
 	/*
 	 * 2 for inode item and ref
 	 * 2 for dir items
 	 * 1 for xattr if selinux is on
 	 */
 	trans = btrfs_start_transaction(root, 5);
+#endif
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
@@ -4637,6 +5888,14 @@
 		goto out_unlock;
 	}
 
+#if defined(MY_ABC_HERE) || defined(MY_ABC_HERE)
+	err = syno_btrfs_init_attr(trans, inode);
+	if (err) {
+		drop_inode = 1;
+		goto out_unlock;
+	}
+#endif
+
 	/*
 	* If the active LSM wants to access the inode during
 	* d_instantiate it needs these. Smack checks to see
@@ -4654,9 +5913,8 @@
 		d_instantiate(dentry, inode);
 	}
 out_unlock:
-	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_end_transaction(trans, root);
+	btrfs_btree_balance_dirty(root);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
@@ -4670,18 +5928,26 @@
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = NULL;
-	int drop_inode = 0;
+	int drop_inode_on_err = 0;
 	int err;
-	unsigned long nr = 0;
 	u64 objectid;
 	u64 index = 0;
 
+#ifdef MY_ABC_HERE
+	/*
+	 * 2 for inode item and ref
+	 * 3 for dir items
+	 * 1 for xattr if selinux is on
+	 */
+	trans = btrfs_start_transaction(root, 6);
+#else
 	/*
 	 * 2 for inode item and ref
 	 * 2 for dir items
 	 * 1 for xattr if selinux is on
 	 */
 	trans = btrfs_start_transaction(root, 5);
+#endif
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
@@ -4696,12 +5962,22 @@
 		err = PTR_ERR(inode);
 		goto out_unlock;
 	}
+	drop_inode_on_err = 1;
 
 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+	if (err)
+		goto out_unlock;
+
+#if defined(MY_ABC_HERE) || defined(MY_ABC_HERE)
+	err = syno_btrfs_init_attr(trans, inode);
 	if (err) {
-		drop_inode = 1;
 		goto out_unlock;
 	}
+#endif
+
+	err = btrfs_update_inode(trans, root, inode);
+	if (err)
+		goto out_unlock;
 
 	/*
 	* If the active LSM wants to access the inode during
@@ -4714,21 +5990,20 @@
 
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
-		drop_inode = 1;
-	else {
-		inode->i_mapping->a_ops = &btrfs_aops;
-		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-		d_instantiate(dentry, inode);
-	}
+		goto out_unlock;
+
+	inode->i_mapping->a_ops = &btrfs_aops;
+	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+	d_instantiate(dentry, inode);
+
 out_unlock:
-	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
-	if (drop_inode) {
+	btrfs_end_transaction(trans, root);
+	if (err && drop_inode_on_err) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -4739,7 +6014,6 @@
 	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct inode *inode = old_dentry->d_inode;
 	u64 index;
-	unsigned long nr = 0;
 	int err;
 	int drop_inode = 0;
 
@@ -4747,7 +6021,7 @@
 	if (root->objectid != BTRFS_I(inode)->root->objectid)
 		return -EXDEV;
 
-	if (inode->i_nlink == ~0U)
+	if (inode->i_nlink >= BTRFS_LINK_MAX)
 		return -EMLINK;
 
 	err = btrfs_set_inode_index(dir, &index);
@@ -4766,8 +6040,10 @@
 	}
 
 	btrfs_inc_nlink(inode);
+	inode_inc_iversion(inode);
 	inode->i_ctime = CURRENT_TIME;
 	ihold(inode);
+	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
 
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
 
@@ -4776,19 +6052,19 @@
 	} else {
 		struct dentry *parent = dentry->d_parent;
 		err = btrfs_update_inode(trans, root, inode);
-		BUG_ON(err);
+		if (err)
+			goto fail;
 		d_instantiate(dentry, inode);
 		btrfs_log_new_name(trans, inode, NULL, parent);
 	}
 
-	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 fail:
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -4801,14 +6077,22 @@
 	int drop_on_err = 0;
 	u64 objectid = 0;
 	u64 index = 0;
-	unsigned long nr = 1;
 
+#ifdef MY_ABC_HERE
+	/*
+	 * 2 items for inode and ref
+	 * 3 items for dir items
+	 * 1 for xattr if selinux is on
+	 */
+	trans = btrfs_start_transaction(root, 6);
+#else
 	/*
 	 * 2 items for inode and ref
 	 * 2 items for dir items
 	 * 1 for xattr if selinux is on
 	 */
 	trans = btrfs_start_transaction(root, 5);
+#endif
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
@@ -4830,6 +6114,13 @@
 	if (err)
 		goto out_fail;
 
+#if defined(MY_ABC_HERE) || defined(MY_ABC_HERE)
+	err = syno_btrfs_init_attr(trans, inode);
+	if (err) {
+		goto out_fail;
+	}
+#endif
+
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
 
@@ -4847,11 +6138,10 @@
 	drop_on_err = 0;
 
 out_fail:
-	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 	if (drop_on_err)
 		iput(inode);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -4875,7 +6165,7 @@
 		em->block_start += start_diff;
 		em->block_len -= start_diff;
 	}
-	return add_extent_mapping(em_tree, em);
+	return add_extent_mapping(em_tree, em, 0);
 }
 
 static noinline int uncompress_inline(struct btrfs_path *path,
@@ -4907,12 +6197,12 @@
 	ret = btrfs_decompress(compress_type, tmp, page,
 			       extent_offset, inline_size, max_size);
 	if (ret) {
-		char *kaddr = kmap_atomic(page, KM_USER0);
+		char *kaddr = kmap_atomic(page);
 		unsigned long copy_size = min_t(u64,
 				  PAGE_CACHE_SIZE - pg_offset,
 				  max_size - extent_offset);
 		memset(kaddr + pg_offset, 0, copy_size);
-		kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr);
 	}
 	kfree(tmp);
 	return 0;
@@ -5022,8 +6312,7 @@
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		size_t size;
 		size = btrfs_file_extent_inline_len(leaf, item);
-		extent_end = (extent_start + size + root->sectorsize - 1) &
-			~((u64)root->sectorsize - 1);
+		extent_end = ALIGN(extent_start + size, root->sectorsize);
 	}
 
 	if (start >= extent_end) {
@@ -5045,16 +6334,20 @@
 		if (start + len <= found_key.offset)
 			goto not_found;
 		em->start = start;
+		em->orig_start = start;
 		em->len = found_key.offset - start;
 		goto not_found_em;
 	}
 
+	em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
 		em->start = extent_start;
 		em->len = extent_end - extent_start;
 		em->orig_start = extent_start -
 				 btrfs_file_extent_offset(leaf, item);
+		em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
+								      item);
 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
 		if (bytenr == 0) {
 			em->block_start = EXTENT_MAP_HOLE;
@@ -5064,8 +6357,7 @@
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 			em->compress_type = compress_type;
 			em->block_start = bytenr;
-			em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
-									 item);
+			em->block_len = em->orig_block_len;
 		} else {
 			bytenr += btrfs_file_extent_offset(leaf, item);
 			em->block_start = bytenr;
@@ -5093,9 +6385,9 @@
 		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
 				size - extent_offset);
 		em->start = extent_start + extent_offset;
-		em->len = (copy_size + root->sectorsize - 1) &
-			~((u64)root->sectorsize - 1);
-		em->orig_start = EXTENT_MAP_INLINE;
+		em->len = ALIGN(copy_size, root->sectorsize);
+		em->orig_block_len = em->len;
+		em->orig_start = em->start;
 		if (compress_type) {
 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 			em->compress_type = compress_type;
@@ -5107,7 +6399,7 @@
 				ret = uncompress_inline(path, inode, page,
 							pg_offset,
 							extent_offset, item);
-				BUG_ON(ret);
+				BUG_ON(ret); /* -ENOMEM */
 			} else {
 				map = kmap(page);
 				read_extent_buffer(leaf, map + pg_offset, ptr,
@@ -5121,7 +6413,7 @@
 			}
 			flush_dcache_page(page);
 		} else if (create && PageUptodate(page)) {
-			WARN_ON(1);
+			BUG();
 			if (!trans) {
 				kunmap(page);
 				free_extent_map(em);
@@ -5144,11 +6436,11 @@
 				    extent_map_end(em) - 1, NULL, GFP_NOFS);
 		goto insert;
 	} else {
-		printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
-		WARN_ON(1);
+		WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
 	}
 not_found:
 	em->start = start;
+	em->orig_start = start;
 	em->len = len;
 not_found_em:
 	em->block_start = EXTENT_MAP_HOLE;
@@ -5156,18 +6448,15 @@
 insert:
 	btrfs_release_path(path);
 	if (em->start > start || extent_map_end(em) <= start) {
-		printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
-		       "[%llu %llu]\n", (unsigned long long)em->start,
-		       (unsigned long long)em->len,
-		       (unsigned long long)start,
-		       (unsigned long long)len);
+		btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
+			em->start, em->len, start, len);
 		err = -EIO;
 		goto out;
 	}
 
 	err = 0;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em);
+	ret = add_extent_mapping(em_tree, em, 0);
 	/* it is possible that someone inserted the extent into the tree
 	 * while we had the lock dropped.  It is also possible that
 	 * an overlapping map exists in the tree
@@ -5209,7 +6498,8 @@
 	write_unlock(&em_tree->lock);
 out:
 
-	trace_btrfs_get_extent(root, em);
+	if (em)
+		trace_btrfs_get_extent(root, em);
 
 	if (path)
 		btrfs_free_path(path);
@@ -5222,6 +6512,7 @@
 		free_extent_map(em);
 		return ERR_PTR(err);
 	}
+	BUG_ON(!em); /* Error is always set */
 	return em;
 }
 
@@ -5242,10 +6533,13 @@
 		return em;
 	if (em) {
 		/*
-		 * if our em maps to a hole, there might
-		 * actually be delalloc bytes behind it
+		 * if our em maps to
+		 * -  a hole or
+		 * -  a pre-alloc extent,
+		 * there might actually be delalloc bytes behind it.
 		 */
-		if (em->block_start != EXTENT_MAP_HOLE)
+		if (em->block_start != EXTENT_MAP_HOLE &&
+		    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
 			return em;
 		else
 			hole_em = em;
@@ -5327,6 +6621,8 @@
 			 */
 			em->block_start = hole_em->block_start;
 			em->block_len = hole_len;
+			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
+				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 		} else {
 			em->start = range_start;
 			em->len = found;
@@ -5348,88 +6644,35 @@
 }
 
 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
-						  struct extent_map *em,
 						  u64 start, u64 len)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
 	struct btrfs_key ins;
 	u64 alloc_hint;
 	int ret;
-	bool insert = false;
-
-	/*
-	 * Ok if the extent map we looked up is a hole and is for the exact
-	 * range we want, there is no reason to allocate a new one, however if
-	 * it is not right then we need to free this one and drop the cache for
-	 * our range.
-	 */
-	if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
-	    em->len != len) {
-		free_extent_map(em);
-		em = NULL;
-		insert = true;
-		btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
-	}
-
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans))
-		return ERR_CAST(trans);
-
-	if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
-		btrfs_add_inode_defrag(trans, inode);
-
-	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 
 	alloc_hint = get_extent_allocation_hint(inode, start, len);
-	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
-				   alloc_hint, (u64)-1, &ins, 1);
-	if (ret) {
-		em = ERR_PTR(ret);
-		goto out;
-	}
-
-	if (!em) {
-		em = alloc_extent_map();
-		if (!em) {
-			em = ERR_PTR(-ENOMEM);
-			goto out;
-		}
-	}
-
-	em->start = start;
-	em->orig_start = em->start;
-	em->len = ins.offset;
-
-	em->block_start = ins.objectid;
-	em->block_len = ins.offset;
-	em->bdev = root->fs_info->fs_devices->latest_bdev;
-
-	/*
-	 * We need to do this because if we're using the original em we searched
-	 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
-	 */
-	em->flags = 0;
-	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+	ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
+				   alloc_hint, &ins, 1);
+	if (ret)
+		return ERR_PTR(ret);
 
-	while (insert) {
-		write_lock(&em_tree->lock);
-		ret = add_extent_mapping(em_tree, em);
-		write_unlock(&em_tree->lock);
-		if (ret != -EEXIST)
-			break;
-		btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+			      ins.offset, ins.offset, ins.offset, 0);
+	if (IS_ERR(em)) {
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+		return em;
 	}
 
 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
 					   ins.offset, ins.offset, 0);
 	if (ret) {
 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
-		em = ERR_PTR(ret);
+		free_extent_map(em);
+		return ERR_PTR(ret);
 	}
-out:
-	btrfs_end_transaction(trans, root);
+
 	return em;
 }
 
@@ -5437,9 +6680,11 @@
  * returns 1 when the nocow is safe, < 1 on error, 0 if the
  * block must be cow'd
  */
-static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
-				      struct inode *inode, u64 offset, u64 len)
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
+			      u64 *orig_start, u64 *orig_block_len,
+			      u64 *ram_bytes)
 {
+	struct btrfs_trans_handle *trans;
 	struct btrfs_path *path;
 	int ret;
 	struct extent_buffer *leaf;
@@ -5452,12 +6697,12 @@
 	u64 num_bytes;
 	int slot;
 	int found_type;
-
+	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
-	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
+	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
 				       offset, 0);
 	if (ret < 0)
 		goto out;
@@ -5492,15 +6737,29 @@
 		/* not a regular extent, must cow */
 		goto out;
 	}
+
+	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
+		goto out;
+
 	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
-	backref_offset = btrfs_file_extent_offset(leaf, fi);
+	if (disk_bytenr == 0)
+		goto out;
 
-	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
-	if (extent_end < offset + len) {
-		/* extent doesn't include our full range, must cow */
+	if (btrfs_file_extent_compression(leaf, fi) ||
+	    btrfs_file_extent_encryption(leaf, fi) ||
+	    btrfs_file_extent_other_encoding(leaf, fi))
 		goto out;
+
+	backref_offset = btrfs_file_extent_offset(leaf, fi);
+
+	if (orig_start) {
+		*orig_start = key.offset - backref_offset;
+		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
 	}
 
+	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+
 	if (btrfs_extent_readonly(root, disk_bytenr))
 		goto out;
 
@@ -5508,9 +6767,19 @@
 	 * look for other files referencing this extent, if we
 	 * find any we must cow
 	 */
-	if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
-				  key.offset - backref_offset, disk_bytenr))
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = 0;
 		goto out;
+	}
+
+	ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
+				    key.offset - backref_offset, disk_bytenr);
+	btrfs_end_transaction(trans, root);
+	if (ret) {
+		ret = 0;
+		goto out;
+	}
 
 	/*
 	 * adjust disk_bytenr and num_bytes to cover just the bytes
@@ -5520,31 +6789,161 @@
 	 */
 	disk_bytenr += backref_offset;
 	disk_bytenr += offset - key.offset;
-	num_bytes = min(offset + len, extent_end) - offset;
+	num_bytes = min(offset + *len, extent_end) - offset;
 	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
 				goto out;
 	/*
 	 * all of the above have passed, it is safe to overwrite this extent
 	 * without cow
 	 */
+	*len = num_bytes;
 	ret = 1;
 out:
 	btrfs_free_path(path);
 	return ret;
 }
 
+static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
+			      struct extent_state **cached_state, int writing)
+{
+	struct btrfs_ordered_extent *ordered;
+	int ret = 0;
+
+	while (1) {
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				 0, cached_state);
+		/*
+		 * We're concerned with the entire range that we're going to be
+		 * doing DIO to, so we need to make sure there are no ordered
+		 * extents in this range.
+		 */
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+
+		/*
+		 * We need to make sure there are no buffered pages in this
+		 * range either, we could have raced between the invalidate in
+		 * generic_file_direct_write and locking the extent.  The
+		 * invalidate needs to happen so that reads after a write do not
+		 * get stale data.
+		 */
+		if (!ordered && (!writing ||
+		    !test_range_bit(&BTRFS_I(inode)->io_tree,
+				    lockstart, lockend, EXTENT_UPTODATE, 0,
+				    *cached_state)))
+			break;
+
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+				     cached_state, GFP_NOFS);
+
+		if (ordered) {
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+		} else {
+			/* Screw you mmap */
+			ret = filemap_write_and_wait_range(inode->i_mapping,
+							   lockstart,
+							   lockend);
+			if (ret)
+				break;
+
+			/*
+			 * If we found a page that couldn't be invalidated just
+			 * fall back to buffered.
+			 */
+			ret = invalidate_inode_pages2_range(inode->i_mapping,
+					lockstart >> PAGE_CACHE_SHIFT,
+					lockend >> PAGE_CACHE_SHIFT);
+			if (ret)
+				break;
+		}
+
+		cond_resched();
+	}
+
+	return ret;
+}
+
+static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
+					   u64 len, u64 orig_start,
+					   u64 block_start, u64 block_len,
+					   u64 orig_block_len, u64 ram_bytes,
+					   int type)
+{
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int ret;
+
+	em_tree = &BTRFS_I(inode)->extent_tree;
+	em = alloc_extent_map();
+	if (!em)
+		return ERR_PTR(-ENOMEM);
+
+	em->start = start;
+	em->orig_start = orig_start;
+	em->mod_start = start;
+	em->mod_len = len;
+	em->len = len;
+	em->block_len = block_len;
+	em->block_start = block_start;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	em->orig_block_len = orig_block_len;
+	em->ram_bytes = ram_bytes;
+	em->generation = -1;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+	if (type == BTRFS_ORDERED_PREALLOC)
+		set_bit(EXTENT_FLAG_FILLING, &em->flags);
+
+	do {
+		btrfs_drop_extent_cache(inode, em->start,
+				em->start + em->len - 1, 0);
+		write_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em, 1);
+		write_unlock(&em_tree->lock);
+	} while (ret == -EEXIST);
+
+	if (ret) {
+		free_extent_map(em);
+		return ERR_PTR(ret);
+	}
+
+	return em;
+}
+
+
 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
 	struct extent_map *em;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_state *cached_state = NULL;
 	u64 start = iblock << inode->i_blkbits;
+	u64 lockstart, lockend;
 	u64 len = bh_result->b_size;
-	struct btrfs_trans_handle *trans;
+	int unlock_bits = EXTENT_LOCKED;
+	int ret = 0;
+
+	if (create)
+		unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
+	else
+		len = min_t(u64, len, root->sectorsize);
+
+	lockstart = start;
+	lockend = start + len - 1;
+
+	/*
+	 * If this errors out it's because we couldn't invalidate pagecache for
+	 * this range and we need to fallback to buffered.
+	 */
+	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
+		return -ENOTBLK;
 
 	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
-	if (IS_ERR(em))
-		return PTR_ERR(em);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto unlock_err;
+	}
 
 	/*
 	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
@@ -5563,17 +6962,15 @@
 	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
 	    em->block_start == EXTENT_MAP_INLINE) {
 		free_extent_map(em);
-		return -ENOTBLK;
+		ret = -ENOTBLK;
+		goto unlock_err;
 	}
 
 	/* Just a good old fashioned hole, return */
 	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
 			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
 		free_extent_map(em);
-		/* DIO will do one hole at a time, so just unlock a sector */
-		unlock_extent(&BTRFS_I(inode)->io_tree, start,
-			      start + root->sectorsize - 1, GFP_NOFS);
-		return 0;
+		goto unlock_err;
 	}
 
 	/*
@@ -5586,8 +6983,9 @@
 	 *
 	 */
 	if (!create) {
-		len = em->len - (start - em->start);
-		goto map;
+		len = min(len, em->len - (start - em->start));
+		lockstart = start + len;
+		goto unlock;
 	}
 
 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
@@ -5595,7 +6993,7 @@
 	     em->block_start != EXTENT_MAP_HOLE)) {
 		int type;
 		int ret;
-		u64 block_start;
+		u64 block_start, orig_start, orig_block_len, ram_bytes;
 
 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
 			type = BTRFS_ORDERED_PREALLOC;
@@ -5604,71 +7002,90 @@
 		len = min(len, em->len - (start - em->start));
 		block_start = em->block_start + (start - em->start);
 
-		/*
-		 * we're not going to log anything, but we do need
-		 * to make sure the current transaction stays open
-		 * while we look for nocow cross refs
-		 */
-		trans = btrfs_join_transaction(root);
-		if (IS_ERR(trans))
-			goto must_cow;
+		if (can_nocow_extent(inode, start, &len, &orig_start,
+				     &orig_block_len, &ram_bytes) == 1) {
+			if (type == BTRFS_ORDERED_PREALLOC) {
+				free_extent_map(em);
+				em = create_pinned_em(inode, start, len,
+						       orig_start,
+						       block_start, len,
+						       orig_block_len,
+						       ram_bytes, type);
+				if (IS_ERR(em))
+					goto unlock_err;
+			}
 
-		if (can_nocow_odirect(trans, inode, start, len) == 1) {
 			ret = btrfs_add_ordered_extent_dio(inode, start,
 					   block_start, len, len, type);
-			btrfs_end_transaction(trans, root);
 			if (ret) {
 				free_extent_map(em);
-				return ret;
+				goto unlock_err;
 			}
 			goto unlock;
 		}
-		btrfs_end_transaction(trans, root);
 	}
-must_cow:
+
 	/*
 	 * this will cow the extent, reset the len in case we changed
 	 * it above
 	 */
 	len = bh_result->b_size;
-	em = btrfs_new_extent_direct(inode, em, start, len);
-	if (IS_ERR(em))
-		return PTR_ERR(em);
+	free_extent_map(em);
+	em = btrfs_new_extent_direct(inode, start, len);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto unlock_err;
+	}
 	len = min(len, em->len - (start - em->start));
 unlock:
-	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
-			  EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
-			  0, NULL, GFP_NOFS);
-map:
 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
 		inode->i_blkbits;
 	bh_result->b_size = len;
 	bh_result->b_bdev = em->bdev;
 	set_buffer_mapped(bh_result);
-	if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
-		set_buffer_new(bh_result);
+	if (create) {
+		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+			set_buffer_new(bh_result);
 
-	free_extent_map(em);
+		/*
+		 * Need to update the i_size under the extent lock so buffered
+		 * readers will get the updated i_size when we unlock.
+		 */
+		if (start + len > i_size_read(inode))
+			i_size_write(inode, start + len);
 
-	return 0;
-}
+		spin_lock(&BTRFS_I(inode)->lock);
+		BTRFS_I(inode)->outstanding_extents++;
+		spin_unlock(&BTRFS_I(inode)->lock);
 
-struct btrfs_dio_private {
-	struct inode *inode;
-	u64 logical_offset;
-	u64 disk_bytenr;
-	u64 bytes;
-	u32 *csums;
-	void *private;
+		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				     lockstart + len - 1, EXTENT_DELALLOC, NULL,
+				     &cached_state, GFP_NOFS);
+		BUG_ON(ret);
+	}
 
-	/* number of bios pending for this dio */
-	atomic_t pending_bios;
+	/*
+	 * In the case of write we need to clear and unlock the entire range,
+	 * in the case of read we need to unlock only the end area that we
+	 * aren't using if there is any left over space.
+	 */
+	if (lockstart < lockend) {
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+				 lockend, unlock_bits, 1, 0,
+				 &cached_state, GFP_NOFS);
+	} else {
+		free_extent_state(cached_state);
+	}
 
-	/* IO errors */
-	int errors;
+	free_extent_map(em);
 
-	struct bio *orig_bio;
-};
+	return 0;
+
+unlock_err:
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
+	return ret;
+}
 
 static void btrfs_endio_direct_read(struct bio *bio, int err)
 {
@@ -5677,8 +7094,10 @@
 	struct bio_vec *bvec = bio->bi_io_vec;
 	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct bio *dio_bio;
+	u32 *csums = (u32 *)dip->csum;
+	int index = 0;
 	u64 start;
-	u32 *private = dip->csums;
 
 	start = dip->logical_offset;
 	do {
@@ -5689,40 +7108,38 @@
 			unsigned long flags;
 
 			local_irq_save(flags);
-			kaddr = kmap_atomic(page, KM_IRQ0);
-			csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+			kaddr = kmap_atomic(page);
+			csum = btrfs_csum_data(kaddr + bvec->bv_offset,
 					       csum, bvec->bv_len);
 			btrfs_csum_final(csum, (char *)&csum);
-			kunmap_atomic(kaddr, KM_IRQ0);
+			kunmap_atomic(kaddr);
 			local_irq_restore(flags);
 
 			flush_dcache_page(bvec->bv_page);
-			if (csum != *private) {
-				printk(KERN_ERR "btrfs csum failed ino %llu off"
-				      " %llu csum %u private %u\n",
-				      (unsigned long long)btrfs_ino(inode),
-				      (unsigned long long)start,
-				      csum, *private);
+			if (csum != csums[index]) {
+				btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
+					  btrfs_ino(inode), start, csum,
+					  csums[index]);
 				err = -EIO;
 			}
 		}
 
 		start += bvec->bv_len;
-		private++;
 		bvec++;
+		index++;
 	} while (bvec <= bvec_end);
 
 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
-		      dip->logical_offset + dip->bytes - 1, GFP_NOFS);
-	bio->bi_private = dip->private;
+		      dip->logical_offset + dip->bytes - 1);
+	dio_bio = dip->dio_bio;
 
-	kfree(dip->csums);
 	kfree(dip);
 
 	/* If we had a csum failure make sure to clear the uptodate flag */
 	if (err)
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	dio_end_io(bio, err);
+		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+	dio_end_io(dio_bio, err);
+	bio_put(bio);
 }
 
 static void btrfs_endio_direct_write(struct bio *bio, int err)
@@ -5730,11 +7147,10 @@
 	struct btrfs_dio_private *dip = bio->bi_private;
 	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered = NULL;
-	struct extent_state *cached_state = NULL;
 	u64 ordered_offset = dip->logical_offset;
 	u64 ordered_bytes = dip->bytes;
+	struct bio *dio_bio;
 	int ret;
 
 	if (err)
@@ -5742,73 +7158,14 @@
 again:
 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
 						   &ordered_offset,
-						   ordered_bytes);
+						   ordered_bytes, !err);
 	if (!ret)
 		goto out_test;
 
-	BUG_ON(!ordered);
-
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans)) {
-		err = -ENOMEM;
-		goto out;
-	}
-	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-
-	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
-		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
-		if (!ret)
-			err = btrfs_update_inode_fallback(trans, root, inode);
-		goto out;
-	}
-
-	lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
-			 ordered->file_offset + ordered->len - 1, 0,
-			 &cached_state, GFP_NOFS);
-
-	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
-		ret = btrfs_mark_extent_written(trans, inode,
-						ordered->file_offset,
-						ordered->file_offset +
-						ordered->len);
-		if (ret) {
-			err = ret;
-			goto out_unlock;
-		}
-	} else {
-		ret = insert_reserved_file_extent(trans, inode,
-						  ordered->file_offset,
-						  ordered->start,
-						  ordered->disk_len,
-						  ordered->len,
-						  ordered->len,
-						  0, 0, 0,
-						  BTRFS_FILE_EXTENT_REG);
-		unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
-				   ordered->file_offset, ordered->len);
-		if (ret) {
-			err = ret;
-			WARN_ON(1);
-			goto out_unlock;
-		}
-	}
-
-	add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
-	ret = btrfs_ordered_update_i_size(inode, 0, ordered);
-	if (!ret || !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags))
-		btrfs_update_inode_fallback(trans, root, inode);
-	ret = 0;
-out_unlock:
-	unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
-			     ordered->file_offset + ordered->len - 1,
-			     &cached_state, GFP_NOFS);
-out:
-	btrfs_delalloc_release_metadata(inode, ordered->len);
-	btrfs_end_transaction(trans, root);
-	ordered_offset = ordered->file_offset + ordered->len;
-	btrfs_put_ordered_extent(ordered);
-	btrfs_put_ordered_extent(ordered);
-
+	ordered->work.func = finish_ordered_fn;
+	ordered->work.flags = 0;
+	btrfs_queue_worker(&root->fs_info->endio_write_workers,
+			   &ordered->work);
 out_test:
 	/*
 	 * our bio might span multiple ordered extents.  If we haven't
@@ -5817,18 +7174,19 @@
 	if (ordered_offset < dip->logical_offset + dip->bytes) {
 		ordered_bytes = dip->logical_offset + dip->bytes -
 			ordered_offset;
+		ordered = NULL;
 		goto again;
 	}
 out_done:
-	bio->bi_private = dip->private;
+	dio_bio = dip->dio_bio;
 
-	kfree(dip->csums);
 	kfree(dip);
 
 	/* If we had an error make sure to clear the uptodate flag */
 	if (err)
-		clear_bit(BIO_UPTODATE, &bio->bi_flags);
-	dio_end_io(bio, err);
+		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
+	dio_end_io(dio_bio, err);
+	bio_put(bio);
 }
 
 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
@@ -5838,7 +7196,7 @@
 	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
-	BUG_ON(ret);
+	BUG_ON(ret); /* -ENOMEM */
 	return 0;
 }
 
@@ -5849,7 +7207,7 @@
 	if (err) {
 		printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
 		      "sector %#Lx len %u err no %d\n",
-		      (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw,
+		      btrfs_ino(dip->inode), bio->bi_rw,
 		      (unsigned long long)bio->bi_sector, bio->bi_size, err);
 		dip->errors = 1;
 
@@ -5864,10 +7222,10 @@
 	if (!atomic_dec_and_test(&dip->pending_bios))
 		goto out;
 
-	if (dip->errors)
+	if (dip->errors) {
 		bio_io_error(dip->orig_bio);
-	else {
-		set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
+	} else {
+		set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
 		bio_endio(dip->orig_bio, 0);
 	}
 out:
@@ -5883,16 +7241,23 @@
 
 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
 					 int rw, u64 file_offset, int skip_sum,
-					 u32 *csums, int async_submit)
+					 int async_submit)
 {
+	struct btrfs_dio_private *dip = bio->bi_private;
 	int write = rw & REQ_WRITE;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
 
+	if (async_submit)
+		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
+
 	bio_get(bio);
-	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-	if (ret)
-		goto err;
+
+	if (!write) {
+		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+		if (ret)
+			goto err;
+	}
 
 	if (skip_sum)
 		goto map;
@@ -5913,8 +7278,8 @@
 		if (ret)
 			goto err;
 	} else if (!skip_sum) {
-		ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
-					  file_offset, csums);
+		ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
+						file_offset);
 		if (ret)
 			goto err;
 	}
@@ -5931,7 +7296,6 @@
 {
 	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
 	struct bio *bio;
 	struct bio *orig_bio = dip->orig_bio;
 	struct bio_vec *bvec = orig_bio->bi_io_vec;
@@ -5940,13 +7304,11 @@
 	u64 submit_len = 0;
 	u64 map_length;
 	int nr_pages = 0;
-	u32 *csums = dip->csums;
 	int ret = 0;
 	int async_submit = 0;
-	int write = rw & REQ_WRITE;
 
 	map_length = orig_bio->bi_size;
-	ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
 			      &map_length, NULL, 0);
 	if (ret) {
 		bio_put(orig_bio);
@@ -5958,7 +7320,13 @@
 		goto submit;
 	}
 
-	async_submit = 1;
+	/* async crcs make it difficult to collect full stripe writes. */
+	if (btrfs_get_alloc_profile(root, 1) &
+	    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
+		async_submit = 0;
+	else
+		async_submit = 1;
+
 	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
 	if (!bio)
 		return -ENOMEM;
@@ -5979,16 +7347,13 @@
 			atomic_inc(&dip->pending_bios);
 			ret = __btrfs_submit_dio_bio(bio, inode, rw,
 						     file_offset, skip_sum,
-						     csums, async_submit);
+						     async_submit);
 			if (ret) {
 				bio_put(bio);
 				atomic_dec(&dip->pending_bios);
 				goto out_err;
 			}
 
-			/* Write's use the ordered csums */
-			if (!write && !skip_sum)
-				csums = csums + nr_pages;
 			start_sector += submit_len >> 9;
 			file_offset += submit_len;
 
@@ -6003,7 +7368,8 @@
 			bio->bi_end_io = btrfs_end_dio_bio;
 
 			map_length = orig_bio->bi_size;
-			ret = btrfs_map_block(map_tree, READ, start_sector << 9,
+			ret = btrfs_map_block(root->fs_info, rw,
+					      start_sector << 9,
 					      &map_length, NULL, 0);
 			if (ret) {
 				bio_put(bio);
@@ -6018,7 +7384,7 @@
 
 submit:
 	ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
-				     csums, async_submit);
+				     async_submit);
 	if (!ret)
 		return 0;
 
@@ -6037,59 +7403,63 @@
 	return 0;
 }
 
-static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
-				loff_t file_offset)
+static void btrfs_submit_direct(int rw, struct bio *dio_bio,
+				struct inode *inode, loff_t file_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_dio_private *dip;
-	struct bio_vec *bvec = bio->bi_io_vec;
+	struct bio *io_bio;
 	int skip_sum;
+	int sum_len;
 	int write = rw & REQ_WRITE;
 	int ret = 0;
+	u16 csum_size;
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
-	dip = kmalloc(sizeof(*dip), GFP_NOFS);
-	if (!dip) {
+	io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
+	if (!io_bio) {
 		ret = -ENOMEM;
 		goto free_ordered;
 	}
-	dip->csums = NULL;
 
-	/* Write's use the ordered csum stuff, so we don't need dip->csums */
-	if (!write && !skip_sum) {
-		dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
-		if (!dip->csums) {
-			kfree(dip);
-			ret = -ENOMEM;
-			goto free_ordered;
-		}
+	if (!skip_sum && !write) {
+		csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+		sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits;
+		sum_len *= csum_size;
+	} else {
+		sum_len = 0;
 	}
 
-	dip->private = bio->bi_private;
+	dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
+	if (!dip) {
+		ret = -ENOMEM;
+		goto free_io_bio;
+	}
+
+	dip->private = dio_bio->bi_private;
 	dip->inode = inode;
 	dip->logical_offset = file_offset;
-
-	dip->bytes = 0;
-	do {
-		dip->bytes += bvec->bv_len;
-		bvec++;
-	} while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
-
-	dip->disk_bytenr = (u64)bio->bi_sector << 9;
-	bio->bi_private = dip;
+	dip->bytes = dio_bio->bi_size;
+	dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
+	io_bio->bi_private = dip;
 	dip->errors = 0;
-	dip->orig_bio = bio;
+	dip->orig_bio = io_bio;
+	dip->dio_bio = dio_bio;
 	atomic_set(&dip->pending_bios, 0);
 
 	if (write)
-		bio->bi_end_io = btrfs_endio_direct_write;
+		io_bio->bi_end_io = btrfs_endio_direct_write;
 	else
-		bio->bi_end_io = btrfs_endio_direct_read;
+		io_bio->bi_end_io = btrfs_endio_direct_read;
 
 	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
 	if (!ret)
 		return;
+
+free_io_bio:
+	bio_put(io_bio);
+
 free_ordered:
 	/*
 	 * If this is a write, we need to clean up the reserved space and kill
@@ -6105,7 +7475,7 @@
 		btrfs_put_ordered_extent(ordered);
 		btrfs_put_ordered_extent(ordered);
 	}
-	bio_endio(bio, ret);
+	bio_endio(dio_bio, ret);
 }
 
 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
@@ -6149,101 +7519,88 @@
 out:
 	return retval;
 }
+
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 			const struct iovec *iov, loff_t offset,
 			unsigned long nr_segs)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
-	struct btrfs_ordered_extent *ordered;
-	struct extent_state *cached_state = NULL;
-	u64 lockstart, lockend;
+	size_t count = 0;
+	int flags = 0;
+	bool wakeup = true;
+	bool relock = false;
 	ssize_t ret;
-	int writing = rw & WRITE;
-	int write_bits = 0;
-	size_t count = iov_length(iov, nr_segs);
 
 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
-			    offset, nr_segs)) {
+			    offset, nr_segs))
 		return 0;
-	}
 
-	lockstart = offset;
-	lockend = offset + count - 1;
+	atomic_inc(&inode->i_dio_count);
+	smp_mb__after_atomic_inc();
 
-	if (writing) {
-		ret = btrfs_delalloc_reserve_space(inode, count);
-		if (ret)
-			goto out;
-	}
+	/*
+	 * The generic stuff only does filemap_write_and_wait_range, which isn't
+	 * enough if we've written compressed pages to this area, so we need to
+	 * call btrfs_wait_ordered_range to make absolutely sure that any
+	 * outstanding dirty pages are on disk.
+	 */
+	count = iov_length(iov, nr_segs);
+	btrfs_wait_ordered_range(inode, offset, count);
 
-	while (1) {
-		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 0, &cached_state, GFP_NOFS);
+	if (rw & WRITE) {
 		/*
-		 * We're concerned with the entire range that we're going to be
-		 * doing DIO to, so we need to make sure theres no ordered
-		 * extents in this range.
+		 * If the write DIO is beyond the EOF, we need to update
+		 * the isize, but it is protected by i_mutex, so we
+		 * cannot unlock the i_mutex in this case.
 		 */
-		ordered = btrfs_lookup_ordered_range(inode, lockstart,
-						     lockend - lockstart + 1);
-		if (!ordered)
-			break;
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				     &cached_state, GFP_NOFS);
-		btrfs_start_ordered_extent(inode, ordered, 1);
-		btrfs_put_ordered_extent(ordered);
-		cond_resched();
-	}
-
-	/*
-	 * we don't use btrfs_set_extent_delalloc because we don't want
-	 * the dirty or uptodate bits
-	 */
-	if (writing) {
-		write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
-		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				     EXTENT_DELALLOC, 0, NULL, &cached_state,
-				     GFP_NOFS);
-		if (ret) {
-			clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
-					 lockend, EXTENT_LOCKED | write_bits,
-					 1, 0, &cached_state, GFP_NOFS);
-			goto out;
+		if (offset + count <= inode->i_size) {
+			mutex_unlock(&inode->i_mutex);
+			relock = true;
 		}
+		ret = btrfs_delalloc_reserve_space(inode, count);
+		if (ret)
+			goto out;
+	} else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+				     &BTRFS_I(inode)->runtime_flags))) {
+		inode_dio_done(inode);
+		flags = DIO_LOCKING | DIO_SKIP_HOLES;
+		wakeup = false;
 	}
 
-	free_extent_state(cached_state);
-	cached_state = NULL;
-
 	ret = __blockdev_direct_IO(rw, iocb, inode,
-		   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
-		   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
-		   btrfs_submit_direct, 0);
-
-	if (ret < 0 && ret != -EIOCBQUEUED) {
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
-			      offset + iov_length(iov, nr_segs) - 1,
-			      EXTENT_LOCKED | write_bits, 1, 0,
-			      &cached_state, GFP_NOFS);
-	} else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
-		/*
-		 * We're falling back to buffered, unlock the section we didn't
-		 * do IO on.
-		 */
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
-			      offset + iov_length(iov, nr_segs) - 1,
-			      EXTENT_LOCKED | write_bits, 1, 0,
-			      &cached_state, GFP_NOFS);
+			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+			iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+			btrfs_submit_direct, flags);
+	if (rw & WRITE) {
+		if (ret < 0 && ret != -EIOCBQUEUED)
+			btrfs_delalloc_release_space(inode, count);
+		else if (ret >= 0 && (size_t)ret < count)
+			btrfs_delalloc_release_space(inode,
+						     count - (size_t)ret);
+		else
+			btrfs_delalloc_release_metadata(inode, 0);
 	}
 out:
-	free_extent_state(cached_state);
+	if (wakeup)
+		inode_dio_done(inode);
+	if (relock)
+		mutex_lock(&inode->i_mutex);
+
 	return ret;
 }
 
+#define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
+
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len)
 {
+	int	ret;
+
+	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
+	if (ret)
+		return ret;
+
 	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
 }
 
@@ -6268,8 +7625,8 @@
 	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
 }
 
-int btrfs_writepages(struct address_space *mapping,
-		     struct writeback_control *wbc)
+static int btrfs_writepages(struct address_space *mapping,
+			    struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
 
@@ -6312,13 +7669,13 @@
 
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
+	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *tree;
 	struct btrfs_ordered_extent *ordered;
 	struct extent_state *cached_state = NULL;
 	u64 page_start = page_offset(page);
 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
-
 	/*
 	 * we have the page locked, so new writeback can't start,
 	 * and the dirty bit won't be cleared while we are here.
@@ -6328,15 +7685,13 @@
 	 */
 	wait_on_page_writeback(page);
 
-	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	tree = &BTRFS_I(inode)->io_tree;
 	if (offset) {
 		btrfs_releasepage(page, GFP_NOFS);
 		return;
 	}
-	lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
-			 GFP_NOFS);
-	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
-					   page_offset(page));
+	lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+	ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
 	if (ordered) {
 		/*
 		 * IO on this page will never be started, so we need
@@ -6344,24 +7699,38 @@
 		 */
 		clear_extent_bit(tree, page_start, page_end,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
-				 &cached_state, GFP_NOFS);
+				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+				 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
 		/*
 		 * whoever cleared the private bit is responsible
 		 * for the finish_ordered_io
 		 */
 		if (TestClearPagePrivate2(page)) {
-			btrfs_finish_ordered_io(page->mapping->host,
-						page_start, page_end);
+			struct btrfs_ordered_inode_tree *tree;
+			u64 new_len;
+
+			tree = &BTRFS_I(inode)->ordered_tree;
+
+			spin_lock_irq(&tree->lock);
+			set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+			new_len = page_start - ordered->file_offset;
+			if (new_len < ordered->truncated_len)
+				ordered->truncated_len = new_len;
+			spin_unlock_irq(&tree->lock);
+
+			if (btrfs_dec_test_ordered_pending(inode, &ordered,
+							   page_start,
+							   PAGE_CACHE_SIZE, 1))
+				btrfs_finish_ordered_io(ordered);
 		}
 		btrfs_put_ordered_extent(ordered);
 		cached_state = NULL;
-		lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
-				 GFP_NOFS);
+		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
 	}
 	clear_extent_bit(tree, page_start, page_end,
 		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
-		 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
+		 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
+		 &cached_state, GFP_NOFS);
 	__btrfs_releasepage(page, GFP_NOFS);
 
 	ClearPageChecked(page);
@@ -6399,21 +7768,24 @@
 	unsigned long zero_start;
 	loff_t size;
 	int ret;
+	int reserved = 0;
 	u64 page_start;
 	u64 page_end;
 
-	/* Need this to keep space reservations serialized */
-	mutex_lock(&inode->i_mutex);
+	sb_start_pagefault(inode->i_sb);
 	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
-	mutex_unlock(&inode->i_mutex);
-	if (!ret)
-		ret = btrfs_update_time(vma->vm_file);
+	if (!ret) {
+		ret = file_update_time(vma->vm_file);
+		reserved = 1;
+	}
 	if (ret) {
 		if (ret == -ENOMEM)
 			ret = VM_FAULT_OOM;
 		else /* -ENOSPC, -EIO, etc */
 			ret = VM_FAULT_SIGBUS;
-		goto out;
+		if (reserved)
+			goto out;
+		goto out_noreserve;
 	}
 
 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
@@ -6430,8 +7802,7 @@
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
-			 GFP_NOFS);
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
 	set_page_extent_mapped(page);
 
 	/*
@@ -6456,7 +7827,8 @@
 	 * prepare_pages in the normal write path.
 	 */
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+			  EXTENT_DIRTY | EXTENT_DELALLOC |
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 			  0, 0, &cached_state, GFP_NOFS);
 
 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
@@ -6487,15 +7859,20 @@
 
 	BTRFS_I(inode)->last_trans = root->fs_info->generation;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
 
 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 
 out_unlock:
-	if (!ret)
+	if (!ret) {
+		sb_end_pagefault(inode->i_sb);
 		return VM_FAULT_LOCKED;
+	}
 	unlock_page(page);
-	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 out:
+	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+out_noreserve:
+	sb_end_pagefault(inode->i_sb);
 	return ret;
 }
 
@@ -6503,19 +7880,13 @@
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv;
-	int ret;
+	int ret = 0;
 	int err = 0;
 	struct btrfs_trans_handle *trans;
-	unsigned long nr;
 	u64 mask = root->sectorsize - 1;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
-	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
-	if (ret)
-		return ret;
-
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
-	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
 	/*
 	 * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
@@ -6553,18 +7924,17 @@
 	 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
 	 * updating the inode.
 	 */
-	rsv = btrfs_alloc_block_rsv(root);
+	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
 	if (!rsv)
 		return -ENOMEM;
 	rsv->size = min_size;
+	rsv->failfast = 1;
 
 	/*
 	 * 1 for the truncate slack space
-	 * 1 for the orphan item we're going to add
-	 * 1 for the orphan item deletion
 	 * 1 for updating the inode.
 	 */
-	trans = btrfs_start_transaction(root, 4);
+	trans = btrfs_start_transaction(root, 2);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		goto out;
@@ -6575,12 +7945,6 @@
 				      min_size);
 	BUG_ON(ret);
 
-	ret = btrfs_orphan_add(trans, inode);
-	if (ret) {
-		btrfs_end_transaction(trans, root);
-		goto out;
-	}
-
 	/*
 	 * setattr is responsible for setting the ordered_data_close flag,
 	 * but that is only tested during the last file release.  That
@@ -6598,39 +7962,25 @@
 	 * using truncate to replace the contents of the file will
 	 * end up with a zero length file after a crash.
 	 */
-	if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+	if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
+					   &BTRFS_I(inode)->runtime_flags))
 		btrfs_add_ordered_operation(trans, root, inode);
 
-	while (1) {
-		ret = btrfs_block_rsv_refill(root, rsv, min_size);
-		if (ret) {
-			/*
-			 * This can only happen with the original transaction we
-			 * started above, every other time we shouldn't have a
-			 * transaction started yet.
-			 */
-			if (ret == -EAGAIN)
-				goto end_trans;
-			err = ret;
-			break;
-		}
-
-		if (!trans) {
-			/* Just need the 1 for updating the inode */
-			trans = btrfs_start_transaction(root, 1);
-			if (IS_ERR(trans)) {
-				ret = err = PTR_ERR(trans);
-				trans = NULL;
-				break;
-			}
-		}
-
-		trans->block_rsv = rsv;
+	/*
+	 * So if we truncate and then write and fsync we normally would just
+	 * write the extents that changed, which is a problem if we need to
+	 * first truncate that entire inode.  So set this flag so we write out
+	 * all of the extents in the inode to the sync log so we're completely
+	 * safe.
+	 */
+	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
+	trans->block_rsv = rsv;
 
+	while (1) {
 		ret = btrfs_truncate_inode_items(trans, root, inode,
 						 inode->i_size,
 						 BTRFS_EXTENT_DATA_KEY);
-		if (ret != -EAGAIN) {
+		if (ret != -ENOSPC) {
 			err = ret;
 			break;
 		}
@@ -6641,11 +7991,21 @@
 			err = ret;
 			break;
 		}
-end_trans:
-		nr = trans->blocks_used;
+
 		btrfs_end_transaction(trans, root);
-		trans = NULL;
-		btrfs_btree_balance_dirty(root, nr);
+		btrfs_btree_balance_dirty(root);
+
+		trans = btrfs_start_transaction(root, 2);
+		if (IS_ERR(trans)) {
+			ret = err = PTR_ERR(trans);
+			trans = NULL;
+			break;
+		}
+
+		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
+					      rsv, min_size);
+		BUG_ON(ret);	/* shouldn't happen */
+		trans->block_rsv = rsv;
 	}
 
 	if (ret == 0 && inode->i_nlink > 0) {
@@ -6653,12 +8013,6 @@
 		ret = btrfs_orphan_del(trans, inode);
 		if (ret)
 			err = ret;
-	} else if (ret && inode->i_nlink > 0) {
-		/*
-		 * Failed to do the truncate, remove us from the in memory
-		 * orphan list.
-		 */
-		ret = btrfs_orphan_del(NULL, inode);
 	}
 
 	if (trans) {
@@ -6667,9 +8021,8 @@
 		if (ret && !err)
 			err = ret;
 
-		nr = trans->blocks_used;
-		ret = btrfs_end_transaction_throttle(trans, root);
-		btrfs_btree_balance_dirty(root, nr);
+		ret = btrfs_end_transaction(trans, root);
+		btrfs_btree_balance_dirty(root);
 	}
 
 out:
@@ -6691,10 +8044,17 @@
 	int err;
 	u64 index = 0;
 
-	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
-				new_dirid, S_IFDIR | 0700, &index);
+	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
+				new_dirid, new_dirid,
+				S_IFDIR | (~current_umask() & S_IRWXUGO),
+				&index);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
+#if defined(MY_ABC_HERE) || defined(MY_ABC_HERE)
+	err = syno_btrfs_init_attr(trans, inode);
+	if (err)
+		goto out;
+#endif
 	inode->i_op = &btrfs_dir_inode_operations;
 	inode->i_fop = &btrfs_dir_file_operations;
 
@@ -6702,10 +8062,12 @@
 	btrfs_i_size_write(inode, 0);
 
 	err = btrfs_update_inode(trans, new_root, inode);
-	BUG_ON(err);
 
+#if defined(MY_ABC_HERE) || defined(MY_ABC_HERE)
+out:
+#endif
 	iput(inode);
-	return 0;
+	return err;
 }
 
 struct inode *btrfs_alloc_inode(struct super_block *sb)
@@ -6718,9 +8080,7 @@
 		return NULL;
 
 	ei->root = NULL;
-	ei->space_info = NULL;
 	ei->generation = 0;
-	ei->sequence = 0;
 	ei->last_trans = 0;
 	ei->last_sub_trans = 0;
 	ei->logged_trans = 0;
@@ -6730,16 +8090,13 @@
 	ei->csum_bytes = 0;
 	ei->index_cnt = (u64)-1;
 	ei->last_unlink_trans = 0;
+	ei->last_log_commit = 0;
 
 	spin_lock_init(&ei->lock);
 	ei->outstanding_extents = 0;
 	ei->reserved_extents = 0;
 
-	ei->ordered_data_close = 0;
-	ei->orphan_meta_reserved = 0;
-	ei->dummy_inode = 0;
-	ei->in_defrag = 0;
-	ei->delalloc_meta_reserved = 0;
+	ei->runtime_flags = 0;
 	ei->force_compress = BTRFS_COMPRESS_NONE;
 
 	ei->delayed_node = NULL;
@@ -6748,9 +8105,12 @@
 	extent_map_tree_init(&ei->extent_tree);
 	extent_io_tree_init(&ei->io_tree, &inode->i_data);
 	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
+	ei->io_tree.track_uptodate = 1;
+	ei->io_failure_tree.track_uptodate = 1;
+	atomic_set(&ei->sync_writers, 0);
 	mutex_init(&ei->log_mutex);
+	mutex_init(&ei->delalloc_mutex);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
-	INIT_LIST_HEAD(&ei->i_orphan);
 	INIT_LIST_HEAD(&ei->delalloc_inodes);
 	INIT_LIST_HEAD(&ei->ordered_operations);
 	RB_CLEAR_NODE(&ei->rb_node);
@@ -6790,28 +8150,25 @@
 	 */
 	smp_mb();
 	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
-		spin_lock(&root->fs_info->ordered_extent_lock);
+		spin_lock(&root->fs_info->ordered_root_lock);
 		list_del_init(&BTRFS_I(inode)->ordered_operations);
-		spin_unlock(&root->fs_info->ordered_extent_lock);
+		spin_unlock(&root->fs_info->ordered_root_lock);
 	}
 
-	spin_lock(&root->orphan_lock);
-	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-		printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
-		       (unsigned long long)btrfs_ino(inode));
-		list_del_init(&BTRFS_I(inode)->i_orphan);
+	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+		     &BTRFS_I(inode)->runtime_flags)) {
+		btrfs_info(root->fs_info, "inode %llu still on the orphan list",
+			btrfs_ino(inode));
+		atomic_dec(&root->orphan_inodes);
 	}
-	spin_unlock(&root->orphan_lock);
 
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
 		if (!ordered)
 			break;
 		else {
-			printk(KERN_ERR "btrfs found ordered "
-			       "extent %llu %llu on inode cleanup\n",
-			       (unsigned long long)ordered->file_offset,
-			       (unsigned long long)ordered->len);
+			btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
+				ordered->file_offset, ordered->len);
 			btrfs_remove_ordered_extent(inode, ordered);
 			btrfs_put_ordered_extent(ordered);
 			btrfs_put_ordered_extent(ordered);
@@ -6820,7 +8177,6 @@
 	inode_tree_del(inode);
 	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 free:
-	btrfs_remove_delayed_node(inode);
 	call_rcu(&inode->i_rcu, btrfs_i_callback);
 }
 
@@ -6828,8 +8184,12 @@
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 
+	if (root == NULL)
+		return 1;
+
+	/* the snap/subvol tree is being deleted */
 	if (btrfs_root_refs(&root->root_item) == 0 &&
-	    !btrfs_is_free_space_inode(root, inode))
+	    root != root->fs_info->tree_root)
 		return 1;
 	else
 		return generic_drop_inode(inode);
@@ -6844,6 +8204,11 @@
 
 void btrfs_destroy_cachep(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	if (btrfs_inode_cachep)
 		kmem_cache_destroy(btrfs_inode_cachep);
 	if (btrfs_trans_handle_cachep)
@@ -6854,40 +8219,49 @@
 		kmem_cache_destroy(btrfs_path_cachep);
 	if (btrfs_free_space_cachep)
 		kmem_cache_destroy(btrfs_free_space_cachep);
+	if (btrfs_delalloc_work_cachep)
+		kmem_cache_destroy(btrfs_delalloc_work_cachep);
 }
 
 int btrfs_init_cachep(void)
 {
-	btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
+	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
 			sizeof(struct btrfs_inode), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
 	if (!btrfs_inode_cachep)
 		goto fail;
 
-	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
+	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
 			sizeof(struct btrfs_trans_handle), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_trans_handle_cachep)
 		goto fail;
 
-	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
+	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
 			sizeof(struct btrfs_transaction), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_transaction_cachep)
 		goto fail;
 
-	btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
+	btrfs_path_cachep = kmem_cache_create("btrfs_path",
 			sizeof(struct btrfs_path), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_path_cachep)
 		goto fail;
 
-	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
+	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
 			sizeof(struct btrfs_free_space), 0,
 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
 	if (!btrfs_free_space_cachep)
 		goto fail;
 
+	btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
+			sizeof(struct btrfs_delalloc_work), 0,
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+			NULL);
+	if (!btrfs_delalloc_work_cachep)
+		goto fail;
+
 	return 0;
 fail:
 	btrfs_destroy_cachep();
@@ -6897,36 +8271,59 @@
 static int btrfs_getattr(struct vfsmount *mnt,
 			 struct dentry *dentry, struct kstat *stat)
 {
+	u64 delalloc_bytes;
 	struct inode *inode = dentry->d_inode;
 	u32 blocksize = inode->i_sb->s_blocksize;
 
 	generic_fillattr(inode, stat);
 	stat->dev = BTRFS_I(inode)->root->anon_dev;
 	stat->blksize = PAGE_CACHE_SIZE;
+
+	spin_lock(&BTRFS_I(inode)->lock);
+	delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
+	spin_unlock(&BTRFS_I(inode)->lock);
 	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
-		ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9;
+			ALIGN(delalloc_bytes, blocksize)) >> 9;
 	return 0;
 }
 
-/*
- * If a file is moved, it will inherit the cow and compression flags of the new
- * directory.
- */
-static void fixup_inode_flags(struct inode *dir, struct inode *inode)
+#ifdef MY_ABC_HERE
+int syno_btrfs_getattr(struct dentry *d, struct kstat *stat, int flags)
 {
-	struct btrfs_inode *b_dir = BTRFS_I(dir);
-	struct btrfs_inode *b_inode = BTRFS_I(inode);
-
-	if (b_dir->flags & BTRFS_INODE_NODATACOW)
-		b_inode->flags |= BTRFS_INODE_NODATACOW;
-	else
-		b_inode->flags &= ~BTRFS_INODE_NODATACOW;
+	int err = 0;
+	struct inode *inode = d->d_inode;
 
-	if (b_dir->flags & BTRFS_INODE_COMPRESS)
-		b_inode->flags |= BTRFS_INODE_COMPRESS;
-	else
-		b_inode->flags &= ~BTRFS_INODE_COMPRESS;
+#ifdef MY_ABC_HERE
+	if (flags & SYNOST_CREATIME) {
+		struct btrfs_timespec crtime;
+
+		err = __btrfs_getxattr(inode, XATTR_SYNO_PREFIX XATTR_SYNO_CREATE_TIME, &crtime, sizeof(crtime));
+		if (0 < err) {
+			inode->i_CreateTime.tv_sec = le64_to_cpu(crtime.sec);
+			inode->i_CreateTime.tv_nsec = le32_to_cpu(crtime.nsec);
+		} else if (-ENODATA == err) {
+			inode->i_CreateTime.tv_sec = 0;
+			inode->i_CreateTime.tv_nsec = 0;
+		} else {
+			return err;
+		}
+		stat->SynoCreateTime = inode->i_CreateTime;
+		err = 0;
+	}
+#endif
+#ifdef MY_ABC_HERE
+	if (flags & SYNOST_ARBIT) {
+		stat->SynoMode = inode->i_mode2;
+	}
+#endif
+#ifdef MY_ABC_HERE
+	if (flags & SYNOST_BKPVER) {
+		err = syno_btrfs_get_archive_ver(d, &stat->syno_archive_version);
+	}
+#endif
+	return err;
 }
+#endif
 
 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 			   struct inode *new_dir, struct dentry *new_dentry)
@@ -6956,6 +8353,28 @@
 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
 		return -ENOTEMPTY;
+
+
+	/* check for collisions, even if the name isn't there */
+	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
+			     new_dentry->d_name.name,
+			     new_dentry->d_name.len);
+
+	if (ret) {
+		if (ret == -EEXIST) {
+			/* we shouldn't get
+			 * eexist without a new_inode */
+			if (!new_inode) {
+				WARN_ON(1);
+				return ret;
+			}
+		} else {
+			/* maybe -EOVERFLOW */
+			return ret;
+		}
+	}
+	ret = 0;
+
 	/*
 	 * we're using rename to replace one file with another.
 	 * and the replacement file is large.  Start IO on it now so
@@ -6976,7 +8395,7 @@
 	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
 	 * should cover the worst case number of items we'll modify.
 	 */
-	trans = btrfs_start_transaction(root, 20);
+	trans = btrfs_start_transaction(root, 11);
 	if (IS_ERR(trans)) {
                 ret = PTR_ERR(trans);
                 goto out_notrans;
@@ -7016,6 +8435,9 @@
 	if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
 		btrfs_add_ordered_operation(trans, root, old_inode);
 
+	inode_inc_iversion(old_dir);
+	inode_inc_iversion(new_dir);
+	inode_inc_iversion(old_inode);
 	old_dir->i_ctime = old_dir->i_mtime = ctime;
 	new_dir->i_ctime = new_dir->i_mtime = ctime;
 	old_inode->i_ctime = ctime;
@@ -7036,9 +8458,13 @@
 		if (!ret)
 			ret = btrfs_update_inode(trans, root, old_inode);
 	}
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_fail;
+	}
 
 	if (new_inode) {
+		inode_inc_iversion(new_inode);
 		new_inode->i_ctime = CURRENT_TIME;
 		if (unlikely(btrfs_ino(new_inode) ==
 			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
@@ -7054,19 +8480,21 @@
 						 new_dentry->d_name.name,
 						 new_dentry->d_name.len);
 		}
-		BUG_ON(ret);
-		if (new_inode->i_nlink == 0) {
+		if (!ret && new_inode->i_nlink == 0)
 			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
-			BUG_ON(ret);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out_fail;
 		}
 	}
 
-	fixup_inode_flags(new_dir, old_inode);
-
 	ret = btrfs_add_link(trans, new_dir, old_inode,
 			     new_dentry->d_name.name,
 			     new_dentry->d_name.len, 0, index);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_fail;
+	}
 
 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
 		struct dentry *parent = new_dentry->d_parent;
@@ -7074,7 +8502,7 @@
 		btrfs_end_log_trans(root);
 	}
 out_fail:
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 out_notrans:
 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
 		up_read(&root->fs_info->subvol_sem);
@@ -7082,40 +8510,127 @@
 	return ret;
 }
 
+static void btrfs_run_delalloc_work(struct btrfs_work *work)
+{
+	struct btrfs_delalloc_work *delalloc_work;
+
+	delalloc_work = container_of(work, struct btrfs_delalloc_work,
+				     work);
+	if (delalloc_work->wait)
+		btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
+	else
+		filemap_flush(delalloc_work->inode->i_mapping);
+
+	if (delalloc_work->delay_iput)
+		btrfs_add_delayed_iput(delalloc_work->inode);
+	else
+		iput(delalloc_work->inode);
+	complete(&delalloc_work->completion);
+}
+
+struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
+						    int wait, int delay_iput)
+{
+	struct btrfs_delalloc_work *work;
+
+	work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+	if (!work)
+		return NULL;
+
+	init_completion(&work->completion);
+	INIT_LIST_HEAD(&work->list);
+	work->inode = inode;
+	work->wait = wait;
+	work->delay_iput = delay_iput;
+	work->work.func = btrfs_run_delalloc_work;
+
+	return work;
+}
+
+void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
+{
+	wait_for_completion(&work->completion);
+	kmem_cache_free(btrfs_delalloc_work_cachep, work);
+}
+
 /*
  * some fairly slow code that needs optimization. This walks the list
  * of all the inodes with pending delalloc and forces them to disk.
  */
-int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
-	struct list_head *head = &root->fs_info->delalloc_inodes;
 	struct btrfs_inode *binode;
 	struct inode *inode;
+	struct btrfs_delalloc_work *work, *next;
+	struct list_head works;
+	struct list_head splice;
+	int ret = 0;
 
-	if (root->fs_info->sb->s_flags & MS_RDONLY)
-		return -EROFS;
+	INIT_LIST_HEAD(&works);
+	INIT_LIST_HEAD(&splice);
 
-	spin_lock(&root->fs_info->delalloc_lock);
-	while (!list_empty(head)) {
-		binode = list_entry(head->next, struct btrfs_inode,
+	spin_lock(&root->delalloc_lock);
+	list_splice_init(&root->delalloc_inodes, &splice);
+	while (!list_empty(&splice)) {
+		binode = list_entry(splice.next, struct btrfs_inode,
 				    delalloc_inodes);
+
+		list_move_tail(&binode->delalloc_inodes,
+			       &root->delalloc_inodes);
 		inode = igrab(&binode->vfs_inode);
-		if (!inode)
-			list_del_init(&binode->delalloc_inodes);
-		spin_unlock(&root->fs_info->delalloc_lock);
-		if (inode) {
-			filemap_flush(inode->i_mapping);
+		if (!inode) {
+			cond_resched_lock(&root->delalloc_lock);
+			continue;
+		}
+		spin_unlock(&root->delalloc_lock);
+
+		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+		if (unlikely(!work)) {
 			if (delay_iput)
 				btrfs_add_delayed_iput(inode);
 			else
 				iput(inode);
+			ret = -ENOMEM;
+			goto out;
 		}
+		list_add_tail(&work->list, &works);
+		btrfs_queue_worker(&root->fs_info->flush_workers,
+				   &work->work);
+
 		cond_resched();
-		spin_lock(&root->fs_info->delalloc_lock);
+		spin_lock(&root->delalloc_lock);
 	}
-	spin_unlock(&root->fs_info->delalloc_lock);
+	spin_unlock(&root->delalloc_lock);
 
-	/* the filemap_flush will queue IO into the worker threads, but
+	list_for_each_entry_safe(work, next, &works, list) {
+		list_del_init(&work->list);
+		btrfs_wait_and_free_delalloc_work(work);
+	}
+	return 0;
+out:
+	list_for_each_entry_safe(work, next, &works, list) {
+		list_del_init(&work->list);
+		btrfs_wait_and_free_delalloc_work(work);
+	}
+
+	if (!list_empty_careful(&splice)) {
+		spin_lock(&root->delalloc_lock);
+		list_splice_tail(&splice, &root->delalloc_inodes);
+		spin_unlock(&root->delalloc_lock);
+	}
+	return ret;
+}
+
+int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
+{
+	int ret;
+
+	if (root->fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	ret = __start_delalloc_inodes(root, delay_iput);
+	/*
+	 * the filemap_flush will queue IO into the worker threads, but
 	 * we have to make sure the IO is actually started and that
 	 * ordered extents get created before we return
 	 */
@@ -7127,7 +8642,57 @@
 		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 	}
 	atomic_dec(&root->fs_info->async_submit_draining);
+	return ret;
+}
+
+int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
+				    int delay_iput)
+{
+	struct btrfs_root *root;
+	struct list_head splice;
+	int ret;
+
+	if (fs_info->sb->s_flags & MS_RDONLY)
+		return -EROFS;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&fs_info->delalloc_root_lock);
+	list_splice_init(&fs_info->delalloc_roots, &splice);
+	while (!list_empty(&splice)) {
+		root = list_first_entry(&splice, struct btrfs_root,
+					delalloc_root);
+		root = btrfs_grab_fs_root(root);
+		BUG_ON(!root);
+		list_move_tail(&root->delalloc_root,
+			       &fs_info->delalloc_roots);
+		spin_unlock(&fs_info->delalloc_root_lock);
+
+		ret = __start_delalloc_inodes(root, delay_iput);
+		btrfs_put_fs_root(root);
+		if (ret)
+			goto out;
+
+		spin_lock(&fs_info->delalloc_root_lock);
+	}
+	spin_unlock(&fs_info->delalloc_root_lock);
+
+	atomic_inc(&fs_info->async_submit_draining);
+	while (atomic_read(&fs_info->nr_async_submits) ||
+	      atomic_read(&fs_info->async_delalloc_pages)) {
+		wait_event(fs_info->async_submit_wait,
+		   (atomic_read(&fs_info->nr_async_submits) == 0 &&
+		    atomic_read(&fs_info->async_delalloc_pages) == 0));
+	}
+	atomic_dec(&fs_info->async_submit_draining);
 	return 0;
+out:
+	if (!list_empty_careful(&splice)) {
+		spin_lock(&fs_info->delalloc_root_lock);
+		list_splice_tail(&splice, &fs_info->delalloc_roots);
+		spin_unlock(&fs_info->delalloc_root_lock);
+	}
+	return ret;
 }
 
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
@@ -7147,18 +8712,26 @@
 	unsigned long ptr;
 	struct btrfs_file_extent_item *ei;
 	struct extent_buffer *leaf;
-	unsigned long nr = 0;
 
 	name_len = strlen(symname) + 1;
 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
 		return -ENAMETOOLONG;
 
+#ifdef MY_ABC_HERE
+	/*
+	 * 2 items for inode item and ref
+	 * 3 items for dir items
+	 * 1 item for xattr if selinux is on
+	 */
+	trans = btrfs_start_transaction(root, 6);
+#else
 	/*
 	 * 2 items for inode item and ref
 	 * 2 items for dir items
 	 * 1 item for xattr if selinux is on
 	 */
 	trans = btrfs_start_transaction(root, 5);
+#endif
 	if (IS_ERR(trans))
 		return PTR_ERR(trans);
 
@@ -7180,6 +8753,14 @@
 		goto out_unlock;
 	}
 
+#if defined(MY_ABC_HERE) || defined(MY_ABC_HERE)
+	err = syno_btrfs_init_attr(trans, inode);
+	if (err) {
+		drop_inode = 1;
+		goto out_unlock;
+	}
+#endif
+
 	/*
 	* If the active LSM wants to access the inode during
 	* d_instantiate it needs these. Smack checks to see
@@ -7245,13 +8826,12 @@
 out_unlock:
 	if (!err)
 		d_instantiate(dentry, inode);
-	nr = trans->blocks_used;
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
 		iput(inode);
 	}
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	return err;
 }
 
@@ -7260,10 +8840,13 @@
 				       loff_t actual_len, u64 *alloc_hint,
 				       struct btrfs_trans_handle *trans)
 {
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key ins;
 	u64 cur_offset = start;
 	u64 i_size;
+	u64 cur_bytes;
 	int ret = 0;
 	bool own_trans = true;
 
@@ -7278,8 +8861,10 @@
 			}
 		}
 
-		ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
-					   0, *alloc_hint, (u64)-1, &ins, 1);
+		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
+		cur_bytes = max(cur_bytes, min_size);
+		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
+					   *alloc_hint, &ins, 1);
 		if (ret) {
 			if (own_trans)
 				btrfs_end_transaction(trans, root);
@@ -7291,14 +8876,50 @@
 						  ins.offset, ins.offset,
 						  ins.offset, 0, 0, 0,
 						  BTRFS_FILE_EXTENT_PREALLOC);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			if (own_trans)
+				btrfs_end_transaction(trans, root);
+			break;
+		}
 		btrfs_drop_extent_cache(inode, cur_offset,
 					cur_offset + ins.offset -1, 0);
 
+		em = alloc_extent_map();
+		if (!em) {
+			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				&BTRFS_I(inode)->runtime_flags);
+			goto next;
+		}
+
+		em->start = cur_offset;
+		em->orig_start = cur_offset;
+		em->len = ins.offset;
+		em->block_start = ins.objectid;
+		em->block_len = ins.offset;
+		em->orig_block_len = ins.offset;
+		em->ram_bytes = ins.offset;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+		em->generation = trans->transid;
+
+		while (1) {
+			write_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em, 1);
+			write_unlock(&em_tree->lock);
+			if (ret != -EEXIST)
+				break;
+			btrfs_drop_extent_cache(inode, cur_offset,
+						cur_offset + ins.offset - 1,
+						0);
+		}
+		free_extent_map(em);
+next:
 		num_bytes -= ins.offset;
 		cur_offset += ins.offset;
 		*alloc_hint = ins.objectid + ins.offset;
 
+		inode_inc_iversion(inode);
 		inode->i_ctime = CURRENT_TIME;
 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
@@ -7313,7 +8934,13 @@
 		}
 
 		ret = btrfs_update_inode(trans, root, inode);
-		BUG_ON(ret);
+
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			if (own_trans)
+				btrfs_end_transaction(trans, root);
+			break;
+		}
 
 		if (own_trans)
 			btrfs_end_transaction(trans, root);
@@ -7360,6 +8987,19 @@
 }
 
 static const struct inode_operations btrfs_dir_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_btrfs_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_crtime = syno_btrfs_set_crtime,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_archive_bit = syno_btrfs_set_archive_bit,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_archive_ver = syno_btrfs_set_archive_ver,
+	.syno_get_archive_ver = syno_btrfs_get_archive_ver,
+#endif
 	.getattr	= btrfs_getattr,
 	.lookup		= btrfs_lookup,
 	.create		= btrfs_create,
@@ -7376,12 +9016,31 @@
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+	.syno_acl_get	= btrfs_get_syno_acl,
+	.syno_acl_set	= btrfs_set_syno_acl,
+#endif
 	.get_acl	= btrfs_get_acl,
+	.update_time	= btrfs_update_time,
 };
 static const struct inode_operations btrfs_dir_ro_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_btrfs_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_crtime = syno_btrfs_set_crtime,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_archive_bit = syno_btrfs_set_archive_bit,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_archive_ver = syno_btrfs_set_archive_ver,
+	.syno_get_archive_ver = syno_btrfs_get_archive_ver,
+#endif
 	.lookup		= btrfs_lookup,
 	.permission	= btrfs_permission,
 	.get_acl	= btrfs_get_acl,
+	.update_time	= btrfs_update_time,
 };
 
 static const struct file_operations btrfs_dir_file_operations = {
@@ -7441,6 +9100,19 @@
 };
 
 static const struct inode_operations btrfs_file_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_btrfs_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_crtime = syno_btrfs_set_crtime,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_archive_bit = syno_btrfs_set_archive_bit,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_archive_ver = syno_btrfs_set_archive_ver,
+	.syno_get_archive_ver = syno_btrfs_get_archive_ver,
+#endif
 	.getattr	= btrfs_getattr,
 	.setattr	= btrfs_setattr,
 	.setxattr	= btrfs_setxattr,
@@ -7449,9 +9121,27 @@
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
 	.fiemap		= btrfs_fiemap,
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+	.syno_acl_get	= btrfs_get_syno_acl,
+	.syno_acl_set	= btrfs_set_syno_acl,
+#endif
 	.get_acl	= btrfs_get_acl,
+	.update_time	= btrfs_update_time,
 };
 static const struct inode_operations btrfs_special_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_btrfs_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_crtime = syno_btrfs_set_crtime,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_archive_bit = syno_btrfs_set_archive_bit,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_archive_ver = syno_btrfs_set_archive_ver,
+	.syno_get_archive_ver = syno_btrfs_get_archive_ver,
+#endif
 	.getattr	= btrfs_getattr,
 	.setattr	= btrfs_setattr,
 	.permission	= btrfs_permission,
@@ -7459,9 +9149,27 @@
 	.getxattr	= btrfs_getxattr,
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+	.syno_acl_get	= btrfs_get_syno_acl,
+	.syno_acl_set	= btrfs_set_syno_acl,
+#endif
 	.get_acl	= btrfs_get_acl,
+	.update_time	= btrfs_update_time,
 };
 static const struct inode_operations btrfs_symlink_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_btrfs_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_crtime = syno_btrfs_set_crtime,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_archive_bit = syno_btrfs_set_archive_bit,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_archive_ver = syno_btrfs_set_archive_ver,
+	.syno_get_archive_ver = syno_btrfs_get_archive_ver,
+#endif
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
@@ -7473,9 +9181,42 @@
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
 	.get_acl	= btrfs_get_acl,
+	.update_time	= btrfs_update_time,
 };
 
+#ifdef MY_ABC_HERE
+extern unsigned char SYNOBtrfsGlobalBuf[UNICODE_UTF8_BUFSIZE];
+extern spinlock_t SYNOBtrfsGlobalLock;  /* init at btrfs_fill_super() */
+
+/* Hash a string to an integer in a caseless way */
+static int btrfs_dentry_hash(const struct dentry *dentry, const struct inode *inode, struct qstr *this)
+{
+	unsigned int upperlen;
+
+	spin_lock(&SYNOBtrfsGlobalLock);
+	upperlen = SYNOUnicodeUTF8toUpper(SYNOBtrfsGlobalBuf, this->name, UNICODE_UTF8_BUFSIZE - 1, this->len, NULL);
+	this->hash = btrfs_name_hash(SYNOBtrfsGlobalBuf, upperlen);
+	spin_unlock(&SYNOBtrfsGlobalLock);
+	return 0;
+}
+
+/* return 1 on failure and 0 on success */
+static int btrfs_dentry_compare(const struct dentry *parent, const struct inode *pinode,
+							   const struct dentry *dentry, const struct inode *inode,
+							   unsigned int len, const char *str, const struct qstr *name, int caseless)
+{
+	if (caseless) {
+		return SYNOUnicodeUTF8Strcmp(str, name->name, len, name->len, NULL);
+	} else {
+		return dentry_cmp(str, len, name->name, name->len);
+	}
+}
+#endif
 const struct dentry_operations btrfs_dentry_operations = {
 	.d_delete	= btrfs_dentry_delete,
 	.d_release	= btrfs_dentry_release,
+#ifdef MY_ABC_HERE
+	.d_hash		= btrfs_dentry_hash,
+	.d_compare_case	= btrfs_dentry_compare,
+#endif
 };
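
A minimal sketch, not part of the patch, of how the delalloc flush helpers added above in this file's diff fit together: btrfs_alloc_delalloc_work() packages an inode, btrfs_queue_worker() hands it to fs_info->flush_workers, and btrfs_wait_and_free_delalloc_work() blocks on the completion and frees it. The wrapper name and error handling below are illustrative only; the caller is assumed to hold an inode reference that the work item may drop.

static int flush_one_inode_sketch(struct btrfs_root *root, struct inode *inode)
{
	struct btrfs_delalloc_work *work;

	/* wait == 0: only filemap_flush(); delay_iput == 0: plain iput() */
	work = btrfs_alloc_delalloc_work(inode, 0, 0);
	if (!work) {
		iput(inode);	/* drop the reference we were handed */
		return -ENOMEM;
	}

	/* btrfs_run_delalloc_work() executes on the shared flush worker pool */
	btrfs_queue_worker(&root->fs_info->flush_workers, &work->work);

	/* blocks on work->completion, then frees the work item */
	btrfs_wait_and_free_delalloc_work(work);
	return 0;
}
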
diff -ur a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
--- a/fs/btrfs/inode-item.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/inode-item.c	2014-02-17 11:56:58.000000000 +0100
@@ -18,7 +18,9 @@
 
 #include "ctree.h"
 #include "disk-io.h"
+#include "hash.h"
 #include "transaction.h"
+#include "print-tree.h"
 
 static int find_name_in_backref(struct btrfs_path *path, const char *name,
 			 int name_len, struct btrfs_inode_ref **ref_ret)
@@ -49,18 +51,57 @@
 	return 0;
 }
 
-struct btrfs_inode_ref *
+int btrfs_find_name_in_ext_backref(struct btrfs_path *path, u64 ref_objectid,
+				   const char *name, int name_len,
+				   struct btrfs_inode_extref **extref_ret)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_inode_extref *extref;
+	unsigned long ptr;
+	unsigned long name_ptr;
+	u32 item_size;
+	u32 cur_offset = 0;
+	int ref_name_len;
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+	/*
+	 * Search all extended backrefs in this item. We're only
+	 * looking through any collisions so most of the time this is
+	 * just going to compare against one buffer. If all is well,
+	 * we'll return success and the inode ref object.
+	 */
+	while (cur_offset < item_size) {
+		extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+		name_ptr = (unsigned long)(&extref->name);
+		ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+		if (ref_name_len == name_len &&
+		    btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
+		    (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)) {
+			if (extref_ret)
+				*extref_ret = extref;
+			return 1;
+		}
+
+		cur_offset += ref_name_len + sizeof(*extref);
+	}
+	return 0;
+}
+
+static struct btrfs_inode_ref *
 btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
-			struct btrfs_path *path,
-			const char *name, int name_len,
-			u64 inode_objectid, u64 ref_objectid, int mod)
+		       struct btrfs_root *root,
+		       struct btrfs_path *path,
+		       const char *name, int name_len,
+		       u64 inode_objectid, u64 ref_objectid, int ins_len,
+		       int cow)
 {
+	int ret;
 	struct btrfs_key key;
 	struct btrfs_inode_ref *ref;
-	int ins_len = mod < 0 ? -1 : 0;
-	int cow = mod != 0;
-	int ret;
 
 	key.objectid = inode_objectid;
 	key.type = BTRFS_INODE_REF_KEY;
@@ -76,10 +117,148 @@
 	return ref;
 }
 
+/* Returns NULL if no extref found */
+struct btrfs_inode_extref *
+btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_path *path,
+			  const char *name, int name_len,
+			  u64 inode_objectid, u64 ref_objectid, int ins_len,
+			  int cow)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_inode_extref *extref;
+
+	key.objectid = inode_objectid;
+	key.type = BTRFS_INODE_EXTREF_KEY;
+	key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return NULL;
+	if (!btrfs_find_name_in_ext_backref(path, ref_objectid, name, name_len, &extref))
+		return NULL;
+	return extref;
+}
+
+int btrfs_get_inode_ref_index(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root,
+			      struct btrfs_path *path,
+			      const char *name, int name_len,
+			      u64 inode_objectid, u64 ref_objectid, int mod,
+			      u64 *ret_index)
+{
+	struct btrfs_inode_ref *ref;
+	struct btrfs_inode_extref *extref;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+
+	ref = btrfs_lookup_inode_ref(trans, root, path, name, name_len,
+				     inode_objectid, ref_objectid, ins_len,
+				     cow);
+	if (IS_ERR(ref))
+		return PTR_ERR(ref);
+
+	if (ref != NULL) {
+		*ret_index = btrfs_inode_ref_index(path->nodes[0], ref);
+		return 0;
+	}
+
+	btrfs_release_path(path);
+
+	extref = btrfs_lookup_inode_extref(trans, root, path, name,
+					   name_len, inode_objectid,
+					   ref_objectid, ins_len, cow);
+	if (IS_ERR(extref))
+		return PTR_ERR(extref);
+
+	if (extref) {
+		*ret_index = btrfs_inode_extref_index(path->nodes[0], extref);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  const char *name, int name_len,
+				  u64 inode_objectid, u64 ref_objectid,
+				  u64 *index)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	int ret;
+	int del_len = name_len + sizeof(*extref);
+	unsigned long ptr;
+	unsigned long item_start;
+	u32 item_size;
+
+	key.objectid = inode_objectid;
+	btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY);
+	key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+	if (ret < 0)
+		goto out;
+
+	/*
+	 * Sanity check - did we find the right item for this name?
+	 * This should always succeed so error here will make the FS
+	 * readonly.
+	 */
+	if (!btrfs_find_name_in_ext_backref(path, ref_objectid,
+					    name, name_len, &extref)) {
+		btrfs_std_error(root->fs_info, -ENOENT);
+		ret = -EROFS;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	if (index)
+		*index = btrfs_inode_extref_index(leaf, extref);
+
+	if (del_len == item_size) {
+		/*
+		 * Common case only one ref in the item, remove the
+		 * whole item.
+		 */
+		ret = btrfs_del_item(trans, root, path);
+		goto out;
+	}
+
+	ptr = (unsigned long)extref;
+	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+	memmove_extent_buffer(leaf, ptr, ptr + del_len,
+			      item_size - (ptr + del_len - item_start));
+
+	btrfs_truncate_item(root, path, item_size - del_len, 1);
+
+out:
+	btrfs_free_path(path);
+
+	return ret;
+}
+
 int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root,
-			   const char *name, int name_len,
-			   u64 inode_objectid, u64 ref_objectid, u64 *index)
+			struct btrfs_root *root,
+			const char *name, int name_len,
+			u64 inode_objectid, u64 ref_objectid, u64 *index)
 {
 	struct btrfs_path *path;
 	struct btrfs_key key;
@@ -90,6 +269,7 @@
 	u32 item_size;
 	u32 sub_item_len;
 	int ret;
+	int search_ext_refs = 0;
 	int del_len = name_len + sizeof(*ref);
 
 	key.objectid = inode_objectid;
@@ -105,12 +285,14 @@
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret > 0) {
 		ret = -ENOENT;
+		search_ext_refs = 1;
 		goto out;
 	} else if (ret < 0) {
 		goto out;
 	}
 	if (!find_name_in_backref(path, name, name_len, &ref)) {
 		ret = -ENOENT;
+		search_ext_refs = 1;
 		goto out;
 	}
 	leaf = path->nodes[0];
@@ -128,13 +310,84 @@
 	item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
 	memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
 			      item_size - (ptr + sub_item_len - item_start));
-	ret = btrfs_truncate_item(trans, root, path,
-				  item_size - sub_item_len, 1);
+	btrfs_truncate_item(root, path, item_size - sub_item_len, 1);
+out:
+	btrfs_free_path(path);
+
+	if (search_ext_refs) {
+		/*
+		 * No refs were found, or we could not find the
+		 * name in our ref array. Find and remove the extended
+		 * inode ref then.
+		 */
+		return btrfs_del_inode_extref(trans, root, name, name_len,
+					      inode_objectid, ref_objectid, index);
+	}
+
+	return ret;
+}
+
+/*
+ * btrfs_insert_inode_extref() - Inserts an extended inode ref into a tree.
+ *
+ * The caller must have checked against BTRFS_LINK_MAX already.
+ */
+static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     const char *name, int name_len,
+				     u64 inode_objectid, u64 ref_objectid, u64 index)
+{
+	struct btrfs_inode_extref *extref;
+	int ret;
+	int ins_len = name_len + sizeof(*extref);
+	unsigned long ptr;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_item *item;
+
+	key.objectid = inode_objectid;
+	key.type = BTRFS_INODE_EXTREF_KEY;
+	key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      ins_len);
+	if (ret == -EEXIST) {
+		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+						   name, name_len, NULL))
+			goto out;
+
+		btrfs_extend_item(root, path, ins_len);
+		ret = 0;
+	}
+	if (ret < 0)
+		goto out;
+
+	leaf = path->nodes[0];
+	item = btrfs_item_nr(leaf, path->slots[0]);
+	ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char);
+	ptr += btrfs_item_size(leaf, item) - ins_len;
+	extref = (struct btrfs_inode_extref *)ptr;
+
+	btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
+	btrfs_set_inode_extref_index(path->nodes[0], extref, index);
+	btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
+
+	ptr = (unsigned long)&extref->name;
+	write_extent_buffer(path->nodes[0], name, ptr, name_len);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+
 out:
 	btrfs_free_path(path);
 	return ret;
 }
 
+/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
 int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
@@ -165,7 +418,7 @@
 			goto out;
 
 		old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
-		ret = btrfs_extend_item(trans, root, path, ins_len);
+		btrfs_extend_item(root, path, ins_len);
 		ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				     struct btrfs_inode_ref);
 		ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
@@ -189,6 +442,19 @@
 
 out:
 	btrfs_free_path(path);
+
+	if (ret == -EMLINK) {
+		struct btrfs_super_block *disk_super = root->fs_info->super_copy;
+		/* We ran out of space in the ref array. Need to
+		 * add an extended ref. */
+		if (btrfs_super_incompat_flags(disk_super)
+		    & BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
+			ret = btrfs_insert_inode_extref(trans, root, name,
+							name_len,
+							inode_objectid,
+							ref_objectid, index);
+	}
+
 	return ret;
 }
 
diff -ur a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
--- a/fs/btrfs/inode-map.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/inode-map.c	2014-02-17 11:56:58.000000000 +0100
@@ -178,7 +178,7 @@
 
 	tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
 			  root->root_key.objectid);
-	BUG_ON(IS_ERR(tsk));
+	BUG_ON(IS_ERR(tsk)); /* -ENOMEM */
 }
 
 int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
@@ -271,7 +271,7 @@
 			break;
 
 		info = rb_entry(n, struct btrfs_free_space, offset_index);
-		BUG_ON(info->bitmap);
+		BUG_ON(info->bitmap); /* Logic error */
 
 		if (info->offset > root->cache_progress)
 			goto free;
@@ -429,24 +429,28 @@
 	num_bytes = trans->bytes_reserved;
 	/*
 	 * 1 item for inode item insertion if need
-	 * 3 items for inode item update (in the worst case)
+	 * 4 items for inode item update (in the worst case)
+	 * 1 item for slack space if we need to do truncation
 	 * 1 item for free space object
 	 * 3 items for pre-allocation
 	 */
-	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 8);
-	ret = btrfs_block_rsv_add_noflush(root, trans->block_rsv,
-					  trans->bytes_reserved);
+	trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10);
+	ret = btrfs_block_rsv_add(root, trans->block_rsv,
+				  trans->bytes_reserved,
+				  BTRFS_RESERVE_NO_FLUSH);
 	if (ret)
 		goto out;
+	trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+				      trans->transid, trans->bytes_reserved, 1);
 again:
 	inode = lookup_free_ino_inode(root, path);
-	if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
+	if (IS_ERR(inode) && (PTR_ERR(inode) != -ENOENT || retry)) {
 		ret = PTR_ERR(inode);
 		goto out_release;
 	}
 
 	if (IS_ERR(inode)) {
-		BUG_ON(retry);
+		BUG_ON(retry); /* Logic error */
 		retry = true;
 
 		ret = create_free_ino_inode(root, trans, path);
@@ -457,12 +461,18 @@
 
 	BTRFS_I(inode)->generation = 0;
 	ret = btrfs_update_inode(trans, root, inode);
-	WARN_ON(ret);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_put;
+	}
 
 	if (i_size_read(inode) > 0) {
 		ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
-		if (ret)
+		if (ret) {
+			if (ret != -ENOSPC)
+				btrfs_abort_transaction(trans, root, ret);
 			goto out_put;
+		}
 	}
 
 	spin_lock(&root->cache_lock);
@@ -498,6 +508,8 @@
 out_put:
 	iput(inode);
 out_release:
+	trace_btrfs_space_reservation(root->fs_info, "ino_cache",
+				      trans->transid, trans->bytes_reserved, 0);
 	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
 out:
 	trans->block_rsv = rsv;
@@ -526,7 +538,7 @@
 	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 	if (ret < 0)
 		goto error;
-	BUG_ON(ret == 0);
+	BUG_ON(ret == 0); /* Corruption */
 	if (path->slots[0] > 0) {
 		slot = path->slots[0] - 1;
 		l = path->nodes[0];
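
A quick cross-check, not part of the patch, of the reservation sizing in the inode-map.c hunk above; the per-item counts listed in the comment have to add up to the multiplier passed to btrfs_calc_trans_metadata_size():

/*
 *   1  inode item insertion (if needed)
 * + 4  inode item update (worst case)
 * + 1  slack space if truncation is needed
 * + 1  free space object
 * + 3  pre-allocation
 * ----
 *  10  items
 *
 * trans->bytes_reserved = btrfs_calc_trans_metadata_size(root, 10);
 * reserved up front via btrfs_block_rsv_add(root, trans->block_rsv,
 *                                           trans->bytes_reserved,
 *                                           BTRFS_RESERVE_NO_FLUSH);
 */
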
diff -ur a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
--- a/fs/btrfs/ioctl.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/ioctl.c	2014-02-17 11:56:58.000000000 +0100
@@ -41,17 +41,25 @@
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
+#include <linux/uuid.h>
+#include <linux/btrfs.h>
+#include <linux/uaccess.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ioctl.h"
 #include "print-tree.h"
 #include "volumes.h"
 #include "locking.h"
 #include "inode-map.h"
 #include "backref.h"
+#include "rcu-string.h"
+#include "send.h"
+#include "dev-replace.h"
+
+static int btrfs_clone(struct inode *src, struct inode *inode,
+		       u64 off, u64 olen, u64 olen_aligned, u64 destoff);
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -137,8 +145,11 @@
 		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
 	}
 
-	if (flags & BTRFS_INODE_NODATACOW)
+	if (flags & BTRFS_INODE_NODATACOW) {
 		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
+		if (S_ISREG(inode->i_mode))
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
+	}
 
 	btrfs_update_iflags(inode);
 }
@@ -176,6 +187,9 @@
 	struct btrfs_trans_handle *trans;
 	unsigned int flags, oldflags;
 	int ret;
+	u64 ip_oldflags;
+	unsigned int i_oldflags;
+	umode_t mode;
 
 	if (btrfs_root_readonly(root))
 		return -EROFS;
@@ -190,8 +204,16 @@
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
 	mutex_lock(&inode->i_mutex);
 
+	ip_oldflags = ip->flags;
+	i_oldflags = inode->i_flags;
+	mode = inode->i_mode;
+
 	flags = btrfs_mask_flags(inode->i_mode, flags);
 	oldflags = btrfs_flags_to_ioctl(ip->flags);
 	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
@@ -201,10 +223,6 @@
 		}
 	}
 
-	ret = mnt_want_write_file(file);
-	if (ret)
-		goto out_unlock;
-
 	if (flags & FS_SYNC_FL)
 		ip->flags |= BTRFS_INODE_SYNC;
 	else
@@ -229,10 +247,31 @@
 		ip->flags |= BTRFS_INODE_DIRSYNC;
 	else
 		ip->flags &= ~BTRFS_INODE_DIRSYNC;
-	if (flags & FS_NOCOW_FL)
-		ip->flags |= BTRFS_INODE_NODATACOW;
-	else
-		ip->flags &= ~BTRFS_INODE_NODATACOW;
+	if (flags & FS_NOCOW_FL) {
+		if (S_ISREG(mode)) {
+			/*
+			 * It's safe to turn csums off here, no extents exist.
+			 * Otherwise we want the flag to reflect the real COW
+			 * status of the file and will not set it.
+			 */
+			if (inode->i_size == 0)
+				ip->flags |= BTRFS_INODE_NODATACOW
+					   | BTRFS_INODE_NODATASUM;
+		} else {
+			ip->flags |= BTRFS_INODE_NODATACOW;
+		}
+	} else {
+		 * Revert back under the same assumptions as above
+		 * Revert back under same assuptions as above
+		 */
+		if (S_ISREG(mode)) {
+			if (inode->i_size == 0)
+				ip->flags &= ~(BTRFS_INODE_NODATACOW
+				             | BTRFS_INODE_NODATASUM);
+		} else {
+			ip->flags &= ~BTRFS_INODE_NODATACOW;
+		}
+	}
 
 	/*
 	 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
@@ -249,21 +288,27 @@
 		ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
 	}
 
-	trans = btrfs_join_transaction(root);
-	BUG_ON(IS_ERR(trans));
+	trans = btrfs_start_transaction(root, 1);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_drop;
+	}
 
 	btrfs_update_iflags(inode);
+	inode_inc_iversion(inode);
 	inode->i_ctime = CURRENT_TIME;
 	ret = btrfs_update_inode(trans, root, inode);
-	BUG_ON(ret);
 
 	btrfs_end_transaction(trans, root);
+ out_drop:
+	if (ret) {
+		ip->flags = ip_oldflags;
+		inode->i_flags = i_oldflags;
+	}
 
-	mnt_drop_write_file(file);
-
-	ret = 0;
  out_unlock:
 	mutex_unlock(&inode->i_mutex);
+	mnt_drop_write_file(file);
 	return ret;
 }
 
@@ -276,14 +321,13 @@
 
 static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 {
-	struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
-	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(fdentry(file)->d_sb);
 	struct btrfs_device *device;
 	struct request_queue *q;
 	struct fstrim_range range;
 	u64 minlen = ULLONG_MAX;
 	u64 num_devices = 0;
-	u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+	u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -307,12 +351,13 @@
 		return -EOPNOTSUPP;
 	if (copy_from_user(&range, arg, sizeof(range)))
 		return -EFAULT;
-	if (range.start > total_bytes)
+	if (range.start > total_bytes ||
+	    range.len < fs_info->sb->s_blocksize)
 		return -EINVAL;
 
 	range.len = min(range.len, total_bytes - range.start);
 	range.minlen = max(range.minlen, minlen);
-	ret = btrfs_trim_fs(root, &range);
+	ret = btrfs_trim_fs(fs_info->tree_root, &range);
 	if (ret < 0)
 		return ret;
 
@@ -322,40 +367,61 @@
 	return 0;
 }
 
-static noinline int create_subvol(struct btrfs_root *root,
+int btrfs_is_empty_uuid(u8 *uuid)
+{
+	static char empty_uuid[BTRFS_UUID_SIZE] = {0};
+
+	return !memcmp(uuid, empty_uuid, BTRFS_UUID_SIZE);
+}
+
+static noinline int create_subvol(struct inode *dir,
 				  struct dentry *dentry,
 				  char *name, int namelen,
-				  u64 *async_transid)
+				  u64 *async_transid,
+				  struct btrfs_qgroup_inherit *inherit)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_key key;
 	struct btrfs_root_item root_item;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
+	struct btrfs_root *root = BTRFS_I(dir)->root;
 	struct btrfs_root *new_root;
-	struct dentry *parent = dentry->d_parent;
-	struct inode *dir;
+	struct btrfs_block_rsv block_rsv;
+	struct timespec cur_time = CURRENT_TIME;
 	int ret;
 	int err;
 	u64 objectid;
 	u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
 	u64 index = 0;
+	u64 qgroup_reserved;
+	uuid_le new_uuid;
 
 	ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
 	if (ret)
 		return ret;
 
-	dir = parent->d_inode;
-
+	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
 	/*
-	 * 1 - inode item
-	 * 2 - refs
-	 * 1 - root item
-	 * 2 - dir items
+	 * The same as for snapshot creation; see the comment
+	 * in create_snapshot().
 	 */
-	trans = btrfs_start_transaction(root, 6);
-	if (IS_ERR(trans))
-		return PTR_ERR(trans);
+	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+					       8, &qgroup_reserved, false);
+	if (ret)
+		return ret;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+	trans->block_rsv = &block_rsv;
+	trans->bytes_reserved = block_rsv.size;
+
+	ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit);
+	if (ret)
+		goto fail;
 
 	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
 				      0, objectid, NULL, 0, 0, 0);
@@ -370,25 +436,25 @@
 	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(leaf, objectid);
 
-	write_extent_buffer(leaf, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(leaf),
+	write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(leaf),
 			    BTRFS_FSID_SIZE);
 	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
-			    (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
+			    btrfs_header_chunk_tree_uuid(leaf),
 			    BTRFS_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
 
+	memset(&root_item, 0, sizeof(root_item));
+
 	inode_item = &root_item.inode;
-	memset(inode_item, 0, sizeof(*inode_item));
-	inode_item->generation = cpu_to_le64(1);
-	inode_item->size = cpu_to_le64(3);
-	inode_item->nlink = cpu_to_le32(1);
-	inode_item->nbytes = cpu_to_le64(root->leafsize);
-	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
-
-	root_item.flags = 0;
-	root_item.byte_limit = 0;
-	inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT);
+	btrfs_set_stack_inode_generation(inode_item, 1);
+	btrfs_set_stack_inode_size(inode_item, 3);
+	btrfs_set_stack_inode_nlink(inode_item, 1);
+	btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
+
+	btrfs_set_root_flags(&root_item, 0);
+	btrfs_set_root_limit(&root_item, 0);
+	btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
 
 	btrfs_set_root_bytenr(&root_item, leaf->start);
 	btrfs_set_root_generation(&root_item, trans->transid);
@@ -397,8 +463,15 @@
 	btrfs_set_root_used(&root_item, leaf->len);
 	btrfs_set_root_last_snapshot(&root_item, 0);
 
-	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
-	root_item.drop_level = 0;
+	btrfs_set_root_generation_v2(&root_item,
+			btrfs_root_generation(&root_item));
+	uuid_le_gen(&new_uuid);
+	memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
+	btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec);
+	btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec);
+	root_item.ctime = root_item.otime;
+	btrfs_set_root_ctransid(&root_item, trans->transid);
+	btrfs_set_root_otransid(&root_item, trans->transid);
 
 	btrfs_tree_unlock(leaf);
 	free_extent_buffer(leaf);
@@ -416,22 +489,37 @@
 
 	key.offset = (u64)-1;
 	new_root = btrfs_read_fs_root_no_name(root->fs_info, &key);
-	BUG_ON(IS_ERR(new_root));
+	if (IS_ERR(new_root)) {
+		btrfs_abort_transaction(trans, root, PTR_ERR(new_root));
+		ret = PTR_ERR(new_root);
+		goto fail;
+	}
 
 	btrfs_record_root_in_trans(trans, new_root);
 
 	ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
+	if (ret) {
+		/* We potentially lose an unused inode item here */
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
 	/*
 	 * insert the directory item
 	 */
 	ret = btrfs_set_inode_index(dir, &index);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	ret = btrfs_insert_dir_item(trans, root,
 				    name, namelen, dir, &key,
 				    BTRFS_FT_DIR, index);
-	if (ret)
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
 		goto fail;
+	}
 
 	btrfs_i_size_write(dir, dir->i_size + namelen * 2);
 	ret = btrfs_update_inode(trans, root, dir);
@@ -440,25 +528,43 @@
 	ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
 				 objectid, root->root_key.objectid,
 				 btrfs_ino(dir), index, name, namelen);
-
 	BUG_ON(ret);
 
-	d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
+	ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
+				  root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+				  objectid);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+
 fail:
+	trans->block_rsv = NULL;
+	trans->bytes_reserved = 0;
 	if (async_transid) {
 		*async_transid = trans->transid;
 		err = btrfs_commit_transaction_async(trans, root, 1);
+		if (err)
+			err = btrfs_commit_transaction(trans, root);
 	} else {
 		err = btrfs_commit_transaction(trans, root);
 	}
 	if (err && !ret)
 		ret = err;
+
+	if (!ret)
+#ifdef MY_ABC_HERE
+		d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry, 0));
+#else
+		d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
+#endif
+out:
+	btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
 	return ret;
 }
 
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
-			   char *name, int namelen, u64 *async_transid,
-			   bool readonly)
+static int create_snapshot(struct btrfs_root *root, struct inode *dir,
+			   struct dentry *dentry, char *name, int namelen,
+			   u64 *async_transid, bool readonly,
+			   struct btrfs_qgroup_inherit *inherit)
 {
 	struct inode *inode;
 	struct btrfs_pending_snapshot *pending_snapshot;
@@ -468,24 +574,45 @@
 	if (!root->ref_cows)
 		return -EINVAL;
 
+	ret = btrfs_start_delalloc_inodes(root, 0);
+	if (ret)
+		return ret;
+
+	btrfs_wait_ordered_extents(root);
+
 	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
 	if (!pending_snapshot)
 		return -ENOMEM;
 
-	btrfs_init_block_rsv(&pending_snapshot->block_rsv);
+	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
+			     BTRFS_BLOCK_RSV_TEMP);
+	/*
+	 * 1 - parent dir inode
+	 * 2 - dir entries
+	 * 1 - root item
+	 * 2 - root ref/backref
+	 * 1 - root of snapshot
+	 * 1 - UUID item
+	 */
+	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
+					&pending_snapshot->block_rsv, 8,
+					&pending_snapshot->qgroup_reserved,
+					false);
+	if (ret)
+		goto out;
+
 	pending_snapshot->dentry = dentry;
 	pending_snapshot->root = root;
 	pending_snapshot->readonly = readonly;
+	pending_snapshot->dir = dir;
+	pending_snapshot->inherit = inherit;
 
-	trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		goto fail;
 	}
 
-	ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
-	BUG_ON(ret);
-
 	spin_lock(&root->fs_info->trans_lock);
 	list_add(&pending_snapshot->list,
 		 &trans->transaction->pending_snapshots);
@@ -494,11 +621,14 @@
 		*async_transid = trans->transid;
 		ret = btrfs_commit_transaction_async(trans,
 				     root->fs_info->extent_root, 1);
+		if (ret)
+			ret = btrfs_commit_transaction(trans, root);
 	} else {
 		ret = btrfs_commit_transaction(trans,
 					       root->fs_info->extent_root);
 	}
-	BUG_ON(ret);
+	if (ret)
+		goto fail;
 
 	ret = pending_snapshot->error;
 	if (ret)
@@ -508,7 +638,11 @@
 	if (ret)
 		goto fail;
 
+#ifdef MY_ABC_HERE
+	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry, 0);
+#else
 	inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
+#endif
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
 		goto fail;
@@ -517,6 +651,10 @@
 	d_instantiate(dentry, inode);
 	ret = 0;
 fail:
+	btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
+					 &pending_snapshot->block_rsv,
+					 pending_snapshot->qgroup_reserved);
+out:
 	kfree(pending_snapshot);
 	return ret;
 }
@@ -609,13 +747,16 @@
 static noinline int btrfs_mksubvol(struct path *parent,
 				   char *name, int namelen,
 				   struct btrfs_root *snap_src,
-				   u64 *async_transid, bool readonly)
+				   u64 *async_transid, bool readonly,
+				   struct btrfs_qgroup_inherit *inherit)
 {
 	struct inode *dir  = parent->dentry->d_inode;
 	struct dentry *dentry;
 	int error;
 
-	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	error = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	if (error == -EINTR)
+		return error;
 
 	dentry = lookup_one_len(name, parent->dentry, namelen);
 	error = PTR_ERR(dentry);
@@ -626,13 +767,19 @@
 	if (dentry->d_inode)
 		goto out_dput;
 
-	error = mnt_want_write(parent->mnt);
+	error = btrfs_may_create(dir, dentry);
 	if (error)
 		goto out_dput;
 
-	error = btrfs_may_create(dir, dentry);
+	/*
+	 * even if this name doesn't exist, we may get hash collisions.
+	 * check for them now when we can safely fail
+	 */
+	error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
+					       dir->i_ino, name,
+					       namelen);
 	if (error)
-		goto out_drop_write;
+		goto out_dput;
 
 	down_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
 
@@ -640,18 +787,16 @@
 		goto out_up_read;
 
 	if (snap_src) {
-		error = create_snapshot(snap_src, dentry,
-					name, namelen, async_transid, readonly);
+		error = create_snapshot(snap_src, dir, dentry, name, namelen,
+					async_transid, readonly, inherit);
 	} else {
-		error = create_subvol(BTRFS_I(dir)->root, dentry,
-				      name, namelen, async_transid);
+		error = create_subvol(dir, dentry, name, namelen,
+				      async_transid, inherit);
 	}
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
 out_up_read:
 	up_read(&BTRFS_I(dir)->root->fs_info->subvol_sem);
-out_drop_write:
-	mnt_drop_write(parent->mnt);
 out_dput:
 	dput(dentry);
 out_unlock:
@@ -728,7 +873,7 @@
 
 	while(1) {
 		ret = btrfs_search_forward(root, &min_key, &max_key,
-					   path, 0, newer_than);
+					   path, newer_than);
 		if (ret != 0)
 			goto none;
 		if (min_key.objectid != ino)
@@ -760,23 +905,12 @@
 	return -ENOENT;
 }
 
-static int should_defrag_range(struct inode *inode, u64 start, u64 len,
-			       int thresh, u64 *last_len, u64 *skip,
-			       u64 *defrag_end)
+static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
 {
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	struct extent_map *em = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	int ret = 1;
-
-	/*
-	 * make sure that once we start defragging an extent, we keep on
-	 * defragging it
-	 */
-	if (start < *defrag_end)
-		return 1;
-
-	*skip = 0;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map *em;
+	u64 len = PAGE_CACHE_SIZE;
 
 	/*
 	 * hopefully we have this extent in the tree already, try without
@@ -788,24 +922,71 @@
 
 	if (!em) {
 		/* get the big lock and read metadata off disk */
-		lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+		lock_extent(io_tree, start, start + len - 1);
 		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
-		unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
+		unlock_extent(io_tree, start, start + len - 1);
 
 		if (IS_ERR(em))
-			return 0;
+			return NULL;
 	}
 
+	return em;
+}
+
+static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
+{
+	struct extent_map *next;
+	bool ret = true;
+
+	/* this is the last extent */
+	if (em->start + em->len >= i_size_read(inode))
+		return false;
+
+	next = defrag_lookup_extent(inode, em->start + em->len);
+	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+		ret = false;
+
+	free_extent_map(next);
+	return ret;
+}
+
+static int should_defrag_range(struct inode *inode, u64 start, int thresh,
+			       u64 *last_len, u64 *skip, u64 *defrag_end,
+			       int compress)
+{
+	struct extent_map *em;
+	int ret = 1;
+	bool next_mergeable = true;
+
+	/*
+	 * make sure that once we start defragging an extent, we keep on
+	 * defragging it
+	 */
+	if (start < *defrag_end)
+		return 1;
+
+	*skip = 0;
+
+	em = defrag_lookup_extent(inode, start);
+	if (!em)
+		return 0;
+
 	/* this will cover holes, and inline extents */
-	if (em->block_start >= EXTENT_MAP_LAST_BYTE)
+	if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
 		ret = 0;
+		goto out;
+	}
+
+	next_mergeable = defrag_check_next_extent(inode, em);
 
 	/*
-	 * we hit a real extent, if it is big don't bother defragging it again
+	 * we hit a real extent; if it is big or the next extent is not a
+	 * real extent, don't bother defragging it
 	 */
-	if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
+	if (!compress && (*last_len == 0 || *last_len >= thresh) &&
+	    (em->len >= thresh || !next_mergeable))
 		ret = 0;
-
+out:
 	/*
 	 * last_len ends up being a counter of how many bytes we've defragged.
 	 * every time we choose not to defrag an extent, we reset *last_len
@@ -847,35 +1028,62 @@
 	u64 isize = i_size_read(inode);
 	u64 page_start;
 	u64 page_end;
+	u64 page_cnt;
 	int ret;
 	int i;
 	int i_done;
 	struct btrfs_ordered_extent *ordered;
 	struct extent_state *cached_state = NULL;
+	struct extent_io_tree *tree;
 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 
-	if (isize == 0)
-		return 0;
 	file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
+	if (!isize || start_index > file_end)
+		return 0;
+
+	page_cnt = min_t(u64, (u64)num_pages, (u64)file_end - start_index + 1);
 
-	mutex_lock(&inode->i_mutex);
 	ret = btrfs_delalloc_reserve_space(inode,
-					   num_pages << PAGE_CACHE_SHIFT);
-	mutex_unlock(&inode->i_mutex);
+					   page_cnt << PAGE_CACHE_SHIFT);
 	if (ret)
 		return ret;
-again:
-	ret = 0;
 	i_done = 0;
+	tree = &BTRFS_I(inode)->io_tree;
 
 	/* step one, lock all the pages */
-	for (i = 0; i < num_pages; i++) {
+	for (i = 0; i < page_cnt; i++) {
 		struct page *page;
+again:
 		page = find_or_create_page(inode->i_mapping,
-					    start_index + i, mask);
+					   start_index + i, mask);
 		if (!page)
 			break;
 
+		page_start = page_offset(page);
+		page_end = page_start + PAGE_CACHE_SIZE - 1;
+		while (1) {
+			lock_extent(tree, page_start, page_end);
+			ordered = btrfs_lookup_ordered_extent(inode,
+							      page_start);
+			unlock_extent(tree, page_start, page_end);
+			if (!ordered)
+				break;
+
+			unlock_page(page);
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+			lock_page(page);
+			/*
+			 * we unlocked the page above, so we need to check if
+			 * it was released or not.
+			 */
+			if (page->mapping != inode->i_mapping) {
+				unlock_page(page);
+				page_cache_release(page);
+				goto again;
+			}
+		}
+
 		if (!PageUptodate(page)) {
 			btrfs_readpage(NULL, page);
 			lock_page(page);
@@ -886,15 +1094,13 @@
 				break;
 			}
 		}
-		isize = i_size_read(inode);
-		file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
-		if (!isize || page->index > file_end ||
-		    page->mapping != inode->i_mapping) {
-			/* whoops, we blew past eof, skip this page */
+
+		if (page->mapping != inode->i_mapping) {
 			unlock_page(page);
 			page_cache_release(page);
-			break;
+			goto again;
 		}
+
 		pages[i] = page;
 		i_done++;
 	}
@@ -915,43 +1121,23 @@
 	page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
 
 	lock_extent_bits(&BTRFS_I(inode)->io_tree,
-			 page_start, page_end - 1, 0, &cached_state,
-			 GFP_NOFS);
-	ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1);
-	if (ordered &&
-	    ordered->file_offset + ordered->len > page_start &&
-	    ordered->file_offset < page_end) {
-		btrfs_put_ordered_extent(ordered);
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
-				     page_start, page_end - 1,
-				     &cached_state, GFP_NOFS);
-		for (i = 0; i < i_done; i++) {
-			unlock_page(pages[i]);
-			page_cache_release(pages[i]);
-		}
-		btrfs_wait_ordered_range(inode, page_start,
-					 page_end - page_start);
-		goto again;
-	}
-	if (ordered)
-		btrfs_put_ordered_extent(ordered);
-
+			 page_start, page_end - 1, 0, &cached_state);
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
 			  page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-			  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
-			  GFP_NOFS);
+			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
+			  &cached_state, GFP_NOFS);
 
-	if (i_done != num_pages) {
+	if (i_done != page_cnt) {
 		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->outstanding_extents++;
 		spin_unlock(&BTRFS_I(inode)->lock);
 		btrfs_delalloc_release_space(inode,
-				     (num_pages - i_done) << PAGE_CACHE_SHIFT);
+				     (page_cnt - i_done) << PAGE_CACHE_SHIFT);
 	}
 
 
-	btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
-				  &cached_state);
+	set_extent_defrag(&BTRFS_I(inode)->io_tree, page_start, page_end - 1,
+			  &cached_state, GFP_NOFS);
 
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree,
 			     page_start, page_end - 1, &cached_state,
@@ -971,7 +1157,7 @@
 		unlock_page(pages[i]);
 		page_cache_release(pages[i]);
 	}
-	btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT);
+	btrfs_delalloc_release_space(inode, page_cnt << PAGE_CACHE_SHIFT);
 	return ret;
 
 }
@@ -981,11 +1167,9 @@
 		      u64 newer_than, unsigned long max_to_defrag)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_super_block *disk_super;
 	struct file_ra_state *ra = NULL;
 	unsigned long last_index;
 	u64 isize = i_size_read(inode);
-	u64 features;
 	u64 last_len = 0;
 	u64 skip = 0;
 	u64 defrag_end = 0;
@@ -1001,8 +1185,11 @@
 	u64 new_align = ~((u64)128 * 1024 - 1);
 	struct page **pages = NULL;
 
-	if (extent_thresh == 0)
-		extent_thresh = 256 * 1024;
+	if (isize == 0)
+		return 0;
+
+	if (range->start >= isize)
+		return -EINVAL;
 
 	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
 		if (range->compress_type > BTRFS_COMPRESS_TYPES)
@@ -1011,8 +1198,8 @@
 			compress_type = range->compress_type;
 	}
 
-	if (isize == 0)
-		return 0;
+	if (extent_thresh == 0)
+		extent_thresh = 256 * 1024;
 
 	/*
 	 * if we were not given a file, allocate a readahead
@@ -1058,7 +1245,7 @@
 		i = range->start >> PAGE_CACHE_SHIFT;
 	}
 	if (!max_to_defrag)
-		max_to_defrag = last_index;
+		max_to_defrag = last_index + 1;
 
 	/*
 	 * make writeback starts from i, so the defrag range can be
@@ -1077,12 +1264,16 @@
 		if (!(inode->i_sb->s_flags & MS_ACTIVE))
 			break;
 
-		if (!newer_than &&
-		    !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
-					PAGE_CACHE_SIZE,
-					extent_thresh,
-					&last_len, &skip,
-					&defrag_end)) {
+		if (btrfs_defrag_cancelled(root->fs_info)) {
+			printk(KERN_DEBUG "btrfs: defrag_file cancelled\n");
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
+					 extent_thresh, &last_len, &skip,
+					 &defrag_end, range->flags &
+					 BTRFS_DEFRAG_RANGE_COMPRESS)) {
 			unsigned long next;
 			/*
 			 * the should_defrag function tells us how much to skip
@@ -1101,9 +1292,6 @@
 			cluster = max_cluster;
 		}
 
-		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
-			BTRFS_I(inode)->force_compress = compress_type;
-
 		if (i + cluster > ra_index) {
 			ra_index = max(i, ra_index);
 			btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
@@ -1111,17 +1299,26 @@
 			ra_index += max_cluster;
 		}
 
+		mutex_lock(&inode->i_mutex);
+		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
+			BTRFS_I(inode)->force_compress = compress_type;
 		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
-		if (ret < 0)
+		if (ret < 0) {
+			mutex_unlock(&inode->i_mutex);
 			goto out_ra;
+		}
 
 		defrag_count += ret;
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+		balance_dirty_pages_ratelimited(inode->i_mapping);
+		mutex_unlock(&inode->i_mutex);
 
 		if (newer_than) {
 			if (newer_off == (u64)-1)
 				break;
 
+			if (ret > 0)
+				i += ret;
+
 			newer_off = max(newer_off + 1,
 					(u64)i << PAGE_CACHE_SHIFT);
 
@@ -1161,34 +1358,33 @@
 			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 		}
 		atomic_dec(&root->fs_info->async_submit_draining);
-
-		mutex_lock(&inode->i_mutex);
-		BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
-		mutex_unlock(&inode->i_mutex);
 	}
 
-	disk_super = root->fs_info->super_copy;
-	features = btrfs_super_incompat_flags(disk_super);
 	if (range->compress_type == BTRFS_COMPRESS_LZO) {
-		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
-		btrfs_set_super_incompat_flags(disk_super, features);
+		btrfs_set_fs_incompat(root->fs_info, COMPRESS_LZO);
 	}
 
 	ret = defrag_count;
 
 out_ra:
+	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
+		mutex_lock(&inode->i_mutex);
+		BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
+		mutex_unlock(&inode->i_mutex);
+	}
 	if (!file)
 		kfree(ra);
 	kfree(pages);
 	return ret;
 }
 
-static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
+static noinline int btrfs_ioctl_resize(struct file *file,
 					void __user *arg)
 {
 	u64 new_size;
 	u64 old_size;
 	u64 devid = 1;
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 	struct btrfs_ioctl_vol_args *vol_args;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_device *device = NULL;
@@ -1197,19 +1393,28 @@
 	int ret = 0;
 	int mod = 0;
 
-	if (root->fs_info->sb->s_flags & MS_RDONLY)
-		return -EROFS;
-
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+			1)) {
+		mnt_drop_write_file(file);
+		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+	}
+
+	mutex_lock(&root->fs_info->volume_mutex);
 	vol_args = memdup_user(arg, sizeof(*vol_args));
-	if (IS_ERR(vol_args))
-		return PTR_ERR(vol_args);
+	if (IS_ERR(vol_args)) {
+		ret = PTR_ERR(vol_args);
+		goto out;
+	}
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 
-	mutex_lock(&root->fs_info->volume_mutex);
 	sizestr = vol_args->name;
 	devstr = strchr(sizestr, ':');
 	if (devstr) {
@@ -1218,16 +1423,29 @@
 		*devstr = '\0';
 		devstr = vol_args->name;
 		devid = simple_strtoull(devstr, &end, 10);
-		printk(KERN_INFO "btrfs: resizing devid %llu\n",
-		       (unsigned long long)devid);
+		if (!devid) {
+			ret = -EINVAL;
+			goto out_free;
+		}
+		printk(KERN_INFO "btrfs: resizing devid %llu\n", devid);
 	}
-	device = btrfs_find_device(root, devid, NULL, NULL);
+
+	device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
 	if (!device) {
 		printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
-		       (unsigned long long)devid);
-		ret = -EINVAL;
-		goto out_unlock;
+		       devid);
+		ret = -ENODEV;
+		goto out_free;
+	}
+
+	if (!device->writeable) {
+		printk(KERN_INFO "btrfs: resizer unable to apply on "
+		       "readonly device %llu\n",
+		       devid);
+		ret = -EPERM;
+		goto out_free;
 	}
+
 	if (!strcmp(sizestr, "max"))
 		new_size = device->bdev->bd_inode->i_size;
 	else {
@@ -1241,16 +1459,21 @@
 		new_size = memparse(sizestr, NULL);
 		if (new_size == 0) {
 			ret = -EINVAL;
-			goto out_unlock;
+			goto out_free;
 		}
 	}
 
+	if (device->is_tgtdev_for_dev_replace) {
+		ret = -EPERM;
+		goto out_free;
+	}
+
 	old_size = device->total_bytes;
 
 	if (mod < 0) {
 		if (new_size > old_size) {
 			ret = -EINVAL;
-			goto out_unlock;
+			goto out_free;
 		}
 		new_size = old_size - new_size;
 	} else if (mod > 0) {
@@ -1259,67 +1482,75 @@
 
 	if (new_size < 256 * 1024 * 1024) {
 		ret = -EINVAL;
-		goto out_unlock;
+		goto out_free;
 	}
 	if (new_size > device->bdev->bd_inode->i_size) {
 		ret = -EFBIG;
-		goto out_unlock;
+		goto out_free;
 	}
 
 	do_div(new_size, root->sectorsize);
 	new_size *= root->sectorsize;
 
-	printk(KERN_INFO "btrfs: new size for %s is %llu\n",
-		device->name, (unsigned long long)new_size);
+	printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n",
+		      rcu_str_deref(device->name), new_size);
 
 	if (new_size > old_size) {
 		trans = btrfs_start_transaction(root, 0);
 		if (IS_ERR(trans)) {
 			ret = PTR_ERR(trans);
-			goto out_unlock;
+			goto out_free;
 		}
 		ret = btrfs_grow_device(trans, device, new_size);
 		btrfs_commit_transaction(trans, root);
 	} else if (new_size < old_size) {
 		ret = btrfs_shrink_device(device, new_size);
-	}
+	} /* equal, nothing to do */
 
-out_unlock:
-	mutex_unlock(&root->fs_info->volume_mutex);
+out_free:
 	kfree(vol_args);
+out:
+	mutex_unlock(&root->fs_info->volume_mutex);
+	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+	mnt_drop_write_file(file);
 	return ret;
 }
 
 static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
-						    char *name,
-						    unsigned long fd,
-						    int subvol,
-						    u64 *transid,
-						    bool readonly)
+				char *name, unsigned long fd, int subvol,
+				u64 *transid, bool readonly,
+				struct btrfs_qgroup_inherit *inherit)
 {
-	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 	struct file *src_file;
 	int namelen;
 	int ret = 0;
 
-	if (root->fs_info->sb->s_flags & MS_RDONLY)
-		return -EROFS;
+	ret = mnt_want_write_file(file);
+	if (ret)
+		goto out;
 
 	namelen = strlen(name);
 	if (strchr(name, '/')) {
 		ret = -EINVAL;
-		goto out;
+		goto out_drop_write;
+	}
+
+	if (name[0] == '.' &&
+	   (namelen == 1 || (name[1] == '.' && namelen == 2))) {
+		ret = -EEXIST;
+		goto out_drop_write;
 	}
 
 	if (subvol) {
 		ret = btrfs_mksubvol(&file->f_path, name, namelen,
-				     NULL, transid, readonly);
+				     NULL, transid, readonly, inherit);
 	} else {
 		struct inode *src_inode;
-		src_file = fget(fd);
+		int fput_needed;
+		src_file = fget_light(fd, &fput_needed);
 		if (!src_file) {
 			ret = -EINVAL;
-			goto out;
+			goto out_drop_write;
 		}
 
 		src_inode = src_file->f_path.dentry->d_inode;
@@ -1327,14 +1558,15 @@
 			printk(KERN_INFO "btrfs: Snapshot src from "
 			       "another FS\n");
 			ret = -EINVAL;
-			fput(src_file);
-			goto out;
+		} else {
+			ret = btrfs_mksubvol(&file->f_path, name, namelen,
+					     BTRFS_I(src_inode)->root,
+					     transid, readonly, inherit);
 		}
-		ret = btrfs_mksubvol(&file->f_path, name, namelen,
-				     BTRFS_I(src_inode)->root,
-				     transid, readonly);
-		fput(src_file);
+		fput_light(src_file, fput_needed);
 	}
+out_drop_write:
+	mnt_drop_write_file(file);
 out:
 	return ret;
 }
@@ -1352,7 +1584,7 @@
 
 	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
 					      vol_args->fd, subvol,
-					      NULL, false);
+					      NULL, false, NULL);
 
 	kfree(vol_args);
 	return ret;
@@ -1366,6 +1598,7 @@
 	u64 transid = 0;
 	u64 *ptr = NULL;
 	bool readonly = false;
+	struct btrfs_qgroup_inherit *inherit = NULL;
 
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args))
@@ -1373,7 +1606,8 @@
 	vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
 
 	if (vol_args->flags &
-	    ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
+	    ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
+	      BTRFS_SUBVOL_QGROUP_INHERIT)) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
@@ -1382,10 +1616,21 @@
 		ptr = &transid;
 	if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
 		readonly = true;
+	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
+		if (vol_args->size > PAGE_CACHE_SIZE) {
+			ret = -EINVAL;
+			goto out;
+		}
+		inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
+		if (IS_ERR(inherit)) {
+			ret = PTR_ERR(inherit);
+			goto out;
+		}
+	}
 
 	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
-					      vol_args->fd, subvol,
-					      ptr, readonly);
+					      vol_args->fd, subvol, ptr,
+					      readonly, inherit);
 
 	if (ret == 0 && ptr &&
 	    copy_to_user(arg +
@@ -1394,6 +1639,7 @@
 		ret = -EFAULT;
 out:
 	kfree(vol_args);
+	kfree(inherit);
 	return ret;
 }
 
@@ -1429,29 +1675,40 @@
 	u64 flags;
 	int ret = 0;
 
-	if (root->fs_info->sb->s_flags & MS_RDONLY)
-		return -EROFS;
+	ret = mnt_want_write_file(file);
+	if (ret)
+		goto out;
 
-	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
-		return -EINVAL;
+	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
+		ret = -EINVAL;
+		goto out_drop_write;
+	}
 
-	if (copy_from_user(&flags, arg, sizeof(flags)))
-		return -EFAULT;
+	if (copy_from_user(&flags, arg, sizeof(flags))) {
+		ret = -EFAULT;
+		goto out_drop_write;
+	}
 
-	if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
-		return -EINVAL;
+	if (flags & BTRFS_SUBVOL_CREATE_ASYNC) {
+		ret = -EINVAL;
+		goto out_drop_write;
+	}
 
-	if (flags & ~BTRFS_SUBVOL_RDONLY)
-		return -EOPNOTSUPP;
+	if (flags & ~BTRFS_SUBVOL_RDONLY) {
+		ret = -EOPNOTSUPP;
+		goto out_drop_write;
+	}
 
-	if (!inode_owner_or_capable(inode))
-		return -EACCES;
+	if (!inode_owner_or_capable(inode)) {
+		ret = -EACCES;
+		goto out_drop_write;
+	}
 
 	down_write(&root->fs_info->subvol_sem);
 
 	/* nothing to do */
 	if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
-		goto out;
+		goto out_drop_sem;
 
 	root_flags = btrfs_root_flags(&root->root_item);
 	if (flags & BTRFS_SUBVOL_RDONLY)
@@ -1474,8 +1731,11 @@
 out_reset:
 	if (ret)
 		btrfs_set_root_flags(&root->root_item, root_flags);
-out:
+out_drop_sem:
 	up_write(&root->fs_info->subvol_sem);
+out_drop_write:
+	mnt_drop_write_file(file);
+out:
 	return ret;
 }
 
@@ -1485,13 +1745,28 @@
 static noinline int may_destroy_subvol(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
+	struct btrfs_dir_item *di;
 	struct btrfs_key key;
+	u64 dir_id;
 	int ret;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
+	/* Make sure this root isn't set as the default subvol */
+	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+	di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path,
+				   dir_id, "default", 7, 0);
+	if (di && !IS_ERR(di)) {
+		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+		if (key.objectid == root->root_key.objectid) {
+			ret = -ENOTEMPTY;
+			goto out;
+		}
+		btrfs_release_path(path);
+	}
+
 	key.objectid = root->root_key.objectid;
 	key.type = BTRFS_ROOT_REF_KEY;
 	key.offset = (u64)-1;
@@ -1571,7 +1846,11 @@
 		item_off = btrfs_item_ptr_offset(leaf, i);
 		item_len = btrfs_item_size_nr(leaf, i);
 
-		if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
+		btrfs_item_key_to_cpu(leaf, key, i);
+		if (!key_in_sk(key, sk))
+			continue;
+
+		if (sizeof(sh) + item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
 			item_len = 0;
 
 		if (sizeof(sh) + item_len + *sk_offset >
@@ -1580,10 +1859,6 @@
 			goto overflow;
 		}
 
-		btrfs_item_key_to_cpu(leaf, key, i);
-		if (!key_in_sk(key, sk))
-			continue;
-
 		sh.objectid = key->objectid;
 		sh.offset = key->offset;
 		sh.type = key->type;
@@ -1667,7 +1942,7 @@
 	path->keep_locks = 1;
 
 	while(1) {
-		ret = btrfs_search_forward(root, &key, &max_key, path, 0,
+		ret = btrfs_search_forward(root, &key, &max_key, path,
 					   sk->min_transid);
 		if (ret != 0) {
 			if (ret > 0)
@@ -1757,25 +2032,29 @@
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
+		else if (ret > 0) {
+			ret = btrfs_previous_item(root, path, dirid,
+						  BTRFS_INODE_REF_KEY);
+			if (ret < 0)
+				goto out;
+			else if (ret > 0) {
+				ret = -ENOENT;
+				goto out;
+			}
+		}
 
 		l = path->nodes[0];
 		slot = path->slots[0];
-		if (ret > 0 && slot > 0)
-			slot--;
 		btrfs_item_key_to_cpu(l, &key, slot);
 
-		if (ret > 0 && (key.objectid != dirid ||
-				key.type != BTRFS_INODE_REF_KEY)) {
-			ret = -ENOENT;
-			goto out;
-		}
-
 		iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
 		len = btrfs_inode_ref_name_len(l, iref);
 		ptr -= len + 1;
 		total_len += len + 1;
-		if (ptr < name)
+		if (ptr < name) {
+			ret = -ENAMETOOLONG;
 			goto out;
+		}
 
 		*(ptr + len) = '/';
 		read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
@@ -1788,8 +2067,6 @@
 		key.offset = (u64)-1;
 		dirid = key.objectid;
 	}
-	if (ptr < name)
-		goto out;
 	memmove(name, ptr, total_len);
 	name[total_len]='\0';
 	ret = 0;
@@ -1839,6 +2116,8 @@
 	struct btrfs_root *dest = NULL;
 	struct btrfs_ioctl_vol_args *vol_args;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_block_rsv block_rsv;
+	u64 qgroup_reserved;
 	int namelen;
 	int ret;
 	int err = 0;
@@ -1859,7 +2138,9 @@
 	if (err)
 		goto out;
 
-	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	err = mutex_lock_killable_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	if (err == -EINTR)
+		goto out;
 	dentry = lookup_one_len(vol_args->name, parent, namelen);
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
@@ -1905,13 +2186,13 @@
 		err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
 		if (err)
 			goto out_dput;
-
-		/* check if subvolume may be deleted by a non-root user */
-		err = btrfs_may_delete(dir, dentry, 1);
-		if (err)
-			goto out_dput;
 	}
 
+	/* check if subvolume may be deleted by a user */
+	err = btrfs_may_delete(dir, dentry, 1);
+	if (err)
+		goto out_dput;
+
 	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
 		err = -EINVAL;
 		goto out_dput;
@@ -1928,18 +2209,33 @@
 	if (err)
 		goto out_up_write;
 
+	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
+	/*
+	 * One for dir inode, two for dir entries, two for root
+	 * ref/backref.
+	 */
+	err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
+					       5, &qgroup_reserved, true);
+	if (err)
+		goto out_up_write;
+
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
-		goto out_up_write;
+		goto out_release;
 	}
-	trans->block_rsv = &root->fs_info->global_block_rsv;
+	trans->block_rsv = &block_rsv;
+	trans->bytes_reserved = block_rsv.size;
 
 	ret = btrfs_unlink_subvol(trans, root, dir,
 				dest->root_key.objectid,
 				dentry->d_name.name,
 				dentry->d_name.len);
-	BUG_ON(ret);
+	if (ret) {
+		err = ret;
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_end_trans;
+	}
 
 	btrfs_record_root_in_trans(trans, dest);
 
@@ -1952,12 +2248,42 @@
 		ret = btrfs_insert_orphan_item(trans,
 					root->fs_info->tree_root,
 					dest->root_key.objectid);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			err = ret;
+			goto out_end_trans;
+		}
+	}
+
+	ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+				  dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+				  dest->root_key.objectid);
+	if (ret && ret != -ENOENT) {
+		btrfs_abort_transaction(trans, root, ret);
+		err = ret;
+		goto out_end_trans;
+	}
+	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
+		ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+					  dest->root_item.received_uuid,
+					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+					  dest->root_key.objectid);
+		if (ret && ret != -ENOENT) {
+			btrfs_abort_transaction(trans, root, ret);
+			err = ret;
+			goto out_end_trans;
+		}
 	}
 
+out_end_trans:
+	trans->block_rsv = NULL;
+	trans->bytes_reserved = 0;
 	ret = btrfs_end_transaction(trans, root);
-	BUG_ON(ret);
+	if (ret && !err)
+		err = ret;
 	inode->i_flags |= S_DEAD;
+out_release:
+	btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved);
 out_up_write:
 	up_write(&root->fs_info->subvol_sem);
 out_unlock:
@@ -1966,6 +2292,12 @@
 		shrink_dcache_sb(root->fs_info->sb);
 		btrfs_invalidate_inodes(dest);
 		d_delete(dentry);
+
+		/* the last ref */
+		if (dest->cache_inode) {
+			iput(dest->cache_inode);
+			dest->cache_inode = NULL;
+		}
 	}
 out_dput:
 	dput(dentry);
@@ -1984,23 +2316,25 @@
 	struct btrfs_ioctl_defrag_range_args *range;
 	int ret;
 
-	if (btrfs_root_readonly(root))
-		return -EROFS;
-
 	ret = mnt_want_write_file(file);
 	if (ret)
 		return ret;
 
+	if (btrfs_root_readonly(root)) {
+		ret = -EROFS;
+		goto out;
+	}
+
 	switch (inode->i_mode & S_IFMT) {
 	case S_IFDIR:
 		if (!capable(CAP_SYS_ADMIN)) {
 			ret = -EPERM;
 			goto out;
 		}
-		ret = btrfs_defrag_root(root, 0);
+		ret = btrfs_defrag_root(root);
 		if (ret)
 			goto out;
-		ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
+		ret = btrfs_defrag_root(root->fs_info->extent_root);
 		break;
 	case S_IFREG:
 		if (!(file->f_mode & FMODE_WRITE)) {
@@ -2052,36 +2386,63 @@
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+			1)) {
+		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+	}
+
+	mutex_lock(&root->fs_info->volume_mutex);
 	vol_args = memdup_user(arg, sizeof(*vol_args));
-	if (IS_ERR(vol_args))
-		return PTR_ERR(vol_args);
+	if (IS_ERR(vol_args)) {
+		ret = PTR_ERR(vol_args);
+		goto out;
+	}
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
 	ret = btrfs_init_new_device(root, vol_args->name);
 
 	kfree(vol_args);
+out:
+	mutex_unlock(&root->fs_info->volume_mutex);
+	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
 	return ret;
 }
 
-static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 {
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 	struct btrfs_ioctl_vol_args *vol_args;
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (root->fs_info->sb->s_flags & MS_RDONLY)
-		return -EROFS;
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
 
 	vol_args = memdup_user(arg, sizeof(*vol_args));
-	if (IS_ERR(vol_args))
-		return PTR_ERR(vol_args);
+	if (IS_ERR(vol_args)) {
+		ret = PTR_ERR(vol_args);
+		goto out;
+	}
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
+
+	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
+			1)) {
+		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+		goto out;
+	}
+
+	mutex_lock(&root->fs_info->volume_mutex);
 	ret = btrfs_rm_device(root, vol_args->name);
+	mutex_unlock(&root->fs_info->volume_mutex);
+	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
 
+out:
 	kfree(vol_args);
+	mnt_drop_write_file(file);
 	return ret;
 }
 
@@ -2100,10 +2461,10 @@
 	if (!fi_args)
 		return -ENOMEM;
 
+	mutex_lock(&fs_devices->device_list_mutex);
 	fi_args->num_devices = fs_devices->num_devices;
 	memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
 
-	mutex_lock(&fs_devices->device_list_mutex);
 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 		if (device->devid > fi_args->max_id)
 			fi_args->max_id = device->devid;
@@ -2124,7 +2485,6 @@
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 	int ret = 0;
 	char *s_uuid = NULL;
-	char empty_uuid[BTRFS_UUID_SIZE] = {0};
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -2133,12 +2493,11 @@
 	if (IS_ERR(di_args))
 		return PTR_ERR(di_args);
 
-	if (memcmp(empty_uuid, di_args->uuid, BTRFS_UUID_SIZE) != 0)
+	if (!btrfs_is_empty_uuid(di_args->uuid))
 		s_uuid = di_args->uuid;
 
 	mutex_lock(&fs_devices->device_list_mutex);
-	dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL);
-	mutex_unlock(&fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root->fs_info, di_args->devid, s_uuid, NULL);
 
 	if (!dev) {
 		ret = -ENODEV;
@@ -2149,9 +2508,20 @@
 	di_args->bytes_used = dev->bytes_used;
 	di_args->total_bytes = dev->total_bytes;
 	memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
-	strncpy(di_args->path, dev->name, sizeof(di_args->path));
+	if (dev->name) {
+		struct rcu_string *name;
+
+		rcu_read_lock();
+		name = rcu_dereference(dev->name);
+		strncpy(di_args->path, name->str, sizeof(di_args->path));
+		rcu_read_unlock();
+		di_args->path[sizeof(di_args->path) - 1] = 0;
+	} else {
+		di_args->path[0] = '\0';
+	}
 
 out:
+	mutex_unlock(&fs_devices->device_list_mutex);
 	if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
 		ret = -EFAULT;
 
@@ -2159,136 +2529,350 @@
 	return ret;
 }
 
-static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
-				       u64 off, u64 olen, u64 destoff)
+static struct page *extent_same_get_page(struct inode *inode, u64 off)
+{
+	struct page *page;
+	pgoff_t index;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+
+	index = off >> PAGE_CACHE_SHIFT;
+
+	page = grab_cache_page(inode->i_mapping, index);
+	if (!page)
+		return NULL;
+
+	if (!PageUptodate(page)) {
+		if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
+						 0))
+			return NULL;
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			unlock_page(page);
+			page_cache_release(page);
+			return NULL;
+		}
+	}
+	unlock_page(page);
+
+	return page;
+}
+
+static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
+{
+	/* do any pending delalloc/csum calc on src, one way or
+	   another, and lock file content */
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+		lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
+		ordered = btrfs_lookup_first_ordered_extent(inode,
+							    off + len - 1);
+		if (!ordered &&
+		    !test_range_bit(&BTRFS_I(inode)->io_tree, off,
+				    off + len - 1, EXTENT_DELALLOC, 0, NULL))
+			break;
+		unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		btrfs_wait_ordered_range(inode, off, len);
+	}
+}
+
+static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
+				struct inode *inode2, u64 loff2, u64 len)
+{
+	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+
+	mutex_unlock(&inode1->i_mutex);
+	mutex_unlock(&inode2->i_mutex);
+}
+
+static void btrfs_double_lock(struct inode *inode1, u64 loff1,
+			      struct inode *inode2, u64 loff2, u64 len)
+{
+	if (inode1 < inode2) {
+		swap(inode1, inode2);
+		swap(loff1, loff2);
+	}
+
+	mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+	lock_extent_range(inode1, loff1, len);
+	if (inode1 != inode2) {
+		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+		lock_extent_range(inode2, loff2, len);
+	}
+}
+
+static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
+			  u64 dst_loff, u64 len)
+{
+	int ret = 0;
+	struct page *src_page, *dst_page;
+	unsigned int cmp_len = PAGE_CACHE_SIZE;
+	void *addr, *dst_addr;
+
+	while (len) {
+		if (len < PAGE_CACHE_SIZE)
+			cmp_len = len;
+
+		src_page = extent_same_get_page(src, loff);
+		if (!src_page)
+			return -EINVAL;
+		dst_page = extent_same_get_page(dst, dst_loff);
+		if (!dst_page) {
+			page_cache_release(src_page);
+			return -EINVAL;
+		}
+		addr = kmap_atomic(src_page);
+		dst_addr = kmap_atomic(dst_page);
+
+		flush_dcache_page(src_page);
+		flush_dcache_page(dst_page);
+
+		if (memcmp(addr, dst_addr, cmp_len))
+			ret = BTRFS_SAME_DATA_DIFFERS;
+
+		kunmap_atomic(addr);
+		kunmap_atomic(dst_addr);
+		page_cache_release(src_page);
+		page_cache_release(dst_page);
+
+		if (ret)
+			break;
+
+		loff += cmp_len;
+		dst_loff += cmp_len;
+		len -= cmp_len;
+	}
+
+	return ret;
+}
+
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+{
+	u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
+
+	if (off + len > inode->i_size || off + len < off)
+		return -EINVAL;
+	/* Check that we are block aligned - btrfs_clone() requires this */
+	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+			     struct inode *dst, u64 dst_loff)
 {
-	struct inode *inode = fdentry(file)->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct file *src_file;
-	struct inode *src;
-	struct btrfs_trans_handle *trans;
-	struct btrfs_path *path;
-	struct extent_buffer *leaf;
-	char *buf;
-	struct btrfs_key key;
-	u32 nritems;
-	int slot;
 	int ret;
-	u64 len = olen;
-	u64 bs = root->fs_info->sb->s_blocksize;
-	u64 hint_byte;
 
 	/*
-	 * TODO:
-	 * - split compressed inline extents.  annoying: we need to
-	 *   decompress into destination's address_space (the file offset
-	 *   may change, so source mapping won't do), then recompress (or
-	 *   otherwise reinsert) a subrange.
-	 * - allow ranges within the same file to be cloned (provided
-	 *   they don't overlap)?
+	 * btrfs_clone() can't handle extents in the same file
+	 * yet. Once that works, we can drop this check and replace it
+	 * with a check for the same inode, but overlapping extents.
 	 */
-
-	/* the destination must be opened for writing */
-	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
+	if (src == dst)
 		return -EINVAL;
 
-	if (btrfs_root_readonly(root))
-		return -EROFS;
+	btrfs_double_lock(src, loff, dst, dst_loff, len);
+
+	ret = extent_same_check_offsets(src, loff, len);
+	if (ret)
+		goto out_unlock;
+
+	ret = extent_same_check_offsets(dst, dst_loff, len);
+	if (ret)
+		goto out_unlock;
+
+	/* don't make the dst file partly checksummed */
+	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+	    (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
+	if (ret == 0)
+		ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+
+out_unlock:
+	btrfs_double_unlock(src, loff, dst, dst_loff, len);
+
+	return ret;
+}
+
+#define BTRFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
+
+static long btrfs_ioctl_file_extent_same(struct file *file,
+					 void __user *argp)
+{
+	struct btrfs_ioctl_same_args tmp;
+	struct btrfs_ioctl_same_args *same;
+	struct btrfs_ioctl_same_extent_info *info;
+	struct inode *src = file->f_dentry->d_inode;
+	struct file *dst_file = NULL;
+	struct inode *dst;
+	u64 off;
+	u64 len;
+	int i;
+	int ret;
+	unsigned long size;
+	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+	bool is_admin = capable(CAP_SYS_ADMIN);
+
+	if (!(file->f_mode & FMODE_READ))
+		return -EINVAL;
 
 	ret = mnt_want_write_file(file);
 	if (ret)
 		return ret;
 
-	src_file = fget(srcfd);
-	if (!src_file) {
-		ret = -EBADF;
-		goto out_drop_write;
+	if (copy_from_user(&tmp,
+			   (struct btrfs_ioctl_same_args __user *)argp,
+			   sizeof(tmp))) {
+		ret = -EFAULT;
+		goto out;
 	}
 
-	src = src_file->f_dentry->d_inode;
+	size = sizeof(tmp) +
+		tmp.dest_count * sizeof(struct btrfs_ioctl_same_extent_info);
 
-	ret = -EINVAL;
-	if (src == inode)
-		goto out_fput;
+	same = kmalloc(size, GFP_NOFS);
+	if (!same) {
+		ret = -EFAULT;
+		goto out;
+	}
 
-	/* the src must be open for reading */
-	if (!(src_file->f_mode & FMODE_READ))
-		goto out_fput;
+	if (copy_from_user(same,
+			   (struct btrfs_ioctl_same_args __user *)argp, size)) {
+		ret = -EFAULT;
+		goto out;
+	}
 
-	/* don't make the dst file partly checksummed */
-	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
-	    (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
-		goto out_fput;
+	off = same->logical_offset;
+	len = same->length;
 
-	ret = -EISDIR;
-	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
-		goto out_fput;
+	/*
+	 * Limit the total length we will dedupe for each operation.
+	 * This is intended to bound the total time spent in this
+	 * ioctl to something sane.
+	 */
+	if (len > BTRFS_MAX_DEDUPE_LEN)
+		len = BTRFS_MAX_DEDUPE_LEN;
 
-	ret = -EXDEV;
-	if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root)
-		goto out_fput;
+	if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
+		/*
+		 * Btrfs does not support blocksize < page_size. As a
+		 * result, btrfs_cmp_data() won't correctly handle
+		 * this situation without an update.
+		 */
+		ret = -EINVAL;
+		goto out;
+	}
 
-	ret = -ENOMEM;
-	buf = vmalloc(btrfs_level_size(root, 0));
-	if (!buf)
-		goto out_fput;
+	ret = -EISDIR;
+	if (S_ISDIR(src->i_mode))
+		goto out;
 
-	path = btrfs_alloc_path();
-	if (!path) {
-		vfree(buf);
-		goto out_fput;
-	}
-	path->reada = 2;
+	ret = -EACCES;
+	if (!S_ISREG(src->i_mode))
+		goto out;
 
-	if (inode < src) {
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
-	} else {
-		mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+	/* pre-format output fields to sane values */
+	for (i = 0; i < same->dest_count; i++) {
+		same->info[i].bytes_deduped = 0ULL;
+		same->info[i].status = 0;
 	}
 
-	/* determine range to clone */
-	ret = -EINVAL;
-	if (off + len > src->i_size || off + len < off)
-		goto out_unlock;
-	if (len == 0)
-		olen = len = src->i_size - off;
-	/* if we extend to eof, continue to block boundary */
-	if (off + len == src->i_size)
-		len = ALIGN(src->i_size, bs) - off;
+	ret = 0;
+	for (i = 0; i < same->dest_count; i++) {
+		info = &same->info[i];
 
-	/* verify the end result is block aligned */
-	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
-	    !IS_ALIGNED(destoff, bs))
-		goto out_unlock;
+		dst_file = fget(info->fd);
+		if (!dst_file) {
+			info->status = -EBADF;
+			goto next;
+		}
 
-	if (destoff > inode->i_size) {
-		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
-		if (ret)
-			goto out_unlock;
+		if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
+			info->status = -EINVAL;
+			goto next;
+		}
+
+		info->status = -EXDEV;
+		if (file->f_path.mnt != dst_file->f_path.mnt)
+			goto next;
+
+		dst = dst_file->f_dentry->d_inode;
+		if (src->i_sb != dst->i_sb)
+			goto next;
+
+		if (S_ISDIR(dst->i_mode)) {
+			info->status = -EISDIR;
+			goto next;
+		}
+
+		if (!S_ISREG(dst->i_mode)) {
+			info->status = -EACCES;
+			goto next;
+		}
+
+		info->status = btrfs_extent_same(src, off, len, dst,
+						info->logical_offset);
+		if (info->status == 0)
+			info->bytes_deduped += len;
+
+next:
+		if (dst_file)
+			fput(dst_file);
 	}
 
-	/* truncate page cache pages from target inode range */
-	truncate_inode_pages_range(&inode->i_data, destoff,
-				   PAGE_CACHE_ALIGN(destoff + len) - 1);
+	ret = copy_to_user(argp, same, size);
+	if (ret)
+		ret = -EFAULT;
 
-	/* do any pending delalloc/csum calc on src, one way or
-	   another, and lock file content */
-	while (1) {
-		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
-		ordered = btrfs_lookup_first_ordered_extent(src, off+len);
-		if (!ordered &&
-		    !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
-				   EXTENT_DELALLOC, 0, NULL))
-			break;
-		unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
-		if (ordered)
-			btrfs_put_ordered_extent(ordered);
-		btrfs_wait_ordered_range(src, off, len);
+out:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+/**
+ * btrfs_clone() - clone a range from one inode to another
+ *
+ * @src: Inode to clone from
+ * @inode: Inode to clone to
+ * @off: Offset within source to start clone from
+ * @olen: Original length, passed by user, of range to clone
+ * @olen_aligned: Block-aligned value of olen; extent_same uses
+ *               identical values here
+ * @destoff: Offset within @inode to start clone
+ */
+static int btrfs_clone(struct inode *src, struct inode *inode,
+		       u64 off, u64 olen, u64 olen_aligned, u64 destoff)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *leaf;
+	struct btrfs_trans_handle *trans;
+	char *buf = NULL;
+	struct btrfs_key key;
+	u32 nritems;
+	int slot;
+	int ret;
+	u64 len = olen_aligned;
+
+	ret = -ENOMEM;
+	buf = vmalloc(btrfs_level_size(root, 0));
+	if (!buf)
+		return ret;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		vfree(buf);
+		return ret;
 	}
 
+	path->reada = 2;
 	/* clone data */
 	key.objectid = btrfs_ino(src);
 	key.type = BTRFS_EXTENT_DATA_KEY;
@@ -2299,13 +2883,14 @@
 		 * note the key will change type as we walk through the
 		 * tree.
 		 */
-		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
+				0, 0);
 		if (ret < 0)
 			goto out;
 
 		nritems = btrfs_header_nritems(path->nodes[0]);
 		if (path->slots[0] >= nritems) {
-			ret = btrfs_next_leaf(root, path);
+			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
 			if (ret < 0)
 				goto out;
 			if (ret > 0)
@@ -2356,7 +2941,7 @@
 			btrfs_release_path(path);
 
 			if (key.offset + datal <= off ||
-			    key.offset >= off+len)
+			    key.offset >= off + len - 1)
 				goto next;
 
 			memcpy(&new_key, &key, sizeof(new_key));
@@ -2394,15 +2979,25 @@
 					datal -= off - key.offset;
 				}
 
-				ret = btrfs_drop_extents(trans, inode,
+				ret = btrfs_drop_extents(trans, root, inode,
 							 new_key.offset,
 							 new_key.offset + datal,
-							 &hint_byte, 1);
-				BUG_ON(ret);
+							 1);
+				if (ret) {
+					btrfs_abort_transaction(trans, root,
+								ret);
+					btrfs_end_transaction(trans, root);
+					goto out;
+				}
 
 				ret = btrfs_insert_empty_item(trans, root, path,
 							      &new_key, size);
-				BUG_ON(ret);
+				if (ret) {
+					btrfs_abort_transaction(trans, root,
+								ret);
+					btrfs_end_transaction(trans, root);
+					goto out;
+				}
 
 				leaf = path->nodes[0];
 				slot = path->slots[0];
@@ -2427,8 +3022,17 @@
 							disko, diskl, 0,
 							root->root_key.objectid,
 							btrfs_ino(inode),
-							new_key.offset - datao);
-					BUG_ON(ret);
+							new_key.offset - datao,
+							0);
+					if (ret) {
+						btrfs_abort_transaction(trans,
+									root,
+									ret);
+						btrfs_end_transaction(trans,
+								      root);
+						goto out;
+
+					}
 				}
 			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
 				u64 skip = 0;
@@ -2438,8 +3042,8 @@
 					new_key.offset += skip;
 				}
 
-				if (key.offset + datal > off+len)
-					trim = key.offset + datal - (off+len);
+				if (key.offset + datal > off + len)
+					trim = key.offset + datal - (off + len);
 
 				if (comp && (skip || trim)) {
 					ret = -EINVAL;
@@ -2449,15 +3053,25 @@
 				size -= skip + trim;
 				datal -= skip + trim;
 
-				ret = btrfs_drop_extents(trans, inode,
+				ret = btrfs_drop_extents(trans, root, inode,
 							 new_key.offset,
 							 new_key.offset + datal,
-							 &hint_byte, 1);
-				BUG_ON(ret);
+							 1);
+				if (ret) {
+					btrfs_abort_transaction(trans, root,
+								ret);
+					btrfs_end_transaction(trans, root);
+					goto out;
+				}
 
 				ret = btrfs_insert_empty_item(trans, root, path,
 							      &new_key, size);
-				BUG_ON(ret);
+				if (ret) {
+					btrfs_abort_transaction(trans, root,
+								ret);
+					btrfs_end_transaction(trans, root);
+					goto out;
+				}
 
 				if (skip) {
 					u32 start =
@@ -2477,6 +3091,7 @@
 			btrfs_mark_buffer_dirty(leaf);
 			btrfs_release_path(path);
 
+			inode_inc_iversion(inode);
 			inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
 			/*
@@ -2491,24 +3106,146 @@
 				btrfs_i_size_write(inode, endoff);
 
 			ret = btrfs_update_inode(trans, root, inode);
-			BUG_ON(ret);
-			btrfs_end_transaction(trans, root);
+			if (ret) {
+				btrfs_abort_transaction(trans, root, ret);
+				btrfs_end_transaction(trans, root);
+				goto out;
+			}
+			ret = btrfs_end_transaction(trans, root);
 		}
 next:
 		btrfs_release_path(path);
 		key.offset++;
 	}
 	ret = 0;
+
 out:
 	btrfs_release_path(path);
-	unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
+	btrfs_free_path(path);
+	vfree(buf);
+	return ret;
+}
+
+static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+				       u64 off, u64 olen, u64 destoff)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct file *src_file;
+	struct inode *src;
+	int ret, fput_needed;
+	u64 len = olen;
+	u64 bs = root->fs_info->sb->s_blocksize;
+	int same_inode = 0;
+
+	/*
+	 * TODO:
+	 * - split compressed inline extents.  annoying: we need to
+	 *   decompress into destination's address_space (the file offset
+	 *   may change, so source mapping won't do), then recompress (or
+	 *   otherwise reinsert) a subrange.
+	 * - allow ranges within the same file to be cloned (provided
+	 *   they don't overlap)?
+	 */
+
+	/* the destination must be opened for writing */
+	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
+		return -EINVAL;
+
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	src_file = fget_light(srcfd, &fput_needed);
+	if (!src_file) {
+		ret = -EBADF;
+		goto out_drop_write;
+	}
+
+	ret = -EXDEV;
+	if (src_file->f_path.mnt != file->f_path.mnt)
+		goto out_fput;
+
+	src = src_file->f_dentry->d_inode;
+
+	ret = -EINVAL;
+	if (src == inode)
+		same_inode = 1;
+
+	/* the src must be open for reading */
+	if (!(src_file->f_mode & FMODE_READ))
+		goto out_fput;
+
+	/* don't make the dst file partly checksummed */
+	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+	    (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+		goto out_fput;
+
+	ret = -EISDIR;
+	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
+		goto out_fput;
+
+	ret = -EXDEV;
+	if (src->i_sb != inode->i_sb)
+		goto out_fput;
+
+	if (!same_inode) {
+		if (inode < src) {
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+			mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+		} else {
+			mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		}
+	} else {
+		mutex_lock(&src->i_mutex);
+	}
+
+	/* determine range to clone */
+	ret = -EINVAL;
+	if (off + len > src->i_size || off + len < off)
+		goto out_unlock;
+	if (len == 0)
+		olen = len = src->i_size - off;
+	/* if we extend to eof, continue to block boundary */
+	if (off + len == src->i_size)
+		len = ALIGN(src->i_size, bs) - off;
+
+	/* verify the end result is block aligned */
+	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
+	    !IS_ALIGNED(destoff, bs))
+		goto out_unlock;
+
+	/* verify if ranges are overlapped within the same file */
+	if (same_inode) {
+		if (destoff + len > off && destoff < off + len)
+			goto out_unlock;
+	}
+
+	if (destoff > inode->i_size) {
+		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
+		if (ret)
+			goto out_unlock;
+	}
+
+	/* truncate page cache pages from target inode range */
+	truncate_inode_pages_range(&inode->i_data, destoff,
+				   PAGE_CACHE_ALIGN(destoff + len) - 1);
+
+	lock_extent_range(src, off, len);
+
+	ret = btrfs_clone(src, inode, off, olen, len, destoff);
+
+	unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
 out_unlock:
 	mutex_unlock(&src->i_mutex);
-	mutex_unlock(&inode->i_mutex);
-	vfree(buf);
-	btrfs_free_path(path);
+	if (!same_inode)
+		mutex_unlock(&inode->i_mutex);
 out_fput:
-	fput(src_file);
+	fput_light(src_file, fput_needed);
 out_drop_write:
 	mnt_drop_write_file(file);
 	return ret;
@@ -2580,40 +3317,47 @@
 	struct btrfs_path *path;
 	struct btrfs_key location;
 	struct btrfs_disk_key disk_key;
-	struct btrfs_super_block *disk_super;
-	u64 features;
 	u64 objectid = 0;
 	u64 dir_id;
+	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (copy_from_user(&objectid, argp, sizeof(objectid)))
-		return -EFAULT;
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	if (copy_from_user(&objectid, argp, sizeof(objectid))) {
+		ret = -EFAULT;
+		goto out;
+	}
 
 	if (!objectid)
-		objectid = root->root_key.objectid;
+		objectid = BTRFS_FS_TREE_OBJECTID;
 
 	location.objectid = objectid;
 	location.type = BTRFS_ROOT_ITEM_KEY;
 	location.offset = (u64)-1;
 
 	new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
-	if (IS_ERR(new_root))
-		return PTR_ERR(new_root);
-
-	if (btrfs_root_refs(&new_root->root_item) == 0)
-		return -ENOENT;
+	if (IS_ERR(new_root)) {
+		ret = PTR_ERR(new_root);
+		goto out;
+	}
 
 	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	path->leave_spinning = 1;
 
 	trans = btrfs_start_transaction(root, 1);
 	if (IS_ERR(trans)) {
 		btrfs_free_path(path);
-		return PTR_ERR(trans);
+		ret = PTR_ERR(trans);
+		goto out;
 	}
 
 	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
@@ -2624,7 +3368,8 @@
 		btrfs_end_transaction(trans, root);
 		printk(KERN_ERR "Umm, you don't have the default dir item, "
 		       "this isn't going to work\n");
-		return -ENOENT;
+		ret = -ENOENT;
+		goto out;
 	}
 
 	btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
@@ -2632,19 +3377,15 @@
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 
-	disk_super = root->fs_info->super_copy;
-	features = btrfs_super_incompat_flags(disk_super);
-	if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
-		features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
-		btrfs_set_super_incompat_flags(disk_super, features);
-	}
+	btrfs_set_fs_incompat(root->fs_info, DEFAULT_SUBVOL);
 	btrfs_end_transaction(trans, root);
-
-	return 0;
+out:
+	mnt_drop_write_file(file);
+	return ret;
 }
 
-static void get_block_group_info(struct list_head *groups_list,
-				 struct btrfs_ioctl_space_info *space)
+void btrfs_get_block_group_info(struct list_head *groups_list,
+				struct btrfs_ioctl_space_info *space)
 {
 	struct btrfs_block_group_cache *block_group;
 
@@ -2659,7 +3400,7 @@
 	}
 }
 
-long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 {
 	struct btrfs_ioctl_space_args space_args;
 	struct btrfs_ioctl_space_info space;
@@ -2752,8 +3493,8 @@
 		down_read(&info->groups_sem);
 		for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
 			if (!list_empty(&info->block_groups[c])) {
-				get_block_group_info(&info->block_groups[c],
-						     &space);
+				btrfs_get_block_group_info(
+					&info->block_groups[c], &space);
 				memcpy(dest, &space, sizeof(space));
 				dest++;
 				space_args.total_spaces++;
@@ -2765,7 +3506,7 @@
 		up_read(&info->groups_sem);
 	}
 
-	user_dest = (struct btrfs_ioctl_space_info *)
+	user_dest = (struct btrfs_ioctl_space_info __user *)
 		(arg + sizeof(struct btrfs_ioctl_space_args));
 
 	if (copy_to_user(user_dest, dest_orig, alloc_size))
@@ -2804,32 +3545,38 @@
 	return 0;
 }
 
-static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
+static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
+					    void __user *argp)
 {
-	struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
 	struct btrfs_trans_handle *trans;
 	u64 transid;
 	int ret;
 
-	trans = btrfs_start_transaction(root, 0);
-	if (IS_ERR(trans))
-		return PTR_ERR(trans);
+	trans = btrfs_attach_transaction_barrier(root);
+	if (IS_ERR(trans)) {
+		if (PTR_ERR(trans) != -ENOENT)
+			return PTR_ERR(trans);
+
+		/* No running transaction, don't bother */
+		transid = root->fs_info->last_trans_committed;
+		goto out;
+	}
 	transid = trans->transid;
 	ret = btrfs_commit_transaction_async(trans, root, 0);
 	if (ret) {
 		btrfs_end_transaction(trans, root);
 		return ret;
 	}
-
+out:
 	if (argp)
 		if (copy_to_user(argp, &transid, sizeof(transid)))
 			return -EFAULT;
 	return 0;
 }
 
-static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
+static noinline long btrfs_ioctl_wait_sync(struct btrfs_root *root,
+					   void __user *argp)
 {
-	struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
 	u64 transid;
 
 	if (argp) {
@@ -2841,10 +3588,11 @@
 	return btrfs_wait_for_commit(root, transid);
 }
 
-static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
+static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
 {
-	int ret;
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
 	struct btrfs_ioctl_scrub_args *sa;
+	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -2853,12 +3601,22 @@
 	if (IS_ERR(sa))
 		return PTR_ERR(sa);
 
-	ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end,
-			      &sa->progress, sa->flags & BTRFS_SCRUB_READONLY);
+	if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
+		ret = mnt_want_write_file(file);
+		if (ret)
+			goto out;
+	}
+
+	ret = btrfs_scrub_dev(root->fs_info, sa->devid, sa->start, sa->end,
+			      &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
+			      0);
 
 	if (copy_to_user(arg, sa, sizeof(*sa)))
 		ret = -EFAULT;
 
+	if (!(sa->flags & BTRFS_SCRUB_READONLY))
+		mnt_drop_write_file(file);
+out:
 	kfree(sa);
 	return ret;
 }
@@ -2868,7 +3626,7 @@
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	return btrfs_scrub_cancel(root);
+	return btrfs_scrub_cancel(root->fs_info);
 }
 
 static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
@@ -2893,6 +3651,77 @@
 	return ret;
 }
 
+static long btrfs_ioctl_get_dev_stats(struct btrfs_root *root,
+				      void __user *arg)
+{
+	struct btrfs_ioctl_get_dev_stats *sa;
+	int ret;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa))
+		return PTR_ERR(sa);
+
+	if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
+		kfree(sa);
+		return -EPERM;
+	}
+
+	ret = btrfs_get_dev_stats(root, sa);
+
+	if (copy_to_user(arg, sa, sizeof(*sa)))
+		ret = -EFAULT;
+
+	kfree(sa);
+	return ret;
+}
+
+static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
+{
+	struct btrfs_ioctl_dev_replace_args *p;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	p = memdup_user(arg, sizeof(*p));
+	if (IS_ERR(p))
+		return PTR_ERR(p);
+
+	switch (p->cmd) {
+	case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
+		if (root->fs_info->sb->s_flags & MS_RDONLY)
+			return -EROFS;
+
+		if (atomic_xchg(
+			&root->fs_info->mutually_exclusive_operation_running,
+			1)) {
+			ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+		} else {
+			ret = btrfs_dev_replace_start(root, p);
+			atomic_set(
+			 &root->fs_info->mutually_exclusive_operation_running,
+			 0);
+		}
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
+		btrfs_dev_replace_status(root->fs_info, p);
+		ret = 0;
+		break;
+	case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
+		ret = btrfs_dev_replace_cancel(root->fs_info, p);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	if (copy_to_user(arg, p, sizeof(*p)))
+		ret = -EFAULT;
+
+	kfree(p);
+	return ret;
+}
+
 static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
 {
 	int ret = 0;
@@ -2903,7 +3732,7 @@
 	struct inode_fs_paths *ipath = NULL;
 	struct btrfs_path *path;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_DAC_READ_SEARCH))
 		return -EPERM;
 
 	path = btrfs_alloc_path();
@@ -2977,11 +3806,9 @@
 {
 	int ret = 0;
 	int size;
-	u64 extent_offset;
 	struct btrfs_ioctl_logical_ino_args *loi;
 	struct btrfs_data_container *inodes = NULL;
 	struct btrfs_path *path = NULL;
-	struct btrfs_key key;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -2999,7 +3826,7 @@
 		goto out;
 	}
 
-	size = min_t(u32, loi->size, 4096);
+	size = min_t(u32, loi->size, 64 * 1024);
 	inodes = init_data_container(size);
 	if (IS_ERR(inodes)) {
 		ret = PTR_ERR(inodes);
@@ -3007,20 +3834,13 @@
 		goto out;
 	}
 
-	ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
-
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+	ret = iterate_inodes_from_logical(loi->logical, root->fs_info, path,
+					  build_ino_list, inodes);
+	if (ret == -EINVAL)
 		ret = -ENOENT;
 	if (ret < 0)
 		goto out;
 
-	extent_offset = loi->logical - key.objectid;
-	ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
-					extent_offset, build_ino_list, inodes);
-
-	if (ret < 0)
-		goto out;
-
 	ret = copy_to_user((void *)(unsigned long)loi->inodes,
 			   (void *)(unsigned long)inodes, size);
 	if (ret)
@@ -3028,12 +3848,660 @@
 
 out:
 	btrfs_free_path(path);
-	kfree(inodes);
+	vfree(inodes);
 	kfree(loi);
 
 	return ret;
 }
 
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+			       struct btrfs_ioctl_balance_args *bargs)
+{
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+	bargs->flags = bctl->flags;
+
+	if (atomic_read(&fs_info->balance_running))
+		bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
+	if (atomic_read(&fs_info->balance_pause_req))
+		bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
+	if (atomic_read(&fs_info->balance_cancel_req))
+		bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
+
+	memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
+	memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
+	memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
+
+	if (lock) {
+		spin_lock(&fs_info->balance_lock);
+		memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+		spin_unlock(&fs_info->balance_lock);
+	} else {
+		memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
+	}
+}
+
+static long btrfs_ioctl_balance(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_ioctl_balance_args *bargs;
+	struct btrfs_balance_control *bctl;
+	bool need_unlock; /* for mut. excl. ops lock */
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+again:
+	if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) {
+		mutex_lock(&fs_info->volume_mutex);
+		mutex_lock(&fs_info->balance_mutex);
+		need_unlock = true;
+		goto locked;
+	}
+
+	/*
+	 * mut. excl. ops lock is locked.  Three possibilities:
+	 *   (1) some other op is running
+	 *   (2) balance is running
+	 *   (3) balance is paused -- special case (think resume)
+	 */
+	mutex_lock(&fs_info->balance_mutex);
+	if (fs_info->balance_ctl) {
+		/* this is either (2) or (3) */
+		if (!atomic_read(&fs_info->balance_running)) {
+			mutex_unlock(&fs_info->balance_mutex);
+			if (!mutex_trylock(&fs_info->volume_mutex))
+				goto again;
+			mutex_lock(&fs_info->balance_mutex);
+
+			if (fs_info->balance_ctl &&
+			    !atomic_read(&fs_info->balance_running)) {
+				/* this is (3) */
+				need_unlock = false;
+				goto locked;
+			}
+
+			mutex_unlock(&fs_info->balance_mutex);
+			mutex_unlock(&fs_info->volume_mutex);
+			goto again;
+		} else {
+			/* this is (2) */
+			mutex_unlock(&fs_info->balance_mutex);
+			ret = -EINPROGRESS;
+			goto out;
+		}
+	} else {
+		/* this is (1) */
+		mutex_unlock(&fs_info->balance_mutex);
+		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
+		goto out;
+	}
+
+locked:
+	BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running));
+
+	if (arg) {
+		bargs = memdup_user(arg, sizeof(*bargs));
+		if (IS_ERR(bargs)) {
+			ret = PTR_ERR(bargs);
+			goto out_unlock;
+		}
+
+		if (bargs->flags & BTRFS_BALANCE_RESUME) {
+			if (!fs_info->balance_ctl) {
+				ret = -ENOTCONN;
+				goto out_bargs;
+			}
+
+			bctl = fs_info->balance_ctl;
+			spin_lock(&fs_info->balance_lock);
+			bctl->flags |= BTRFS_BALANCE_RESUME;
+			spin_unlock(&fs_info->balance_lock);
+
+			goto do_balance;
+		}
+	} else {
+		bargs = NULL;
+	}
+
+	if (fs_info->balance_ctl) {
+		ret = -EINPROGRESS;
+		goto out_bargs;
+	}
+
+	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+	if (!bctl) {
+		ret = -ENOMEM;
+		goto out_bargs;
+	}
+
+	bctl->fs_info = fs_info;
+	if (arg) {
+		memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
+		memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
+		memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
+
+		bctl->flags = bargs->flags;
+	} else {
+		/* balance everything - no filters */
+		bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
+	}
+
+do_balance:
+	/*
+	 * Ownership of bctl and mutually_exclusive_operation_running
+	 * goes to btrfs_balance.  bctl is freed in __cancel_balance,
+	 * or, if restriper was paused all the way until unmount, in
+	 * free_fs_info.  mutually_exclusive_operation_running is
+	 * cleared in __cancel_balance.
+	 */
+	need_unlock = false;
+
+	ret = btrfs_balance(bctl, bargs);
+
+	if (arg) {
+		if (copy_to_user(arg, bargs, sizeof(*bargs)))
+			ret = -EFAULT;
+	}
+
+out_bargs:
+	kfree(bargs);
+out_unlock:
+	mutex_unlock(&fs_info->balance_mutex);
+	mutex_unlock(&fs_info->volume_mutex);
+	if (need_unlock)
+		atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+out:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_balance_ctl(struct btrfs_root *root, int cmd)
+{
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case BTRFS_BALANCE_CTL_PAUSE:
+		return btrfs_pause_balance(root->fs_info);
+	case BTRFS_BALANCE_CTL_CANCEL:
+		return btrfs_cancel_balance(root->fs_info);
+	}
+
+	return -EINVAL;
+}
+
+static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
+					 void __user *arg)
+{
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_ioctl_balance_args *bargs;
+	int ret = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	mutex_lock(&fs_info->balance_mutex);
+	if (!fs_info->balance_ctl) {
+		ret = -ENOTCONN;
+		goto out;
+	}
+
+	bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+	if (!bargs) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	update_ioctl_balance_args(fs_info, 1, bargs);
+
+	if (copy_to_user(arg, bargs, sizeof(*bargs)))
+		ret = -EFAULT;
+
+	kfree(bargs);
+out:
+	mutex_unlock(&fs_info->balance_mutex);
+	return ret;
+}
+
+static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_ioctl_quota_ctl_args *sa;
+	struct btrfs_trans_handle *trans = NULL;
+	int ret;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa)) {
+		ret = PTR_ERR(sa);
+		goto drop_write;
+	}
+
+	down_write(&root->fs_info->subvol_sem);
+	trans = btrfs_start_transaction(root->fs_info->tree_root, 2);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	switch (sa->cmd) {
+	case BTRFS_QUOTA_CTL_ENABLE:
+		ret = btrfs_quota_enable(trans, root->fs_info);
+		break;
+	case BTRFS_QUOTA_CTL_DISABLE:
+		ret = btrfs_quota_disable(trans, root->fs_info);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+
+	err = btrfs_commit_transaction(trans, root->fs_info->tree_root);
+	if (err && !ret)
+		ret = err;
+out:
+	kfree(sa);
+	up_write(&root->fs_info->subvol_sem);
+drop_write:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_ioctl_qgroup_assign_args *sa;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa)) {
+		ret = PTR_ERR(sa);
+		goto drop_write;
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	/* FIXME: check if the IDs really exist */
+	if (sa->assign) {
+		ret = btrfs_add_qgroup_relation(trans, root->fs_info,
+						sa->src, sa->dst);
+	} else {
+		ret = btrfs_del_qgroup_relation(trans, root->fs_info,
+						sa->src, sa->dst);
+	}
+
+	err = btrfs_end_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+
+out:
+	kfree(sa);
+drop_write:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_ioctl_qgroup_create_args *sa;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa)) {
+		ret = PTR_ERR(sa);
+		goto drop_write;
+	}
+
+	if (!sa->qgroupid) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	/* FIXME: check if the IDs really exist */
+	if (sa->create) {
+		ret = btrfs_create_qgroup(trans, root->fs_info, sa->qgroupid,
+					  NULL);
+	} else {
+		ret = btrfs_remove_qgroup(trans, root->fs_info, sa->qgroupid);
+	}
+
+	err = btrfs_end_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+
+out:
+	kfree(sa);
+drop_write:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_ioctl_qgroup_limit_args *sa;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int err;
+	u64 qgroupid;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa)) {
+		ret = PTR_ERR(sa);
+		goto drop_write;
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	qgroupid = sa->qgroupid;
+	if (!qgroupid) {
+		/* take the current subvol as qgroup */
+		qgroupid = root->root_key.objectid;
+	}
+
+	/* FIXME: check if the IDs really exist */
+	ret = btrfs_limit_qgroup(trans, root->fs_info, qgroupid, &sa->lim);
+
+	err = btrfs_end_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+
+out:
+	kfree(sa);
+drop_write:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_ioctl_quota_rescan_args *qsa;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	qsa = memdup_user(arg, sizeof(*qsa));
+	if (IS_ERR(qsa)) {
+		ret = PTR_ERR(qsa);
+		goto drop_write;
+	}
+
+	if (qsa->flags) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = btrfs_qgroup_rescan(root->fs_info);
+
+out:
+	kfree(qsa);
+drop_write:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_ioctl_quota_rescan_args *qsa;
+	int ret = 0;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	qsa = kzalloc(sizeof(*qsa), GFP_NOFS);
+	if (!qsa)
+		return -ENOMEM;
+
+	if (root->fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+		qsa->flags = 1;
+		qsa->progress = root->fs_info->qgroup_rescan_progress.objectid;
+	}
+
+	if (copy_to_user(arg, qsa, sizeof(*qsa)))
+		ret = -EFAULT;
+
+	kfree(qsa);
+	return ret;
+}
+
+static long btrfs_ioctl_quota_rescan_wait(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return btrfs_qgroup_wait_for_completion(root->fs_info);
+}
+
+static long btrfs_ioctl_set_received_subvol(struct file *file,
+					    void __user *arg)
+{
+	struct btrfs_ioctl_received_subvol_args *sa = NULL;
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_root_item *root_item = &root->root_item;
+	struct btrfs_trans_handle *trans;
+	struct timespec ct = CURRENT_TIME;
+	int ret = 0;
+	int received_uuid_changed;
+
+	ret = mnt_want_write_file(file);
+	if (ret < 0)
+		return ret;
+
+	down_write(&root->fs_info->subvol_sem);
+
+	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (btrfs_root_readonly(root)) {
+		ret = -EROFS;
+		goto out;
+	}
+
+	if (!inode_owner_or_capable(inode)) {
+		ret = -EACCES;
+		goto out;
+	}
+
+	sa = memdup_user(arg, sizeof(*sa));
+	if (IS_ERR(sa)) {
+		ret = PTR_ERR(sa);
+		sa = NULL;
+		goto out;
+	}
+
+	/*
+	 * 1 - root item
+	 * 2 - uuid items (received uuid + subvol uuid)
+	 */
+	trans = btrfs_start_transaction(root, 3);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		trans = NULL;
+		goto out;
+	}
+
+	sa->rtransid = trans->transid;
+	sa->rtime.sec = ct.tv_sec;
+	sa->rtime.nsec = ct.tv_nsec;
+
+	received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
+				       BTRFS_UUID_SIZE);
+	if (received_uuid_changed &&
+	    !btrfs_is_empty_uuid(root_item->received_uuid))
+		btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+				    root_item->received_uuid,
+				    BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+				    root->root_key.objectid);
+	memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
+	btrfs_set_root_stransid(root_item, sa->stransid);
+	btrfs_set_root_rtransid(root_item, sa->rtransid);
+	btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
+	btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
+	btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
+	btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
+
+	ret = btrfs_update_root(trans, root->fs_info->tree_root,
+				&root->root_key, &root->root_item);
+	if (ret < 0) {
+		btrfs_end_transaction(trans, root);
+		goto out;
+	}
+	if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
+		ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
+					  sa->uuid,
+					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+					  root->root_key.objectid);
+		if (ret < 0 && ret != -EEXIST) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
+	}
+	ret = btrfs_commit_transaction(trans, root);
+	if (ret < 0) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
+
+	ret = copy_to_user(arg, sa, sizeof(*sa));
+	if (ret)
+		ret = -EFAULT;
+
+out:
+	kfree(sa);
+	up_write(&root->fs_info->subvol_sem);
+	mnt_drop_write_file(file);
+	return ret;
+}
+
+static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	size_t len;
+	int ret;
+	char label[BTRFS_LABEL_SIZE];
+
+	spin_lock(&root->fs_info->super_lock);
+	memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE);
+	spin_unlock(&root->fs_info->super_lock);
+
+	len = strnlen(label, BTRFS_LABEL_SIZE);
+
+	if (len == BTRFS_LABEL_SIZE) {
+		pr_warn("btrfs: label is too long, return the first %zu bytes\n",
+			--len);
+	}
+
+	ret = copy_to_user(arg, label, len);
+
+	return ret ? -EFAULT : 0;
+}
+
+static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
+{
+	struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
+	struct btrfs_super_block *super_block = root->fs_info->super_copy;
+	struct btrfs_trans_handle *trans;
+	char label[BTRFS_LABEL_SIZE];
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (copy_from_user(label, arg, sizeof(label)))
+		return -EFAULT;
+
+	if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
+		pr_err("btrfs: unable to set label with more than %d bytes\n",
+		       BTRFS_LABEL_SIZE - 1);
+		return -EINVAL;
+	}
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out_unlock;
+	}
+
+	spin_lock(&root->fs_info->super_lock);
+	strcpy(super_block->label, label);
+	spin_unlock(&root->fs_info->super_lock);
+	ret = btrfs_end_transaction(trans, root);
+
+out_unlock:
+	mnt_drop_write_file(file);
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -3055,6 +4523,8 @@
 		return btrfs_ioctl_snap_create_v2(file, argp, 0);
 	case BTRFS_IOC_SUBVOL_CREATE:
 		return btrfs_ioctl_snap_create(file, argp, 1);
+	case BTRFS_IOC_SUBVOL_CREATE_V2:
+		return btrfs_ioctl_snap_create_v2(file, argp, 1);
 	case BTRFS_IOC_SNAP_DESTROY:
 		return btrfs_ioctl_snap_destroy(file, argp);
 	case BTRFS_IOC_SUBVOL_GETFLAGS:
@@ -3068,17 +4538,17 @@
 	case BTRFS_IOC_DEFRAG_RANGE:
 		return btrfs_ioctl_defrag(file, argp);
 	case BTRFS_IOC_RESIZE:
-		return btrfs_ioctl_resize(root, argp);
+		return btrfs_ioctl_resize(file, argp);
 	case BTRFS_IOC_ADD_DEV:
 		return btrfs_ioctl_add_dev(root, argp);
 	case BTRFS_IOC_RM_DEV:
-		return btrfs_ioctl_rm_dev(root, argp);
+		return btrfs_ioctl_rm_dev(file, argp);
 	case BTRFS_IOC_FS_INFO:
 		return btrfs_ioctl_fs_info(root, argp);
 	case BTRFS_IOC_DEV_INFO:
 		return btrfs_ioctl_dev_info(root, argp);
 	case BTRFS_IOC_BALANCE:
-		return btrfs_balance(root->fs_info->dev_root);
+		return btrfs_ioctl_balance(file, NULL);
 	case BTRFS_IOC_CLONE:
 		return btrfs_ioctl_clone(file, arg, 0, 0, 0);
 	case BTRFS_IOC_CLONE_RANGE:
@@ -3101,15 +4571,49 @@
 		btrfs_sync_fs(file->f_dentry->d_sb, 1);
 		return 0;
 	case BTRFS_IOC_START_SYNC:
-		return btrfs_ioctl_start_sync(file, argp);
+		return btrfs_ioctl_start_sync(root, argp);
 	case BTRFS_IOC_WAIT_SYNC:
-		return btrfs_ioctl_wait_sync(file, argp);
+		return btrfs_ioctl_wait_sync(root, argp);
 	case BTRFS_IOC_SCRUB:
-		return btrfs_ioctl_scrub(root, argp);
+		return btrfs_ioctl_scrub(file, argp);
 	case BTRFS_IOC_SCRUB_CANCEL:
 		return btrfs_ioctl_scrub_cancel(root, argp);
 	case BTRFS_IOC_SCRUB_PROGRESS:
 		return btrfs_ioctl_scrub_progress(root, argp);
+	case BTRFS_IOC_BALANCE_V2:
+		return btrfs_ioctl_balance(file, argp);
+	case BTRFS_IOC_BALANCE_CTL:
+		return btrfs_ioctl_balance_ctl(root, arg);
+	case BTRFS_IOC_BALANCE_PROGRESS:
+		return btrfs_ioctl_balance_progress(root, argp);
+	case BTRFS_IOC_SET_RECEIVED_SUBVOL:
+		return btrfs_ioctl_set_received_subvol(file, argp);
+	case BTRFS_IOC_SEND:
+		return btrfs_ioctl_send(file, argp);
+	case BTRFS_IOC_GET_DEV_STATS:
+		return btrfs_ioctl_get_dev_stats(root, argp);
+	case BTRFS_IOC_QUOTA_CTL:
+		return btrfs_ioctl_quota_ctl(file, argp);
+	case BTRFS_IOC_QGROUP_ASSIGN:
+		return btrfs_ioctl_qgroup_assign(file, argp);
+	case BTRFS_IOC_QGROUP_CREATE:
+		return btrfs_ioctl_qgroup_create(file, argp);
+	case BTRFS_IOC_QGROUP_LIMIT:
+		return btrfs_ioctl_qgroup_limit(file, argp);
+	case BTRFS_IOC_QUOTA_RESCAN:
+		return btrfs_ioctl_quota_rescan(file, argp);
+	case BTRFS_IOC_QUOTA_RESCAN_STATUS:
+		return btrfs_ioctl_quota_rescan_status(file, argp);
+	case BTRFS_IOC_QUOTA_RESCAN_WAIT:
+		return btrfs_ioctl_quota_rescan_wait(file, argp);
+	case BTRFS_IOC_DEV_REPLACE:
+		return btrfs_ioctl_dev_replace(root, argp);
+	case BTRFS_IOC_GET_FSLABEL:
+		return btrfs_ioctl_get_fslabel(file, argp);
+	case BTRFS_IOC_SET_FSLABEL:
+		return btrfs_ioctl_set_fslabel(file, argp);
+	case BTRFS_IOC_FILE_EXTENT_SAME:
+		return btrfs_ioctl_file_extent_same(file, argp);
 	}
 
 	return -ENOTTY;
Only in a/fs/btrfs: ioctl.h.
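For illustration only (not part of the patch): the ioctl.c hunks above add BTRFS_IOC_GET_FSLABEL and BTRFS_IOC_SET_FSLABEL handling, where btrfs_ioctl_set_fslabel() rejects a label that fills all BTRFS_LABEL_SIZE bytes. A minimal userspace sketch of driving these ioctls, assuming the UAPI definitions are available via <linux/btrfs.h> and that setting the label needs CAP_SYS_ADMIN:

/* Illustrative only, not part of the patch. */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>	/* assumed to provide the BTRFS_IOC_*_FSLABEL defs */

int main(int argc, char **argv)
{
	char label[BTRFS_LABEL_SIZE] = { 0 };
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <btrfs path> [new label]\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, BTRFS_IOC_GET_FSLABEL, label) == 0)
		printf("label: %s\n", label);
	if (argc > 2) {
		/* btrfs_ioctl_set_fslabel() refuses labels occupying all
		 * BTRFS_LABEL_SIZE bytes, so keep one byte for the NUL. */
		memset(label, 0, sizeof(label));
		strncpy(label, argv[2], BTRFS_LABEL_SIZE - 1);
		if (ioctl(fd, BTRFS_IOC_SET_FSLABEL, label) != 0)
			perror("BTRFS_IOC_SET_FSLABEL");
	}
	close(fd);
	return 0;
}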
diff -ur a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
--- a/fs/btrfs/Kconfig	2013-08-03 09:59:52.000000000 +0200
+++ b/fs/btrfs/Kconfig	2014-01-21 09:37:24.000000000 +0100
@@ -1,11 +1,13 @@
 config BTRFS_FS
-	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
-	depends on EXPERIMENTAL
+	tristate "Btrfs filesystem support"
 	select LIBCRC32C
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
+	select RAID6_PQ
+	select XOR_BLOCKS
+
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
@@ -19,6 +21,17 @@
 
 	  If unsure, say N.
 
+config BTRFS_FS_SYNO_ACL
+	bool "Btrfs Synology Access Control Lists"
+	depends on BTRFS_FS
+	depends on !BTRFS_FS_POSIX_ACL
+	select FS_SYNO_ACL
+	help
+	  Synology Access Control Lists (ACLs) support Windows-style permissions
+	  for users and groups beyond the owner/group/world scheme.
+
+	  If you don't know what Access Control Lists are, say N
+
 config BTRFS_FS_POSIX_ACL
 	bool "Btrfs POSIX Access Control Lists"
 	depends on BTRFS_FS
@@ -31,3 +44,51 @@
 	  Linux website <http://acl.bestbits.at/>.
 
 	  If you don't know what Access Control Lists are, say N
+
+config BTRFS_FS_CHECK_INTEGRITY
+	bool "Btrfs with integrity check tool compiled in (DANGEROUS)"
+	depends on BTRFS_FS
+	help
+	  Adds code that examines all block write requests (including
+	  writes of the super block). The goal is to verify that the
+	  state of the filesystem on disk is always consistent, i.e.,
+	  after a power-loss or kernel panic event the filesystem is
+	  in a consistent state.
+
+	  If the integrity check tool is included and activated in
+	  the mount options, plenty of kernel memory is used, and
+	  plenty of additional CPU cycles are spent. Enabling this
+	  functionality is not intended for normal use.
+
+	  In most cases, unless you are a btrfs developer who needs
+	  to verify the integrity of (super)-block write requests
+	  during the run of a regression test, say N
+
+config BTRFS_FS_RUN_SANITY_TESTS
+	bool "Btrfs will run sanity tests upon loading"
+	depends on BTRFS_FS
+	help
+	  This will run some basic sanity tests on the free space cache
+	  code to make sure it is acting as it should.  These are mostly
+	  regression tests and are only really interesting to btrfs developers.
+
+	  If unsure, say N.
+
+config BTRFS_DEBUG
+	bool "Btrfs debugging support"
+	depends on BTRFS_FS
+	help
+	  Enable run-time debugging support for the btrfs filesystem. This may
+	  enable additional and expensive checks with negative impact on
+	  performance, or export extra information via sysfs.
+
+	  If unsure, say N.
+
+config BTRFS_ASSERT
+	bool "Btrfs assert support"
+	depends on BTRFS_FS
+	help
+	  Enable run-time assertion checking.  This will result in panics if
+	  any of the assertions trip.  This is meant for btrfs developers only.
+
+	  If unsure, say N.
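For illustration only (not part of the patch): the BTRFS_ASSERT entry above only says that tripped assertions panic the machine. A hedged sketch of how such a Kconfig switch is usually consumed in kernel code follows; this is not the kernel's actual macro, just the common pattern:

/* Sketch only, within kernel context (unlikely() and panic() are kernel
 * facilities); not the real btrfs ASSERT() definition. */
#ifdef CONFIG_BTRFS_ASSERT
#define ASSERT(expr)							\
	do {								\
		if (unlikely(!(expr)))					\
			panic("assertion failed: %s, file: %s, line: %d",	\
			      #expr, __FILE__, __LINE__);		\
	} while (0)
#else
#define ASSERT(expr)	((void)0)
#endif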
diff -ur a/fs/btrfs/locking.c b/fs/btrfs/locking.c
--- a/fs/btrfs/locking.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/locking.c	2014-02-17 11:56:58.000000000 +0100
@@ -24,7 +24,7 @@
 #include "extent_io.h"
 #include "locking.h"
 
-void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb);
 
 /*
  * if we currently have a spinning reader or writer lock
@@ -33,6 +33,14 @@
  */
 void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
+	if (eb->lock_nested) {
+		read_lock(&eb->lock);
+		if (eb->lock_nested && current->pid == eb->lock_owner) {
+			read_unlock(&eb->lock);
+			return;
+		}
+		read_unlock(&eb->lock);
+	}
 	if (rw == BTRFS_WRITE_LOCK) {
 		if (atomic_read(&eb->blocking_writers) == 0) {
 			WARN_ON(atomic_read(&eb->spinning_writers) != 1);
@@ -57,18 +65,28 @@
  */
 void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 {
+	if (eb->lock_nested) {
+		read_lock(&eb->lock);
+		if (eb->lock_nested && current->pid == eb->lock_owner) {
+			read_unlock(&eb->lock);
+			return;
+		}
+		read_unlock(&eb->lock);
+	}
 	if (rw == BTRFS_WRITE_LOCK_BLOCKING) {
 		BUG_ON(atomic_read(&eb->blocking_writers) != 1);
 		write_lock(&eb->lock);
 		WARN_ON(atomic_read(&eb->spinning_writers));
 		atomic_inc(&eb->spinning_writers);
-		if (atomic_dec_and_test(&eb->blocking_writers))
+		if (atomic_dec_and_test(&eb->blocking_writers) &&
+		    waitqueue_active(&eb->write_lock_wq))
 			wake_up(&eb->write_lock_wq);
 	} else if (rw == BTRFS_READ_LOCK_BLOCKING) {
 		BUG_ON(atomic_read(&eb->blocking_readers) == 0);
 		read_lock(&eb->lock);
 		atomic_inc(&eb->spinning_readers);
-		if (atomic_dec_and_test(&eb->blocking_readers))
+		if (atomic_dec_and_test(&eb->blocking_readers) &&
+		    waitqueue_active(&eb->read_lock_wq))
 			wake_up(&eb->read_lock_wq);
 	}
 	return;
@@ -81,8 +99,20 @@
 void btrfs_tree_read_lock(struct extent_buffer *eb)
 {
 again:
-	wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0);
 	read_lock(&eb->lock);
+	if (atomic_read(&eb->blocking_writers) &&
+	    current->pid == eb->lock_owner) {
+		/*
+		 * This extent is already write-locked by our thread. We allow
+		 * an additional read lock to be added because it's for the same
+		 * thread. btrfs_find_all_roots() depends on this as it may be
+		 * called on a partly (write-)locked tree.
+		 */
+		BUG_ON(eb->lock_nested);
+		eb->lock_nested = 1;
+		read_unlock(&eb->lock);
+		return;
+	}
 	if (atomic_read(&eb->blocking_writers)) {
 		read_unlock(&eb->lock);
 		wait_event(eb->write_lock_wq,
@@ -129,6 +159,7 @@
 	}
 	atomic_inc(&eb->write_locks);
 	atomic_inc(&eb->spinning_writers);
+	eb->lock_owner = current->pid;
 	return 1;
 }
 
@@ -137,6 +168,15 @@
  */
 void btrfs_tree_read_unlock(struct extent_buffer *eb)
 {
+	if (eb->lock_nested) {
+		read_lock(&eb->lock);
+		if (eb->lock_nested && current->pid == eb->lock_owner) {
+			eb->lock_nested = 0;
+			read_unlock(&eb->lock);
+			return;
+		}
+		read_unlock(&eb->lock);
+	}
 	btrfs_assert_tree_read_locked(eb);
 	WARN_ON(atomic_read(&eb->spinning_readers) == 0);
 	atomic_dec(&eb->spinning_readers);
@@ -149,9 +189,19 @@
  */
 void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb)
 {
+	if (eb->lock_nested) {
+		read_lock(&eb->lock);
+		if (eb->lock_nested && current->pid == eb->lock_owner) {
+			eb->lock_nested = 0;
+			read_unlock(&eb->lock);
+			return;
+		}
+		read_unlock(&eb->lock);
+	}
 	btrfs_assert_tree_read_locked(eb);
 	WARN_ON(atomic_read(&eb->blocking_readers) == 0);
-	if (atomic_dec_and_test(&eb->blocking_readers))
+	if (atomic_dec_and_test(&eb->blocking_readers) &&
+	    waitqueue_active(&eb->read_lock_wq))
 		wake_up(&eb->read_lock_wq);
 	atomic_dec(&eb->read_locks);
 }
@@ -160,7 +210,7 @@
  * take a spinning write lock.  This will wait for both
  * blocking readers or writers
  */
-int btrfs_tree_lock(struct extent_buffer *eb)
+void btrfs_tree_lock(struct extent_buffer *eb)
 {
 again:
 	wait_event(eb->read_lock_wq, atomic_read(&eb->blocking_readers) == 0);
@@ -181,13 +231,13 @@
 	WARN_ON(atomic_read(&eb->spinning_writers));
 	atomic_inc(&eb->spinning_writers);
 	atomic_inc(&eb->write_locks);
-	return 0;
+	eb->lock_owner = current->pid;
 }
 
 /*
  * drop a spinning or a blocking write lock.
  */
-int btrfs_tree_unlock(struct extent_buffer *eb)
+void btrfs_tree_unlock(struct extent_buffer *eb)
 {
 	int blockers = atomic_read(&eb->blocking_writers);
 
@@ -199,14 +249,14 @@
 	if (blockers) {
 		WARN_ON(atomic_read(&eb->spinning_writers));
 		atomic_dec(&eb->blocking_writers);
-		smp_wmb();
-		wake_up(&eb->write_lock_wq);
+		smp_mb();
+		if (waitqueue_active(&eb->write_lock_wq))
+			wake_up(&eb->write_lock_wq);
 	} else {
 		WARN_ON(atomic_read(&eb->spinning_writers) != 1);
 		atomic_dec(&eb->spinning_writers);
 		write_unlock(&eb->lock);
 	}
-	return 0;
 }
 
 void btrfs_assert_tree_locked(struct extent_buffer *eb)
@@ -214,7 +264,7 @@
 	BUG_ON(!atomic_read(&eb->write_locks));
 }
 
-void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
+static void btrfs_assert_tree_read_locked(struct extent_buffer *eb)
 {
 	BUG_ON(!atomic_read(&eb->read_locks));
 }
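For illustration only (not part of the patch): the locking.c changes above let a thread that already write-holds an extent_buffer take one nested read lock (tracked via eb->lock_owner and eb->lock_nested), which btrfs_find_all_roots() relies on when walking a partly write-locked tree. The following self-contained userspace analogue sketches that rule with invented names; it is simplified and, unlike the kernel code, does not re-check the owner under the lock. Build with: cc demo.c -pthread

/* Illustrative userspace analogue; only the idea mirrors eb->lock_owner
 * and eb->lock_nested. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct tree_lock {
	pthread_rwlock_t rw;
	pthread_t owner;	/* meaningful only while write-locked */
	bool owner_valid;
	bool nested;		/* the owner re-entered as a reader */
};

static void tree_write_lock(struct tree_lock *l)
{
	pthread_rwlock_wrlock(&l->rw);
	l->owner = pthread_self();
	l->owner_valid = true;
}

static void tree_write_unlock(struct tree_lock *l)
{
	l->owner_valid = false;
	pthread_rwlock_unlock(&l->rw);
}

static void tree_read_lock(struct tree_lock *l)
{
	/* Same thread already holds the write lock: let it through
	 * instead of deadlocking on itself. */
	if (l->owner_valid && pthread_equal(l->owner, pthread_self())) {
		l->nested = true;
		return;
	}
	pthread_rwlock_rdlock(&l->rw);
}

static void tree_read_unlock(struct tree_lock *l)
{
	if (l->nested && l->owner_valid &&
	    pthread_equal(l->owner, pthread_self())) {
		l->nested = false;
		return;
	}
	pthread_rwlock_unlock(&l->rw);
}

int main(void)
{
	struct tree_lock l = { .rw = PTHREAD_RWLOCK_INITIALIZER };

	tree_write_lock(&l);
	tree_read_lock(&l);	/* would self-deadlock without the owner check */
	tree_read_unlock(&l);
	tree_write_unlock(&l);
	printf("nested read inside a held write lock: ok\n");
	return 0;
}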
diff -ur a/fs/btrfs/locking.h b/fs/btrfs/locking.h
--- a/fs/btrfs/locking.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/locking.h	2014-02-17 11:56:58.000000000 +0100
@@ -24,9 +24,8 @@
 #define BTRFS_WRITE_LOCK_BLOCKING 3
 #define BTRFS_READ_LOCK_BLOCKING 4
 
-int btrfs_tree_lock(struct extent_buffer *eb);
-int btrfs_tree_unlock(struct extent_buffer *eb);
-int btrfs_try_spin_lock(struct extent_buffer *eb);
+void btrfs_tree_lock(struct extent_buffer *eb);
+void btrfs_tree_unlock(struct extent_buffer *eb);
 
 void btrfs_tree_read_lock(struct extent_buffer *eb);
 void btrfs_tree_read_unlock(struct extent_buffer *eb);
diff -ur a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
--- a/fs/btrfs/lzo.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/lzo.c	2014-02-17 11:56:58.000000000 +0100
@@ -31,8 +31,8 @@
 
 struct workspace {
 	void *mem;
-	void *buf;	/* where compressed data goes */
-	void *cbuf;	/* where decompressed data goes */
+	void *buf;	/* where decompressed data goes */
+	void *cbuf;	/* where compressed data goes */
 	struct list_head list;
 };
 
@@ -207,8 +207,10 @@
 		}
 
 		/* we're making it bigger, give up */
-		if (tot_in > 8192 && tot_in < tot_out)
+		if (tot_in > 8192 && tot_in < tot_out) {
+			ret = -1;
 			goto out;
+		}
 
 		/* we're all done */
 		if (tot_in >= len)
@@ -411,9 +413,9 @@
 
 	bytes = min_t(unsigned long, destlen, out_len - start_byte);
 
-	kaddr = kmap_atomic(dest_page, KM_USER0);
+	kaddr = kmap_atomic(dest_page);
 	memcpy(kaddr, workspace->buf + start_byte, bytes);
-	kunmap_atomic(kaddr, KM_USER0);
+	kunmap_atomic(kaddr);
 out:
 	return ret;
 }
diff -ur a/fs/btrfs/Makefile b/fs/btrfs/Makefile
--- a/fs/btrfs/Makefile	2013-08-03 09:59:52.000000000 +0200
+++ b/fs/btrfs/Makefile	2014-01-21 09:37:24.000000000 +0100
@@ -8,6 +8,11 @@
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+	   uuid-tree.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
+btrfs-$(CONFIG_BTRFS_FS_SYNO_ACL) += syno_acl.o
+btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+
+btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o
Only in b/fs/btrfs: math.h.
diff -ur a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
--- a/fs/btrfs/ordered-data.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/ordered-data.c	2014-02-17 11:56:58.000000000 +0100
@@ -24,6 +24,9 @@
 #include "transaction.h"
 #include "btrfs_inode.h"
 #include "extent_io.h"
+#include "disk-io.h"
+
+static struct kmem_cache *btrfs_ordered_extent_cache;
 
 static u64 entry_end(struct btrfs_ordered_extent *entry)
 {
@@ -59,6 +62,14 @@
 	return NULL;
 }
 
+static void ordered_data_tree_panic(struct inode *inode, int errno,
+					       u64 offset)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
+		    "%llu\n", offset);
+}
+
 /*
  * look for a given offset in the tree, and if it can't be found return the
  * first lesser offset
@@ -174,22 +185,27 @@
 				      u64 start, u64 len, u64 disk_len,
 				      int type, int dio, int compress_type)
 {
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	entry = kzalloc(sizeof(*entry), GFP_NOFS);
+	entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS);
 	if (!entry)
 		return -ENOMEM;
 
 	entry->file_offset = file_offset;
 	entry->start = start;
 	entry->len = len;
+	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) &&
+	    !(type == BTRFS_ORDERED_NOCOW))
+		entry->csum_bytes_left = disk_len;
 	entry->disk_len = disk_len;
 	entry->bytes_left = len;
-	entry->inode = inode;
+	entry->inode = igrab(inode);
 	entry->compress_type = compress_type;
+	entry->truncated_len = (u64)-1;
 	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
 		set_bit(type, &entry->flags);
 
@@ -201,21 +217,32 @@
 	init_waitqueue_head(&entry->wait);
 	INIT_LIST_HEAD(&entry->list);
 	INIT_LIST_HEAD(&entry->root_extent_list);
+	INIT_LIST_HEAD(&entry->work_list);
+	init_completion(&entry->completion);
+	INIT_LIST_HEAD(&entry->log_list);
 
 	trace_btrfs_ordered_extent_add(inode, entry);
 
-	spin_lock(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	node = tree_insert(&tree->tree, file_offset,
 			   &entry->rb_node);
-	BUG_ON(node);
-	spin_unlock(&tree->lock);
+	if (node)
+		ordered_data_tree_panic(inode, -EEXIST, file_offset);
+	spin_unlock_irq(&tree->lock);
 
-	spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+	spin_lock(&root->ordered_extent_lock);
 	list_add_tail(&entry->root_extent_list,
-		      &BTRFS_I(inode)->root->fs_info->ordered_extents);
-	spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
+		      &root->ordered_extents);
+	root->nr_ordered_extents++;
+	if (root->nr_ordered_extents == 1) {
+		spin_lock(&root->fs_info->ordered_root_lock);
+		BUG_ON(!list_empty(&root->ordered_root));
+		list_add_tail(&root->ordered_root,
+			      &root->fs_info->ordered_roots);
+		spin_unlock(&root->fs_info->ordered_root_lock);
+	}
+	spin_unlock(&root->ordered_extent_lock);
 
-	BUG_ON(node);
 	return 0;
 }
 
@@ -249,17 +276,20 @@
  * when an ordered extent is finished.  If the list covers more than one
  * ordered extent, it is split across multiples.
  */
-int btrfs_add_ordered_sum(struct inode *inode,
-			  struct btrfs_ordered_extent *entry,
-			  struct btrfs_ordered_sum *sum)
+void btrfs_add_ordered_sum(struct inode *inode,
+			   struct btrfs_ordered_extent *entry,
+			   struct btrfs_ordered_sum *sum)
 {
 	struct btrfs_ordered_inode_tree *tree;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	spin_lock(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	list_add_tail(&sum->list, &entry->list);
-	spin_unlock(&tree->lock);
-	return 0;
+	WARN_ON(entry->csum_bytes_left < sum->len);
+	entry->csum_bytes_left -= sum->len;
+	if (entry->csum_bytes_left == 0)
+		wake_up(&entry->wait);
+	spin_unlock_irq(&tree->lock);
 }
 
 /*
@@ -276,18 +306,19 @@
  */
 int btrfs_dec_test_first_ordered_pending(struct inode *inode,
 				   struct btrfs_ordered_extent **cached,
-				   u64 *file_offset, u64 io_size)
+				   u64 *file_offset, u64 io_size, int uptodate)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry = NULL;
 	int ret;
+	unsigned long flags;
 	u64 dec_end;
 	u64 dec_start;
 	u64 to_dec;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	spin_lock(&tree->lock);
+	spin_lock_irqsave(&tree->lock, flags);
 	node = tree_search(tree, *file_offset);
 	if (!node) {
 		ret = 1;
@@ -306,16 +337,17 @@
 	*file_offset = dec_end;
 	if (dec_start > dec_end) {
 		printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
-		       (unsigned long long)dec_start,
-		       (unsigned long long)dec_end);
+		       dec_start, dec_end);
 	}
 	to_dec = dec_end - dec_start;
 	if (to_dec > entry->bytes_left) {
 		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
-		       (unsigned long long)entry->bytes_left,
-		       (unsigned long long)to_dec);
+		       entry->bytes_left, to_dec);
 	}
 	entry->bytes_left -= to_dec;
+	if (!uptodate)
+		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
 	if (entry->bytes_left == 0)
 		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
 	else
@@ -325,7 +357,7 @@
 		*cached = entry;
 		atomic_inc(&entry->refs);
 	}
-	spin_unlock(&tree->lock);
+	spin_unlock_irqrestore(&tree->lock, flags);
 	return ret == 0;
 }
 
@@ -340,15 +372,21 @@
  */
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				   struct btrfs_ordered_extent **cached,
-				   u64 file_offset, u64 io_size)
+				   u64 file_offset, u64 io_size, int uptodate)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
 	struct btrfs_ordered_extent *entry = NULL;
+	unsigned long flags;
 	int ret;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	spin_lock(&tree->lock);
+	spin_lock_irqsave(&tree->lock, flags);
+	if (cached && *cached) {
+		entry = *cached;
+		goto have_entry;
+	}
+
 	node = tree_search(tree, file_offset);
 	if (!node) {
 		ret = 1;
@@ -356,6 +394,7 @@
 	}
 
 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+have_entry:
 	if (!offset_in_entry(entry, file_offset)) {
 		ret = 1;
 		goto out;
@@ -363,10 +402,12 @@
 
 	if (io_size > entry->bytes_left) {
 		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
-		       (unsigned long long)entry->bytes_left,
-		       (unsigned long long)io_size);
+		       entry->bytes_left, io_size);
 	}
 	entry->bytes_left -= io_size;
+	if (!uptodate)
+		set_bit(BTRFS_ORDERED_IOERR, &entry->flags);
+
 	if (entry->bytes_left == 0)
 		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
 	else
@@ -376,15 +417,75 @@
 		*cached = entry;
 		atomic_inc(&entry->refs);
 	}
-	spin_unlock(&tree->lock);
+	spin_unlock_irqrestore(&tree->lock, flags);
 	return ret == 0;
 }
 
+/* Needs to either be called under a log transaction or the log_mutex */
+void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct btrfs_ordered_extent *ordered;
+	struct rb_node *n;
+	int index = log->log_transid % 2;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	spin_lock_irq(&tree->lock);
+	for (n = rb_first(&tree->tree); n; n = rb_next(n)) {
+		ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node);
+		spin_lock(&log->log_extents_lock[index]);
+		if (list_empty(&ordered->log_list)) {
+			list_add_tail(&ordered->log_list, &log->logged_list[index]);
+			atomic_inc(&ordered->refs);
+		}
+		spin_unlock(&log->log_extents_lock[index]);
+	}
+	spin_unlock_irq(&tree->lock);
+}
+
+void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid)
+{
+	struct btrfs_ordered_extent *ordered;
+	int index = transid % 2;
+
+	spin_lock_irq(&log->log_extents_lock[index]);
+	while (!list_empty(&log->logged_list[index])) {
+		ordered = list_first_entry(&log->logged_list[index],
+					   struct btrfs_ordered_extent,
+					   log_list);
+		list_del_init(&ordered->log_list);
+		spin_unlock_irq(&log->log_extents_lock[index]);
+		wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE,
+						   &ordered->flags));
+		btrfs_put_ordered_extent(ordered);
+		spin_lock_irq(&log->log_extents_lock[index]);
+	}
+	spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid)
+{
+	struct btrfs_ordered_extent *ordered;
+	int index = transid % 2;
+
+	spin_lock_irq(&log->log_extents_lock[index]);
+	while (!list_empty(&log->logged_list[index])) {
+		ordered = list_first_entry(&log->logged_list[index],
+					   struct btrfs_ordered_extent,
+					   log_list);
+		list_del_init(&ordered->log_list);
+		spin_unlock_irq(&log->log_extents_lock[index]);
+		btrfs_put_ordered_extent(ordered);
+		spin_lock_irq(&log->log_extents_lock[index]);
+	}
+	spin_unlock_irq(&log->log_extents_lock[index]);
+}
+
 /*
  * used to drop a reference on an ordered extent.  This will free
  * the extent if the last reference is dropped
  */
-int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
 {
 	struct list_head *cur;
 	struct btrfs_ordered_sum *sum;
@@ -392,37 +493,40 @@
 	trace_btrfs_ordered_extent_put(entry->inode, entry);
 
 	if (atomic_dec_and_test(&entry->refs)) {
+		if (entry->inode)
+			btrfs_add_delayed_iput(entry->inode);
 		while (!list_empty(&entry->list)) {
 			cur = entry->list.next;
 			sum = list_entry(cur, struct btrfs_ordered_sum, list);
 			list_del(&sum->list);
 			kfree(sum);
 		}
-		kfree(entry);
+		kmem_cache_free(btrfs_ordered_extent_cache, entry);
 	}
-	return 0;
 }
 
 /*
  * remove an ordered extent from the tree.  No references are dropped
- * and you must wake_up entry->wait.  You must hold the tree lock
- * while you call this function.
+ * and waiters are woken up.
  */
-static int __btrfs_remove_ordered_extent(struct inode *inode,
-				struct btrfs_ordered_extent *entry)
+void btrfs_remove_ordered_extent(struct inode *inode,
+				 struct btrfs_ordered_extent *entry)
 {
 	struct btrfs_ordered_inode_tree *tree;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct rb_node *node;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
+	spin_lock_irq(&tree->lock);
 	node = &entry->rb_node;
 	rb_erase(node, &tree->tree);
 	tree->last = NULL;
 	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+	spin_unlock_irq(&tree->lock);
 
-	spin_lock(&root->fs_info->ordered_extent_lock);
+	spin_lock(&root->ordered_extent_lock);
 	list_del_init(&entry->root_extent_list);
+	root->nr_ordered_extents--;
 
 	trace_btrfs_ordered_extent_remove(inode, entry);
 
@@ -435,84 +539,92 @@
 	    !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
 		list_del_init(&BTRFS_I(inode)->ordered_operations);
 	}
-	spin_unlock(&root->fs_info->ordered_extent_lock);
 
-	return 0;
+	if (!root->nr_ordered_extents) {
+		spin_lock(&root->fs_info->ordered_root_lock);
+		BUG_ON(list_empty(&root->ordered_root));
+		list_del_init(&root->ordered_root);
+		spin_unlock(&root->fs_info->ordered_root_lock);
+	}
+	spin_unlock(&root->ordered_extent_lock);
+	wake_up(&entry->wait);
 }
 
-/*
- * remove an ordered extent from the tree.  No references are dropped
- * but any waiters are woken.
- */
-int btrfs_remove_ordered_extent(struct inode *inode,
-				struct btrfs_ordered_extent *entry)
+static void btrfs_run_ordered_extent_work(struct btrfs_work *work)
 {
-	struct btrfs_ordered_inode_tree *tree;
-	int ret;
-
-	tree = &BTRFS_I(inode)->ordered_tree;
-	spin_lock(&tree->lock);
-	ret = __btrfs_remove_ordered_extent(inode, entry);
-	spin_unlock(&tree->lock);
-	wake_up(&entry->wait);
+	struct btrfs_ordered_extent *ordered;
 
-	return ret;
+	ordered = container_of(work, struct btrfs_ordered_extent, flush_work);
+	btrfs_start_ordered_extent(ordered->inode, ordered, 1);
+	complete(&ordered->completion);
 }
 
 /*
  * wait for all the ordered extents in a root.  This is done when balancing
  * space between drives.
  */
-int btrfs_wait_ordered_extents(struct btrfs_root *root,
-			       int nocow_only, int delay_iput)
+void btrfs_wait_ordered_extents(struct btrfs_root *root)
 {
-	struct list_head splice;
-	struct list_head *cur;
-	struct btrfs_ordered_extent *ordered;
-	struct inode *inode;
+	struct list_head splice, works;
+	struct btrfs_ordered_extent *ordered, *next;
 
 	INIT_LIST_HEAD(&splice);
+	INIT_LIST_HEAD(&works);
 
-	spin_lock(&root->fs_info->ordered_extent_lock);
-	list_splice_init(&root->fs_info->ordered_extents, &splice);
+	mutex_lock(&root->fs_info->ordered_operations_mutex);
+	spin_lock(&root->ordered_extent_lock);
+	list_splice_init(&root->ordered_extents, &splice);
 	while (!list_empty(&splice)) {
-		cur = splice.next;
-		ordered = list_entry(cur, struct btrfs_ordered_extent,
-				     root_extent_list);
-		if (nocow_only &&
-		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) &&
-		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
-			list_move(&ordered->root_extent_list,
-				  &root->fs_info->ordered_extents);
-			cond_resched_lock(&root->fs_info->ordered_extent_lock);
-			continue;
-		}
-
-		list_del_init(&ordered->root_extent_list);
+		ordered = list_first_entry(&splice, struct btrfs_ordered_extent,
+					   root_extent_list);
+		list_move_tail(&ordered->root_extent_list,
+			       &root->ordered_extents);
 		atomic_inc(&ordered->refs);
+		spin_unlock(&root->ordered_extent_lock);
 
-		/*
-		 * the inode may be getting freed (in sys_unlink path).
-		 */
-		inode = igrab(ordered->inode);
+		ordered->flush_work.func = btrfs_run_ordered_extent_work;
+		list_add_tail(&ordered->work_list, &works);
+		btrfs_queue_worker(&root->fs_info->flush_workers,
+				   &ordered->flush_work);
 
-		spin_unlock(&root->fs_info->ordered_extent_lock);
+		cond_resched();
+		spin_lock(&root->ordered_extent_lock);
+	}
+	spin_unlock(&root->ordered_extent_lock);
 
-		if (inode) {
-			btrfs_start_ordered_extent(inode, ordered, 1);
-			btrfs_put_ordered_extent(ordered);
-			if (delay_iput)
-				btrfs_add_delayed_iput(inode);
-			else
-				iput(inode);
-		} else {
-			btrfs_put_ordered_extent(ordered);
-		}
+	list_for_each_entry_safe(ordered, next, &works, work_list) {
+		list_del_init(&ordered->work_list);
+		wait_for_completion(&ordered->completion);
+		btrfs_put_ordered_extent(ordered);
+		cond_resched();
+	}
+	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+}
 
-		spin_lock(&root->fs_info->ordered_extent_lock);
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *root;
+	struct list_head splice;
+
+	INIT_LIST_HEAD(&splice);
+
+	spin_lock(&fs_info->ordered_root_lock);
+	list_splice_init(&fs_info->ordered_roots, &splice);
+	while (!list_empty(&splice)) {
+		root = list_first_entry(&splice, struct btrfs_root,
+					ordered_root);
+		root = btrfs_grab_fs_root(root);
+		BUG_ON(!root);
+		list_move_tail(&root->ordered_root,
+			       &fs_info->ordered_roots);
+		spin_unlock(&fs_info->ordered_root_lock);
+
+		btrfs_wait_ordered_extents(root);
+		btrfs_put_fs_root(root);
+
+		spin_lock(&fs_info->ordered_root_lock);
 	}
-	spin_unlock(&root->fs_info->ordered_extent_lock);
-	return 0;
+	spin_unlock(&fs_info->ordered_root_lock);
 }
 
 /*
@@ -525,23 +637,26 @@
  * extra check to make sure the ordered operation list really is empty
  * before we return
  */
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root, int wait)
 {
 	struct btrfs_inode *btrfs_inode;
 	struct inode *inode;
+	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct list_head splice;
+	struct list_head works;
+	struct btrfs_delalloc_work *work, *next;
+	int ret = 0;
 
 	INIT_LIST_HEAD(&splice);
+	INIT_LIST_HEAD(&works);
 
-	mutex_lock(&root->fs_info->ordered_operations_mutex);
-	spin_lock(&root->fs_info->ordered_extent_lock);
-again:
-	list_splice_init(&root->fs_info->ordered_operations, &splice);
-
+	mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
+	spin_lock(&root->fs_info->ordered_root_lock);
+	list_splice_init(&cur_trans->ordered_operations, &splice);
 	while (!list_empty(&splice)) {
 		btrfs_inode = list_entry(splice.next, struct btrfs_inode,
 				   ordered_operations);
-
 		inode = &btrfs_inode->vfs_inode;
 
 		list_del_init(&btrfs_inode->ordered_operations);
@@ -550,31 +665,41 @@
 		 * the inode may be getting freed (in sys_unlink path).
 		 */
 		inode = igrab(inode);
+		if (!inode)
+			continue;
 
-		if (!wait && inode) {
+		if (!wait)
 			list_add_tail(&BTRFS_I(inode)->ordered_operations,
-			      &root->fs_info->ordered_operations);
-		}
-		spin_unlock(&root->fs_info->ordered_extent_lock);
+				      &cur_trans->ordered_operations);
+		spin_unlock(&root->fs_info->ordered_root_lock);
 
-		if (inode) {
-			if (wait)
-				btrfs_wait_ordered_range(inode, 0, (u64)-1);
-			else
-				filemap_flush(inode->i_mapping);
-			btrfs_add_delayed_iput(inode);
+		work = btrfs_alloc_delalloc_work(inode, wait, 1);
+		if (!work) {
+			spin_lock(&root->fs_info->ordered_root_lock);
+			if (list_empty(&BTRFS_I(inode)->ordered_operations))
+				list_add_tail(&btrfs_inode->ordered_operations,
+					      &splice);
+			list_splice_tail(&splice,
+					 &cur_trans->ordered_operations);
+			spin_unlock(&root->fs_info->ordered_root_lock);
+			ret = -ENOMEM;
+			goto out;
 		}
+		list_add_tail(&work->list, &works);
+		btrfs_queue_worker(&root->fs_info->flush_workers,
+				   &work->work);
 
 		cond_resched();
-		spin_lock(&root->fs_info->ordered_extent_lock);
+		spin_lock(&root->fs_info->ordered_root_lock);
 	}
-	if (wait && !list_empty(&root->fs_info->ordered_operations))
-		goto again;
-
-	spin_unlock(&root->fs_info->ordered_extent_lock);
-	mutex_unlock(&root->fs_info->ordered_operations_mutex);
-
-	return 0;
+	spin_unlock(&root->fs_info->ordered_root_lock);
+out:
+	list_for_each_entry_safe(work, next, &works, list) {
+		list_del_init(&work->list);
+		btrfs_wait_and_free_delalloc_work(work);
+	}
+	mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
+	return ret;
 }
 
 /*
@@ -596,7 +721,7 @@
 	/*
 	 * pages in the range can be dirty, clean or writeback.  We
 	 * start IO on any dirty ones so the wait doesn't stall waiting
-	 * for pdflush to find them
+	 * for the flusher thread to find them
 	 */
 	if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
 		filemap_fdatawrite_range(inode->i_mapping, start, end);
@@ -609,12 +734,11 @@
 /*
  * Used to wait on ordered extents across a large range of bytes.
  */
-int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
 {
 	u64 end;
 	u64 orig_end;
 	struct btrfs_ordered_extent *ordered;
-	int found;
 
 	if (start + len < start) {
 		orig_end = INT_LIMIT(loff_t);
@@ -623,22 +747,33 @@
 		if (orig_end > INT_LIMIT(loff_t))
 			orig_end = INT_LIMIT(loff_t);
 	}
-again:
+
 	/* start IO across the range first to instantiate any delalloc
 	 * extents
 	 */
 	filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
 
-	/* The compression code will leave pages locked but return from
-	 * writepage without setting the page writeback.  Starting again
-	 * with WB_SYNC_ALL will end up waiting for the IO to actually start.
+	/*
+	 * So with compression we will find and lock a dirty page and clear the
+	 * first one as dirty, setup an async extent, and immediately return
+	 * with the entire range locked but with nobody actually marked with
+	 * writeback.  So we can't just filemap_write_and_wait_range() and
+	 * expect it to work since it will just kick off a thread to do the
+	 * actual work.  So we need to call filemap_fdatawrite_range _again_
+	 * since it will wait on the page lock, which won't be unlocked until
+	 * after the pages have been marked as writeback and so we're good to go
+	 * from there.  We have to do this otherwise we'll miss the ordered
+	 * extents and that results in badness.  Please Josef, do not think you
+	 * know better and pull this out at some point in the future, it is
+	 * right and you are wrong.
 	 */
-	filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
+	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+		     &BTRFS_I(inode)->runtime_flags))
+		filemap_fdatawrite_range(inode->i_mapping, start, orig_end);
 
 	filemap_fdatawait_range(inode->i_mapping, start, orig_end);
 
 	end = orig_end;
-	found = 0;
 	while (1) {
 		ordered = btrfs_lookup_first_ordered_extent(inode, end);
 		if (!ordered)
@@ -651,7 +786,6 @@
 			btrfs_put_ordered_extent(ordered);
 			break;
 		}
-		found++;
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		end = ordered->file_offset;
 		btrfs_put_ordered_extent(ordered);
@@ -659,12 +793,6 @@
 			break;
 		end--;
 	}
-	if (found || test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end,
-			   EXTENT_DELALLOC, 0, NULL)) {
-		schedule_timeout(1);
-		goto again;
-	}
-	return 0;
 }
 
 /*
@@ -679,7 +807,7 @@
 	struct btrfs_ordered_extent *entry = NULL;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	spin_lock(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	node = tree_search(tree, file_offset);
 	if (!node)
 		goto out;
@@ -690,7 +818,7 @@
 	if (entry)
 		atomic_inc(&entry->refs);
 out:
-	spin_unlock(&tree->lock);
+	spin_unlock_irq(&tree->lock);
 	return entry;
 }
 
@@ -706,7 +834,7 @@
 	struct btrfs_ordered_extent *entry = NULL;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	spin_lock(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	node = tree_search(tree, file_offset);
 	if (!node) {
 		node = tree_search(tree, file_offset + len);
@@ -731,7 +859,7 @@
 out:
 	if (entry)
 		atomic_inc(&entry->refs);
-	spin_unlock(&tree->lock);
+	spin_unlock_irq(&tree->lock);
 	return entry;
 }
 
@@ -747,7 +875,7 @@
 	struct btrfs_ordered_extent *entry = NULL;
 
 	tree = &BTRFS_I(inode)->ordered_tree;
-	spin_lock(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	node = tree_search(tree, file_offset);
 	if (!node)
 		goto out;
@@ -755,7 +883,7 @@
 	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
 	atomic_inc(&entry->refs);
 out:
-	spin_unlock(&tree->lock);
+	spin_unlock_irq(&tree->lock);
 	return entry;
 }
 
@@ -767,22 +895,24 @@
 				struct btrfs_ordered_extent *ordered)
 {
 	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	u64 disk_i_size;
 	u64 new_i_size;
-	u64 i_size_test;
 	u64 i_size = i_size_read(inode);
 	struct rb_node *node;
 	struct rb_node *prev = NULL;
 	struct btrfs_ordered_extent *test;
 	int ret = 1;
 
-	if (ordered)
+	spin_lock_irq(&tree->lock);
+	if (ordered) {
 		offset = entry_end(ordered);
-	else
+		if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
+			offset = min(offset,
+				     ordered->file_offset +
+				     ordered->truncated_len);
+	} else {
 		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
-
-	spin_lock(&tree->lock);
+	}
 	disk_i_size = BTRFS_I(inode)->disk_i_size;
 
 	/* truncate file */
@@ -796,18 +926,17 @@
 	 * if the disk i_size is already at the inode->i_size, or
 	 * this ordered extent is inside the disk i_size, we're done
 	 */
-	if (disk_i_size == i_size || offset <= disk_i_size) {
+	if (disk_i_size == i_size)
 		goto out;
-	}
 
 	/*
-	 * we can't update the disk_isize if there are delalloc bytes
-	 * between disk_i_size and  this ordered extent
+	 * We still need to update disk_i_size if outstanding_isize is greater
+	 * than disk_i_size.
 	 */
-	if (test_range_bit(io_tree, disk_i_size, offset - 1,
-			   EXTENT_DELALLOC, 0, NULL)) {
+	if (offset <= disk_i_size &&
+	    (!ordered || ordered->outstanding_isize <= disk_i_size))
 		goto out;
-	}
+
 	/*
 	 * walk backward from this ordered extent to disk_i_size.
 	 * if we find an ordered extent then we can't update disk i_size
@@ -828,69 +957,53 @@
 		}
 		node = prev;
 	}
-	while (node) {
+	for (; node; node = rb_prev(node)) {
 		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+
+		/* We treat this entry as if it doesn't exist */
+		if (test_bit(BTRFS_ORDERED_UPDATED_ISIZE, &test->flags))
+			continue;
 		if (test->file_offset + test->len <= disk_i_size)
 			break;
 		if (test->file_offset >= i_size)
 			break;
-		if (test->file_offset >= disk_i_size)
+		if (entry_end(test) > disk_i_size) {
+			/*
+			 * we don't update disk_i_size now, so record this
+			 * undealt i_size. Or we will not know the real
+			 * i_size.
+			 */
+			if (test->outstanding_isize < offset)
+				test->outstanding_isize = offset;
+			if (ordered &&
+			    ordered->outstanding_isize >
+			    test->outstanding_isize)
+				test->outstanding_isize =
+						ordered->outstanding_isize;
 			goto out;
-		node = rb_prev(node);
+		}
 	}
 	new_i_size = min_t(u64, offset, i_size);
 
 	/*
-	 * at this point, we know we can safely update i_size to at least
-	 * the offset from this ordered extent.  But, we need to
-	 * walk forward and see if ios from higher up in the file have
-	 * finished.
+	 * Some ordered extents may have completed before the current one, and
+	 * we hold the real i_size in ->outstanding_isize.
 	 */
-	if (ordered) {
-		node = rb_next(&ordered->rb_node);
-	} else {
-		if (prev)
-			node = rb_next(prev);
-		else
-			node = rb_first(&tree->tree);
-	}
-	i_size_test = 0;
-	if (node) {
-		/*
-		 * do we have an area where IO might have finished
-		 * between our ordered extent and the next one.
-		 */
-		test = rb_entry(node, struct btrfs_ordered_extent, rb_node);
-		if (test->file_offset > offset)
-			i_size_test = test->file_offset;
-	} else {
-		i_size_test = i_size;
-	}
-
-	/*
-	 * i_size_test is the end of a region after this ordered
-	 * extent where there are no ordered extents.  As long as there
-	 * are no delalloc bytes in this area, it is safe to update
-	 * disk_i_size to the end of the region.
-	 */
-	if (i_size_test > offset &&
-	    !test_range_bit(io_tree, offset, i_size_test - 1,
-			    EXTENT_DELALLOC, 0, NULL)) {
-		new_i_size = min_t(u64, i_size_test, i_size);
-	}
+	if (ordered && ordered->outstanding_isize > new_i_size)
+		new_i_size = min_t(u64, ordered->outstanding_isize, i_size);
 	BTRFS_I(inode)->disk_i_size = new_i_size;
 	ret = 0;
 out:
 	/*
-	 * we need to remove the ordered extent with the tree lock held
-	 * so that other people calling this function don't find our fully
-	 * processed ordered entry and skip updating the i_size
+	 * We need to do this because we can't remove ordered extents until
+	 * after the i_disk_size has been updated and then the inode has been
+	 * updated to reflect the change, so we need to tell anybody who finds
+	 * this ordered extent that we've already done all the real work, we
+	 * just haven't completed all the other work.
 	 */
 	if (ordered)
-		__btrfs_remove_ordered_extent(inode, ordered);
-	spin_unlock(&tree->lock);
-	if (ordered)
-		wake_up(&ordered->wait);
+		set_bit(BTRFS_ORDERED_UPDATED_ISIZE, &ordered->flags);
+	spin_unlock_irq(&tree->lock);
 	return ret;
 }
 
@@ -900,39 +1013,42 @@
  * be reclaimed before their checksum is actually put into the btree
  */
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
-			   u32 *sum)
+			   u32 *sum, int len)
 {
 	struct btrfs_ordered_sum *ordered_sum;
-	struct btrfs_sector_sum *sector_sums;
 	struct btrfs_ordered_extent *ordered;
 	struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree;
 	unsigned long num_sectors;
 	unsigned long i;
 	u32 sectorsize = BTRFS_I(inode)->root->sectorsize;
-	int ret = 1;
+	int index = 0;
 
 	ordered = btrfs_lookup_ordered_extent(inode, offset);
 	if (!ordered)
-		return 1;
+		return 0;
 
-	spin_lock(&tree->lock);
+	spin_lock_irq(&tree->lock);
 	list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
-		if (disk_bytenr >= ordered_sum->bytenr) {
-			num_sectors = ordered_sum->len / sectorsize;
-			sector_sums = ordered_sum->sums;
-			for (i = 0; i < num_sectors; i++) {
-				if (sector_sums[i].bytenr == disk_bytenr) {
-					*sum = sector_sums[i].sum;
-					ret = 0;
-					goto out;
-				}
-			}
+		if (disk_bytenr >= ordered_sum->bytenr &&
+		    disk_bytenr < ordered_sum->bytenr + ordered_sum->len) {
+			i = (disk_bytenr - ordered_sum->bytenr) >>
+			    inode->i_sb->s_blocksize_bits;
+			num_sectors = ordered_sum->len >>
+				      inode->i_sb->s_blocksize_bits;
+			num_sectors = min_t(int, len - index, num_sectors - i);
+			memcpy(sum + index, ordered_sum->sums + i,
+			       num_sectors);
+
+			index += (int)num_sectors;
+			if (index == len)
+				goto out;
+			disk_bytenr += num_sectors * sectorsize;
 		}
 	}
 out:
-	spin_unlock(&tree->lock);
+	spin_unlock_irq(&tree->lock);
 	btrfs_put_ordered_extent(ordered);
-	return ret;
+	return index;
 }
 
 
@@ -948,10 +1064,10 @@
  * If trans is not null, we'll do a friendly check for a transaction that
  * is already flushing things and force the IO down ourselves.
  */
-int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct inode *inode)
+void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root, struct inode *inode)
 {
+	struct btrfs_transaction *cur_trans = trans->transaction;
 	u64 last_mod;
 
 	last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
@@ -961,23 +1077,30 @@
 	 * commit, we can safely return without doing anything
 	 */
 	if (last_mod < root->fs_info->last_trans_committed)
-		return 0;
+		return;
 
-	/*
-	 * the transaction is already committing.  Just start the IO and
-	 * don't bother with all of this list nonsense
-	 */
-	if (trans && root->fs_info->running_transaction->blocked) {
-		btrfs_wait_ordered_range(inode, 0, (u64)-1);
-		return 0;
-	}
-
-	spin_lock(&root->fs_info->ordered_extent_lock);
+	spin_lock(&root->fs_info->ordered_root_lock);
 	if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
 		list_add_tail(&BTRFS_I(inode)->ordered_operations,
-			      &root->fs_info->ordered_operations);
+			      &cur_trans->ordered_operations);
 	}
-	spin_unlock(&root->fs_info->ordered_extent_lock);
+	spin_unlock(&root->fs_info->ordered_root_lock);
+}
+
+int __init ordered_data_init(void)
+{
+	btrfs_ordered_extent_cache = kmem_cache_create("btrfs_ordered_extent",
+				     sizeof(struct btrfs_ordered_extent), 0,
+				     SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				     NULL);
+	if (!btrfs_ordered_extent_cache)
+		return -ENOMEM;
 
 	return 0;
 }
+
+void ordered_data_exit(void)
+{
+	if (btrfs_ordered_extent_cache)
+		kmem_cache_destroy(btrfs_ordered_extent_cache);
+}
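
The reworked btrfs_find_ordered_sum() above now walks each btrfs_ordered_sum range and returns how many per-sector csums it copied into the caller's array, instead of hunting for one matching sector and returning 0/1. A minimal userspace sketch of the same lookup-and-clamp arithmetic, with hypothetical sizes and names:

#include <stdio.h>

#define SECTORSIZE 4096ULL

/* toy stand-in for one btrfs_ordered_sum covering a contiguous disk range */
struct toy_sum {
	unsigned long long bytenr;   /* start of the range on disk */
	unsigned long long len;      /* length in bytes covered by sums[] */
	unsigned int sums[8];        /* one u32 csum per sector */
};

/* copy csums for the sectors starting at disk_bytenr into sum[], up to len entries */
static int find_sums(const struct toy_sum *os, unsigned long long disk_bytenr,
		     unsigned int *sum, int len)
{
	int index = 0;

	if (disk_bytenr >= os->bytenr &&
	    disk_bytenr < os->bytenr + os->len) {
		int i = (disk_bytenr - os->bytenr) / SECTORSIZE; /* first sector */
		int num = os->len / SECTORSIZE - i;              /* sectors left */

		if (num > len - index)
			num = len - index;                       /* clamp to request */
		for (int j = 0; j < num; j++)
			sum[index + j] = os->sums[i + j];
		index += num;
	}
	return index;   /* number of csums copied, like the reworked helper */
}

int main(void)
{
	struct toy_sum os = { .bytenr = 1 << 20, .len = 4 * SECTORSIZE,
			      .sums = { 0x11, 0x22, 0x33, 0x44 } };
	unsigned int out[4];
	int n = find_sums(&os, (1 << 20) + 2 * SECTORSIZE, out, 4);

	printf("copied %d csums, first 0x%x\n", n, out[0]); /* copied 2, 0x33 */
	return 0;
}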
diff -ur a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
--- a/fs/btrfs/ordered-data.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/ordered-data.h	2014-02-17 11:56:58.000000000 +0100
@@ -26,18 +26,6 @@
 	struct rb_node *last;
 };
 
-/*
- * these are used to collect checksums done just before bios submission.
- * They are attached via a list into the ordered extent, and
- * checksum items are inserted into the tree after all the blocks in
- * the ordered extent are on disk
- */
-struct btrfs_sector_sum {
-	/* bytenr on disk */
-	u64 bytenr;
-	u32 sum;
-};
-
 struct btrfs_ordered_sum {
 	/* bytenr is the start of this extent on disk */
 	u64 bytenr;
@@ -45,10 +33,10 @@
 	/*
 	 * this is the length in bytes covered by the sums array below.
 	 */
-	unsigned long len;
+	int len;
 	struct list_head list;
-	/* last field is a variable length array of btrfs_sector_sums */
-	struct btrfs_sector_sum sums[];
+	/* last field is a variable length array of csums */
+	u32 sums[];
 };
 
 /*
@@ -74,6 +62,15 @@
 
 #define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
 
+#define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
+
+#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
+				       * has done its due diligence in updating
+				       * the isize. */
+#define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered
+				       extent */
+#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
+
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
 	u64 file_offset;
@@ -90,6 +87,22 @@
 	/* number of bytes that still need writing */
 	u64 bytes_left;
 
+	/* number of bytes that still need csumming */
+	u64 csum_bytes_left;
+
+	/*
+	 * the end of the ordered extent which is behind it but
+	 * didn't update disk_i_size. Please see the comment of
+	 * btrfs_ordered_update_i_size();
+	 */
+	u64 outstanding_isize;
+
+	/*
+	 * If we get truncated we need to adjust the file extent we enter for
+	 * this ordered extent so that we do not expose stale data.
+	 */
+	u64 truncated_len;
+
 	/* flags (described above) */
 	unsigned long flags;
 
@@ -105,6 +118,9 @@
 	/* list of checksums for insertion when the extent io is done */
 	struct list_head list;
 
+	/* If we need to wait on this to be done */
+	struct list_head log_list;
+
 	/* used to wait for the BTRFS_ORDERED_COMPLETE bit */
 	wait_queue_head_t wait;
 
@@ -113,8 +129,13 @@
 
 	/* a per root list of all the pending ordered extents */
 	struct list_head root_extent_list;
-};
 
+	struct btrfs_work work;
+
+	struct completion completion;
+	struct btrfs_work flush_work;
+	struct list_head work_list;
+};
 
 /*
  * calculates the total size you need to allocate for an ordered sum
@@ -123,11 +144,8 @@
 static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
 					 unsigned long bytes)
 {
-	unsigned long num_sectors = (bytes + root->sectorsize - 1) /
-		root->sectorsize;
-	num_sectors++;
-	return sizeof(struct btrfs_ordered_sum) +
-		num_sectors * sizeof(struct btrfs_sector_sum);
+	int num_sectors = (int)DIV_ROUND_UP(bytes, root->sectorsize);
+	return sizeof(struct btrfs_ordered_sum) + num_sectors * sizeof(u32);
 }
 
 static inline void
@@ -138,15 +156,16 @@
 	t->last = NULL;
 }
 
-int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
-int btrfs_remove_ordered_extent(struct inode *inode,
+void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+void btrfs_remove_ordered_extent(struct inode *inode,
 				struct btrfs_ordered_extent *entry);
 int btrfs_dec_test_ordered_pending(struct inode *inode,
 				   struct btrfs_ordered_extent **cached,
-				   u64 file_offset, u64 io_size);
+				   u64 file_offset, u64 io_size, int uptodate);
 int btrfs_dec_test_first_ordered_pending(struct inode *inode,
 				   struct btrfs_ordered_extent **cached,
-				   u64 *file_offset, u64 io_size);
+				   u64 *file_offset, u64 io_size,
+				   int uptodate);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 			     u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
@@ -154,14 +173,14 @@
 int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
 				      u64 start, u64 len, u64 disk_len,
 				      int type, int compress_type);
-int btrfs_add_ordered_sum(struct inode *inode,
-			  struct btrfs_ordered_extent *entry,
-			  struct btrfs_ordered_sum *sum);
+void btrfs_add_ordered_sum(struct inode *inode,
+			   struct btrfs_ordered_extent *entry,
+			   struct btrfs_ordered_sum *sum);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
 							 u64 file_offset);
 void btrfs_start_ordered_extent(struct inode *inode,
 				struct btrfs_ordered_extent *entry, int wait);
-int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
 struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
@@ -169,11 +188,18 @@
 							u64 len);
 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 				struct btrfs_ordered_extent *ordered);
-int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
-int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
-int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root,
-				struct inode *inode);
-int btrfs_wait_ordered_extents(struct btrfs_root *root,
-			       int nocow_only, int delay_iput);
+int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
+			   u32 *sum, int len);
+int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root, int wait);
+void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct inode *inode);
+void btrfs_wait_ordered_extents(struct btrfs_root *root);
+void btrfs_wait_all_ordered_extents(struct btrfs_fs_info *fs_info);
+void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode);
+void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid);
+void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid);
+int __init ordered_data_init(void);
+void ordered_data_exit(void);
 #endif
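
With btrfs_sector_sum removed, btrfs_ordered_sum_size() now charges one u32 per sector and drops the old extra spare entry, instead of one (u64 bytenr, u32 sum) pair per sector plus one. A quick standalone comparison of the two layouts' per-extent payload, assuming 4K sectors and a 128K ordered extent (hypothetical figures):

#include <stdio.h>
#include <stdint.h>

/* old layout: one (bytenr, sum) pair per sector, plus one spare entry */
struct old_sector_sum { uint64_t bytenr; uint32_t sum; };

int main(void)
{
	unsigned long bytes = 128 * 1024;   /* a 128K ordered extent */
	unsigned long sectorsize = 4096;
	unsigned long sectors = (bytes + sectorsize - 1) / sectorsize;

	unsigned long old_payload = (sectors + 1) * sizeof(struct old_sector_sum);
	unsigned long new_payload = sectors * sizeof(uint32_t);

	/* 32 sectors on a 64-bit build: 33 * 16 = 528 bytes before, 32 * 4 = 128 after */
	printf("old %lu bytes, new %lu bytes\n", old_payload, new_payload);
	return 0;
}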
diff -ur a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
--- a/fs/btrfs/orphan.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/orphan.c	2014-02-17 11:56:58.000000000 +0100
@@ -58,7 +58,7 @@
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 	if (ret < 0)
 		goto out;
-	if (ret) {
+	if (ret) { /* JDM: Really? */
 		ret = -ENOENT;
 		goto out;
 	}
diff -ur a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
--- a/fs/btrfs/print-tree.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/print-tree.c	2014-02-17 11:56:58.000000000 +0100
@@ -26,14 +26,12 @@
 	int i;
 	printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
 	       "num_stripes %d\n",
-	       (unsigned long long)btrfs_chunk_length(eb, chunk),
-	       (unsigned long long)btrfs_chunk_owner(eb, chunk),
-	       (unsigned long long)btrfs_chunk_type(eb, chunk),
-	       num_stripes);
+	       btrfs_chunk_length(eb, chunk), btrfs_chunk_owner(eb, chunk),
+	       btrfs_chunk_type(eb, chunk), num_stripes);
 	for (i = 0 ; i < num_stripes ; i++) {
 		printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
-		      (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
-		      (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
+		      btrfs_stripe_devid_nr(eb, chunk, i),
+		      btrfs_stripe_offset_nr(eb, chunk, i));
 	}
 }
 static void print_dev_item(struct extent_buffer *eb,
@@ -41,18 +39,18 @@
 {
 	printk(KERN_INFO "\t\tdev item devid %llu "
 	       "total_bytes %llu bytes used %llu\n",
-	       (unsigned long long)btrfs_device_id(eb, dev_item),
-	       (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
-	       (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
+	       btrfs_device_id(eb, dev_item),
+	       btrfs_device_total_bytes(eb, dev_item),
+	       btrfs_device_bytes_used(eb, dev_item));
 }
 static void print_extent_data_ref(struct extent_buffer *eb,
 				  struct btrfs_extent_data_ref *ref)
 {
 	printk(KERN_INFO "\t\textent data backref root %llu "
 	       "objectid %llu offset %llu count %u\n",
-	       (unsigned long long)btrfs_extent_data_ref_root(eb, ref),
-	       (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref),
-	       (unsigned long long)btrfs_extent_data_ref_offset(eb, ref),
+	       btrfs_extent_data_ref_root(eb, ref),
+	       btrfs_extent_data_ref_objectid(eb, ref),
+	       btrfs_extent_data_ref_offset(eb, ref),
 	       btrfs_extent_data_ref_count(eb, ref));
 }
 
@@ -87,19 +85,17 @@
 	flags = btrfs_extent_flags(eb, ei);
 
 	printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n",
-	       (unsigned long long)btrfs_extent_refs(eb, ei),
-	       (unsigned long long)btrfs_extent_generation(eb, ei),
-	       (unsigned long long)flags);
+	       btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei),
+	       flags);
 
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		struct btrfs_tree_block_info *info;
 		info = (struct btrfs_tree_block_info *)(ei + 1);
 		btrfs_tree_block_key(eb, info, &key);
-		printk(KERN_INFO "\t\ttree block key (%llu %x %llu) "
+		printk(KERN_INFO "\t\ttree block key (%llu %u %llu) "
 		       "level %d\n",
-		       (unsigned long long)btrfs_disk_key_objectid(&key),
-		       key.type,
-		       (unsigned long long)btrfs_disk_key_offset(&key),
+		       btrfs_disk_key_objectid(&key), key.type,
+		       btrfs_disk_key_offset(&key),
 		       btrfs_tree_block_level(eb, info));
 		iref = (struct btrfs_extent_inline_ref *)(info + 1);
 	} else {
@@ -115,11 +111,11 @@
 		switch (type) {
 		case BTRFS_TREE_BLOCK_REF_KEY:
 			printk(KERN_INFO "\t\ttree block backref "
-				"root %llu\n", (unsigned long long)offset);
+				"root %llu\n", offset);
 			break;
 		case BTRFS_SHARED_BLOCK_REF_KEY:
 			printk(KERN_INFO "\t\tshared block backref "
-				"parent %llu\n", (unsigned long long)offset);
+				"parent %llu\n", offset);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY:
 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -129,8 +125,7 @@
 			sref = (struct btrfs_shared_data_ref *)(iref + 1);
 			printk(KERN_INFO "\t\tshared data backref "
 			       "parent %llu count %u\n",
-			       (unsigned long long)offset,
-			       btrfs_shared_data_ref_count(eb, sref));
+			       offset, btrfs_shared_data_ref_count(eb, sref));
 			break;
 		default:
 			BUG();
@@ -148,13 +143,32 @@
 	ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0);
 	printk("\t\textent back ref root %llu gen %llu "
 		"owner %llu num_refs %lu\n",
-		(unsigned long long)btrfs_ref_root_v0(eb, ref0),
-		(unsigned long long)btrfs_ref_generation_v0(eb, ref0),
-		(unsigned long long)btrfs_ref_objectid_v0(eb, ref0),
+		btrfs_ref_root_v0(eb, ref0),
+		btrfs_ref_generation_v0(eb, ref0),
+		btrfs_ref_objectid_v0(eb, ref0),
 		(unsigned long)btrfs_ref_count_v0(eb, ref0));
 }
 #endif
 
+static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
+			    u32 item_size)
+{
+	if (!IS_ALIGNED(item_size, sizeof(u64))) {
+		pr_warn("btrfs: uuid item with illegal size %lu!\n",
+			(unsigned long)item_size);
+		return;
+	}
+	while (item_size) {
+		__le64 subvol_id;
+
+		read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id));
+		printk(KERN_INFO "\t\tsubvol_id %llu\n",
+		       (unsigned long long)le64_to_cpu(subvol_id));
+		item_size -= sizeof(u64);
+		offset += sizeof(u64);
+	}
+}
+
 void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 {
 	int i;
@@ -176,40 +190,35 @@
 
 	nr = btrfs_header_nritems(l);
 
-	printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
-		(unsigned long long)btrfs_header_bytenr(l), nr,
-		btrfs_leaf_free_space(root, l));
+	btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d",
+		   btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l));
 	for (i = 0 ; i < nr ; i++) {
 		item = btrfs_item_nr(l, i);
 		btrfs_item_key_to_cpu(l, &key, i);
 		type = btrfs_key_type(&key);
-		printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
+		printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
 		       "itemsize %d\n",
-			i,
-			(unsigned long long)key.objectid, type,
-			(unsigned long long)key.offset,
+			i, key.objectid, type, key.offset,
 			btrfs_item_offset(l, item), btrfs_item_size(l, item));
 		switch (type) {
 		case BTRFS_INODE_ITEM_KEY:
 			ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
 			printk(KERN_INFO "\t\tinode generation %llu size %llu "
 			       "mode %o\n",
-			       (unsigned long long)
 			       btrfs_inode_generation(l, ii),
-			      (unsigned long long)btrfs_inode_size(l, ii),
+			       btrfs_inode_size(l, ii),
 			       btrfs_inode_mode(l, ii));
 			break;
 		case BTRFS_DIR_ITEM_KEY:
 			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
 			btrfs_dir_item_key_to_cpu(l, di, &found_key);
 			printk(KERN_INFO "\t\tdir oid %llu type %u\n",
-				(unsigned long long)found_key.objectid,
+				found_key.objectid,
 				btrfs_dir_type(l, di));
 			break;
 		case BTRFS_ROOT_ITEM_KEY:
 			ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
 			printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
-				(unsigned long long)
 				btrfs_disk_root_bytenr(l, ri),
 				btrfs_disk_root_refs(l, ri));
 			break;
@@ -245,17 +254,12 @@
 			}
 			printk(KERN_INFO "\t\textent data disk bytenr %llu "
 			       "nr %llu\n",
-			       (unsigned long long)
 			       btrfs_file_extent_disk_bytenr(l, fi),
-			       (unsigned long long)
 			       btrfs_file_extent_disk_num_bytes(l, fi));
 			printk(KERN_INFO "\t\textent data offset %llu "
 			       "nr %llu ram %llu\n",
-			       (unsigned long long)
 			       btrfs_file_extent_offset(l, fi),
-			       (unsigned long long)
 			       btrfs_file_extent_num_bytes(l, fi),
-			       (unsigned long long)
 			       btrfs_file_extent_ram_bytes(l, fi));
 			break;
 		case BTRFS_EXTENT_REF_V0_KEY:
@@ -269,7 +273,6 @@
 			bi = btrfs_item_ptr(l, i,
 					    struct btrfs_block_group_item);
 			printk(KERN_INFO "\t\tblock group used %llu\n",
-			       (unsigned long long)
 			       btrfs_disk_block_group_used(l, bi));
 			break;
 		case BTRFS_CHUNK_ITEM_KEY:
@@ -286,14 +289,22 @@
 			printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
 			       "\t\tchunk objectid %llu chunk offset %llu "
 			       "length %llu\n",
-			       (unsigned long long)
 			       btrfs_dev_extent_chunk_tree(l, dev_extent),
-			       (unsigned long long)
 			       btrfs_dev_extent_chunk_objectid(l, dev_extent),
-			       (unsigned long long)
 			       btrfs_dev_extent_chunk_offset(l, dev_extent),
-			       (unsigned long long)
 			       btrfs_dev_extent_length(l, dev_extent));
+			break;
+		case BTRFS_DEV_STATS_KEY:
+			printk(KERN_INFO "\t\tdevice stats\n");
+			break;
+		case BTRFS_DEV_REPLACE_KEY:
+			printk(KERN_INFO "\t\tdev replace\n");
+			break;
+		case BTRFS_UUID_KEY_SUBVOL:
+		case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+			print_uuid_item(l, btrfs_item_ptr_offset(l, i),
+					btrfs_item_size_nr(l, i));
+			break;
 		};
 	}
 }
@@ -312,18 +323,14 @@
 		btrfs_print_leaf(root, c);
 		return;
 	}
-	printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n",
-	       (unsigned long long)btrfs_header_bytenr(c),
-	      level, nr,
-	       (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
+	btrfs_info(root->fs_info, "node %llu level %d total ptrs %d free spc %u",
+		btrfs_header_bytenr(c), level, nr,
+		(u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	for (i = 0; i < nr; i++) {
 		btrfs_node_key_to_cpu(c, &key, i);
 		printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
-		       i,
-		       (unsigned long long)key.objectid,
-		       key.type,
-		       (unsigned long long)key.offset,
-		       (unsigned long long)btrfs_node_blockptr(c, i));
+		       i, key.objectid, key.type, key.offset,
+		       btrfs_node_blockptr(c, i));
 	}
 	for (i = 0; i < nr; i++) {
 		struct extent_buffer *next = read_tree_block(root,
diff -ur a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h
--- a/fs/btrfs/print-tree.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/print-tree.h	2014-02-17 11:56:58.000000000 +0100
@@ -19,5 +19,5 @@
 #ifndef __PRINT_TREE_
 #define __PRINT_TREE_
 void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l);
-void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t);
+void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c);
 #endif
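
The new print_uuid_item() in print-tree.c treats a UUID item as a packed array of little-endian u64 subvolume ids and prints them 8 bytes at a time after the alignment check. A standalone sketch of that walk over a hypothetical in-memory buffer, with get_le64() standing in for read_extent_buffer() plus le64_to_cpu():

#include <stdio.h>
#include <stdint.h>

/* decode a little-endian u64 from a byte buffer, like le64_to_cpu() */
static uint64_t get_le64(const unsigned char *p)
{
	uint64_t v = 0;
	for (int i = 7; i >= 0; i--)
		v = (v << 8) | p[i];
	return v;
}

int main(void)
{
	/* a toy "uuid item": two subvolume ids, 257 and 258, little-endian */
	unsigned char item[16] = { 0x01, 0x01, 0, 0, 0, 0, 0, 0,
				   0x02, 0x01, 0, 0, 0, 0, 0, 0 };
	uint32_t item_size = sizeof(item);
	uint32_t offset = 0;

	if (item_size % sizeof(uint64_t)) {        /* same sanity check */
		fprintf(stderr, "uuid item with illegal size %u!\n", item_size);
		return 1;
	}
	while (item_size) {
		printf("subvol_id %llu\n",
		       (unsigned long long)get_le64(item + offset));
		item_size -= sizeof(uint64_t);
		offset += sizeof(uint64_t);
	}
	return 0;
}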
Only in b/fs/btrfs: qgroup.c.
Only in b/fs/btrfs: raid56.c.
Only in b/fs/btrfs: raid56.h.
Only in b/fs/btrfs: rcu-string.h.
diff -ur a/fs/btrfs/reada.c b/fs/btrfs/reada.c
--- a/fs/btrfs/reada.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/reada.c	2014-02-17 11:56:58.000000000 +0100
@@ -27,6 +27,7 @@
 #include "volumes.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "dev-replace.h"
 
 #undef DEBUG
 
@@ -54,7 +55,6 @@
  * than the 2 started one after another.
  */
 
-#define MAX_MIRRORS 2
 #define MAX_IN_FLIGHT 6
 
 struct reada_extctl {
@@ -69,9 +69,9 @@
 	u32			blocksize;
 	int			err;
 	struct list_head	extctl;
-	struct kref		refcnt;
+	int 			refcnt;
 	spinlock_t		lock;
-	struct reada_zone	*zones[MAX_MIRRORS];
+	struct reada_zone	*zones[BTRFS_MAX_MIRRORS];
 	int			nzones;
 	struct btrfs_device	*scheduled_for;
 };
@@ -84,7 +84,8 @@
 	spinlock_t		lock;
 	int			locked;
 	struct btrfs_device	*device;
-	struct btrfs_device	*devs[MAX_MIRRORS]; /* full list, incl self */
+	struct btrfs_device	*devs[BTRFS_MAX_MIRRORS]; /* full list, incl
+							   * self */
 	int			ndevs;
 	struct kref		refcnt;
 };
@@ -126,7 +127,7 @@
 	spin_lock(&fs_info->reada_lock);
 	re = radix_tree_lookup(&fs_info->reada_tree, index);
 	if (re)
-		kref_get(&re->refcnt);
+		re->refcnt++;
 	spin_unlock(&fs_info->reada_lock);
 
 	if (!re)
@@ -250,14 +251,12 @@
 					  struct btrfs_bio *bbio)
 {
 	int ret;
-	int looped = 0;
 	struct reada_zone *zone;
 	struct btrfs_block_group_cache *cache = NULL;
 	u64 start;
 	u64 end;
 	int i;
 
-again:
 	zone = NULL;
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
@@ -274,9 +273,6 @@
 		spin_unlock(&fs_info->reada_lock);
 	}
 
-	if (looped)
-		return NULL;
-
 	cache = btrfs_lookup_block_group(fs_info, logical);
 	if (!cache)
 		return NULL;
@@ -305,15 +301,17 @@
 
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_insert(&dev->reada_zones,
-				(unsigned long)zone->end >> PAGE_CACHE_SHIFT,
+				(unsigned long)(zone->end >> PAGE_CACHE_SHIFT),
 				zone);
-	spin_unlock(&fs_info->reada_lock);
 
-	if (ret) {
+	if (ret == -EEXIST) {
 		kfree(zone);
-		looped = 1;
-		goto again;
+		ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
+					     logical >> PAGE_CACHE_SHIFT, 1);
+		if (ret == 1)
+			kref_get(&zone->refcnt);
 	}
+	spin_unlock(&fs_info->reada_lock);
 
 	return zone;
 }
@@ -323,26 +321,26 @@
 					      struct btrfs_key *top, int level)
 {
 	int ret;
-	int looped = 0;
 	struct reada_extent *re = NULL;
+	struct reada_extent *re_exist = NULL;
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct btrfs_bio *bbio = NULL;
 	struct btrfs_device *dev;
+	struct btrfs_device *prev_dev;
 	u32 blocksize;
 	u64 length;
 	int nzones = 0;
 	int i;
 	unsigned long index = logical >> PAGE_CACHE_SHIFT;
+	int dev_replace_is_ongoing;
 
-again:
 	spin_lock(&fs_info->reada_lock);
 	re = radix_tree_lookup(&fs_info->reada_tree, index);
 	if (re)
-		kref_get(&re->refcnt);
+		re->refcnt++;
 	spin_unlock(&fs_info->reada_lock);
 
-	if (re || looped)
+	if (re)
 		return re;
 
 	re = kzalloc(sizeof(*re), GFP_NOFS);
@@ -355,19 +353,20 @@
 	re->top = *top;
 	INIT_LIST_HEAD(&re->extctl);
 	spin_lock_init(&re->lock);
-	kref_init(&re->refcnt);
+	re->refcnt = 1;
 
 	/*
 	 * map block
 	 */
 	length = blocksize;
-	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
+	ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical, &length,
+			      &bbio, 0);
 	if (ret || !bbio || length < blocksize)
 		goto error;
 
-	if (bbio->num_stripes > MAX_MIRRORS) {
+	if (bbio->num_stripes > BTRFS_MAX_MIRRORS) {
 		printk(KERN_ERR "btrfs readahead: more than %d copies not "
-				"supported", MAX_MIRRORS);
+				"supported", BTRFS_MAX_MIRRORS);
 		goto error;
 	}
 
@@ -396,32 +395,67 @@
 	}
 
 	/* insert extent in reada_tree + all per-device trees, all or nothing */
+	btrfs_dev_replace_lock(&fs_info->dev_replace);
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
+	if (ret == -EEXIST) {
+		re_exist = radix_tree_lookup(&fs_info->reada_tree, index);
+		BUG_ON(!re_exist);
+		re_exist->refcnt++;
+		spin_unlock(&fs_info->reada_lock);
+		btrfs_dev_replace_unlock(&fs_info->dev_replace);
+		goto error;
+	}
 	if (ret) {
 		spin_unlock(&fs_info->reada_lock);
-		if (ret != -ENOMEM) {
-			/* someone inserted the extent in the meantime */
-			looped = 1;
-		}
+		btrfs_dev_replace_unlock(&fs_info->dev_replace);
 		goto error;
 	}
+	prev_dev = NULL;
+	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(
+			&fs_info->dev_replace);
 	for (i = 0; i < nzones; ++i) {
 		dev = bbio->stripes[i].dev;
+		if (dev == prev_dev) {
+			/*
+			 * in case of DUP, just add the first zone. As both
+			 * are on the same device, there's nothing to gain
+			 * from adding both.
+			 * Also, it wouldn't work, as the tree is per device
+			 * and adding would fail with EEXIST
+			 */
+			continue;
+		}
+		if (!dev->bdev) {
+			/* cannot read ahead on missing device */
+			continue;
+		}
+		if (dev_replace_is_ongoing &&
+		    dev == fs_info->dev_replace.tgtdev) {
+			/*
+			 * as this device is selected for reading only as
+			 * a last resort, skip it for read ahead.
+			 */
+			continue;
+		}
+		prev_dev = dev;
 		ret = radix_tree_insert(&dev->reada_extents, index, re);
 		if (ret) {
 			while (--i >= 0) {
 				dev = bbio->stripes[i].dev;
 				BUG_ON(dev == NULL);
+				/* ignore whether the entry was inserted */
 				radix_tree_delete(&dev->reada_extents, index);
 			}
 			BUG_ON(fs_info == NULL);
 			radix_tree_delete(&fs_info->reada_tree, index);
 			spin_unlock(&fs_info->reada_lock);
+			btrfs_dev_replace_unlock(&fs_info->dev_replace);
 			goto error;
 		}
 	}
 	spin_unlock(&fs_info->reada_lock);
+	btrfs_dev_replace_unlock(&fs_info->dev_replace);
 
 	kfree(bbio);
 	return re;
@@ -450,13 +484,7 @@
 	}
 	kfree(bbio);
 	kfree(re);
-	if (looped)
-		goto again;
-	return NULL;
-}
-
-static void reada_kref_dummy(struct kref *kr)
-{
+	return re_exist;
 }
 
 static void reada_extent_put(struct btrfs_fs_info *fs_info,
@@ -466,7 +494,7 @@
 	unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
 
 	spin_lock(&fs_info->reada_lock);
-	if (!kref_put(&re->refcnt, reada_kref_dummy)) {
+	if (--re->refcnt) {
 		spin_unlock(&fs_info->reada_lock);
 		return;
 	}
@@ -661,7 +689,7 @@
 		return 0;
 	}
 	dev->reada_next = re->logical + re->blocksize;
-	kref_get(&re->refcnt);
+	re->refcnt++;
 
 	spin_unlock(&fs_info->reada_lock);
 
@@ -708,13 +736,18 @@
 {
 	struct reada_machine_work *rmw;
 	struct btrfs_fs_info *fs_info;
+	int old_ioprio;
 
 	rmw = container_of(work, struct reada_machine_work, work);
 	fs_info = rmw->fs_info;
 
 	kfree(rmw);
 
+	old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
+				       task_nice_ioprio(current));
+	set_task_ioprio(current, BTRFS_IOPRIO_READA);
 	__reada_start_machine(fs_info);
+	set_task_ioprio(current, old_ioprio);
 }
 
 static void __reada_start_machine(struct btrfs_fs_info *fs_info)
@@ -904,7 +937,10 @@
 	generation = btrfs_header_generation(node);
 	free_extent_buffer(node);
 
-	reada_add_block(rc, start, &max_key, level, generation);
+	if (reada_add_block(rc, start, &max_key, level, generation)) {
+		kfree(rc);
+		return ERR_PTR(-ENOMEM);
+	}
 
 	reada_start_machine(root->fs_info);
 
@@ -919,10 +955,11 @@
 	while (atomic_read(&rc->elems)) {
 		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
 				   5 * HZ);
-		dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
+		dump_devs(rc->root->fs_info,
+			  atomic_read(&rc->elems) < 10 ? 1 : 0);
 	}
 
-	dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
+	dump_devs(rc->root->fs_info, atomic_read(&rc->elems) < 10 ? 1 : 0);
 
 	kref_put(&rc->refcnt, reada_control_release);
 
diff -ur a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
--- a/fs/btrfs/relocation.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/relocation.c	2014-02-17 11:56:58.000000000 +0100
@@ -326,6 +326,18 @@
 	return NULL;
 }
 
+static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
+{
+
+	struct btrfs_fs_info *fs_info = NULL;
+	struct backref_node *bnode = rb_entry(rb_node, struct backref_node,
+					      rb_node);
+	if (bnode->root)
+		fs_info = bnode->root->fs_info;
+	btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
+		    "found at offset %llu\n", bytenr);
+}
+
 /*
  * walk up backref nodes until reach node presents tree root
  */
@@ -452,7 +464,8 @@
 	rb_erase(&node->rb_node, &cache->rb_root);
 	node->bytenr = bytenr;
 	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
-	BUG_ON(rb_node);
+	if (rb_node)
+		backref_tree_panic(rb_node, -EEXIST, bytenr);
 }
 
 /*
@@ -575,7 +588,7 @@
 	else
 		key.offset = (u64)-1;
 
-	return btrfs_read_fs_root_no_name(fs_info, &key);
+	return btrfs_get_fs_root(fs_info, &key, false);
 }
 
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
@@ -605,10 +618,13 @@
 int find_inline_backref(struct extent_buffer *leaf, int slot,
 			unsigned long *ptr, unsigned long *end)
 {
+	struct btrfs_key key;
 	struct btrfs_extent_item *ei;
 	struct btrfs_tree_block_info *bi;
 	u32 item_size;
 
+	btrfs_item_key_to_cpu(leaf, &key, slot);
+
 	item_size = btrfs_item_size_nr(leaf, slot);
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 	if (item_size < sizeof(*ei)) {
@@ -620,13 +636,23 @@
 	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
 		  BTRFS_EXTENT_FLAG_TREE_BLOCK));
 
-	if (item_size <= sizeof(*ei) + sizeof(*bi)) {
+	if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+	    item_size <= sizeof(*ei) + sizeof(*bi)) {
 		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
 		return 1;
 	}
+	if (key.type == BTRFS_METADATA_ITEM_KEY &&
+	    item_size <= sizeof(*ei)) {
+		WARN_ON(item_size < sizeof(*ei));
+		return 1;
+	}
 
-	bi = (struct btrfs_tree_block_info *)(ei + 1);
-	*ptr = (unsigned long)(bi + 1);
+	if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+		bi = (struct btrfs_tree_block_info *)(ei + 1);
+		*ptr = (unsigned long)(bi + 1);
+	} else {
+		*ptr = (unsigned long)(ei + 1);
+	}
 	*end = (unsigned long)ei + item_size;
 	return 0;
 }
@@ -670,6 +696,7 @@
 	int cowonly;
 	int ret;
 	int err = 0;
+	bool need_check = true;
 
 	path1 = btrfs_alloc_path();
 	path2 = btrfs_alloc_path();
@@ -694,7 +721,7 @@
 	end = 0;
 	ptr = 0;
 	key.objectid = cur->bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.type = BTRFS_METADATA_ITEM_KEY;
 	key.offset = (u64)-1;
 
 	path1->search_commit_root = 1;
@@ -752,7 +779,8 @@
 				break;
 			}
 
-			if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+			if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+			    key.type == BTRFS_METADATA_ITEM_KEY) {
 				ret = find_inline_backref(eb, path1->slots[0],
 							  &ptr, &end);
 				if (ret)
@@ -892,6 +920,7 @@
 			cur->bytenr);
 
 		lower = cur;
+		need_check = true;
 		for (; level < BTRFS_MAX_LEVEL; level++) {
 			if (!path2->nodes[level]) {
 				BUG_ON(btrfs_root_bytenr(&root->root_item) !=
@@ -935,14 +964,12 @@
 
 				/*
 				 * add the block to pending list if we
-				 * need check its backrefs. only block
-				 * at 'cur->level + 1' is added to the
-				 * tail of pending list. this guarantees
-				 * we check backrefs from lower level
-				 * blocks to upper level blocks.
+				 * need check its backrefs, we only do this once
+				 * while walking up a tree as we will catch
+				 * anything else later on.
 				 */
-				if (!upper->checked &&
-				    level == cur->level + 1) {
+				if (!upper->checked && need_check) {
+					need_check = false;
 					list_add_tail(&edge->list[UPPER],
 						      &list);
 				} else
@@ -999,7 +1026,8 @@
 	if (!cowonly) {
 		rb_node = tree_insert(&cache->rb_root, node->bytenr,
 				      &node->rb_node);
-		BUG_ON(rb_node);
+		if (rb_node)
+			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
 		list_add_tail(&node->lower, &cache->leaves);
 	}
 
@@ -1034,7 +1062,9 @@
 		if (!cowonly) {
 			rb_node = tree_insert(&cache->rb_root, upper->bytenr,
 					      &upper->rb_node);
-			BUG_ON(rb_node);
+			if (rb_node)
+				backref_tree_panic(rb_node, -EEXIST,
+						   upper->bytenr);
 		}
 
 		list_add_tail(&edge->list[UPPER], &upper->lower);
@@ -1180,7 +1210,8 @@
 
 	rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
 			      &new_node->rb_node);
-	BUG_ON(rb_node);
+	if (rb_node)
+		backref_tree_panic(rb_node, -EEXIST, new_node->bytenr);
 
 	if (!new_node->lowest) {
 		list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
@@ -1203,14 +1234,15 @@
 /*
  * helper to add 'address of tree root -> reloc tree' mapping
  */
-static int __add_reloc_root(struct btrfs_root *root)
+static int __must_check __add_reloc_root(struct btrfs_root *root)
 {
 	struct rb_node *rb_node;
 	struct mapping_node *node;
 	struct reloc_control *rc = root->fs_info->reloc_ctl;
 
 	node = kmalloc(sizeof(*node), GFP_NOFS);
-	BUG_ON(!node);
+	if (!node)
+		return -ENOMEM;
 
 	node->bytenr = root->node->start;
 	node->data = root;
@@ -1219,7 +1251,13 @@
 	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
 			      node->bytenr, &node->rb_node);
 	spin_unlock(&rc->reloc_root_tree.lock);
-	BUG_ON(rb_node);
+	if (rb_node) {
+		btrfs_panic(root->fs_info, -EEXIST, "Duplicate root found "
+			    "for start=%llu while inserting into relocation "
+			    "tree\n", node->bytenr);
+		kfree(node);
+		return -EEXIST;
+	}
 
 	list_add_tail(&root->root_list, &rc->reloc_roots);
 	return 0;
@@ -1244,6 +1282,8 @@
 	}
 	spin_unlock(&rc->reloc_root_tree.lock);
 
+	if (!node)
+		return 0;
 	BUG_ON((struct btrfs_root *)node->data != root);
 
 	if (!del) {
@@ -1252,9 +1292,12 @@
 		rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
 				      node->bytenr, &node->rb_node);
 		spin_unlock(&rc->reloc_root_tree.lock);
-		BUG_ON(rb_node);
+		if (rb_node)
+			backref_tree_panic(rb_node, -EEXIST, node->bytenr);
 	} else {
+		spin_lock(&root->fs_info->trans_lock);
 		list_del_init(&root->root_list);
+		spin_unlock(&root->fs_info->trans_lock);
 		kfree(node);
 	}
 	return 0;
@@ -1267,6 +1310,7 @@
 	struct extent_buffer *eb;
 	struct btrfs_root_item *root_item;
 	struct btrfs_key root_key;
+	u64 last_snap = 0;
 	int ret;
 
 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
@@ -1282,6 +1326,7 @@
 				      BTRFS_TREE_RELOC_OBJECTID);
 		BUG_ON(ret);
 
+		last_snap = btrfs_root_last_snapshot(&root->root_item);
 		btrfs_set_root_last_snapshot(&root->root_item,
 					     trans->transid - 1);
 	} else {
@@ -1307,6 +1352,12 @@
 		memset(&root_item->drop_progress, 0,
 		       sizeof(struct btrfs_disk_key));
 		root_item->drop_level = 0;
+		/*
+		 * abuse rtransid, it is safe because it is impossible to
+		 * receive data into a relocation tree.
+		 */
+		btrfs_set_root_rtransid(root_item, last_snap);
+		btrfs_set_root_otransid(root_item, trans->transid);
 	}
 
 	btrfs_tree_unlock(eb);
@@ -1317,8 +1368,7 @@
 	BUG_ON(ret);
 	kfree(root_item);
 
-	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
-						 &root_key);
+	reloc_root = btrfs_read_fs_root(root->fs_info->tree_root, &root_key);
 	BUG_ON(IS_ERR(reloc_root));
 	reloc_root->last_trans = trans->transid;
 	return reloc_root;
@@ -1334,6 +1384,7 @@
 	struct btrfs_root *reloc_root;
 	struct reloc_control *rc = root->fs_info->reloc_ctl;
 	int clear_rsv = 0;
+	int ret;
 
 	if (root->reloc_root) {
 		reloc_root = root->reloc_root;
@@ -1353,7 +1404,8 @@
 	if (clear_rsv)
 		trans->block_rsv = NULL;
 
-	__add_reloc_root(reloc_root);
+	ret = __add_reloc_root(reloc_root);
+	BUG_ON(ret < 0);
 	root->reloc_root = reloc_root;
 	return 0;
 }
@@ -1496,7 +1548,7 @@
 	       btrfs_file_extent_other_encoding(leaf, fi));
 
 	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
-		ret = 1;
+		ret = -EINVAL;
 		goto out;
 	}
 
@@ -1527,7 +1579,7 @@
 	u64 end;
 	u32 nritems;
 	u32 i;
-	int ret;
+	int ret = 0;
 	int first = 1;
 	int dirty = 0;
 
@@ -1577,25 +1629,26 @@
 				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
 				end--;
 				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
-						      key.offset, end,
-						      GFP_NOFS);
+						      key.offset, end);
 				if (!ret)
 					continue;
 
 				btrfs_drop_extent_cache(inode, key.offset, end,
 							1);
 				unlock_extent(&BTRFS_I(inode)->io_tree,
-					      key.offset, end, GFP_NOFS);
+					      key.offset, end);
 			}
 		}
 
 		ret = get_new_location(rc->data_inode, &new_bytenr,
 				       bytenr, num_bytes);
-		if (ret > 0) {
-			WARN_ON(1);
-			continue;
+		if (ret) {
+			/*
+			 * Don't have to abort since we've not changed anything
+			 * in the file extent yet.
+			 */
+			break;
 		}
-		BUG_ON(ret < 0);
 
 		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
 		dirty = 1;
@@ -1604,19 +1657,25 @@
 		ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
 					   num_bytes, parent,
 					   btrfs_header_owner(leaf),
-					   key.objectid, key.offset);
-		BUG_ON(ret);
+					   key.objectid, key.offset, 1);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			break;
+		}
 
 		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
 					parent, btrfs_header_owner(leaf),
-					key.objectid, key.offset);
-		BUG_ON(ret);
+					key.objectid, key.offset, 1);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			break;
+		}
 	}
 	if (dirty)
 		btrfs_mark_buffer_dirty(leaf);
 	if (inode)
 		btrfs_add_delayed_iput(inode);
-	return 0;
+	return ret;
 }
 
 static noinline_for_stack
@@ -1731,7 +1790,11 @@
 
 			eb = read_tree_block(dest, old_bytenr, blocksize,
 					     old_ptr_gen);
-			BUG_ON(!eb);
+			if (!eb || !extent_buffer_uptodate(eb)) {
+				ret = (!eb) ? -ENOMEM : -EIO;
+				free_extent_buffer(eb);
+				break;
+			}
 			btrfs_tree_lock(eb);
 			if (cow) {
 				ret = btrfs_cow_block(trans, dest, eb, parent,
@@ -1778,21 +1841,23 @@
 
 		ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
 					path->nodes[level]->start,
-					src->root_key.objectid, level - 1, 0);
+					src->root_key.objectid, level - 1, 0,
+					1);
 		BUG_ON(ret);
 		ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
 					0, dest->root_key.objectid, level - 1,
-					0);
+					0, 1);
 		BUG_ON(ret);
 
 		ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
 					path->nodes[level]->start,
-					src->root_key.objectid, level - 1, 0);
+					src->root_key.objectid, level - 1, 0,
+					1);
 		BUG_ON(ret);
 
 		ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
 					0, dest->root_key.objectid, level - 1,
-					0);
+					0, 1);
 		BUG_ON(ret);
 
 		btrfs_unlock_up_safe(path, 0);
@@ -1882,6 +1947,10 @@
 		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
 		blocksize = btrfs_level_size(root, i - 1);
 		eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
+		if (!eb || !extent_buffer_uptodate(eb)) {
+			free_extent_buffer(eb);
+			return -EIO;
+		}
 		BUG_ON(btrfs_header_level(eb) != i - 1);
 		path->nodes[i - 1] = eb;
 		path->slots[i - 1] = 0;
@@ -1954,9 +2023,9 @@
 		}
 
 		/* the lock_extent waits for readpage to complete */
-		lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
 		btrfs_drop_extent_cache(inode, start, end, 1);
-		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
 	}
 	return 0;
 }
@@ -1994,7 +2063,6 @@
 	struct btrfs_root_item *root_item;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
-	unsigned long nr;
 	int level;
 	int max_level;
 	int replaced = 0;
@@ -2043,7 +2111,8 @@
 		BUG_ON(IS_ERR(trans));
 		trans->block_rsv = rc->block_rsv;
 
-		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
+		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
+					     BTRFS_RESERVE_FLUSH_ALL);
 		if (ret) {
 			BUG_ON(ret != -EAGAIN);
 			ret = btrfs_commit_transaction(trans, root);
@@ -2094,10 +2163,9 @@
 			       path->slots[level]);
 		root_item->drop_level = level;
 
-		nr = trans->blocks_used;
 		btrfs_end_transaction_throttle(trans, root);
 
-		btrfs_btree_balance_dirty(root, nr);
+		btrfs_btree_balance_dirty(root);
 
 		if (replaced && rc->stage == UPDATE_DATA_PTRS)
 			invalidate_extent_cache(root, &key, &next_key);
@@ -2124,10 +2192,9 @@
 		btrfs_update_reloc_root(trans, root);
 	}
 
-	nr = trans->blocks_used;
 	btrfs_end_transaction_throttle(trans, root);
 
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 
 	if (replaced && rc->stage == UPDATE_DATA_PTRS)
 		invalidate_extent_cache(root, &key, &next_key);
@@ -2153,7 +2220,8 @@
 again:
 	if (!err) {
 		num_bytes = rc->merging_rsv_size;
-		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
+		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+					  BTRFS_RESERVE_FLUSH_ALL);
 		if (ret)
 			err = ret;
 	}
@@ -2208,13 +2276,32 @@
 }
 
 static noinline_for_stack
+void free_reloc_roots(struct list_head *list)
+{
+	struct btrfs_root *reloc_root;
+
+	while (!list_empty(list)) {
+		reloc_root = list_entry(list->next, struct btrfs_root,
+					root_list);
+		__update_reloc_root(reloc_root, 1);
+		free_extent_buffer(reloc_root->node);
+		free_extent_buffer(reloc_root->commit_root);
+		kfree(reloc_root);
+	}
+}
+
+static noinline_for_stack
 int merge_reloc_roots(struct reloc_control *rc)
 {
+	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root;
 	struct btrfs_root *reloc_root;
+	u64 last_snap;
+	u64 otransid;
+	u64 objectid;
 	LIST_HEAD(reloc_roots);
 	int found = 0;
-	int ret;
+	int ret = 0;
 again:
 	root = rc->extent_root;
 
@@ -2240,19 +2327,67 @@
 			BUG_ON(root->reloc_root != reloc_root);
 
 			ret = merge_reloc_root(rc, root);
-			BUG_ON(ret);
+			if (ret) {
+				__update_reloc_root(reloc_root, 1);
+				free_extent_buffer(reloc_root->node);
+				free_extent_buffer(reloc_root->commit_root);
+				kfree(reloc_root);
+				goto out;
+			}
 		} else {
 			list_del_init(&reloc_root->root_list);
 		}
-		btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
+
+		/*
+		 * we keep the old last snapshot transid in rtransid when we
+		 * created the relocation tree.
+		 */
+		last_snap = btrfs_root_rtransid(&reloc_root->root_item);
+		otransid = btrfs_root_otransid(&reloc_root->root_item);
+		objectid = reloc_root->root_key.offset;
+
+		ret = btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0, 1);
+		if (ret < 0) {
+			if (list_empty(&reloc_root->root_list))
+				list_add_tail(&reloc_root->root_list,
+					      &reloc_roots);
+			goto out;
+		} else if (!ret) {
+			/*
+			 * recover the last snapshot transid to avoid
+			 * the space balance breaking NOCOW.
+			 */
+			root = read_fs_root(rc->extent_root->fs_info,
+					    objectid);
+			if (IS_ERR(root))
+				continue;
+
+			trans = btrfs_join_transaction(root);
+			BUG_ON(IS_ERR(trans));
+
+			/* Check if the fs/file tree was snapshotted or not. */
+			if (btrfs_root_last_snapshot(&root->root_item) ==
+			    otransid - 1)
+				btrfs_set_root_last_snapshot(&root->root_item,
+							     last_snap);
+				
+			btrfs_end_transaction(trans, root);
+		}
 	}
 
 	if (found) {
 		found = 0;
 		goto again;
 	}
+out:
+	if (ret) {
+		btrfs_std_error(root->fs_info, ret);
+		if (!list_empty(&reloc_roots))
+			free_reloc_roots(&reloc_roots);
+	}
+
 	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
-	return 0;
+	return ret;
 }
 
 static void free_block_list(struct rb_root *blocks)
@@ -2427,7 +2562,8 @@
 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
 
 	trans->block_rsv = rc->block_rsv;
-	ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
+	ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes,
+				  BTRFS_RESERVE_FLUSH_ALL);
 	if (ret) {
 		if (ret == -EAGAIN)
 			rc->commit_transaction = 1;
@@ -2530,7 +2666,8 @@
 		blocksize = btrfs_level_size(root, node->level);
 		generation = btrfs_node_ptr_generation(upper->eb, slot);
 		eb = read_tree_block(root, bytenr, blocksize, generation);
-		if (!eb) {
+		if (!eb || !extent_buffer_uptodate(eb)) {
+			free_extent_buffer(eb);
 			err = -EIO;
 			goto next;
 		}
@@ -2558,7 +2695,7 @@
 						node->eb->start, blocksize,
 						upper->eb->start,
 						btrfs_header_owner(upper->eb),
-						node->level, 0);
+						node->level, 0, 1);
 			BUG_ON(ret);
 
 			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
@@ -2691,7 +2828,10 @@
 	BUG_ON(block->key_ready);
 	eb = read_tree_block(rc->extent_root, block->bytenr,
 			     block->key.objectid, block->key.offset);
-	BUG_ON(!eb);
+	if (!eb || !extent_buffer_uptodate(eb)) {
+		free_extent_buffer(eb);
+		return -EIO;
+	}
 	WARN_ON(btrfs_header_level(eb) != block->level);
 	if (block->level == 0)
 		btrfs_item_key_to_cpu(eb, &block->key, 0);
@@ -2706,8 +2846,13 @@
 			    struct tree_block *block)
 {
 	BUG_ON(block->key_ready);
-	readahead_tree_block(rc->extent_root, block->bytenr,
-			     block->key.objectid, block->key.offset);
+	if (block->key.type == BTRFS_METADATA_ITEM_KEY)
+		readahead_tree_block(rc->extent_root, block->bytenr,
+				     block->key.objectid,
+				     rc->extent_root->leafsize);
+	else
+		readahead_tree_block(rc->extent_root, block->bytenr,
+				     block->key.objectid, block->key.offset);
 	return 0;
 }
 
@@ -2786,8 +2931,10 @@
 	int err = 0;
 
 	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	if (!path) {
+		err = -ENOMEM;
+		goto out_free_blocks;
+	}
 
 	rb_node = rb_first(blocks);
 	while (rb_node) {
@@ -2800,8 +2947,11 @@
 	rb_node = rb_first(blocks);
 	while (rb_node) {
 		block = rb_entry(rb_node, struct tree_block, rb_node);
-		if (!block->key_ready)
-			get_tree_block_key(rc, block);
+		if (!block->key_ready) {
+			err = get_tree_block_key(rc, block);
+			if (err)
+				goto out_free_path;
+		}
 		rb_node = rb_next(rb_node);
 	}
 
@@ -2826,10 +2976,12 @@
 		rb_node = rb_next(rb_node);
 	}
 out:
-	free_block_list(blocks);
 	err = finish_pending_nodes(trans, rc, path, err);
 
+out_free_path:
 	btrfs_free_path(path);
+out_free_blocks:
+	free_block_list(blocks);
 	return err;
 }
 
@@ -2860,12 +3012,12 @@
 		else
 			end = cluster->end - offset;
 
-		lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		lock_extent(&BTRFS_I(inode)->io_tree, start, end);
 		num_bytes = end + 1 - start;
 		ret = btrfs_prealloc_file_range(inode, 0, start,
 						num_bytes, num_bytes,
 						end + 1, &alloc_hint);
-		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
 		if (ret)
 			break;
 		nr++;
@@ -2897,10 +3049,10 @@
 	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+	lock_extent(&BTRFS_I(inode)->io_tree, start, end);
 	while (1) {
 		write_lock(&em_tree->lock);
-		ret = add_extent_mapping(em_tree, em);
+		ret = add_extent_mapping(em_tree, em, 0);
 		write_unlock(&em_tree->lock);
 		if (ret != -EEXIST) {
 			free_extent_map(em);
@@ -2908,7 +3060,7 @@
 		}
 		btrfs_drop_extent_cache(inode, start, end, 0);
 	}
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
 	return ret;
 }
 
@@ -2947,9 +3099,7 @@
 	index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
 	last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
 	while (index <= last_index) {
-		mutex_lock(&inode->i_mutex);
 		ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
-		mutex_unlock(&inode->i_mutex);
 		if (ret)
 			goto out;
 
@@ -2987,11 +3137,10 @@
 			}
 		}
 
-		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+		page_start = page_offset(page);
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 
-		lock_extent(&BTRFS_I(inode)->io_tree,
-			    page_start, page_end, GFP_NOFS);
+		lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end);
 
 		set_page_extent_mapped(page);
 
@@ -3007,7 +3156,7 @@
 		set_page_dirty(page);
 
 		unlock_extent(&BTRFS_I(inode)->io_tree,
-			      page_start, page_end, GFP_NOFS);
+			      page_start, page_end);
 		unlock_page(page);
 		page_cache_release(page);
 
@@ -3114,12 +3263,17 @@
 	eb =  path->nodes[0];
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
 
-	if (item_size >= sizeof(*ei) + sizeof(*bi)) {
+	if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
+	    item_size >= sizeof(*ei) + sizeof(*bi)) {
 		ei = btrfs_item_ptr(eb, path->slots[0],
 				struct btrfs_extent_item);
-		bi = (struct btrfs_tree_block_info *)(ei + 1);
+		if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
+			bi = (struct btrfs_tree_block_info *)(ei + 1);
+			level = btrfs_tree_block_level(eb, bi);
+		} else {
+			level = (int)extent_key->offset;
+		}
 		generation = btrfs_extent_generation(eb, ei);
-		level = btrfs_tree_block_level(eb, bi);
 	} else {
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
 		u64 ref_owner;
@@ -3148,13 +3302,14 @@
 		return -ENOMEM;
 
 	block->bytenr = extent_key->objectid;
-	block->key.objectid = extent_key->offset;
+	block->key.objectid = rc->extent_root->leafsize;
 	block->key.offset = generation;
 	block->level = level;
 	block->key_ready = 0;
 
 	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
-	BUG_ON(rb_node);
+	if (rb_node)
+		backref_tree_panic(rb_node, -EEXIST, block->bytenr);
 
 	return 0;
 }
@@ -3169,6 +3324,8 @@
 	struct btrfs_path *path;
 	struct btrfs_key key;
 	int ret;
+	bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
+					SKINNY_METADATA);
 
 	if (tree_block_processed(bytenr, blocksize, rc))
 		return 0;
@@ -3179,19 +3336,42 @@
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-
+again:
 	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = blocksize;
+	if (skinny) {
+		key.type = BTRFS_METADATA_ITEM_KEY;
+		key.offset = (u64)-1;
+	} else {
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = blocksize;
+	}
 
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
 	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
+
+	if (ret > 0 && skinny) {
+		if (path->slots[0]) {
+			path->slots[0]--;
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0]);
+			if (key.objectid == bytenr &&
+			    (key.type == BTRFS_METADATA_ITEM_KEY ||
+			     (key.type == BTRFS_EXTENT_ITEM_KEY &&
+			      key.offset == blocksize)))
+				ret = 0;
+		}
+
+		if (ret) {
+			skinny = false;
+			btrfs_release_path(path);
+			goto again;
+		}
+	}
 	BUG_ON(ret);
 
-	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
 	ret = add_tree_block(rc, &key, path, blocks);
 out:
 	btrfs_free_path(path);
@@ -3212,7 +3392,8 @@
 		return 1;
 
 	ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
-				       eb->start, eb->len, NULL, &flags);
+				       eb->start, btrfs_header_level(eb), 1,
+				       NULL, &flags);
 	BUG_ON(ret);
 
 	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
@@ -3229,7 +3410,6 @@
 	struct btrfs_path *path;
 	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_trans_handle *trans;
-	unsigned long nr;
 	int ret = 0;
 
 	if (inode)
@@ -3240,13 +3420,18 @@
 	key.offset = 0;
 
 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
-	if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) {
-		if (inode && !IS_ERR(inode))
+	if (IS_ERR(inode) || is_bad_inode(inode)) {
+		if (!IS_ERR(inode))
 			iput(inode);
 		return -ENOENT;
 	}
 
 truncate:
+	ret = btrfs_check_trunc_cache_free_space(root,
+						 &fs_info->global_block_rsv);
+	if (ret)
+		goto out;
+
 	path = btrfs_alloc_path();
 	if (!path) {
 		ret = -ENOMEM;
@@ -3263,9 +3448,8 @@
 	ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
 
 	btrfs_free_path(path);
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 out:
 	iput(inode);
 	return ret;
@@ -3426,7 +3610,9 @@
 			block->key_ready = 1;
 			rb_node = tree_insert(blocks, block->bytenr,
 					      &block->rb_node);
-			BUG_ON(rb_node);
+			if (rb_node)
+				backref_tree_panic(rb_node, -EEXIST,
+						   block->bytenr);
 		}
 		if (counted)
 			added = 1;
@@ -3442,7 +3628,7 @@
 }
 
 /*
- * hepler to find all tree blocks that reference a given data extent
+ * helper to find all tree blocks that reference a given data extent
  */
 static noinline_for_stack
 int add_data_references(struct reloc_control *rc,
@@ -3457,7 +3643,7 @@
 	unsigned long ptr;
 	unsigned long end;
 	u32 blocksize = btrfs_level_size(rc->extent_root, 0);
-	int ret;
+	int ret = 0;
 	int err = 0;
 
 	eb = path->nodes[0];
@@ -3484,6 +3670,10 @@
 		} else {
 			BUG();
 		}
+		if (ret) {
+			err = ret;
+			goto out;
+		}
 		ptr += btrfs_extent_inline_ref_size(key.type);
 	}
 	WARN_ON(ptr > end);
@@ -3529,6 +3719,7 @@
 		}
 		path->slots[0]++;
 	}
+out:
 	btrfs_release_path(path);
 	if (err)
 		free_block_list(blocks);
@@ -3536,7 +3727,7 @@
 }
 
 /*
- * hepler to find next unprocessed extent
+ * helper to find next unprocessed extent
  */
 static noinline_for_stack
 int find_next_extent(struct btrfs_trans_handle *trans,
@@ -3581,21 +3772,38 @@
 			break;
 		}
 
-		if (key.type != BTRFS_EXTENT_ITEM_KEY ||
+		if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+		    key.type != BTRFS_METADATA_ITEM_KEY) {
+			path->slots[0]++;
+			goto next;
+		}
+
+		if (key.type == BTRFS_EXTENT_ITEM_KEY &&
 		    key.objectid + key.offset <= rc->search_start) {
 			path->slots[0]++;
 			goto next;
 		}
 
+		if (key.type == BTRFS_METADATA_ITEM_KEY &&
+		    key.objectid + rc->extent_root->leafsize <=
+		    rc->search_start) {
+			path->slots[0]++;
+			goto next;
+		}
+
 		ret = find_first_extent_bit(&rc->processed_blocks,
 					    key.objectid, &start, &end,
-					    EXTENT_DIRTY);
+					    EXTENT_DIRTY, NULL);
 
 		if (ret == 0 && start <= key.objectid) {
 			btrfs_release_path(path);
 			rc->search_start = end + 1;
 		} else {
-			rc->search_start = key.objectid + key.offset;
+			if (key.type == BTRFS_EXTENT_ITEM_KEY)
+				rc->search_start = key.objectid + key.offset;
+			else
+				rc->search_start = key.objectid +
+					rc->extent_root->leafsize;
 			memcpy(extent_key, &key, sizeof(key));
 			return 0;
 		}
@@ -3642,7 +3850,8 @@
 	struct btrfs_trans_handle *trans;
 	int ret;
 
-	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+	rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root,
+					      BTRFS_BLOCK_RSV_TEMP);
 	if (!rc->block_rsv)
 		return -ENOMEM;
 
@@ -3652,7 +3861,8 @@
 	 * is no reservation in transaction handle.
 	 */
 	ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
-				  rc->extent_root->nodesize * 256);
+				  rc->extent_root->nodesize * 256,
+				  BTRFS_RESERVE_FLUSH_ALL);
 	if (ret)
 		return ret;
 
@@ -3666,7 +3876,15 @@
 	set_reloc_control(rc);
 
 	trans = btrfs_join_transaction(rc->extent_root);
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans)) {
+		unset_reloc_control(rc);
+		/*
+		 * extent tree is not a ref_cow tree and has no reloc_root to
+		 * cleanup.  And callers are responsible to free the above
+		 * block rsv.
+		 */
+		return PTR_ERR(trans);
+	}
 	btrfs_commit_transaction(trans, rc->extent_root);
 	return 0;
 }
@@ -3678,7 +3896,6 @@
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_path *path;
 	struct btrfs_extent_item *ei;
-	unsigned long nr;
 	u64 flags;
 	u32 item_size;
 	int ret;
@@ -3699,7 +3916,11 @@
 	while (1) {
 		progress++;
 		trans = btrfs_start_transaction(rc->extent_root, 0);
-		BUG_ON(IS_ERR(trans));
+		if (IS_ERR(trans)) {
+			err = PTR_ERR(trans);
+			trans = NULL;
+			break;
+		}
 restart:
 		if (update_backref_cache(trans, &rc->backref_cache)) {
 			btrfs_end_transaction(trans, rc->extent_root);
@@ -3782,7 +4003,7 @@
 
 		ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
 		if (ret < 0) {
-			if (ret != -EAGAIN) {
+			if (ret != -ENOSPC) {
 				err = ret;
 				WARN_ON(1);
 				break;
@@ -3795,9 +4016,8 @@
 			ret = btrfs_commit_transaction(trans, rc->extent_root);
 			BUG_ON(ret);
 		} else {
-			nr = trans->blocks_used;
 			btrfs_end_transaction_throttle(trans, rc->extent_root);
-			btrfs_btree_balance_dirty(rc->extent_root, nr);
+			btrfs_btree_balance_dirty(rc->extent_root);
 		}
 		trans = NULL;
 
@@ -3827,9 +4047,8 @@
 			  GFP_NOFS);
 
 	if (trans) {
-		nr = trans->blocks_used;
 		btrfs_end_transaction_throttle(trans, rc->extent_root);
-		btrfs_btree_balance_dirty(rc->extent_root, nr);
+		btrfs_btree_balance_dirty(rc->extent_root);
 	}
 
 	if (!err) {
@@ -3908,7 +4127,6 @@
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root;
 	struct btrfs_key key;
-	unsigned long nr;
 	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
 	int err = 0;
 
@@ -3936,9 +4154,8 @@
 
 	err = btrfs_orphan_add(trans, inode);
 out:
-	nr = trans->blocks_used;
 	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
+	btrfs_btree_balance_dirty(root);
 	if (err) {
 		if (inode)
 			iput(inode);
@@ -3947,7 +4164,7 @@
 	return inode;
 }
 
-static struct reloc_control *alloc_reloc_control(void)
+static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
 {
 	struct reloc_control *rc;
 
@@ -3958,7 +4175,8 @@
 	INIT_LIST_HEAD(&rc->reloc_roots);
 	backref_cache_init(&rc->backref_cache);
 	mapping_tree_init(&rc->reloc_root_tree);
-	extent_io_tree_init(&rc->processed_blocks, NULL);
+	extent_io_tree_init(&rc->processed_blocks,
+			    fs_info->btree_inode->i_mapping);
 	return rc;
 }
 
@@ -3975,7 +4193,7 @@
 	int rw = 0;
 	int err = 0;
 
-	rc = alloc_reloc_control();
+	rc = alloc_reloc_control(fs_info);
 	if (!rc)
 		return -ENOMEM;
 
@@ -4021,18 +4239,18 @@
 	}
 
 	printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
-	       (unsigned long long)rc->block_group->key.objectid,
-	       (unsigned long long)rc->block_group->flags);
+	       rc->block_group->key.objectid, rc->block_group->flags);
 
-	btrfs_start_delalloc_inodes(fs_info->tree_root, 0);
-	btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
+	ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	btrfs_wait_all_ordered_extents(fs_info);
 
 	while (1) {
 		mutex_lock(&fs_info->cleaner_mutex);
-
-		btrfs_clean_old_snapshots(fs_info->tree_root);
 		ret = relocate_block_group(rc);
-
 		mutex_unlock(&fs_info->cleaner_mutex);
 		if (ret < 0) {
 			err = ret;
@@ -4043,7 +4261,7 @@
 			break;
 
 		printk(KERN_INFO "btrfs: found %llu extents\n",
-			(unsigned long long)rc->extents_found);
+			rc->extents_found);
 
 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
 			btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
@@ -4073,10 +4291,11 @@
 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
 {
 	struct btrfs_trans_handle *trans;
-	int ret;
+	int ret, err;
 
 	trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
 
 	memset(&root->root_item.drop_progress, 0,
 		sizeof(root->root_item.drop_progress));
@@ -4084,11 +4303,11 @@
 	btrfs_set_root_refs(&root->root_item, 0);
 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
 				&root->root_key, &root->root_item);
-	BUG_ON(ret);
 
-	ret = btrfs_end_transaction(trans, root->fs_info->tree_root);
-	BUG_ON(ret);
-	return 0;
+	err = btrfs_end_transaction(trans, root->fs_info->tree_root);
+	if (err)
+		return err;
+	return ret;
 }
 
 /*
@@ -4139,7 +4358,7 @@
 		    key.type != BTRFS_ROOT_ITEM_KEY)
 			break;
 
-		reloc_root = btrfs_read_fs_root_no_radix(root, &key);
+		reloc_root = btrfs_read_fs_root(root, &key);
 		if (IS_ERR(reloc_root)) {
 			err = PTR_ERR(reloc_root);
 			goto out;
@@ -4156,7 +4375,11 @@
 					err = ret;
 					goto out;
 				}
-				mark_garbage_root(reloc_root);
+				ret = mark_garbage_root(reloc_root);
+				if (ret < 0) {
+					err = ret;
+					goto out;
+				}
 			}
 		}
 
@@ -4170,7 +4393,7 @@
 	if (list_empty(&reloc_roots))
 		goto out;
 
-	rc = alloc_reloc_control();
+	rc = alloc_reloc_control(root->fs_info);
 	if (!rc) {
 		err = -ENOMEM;
 		goto out;
@@ -4202,13 +4425,19 @@
 
 		fs_root = read_fs_root(root->fs_info,
 				       reloc_root->root_key.offset);
-		BUG_ON(IS_ERR(fs_root));
+		if (IS_ERR(fs_root)) {
+			err = PTR_ERR(fs_root);
+			goto out_free;
+		}
 
-		__add_reloc_root(reloc_root);
+		err = __add_reloc_root(reloc_root);
+		BUG_ON(err < 0); /* -ENOMEM or logic error */
 		fs_root->reloc_root = reloc_root;
 	}
 
-	btrfs_commit_transaction(trans, rc->extent_root);
+	err = btrfs_commit_transaction(trans, rc->extent_root);
+	if (err)
+		goto out_free;
 
 	merge_reloc_roots(rc);
 
@@ -4218,18 +4447,13 @@
 	if (IS_ERR(trans))
 		err = PTR_ERR(trans);
 	else
-		btrfs_commit_transaction(trans, rc->extent_root);
+		err = btrfs_commit_transaction(trans, rc->extent_root);
 out_free:
 	kfree(rc);
 out:
-	while (!list_empty(&reloc_roots)) {
-		reloc_root = list_entry(reloc_roots.next,
-					struct btrfs_root, root_list);
-		list_del(&reloc_root->root_list);
-		free_extent_buffer(reloc_root->node);
-		free_extent_buffer(reloc_root->commit_root);
-		kfree(reloc_root);
-	}
+	if (!list_empty(&reloc_roots))
+		free_reloc_roots(&reloc_roots);
+
 	btrfs_free_path(path);
 
 	if (err == 0) {
@@ -4253,10 +4477,8 @@
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
 {
 	struct btrfs_ordered_sum *sums;
-	struct btrfs_sector_sum *sector_sum;
 	struct btrfs_ordered_extent *ordered;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	size_t offset;
 	int ret;
 	u64 disk_bytenr;
 	LIST_HEAD(list);
@@ -4267,40 +4489,37 @@
 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
 				       disk_bytenr + len - 1, &list, 0);
+	if (ret)
+		goto out;
 
+	disk_bytenr = ordered->start;
 	while (!list_empty(&list)) {
 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
 		list_del_init(&sums->list);
 
-		sector_sum = sums->sums;
-		sums->bytenr = ordered->start;
-
-		offset = 0;
-		while (offset < sums->len) {
-			sector_sum->bytenr += ordered->start - disk_bytenr;
-			sector_sum++;
-			offset += root->sectorsize;
-		}
+		sums->bytenr = disk_bytenr;
+		disk_bytenr += sums->len;
 
 		btrfs_add_ordered_sum(inode, ordered, sums);
 	}
+out:
 	btrfs_put_ordered_extent(ordered);
 	return ret;
 }
 
-void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, struct extent_buffer *buf,
-			   struct extent_buffer *cow)
+int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root, struct extent_buffer *buf,
+			  struct extent_buffer *cow)
 {
 	struct reloc_control *rc;
 	struct backref_node *node;
 	int first_cow = 0;
 	int level;
-	int ret;
+	int ret = 0;
 
 	rc = root->fs_info->reloc_ctl;
 	if (!rc)
-		return;
+		return 0;
 
 	BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
 	       root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
@@ -4336,10 +4555,9 @@
 			rc->nodes_relocated += buf->len;
 	}
 
-	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
+	if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
 		ret = replace_file_extents(trans, rc, root, cow);
-		BUG_ON(ret);
-	}
+	return ret;
 }
 
 /*
@@ -4380,7 +4598,7 @@
  * called after snapshot is created. migrate block reservation
  * and create reloc root for the newly created snapshot
  */
-void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
 			       struct btrfs_pending_snapshot *pending)
 {
 	struct btrfs_root *root = pending->root;
@@ -4390,7 +4608,7 @@
 	int ret;
 
 	if (!root->reloc_root)
-		return;
+		return 0;
 
 	rc = root->fs_info->reloc_ctl;
 	rc->merging_rsv_size += rc->nodes_relocated;
@@ -4399,18 +4617,21 @@
 		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
 					      rc->block_rsv,
 					      rc->nodes_relocated);
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 	}
 
 	new_root = pending->snap;
 	reloc_root = create_reloc_root(trans, root->reloc_root,
 				       new_root->root_key.objectid);
+	if (IS_ERR(reloc_root))
+		return PTR_ERR(reloc_root);
 
-	__add_reloc_root(reloc_root);
+	ret = __add_reloc_root(reloc_root);
+	BUG_ON(ret < 0);
 	new_root->reloc_root = reloc_root;
 
-	if (rc->create_reloc_tree) {
+	if (rc->create_reloc_tree)
 		ret = clone_backref_node(trans, rc, root, reloc_root);
-		BUG_ON(ret);
-	}
+	return ret;
 }
diff -ur a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
--- a/fs/btrfs/root-tree.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/root-tree.c	2014-02-17 11:56:58.000000000 +0100
@@ -16,58 +16,107 @@
  * Boston, MA 021110-1307, USA.
  */
 
+#include <linux/uuid.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
 #include "print-tree.h"
 
 /*
- * lookup the root with the highest offset for a given objectid.  The key we do
- * find is copied into 'key'.  If we find something return 0, otherwise 1, < 0
- * on error.
+ * Read a root item from the tree. In case we detect a root item smaller than
+ * sizeof(root_item), we know it's an old version of the root structure and
+ * initialize all new fields to zero. The same happens if we detect mismatching
+ * generation numbers as then we know the root was once mounted with an older
+ * kernel that was not aware of the root item structure change.
  */
-int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
-			struct btrfs_root_item *item, struct btrfs_key *key)
+static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
+				struct btrfs_root_item *item)
+{
+	uuid_le uuid;
+	int len;
+	int need_reset = 0;
+
+	len = btrfs_item_size_nr(eb, slot);
+	read_extent_buffer(eb, item, btrfs_item_ptr_offset(eb, slot),
+			min_t(int, len, (int)sizeof(*item)));
+	if (len < sizeof(*item))
+		need_reset = 1;
+	if (!need_reset && btrfs_root_generation(item)
+		!= btrfs_root_generation_v2(item)) {
+		if (btrfs_root_generation_v2(item) != 0) {
+			printk(KERN_WARNING "btrfs: mismatching "
+					"generation and generation_v2 "
+					"found in root item. This root "
+					"was probably mounted with an "
+					"older kernel. Resetting all "
+					"new fields.\n");
+		}
+		need_reset = 1;
+	}
+	if (need_reset) {
+		memset(&item->generation_v2, 0,
+			sizeof(*item) - offsetof(struct btrfs_root_item,
+					generation_v2));
+
+		uuid_le_gen(&uuid);
+		memcpy(item->uuid, uuid.b, BTRFS_UUID_SIZE);
+	}
+}
+
+/*
+ * btrfs_find_root - lookup the root by the key.
+ * root: the root of the root tree
+ * search_key: the key to search
+ * path: the path we search
+ * root_item: the root item of the tree we look for
+ * root_key: the real key of the tree we look for
+ *
+ * If ->offset of 'search_key' is -1ULL, it means we are not sure of the offset
+ * of the search key, so just look up the root with the highest offset for a
+ * given objectid.
+ *
+ * If we find something return 0, otherwise > 0, < 0 on error.
+ */
+int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
+		    struct btrfs_path *path, struct btrfs_root_item *root_item,
+		    struct btrfs_key *root_key)
 {
-	struct btrfs_path *path;
-	struct btrfs_key search_key;
 	struct btrfs_key found_key;
 	struct extent_buffer *l;
 	int ret;
 	int slot;
 
-	search_key.objectid = objectid;
-	search_key.type = BTRFS_ROOT_ITEM_KEY;
-	search_key.offset = (u64)-1;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, root, search_key, path, 0, 0);
 	if (ret < 0)
-		goto out;
+		return ret;
 
-	BUG_ON(ret == 0);
-	if (path->slots[0] == 0) {
-		ret = 1;
-		goto out;
+	if (search_key->offset != -1ULL) {	/* the search key is exact */
+		if (ret > 0)
+			goto out;
+	} else {
+		BUG_ON(ret == 0);		/* Logical error */
+		if (path->slots[0] == 0)
+			goto out;
+		path->slots[0]--;
+		ret = 0;
 	}
+
 	l = path->nodes[0];
-	slot = path->slots[0] - 1;
+	slot = path->slots[0];
+
 	btrfs_item_key_to_cpu(l, &found_key, slot);
-	if (found_key.objectid != objectid ||
+	if (found_key.objectid != search_key->objectid ||
 	    found_key.type != BTRFS_ROOT_ITEM_KEY) {
 		ret = 1;
 		goto out;
 	}
-	if (item)
-		read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot),
-				   sizeof(*item));
-	if (key)
-		memcpy(key, &found_key, sizeof(found_key));
-	ret = 0;
+
+	if (root_item)
+		btrfs_read_root_item(l, slot, root_item);
+	if (root_key)
+		memcpy(root_key, &found_key, sizeof(found_key));
 out:
-	btrfs_free_path(path);
+	btrfs_release_path(path);
 	return ret;
 }
 
@@ -91,120 +140,84 @@
 	int ret;
 	int slot;
 	unsigned long ptr;
+	int old_len;
 
 	path = btrfs_alloc_path();
-	BUG_ON(!path);
+	if (!path)
+		return -ENOMEM;
+
 	ret = btrfs_search_slot(trans, root, key, path, 0, 1);
-	if (ret < 0)
+	if (ret < 0) {
+		btrfs_abort_transaction(trans, root, ret);
 		goto out;
+	}
 
 	if (ret != 0) {
 		btrfs_print_leaf(root, path->nodes[0]);
 		printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
-		       (unsigned long long)key->objectid, key->type,
-		       (unsigned long long)key->offset);
+		       key->objectid, key->type, key->offset);
 		BUG_ON(1);
 	}
 
 	l = path->nodes[0];
 	slot = path->slots[0];
 	ptr = btrfs_item_ptr_offset(l, slot);
-	write_extent_buffer(l, item, ptr, sizeof(*item));
-	btrfs_mark_buffer_dirty(path->nodes[0]);
-out:
-	btrfs_free_path(path);
-	return ret;
-}
-
-int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
-		      *root, struct btrfs_key *key, struct btrfs_root_item
-		      *item)
-{
-	int ret;
-	ret = btrfs_insert_item(trans, root, key, item, sizeof(*item));
-	return ret;
-}
-
-/*
- * at mount time we want to find all the old transaction snapshots that were in
- * the process of being deleted if we crashed.  This is any root item with an
- * offset lower than the latest root.  They need to be queued for deletion to
- * finish what was happening when we crashed.
- */
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
-{
-	struct btrfs_root *dead_root;
-	struct btrfs_root_item *ri;
-	struct btrfs_key key;
-	struct btrfs_key found_key;
-	struct btrfs_path *path;
-	int ret;
-	u32 nritems;
-	struct extent_buffer *leaf;
-	int slot;
-
-	key.objectid = objectid;
-	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-	key.offset = 0;
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+	old_len = btrfs_item_size_nr(l, slot);
 
-again:
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret < 0)
-		goto err;
-	while (1) {
-		leaf = path->nodes[0];
-		nritems = btrfs_header_nritems(leaf);
-		slot = path->slots[0];
-		if (slot >= nritems) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret)
-				break;
-			leaf = path->nodes[0];
-			nritems = btrfs_header_nritems(leaf);
-			slot = path->slots[0];
+	/*
+	 * If this is the first time we update the root item which originated
+	 * from an older kernel, we need to enlarge the item size to make room
+	 * for the added fields.
+	 */
+	if (old_len < sizeof(*item)) {
+		btrfs_release_path(path);
+		ret = btrfs_search_slot(trans, root, key, path,
+				-1, 1);
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
 		}
-		btrfs_item_key_to_cpu(leaf, &key, slot);
-		if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
-			goto next;
 
-		if (key.objectid < objectid)
-			goto next;
-
-		if (key.objectid > objectid)
-			break;
-
-		ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item);
-		if (btrfs_disk_root_refs(leaf, ri) != 0)
-			goto next;
-
-		memcpy(&found_key, &key, sizeof(key));
-		key.offset++;
+		ret = btrfs_del_item(trans, root, path);
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
+		}
 		btrfs_release_path(path);
-		dead_root =
-			btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
-						    &found_key);
-		if (IS_ERR(dead_root)) {
-			ret = PTR_ERR(dead_root);
-			goto err;
+		ret = btrfs_insert_empty_item(trans, root, path,
+				key, sizeof(*item));
+		if (ret < 0) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto out;
 		}
-
-		ret = btrfs_add_dead_root(dead_root);
-		if (ret)
-			goto err;
-		goto again;
-next:
-		slot++;
-		path->slots[0]++;
+		l = path->nodes[0];
+		slot = path->slots[0];
+		ptr = btrfs_item_ptr_offset(l, slot);
 	}
-	ret = 0;
-err:
+
+	/*
+	 * Update generation_v2 so at the next mount we know the new root
+	 * fields are valid.
+	 */
+	btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
+
+	write_extent_buffer(l, item, ptr, sizeof(*item));
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+out:
 	btrfs_free_path(path);
 	return ret;
 }
 
+int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		      struct btrfs_key *key, struct btrfs_root_item *item)
+{
+	/*
+	 * Make sure generation v1 and v2 match. See update_root for details.
+	 */
+	btrfs_set_root_generation_v2(item, btrfs_root_generation(item));
+	return btrfs_insert_item(trans, root, key, item, sizeof(*item));
+}
+
 int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
 {
 	struct extent_buffer *leaf;
@@ -214,6 +227,10 @@
 	struct btrfs_root *root;
 	int err = 0;
 	int ret;
+	bool can_recover = true;
+
+	if (tree_root->fs_info->sb->s_flags & MS_RDONLY)
+		can_recover = false;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -253,22 +270,52 @@
 		root_key.objectid = key.offset;
 		key.offset++;
 
-		root = btrfs_read_fs_root_no_name(tree_root->fs_info,
-						  &root_key);
-		if (!IS_ERR(root))
+		root = btrfs_read_fs_root(tree_root, &root_key);
+		err = PTR_RET(root);
+		if (err && err != -ENOENT) {
+			break;
+		} else if (err == -ENOENT) {
+			struct btrfs_trans_handle *trans;
+
+			btrfs_release_path(path);
+
+			trans = btrfs_join_transaction(tree_root);
+			if (IS_ERR(trans)) {
+				err = PTR_ERR(trans);
+				btrfs_error(tree_root->fs_info, err,
+					    "Failed to start trans to delete "
+					    "orphan item");
+				break;
+			}
+			err = btrfs_del_orphan_item(trans, tree_root,
+						    root_key.objectid);
+			btrfs_end_transaction(trans, tree_root);
+			if (err) {
+				btrfs_error(tree_root->fs_info, err,
+					    "Failed to delete root orphan "
+					    "item");
+				break;
+			}
 			continue;
+		}
 
-		ret = PTR_ERR(root);
-		if (ret != -ENOENT) {
-			err = ret;
+		err = btrfs_init_fs_root(root);
+		if (err) {
+			btrfs_free_fs_root(root);
 			break;
 		}
 
-		ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
-		if (ret) {
-			err = ret;
+		root->orphan_item_inserted = 1;
+
+		err = btrfs_insert_fs_root(root->fs_info, root);
+		if (err) {
+			BUG_ON(err == -EEXIST);
+			btrfs_free_fs_root(root);
 			break;
 		}
+
+		if (btrfs_root_refs(&root->root_item) == 0)
+			btrfs_add_dead_root(root);
 	}
 
 	btrfs_free_path(path);
@@ -281,8 +328,6 @@
 {
 	struct btrfs_path *path;
 	int ret;
-	struct btrfs_root_item *ri;
-	struct extent_buffer *leaf;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -292,8 +337,6 @@
 		goto out;
 
 	BUG_ON(ret != 0);
-	leaf = path->nodes[0];
-	ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
 
 	ret = btrfs_del_item(trans, root, path);
 out:
@@ -384,6 +427,8 @@
  *
  * For a back ref the root_id is the id of the subvol or snapshot and
  * ref_id is the id of the tree referencing it.
+ *
+ * Will return 0, -ENOMEM, or anything from the CoW path
  */
 int btrfs_add_root_ref(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *tree_root,
@@ -407,7 +452,11 @@
 again:
 	ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
 				      sizeof(*ref) + name_len);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_abort_transaction(trans, tree_root, ret);
+		btrfs_free_path(path);
+		return ret;
+	}
 
 	leaf = path->nodes[0];
 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
@@ -438,12 +487,25 @@
  */
 void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
 {
-	u64 inode_flags = le64_to_cpu(root_item->inode.flags);
+	u64 inode_flags = btrfs_stack_inode_flags(&root_item->inode);
 
 	if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
 		inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
-		root_item->inode.flags = cpu_to_le64(inode_flags);
-		root_item->flags = 0;
-		root_item->byte_limit = 0;
+		btrfs_set_stack_inode_flags(&root_item->inode, inode_flags);
+		btrfs_set_root_flags(root_item, 0);
+		btrfs_set_root_limit(root_item, 0);
 	}
 }
+
+void btrfs_update_root_times(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root)
+{
+	struct btrfs_root_item *item = &root->root_item;
+	struct timespec ct = CURRENT_TIME;
+
+	spin_lock(&root->root_item_lock);
+	btrfs_set_root_ctransid(item, trans->transid);
+	btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec);
+	btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec);
+	spin_unlock(&root->root_item_lock);
+}
diff -ur a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
--- a/fs/btrfs/scrub.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/scrub.c	2014-02-17 11:56:58.000000000 +0100
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2011 STRATO.  All rights reserved.
+ * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
@@ -25,6 +25,10 @@
 #include "transaction.h"
 #include "backref.h"
 #include "extent_io.h"
+#include "dev-replace.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "raid56.h"
 
 /*
  * This is only the first step towards a full-features scrub. It reads all
@@ -35,66 +39,109 @@
  * Future enhancements:
  *  - In case an unrepairable extent is encountered, track which files are
  *    affected and report them
- *  - In case of a read error on files with nodatasum, map the file and read
- *    the extent to trigger a writeback of the good copy
  *  - track and record media errors, throw out bad devices
  *  - add a mode to also read unallocated space
  */
 
-struct scrub_bio;
-struct scrub_page;
-struct scrub_dev;
-static void scrub_bio_end_io(struct bio *bio, int err);
-static void scrub_checksum(struct btrfs_work *work);
-static int scrub_checksum_data(struct scrub_dev *sdev,
-			       struct scrub_page *spag, void *buffer);
-static int scrub_checksum_tree_block(struct scrub_dev *sdev,
-				     struct scrub_page *spag, u64 logical,
-				     void *buffer);
-static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
-static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
-static void scrub_fixup_end_io(struct bio *bio, int err);
-static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
-			  struct page *page);
-static void scrub_fixup(struct scrub_bio *sbio, int ix);
+struct scrub_block;
+struct scrub_ctx;
+
+/*
+ * the following three values only influence the performance.
+ * The last one configures the number of parallel and outstanding I/O
+ * operations. The first two values configure an upper limit for the number
+ * of (dynamically allocated) pages that are added to a bio.
+ */
+#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
+#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
+#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
 
-#define SCRUB_PAGES_PER_BIO	16	/* 64k per bio */
-#define SCRUB_BIOS_PER_DEV	16	/* 1 MB per device in flight */
+/*
+ * the following value times PAGE_SIZE needs to be large enough to match the
+ * largest node/leaf/sector size that shall be supported.
+ * Values larger than BTRFS_STRIPE_LEN are not supported.
+ */
+#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
 
 struct scrub_page {
+	struct scrub_block	*sblock;
+	struct page		*page;
+	struct btrfs_device	*dev;
 	u64			flags;  /* extent flags */
 	u64			generation;
-	int			mirror_num;
-	int			have_csum;
+	u64			logical;
+	u64			physical;
+	u64			physical_for_dev_replace;
+	atomic_t		ref_count;
+	struct {
+		unsigned int	mirror_num:8;
+		unsigned int	have_csum:1;
+		unsigned int	io_error:1;
+	};
 	u8			csum[BTRFS_CSUM_SIZE];
 };
 
 struct scrub_bio {
 	int			index;
-	struct scrub_dev	*sdev;
+	struct scrub_ctx	*sctx;
+	struct btrfs_device	*dev;
 	struct bio		*bio;
 	int			err;
 	u64			logical;
 	u64			physical;
-	struct scrub_page	spag[SCRUB_PAGES_PER_BIO];
-	u64			count;
+#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
+	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
+#else
+	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
+#endif
+	int			page_count;
 	int			next_free;
 	struct btrfs_work	work;
 };
 
-struct scrub_dev {
-	struct scrub_bio	*bios[SCRUB_BIOS_PER_DEV];
-	struct btrfs_device	*dev;
+struct scrub_block {
+	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
+	int			page_count;
+	atomic_t		outstanding_pages;
+	atomic_t		ref_count; /* free mem on transition to zero */
+	struct scrub_ctx	*sctx;
+	struct {
+		unsigned int	header_error:1;
+		unsigned int	checksum_error:1;
+		unsigned int	no_io_error_seen:1;
+		unsigned int	generation_error:1; /* also sets header_error */
+	};
+};
+
+struct scrub_wr_ctx {
+	struct scrub_bio *wr_curr_bio;
+	struct btrfs_device *tgtdev;
+	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
+	atomic_t flush_all_writes;
+	struct mutex wr_lock;
+};
+
+struct scrub_ctx {
+	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
+	struct btrfs_root	*dev_root;
 	int			first_free;
 	int			curr;
-	atomic_t		in_flight;
-	atomic_t		fixup_cnt;
+	atomic_t		bios_in_flight;
+	atomic_t		workers_pending;
 	spinlock_t		list_lock;
 	wait_queue_head_t	list_wait;
 	u16			csum_size;
 	struct list_head	csum_list;
 	atomic_t		cancel_req;
 	int			readonly;
+	int			pages_per_rd_bio;
+	u32			sectorsize;
+	u32			nodesize;
+	u32			leafsize;
+
+	int			is_dev_replace;
+	struct scrub_wr_ctx	wr_ctx;
+
 	/*
 	 * statistics
 	 */
@@ -103,13 +150,31 @@
 };
 
 struct scrub_fixup_nodatasum {
-	struct scrub_dev	*sdev;
+	struct scrub_ctx	*sctx;
+	struct btrfs_device	*dev;
 	u64			logical;
 	struct btrfs_root	*root;
 	struct btrfs_work	work;
 	int			mirror_num;
 };
 
+struct scrub_nocow_inode {
+	u64			inum;
+	u64			offset;
+	u64			root;
+	struct list_head	list;
+};
+
+struct scrub_copy_nocow_ctx {
+	struct scrub_ctx	*sctx;
+	u64			logical;
+	u64			len;
+	int			mirror_num;
+	u64			physical_for_dev_replace;
+	struct list_head	inodes;
+	struct btrfs_work	work;
+};
+
 struct scrub_warning {
 	struct btrfs_path	*path;
 	u64			extent_item_size;
@@ -123,103 +188,248 @@
 	int			scratch_bufsize;
 };
 
-static void scrub_free_csums(struct scrub_dev *sdev)
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+				     struct btrfs_fs_info *fs_info,
+				     struct scrub_block *original_sblock,
+				     u64 length, u64 logical,
+				     struct scrub_block *sblocks_for_recheck);
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+				struct scrub_block *sblock, int is_metadata,
+				int have_csum, u8 *csum, u64 generation,
+				u16 csum_size);
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+					 struct scrub_block *sblock,
+					 int is_metadata, int have_csum,
+					 const u8 *csum, u64 generation,
+					 u16 csum_size);
+static void scrub_complete_bio_end_io(struct bio *bio, int err);
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+					     struct scrub_block *sblock_good,
+					     int force_write);
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+					    struct scrub_block *sblock_good,
+					    int page_num, int force_write);
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+					   int page_num);
+static int scrub_checksum_data(struct scrub_block *sblock);
+static int scrub_checksum_tree_block(struct scrub_block *sblock);
+static int scrub_checksum_super(struct scrub_block *sblock);
+static void scrub_block_get(struct scrub_block *sblock);
+static void scrub_block_put(struct scrub_block *sblock);
+static void scrub_page_get(struct scrub_page *spage);
+static void scrub_page_put(struct scrub_page *spage);
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+		       u64 physical, struct btrfs_device *dev, u64 flags,
+		       u64 gen, int mirror_num, u8 *csum, int force,
+		       u64 physical_for_dev_replace);
+static void scrub_bio_end_io(struct bio *bio, int err);
+static void scrub_bio_end_io_worker(struct btrfs_work *work);
+static void scrub_block_complete(struct scrub_block *sblock);
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+			       u64 extent_logical, u64 extent_len,
+			       u64 *extent_physical,
+			       struct btrfs_device **extent_dev,
+			       int *extent_mirror_num);
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+			      struct scrub_wr_ctx *wr_ctx,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_device *dev,
+			      int is_dev_replace);
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage);
+static void scrub_wr_submit(struct scrub_ctx *sctx);
+static void scrub_wr_bio_end_io(struct bio *bio, int err);
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
+static int write_page_nocow(struct scrub_ctx *sctx,
+			    u64 physical_for_dev_replace, struct page *page);
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+				      struct scrub_copy_nocow_ctx *ctx);
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+			    int mirror_num, u64 physical_for_dev_replace);
+static void copy_nocow_pages_worker(struct btrfs_work *work);
+
+
+static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
+{
+	atomic_inc(&sctx->bios_in_flight);
+}
+
+static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
+{
+	atomic_dec(&sctx->bios_in_flight);
+	wake_up(&sctx->list_wait);
+}
+
+/*
+ * used for workers that require transaction commits (i.e., for the
+ * NOCOW case)
+ */
+static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 {
-	while (!list_empty(&sdev->csum_list)) {
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+	/*
+	 * increment scrubs_running to prevent cancel requests from
+	 * completing as long as a worker is running. we must also
+	 * increment scrubs_paused to prevent deadlocking on pause
+	 * requests used for transaction commits (as the worker uses a
+	 * transaction context). it is safe to regard the worker
+	 * as paused for all practical matters. effectively, we only
+	 * prevent cancellation requests from completing.
+	 */
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_inc(&fs_info->scrubs_running);
+	atomic_inc(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	atomic_inc(&sctx->workers_pending);
+}
+
+/* used for workers that require transaction commits */
+static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
+{
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+	/*
+	 * see scrub_pending_trans_workers_inc() why we're pretending
+	 * to be paused in the scrub counters
+	 */
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_dec(&fs_info->scrubs_running);
+	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	atomic_dec(&sctx->workers_pending);
+	wake_up(&fs_info->scrub_pause_wait);
+	wake_up(&sctx->list_wait);
+}
+
+static void scrub_free_csums(struct scrub_ctx *sctx)
+{
+	while (!list_empty(&sctx->csum_list)) {
 		struct btrfs_ordered_sum *sum;
-		sum = list_first_entry(&sdev->csum_list,
+		sum = list_first_entry(&sctx->csum_list,
 				       struct btrfs_ordered_sum, list);
 		list_del(&sum->list);
 		kfree(sum);
 	}
 }
 
-static void scrub_free_bio(struct bio *bio)
+static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 {
 	int i;
-	struct page *last_page = NULL;
 
-	if (!bio)
+	if (!sctx)
 		return;
 
-	for (i = 0; i < bio->bi_vcnt; ++i) {
-		if (bio->bi_io_vec[i].bv_page == last_page)
-			continue;
-		last_page = bio->bi_io_vec[i].bv_page;
-		__free_page(last_page);
-	}
-	bio_put(bio);
-}
+	scrub_free_wr_ctx(&sctx->wr_ctx);
 
-static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
-{
-	int i;
-
-	if (!sdev)
-		return;
+	/* this can happen when scrub is cancelled */
+	if (sctx->curr != -1) {
+		struct scrub_bio *sbio = sctx->bios[sctx->curr];
+
+		for (i = 0; i < sbio->page_count; i++) {
+			WARN_ON(!sbio->pagev[i]->page);
+			scrub_block_put(sbio->pagev[i]->sblock);
+		}
+		bio_put(sbio->bio);
+	}
 
-	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
-		struct scrub_bio *sbio = sdev->bios[i];
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
+		struct scrub_bio *sbio = sctx->bios[i];
 
 		if (!sbio)
 			break;
-
-		scrub_free_bio(sbio->bio);
 		kfree(sbio);
 	}
 
-	scrub_free_csums(sdev);
-	kfree(sdev);
+	scrub_free_csums(sctx);
+	kfree(sctx);
 }
 
 static noinline_for_stack
-struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
+struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 {
-	struct scrub_dev *sdev;
+	struct scrub_ctx *sctx;
 	int		i;
 	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+	int pages_per_rd_bio;
+	int ret;
 
-	sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
-	if (!sdev)
+	/*
+	 * the setting of pages_per_rd_bio is correct for scrub but might
+	 * be wrong for the dev_replace code where we might read from
+	 * different devices in the initial huge bios. However, that
+	 * code is able to correctly handle the case when adding a page
+	 * to a bio fails.
+	 */
+	if (dev->bdev)
+		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
+					 bio_get_nr_vecs(dev->bdev));
+	else
+		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
+	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
+	if (!sctx)
 		goto nomem;
-	sdev->dev = dev;
-	for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
+	sctx->is_dev_replace = is_dev_replace;
+	sctx->pages_per_rd_bio = pages_per_rd_bio;
+	sctx->curr = -1;
+	sctx->dev_root = dev->dev_root;
+	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 		struct scrub_bio *sbio;
 
 		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
 		if (!sbio)
 			goto nomem;
-		sdev->bios[i] = sbio;
+		sctx->bios[i] = sbio;
 
 		sbio->index = i;
-		sbio->sdev = sdev;
-		sbio->count = 0;
-		sbio->work.func = scrub_checksum;
+		sbio->sctx = sctx;
+		sbio->page_count = 0;
+		sbio->work.func = scrub_bio_end_io_worker;
 
-		if (i != SCRUB_BIOS_PER_DEV-1)
-			sdev->bios[i]->next_free = i + 1;
+		if (i != SCRUB_BIOS_PER_SCTX - 1)
+			sctx->bios[i]->next_free = i + 1;
 		else
-			sdev->bios[i]->next_free = -1;
+			sctx->bios[i]->next_free = -1;
 	}
-	sdev->first_free = 0;
-	sdev->curr = -1;
-	atomic_set(&sdev->in_flight, 0);
-	atomic_set(&sdev->fixup_cnt, 0);
-	atomic_set(&sdev->cancel_req, 0);
-	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
-	INIT_LIST_HEAD(&sdev->csum_list);
-
-	spin_lock_init(&sdev->list_lock);
-	spin_lock_init(&sdev->stat_lock);
-	init_waitqueue_head(&sdev->list_wait);
-	return sdev;
+	sctx->first_free = 0;
+	sctx->nodesize = dev->dev_root->nodesize;
+	sctx->leafsize = dev->dev_root->leafsize;
+	sctx->sectorsize = dev->dev_root->sectorsize;
+	atomic_set(&sctx->bios_in_flight, 0);
+	atomic_set(&sctx->workers_pending, 0);
+	atomic_set(&sctx->cancel_req, 0);
+	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
+	INIT_LIST_HEAD(&sctx->csum_list);
+
+	spin_lock_init(&sctx->list_lock);
+	spin_lock_init(&sctx->stat_lock);
+	init_waitqueue_head(&sctx->list_wait);
+
+	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
+				 fs_info->dev_replace.tgtdev, is_dev_replace);
+	if (ret) {
+		scrub_free_ctx(sctx);
+		return ERR_PTR(ret);
+	}
+	return sctx;
 
 nomem:
-	scrub_free_dev(sdev);
+	scrub_free_ctx(sctx);
 	return ERR_PTR(-ENOMEM);
 }
 
-static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
+				     void *warn_ctx)
 {
 	u64 isize;
 	u32 nlink;
@@ -227,7 +437,7 @@
 	int i;
 	struct extent_buffer *eb;
 	struct btrfs_inode_item *inode_item;
-	struct scrub_warning *swarn = ctx;
+	struct scrub_warning *swarn = warn_ctx;
 	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 	struct inode_fs_paths *ipath = NULL;
 	struct btrfs_root *local_root;
@@ -271,10 +481,10 @@
 	 * hold all of the paths here
 	 */
 	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
-		printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+		printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
 			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
 			"length %llu, links %u (path: %s)\n", swarn->errstr,
-			swarn->logical, swarn->dev->name,
+			swarn->logical, rcu_str_deref(swarn->dev->name),
 			(unsigned long long)swarn->sector, root, inum, offset,
 			min(isize - offset, (u64)PAGE_SIZE), nlink,
 			(char *)(unsigned long)ipath->fspath->val[i]);
@@ -283,75 +493,85 @@
 	return 0;
 
 err:
-	printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+	printk_in_rcu(KERN_WARNING "btrfs: %s at logical %llu on dev "
 		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
 		"resolving failed with ret=%d\n", swarn->errstr,
-		swarn->logical, swarn->dev->name,
+		swarn->logical, rcu_str_deref(swarn->dev->name),
 		(unsigned long long)swarn->sector, root, inum, offset, ret);
 
 	free_ipath(ipath);
 	return 0;
 }
 
-static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
-				int ix)
+static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 {
-	struct btrfs_device *dev = sbio->sdev->dev;
-	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+	struct btrfs_device *dev;
+	struct btrfs_fs_info *fs_info;
 	struct btrfs_path *path;
 	struct btrfs_key found_key;
 	struct extent_buffer *eb;
 	struct btrfs_extent_item *ei;
 	struct scrub_warning swarn;
-	u32 item_size;
-	int ret;
+	unsigned long ptr = 0;
+	u64 extent_item_pos;
+	u64 flags = 0;
 	u64 ref_root;
+	u32 item_size;
 	u8 ref_level;
-	unsigned long ptr = 0;
 	const int bufsize = 4096;
-	u64 extent_offset;
+	int ret;
+
+	WARN_ON(sblock->page_count < 1);
+	dev = sblock->pagev[0]->dev;
+	fs_info = sblock->sctx->dev_root->fs_info;
 
 	path = btrfs_alloc_path();
 
 	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
 	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
-	swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
-	swarn.logical = sbio->logical + ix * PAGE_SIZE;
+	swarn.sector = (sblock->pagev[0]->physical) >> 9;
+	swarn.logical = sblock->pagev[0]->logical;
 	swarn.errstr = errstr;
-	swarn.dev = dev;
+	swarn.dev = NULL;
 	swarn.msg_bufsize = bufsize;
 	swarn.scratch_bufsize = bufsize;
 
 	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
 		goto out;
 
-	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
+	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
+				  &flags);
 	if (ret < 0)
 		goto out;
 
-	extent_offset = swarn.logical - found_key.objectid;
+	extent_item_pos = swarn.logical - found_key.objectid;
 	swarn.extent_item_size = found_key.offset;
 
 	eb = path->nodes[0];
 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
 
-	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		do {
 			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
 							&ref_root, &ref_level);
-			printk(KERN_WARNING "%s at logical %llu on dev %s, "
+			printk_in_rcu(KERN_WARNING
+				"btrfs: %s at logical %llu on dev %s, "
 				"sector %llu: metadata %s (level %d) in tree "
-				"%llu\n", errstr, swarn.logical, dev->name,
+				"%llu\n", errstr, swarn.logical,
+				rcu_str_deref(dev->name),
 				(unsigned long long)swarn.sector,
 				ref_level ? "node" : "leaf",
 				ret < 0 ? -1 : ref_level,
 				ret < 0 ? -1 : ref_root);
 		} while (ret != 1);
+		btrfs_release_path(path);
 	} else {
+		btrfs_release_path(path);
 		swarn.path = path;
-		iterate_extent_inodes(fs_info, path, found_key.objectid,
-					extent_offset,
+		swarn.dev = dev;
+		iterate_extent_inodes(fs_info, found_key.objectid,
+					extent_item_pos, 1,
 					scrub_print_warning_inode, &swarn);
 	}
 
@@ -361,29 +581,38 @@
 	kfree(swarn.msg_buf);
 }
 
-static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 {
 	struct page *page = NULL;
 	unsigned long index;
-	struct scrub_fixup_nodatasum *fixup = ctx;
+	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 	int ret;
 	int corrected = 0;
 	struct btrfs_key key;
 	struct inode *inode = NULL;
+	struct btrfs_fs_info *fs_info;
 	u64 end = offset + PAGE_SIZE - 1;
 	struct btrfs_root *local_root;
+	int srcu_index;
 
 	key.objectid = root;
 	key.type = BTRFS_ROOT_ITEM_KEY;
 	key.offset = (u64)-1;
-	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
-	if (IS_ERR(local_root))
+
+	fs_info = fixup->root->fs_info;
+	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
+
+	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(local_root)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 		return PTR_ERR(local_root);
+	}
 
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.objectid = inum;
 	key.offset = 0;
-	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
+	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
@@ -396,7 +625,6 @@
 	}
 
 	if (PageUptodate(page)) {
-		struct btrfs_mapping_tree *map_tree;
 		if (PageDirty(page)) {
 			/*
 			 * we need to write the data to the defect sector. the
@@ -417,8 +645,8 @@
 			ret = -EIO;
 			goto out;
 		}
-		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
-		ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
+		fs_info = BTRFS_I(inode)->root->fs_info;
+		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
 					fixup->logical, page,
 					fixup->mirror_num);
 		unlock_page(page);
@@ -475,21 +703,21 @@
 {
 	int ret;
 	struct scrub_fixup_nodatasum *fixup;
-	struct scrub_dev *sdev;
+	struct scrub_ctx *sctx;
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_path *path;
 	int uncorrectable = 0;
 
 	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
-	sdev = fixup->sdev;
+	sctx = fixup->sctx;
 	fs_info = fixup->root->fs_info;
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.malloc_errors;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.malloc_errors;
+		spin_unlock(&sctx->stat_lock);
 		uncorrectable = 1;
 		goto out;
 	}
@@ -518,364 +746,989 @@
 	}
 	WARN_ON(ret != 1);
 
-	spin_lock(&sdev->stat_lock);
-	++sdev->stat.corrected_errors;
-	spin_unlock(&sdev->stat_lock);
+	spin_lock(&sctx->stat_lock);
+	++sctx->stat.corrected_errors;
+	spin_unlock(&sctx->stat_lock);
 
 out:
 	if (trans && !IS_ERR(trans))
 		btrfs_end_transaction(trans, fixup->root);
 	if (uncorrectable) {
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.uncorrectable_errors;
-		spin_unlock(&sdev->stat_lock);
-		printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
-					"(nodatasum) error at logical %llu\n",
-					fixup->logical);
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.uncorrectable_errors;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_replace_stats_inc(
+			&sctx->dev_root->fs_info->dev_replace.
+			num_uncorrectable_read_errors);
+		printk_ratelimited_in_rcu(KERN_ERR
+			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
+			fixup->logical, rcu_str_deref(fixup->dev->name));
 	}
 
 	btrfs_free_path(path);
 	kfree(fixup);
 
-	/* see caller why we're pretending to be paused in the scrub counters */
-	mutex_lock(&fs_info->scrub_lock);
-	atomic_dec(&fs_info->scrubs_running);
-	atomic_dec(&fs_info->scrubs_paused);
-	mutex_unlock(&fs_info->scrub_lock);
-	atomic_dec(&sdev->fixup_cnt);
-	wake_up(&fs_info->scrub_pause_wait);
-	wake_up(&sdev->list_wait);
+	scrub_pending_trans_workers_dec(sctx);
 }
 
 /*
- * scrub_recheck_error gets called when either verification of the page
- * failed or the bio failed to read, e.g. with EIO. In the latter case,
- * recheck_error gets called for every page in the bio, even though only
- * one may be bad
+ * scrub_handle_errored_block gets called when either verification of the
+ * pages failed or the bio failed to read, e.g. with EIO. In the latter
+ * case, this function handles all pages in the bio, even though only one
+ * may be bad.
+ * The goal of this function is to repair the errored block by using the
+ * contents of one of the mirrors.
  */
-static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
+static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 {
-	struct scrub_dev *sdev = sbio->sdev;
-	u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+	struct scrub_ctx *sctx = sblock_to_check->sctx;
+	struct btrfs_device *dev;
+	struct btrfs_fs_info *fs_info;
+	u64 length;
+	u64 logical;
+	u64 generation;
+	unsigned int failed_mirror_index;
+	unsigned int is_metadata;
+	unsigned int have_csum;
+	u8 *csum;
+	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
+	struct scrub_block *sblock_bad;
+	int ret;
+	int mirror_index;
+	int page_num;
+	int success;
 	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
-					DEFAULT_RATELIMIT_BURST);
+				      DEFAULT_RATELIMIT_BURST);
 
-	if (sbio->err) {
-		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
-				   sbio->bio->bi_io_vec[ix].bv_page) == 0) {
-			if (scrub_fixup_check(sbio, ix) == 0)
-				return 0;
-		}
+	BUG_ON(sblock_to_check->page_count < 1);
+	fs_info = sctx->dev_root->fs_info;
+	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
+		/*
+		 * if we find an error in a super block, we just report it.
+		 * It will get written with the next transaction commit
+		 * anyway
+		 */
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.super_errors;
+		spin_unlock(&sctx->stat_lock);
+		return 0;
+	}
+	length = sblock_to_check->page_count * PAGE_SIZE;
+	logical = sblock_to_check->pagev[0]->logical;
+	generation = sblock_to_check->pagev[0]->generation;
+	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
+	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
+	is_metadata = !(sblock_to_check->pagev[0]->flags &
+			BTRFS_EXTENT_FLAG_DATA);
+	have_csum = sblock_to_check->pagev[0]->have_csum;
+	csum = sblock_to_check->pagev[0]->csum;
+	dev = sblock_to_check->pagev[0]->dev;
+
+	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
+		sblocks_for_recheck = NULL;
+		goto nodatasum_case;
+	}
+
+	/*
+	 * read all mirrors one after the other. This includes re-reading
+	 * the extent or metadata block that failed (the one that caused
+	 * this fixup code to be called) another time,
+	 * page by page this time in order to know which pages
+	 * caused I/O errors and which ones are good (for all mirrors).
+	 * It is the goal to handle the situation when more than one
+	 * mirror contains I/O errors, but the errors do not
+	 * overlap, i.e. the data can be repaired by selecting the
+	 * pages from those mirrors without I/O error on the
+	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
+	 * would be that mirror #1 has an I/O error on the first page,
+	 * the second page is good, and mirror #2 has an I/O error on
+	 * the second page, but the first page is good.
+	 * Then the first page of the first mirror can be repaired by
+	 * taking the first page of the second mirror, and the
+	 * second page of the second mirror can be repaired by
+	 * copying the contents of the 2nd page of the 1st mirror.
+	 * One more note: if the pages of one mirror contain I/O
+	 * errors, the checksum cannot be verified. In order to get
+	 * the best data for repairing, the first attempt is to find
+	 * a mirror without I/O errors and with a validated checksum.
+	 * Only if this is not possible, the pages are picked from
+	 * mirrors with I/O errors without considering the checksum.
+	 * If the latter is the case, at the end, the checksum of the
+	 * repaired area is verified in order to correctly maintain
+	 * the statistics.
+	 */
+
+	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
+				     sizeof(*sblocks_for_recheck),
+				     GFP_NOFS);
+	if (!sblocks_for_recheck) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		sctx->stat.read_errors++;
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+		goto out;
+	}
+
+	/* setup the context, map the logical blocks and alloc the pages */
+	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
+					logical, sblocks_for_recheck);
+	if (ret) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.read_errors++;
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+		goto out;
+	}
+	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
+	sblock_bad = sblocks_for_recheck + failed_mirror_index;
+
+	/* build and submit the bios for the failed mirror, check checksums */
+	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
+			    csum, generation, sctx->csum_size);
+
+	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
+	    sblock_bad->no_io_error_seen) {
+		/*
+		 * the error disappeared after reading page by page, or
+		 * the area was part of a huge bio and other parts of the
+		 * bio caused I/O errors, or the block layer merged several
+		 * read requests into one and the error is caused by a
+		 * different bio (usually one of the two latter cases is
+		 * the cause)
+		 */
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.unverified_errors++;
+		spin_unlock(&sctx->stat_lock);
+
+		if (sctx->is_dev_replace)
+			scrub_write_block_to_dev_replace(sblock_bad);
+		goto out;
+	}
+
+	if (!sblock_bad->no_io_error_seen) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.read_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
-			scrub_print_warning("i/o error", sbio, ix);
-	} else {
+			scrub_print_warning("i/o error", sblock_to_check);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
+	} else if (sblock_bad->checksum_error) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.csum_errors++;
+		spin_unlock(&sctx->stat_lock);
+		if (__ratelimit(&_rs))
+			scrub_print_warning("checksum error", sblock_to_check);
+		btrfs_dev_stat_inc_and_print(dev,
+					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
+	} else if (sblock_bad->header_error) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.verify_errors++;
+		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
-			scrub_print_warning("checksum error", sbio, ix);
+			scrub_print_warning("checksum/header error",
+					    sblock_to_check);
+		if (sblock_bad->generation_error)
+			btrfs_dev_stat_inc_and_print(dev,
+				BTRFS_DEV_STAT_GENERATION_ERRS);
+		else
+			btrfs_dev_stat_inc_and_print(dev,
+				BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	}
 
-	spin_lock(&sdev->stat_lock);
-	++sdev->stat.read_errors;
-	spin_unlock(&sdev->stat_lock);
+	if (sctx->readonly && !sctx->is_dev_replace)
+		goto did_not_correct_error;
 
-	scrub_fixup(sbio, ix);
-	return 1;
-}
+	if (!is_metadata && !have_csum) {
+		struct scrub_fixup_nodatasum *fixup_nodatasum;
 
-static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
-{
-	int ret = 1;
-	struct page *page;
-	void *buffer;
-	u64 flags = sbio->spag[ix].flags;
+nodatasum_case:
+		WARN_ON(sctx->is_dev_replace);
 
-	page = sbio->bio->bi_io_vec[ix].bv_page;
-	buffer = kmap_atomic(page, KM_USER0);
-	if (flags & BTRFS_EXTENT_FLAG_DATA) {
-		ret = scrub_checksum_data(sbio->sdev,
-					  sbio->spag + ix, buffer);
-	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-		ret = scrub_checksum_tree_block(sbio->sdev,
-						sbio->spag + ix,
-						sbio->logical + ix * PAGE_SIZE,
-						buffer);
+		/*
+		 * !is_metadata and !have_csum: this means that the data
+		 * might not be COW'ed and might be modified
+		 * concurrently. The general strategy of working on the
+		 * commit root does not help in the case when COW is not
+		 * used.
+		 */
+		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
+		if (!fixup_nodatasum)
+			goto did_not_correct_error;
+		fixup_nodatasum->sctx = sctx;
+		fixup_nodatasum->dev = dev;
+		fixup_nodatasum->logical = logical;
+		fixup_nodatasum->root = fs_info->extent_root;
+		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
+		scrub_pending_trans_workers_inc(sctx);
+		fixup_nodatasum->work.func = scrub_fixup_nodatasum;
+		btrfs_queue_worker(&fs_info->scrub_workers,
+				   &fixup_nodatasum->work);
+		goto out;
+	}
+
+	/*
+	 * now build and submit the bios for the other mirrors, check
+	 * checksums.
+	 * First try to pick the mirror which is completely without I/O
+	 * errors and also does not have a checksum error.
+	 * If one is found, and if a checksum is present, the full block
+	 * that is known to contain an error is rewritten. Afterwards
+	 * the block is known to be corrected.
+	 * If a mirror is found which is completely correct, and no
+	 * checksum is present, only those pages are rewritten that had
+	 * an I/O error in the block to be repaired, since it cannot be
+	 * determined which copy of the other pages is better (and it
+	 * could happen otherwise that a correct page would be
+	 * overwritten by a bad one).
+	 */
+	for (mirror_index = 0;
+	     mirror_index < BTRFS_MAX_MIRRORS &&
+	     sblocks_for_recheck[mirror_index].page_count > 0;
+	     mirror_index++) {
+		struct scrub_block *sblock_other;
+
+		if (mirror_index == failed_mirror_index)
+			continue;
+		sblock_other = sblocks_for_recheck + mirror_index;
+
+		/* build and submit the bios, check checksums */
+		scrub_recheck_block(fs_info, sblock_other, is_metadata,
+				    have_csum, csum, generation,
+				    sctx->csum_size);
+
+		if (!sblock_other->header_error &&
+		    !sblock_other->checksum_error &&
+		    sblock_other->no_io_error_seen) {
+			if (sctx->is_dev_replace) {
+				scrub_write_block_to_dev_replace(sblock_other);
+			} else {
+				int force_write = is_metadata || have_csum;
+
+				ret = scrub_repair_block_from_good_copy(
+						sblock_bad, sblock_other,
+						force_write);
+			}
+			if (0 == ret)
+				goto corrected_error;
+		}
+	}
+
+	/*
+	 * for dev_replace, pick good pages and write to the target device.
+	 */
+	if (sctx->is_dev_replace) {
+		success = 1;
+		for (page_num = 0; page_num < sblock_bad->page_count;
+		     page_num++) {
+			int sub_success;
+
+			sub_success = 0;
+			for (mirror_index = 0;
+			     mirror_index < BTRFS_MAX_MIRRORS &&
+			     sblocks_for_recheck[mirror_index].page_count > 0;
+			     mirror_index++) {
+				struct scrub_block *sblock_other =
+					sblocks_for_recheck + mirror_index;
+				struct scrub_page *page_other =
+					sblock_other->pagev[page_num];
+
+				if (!page_other->io_error) {
+					ret = scrub_write_page_to_dev_replace(
+							sblock_other, page_num);
+					if (ret == 0) {
+						/* succeeded for this page */
+						sub_success = 1;
+						break;
+					} else {
+						btrfs_dev_replace_stats_inc(
+							&sctx->dev_root->
+							fs_info->dev_replace.
+							num_write_errors);
+					}
+				}
+			}
+
+			if (!sub_success) {
+				/*
+				 * did not find a mirror to fetch the page
+				 * from. scrub_write_page_to_dev_replace()
+				 * handles this case (page->io_error), by
+				 * filling the block with zeros before
+				 * submitting the write request
+				 */
+				success = 0;
+				ret = scrub_write_page_to_dev_replace(
+						sblock_bad, page_num);
+				if (ret)
+					btrfs_dev_replace_stats_inc(
+						&sctx->dev_root->fs_info->
+						dev_replace.num_write_errors);
+			}
+		}
+
+		goto out;
+	}
+
+	/*
+	 * for regular scrub, repair those pages that are errored.
+	 * In case of I/O errors in the area that is supposed to be
+	 * repaired, continue by picking good copies of those pages.
+	 * Select the good pages from mirrors to rewrite bad pages from
+	 * the area to fix. Afterwards verify the checksum of the block
+	 * that is supposed to be repaired. This verification step is
+	 * only done for the purpose of statistics counting and for the
+	 * final scrub report on whether errors remain.
+	 * A perfect algorithm could make use of the checksum and try
+	 * all possible combinations of pages from the different mirrors
+	 * until the checksum verification succeeds. For example, when
+	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
+	 * of mirror #2 is readable but the final checksum test fails,
+	 * then the 2nd page of mirror #3 could be tried to see whether
+	 * the final checksum then succeeds. But this would be a rare
+	 * exception and is therefore not implemented. At least
+	 * overwriting the good copy is avoided.
+	 * A more useful improvement would be to pick the sectors
+	 * without I/O error based on sector sizes (512 bytes on legacy
+	 * disks) instead of on PAGE_SIZE. Then maybe 512 bytes of one
+	 * mirror could be repaired by taking 512 bytes of a different
+	 * mirror, even if other 512-byte sectors in the same PAGE_SIZE
+	 * area are unreadable.
+	 */
+
+	/* can only fix I/O errors from here on */
+	if (sblock_bad->no_io_error_seen)
+		goto did_not_correct_error;
+
+	success = 1;
+	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+
+		if (!page_bad->io_error)
+			continue;
+
+		for (mirror_index = 0;
+		     mirror_index < BTRFS_MAX_MIRRORS &&
+		     sblocks_for_recheck[mirror_index].page_count > 0;
+		     mirror_index++) {
+			struct scrub_block *sblock_other = sblocks_for_recheck +
+							   mirror_index;
+			struct scrub_page *page_other = sblock_other->pagev[
+							page_num];
+
+			if (!page_other->io_error) {
+				ret = scrub_repair_page_from_good_copy(
+					sblock_bad, sblock_other, page_num, 0);
+				if (0 == ret) {
+					page_bad->io_error = 0;
+					break; /* succeeded for this page */
+				}
+			}
+		}
+
+		if (page_bad->io_error) {
+			/* did not find a mirror to copy the page from */
+			success = 0;
+		}
+	}
+
+	if (success) {
+		if (is_metadata || have_csum) {
+			/*
+			 * need to verify the checksum now that all
+			 * sectors on disk are repaired (the write
+			 * request for data to be repaired is on its way).
+			 * Just be lazy and use scrub_recheck_block()
+			 * which re-reads the data before the checksum
+			 * is verified, but most likely the data comes out
+			 * of the page cache.
+			 */
+			scrub_recheck_block(fs_info, sblock_bad,
+					    is_metadata, have_csum, csum,
+					    generation, sctx->csum_size);
+			if (!sblock_bad->header_error &&
+			    !sblock_bad->checksum_error &&
+			    sblock_bad->no_io_error_seen)
+				goto corrected_error;
+			else
+				goto did_not_correct_error;
+		} else {
+corrected_error:
+			spin_lock(&sctx->stat_lock);
+			sctx->stat.corrected_errors++;
+			spin_unlock(&sctx->stat_lock);
+			printk_ratelimited_in_rcu(KERN_ERR
+				"btrfs: fixed up error at logical %llu on dev %s\n",
+				logical, rcu_str_deref(dev->name));
+		}
 	} else {
-		WARN_ON(1);
+did_not_correct_error:
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.uncorrectable_errors++;
+		spin_unlock(&sctx->stat_lock);
+		printk_ratelimited_in_rcu(KERN_ERR
+			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
+			logical, rcu_str_deref(dev->name));
 	}
-	kunmap_atomic(buffer, KM_USER0);
 
-	return ret;
-}
+out:
+	if (sblocks_for_recheck) {
+		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
+		     mirror_index++) {
+			struct scrub_block *sblock = sblocks_for_recheck +
+						     mirror_index;
+			int page_index;
+
+			for (page_index = 0; page_index < sblock->page_count;
+			     page_index++) {
+				sblock->pagev[page_index]->sblock = NULL;
+				scrub_page_put(sblock->pagev[page_index]);
+			}
+		}
+		kfree(sblocks_for_recheck);
+	}
 
-static void scrub_fixup_end_io(struct bio *bio, int err)
-{
-	complete((struct completion *)bio->bi_private);
+	return 0;
 }
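
The repair strategy spelled out in the comments of scrub_handle_errored_block() above boils down to: for every page of the failed mirror that hit an I/O error, copy the same page from any mirror that read it cleanly. A rough standalone C sketch of that per-page selection (demo_page/demo_block and the sample data are made up for illustration, not the patch's own types) is:

#include <stdio.h>
#include <string.h>

#define NUM_MIRRORS	2
#define PAGES_PER_BLOCK	2
#define PAGE_LEN	8

/* hypothetical stand-ins for scrub_page / scrub_block */
struct demo_page { int io_error; char data[PAGE_LEN]; };
struct demo_block { struct demo_page pagev[PAGES_PER_BLOCK]; };

int main(void)
{
	/* mirror #1: page 0 unreadable, page 1 good
	 * mirror #2: page 0 good,       page 1 unreadable */
	struct demo_block mirror[NUM_MIRRORS] = {
		{ { { 1, "" },        { 0, "GOOD-1b" } } },
		{ { { 0, "GOOD-2a" }, { 1, "" } } },
	};
	int bad = 0;	/* index of the mirror that originally failed */
	int page, m;

	for (page = 0; page < PAGES_PER_BLOCK; page++) {
		if (!mirror[bad].pagev[page].io_error)
			continue;
		/* pick the same page from any mirror that read it cleanly */
		for (m = 0; m < NUM_MIRRORS; m++) {
			if (m == bad || mirror[m].pagev[page].io_error)
				continue;
			memcpy(mirror[bad].pagev[page].data,
			       mirror[m].pagev[page].data, PAGE_LEN);
			mirror[bad].pagev[page].io_error = 0;
			break;
		}
	}

	for (page = 0; page < PAGES_PER_BLOCK; page++)
		printf("page %d: %s\n", page,
		       mirror[bad].pagev[page].io_error ?
		       "still unreadable" : mirror[bad].pagev[page].data);
	return 0;
}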
 
-static void scrub_fixup(struct scrub_bio *sbio, int ix)
+static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
+				     struct btrfs_fs_info *fs_info,
+				     struct scrub_block *original_sblock,
+				     u64 length, u64 logical,
+				     struct scrub_block *sblocks_for_recheck)
 {
-	struct scrub_dev *sdev = sbio->sdev;
-	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
-	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
-	struct btrfs_bio *bbio = NULL;
-	struct scrub_fixup_nodatasum *fixup;
-	u64 logical = sbio->logical + ix * PAGE_SIZE;
-	u64 length;
-	int i;
+	int page_index;
+	int mirror_index;
 	int ret;
-	DECLARE_COMPLETION_ONSTACK(complete);
 
-	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
-	    (sbio->spag[ix].have_csum == 0)) {
-		fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
-		if (!fixup)
-			goto uncorrectable;
-		fixup->sdev = sdev;
-		fixup->logical = logical;
-		fixup->root = fs_info->extent_root;
-		fixup->mirror_num = sbio->spag[ix].mirror_num;
+	/*
+	 * note: the two members ref_count and outstanding_pages
+	 * are not used (and not set) in the blocks that are used for
+	 * the recheck procedure
+	 */
+
+	page_index = 0;
+	while (length > 0) {
+		u64 sublen = min_t(u64, length, PAGE_SIZE);
+		u64 mapped_length = sublen;
+		struct btrfs_bio *bbio = NULL;
+
 		/*
-		 * increment scrubs_running to prevent cancel requests from
-		 * completing as long as a fixup worker is running. we must also
-		 * increment scrubs_paused to prevent deadlocking on pause
-		 * requests used for transactions commits (as the worker uses a
-		 * transaction context). it is safe to regard the fixup worker
-		 * as paused for all matters practical. effectively, we only
-		 * avoid cancellation requests from completing.
+		 * with a length of PAGE_SIZE, each returned stripe
+		 * represents one mirror
 		 */
-		mutex_lock(&fs_info->scrub_lock);
-		atomic_inc(&fs_info->scrubs_running);
-		atomic_inc(&fs_info->scrubs_paused);
-		mutex_unlock(&fs_info->scrub_lock);
-		atomic_inc(&sdev->fixup_cnt);
-		fixup->work.func = scrub_fixup_nodatasum;
-		btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
-		return;
-	}
+		ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
+				      &mapped_length, &bbio, 0);
+		if (ret || !bbio || mapped_length < sublen) {
+			kfree(bbio);
+			return -EIO;
+		}
 
-	length = PAGE_SIZE;
-	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
-			      &bbio, 0);
-	if (ret || !bbio || length < PAGE_SIZE) {
-		printk(KERN_ERR
-		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
-		       (unsigned long long)logical);
-		WARN_ON(1);
+		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
+		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
+		     mirror_index++) {
+			struct scrub_block *sblock;
+			struct scrub_page *page;
+
+			if (mirror_index >= BTRFS_MAX_MIRRORS)
+				continue;
+
+			sblock = sblocks_for_recheck + mirror_index;
+			sblock->sctx = sctx;
+			page = kzalloc(sizeof(*page), GFP_NOFS);
+			if (!page) {
+leave_nomem:
+				spin_lock(&sctx->stat_lock);
+				sctx->stat.malloc_errors++;
+				spin_unlock(&sctx->stat_lock);
+				kfree(bbio);
+				return -ENOMEM;
+			}
+			scrub_page_get(page);
+			sblock->pagev[page_index] = page;
+			page->logical = logical;
+			page->physical = bbio->stripes[mirror_index].physical;
+			BUG_ON(page_index >= original_sblock->page_count);
+			page->physical_for_dev_replace =
+				original_sblock->pagev[page_index]->
+				physical_for_dev_replace;
+			/* for missing devices, dev->bdev is NULL */
+			page->dev = bbio->stripes[mirror_index].dev;
+			page->mirror_num = mirror_index + 1;
+			sblock->page_count++;
+			page->page = alloc_page(GFP_NOFS);
+			if (!page->page)
+				goto leave_nomem;
+		}
 		kfree(bbio);
-		return;
+		length -= sublen;
+		logical += sublen;
+		page_index++;
 	}
 
-	if (bbio->num_stripes == 1)
-		/* there aren't any replicas */
-		goto uncorrectable;
+	return 0;
+}
 
-	/*
-	 * first find a good copy
-	 */
-	for (i = 0; i < bbio->num_stripes; ++i) {
-		if (i + 1 == sbio->spag[ix].mirror_num)
+/*
+ * this function will check the on-disk data for checksum errors, header
+ * errors and read I/O errors. If any I/O errors happen, the exact pages
+ * which are errored are marked as being bad. The goal is to enable scrub
+ * to take those pages that are not errored from all the mirrors so that
+ * the pages errored in the mirror just handled can be repaired.
+ */
+static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
+				struct scrub_block *sblock, int is_metadata,
+				int have_csum, u8 *csum, u64 generation,
+				u16 csum_size)
+{
+	int page_num;
+
+	sblock->no_io_error_seen = 1;
+	sblock->header_error = 0;
+	sblock->checksum_error = 0;
+
+	for (page_num = 0; page_num < sblock->page_count; page_num++) {
+		struct bio *bio;
+		struct scrub_page *page = sblock->pagev[page_num];
+		DECLARE_COMPLETION_ONSTACK(complete);
+
+		if (page->dev->bdev == NULL) {
+			page->io_error = 1;
+			sblock->no_io_error_seen = 0;
 			continue;
+		}
 
-		if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
-				   bbio->stripes[i].physical >> 9,
-				   sbio->bio->bi_io_vec[ix].bv_page)) {
-			/* I/O-error, this is not a good copy */
+		WARN_ON(!page->page);
+		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+		if (!bio) {
+			page->io_error = 1;
+			sblock->no_io_error_seen = 0;
 			continue;
 		}
+		bio->bi_bdev = page->dev->bdev;
+		bio->bi_sector = page->physical >> 9;
+		bio->bi_end_io = scrub_complete_bio_end_io;
+		bio->bi_private = &complete;
+
+		bio_add_page(bio, page->page, PAGE_SIZE, 0);
+		btrfsic_submit_bio(READ, bio);
+
+		/* this will also unplug the queue */
+		wait_for_completion(&complete);
+
+		page->io_error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
+		if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+			sblock->no_io_error_seen = 0;
+		bio_put(bio);
+	}
+
+	if (sblock->no_io_error_seen)
+		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
+					     have_csum, csum, generation,
+					     csum_size);
 
-		if (scrub_fixup_check(sbio, ix) == 0)
+	return;
+}
+
+static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
+					 struct scrub_block *sblock,
+					 int is_metadata, int have_csum,
+					 const u8 *csum, u64 generation,
+					 u16 csum_size)
+{
+	int page_num;
+	u8 calculated_csum[BTRFS_CSUM_SIZE];
+	u32 crc = ~(u32)0;
+	void *mapped_buffer;
+
+	WARN_ON(!sblock->pagev[0]->page);
+	if (is_metadata) {
+		struct btrfs_header *h;
+
+		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
+		h = (struct btrfs_header *)mapped_buffer;
+
+		if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
+		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
+		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
+			   BTRFS_UUID_SIZE)) {
+			sblock->header_error = 1;
+		} else if (generation != btrfs_stack_header_generation(h)) {
+			sblock->header_error = 1;
+			sblock->generation_error = 1;
+		}
+		csum = h->csum;
+	} else {
+		if (!have_csum)
+			return;
+
+		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
+	}
+
+	for (page_num = 0;;) {
+		if (page_num == 0 && is_metadata)
+			crc = btrfs_csum_data(
+				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
+				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
+		else
+			crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);
+
+		kunmap_atomic(mapped_buffer);
+		page_num++;
+		if (page_num >= sblock->page_count)
 			break;
+		WARN_ON(!sblock->pagev[page_num]->page);
+
+		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
 	}
-	if (i == bbio->num_stripes)
-		goto uncorrectable;
 
-	if (!sdev->readonly) {
-		/*
-		 * bi_io_vec[ix].bv_page now contains good data, write it back
-		 */
-		if (scrub_fixup_io(WRITE, sdev->dev->bdev,
-				   (sbio->physical + ix * PAGE_SIZE) >> 9,
-				   sbio->bio->bi_io_vec[ix].bv_page)) {
-			/* I/O-error, writeback failed, give up */
-			goto uncorrectable;
+	btrfs_csum_final(crc, calculated_csum);
+	if (memcmp(calculated_csum, csum, csum_size))
+		sblock->checksum_error = 1;
+}
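
scrub_recheck_block_checksum() above (like scrub_checksum_data() and scrub_checksum_tree_block() further down) feeds the checksum one kmapped page at a time instead of checksumming one flat buffer. That only works because the CRC can be accumulated incrementally; a small user-space sketch of this property, using zlib's crc32 as a stand-in for the kernel's btrfs_csum_data() (crc32c), might look like:

/* build with: cc demo_crc.c -lz */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

#define DEMO_PAGE_SIZE	16
#define DEMO_PAGES	3

int main(void)
{
	unsigned char pages[DEMO_PAGES][DEMO_PAGE_SIZE];
	unsigned char flat[DEMO_PAGES * DEMO_PAGE_SIZE];
	unsigned long staged, oneshot;
	int i;

	for (i = 0; i < (int)sizeof(flat); i++)
		flat[i] = (unsigned char)i;
	for (i = 0; i < DEMO_PAGES; i++)
		memcpy(pages[i], flat + i * DEMO_PAGE_SIZE, DEMO_PAGE_SIZE);

	/* feed the CRC one "page" at a time, as the kmap loop does */
	staged = crc32(0L, Z_NULL, 0);
	for (i = 0; i < DEMO_PAGES; i++)
		staged = crc32(staged, pages[i], DEMO_PAGE_SIZE);

	/* same data in one contiguous buffer */
	oneshot = crc32(crc32(0L, Z_NULL, 0), flat, sizeof(flat));

	printf("staged  : %08lx\noneshot : %08lx\n", staged, oneshot);
	return 0;
}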
+
+static void scrub_complete_bio_end_io(struct bio *bio, int err)
+{
+	complete((struct completion *)bio->bi_private);
+}
+
+static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
+					     struct scrub_block *sblock_good,
+					     int force_write)
+{
+	int page_num;
+	int ret = 0;
+
+	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
+		int ret_sub;
+
+		ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
+							   sblock_good,
+							   page_num,
+							   force_write);
+		if (ret_sub)
+			ret = ret_sub;
+	}
+
+	return ret;
+}
+
+static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
+					    struct scrub_block *sblock_good,
+					    int page_num, int force_write)
+{
+	struct scrub_page *page_bad = sblock_bad->pagev[page_num];
+	struct scrub_page *page_good = sblock_good->pagev[page_num];
+
+	BUG_ON(page_bad->page == NULL);
+	BUG_ON(page_good->page == NULL);
+	if (force_write || sblock_bad->header_error ||
+	    sblock_bad->checksum_error || page_bad->io_error) {
+		struct bio *bio;
+		int ret;
+		DECLARE_COMPLETION_ONSTACK(complete);
+
+		if (!page_bad->dev->bdev) {
+			printk_ratelimited(KERN_WARNING
+				"btrfs: scrub_repair_page_from_good_copy(bdev == NULL) is unexpected!\n");
+			return -EIO;
 		}
+
+		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+		if (!bio)
+			return -EIO;
+		bio->bi_bdev = page_bad->dev->bdev;
+		bio->bi_sector = page_bad->physical >> 9;
+		bio->bi_end_io = scrub_complete_bio_end_io;
+		bio->bi_private = &complete;
+
+		ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);
+		if (PAGE_SIZE != ret) {
+			bio_put(bio);
+			return -EIO;
+		}
+		btrfsic_submit_bio(WRITE, bio);
+
+		/* this will also unplug the queue */
+		wait_for_completion(&complete);
+		if (!bio_flagged(bio, BIO_UPTODATE)) {
+			btrfs_dev_stat_inc_and_print(page_bad->dev,
+				BTRFS_DEV_STAT_WRITE_ERRS);
+			btrfs_dev_replace_stats_inc(
+				&sblock_bad->sctx->dev_root->fs_info->
+				dev_replace.num_write_errors);
+			bio_put(bio);
+			return -EIO;
+		}
+		bio_put(bio);
 	}
 
-	kfree(bbio);
-	spin_lock(&sdev->stat_lock);
-	++sdev->stat.corrected_errors;
-	spin_unlock(&sdev->stat_lock);
+	return 0;
+}
 
-	printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
-			       (unsigned long long)logical);
-	return;
+static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
+{
+	int page_num;
 
-uncorrectable:
-	kfree(bbio);
-	spin_lock(&sdev->stat_lock);
-	++sdev->stat.uncorrectable_errors;
-	spin_unlock(&sdev->stat_lock);
+	for (page_num = 0; page_num < sblock->page_count; page_num++) {
+		int ret;
 
-	printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
-				"logical %llu\n", (unsigned long long)logical);
+		ret = scrub_write_page_to_dev_replace(sblock, page_num);
+		if (ret)
+			btrfs_dev_replace_stats_inc(
+				&sblock->sctx->dev_root->fs_info->dev_replace.
+				num_write_errors);
+	}
 }
 
-static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
-			 struct page *page)
+static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
+					   int page_num)
 {
-	struct bio *bio = NULL;
+	struct scrub_page *spage = sblock->pagev[page_num];
+
+	BUG_ON(spage->page == NULL);
+	if (spage->io_error) {
+		void *mapped_buffer = kmap_atomic(spage->page);
+
+		memset(mapped_buffer, 0, PAGE_CACHE_SIZE);
+		flush_dcache_page(spage->page);
+		kunmap_atomic(mapped_buffer);
+	}
+	return scrub_add_page_to_wr_bio(sblock->sctx, spage);
+}
+
+static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage)
+{
+	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+	struct scrub_bio *sbio;
 	int ret;
-	DECLARE_COMPLETION_ONSTACK(complete);
 
-	bio = bio_alloc(GFP_NOFS, 1);
-	bio->bi_bdev = bdev;
-	bio->bi_sector = sector;
-	bio_add_page(bio, page, PAGE_SIZE, 0);
-	bio->bi_end_io = scrub_fixup_end_io;
-	bio->bi_private = &complete;
-	submit_bio(rw, bio);
+	mutex_lock(&wr_ctx->wr_lock);
+again:
+	if (!wr_ctx->wr_curr_bio) {
+		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
+					      GFP_NOFS);
+		if (!wr_ctx->wr_curr_bio) {
+			mutex_unlock(&wr_ctx->wr_lock);
+			return -ENOMEM;
+		}
+		wr_ctx->wr_curr_bio->sctx = sctx;
+		wr_ctx->wr_curr_bio->page_count = 0;
+	}
+	sbio = wr_ctx->wr_curr_bio;
+	if (sbio->page_count == 0) {
+		struct bio *bio;
 
-	/* this will also unplug the queue */
-	wait_for_completion(&complete);
+		sbio->physical = spage->physical_for_dev_replace;
+		sbio->logical = spage->logical;
+		sbio->dev = wr_ctx->tgtdev;
+		bio = sbio->bio;
+		if (!bio) {
+			bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
+			if (!bio) {
+				mutex_unlock(&wr_ctx->wr_lock);
+				return -ENOMEM;
+			}
+			sbio->bio = bio;
+		}
 
-	ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
-	bio_put(bio);
-	return ret;
+		bio->bi_private = sbio;
+		bio->bi_end_io = scrub_wr_bio_end_io;
+		bio->bi_bdev = sbio->dev->bdev;
+		bio->bi_sector = sbio->physical >> 9;
+		sbio->err = 0;
+	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+		   spage->physical_for_dev_replace ||
+		   sbio->logical + sbio->page_count * PAGE_SIZE !=
+		   spage->logical) {
+		scrub_wr_submit(sctx);
+		goto again;
+	}
+
+	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+	if (ret != PAGE_SIZE) {
+		if (sbio->page_count < 1) {
+			bio_put(sbio->bio);
+			sbio->bio = NULL;
+			mutex_unlock(&wr_ctx->wr_lock);
+			return -EIO;
+		}
+		scrub_wr_submit(sctx);
+		goto again;
+	}
+
+	sbio->pagev[sbio->page_count] = spage;
+	scrub_page_get(spage);
+	sbio->page_count++;
+	if (sbio->page_count == wr_ctx->pages_per_wr_bio)
+		scrub_wr_submit(sctx);
+	mutex_unlock(&wr_ctx->wr_lock);
+
+	return 0;
 }
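
scrub_add_page_to_wr_bio() above keeps appending pages to the current write bio as long as they are physically (and logically) contiguous, and flushes the bio once it is full or the next page does not line up. Reduced to a user-space sketch with hypothetical demo_page/demo_bio structures, the batching decision is roughly:

#include <stdio.h>

#define DEMO_PAGE_SIZE	4096ULL
#define PAGES_PER_BIO	4

/* hypothetical stand-ins for scrub_page / scrub_bio */
struct demo_page { unsigned long long physical; };
struct demo_bio { unsigned long long physical; int page_count; };

static void submit(struct demo_bio *bio)
{
	if (bio->page_count)
		printf("submit bio: phys=%llu, pages=%d\n",
		       bio->physical, bio->page_count);
	bio->page_count = 0;
}

static void add_page(struct demo_bio *bio, const struct demo_page *page)
{
	/* flush the current bio if the new page is not contiguous */
	if (bio->page_count &&
	    bio->physical + bio->page_count * DEMO_PAGE_SIZE != page->physical)
		submit(bio);
	if (bio->page_count == 0)
		bio->physical = page->physical;
	bio->page_count++;
	if (bio->page_count == PAGES_PER_BIO)	/* bio is full */
		submit(bio);
}

int main(void)
{
	/* three contiguous pages, then a gap, then two more */
	struct demo_page pages[] = {
		{ 0 }, { 4096 }, { 8192 }, { 40960 }, { 45056 },
	};
	struct demo_bio bio = { 0, 0 };
	unsigned i;

	for (i = 0; i < sizeof(pages) / sizeof(pages[0]); i++)
		add_page(&bio, &pages[i]);
	submit(&bio);	/* flush whatever is left */
	return 0;
}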
 
-static void scrub_bio_end_io(struct bio *bio, int err)
+static void scrub_wr_submit(struct scrub_ctx *sctx)
+{
+	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
+	struct scrub_bio *sbio;
+
+	if (!wr_ctx->wr_curr_bio)
+		return;
+
+	sbio = wr_ctx->wr_curr_bio;
+	wr_ctx->wr_curr_bio = NULL;
+	WARN_ON(!sbio->bio->bi_bdev);
+	scrub_pending_bio_inc(sctx);
+	/* process all writes in a single worker thread. Then the block layer
+	 * orders the requests before sending them to the driver which
+	 * doubled the write performance on spinning disks when measured
+	 * with Linux 3.5 */
+	btrfsic_submit_bio(WRITE, sbio->bio);
+}
+
+static void scrub_wr_bio_end_io(struct bio *bio, int err)
 {
 	struct scrub_bio *sbio = bio->bi_private;
-	struct scrub_dev *sdev = sbio->sdev;
-	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
 
 	sbio->err = err;
 	sbio->bio = bio;
 
-	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+	sbio->work.func = scrub_wr_bio_end_io_worker;
+	btrfs_queue_worker(&fs_info->scrub_wr_completion_workers, &sbio->work);
 }
 
-static void scrub_checksum(struct btrfs_work *work)
+static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
 {
 	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
-	struct scrub_dev *sdev = sbio->sdev;
-	struct page *page;
-	void *buffer;
+	struct scrub_ctx *sctx = sbio->sctx;
 	int i;
-	u64 flags;
-	u64 logical;
-	int ret;
 
+	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
 	if (sbio->err) {
-		ret = 0;
-		for (i = 0; i < sbio->count; ++i)
-			ret |= scrub_recheck_error(sbio, i);
-		if (!ret) {
-			spin_lock(&sdev->stat_lock);
-			++sdev->stat.unverified_errors;
-			spin_unlock(&sdev->stat_lock);
-		}
-
-		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
-		sbio->bio->bi_phys_segments = 0;
-		sbio->bio->bi_idx = 0;
-
-		for (i = 0; i < sbio->count; i++) {
-			struct bio_vec *bi;
-			bi = &sbio->bio->bi_io_vec[i];
-			bi->bv_offset = 0;
-			bi->bv_len = PAGE_SIZE;
-		}
-		goto out;
-	}
-	for (i = 0; i < sbio->count; ++i) {
-		page = sbio->bio->bi_io_vec[i].bv_page;
-		buffer = kmap_atomic(page, KM_USER0);
-		flags = sbio->spag[i].flags;
-		logical = sbio->logical + i * PAGE_SIZE;
-		ret = 0;
-		if (flags & BTRFS_EXTENT_FLAG_DATA) {
-			ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
-		} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-			ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
-							logical, buffer);
-		} else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
-			BUG_ON(i);
-			(void)scrub_checksum_super(sbio, buffer);
-		} else {
-			WARN_ON(1);
-		}
-		kunmap_atomic(buffer, KM_USER0);
-		if (ret) {
-			ret = scrub_recheck_error(sbio, i);
-			if (!ret) {
-				spin_lock(&sdev->stat_lock);
-				++sdev->stat.unverified_errors;
-				spin_unlock(&sdev->stat_lock);
-			}
+		struct btrfs_dev_replace *dev_replace =
+			&sbio->sctx->dev_root->fs_info->dev_replace;
+
+		for (i = 0; i < sbio->page_count; i++) {
+			struct scrub_page *spage = sbio->pagev[i];
+
+			spage->io_error = 1;
+			btrfs_dev_replace_stats_inc(&dev_replace->
+						    num_write_errors);
 		}
 	}
 
-out:
-	scrub_free_bio(sbio->bio);
-	sbio->bio = NULL;
-	spin_lock(&sdev->list_lock);
-	sbio->next_free = sdev->first_free;
-	sdev->first_free = sbio->index;
-	spin_unlock(&sdev->list_lock);
-	atomic_dec(&sdev->in_flight);
-	wake_up(&sdev->list_wait);
+	for (i = 0; i < sbio->page_count; i++)
+		scrub_page_put(sbio->pagev[i]);
+
+	bio_put(sbio->bio);
+	kfree(sbio);
+	scrub_pending_bio_dec(sctx);
+}
+
+static int scrub_checksum(struct scrub_block *sblock)
+{
+	u64 flags;
+	int ret;
+
+	WARN_ON(sblock->page_count < 1);
+	flags = sblock->pagev[0]->flags;
+	ret = 0;
+	if (flags & BTRFS_EXTENT_FLAG_DATA)
+		ret = scrub_checksum_data(sblock);
+	else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+		ret = scrub_checksum_tree_block(sblock);
+	else if (flags & BTRFS_EXTENT_FLAG_SUPER)
+		(void)scrub_checksum_super(sblock);
+	else
+		WARN_ON(1);
+	if (ret)
+		scrub_handle_errored_block(sblock);
+
+	return ret;
 }
 
-static int scrub_checksum_data(struct scrub_dev *sdev,
-			       struct scrub_page *spag, void *buffer)
+static int scrub_checksum_data(struct scrub_block *sblock)
 {
+	struct scrub_ctx *sctx = sblock->sctx;
 	u8 csum[BTRFS_CSUM_SIZE];
+	u8 *on_disk_csum;
+	struct page *page;
+	void *buffer;
 	u32 crc = ~(u32)0;
 	int fail = 0;
-	struct btrfs_root *root = sdev->dev->dev_root;
+	u64 len;
+	int index;
 
-	if (!spag->have_csum)
+	BUG_ON(sblock->page_count < 1);
+	if (!sblock->pagev[0]->have_csum)
 		return 0;
 
-	crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
+	on_disk_csum = sblock->pagev[0]->csum;
+	page = sblock->pagev[0]->page;
+	buffer = kmap_atomic(page);
+
+	len = sctx->sectorsize;
+	index = 0;
+	for (;;) {
+		u64 l = min_t(u64, len, PAGE_SIZE);
+
+		crc = btrfs_csum_data(buffer, crc, l);
+		kunmap_atomic(buffer);
+		len -= l;
+		if (len == 0)
+			break;
+		index++;
+		BUG_ON(index >= sblock->page_count);
+		BUG_ON(!sblock->pagev[index]->page);
+		page = sblock->pagev[index]->page;
+		buffer = kmap_atomic(page);
+	}
+
 	btrfs_csum_final(crc, csum);
-	if (memcmp(csum, spag->csum, sdev->csum_size))
+	if (memcmp(csum, on_disk_csum, sctx->csum_size))
 		fail = 1;
 
-	spin_lock(&sdev->stat_lock);
-	++sdev->stat.data_extents_scrubbed;
-	sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
-	if (fail)
-		++sdev->stat.csum_errors;
-	spin_unlock(&sdev->stat_lock);
-
 	return fail;
 }
 
-static int scrub_checksum_tree_block(struct scrub_dev *sdev,
-				     struct scrub_page *spag, u64 logical,
-				     void *buffer)
+static int scrub_checksum_tree_block(struct scrub_block *sblock)
 {
+	struct scrub_ctx *sctx = sblock->sctx;
 	struct btrfs_header *h;
-	struct btrfs_root *root = sdev->dev->dev_root;
+	struct btrfs_root *root = sctx->dev_root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	u8 csum[BTRFS_CSUM_SIZE];
+	u8 calculated_csum[BTRFS_CSUM_SIZE];
+	u8 on_disk_csum[BTRFS_CSUM_SIZE];
+	struct page *page;
+	void *mapped_buffer;
+	u64 mapped_size;
+	void *p;
 	u32 crc = ~(u32)0;
 	int fail = 0;
 	int crc_fail = 0;
+	u64 len;
+	int index;
+
+	BUG_ON(sblock->page_count < 1);
+	page = sblock->pagev[0]->page;
+	mapped_buffer = kmap_atomic(page);
+	h = (struct btrfs_header *)mapped_buffer;
+	memcpy(on_disk_csum, h->csum, sctx->csum_size);
 
 	/*
 	 * we don't use the getter functions here, as we
 	 * a) don't have an extent buffer and
 	 * b) the page is already kmapped
 	 */
-	h = (struct btrfs_header *)buffer;
 
-	if (logical != le64_to_cpu(h->bytenr))
+	if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
 		++fail;
 
-	if (spag->generation != le64_to_cpu(h->generation))
+	if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
 		++fail;
 
 	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -885,184 +1738,411 @@
 		   BTRFS_UUID_SIZE))
 		++fail;
 
-	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
-			      PAGE_SIZE - BTRFS_CSUM_SIZE);
-	btrfs_csum_final(crc, csum);
-	if (memcmp(csum, h->csum, sdev->csum_size))
-		++crc_fail;
+	WARN_ON(sctx->nodesize != sctx->leafsize);
+	len = sctx->nodesize - BTRFS_CSUM_SIZE;
+	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+	index = 0;
+	for (;;) {
+		u64 l = min_t(u64, len, mapped_size);
 
-	spin_lock(&sdev->stat_lock);
-	++sdev->stat.tree_extents_scrubbed;
-	sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
-	if (crc_fail)
-		++sdev->stat.csum_errors;
-	if (fail)
-		++sdev->stat.verify_errors;
-	spin_unlock(&sdev->stat_lock);
+		crc = btrfs_csum_data(p, crc, l);
+		kunmap_atomic(mapped_buffer);
+		len -= l;
+		if (len == 0)
+			break;
+		index++;
+		BUG_ON(index >= sblock->page_count);
+		BUG_ON(!sblock->pagev[index]->page);
+		page = sblock->pagev[index]->page;
+		mapped_buffer = kmap_atomic(page);
+		mapped_size = PAGE_SIZE;
+		p = mapped_buffer;
+	}
+
+	btrfs_csum_final(crc, calculated_csum);
+	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+		++crc_fail;
 
 	return fail || crc_fail;
 }
 
-static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
+static int scrub_checksum_super(struct scrub_block *sblock)
 {
 	struct btrfs_super_block *s;
-	u64 logical;
-	struct scrub_dev *sdev = sbio->sdev;
-	struct btrfs_root *root = sdev->dev->dev_root;
+	struct scrub_ctx *sctx = sblock->sctx;
+	struct btrfs_root *root = sctx->dev_root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	u8 csum[BTRFS_CSUM_SIZE];
+	u8 calculated_csum[BTRFS_CSUM_SIZE];
+	u8 on_disk_csum[BTRFS_CSUM_SIZE];
+	struct page *page;
+	void *mapped_buffer;
+	u64 mapped_size;
+	void *p;
 	u32 crc = ~(u32)0;
-	int fail = 0;
-
-	s = (struct btrfs_super_block *)buffer;
-	logical = sbio->logical;
+	int fail_gen = 0;
+	int fail_cor = 0;
+	u64 len;
+	int index;
+
+	BUG_ON(sblock->page_count < 1);
+	page = sblock->pagev[0]->page;
+	mapped_buffer = kmap_atomic(page);
+	s = (struct btrfs_super_block *)mapped_buffer;
+	memcpy(on_disk_csum, s->csum, sctx->csum_size);
 
-	if (logical != le64_to_cpu(s->bytenr))
-		++fail;
+	if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
+		++fail_cor;
 
-	if (sbio->spag[0].generation != le64_to_cpu(s->generation))
-		++fail;
+	if (sblock->pagev[0]->generation != btrfs_super_generation(s))
+		++fail_gen;
 
 	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
-		++fail;
+		++fail_cor;
 
-	crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
-			      PAGE_SIZE - BTRFS_CSUM_SIZE);
-	btrfs_csum_final(crc, csum);
-	if (memcmp(csum, s->csum, sbio->sdev->csum_size))
-		++fail;
+	len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
+	mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;
+	p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;
+	index = 0;
+	for (;;) {
+		u64 l = min_t(u64, len, mapped_size);
 
-	if (fail) {
+		crc = btrfs_csum_data(p, crc, l);
+		kunmap_atomic(mapped_buffer);
+		len -= l;
+		if (len == 0)
+			break;
+		index++;
+		BUG_ON(index >= sblock->page_count);
+		BUG_ON(!sblock->pagev[index]->page);
+		page = sblock->pagev[index]->page;
+		mapped_buffer = kmap_atomic(page);
+		mapped_size = PAGE_SIZE;
+		p = mapped_buffer;
+	}
+
+	btrfs_csum_final(crc, calculated_csum);
+	if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))
+		++fail_cor;
+
+	if (fail_cor + fail_gen) {
 		/*
 		 * if we find an error in a super block, we just report it.
 		 * They will get written with the next transaction commit
 		 * anyway
 		 */
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.super_errors;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		++sctx->stat.super_errors;
+		spin_unlock(&sctx->stat_lock);
+		if (fail_cor)
+			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+				BTRFS_DEV_STAT_CORRUPTION_ERRS);
+		else
+			btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
+				BTRFS_DEV_STAT_GENERATION_ERRS);
 	}
 
-	return fail;
+	return fail_cor + fail_gen;
 }
 
-static int scrub_submit(struct scrub_dev *sdev)
+static void scrub_block_get(struct scrub_block *sblock)
 {
-	struct scrub_bio *sbio;
+	atomic_inc(&sblock->ref_count);
+}
 
-	if (sdev->curr == -1)
-		return 0;
+static void scrub_block_put(struct scrub_block *sblock)
+{
+	if (atomic_dec_and_test(&sblock->ref_count)) {
+		int i;
 
-	sbio = sdev->bios[sdev->curr];
-	sbio->err = 0;
-	sdev->curr = -1;
-	atomic_inc(&sdev->in_flight);
+		for (i = 0; i < sblock->page_count; i++)
+			scrub_page_put(sblock->pagev[i]);
+		kfree(sblock);
+	}
+}
 
-	submit_bio(READ, sbio->bio);
+static void scrub_page_get(struct scrub_page *spage)
+{
+	atomic_inc(&spage->ref_count);
+}
 
-	return 0;
+static void scrub_page_put(struct scrub_page *spage)
+{
+	if (atomic_dec_and_test(&spage->ref_count)) {
+		if (spage->page)
+			__free_page(spage->page);
+		kfree(spage);
+	}
 }
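
scrub_block_get/put() and scrub_page_get/put() above are plain last-put-frees reference counters built on atomic_dec_and_test(). The same lifetime rule, rephrased as a small user-space sketch with C11 atomics and a made-up demo_page type, is approximately:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* hypothetical stand-in for the refcounted scrub_page */
struct demo_page {
	atomic_int ref_count;
	char payload[16];
};

static struct demo_page *demo_page_alloc(void)
{
	struct demo_page *p = calloc(1, sizeof(*p));

	if (p)
		atomic_init(&p->ref_count, 1);	/* creator's reference */
	return p;
}

static void demo_page_get(struct demo_page *p)
{
	atomic_fetch_add(&p->ref_count, 1);
}

static void demo_page_put(struct demo_page *p)
{
	/* last put frees, mirroring atomic_dec_and_test() + kfree() */
	if (atomic_fetch_sub(&p->ref_count, 1) == 1) {
		printf("freeing page\n");
		free(p);
	}
}

int main(void)
{
	struct demo_page *p = demo_page_alloc();

	if (!p)
		return 1;
	demo_page_get(p);	/* e.g. the reference held by a bio */
	demo_page_put(p);	/* bio completion drops its reference */
	demo_page_put(p);	/* creator drops the last one: freed here */
	return 0;
}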
 
-static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
-		      u64 physical, u64 flags, u64 gen, int mirror_num,
-		      u8 *csum, int force)
+static void scrub_submit(struct scrub_ctx *sctx)
 {
 	struct scrub_bio *sbio;
-	struct page *page;
+
+	if (sctx->curr == -1)
+		return;
+
+	sbio = sctx->bios[sctx->curr];
+	sctx->curr = -1;
+	scrub_pending_bio_inc(sctx);
+
+	if (!sbio->bio->bi_bdev) {
+		/*
+		 * this case should not happen. If btrfs_map_block() is
+		 * wrong, it could happen for dev-replace operations on
+		 * missing devices when no mirrors are available, but in
+		 * this case it should already fail the mount.
+		 * This case is handled correctly (but _very_ slowly).
+		 */
+		printk_ratelimited(KERN_WARNING
+			"btrfs: scrub_submit(bio bdev == NULL) is unexpected!\n");
+		bio_endio(sbio->bio, -EIO);
+	} else {
+		btrfsic_submit_bio(READ, sbio->bio);
+	}
+}
+
+static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
+				    struct scrub_page *spage)
+{
+	struct scrub_block *sblock = spage->sblock;
+	struct scrub_bio *sbio;
 	int ret;
 
 again:
 	/*
 	 * grab a fresh bio or wait for one to become available
 	 */
-	while (sdev->curr == -1) {
-		spin_lock(&sdev->list_lock);
-		sdev->curr = sdev->first_free;
-		if (sdev->curr != -1) {
-			sdev->first_free = sdev->bios[sdev->curr]->next_free;
-			sdev->bios[sdev->curr]->next_free = -1;
-			sdev->bios[sdev->curr]->count = 0;
-			spin_unlock(&sdev->list_lock);
+	while (sctx->curr == -1) {
+		spin_lock(&sctx->list_lock);
+		sctx->curr = sctx->first_free;
+		if (sctx->curr != -1) {
+			sctx->first_free = sctx->bios[sctx->curr]->next_free;
+			sctx->bios[sctx->curr]->next_free = -1;
+			sctx->bios[sctx->curr]->page_count = 0;
+			spin_unlock(&sctx->list_lock);
 		} else {
-			spin_unlock(&sdev->list_lock);
-			wait_event(sdev->list_wait, sdev->first_free != -1);
+			spin_unlock(&sctx->list_lock);
+			wait_event(sctx->list_wait, sctx->first_free != -1);
 		}
 	}
-	sbio = sdev->bios[sdev->curr];
-	if (sbio->count == 0) {
+	sbio = sctx->bios[sctx->curr];
+	if (sbio->page_count == 0) {
 		struct bio *bio;
 
-		sbio->physical = physical;
-		sbio->logical = logical;
-		bio = bio_alloc(GFP_NOFS, SCRUB_PAGES_PER_BIO);
-		if (!bio)
-			return -ENOMEM;
+		sbio->physical = spage->physical;
+		sbio->logical = spage->logical;
+		sbio->dev = spage->dev;
+		bio = sbio->bio;
+		if (!bio) {
+			bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
+			if (!bio)
+				return -ENOMEM;
+			sbio->bio = bio;
+		}
 
 		bio->bi_private = sbio;
 		bio->bi_end_io = scrub_bio_end_io;
-		bio->bi_bdev = sdev->dev->bdev;
+		bio->bi_bdev = sbio->dev->bdev;
 		bio->bi_sector = sbio->physical >> 9;
 		sbio->err = 0;
-		sbio->bio = bio;
-	} else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
-		   sbio->logical + sbio->count * PAGE_SIZE != logical) {
-		ret = scrub_submit(sdev);
-		if (ret)
-			return ret;
+	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
+		   spage->physical ||
+		   sbio->logical + sbio->page_count * PAGE_SIZE !=
+		   spage->logical ||
+		   sbio->dev != spage->dev) {
+		scrub_submit(sctx);
 		goto again;
 	}
-	sbio->spag[sbio->count].flags = flags;
-	sbio->spag[sbio->count].generation = gen;
-	sbio->spag[sbio->count].have_csum = 0;
-	sbio->spag[sbio->count].mirror_num = mirror_num;
-
-	page = alloc_page(GFP_NOFS);
-	if (!page)
-		return -ENOMEM;
 
-	ret = bio_add_page(sbio->bio, page, PAGE_SIZE, 0);
-	if (!ret) {
-		__free_page(page);
-		ret = scrub_submit(sdev);
-		if (ret)
-			return ret;
+	sbio->pagev[sbio->page_count] = spage;
+	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
+	if (ret != PAGE_SIZE) {
+		if (sbio->page_count < 1) {
+			bio_put(sbio->bio);
+			sbio->bio = NULL;
+			return -EIO;
+		}
+		scrub_submit(sctx);
 		goto again;
 	}
 
-	if (csum) {
-		sbio->spag[sbio->count].have_csum = 1;
-		memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
+	scrub_block_get(sblock); /* one for the page added to the bio */
+	atomic_inc(&sblock->outstanding_pages);
+	sbio->page_count++;
+	if (sbio->page_count == sctx->pages_per_rd_bio)
+		scrub_submit(sctx);
+
+	return 0;
+}
+
+static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+		       u64 physical, struct btrfs_device *dev, u64 flags,
+		       u64 gen, int mirror_num, u8 *csum, int force,
+		       u64 physical_for_dev_replace)
+{
+	struct scrub_block *sblock;
+	int index;
+
+	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
+	if (!sblock) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+
+	/* one ref inside this function, plus one for each page added to
+	 * a bio later on */
+	atomic_set(&sblock->ref_count, 1);
+	sblock->sctx = sctx;
+	sblock->no_io_error_seen = 1;
+
+	for (index = 0; len > 0; index++) {
+		struct scrub_page *spage;
+		u64 l = min_t(u64, len, PAGE_SIZE);
+
+		spage = kzalloc(sizeof(*spage), GFP_NOFS);
+		if (!spage) {
+leave_nomem:
+			spin_lock(&sctx->stat_lock);
+			sctx->stat.malloc_errors++;
+			spin_unlock(&sctx->stat_lock);
+			scrub_block_put(sblock);
+			return -ENOMEM;
+		}
+		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
+		scrub_page_get(spage);
+		sblock->pagev[index] = spage;
+		spage->sblock = sblock;
+		spage->dev = dev;
+		spage->flags = flags;
+		spage->generation = gen;
+		spage->logical = logical;
+		spage->physical = physical;
+		spage->physical_for_dev_replace = physical_for_dev_replace;
+		spage->mirror_num = mirror_num;
+		if (csum) {
+			spage->have_csum = 1;
+			memcpy(spage->csum, csum, sctx->csum_size);
+		} else {
+			spage->have_csum = 0;
+		}
+		sblock->page_count++;
+		spage->page = alloc_page(GFP_NOFS);
+		if (!spage->page)
+			goto leave_nomem;
+		len -= l;
+		logical += l;
+		physical += l;
+		physical_for_dev_replace += l;
 	}
-	++sbio->count;
-	if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
+
+	WARN_ON(sblock->page_count == 0);
+	for (index = 0; index < sblock->page_count; index++) {
+		struct scrub_page *spage = sblock->pagev[index];
 		int ret;
 
-		ret = scrub_submit(sdev);
-		if (ret)
+		ret = scrub_add_page_to_rd_bio(sctx, spage);
+		if (ret) {
+			scrub_block_put(sblock);
 			return ret;
+		}
 	}
 
+	if (force)
+		scrub_submit(sctx);
+
+	/* last one frees, either here or in bio completion for last page */
+	scrub_block_put(sblock);
 	return 0;
 }
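
scrub_pages() above slices one extent into PAGE_SIZE-sized scrub_page entries, advancing the logical address, the physical address and the dev-replace target address in lockstep. Stripped down to the slicing arithmetic alone (addresses and lengths below are made up), the loop is essentially:

#include <stdio.h>

#define DEMO_PAGE_SIZE	4096ULL

static unsigned long long min_u64(unsigned long long a, unsigned long long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* a 10000-byte extent starting at these (made-up) addresses */
	unsigned long long logical = 1048576, physical = 2097152, len = 10000;

	while (len > 0) {
		unsigned long long l = min_u64(len, DEMO_PAGE_SIZE);

		printf("page: logical=%llu physical=%llu len=%llu\n",
		       logical, physical, l);
		len -= l;
		logical += l;
		physical += l;
	}
	return 0;
}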
 
-static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
+static void scrub_bio_end_io(struct bio *bio, int err)
+{
+	struct scrub_bio *sbio = bio->bi_private;
+	struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;
+
+	sbio->err = err;
+	sbio->bio = bio;
+
+	btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
+}
+
+static void scrub_bio_end_io_worker(struct btrfs_work *work)
+{
+	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
+	struct scrub_ctx *sctx = sbio->sctx;
+	int i;
+
+	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
+	if (sbio->err) {
+		for (i = 0; i < sbio->page_count; i++) {
+			struct scrub_page *spage = sbio->pagev[i];
+
+			spage->io_error = 1;
+			spage->sblock->no_io_error_seen = 0;
+		}
+	}
+
+	/* now complete the scrub_block items that have all pages completed */
+	for (i = 0; i < sbio->page_count; i++) {
+		struct scrub_page *spage = sbio->pagev[i];
+		struct scrub_block *sblock = spage->sblock;
+
+		if (atomic_dec_and_test(&sblock->outstanding_pages))
+			scrub_block_complete(sblock);
+		scrub_block_put(sblock);
+	}
+
+	bio_put(sbio->bio);
+	sbio->bio = NULL;
+	spin_lock(&sctx->list_lock);
+	sbio->next_free = sctx->first_free;
+	sctx->first_free = sbio->index;
+	spin_unlock(&sctx->list_lock);
+
+	if (sctx->is_dev_replace &&
+	    atomic_read(&sctx->wr_ctx.flush_all_writes)) {
+		mutex_lock(&sctx->wr_ctx.wr_lock);
+		scrub_wr_submit(sctx);
+		mutex_unlock(&sctx->wr_ctx.wr_lock);
+	}
+
+	scrub_pending_bio_dec(sctx);
+}
+
+static void scrub_block_complete(struct scrub_block *sblock)
+{
+	if (!sblock->no_io_error_seen) {
+		scrub_handle_errored_block(sblock);
+	} else {
+		/*
+		 * if the block has a checksum error, it is written via the
+		 * repair mechanism in the dev-replace case; otherwise it is
+		 * written out here in the dev-replace case.
+		 */
+		if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)
+			scrub_write_block_to_dev_replace(sblock);
+	}
+}
+
+static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
 			   u8 *csum)
 {
 	struct btrfs_ordered_sum *sum = NULL;
-	int ret = 0;
-	unsigned long i;
+	unsigned long index;
 	unsigned long num_sectors;
-	u32 sectorsize = sdev->dev->dev_root->sectorsize;
 
-	while (!list_empty(&sdev->csum_list)) {
-		sum = list_first_entry(&sdev->csum_list,
+	while (!list_empty(&sctx->csum_list)) {
+		sum = list_first_entry(&sctx->csum_list,
 				       struct btrfs_ordered_sum, list);
 		if (sum->bytenr > logical)
 			return 0;
 		if (sum->bytenr + sum->len > logical)
 			break;
 
-		++sdev->stat.csum_discards;
+		++sctx->stat.csum_discards;
 		list_del(&sum->list);
 		kfree(sum);
 		sum = NULL;
@@ -1070,54 +2150,81 @@
 	if (!sum)
 		return 0;
 
-	num_sectors = sum->len / sectorsize;
-	for (i = 0; i < num_sectors; ++i) {
-		if (sum->sums[i].bytenr == logical) {
-			memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
-			ret = 1;
-			break;
-		}
-	}
-	if (ret && i == num_sectors - 1) {
+	index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;
+	num_sectors = sum->len / sctx->sectorsize;
+	memcpy(csum, sum->sums + index, sctx->csum_size);
+	if (index == num_sectors - 1) {
 		list_del(&sum->list);
 		kfree(sum);
 	}
-	return ret;
+	return 1;
 }
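
The rewritten scrub_find_csum() above no longer walks the ordered-sum record sector by sector; it computes the index directly from (logical - sum->bytenr) / sectorsize and frees the record after handing out the checksum of its last sector. The index arithmetic in isolation, with made-up sizes and values, looks like:

#include <stdio.h>
#include <string.h>

#define DEMO_SECTORSIZE	4096ULL
#define DEMO_CSUM_SIZE	4

int main(void)
{
	/* a made-up ordered-sum record covering 8 sectors from bytenr */
	unsigned long long bytenr = 1048576, len = 8 * DEMO_SECTORSIZE;
	unsigned char sums[8][DEMO_CSUM_SIZE] = {
		{ 0x11 }, { 0x22 }, { 0x33 }, { 0x44 },
		{ 0x55 }, { 0x66 }, { 0x77 }, { 0x88 },
	};
	unsigned long long logical = bytenr + 3 * DEMO_SECTORSIZE;
	unsigned char csum[DEMO_CSUM_SIZE];
	unsigned long long index, num_sectors;

	/* same arithmetic as scrub_find_csum(): index into the sum record */
	index = (logical - bytenr) / DEMO_SECTORSIZE;
	num_sectors = len / DEMO_SECTORSIZE;
	memcpy(csum, sums[index], DEMO_CSUM_SIZE);

	printf("index=%llu of %llu, csum[0]=0x%02x\n",
	       index, num_sectors, csum[0]);
	return 0;
}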
 
 /* scrub extent tries to collect up to 64 kB for each bio */
-static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
-			u64 physical, u64 flags, u64 gen, int mirror_num)
+static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
+			u64 physical, struct btrfs_device *dev, u64 flags,
+			u64 gen, int mirror_num, u64 physical_for_dev_replace)
 {
 	int ret;
 	u8 csum[BTRFS_CSUM_SIZE];
+	u32 blocksize;
+
+	if (flags & BTRFS_EXTENT_FLAG_DATA) {
+		blocksize = sctx->sectorsize;
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.data_extents_scrubbed++;
+		sctx->stat.data_bytes_scrubbed += len;
+		spin_unlock(&sctx->stat_lock);
+	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		WARN_ON(sctx->nodesize != sctx->leafsize);
+		blocksize = sctx->nodesize;
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.tree_extents_scrubbed++;
+		sctx->stat.tree_bytes_scrubbed += len;
+		spin_unlock(&sctx->stat_lock);
+	} else {
+		blocksize = sctx->sectorsize;
+		WARN_ON(1);
+	}
 
 	while (len) {
-		u64 l = min_t(u64, len, PAGE_SIZE);
+		u64 l = min_t(u64, len, blocksize);
 		int have_csum = 0;
 
 		if (flags & BTRFS_EXTENT_FLAG_DATA) {
 			/* push csums to sbio */
-			have_csum = scrub_find_csum(sdev, logical, l, csum);
+			have_csum = scrub_find_csum(sctx, logical, l, csum);
 			if (have_csum == 0)
-				++sdev->stat.no_csum;
+				++sctx->stat.no_csum;
+			if (sctx->is_dev_replace && !have_csum) {
+				ret = copy_nocow_pages(sctx, logical, l,
+						       mirror_num,
+						      physical_for_dev_replace);
+				goto behind_scrub_pages;
+			}
 		}
-		ret = scrub_page(sdev, logical, l, physical, flags, gen,
-				 mirror_num, have_csum ? csum : NULL, 0);
+		ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,
+				  mirror_num, have_csum ? csum : NULL, 0,
+				  physical_for_dev_replace);
+behind_scrub_pages:
 		if (ret)
 			return ret;
 		len -= l;
 		logical += l;
 		physical += l;
+		physical_for_dev_replace += l;
 	}
 	return 0;
 }
 
-static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
-	struct map_lookup *map, int num, u64 base, u64 length)
+static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
+					   struct map_lookup *map,
+					   struct btrfs_device *scrub_dev,
+					   int num, u64 base, u64 length,
+					   int is_dev_replace)
 {
 	struct btrfs_path *path;
-	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 	struct btrfs_root *root = fs_info->extent_root;
 	struct btrfs_root *csum_root = fs_info->csum_root;
 	struct btrfs_extent_item *extent;
@@ -1125,21 +2232,33 @@
 	u64 flags;
 	int ret;
 	int slot;
-	int i;
 	u64 nstripes;
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	u64 physical;
 	u64 logical;
+	u64 logic_end;
 	u64 generation;
 	int mirror_num;
 	struct reada_control *reada1;
 	struct reada_control *reada2;
 	struct btrfs_key key_start;
 	struct btrfs_key key_end;
-
 	u64 increment = map->stripe_len;
 	u64 offset;
+	u64 extent_logical;
+	u64 extent_physical;
+	u64 extent_len;
+	struct btrfs_device *extent_dev;
+	int extent_mirror_num;
+	int stop_loop;
+
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6)) {
+		if (num >= nr_data_stripes(map)) {
+			return 0;
+		}
+	}
 
 	nstripes = length;
 	offset = 0;
@@ -1168,6 +2287,11 @@
 	if (!path)
 		return -ENOMEM;
 
+	/*
+	 * work on commit root. The related disk blocks are static as
+	 * long as COW is applied. This means it is safe to rewrite
+	 * them to repair disk errors without any race conditions
+	 */
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
 
@@ -1178,8 +2302,8 @@
 	 */
 	logical = base + offset;
 
-	wait_event(sdev->list_wait,
-		   atomic_read(&sdev->in_flight) == 0);
+	wait_event(sctx->list_wait,
+		   atomic_read(&sctx->bios_in_flight) == 0);
 	atomic_inc(&fs_info->scrubs_paused);
 	wake_up(&fs_info->scrub_pause_wait);
 
@@ -1188,8 +2312,8 @@
 	key_start.type = BTRFS_EXTENT_ITEM_KEY;
 	key_start.offset = (u64)0;
 	key_end.objectid = base + offset + nstripes * increment;
-	key_end.type = BTRFS_EXTENT_ITEM_KEY;
-	key_end.offset = (u64)0;
+	key_end.type = BTRFS_METADATA_ITEM_KEY;
+	key_end.offset = (u64)-1;
 	reada1 = btrfs_reada_add(root, &key_start, &key_end);
 
 	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -1227,13 +2351,14 @@
 	 */
 	logical = base + offset;
 	physical = map->stripes[num].physical;
+	logic_end = logical + increment * nstripes;
 	ret = 0;
-	for (i = 0; i < nstripes; ++i) {
+	while (logical < logic_end) {
 		/*
 		 * canceled?
 		 */
 		if (atomic_read(&fs_info->scrub_cancel_req) ||
-		    atomic_read(&sdev->cancel_req)) {
+		    atomic_read(&sctx->cancel_req)) {
 			ret = -ECANCELED;
 			goto out;
 		}
@@ -1242,9 +2367,14 @@
 		 */
 		if (atomic_read(&fs_info->scrub_pause_req)) {
 			/* push queued extents */
-			scrub_submit(sdev);
-			wait_event(sdev->list_wait,
-				   atomic_read(&sdev->in_flight) == 0);
+			atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+			scrub_submit(sctx);
+			mutex_lock(&sctx->wr_ctx.wr_lock);
+			scrub_wr_submit(sctx);
+			mutex_unlock(&sctx->wr_ctx.wr_lock);
+			wait_event(sctx->list_wait,
+				   atomic_read(&sctx->bios_in_flight) == 0);
+			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
 			atomic_inc(&fs_info->scrubs_paused);
 			wake_up(&fs_info->scrub_pause_wait);
 			mutex_lock(&fs_info->scrub_lock);
@@ -1259,19 +2389,14 @@
 			wake_up(&fs_info->scrub_pause_wait);
 		}
 
-		ret = btrfs_lookup_csums_range(csum_root, logical,
-					       logical + map->stripe_len - 1,
-					       &sdev->csum_list, 1);
-		if (ret)
-			goto out;
-
 		key.objectid = logical;
 		key.type = BTRFS_EXTENT_ITEM_KEY;
-		key.offset = (u64)0;
+		key.offset = (u64)-1;
 
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
+
 		if (ret > 0) {
 			ret = btrfs_previous_item(root, path, 0,
 						  BTRFS_EXTENT_ITEM_KEY);
@@ -1288,7 +2413,10 @@
 			}
 		}
 
+		stop_loop = 0;
 		while (1) {
+			u64 bytes;
+
 			l = path->nodes[0];
 			slot = path->slots[0];
 			if (slot >= btrfs_header_nritems(l)) {
@@ -1298,19 +2426,30 @@
 				if (ret < 0)
 					goto out;
 
+				stop_loop = 1;
 				break;
 			}
 			btrfs_item_key_to_cpu(l, &key, slot);
 
-			if (key.objectid + key.offset <= logical)
-				goto next;
+			if (key.type == BTRFS_METADATA_ITEM_KEY)
+				bytes = root->leafsize;
+			else
+				bytes = key.offset;
 
-			if (key.objectid >= logical + map->stripe_len)
-				break;
+			if (key.objectid + bytes <= logical)
+				goto next;
 
-			if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
+			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
+			    key.type != BTRFS_METADATA_ITEM_KEY)
 				goto next;
 
+			if (key.objectid >= logical + map->stripe_len) {
+				/* out of this device extent */
+				if (key.objectid >= logic_end)
+					stop_loop = 1;
+				break;
+			}
+
 			extent = btrfs_item_ptr(l, slot,
 						struct btrfs_extent_item);
 			flags = btrfs_extent_flags(l, extent);
@@ -1321,58 +2460,105 @@
 				printk(KERN_ERR
 				       "btrfs scrub: tree block %llu spanning "
 				       "stripes, ignored. logical=%llu\n",
-				       (unsigned long long)key.objectid,
-				       (unsigned long long)logical);
+				       key.objectid, logical);
 				goto next;
 			}
 
+again:
+			extent_logical = key.objectid;
+			extent_len = bytes;
+
 			/*
 			 * trim extent to this stripe
 			 */
-			if (key.objectid < logical) {
-				key.offset -= logical - key.objectid;
-				key.objectid = logical;
+			if (extent_logical < logical) {
+				extent_len -= logical - extent_logical;
+				extent_logical = logical;
 			}
-			if (key.objectid + key.offset >
+			if (extent_logical + extent_len >
 			    logical + map->stripe_len) {
-				key.offset = logical + map->stripe_len -
-					     key.objectid;
+				extent_len = logical + map->stripe_len -
+					     extent_logical;
 			}
 
-			ret = scrub_extent(sdev, key.objectid, key.offset,
-					   key.objectid - logical + physical,
-					   flags, generation, mirror_num);
+			extent_physical = extent_logical - logical + physical;
+			extent_dev = scrub_dev;
+			extent_mirror_num = mirror_num;
+			if (is_dev_replace)
+				scrub_remap_extent(fs_info, extent_logical,
+						   extent_len, &extent_physical,
+						   &extent_dev,
+						   &extent_mirror_num);
+
+			ret = btrfs_lookup_csums_range(csum_root, logical,
+						logical + map->stripe_len - 1,
+						&sctx->csum_list, 1);
+			if (ret)
+				goto out;
+
+			ret = scrub_extent(sctx, extent_logical, extent_len,
+					   extent_physical, extent_dev, flags,
+					   generation, extent_mirror_num,
+					   extent_logical - logical + physical);
 			if (ret)
 				goto out;
 
+			scrub_free_csums(sctx);
+			if (extent_logical + extent_len <
+			    key.objectid + bytes) {
+				logical += increment;
+				physical += map->stripe_len;
+
+				if (logical < key.objectid + bytes) {
+					cond_resched();
+					goto again;
+				}
+
+				if (logical >= logic_end) {
+					stop_loop = 1;
+					break;
+				}
+			}
 next:
 			path->slots[0]++;
 		}
 		btrfs_release_path(path);
 		logical += increment;
 		physical += map->stripe_len;
-		spin_lock(&sdev->stat_lock);
-		sdev->stat.last_physical = physical;
-		spin_unlock(&sdev->stat_lock);
+		spin_lock(&sctx->stat_lock);
+		if (stop_loop)
+			sctx->stat.last_physical = map->stripes[num].physical +
+						   length;
+		else
+			sctx->stat.last_physical = physical;
+		spin_unlock(&sctx->stat_lock);
+		if (stop_loop)
+			break;
 	}
+out:
 	/* push queued extents */
-	scrub_submit(sdev);
+	scrub_submit(sctx);
+	mutex_lock(&sctx->wr_ctx.wr_lock);
+	scrub_wr_submit(sctx);
+	mutex_unlock(&sctx->wr_ctx.wr_lock);
 
-out:
 	blk_finish_plug(&plug);
 	btrfs_free_path(path);
 	return ret < 0 ? ret : 0;
 }
 
-static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
-	u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
+static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
+					  struct btrfs_device *scrub_dev,
+					  u64 chunk_tree, u64 chunk_objectid,
+					  u64 chunk_offset, u64 length,
+					  u64 dev_offset, int is_dev_replace)
 {
 	struct btrfs_mapping_tree *map_tree =
-		&sdev->dev->dev_root->fs_info->mapping_tree;
+		&sctx->dev_root->fs_info->mapping_tree;
 	struct map_lookup *map;
 	struct extent_map *em;
 	int i;
-	int ret = -EINVAL;
+	int ret = 0;
 
 	read_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
@@ -1389,8 +2575,11 @@
 		goto out;
 
 	for (i = 0; i < map->num_stripes; ++i) {
-		if (map->stripes[i].dev == sdev->dev) {
-			ret = scrub_stripe(sdev, map, i, chunk_offset, length);
+		if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
+		    map->stripes[i].physical == dev_offset) {
+			ret = scrub_stripe(sctx, map, scrub_dev, i,
+					   chunk_offset, length,
+					   is_dev_replace);
 			if (ret)
 				goto out;
 		}
@@ -1402,11 +2591,13 @@
 }
 
 static noinline_for_stack
-int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
+int scrub_enumerate_chunks(struct scrub_ctx *sctx,
+			   struct btrfs_device *scrub_dev, u64 start, u64 end,
+			   int is_dev_replace)
 {
 	struct btrfs_dev_extent *dev_extent = NULL;
 	struct btrfs_path *path;
-	struct btrfs_root *root = sdev->dev->dev_root;
+	struct btrfs_root *root = sctx->dev_root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 length;
 	u64 chunk_tree;
@@ -1418,6 +2609,7 @@
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_block_group_cache *cache;
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -1427,11 +2619,10 @@
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
 
-	key.objectid = sdev->dev->devid;
+	key.objectid = scrub_dev->devid;
 	key.offset = 0ull;
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
-
 	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
@@ -1450,7 +2641,7 @@
 
 		btrfs_item_key_to_cpu(l, &found_key, slot);
 
-		if (found_key.objectid != sdev->dev->devid)
+		if (found_key.objectid != scrub_dev->devid)
 			break;
 
 		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
@@ -1484,11 +2675,62 @@
 			ret = -ENOENT;
 			break;
 		}
-		ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
-				  chunk_offset, length);
+		dev_replace->cursor_right = found_key.offset + length;
+		dev_replace->cursor_left = found_key.offset;
+		dev_replace->item_needs_writeback = 1;
+		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
+				  chunk_offset, length, found_key.offset,
+				  is_dev_replace);
+
+		/*
+		 * flush, submit all pending read and write bios, afterwards
+		 * wait for them.
+		 * Note that in the dev replace case, a read request causes
+		 * write requests that are submitted in the read completion
+		 * worker. Therefore in the current situation, it is required
+		 * that all write requests are flushed, so that all read and
+		 * write requests are really completed when bios_in_flight
+		 * changes to 0.
+		 */
+		atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
+		scrub_submit(sctx);
+		mutex_lock(&sctx->wr_ctx.wr_lock);
+		scrub_wr_submit(sctx);
+		mutex_unlock(&sctx->wr_ctx.wr_lock);
+
+		wait_event(sctx->list_wait,
+			   atomic_read(&sctx->bios_in_flight) == 0);
+		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
+		atomic_inc(&fs_info->scrubs_paused);
+		wake_up(&fs_info->scrub_pause_wait);
+		wait_event(sctx->list_wait,
+			   atomic_read(&sctx->workers_pending) == 0);
+
+		mutex_lock(&fs_info->scrub_lock);
+		while (atomic_read(&fs_info->scrub_pause_req)) {
+			mutex_unlock(&fs_info->scrub_lock);
+			wait_event(fs_info->scrub_pause_wait,
+			   atomic_read(&fs_info->scrub_pause_req) == 0);
+			mutex_lock(&fs_info->scrub_lock);
+		}
+		atomic_dec(&fs_info->scrubs_paused);
+		mutex_unlock(&fs_info->scrub_lock);
+		wake_up(&fs_info->scrub_pause_wait);
+
+		dev_replace->cursor_left = dev_replace->cursor_right;
+		dev_replace->item_needs_writeback = 1;
 		btrfs_put_block_group(cache);
 		if (ret)
 			break;
+		if (is_dev_replace &&
+		    atomic64_read(&dev_replace->num_write_errors) > 0) {
+			ret = -EIO;
+			break;
+		}
+		if (sctx->stat.malloc_errors > 0) {
+			ret = -ENOMEM;
+			break;
+		}
 
 		key.offset = found_key.offset + length;
 		btrfs_release_path(path);
@@ -1503,28 +2745,32 @@
 	return ret < 0 ? ret : 0;
 }
 
-static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
+static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
+					   struct btrfs_device *scrub_dev)
 {
 	int	i;
 	u64	bytenr;
 	u64	gen;
 	int	ret;
-	struct btrfs_device *device = sdev->dev;
-	struct btrfs_root *root = device->dev_root;
+	struct btrfs_root *root = sctx->dev_root;
+
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+		return -EIO;
 
 	gen = root->fs_info->last_trans_committed;
 
 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
 		bytenr = btrfs_sb_offset(i);
-		if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
+		if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
 			break;
 
-		ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
-				 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
+		ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
+				  scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
+				  NULL, 1, bytenr);
 		if (ret)
 			return ret;
 	}
-	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
 
 	return 0;
 }
@@ -1532,19 +2778,38 @@
 /*
  * get a reference count on fs_info->scrub_workers. start worker if necessary
  */
-static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
+static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
+						int is_dev_replace)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	int ret = 0;
 
 	mutex_lock(&fs_info->scrub_lock);
 	if (fs_info->scrub_workers_refcnt == 0) {
-		btrfs_init_workers(&fs_info->scrub_workers, "scrub",
-			   fs_info->thread_pool_size, &fs_info->generic_worker);
+		if (is_dev_replace)
+			btrfs_init_workers(&fs_info->scrub_workers, "scrub", 1,
+					&fs_info->generic_worker);
+		else
+			btrfs_init_workers(&fs_info->scrub_workers, "scrub",
+					fs_info->thread_pool_size,
+					&fs_info->generic_worker);
 		fs_info->scrub_workers.idle_thresh = 4;
 		ret = btrfs_start_workers(&fs_info->scrub_workers);
 		if (ret)
 			goto out;
+		btrfs_init_workers(&fs_info->scrub_wr_completion_workers,
+				   "scrubwrc",
+				   fs_info->thread_pool_size,
+				   &fs_info->generic_worker);
+		fs_info->scrub_wr_completion_workers.idle_thresh = 2;
+		ret = btrfs_start_workers(
+				&fs_info->scrub_wr_completion_workers);
+		if (ret)
+			goto out;
+		btrfs_init_workers(&fs_info->scrub_nocow_workers, "scrubnc", 1,
+				   &fs_info->generic_worker);
+		ret = btrfs_start_workers(&fs_info->scrub_nocow_workers);
+		if (ret)
+			goto out;
 	}
 	++fs_info->scrub_workers_refcnt;
 out:
@@ -1553,106 +2818,151 @@
 	return ret;
 }
 
-static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
+static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
-
 	mutex_lock(&fs_info->scrub_lock);
-	if (--fs_info->scrub_workers_refcnt == 0)
+	if (--fs_info->scrub_workers_refcnt == 0) {
 		btrfs_stop_workers(&fs_info->scrub_workers);
+		btrfs_stop_workers(&fs_info->scrub_wr_completion_workers);
+		btrfs_stop_workers(&fs_info->scrub_nocow_workers);
+	}
 	WARN_ON(fs_info->scrub_workers_refcnt < 0);
 	mutex_unlock(&fs_info->scrub_lock);
 }
 
-
-int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
-		    struct btrfs_scrub_progress *progress, int readonly)
+int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
+		    u64 end, struct btrfs_scrub_progress *progress,
+		    int readonly, int is_dev_replace)
 {
-	struct scrub_dev *sdev;
-	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct scrub_ctx *sctx;
 	int ret;
 	struct btrfs_device *dev;
 
-	if (btrfs_fs_closing(root->fs_info))
+	if (btrfs_fs_closing(fs_info))
 		return -EINVAL;
 
 	/*
 	 * check some assumptions
 	 */
-	if (root->sectorsize != PAGE_SIZE ||
-	    root->sectorsize != root->leafsize ||
-	    root->sectorsize != root->nodesize) {
-		printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
+	if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
+		printk(KERN_ERR
+		       "btrfs_scrub: size assumption nodesize == leafsize (%d == %d) fails\n",
+		       fs_info->chunk_root->nodesize,
+		       fs_info->chunk_root->leafsize);
+		return -EINVAL;
+	}
+
+	if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
+		/*
+		 * in this case scrub is unable to calculate the checksum
+		 * the way scrub is implemented. Do not handle this
+		 * situation at all because it won't ever happen.
+		 */
+		printk(KERN_ERR
+		       "btrfs_scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails\n",
+		       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
+		return -EINVAL;
+	}
+
+	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
+		/* not supported for data w/o checksums */
+		printk(KERN_ERR
+		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n",
+		       fs_info->chunk_root->sectorsize, PAGE_SIZE);
+		return -EINVAL;
+	}
+
+	if (fs_info->chunk_root->nodesize >
+	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
+	    fs_info->chunk_root->sectorsize >
+	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
+		/*
+		 * would exhaust the array bounds of pagev member in
+		 * struct scrub_block
+		 */
+		pr_err("btrfs_scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails\n",
+		       fs_info->chunk_root->nodesize,
+		       SCRUB_MAX_PAGES_PER_BLOCK,
+		       fs_info->chunk_root->sectorsize,
+		       SCRUB_MAX_PAGES_PER_BLOCK);
 		return -EINVAL;
 	}
 
-	ret = scrub_workers_get(root);
+	ret = scrub_workers_get(fs_info, is_dev_replace);
 	if (ret)
 		return ret;
 
-	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-	dev = btrfs_find_device(root, devid, NULL, NULL);
-	if (!dev || dev->missing) {
-		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(root);
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
+	if (!dev || (dev->missing && !is_dev_replace)) {
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		scrub_workers_put(fs_info);
 		return -ENODEV;
 	}
 	mutex_lock(&fs_info->scrub_lock);
 
-	if (!dev->in_fs_metadata) {
+	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
 		mutex_unlock(&fs_info->scrub_lock);
-		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(root);
-		return -ENODEV;
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		scrub_workers_put(fs_info);
+		return -EIO;
 	}
 
-	if (dev->scrub_device) {
+	btrfs_dev_replace_lock(&fs_info->dev_replace);
+	if (dev->scrub_device ||
+	    (!is_dev_replace &&
+	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
+		btrfs_dev_replace_unlock(&fs_info->dev_replace);
 		mutex_unlock(&fs_info->scrub_lock);
-		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(root);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		scrub_workers_put(fs_info);
 		return -EINPROGRESS;
 	}
-	sdev = scrub_setup_dev(dev);
-	if (IS_ERR(sdev)) {
+	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+	sctx = scrub_setup_ctx(dev, is_dev_replace);
+	if (IS_ERR(sctx)) {
 		mutex_unlock(&fs_info->scrub_lock);
-		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
-		scrub_workers_put(root);
-		return PTR_ERR(sdev);
+		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+		scrub_workers_put(fs_info);
+		return PTR_ERR(sctx);
 	}
-	sdev->readonly = readonly;
-	dev->scrub_device = sdev;
+	sctx->readonly = readonly;
+	dev->scrub_device = sctx;
 
 	atomic_inc(&fs_info->scrubs_running);
 	mutex_unlock(&fs_info->scrub_lock);
-	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 
-	down_read(&fs_info->scrub_super_lock);
-	ret = scrub_supers(sdev);
-	up_read(&fs_info->scrub_super_lock);
+	if (!is_dev_replace) {
+		down_read(&fs_info->scrub_super_lock);
+		ret = scrub_supers(sctx, dev);
+		up_read(&fs_info->scrub_super_lock);
+	}
 
 	if (!ret)
-		ret = scrub_enumerate_chunks(sdev, start, end);
+		ret = scrub_enumerate_chunks(sctx, dev, start, end,
+					     is_dev_replace);
 
-	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
+	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
 	atomic_dec(&fs_info->scrubs_running);
 	wake_up(&fs_info->scrub_pause_wait);
 
-	wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
+	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
 
 	if (progress)
-		memcpy(progress, &sdev->stat, sizeof(*progress));
+		memcpy(progress, &sctx->stat, sizeof(*progress));
 
 	mutex_lock(&fs_info->scrub_lock);
 	dev->scrub_device = NULL;
 	mutex_unlock(&fs_info->scrub_lock);
 
-	scrub_free_dev(sdev);
-	scrub_workers_put(root);
+	scrub_free_ctx(sctx);
+	scrub_workers_put(fs_info);
 
 	return ret;
 }
 
-int btrfs_scrub_pause(struct btrfs_root *root)
+void btrfs_scrub_pause(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
@@ -1667,35 +2977,28 @@
 		mutex_lock(&fs_info->scrub_lock);
 	}
 	mutex_unlock(&fs_info->scrub_lock);
-
-	return 0;
 }
 
-int btrfs_scrub_continue(struct btrfs_root *root)
+void btrfs_scrub_continue(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	atomic_dec(&fs_info->scrub_pause_req);
 	wake_up(&fs_info->scrub_pause_wait);
-	return 0;
 }
 
-int btrfs_scrub_pause_super(struct btrfs_root *root)
+void btrfs_scrub_pause_super(struct btrfs_root *root)
 {
 	down_write(&root->fs_info->scrub_super_lock);
-	return 0;
 }
 
-int btrfs_scrub_continue_super(struct btrfs_root *root)
+void btrfs_scrub_continue_super(struct btrfs_root *root)
 {
 	up_write(&root->fs_info->scrub_super_lock);
-	return 0;
 }
 
-int btrfs_scrub_cancel(struct btrfs_root *root)
+int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
-
 	mutex_lock(&fs_info->scrub_lock);
 	if (!atomic_read(&fs_info->scrubs_running)) {
 		mutex_unlock(&fs_info->scrub_lock);
@@ -1715,18 +3018,18 @@
 	return 0;
 }
 
-int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
+int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
+			   struct btrfs_device *dev)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct scrub_dev *sdev;
+	struct scrub_ctx *sctx;
 
 	mutex_lock(&fs_info->scrub_lock);
-	sdev = dev->scrub_device;
-	if (!sdev) {
+	sctx = dev->scrub_device;
+	if (!sctx) {
 		mutex_unlock(&fs_info->scrub_lock);
 		return -ENOTCONN;
 	}
-	atomic_inc(&sdev->cancel_req);
+	atomic_inc(&sctx->cancel_req);
 	while (dev->scrub_device) {
 		mutex_unlock(&fs_info->scrub_lock);
 		wait_event(fs_info->scrub_pause_wait,
@@ -1737,41 +3040,384 @@
 
 	return 0;
 }
-int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
+
+int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
+			 struct btrfs_scrub_progress *progress)
 {
-	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_device *dev;
+	struct scrub_ctx *sctx = NULL;
+
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
+	if (dev)
+		sctx = dev->scrub_device;
+	if (sctx)
+		memcpy(progress, &sctx->stat, sizeof(*progress));
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+	return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
+}
+
+static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
+			       u64 extent_logical, u64 extent_len,
+			       u64 *extent_physical,
+			       struct btrfs_device **extent_dev,
+			       int *extent_mirror_num)
+{
+	u64 mapped_length;
+	struct btrfs_bio *bbio = NULL;
+	int ret;
+
+	mapped_length = extent_len;
+	ret = btrfs_map_block(fs_info, READ, extent_logical,
+			      &mapped_length, &bbio, 0);
+	if (ret || !bbio || mapped_length < extent_len ||
+	    !bbio->stripes[0].dev->bdev) {
+		kfree(bbio);
+		return;
+	}
+
+	*extent_physical = bbio->stripes[0].physical;
+	*extent_mirror_num = bbio->mirror_num;
+	*extent_dev = bbio->stripes[0].dev;
+	kfree(bbio);
+}
+
+static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
+			      struct scrub_wr_ctx *wr_ctx,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_device *dev,
+			      int is_dev_replace)
+{
+	WARN_ON(wr_ctx->wr_curr_bio != NULL);
+
+	mutex_init(&wr_ctx->wr_lock);
+	wr_ctx->wr_curr_bio = NULL;
+	if (!is_dev_replace)
+		return 0;
+
+	WARN_ON(!dev->bdev);
+	wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
+					 bio_get_nr_vecs(dev->bdev));
+	wr_ctx->tgtdev = dev;
+	atomic_set(&wr_ctx->flush_all_writes, 0);
+	return 0;
+}
+
+static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
+{
+	mutex_lock(&wr_ctx->wr_lock);
+	kfree(wr_ctx->wr_curr_bio);
+	wr_ctx->wr_curr_bio = NULL;
+	mutex_unlock(&wr_ctx->wr_lock);
+}
+
+static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
+			    int mirror_num, u64 physical_for_dev_replace)
+{
+	struct scrub_copy_nocow_ctx *nocow_ctx;
+	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
+
+	nocow_ctx = kzalloc(sizeof(*nocow_ctx), GFP_NOFS);
+	if (!nocow_ctx) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+
+	scrub_pending_trans_workers_inc(sctx);
+
+	nocow_ctx->sctx = sctx;
+	nocow_ctx->logical = logical;
+	nocow_ctx->len = len;
+	nocow_ctx->mirror_num = mirror_num;
+	nocow_ctx->physical_for_dev_replace = physical_for_dev_replace;
+	nocow_ctx->work.func = copy_nocow_pages_worker;
+	INIT_LIST_HEAD(&nocow_ctx->inodes);
+	btrfs_queue_worker(&fs_info->scrub_nocow_workers,
+			   &nocow_ctx->work);
+
+	return 0;
+}
+
+static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
+{
+	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
+	struct scrub_nocow_inode *nocow_inode;
+
+	nocow_inode = kzalloc(sizeof(*nocow_inode), GFP_NOFS);
+	if (!nocow_inode)
+		return -ENOMEM;
+	nocow_inode->inum = inum;
+	nocow_inode->offset = offset;
+	nocow_inode->root = root;
+	list_add_tail(&nocow_inode->list, &nocow_ctx->inodes);
+	return 0;
+}
+
+#define COPY_COMPLETE 1
+
+static void copy_nocow_pages_worker(struct btrfs_work *work)
+{
+	struct scrub_copy_nocow_ctx *nocow_ctx =
+		container_of(work, struct scrub_copy_nocow_ctx, work);
+	struct scrub_ctx *sctx = nocow_ctx->sctx;
+	u64 logical = nocow_ctx->logical;
+	u64 len = nocow_ctx->len;
+	int mirror_num = nocow_ctx->mirror_num;
+	u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
 	int ret;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_path *path;
+	struct btrfs_root *root;
+	int not_written = 0;
+
+	fs_info = sctx->dev_root->fs_info;
+	root = fs_info->extent_root;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		not_written = 1;
+		goto out;
+	}
+
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		not_written = 1;
+		goto out;
+	}
+
+	ret = iterate_inodes_from_logical(logical, fs_info, path,
+					  record_inode_for_nocow, nocow_ctx);
+	if (ret != 0 && ret != -ENOENT) {
+		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n",
+			logical, physical_for_dev_replace, len, mirror_num,
+			ret);
+		not_written = 1;
+		goto out;
+	}
+
+	btrfs_end_transaction(trans, root);
+	trans = NULL;
+	while (!list_empty(&nocow_ctx->inodes)) {
+		struct scrub_nocow_inode *entry;
+		entry = list_first_entry(&nocow_ctx->inodes,
+					 struct scrub_nocow_inode,
+					 list);
+		list_del_init(&entry->list);
+		ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
+						 entry->root, nocow_ctx);
+		kfree(entry);
+		if (ret == COPY_COMPLETE) {
+			ret = 0;
+			break;
+		} else if (ret) {
+			break;
+		}
+	}
+out:
+	while (!list_empty(&nocow_ctx->inodes)) {
+		struct scrub_nocow_inode *entry;
+		entry = list_first_entry(&nocow_ctx->inodes,
+					 struct scrub_nocow_inode,
+					 list);
+		list_del_init(&entry->list);
+		kfree(entry);
+	}
+	if (trans && !IS_ERR(trans))
+		btrfs_end_transaction(trans, root);
+	if (not_written)
+		btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
+					    num_uncorrectable_read_errors);
+
+	btrfs_free_path(path);
+	kfree(nocow_ctx);
+
+	scrub_pending_trans_workers_dec(sctx);
+}
+
+static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
+				      struct scrub_copy_nocow_ctx *nocow_ctx)
+{
+	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
+	struct btrfs_key key;
+	struct inode *inode;
+	struct page *page;
+	struct btrfs_root *local_root;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_map *em;
+	struct extent_state *cached_state = NULL;
+	struct extent_io_tree *io_tree;
+	u64 physical_for_dev_replace;
+	u64 len = nocow_ctx->len;
+	u64 lockstart = offset, lockend = offset + len - 1;
+	unsigned long index;
+	int srcu_index;
+	int ret = 0;
+	int err = 0;
+
+	key.objectid = root;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
+
+	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(local_root)) {
+		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+		return PTR_ERR(local_root);
+	}
+
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.objectid = inum;
+	key.offset = 0;
+	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
+	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	/* Avoid truncate/dio/punch hole.. */
+	mutex_lock(&inode->i_mutex);
+	inode_dio_wait(inode);
+
+	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
+	io_tree = &BTRFS_I(inode)->io_tree;
+
+	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
+	if (ordered) {
+		btrfs_put_ordered_extent(ordered);
+		goto out_unlock;
+	}
+
+	em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
+	if (IS_ERR(em)) {
+		ret = PTR_ERR(em);
+		goto out_unlock;
+	}
 
 	/*
-	 * we have to hold the device_list_mutex here so the device
-	 * does not go away in cancel_dev. FIXME: find a better solution
+	 * This extent does not actually cover the logical extent anymore,
+	 * move on to the next inode.
 	 */
-	mutex_lock(&fs_info->fs_devices->device_list_mutex);
-	dev = btrfs_find_device(root, devid, NULL, NULL);
-	if (!dev) {
-		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-		return -ENODEV;
+	if (em->block_start > nocow_ctx->logical ||
+	    em->block_start + em->block_len < nocow_ctx->logical + len) {
+		free_extent_map(em);
+		goto out_unlock;
 	}
-	ret = btrfs_scrub_cancel_dev(root, dev);
-	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+	free_extent_map(em);
+
+	while (len >= PAGE_CACHE_SIZE) {
+		index = offset >> PAGE_CACHE_SHIFT;
+again:
+		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		if (!page) {
+			pr_err("find_or_create_page() failed\n");
+			ret = -ENOMEM;
+			goto out;
+		}
+
+		if (PageUptodate(page)) {
+			if (PageDirty(page))
+				goto next_page;
+		} else {
+			ClearPageError(page);
+			err = extent_read_full_page_nolock(io_tree, page,
+							   btrfs_get_extent,
+							   nocow_ctx->mirror_num);
+			if (err) {
+				ret = err;
+				goto next_page;
+			}
+
+			lock_page(page);
+			/*
+			 * If the page has been removed from the page cache,
+			 * the data on it is meaningless: it may be stale,
+			 * and the new data may have been written into a new
+			 * page in the page cache.
+			 */
+			if (page->mapping != inode->i_mapping) {
+				unlock_page(page);
+				page_cache_release(page);
+				goto again;
+			}
+			if (!PageUptodate(page)) {
+				ret = -EIO;
+				goto next_page;
+			}
+		}
+		err = write_page_nocow(nocow_ctx->sctx,
+				       physical_for_dev_replace, page);
+		if (err)
+			ret = err;
+next_page:
+		unlock_page(page);
+		page_cache_release(page);
 
+		if (ret)
+			break;
+
+		offset += PAGE_CACHE_SIZE;
+		physical_for_dev_replace += PAGE_CACHE_SIZE;
+		len -= PAGE_CACHE_SIZE;
+	}
+	ret = COPY_COMPLETE;
+out_unlock:
+	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
+			     GFP_NOFS);
+out:
+	mutex_unlock(&inode->i_mutex);
+	iput(inode);
 	return ret;
 }
 
-int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
-			 struct btrfs_scrub_progress *progress)
+static int write_page_nocow(struct scrub_ctx *sctx,
+			    u64 physical_for_dev_replace, struct page *page)
 {
+	struct bio *bio;
 	struct btrfs_device *dev;
-	struct scrub_dev *sdev = NULL;
+	int ret;
+	DECLARE_COMPLETION_ONSTACK(compl);
 
-	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
-	dev = btrfs_find_device(root, devid, NULL, NULL);
-	if (dev)
-		sdev = dev->scrub_device;
-	if (sdev)
-		memcpy(progress, &sdev->stat, sizeof(*progress));
-	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+	dev = sctx->wr_ctx.tgtdev;
+	if (!dev)
+		return -EIO;
+	if (!dev->bdev) {
+		printk_ratelimited(KERN_WARNING
+			"btrfs: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
+		return -EIO;
+	}
+	bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
+	if (!bio) {
+		spin_lock(&sctx->stat_lock);
+		sctx->stat.malloc_errors++;
+		spin_unlock(&sctx->stat_lock);
+		return -ENOMEM;
+	}
+	bio->bi_private = &compl;
+	bio->bi_end_io = scrub_complete_bio_end_io;
+	bio->bi_size = 0;
+	bio->bi_sector = physical_for_dev_replace >> 9;
+	bio->bi_bdev = dev->bdev;
+	ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
+	if (ret != PAGE_CACHE_SIZE) {
+leave_with_eio:
+		bio_put(bio);
+		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
+		return -EIO;
+	}
+	btrfsic_submit_bio(WRITE_SYNC, bio);
+	wait_for_completion(&compl);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
+		goto leave_with_eio;
 
-	return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
+	bio_put(bio);
+	return 0;
 }
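
The scrub.c hunks above repeat one synchronization idiom: set wr_ctx.flush_all_writes, push the queued read bios with scrub_submit() and the queued write bios with scrub_wr_submit() under wr_lock, then sleep in wait_event() until bios_in_flight reaches zero. The fragment below is only a minimal userspace model of that flush-and-drain pattern, assuming nothing beyond POSIX threads; the names in_flight, bio_start, bio_end and drain are illustrative stand-ins for the kernel's atomic_t counter and wait_event(), not part of the patch.

/*
 * Userspace model of "flush everything, then wait until no bios are in
 * flight".  The kernel uses an atomic_t plus wait_event(); here the same
 * drain semantics are modeled with a mutex/condvar pair.
 */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  idle = PTHREAD_COND_INITIALIZER;
static int in_flight;		/* plays the role of sctx->bios_in_flight */

static void bio_start(void)	/* incremented when a "bio" is submitted */
{
	pthread_mutex_lock(&lock);
	in_flight++;
	pthread_mutex_unlock(&lock);
}

static void bio_end(void)	/* called from the completion side */
{
	pthread_mutex_lock(&lock);
	if (--in_flight == 0)
		pthread_cond_broadcast(&idle);
	pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
	usleep(10000);		/* pretend the write takes a while */
	bio_end();
	return NULL;
}

static void drain(void)		/* analogue of wait_event(..., in_flight == 0) */
{
	pthread_mutex_lock(&lock);
	while (in_flight)
		pthread_cond_wait(&idle, &lock);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t[4];
	int i;

	for (i = 0; i < 4; i++) {
		bio_start();
		pthread_create(&t[i], NULL, worker, NULL);
	}
	drain();		/* returns only after every "bio" completed */
	printf("all writes flushed, in_flight=%d\n", in_flight);
	for (i = 0; i < 4; i++)
		pthread_join(t[i], NULL);
	return 0;
}

Build with cc -pthread; drain() returns only after every simulated completion has run, which is the guarantee the dev-replace path needs before it advances cursor_left past the chunk it has just copied.
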
Only in b/fs/btrfs: send.c.
Only in b/fs/btrfs: send.h.
diff -ur a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c
--- a/fs/btrfs/struct-funcs.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/struct-funcs.c	2014-02-17 11:56:58.000000000 +0100
@@ -17,15 +17,27 @@
  */
 
 #include <linux/highmem.h>
+#include <asm/unaligned.h>
 
-/* this is some deeply nasty code.  ctree.h has a different
- * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef
+#include "ctree.h"
+
+static inline u8 get_unaligned_le8(const void *p)
+{
+       return *(u8 *)p;
+}
+
+static inline void put_unaligned_le8(u8 val, void *p)
+{
+       *(u8 *)p = val;
+}
+
+/*
+ * this is some deeply nasty code.
  *
  * The end result is that anyone who #includes ctree.h gets a
- * declaration for the btrfs_set_foo functions and btrfs_foo functions
- *
- * This file declares the macros and then #includes ctree.h, which results
- * in cpp creating the function here based on the template below.
+ * declaration for the btrfs_set_foo functions and btrfs_foo functions,
+ * which are wrappers of btrfs_set_token_#bits functions and
+ * btrfs_get_token_#bits functions, which are defined in this file.
  *
  * These setget functions do all the extent_buffer related mapping
  * required to efficiently read and write specific fields in the extent
@@ -33,64 +45,93 @@
  * an unsigned long offset into the extent buffer which has been
  * cast to a specific type.  This gives us all the gcc type checking.
  *
- * The extent buffer api is used to do all the kmapping and page
- * spanning work required to get extent buffers in highmem and have
- * a metadata blocksize different from the page size.
- *
- * The macro starts with a simple function prototype declaration so that
- * sparse won't complain about it being static.
+ * The extent buffer api is used to do the page spanning work required to
+ * have a metadata blocksize different from the page size.
  */
 
-#define BTRFS_SETGET_FUNCS(name, type, member, bits)			\
-u##bits btrfs_##name(struct extent_buffer *eb, type *s);		\
-void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val);	\
-u##bits btrfs_##name(struct extent_buffer *eb,				\
-				   type *s)				\
+#define DEFINE_BTRFS_SETGET_BITS(bits)					\
+u##bits btrfs_get_token_##bits(struct extent_buffer *eb, void *ptr,	\
+			       unsigned long off,			\
+			       struct btrfs_map_token *token)		\
 {									\
-	unsigned long part_offset = (unsigned long)s;			\
-	unsigned long offset = part_offset + offsetof(type, member);	\
-	type *p;							\
-	int err;						\
-	char *kaddr;						\
-	unsigned long map_start;				\
-	unsigned long map_len;					\
-	u##bits res;						\
-	err = map_private_extent_buffer(eb, offset,		\
-			sizeof(((type *)0)->member),		\
-			&kaddr, &map_start, &map_len);		\
-	if (err) {						\
-		__le##bits leres;				\
-		read_eb_member(eb, s, type, member, &leres);	\
-		return le##bits##_to_cpu(leres);		\
-	}							\
-	p = (type *)(kaddr + part_offset - map_start);		\
-	res = le##bits##_to_cpu(p->member);			\
-	return res;						\
+	unsigned long part_offset = (unsigned long)ptr;			\
+	unsigned long offset = part_offset + off;			\
+	void *p;							\
+	int err;							\
+	char *kaddr;							\
+	unsigned long map_start;					\
+	unsigned long map_len;						\
+	int size = sizeof(u##bits);					\
+	u##bits res;							\
+									\
+	if (token && token->kaddr && token->offset <= offset &&		\
+	    token->eb == eb &&						\
+	   (token->offset + PAGE_CACHE_SIZE >= offset + size)) {	\
+		kaddr = token->kaddr;					\
+		p = kaddr + part_offset - token->offset;		\
+		res = get_unaligned_le##bits(p + off);			\
+		return res;						\
+	}								\
+	err = map_private_extent_buffer(eb, offset, size,		\
+					&kaddr, &map_start, &map_len);	\
+	if (err) {							\
+		__le##bits leres;					\
+									\
+		read_extent_buffer(eb, &leres, offset, size);		\
+		return le##bits##_to_cpu(leres);			\
+	}								\
+	p = kaddr + part_offset - map_start;				\
+	res = get_unaligned_le##bits(p + off);				\
+	if (token) {							\
+		token->kaddr = kaddr;					\
+		token->offset = map_start;				\
+		token->eb = eb;						\
+	}								\
+	return res;							\
 }									\
-void btrfs_set_##name(struct extent_buffer *eb,				\
-				    type *s, u##bits val)		\
+void btrfs_set_token_##bits(struct extent_buffer *eb,			\
+			    void *ptr, unsigned long off, u##bits val,	\
+			    struct btrfs_map_token *token)		\
 {									\
-	unsigned long part_offset = (unsigned long)s;			\
-	unsigned long offset = part_offset + offsetof(type, member);	\
-	type *p;							\
-	int err;						\
-	char *kaddr;						\
-	unsigned long map_start;				\
-	unsigned long map_len;					\
-	err = map_private_extent_buffer(eb, offset,		\
-			sizeof(((type *)0)->member),		\
-			&kaddr, &map_start, &map_len);		\
-	if (err) {						\
-		__le##bits val2;				\
-		val2 = cpu_to_le##bits(val);			\
-		write_eb_member(eb, s, type, member, &val2);	\
-		return;						\
-	}							\
-	p = (type *)(kaddr + part_offset - map_start);		\
-	p->member = cpu_to_le##bits(val);			\
+	unsigned long part_offset = (unsigned long)ptr;			\
+	unsigned long offset = part_offset + off;			\
+	void *p;							\
+	int err;							\
+	char *kaddr;							\
+	unsigned long map_start;					\
+	unsigned long map_len;						\
+	int size = sizeof(u##bits);					\
+									\
+	if (token && token->kaddr && token->offset <= offset &&		\
+	    token->eb == eb &&						\
+	   (token->offset + PAGE_CACHE_SIZE >= offset + size)) {	\
+		kaddr = token->kaddr;					\
+		p = kaddr + part_offset - token->offset;		\
+		put_unaligned_le##bits(val, p + off);			\
+		return;							\
+	}								\
+	err = map_private_extent_buffer(eb, offset, size,		\
+			&kaddr, &map_start, &map_len);			\
+	if (err) {							\
+		__le##bits val2;					\
+									\
+		val2 = cpu_to_le##bits(val);				\
+		write_extent_buffer(eb, &val2, offset, size);		\
+		return;							\
+	}								\
+	p = kaddr + part_offset - map_start;				\
+	put_unaligned_le##bits(val, p + off);				\
+	if (token) {							\
+		token->kaddr = kaddr;					\
+		token->offset = map_start;				\
+		token->eb = eb;						\
+	}								\
 }
 
-#include "ctree.h"
+DEFINE_BTRFS_SETGET_BITS(8)
+DEFINE_BTRFS_SETGET_BITS(16)
+DEFINE_BTRFS_SETGET_BITS(32)
+DEFINE_BTRFS_SETGET_BITS(64)
 
 void btrfs_node_key(struct extent_buffer *eb,
 		    struct btrfs_disk_key *disk_key, int nr)
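
The rewritten struct-funcs.c above generates four flavours of btrfs_get_token_##bits()/btrfs_set_token_##bits(): each one reads or writes a little-endian integer at an arbitrary byte offset in the mapped extent buffer through get_unaligned_le##bits()/put_unaligned_le##bits(), and caches the last mapping in the optional btrfs_map_token so that repeated accesses to the same page can skip map_private_extent_buffer(). The fragment below is only a userspace sketch of the unaligned little-endian access those helpers boil down to; the two function names mirror the kernel ones, but the code is not kernel code and is not part of the patch.

/*
 * Userspace sketch of the unaligned little-endian accessors that the
 * token-based setget helpers rely on.  On-disk btrfs metadata is always
 * little-endian, so the conversion is fixed to LE regardless of the host
 * byte order.
 */
#include <stdint.h>
#include <stdio.h>

static uint32_t get_unaligned_le32(const void *p)
{
	const uint8_t *b = p;

	return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
	       (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
}

static void put_unaligned_le32(uint32_t val, void *p)
{
	uint8_t *b = p;

	b[0] = val & 0xff;
	b[1] = (val >> 8) & 0xff;
	b[2] = (val >> 16) & 0xff;
	b[3] = (val >> 24) & 0xff;
}

int main(void)
{
	unsigned char buf[16] = { 0 };
	unsigned long off = 3;		/* deliberately unaligned offset */

	put_unaligned_le32(0x12345678u, buf + off);
	printf("read back: 0x%08x\n", (unsigned)get_unaligned_le32(buf + off));
	return 0;
}

Byte-wise access keeps the sketch free of alignment traps on any host; the kernel helpers achieve the same effect with the asm/unaligned.h primitives included at the top of the file.
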
diff -ur a/fs/btrfs/super.c b/fs/btrfs/super.c
--- a/fs/btrfs/super.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/super.c	2014-02-17 11:56:58.000000000 +0100
@@ -42,30 +42,43 @@
 #include <linux/cleancache.h>
 #include <linux/mnt_namespace.h>
 #include <linux/ratelimit.h>
+#include <linux/btrfs.h>
 #include "compat.h"
 #include "delayed-inode.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ioctl.h"
 #include "print-tree.h"
 #include "xattr.h"
 #include "volumes.h"
-#include "version.h"
 #include "export.h"
 #include "compression.h"
+#include "rcu-string.h"
+#include "dev-replace.h"
+#include "free-space-cache.h"
+#include "backref.h"
+#include "tests/btrfs-tests.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/btrfs.h>
 
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+#include <linux/syno_acl.h>
+#endif
+
 static const struct super_operations btrfs_super_ops;
 static struct file_system_type btrfs_fs_type;
 
-static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
-				      char nbuf[16])
+#ifdef MY_ABC_HERE
+spinlock_t SYNOBtrfsGlobalLock;  /* lock for SYNOBtrfsGlobalBuf[]  */
+unsigned char SYNOBtrfsGlobalBuf[UNICODE_UTF8_BUFSIZE];
+static int SYNOBtrfsGlobalLockInit = 0;
+#endif
+
+static const char *btrfs_decode_error(int errno)
 {
-	char *errstr = NULL;
+	char *errstr = "unknown";
 
 	switch (errno) {
 	case -EIO:
@@ -77,33 +90,27 @@
 	case -EROFS:
 		errstr = "Readonly filesystem";
 		break;
-	default:
-		if (nbuf) {
-			if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
-				errstr = nbuf;
-		}
+	case -EEXIST:
+		errstr = "Object already exists";
+		break;
+	case -ENOSPC:
+		errstr = "No space left";
+		break;
+	case -ENOENT:
+		errstr = "No such entry";
 		break;
 	}
 
 	return errstr;
 }
 
-static void __save_error_info(struct btrfs_fs_info *fs_info)
+static void save_error_info(struct btrfs_fs_info *fs_info)
 {
 	/*
 	 * today we only save the error info into ram.  Long term we'll
 	 * also send it down to the disk
 	 */
-	fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
-}
-
-/* NOTE:
- *	We move write_super stuff at umount in order to avoid deadlock
- *	for umount hold all lock.
- */
-static void save_error_info(struct btrfs_fs_info *fs_info)
-{
-	__save_error_info(fs_info);
+	set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
 }
 
 /* btrfs handle error by forcing the filesystem readonly */
@@ -114,21 +121,31 @@
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
-	if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
 		sb->s_flags |= MS_RDONLY;
-		printk(KERN_INFO "btrfs is forced readonly\n");
+		btrfs_info(fs_info, "forced readonly");
+		/*
+		 * Note that a running device replace operation is not
+		 * canceled here although there is no way to update
+		 * the progress. It would add the risk of a deadlock,
+		 * therefore the canceling is omitted. The only penalty
+		 * is that some I/O remains active until the procedure
+		 * completes. The next time when the filesystem is
+		 * mounted writeable again, the device replace
+		 * operation continues.
+		 */
 	}
 }
 
+#ifdef CONFIG_PRINTK
 /*
  * __btrfs_std_error decodes expected errors from the caller and
  * invokes the approciate error response.
  */
 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
-		     unsigned int line, int errno)
+		       unsigned int line, int errno, const char *fmt, ...)
 {
 	struct super_block *sb = fs_info->sb;
-	char nbuf[16];
 	const char *errstr;
 
 	/*
@@ -136,25 +153,174 @@
 	 * under MS_RDONLY, then it is safe here.
 	 */
 	if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
-		return;
+  		return;
+
+	errstr = btrfs_decode_error(errno);
+	if (fmt) {
+		struct va_format vaf;
+		va_list args;
+
+		va_start(args, fmt);
+		vaf.fmt = fmt;
+		vaf.va = &args;
+
+		printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s (%pV)\n",
+			sb->s_id, function, line, errno, errstr, &vaf);
+		va_end(args);
+	} else {
+		printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: errno=%d %s\n",
+			sb->s_id, function, line, errno, errstr);
+	}
 
-	errstr = btrfs_decode_error(fs_info, errno, nbuf);
-	printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
-		sb->s_id, function, line, errstr);
+	/* Don't go through full error handling during mount */
 	save_error_info(fs_info);
+	if (sb->s_flags & MS_BORN)
+		btrfs_handle_error(fs_info);
+}
+
+static const char * const logtypes[] = {
+	"emergency",
+	"alert",
+	"critical",
+	"error",
+	"warning",
+	"notice",
+	"info",
+	"debug",
+};
+
+void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
+{
+	struct super_block *sb = fs_info->sb;
+	char lvl[4];
+	struct va_format vaf;
+	va_list args;
+	const char *type = logtypes[4];
+	int kern_level;
+
+	va_start(args, fmt);
+
+	kern_level = printk_get_level(fmt);
+	if (kern_level) {
+		size_t size = printk_skip_level(fmt) - fmt;
+		memcpy(lvl, fmt,  size);
+		lvl[size] = '\0';
+		fmt += size;
+		type = logtypes[kern_level - '0'];
+	} else
+		*lvl = '\0';
 
-	btrfs_handle_error(fs_info);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	printk("%sBTRFS %s (device %s): %pV\n", lvl, type, sb->s_id, &vaf);
+
+	va_end(args);
 }
 
-static void btrfs_put_super(struct super_block *sb)
+#else
+
+void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
+		       unsigned int line, int errno, const char *fmt, ...)
 {
-	struct btrfs_root *root = btrfs_sb(sb);
-	int ret;
+	struct super_block *sb = fs_info->sb;
+
+	/*
+	 * Special case: if the error is EROFS, and we're already
+	 * under MS_RDONLY, then it is safe here.
+	 */
+	if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
+		return;
+
+	/* Don't go through full error handling during mount */
+	if (sb->s_flags & MS_BORN) {
+		save_error_info(fs_info);
+		btrfs_handle_error(fs_info);
+	}
+}
+#endif
+
+/*
+ * We only mark the transaction aborted and then set the file system read-only.
+ * This will prevent new transactions from starting or trying to join this
+ * one.
+ *
+ * This means that error recovery at the call site is limited to freeing
+ * any local memory allocations and passing the error code up without
+ * further cleanup. The transaction should complete as it normally would
+ * in the call path but will return -EIO.
+ *
+ * We'll complete the cleanup in btrfs_end_transaction and
+ * btrfs_commit_transaction.
+ */
+void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root, const char *function,
+			       unsigned int line, int errno)
+{
+	/*
+	 * Report first abort since mount
+	 */
+	if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED,
+				&root->fs_info->fs_state)) {
+		WARN(1, KERN_DEBUG "btrfs: Transaction aborted (error %d)\n",
+				errno);
+	}
+	trans->aborted = errno;
+	/* Nothing used. The other threads that have joined this
+	 * transaction may be able to continue. */
+	if (!trans->blocks_used) {
+		const char *errstr;
+
+		errstr = btrfs_decode_error(errno);
+		btrfs_warn(root->fs_info,
+		           "%s:%d: Aborting unused transaction(%s).",
+		           function, line, errstr);
+		return;
+	}
+	ACCESS_ONCE(trans->transaction->aborted) = errno;
+	/* Wake up anybody who may be waiting on this transaction */
+	wake_up(&root->fs_info->transaction_wait);
+	wake_up(&root->fs_info->transaction_blocked_wait);
+	__btrfs_std_error(root->fs_info, function, line, errno, NULL);
+}
+/*
+ * __btrfs_panic decodes unexpected, fatal errors from the caller,
+ * issues an alert, and either panics or BUGs, depending on mount options.
+ */
+void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+		   unsigned int line, int errno, const char *fmt, ...)
+{
+	char *s_id = "<unknown>";
+	const char *errstr;
+	struct va_format vaf = { .fmt = fmt };
+	va_list args;
+
+	if (fs_info)
+		s_id = fs_info->sb->s_id;
 
-	ret = close_ctree(root);
-	sb->s_fs_info = NULL;
+	va_start(args, fmt);
+	vaf.va = &args;
+
+	errstr = btrfs_decode_error(errno);
+	if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR))
+		panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
+			s_id, function, line, &vaf, errno, errstr);
+
+	printk(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
+	       s_id, function, line, &vaf, errno, errstr);
+	va_end(args);
+	/* Caller calls BUG() */
+}
 
-	(void)ret; /* FIXME: need to fix VFS to return error? */
+static void btrfs_put_super(struct super_block *sb)
+{
+	(void)close_ctree(btrfs_sb(sb)->tree_root);
+	/* FIXME: need to fix VFS to return error? */
+	/* AV: return it _where_?  ->put_super() can be triggered by any number
+	 * of async events, up to and including delivery of SIGKILL to the
+	 * last process that kept it busy.  Or segfault in the aforementioned
+	 * process...  Whom would you report that to?
+	 */
 }
 
 enum {
@@ -164,14 +330,21 @@
 	Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
 	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
 	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
-	Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
+	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
+	Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
+	Opt_check_integrity, Opt_check_integrity_including_extent_data,
+	Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
+	Opt_commit_interval,
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+	Opt_synoacl, Opt_nosynoacl,
+#endif
+	Opt_err,
 };
 
 static match_table_t tokens = {
 	{Opt_degraded, "degraded"},
 	{Opt_subvol, "subvol=%s"},
-	{Opt_subvolid, "subvolid=%d"},
+	{Opt_subvolid, "subvolid=%s"},
 	{Opt_device, "device=%s"},
 	{Opt_nodatasum, "nodatasum"},
 	{Opt_nodatacow, "nodatacow"},
@@ -200,12 +373,24 @@
 	{Opt_inode_cache, "inode_cache"},
 	{Opt_no_space_cache, "nospace_cache"},
 	{Opt_recovery, "recovery"},
+	{Opt_skip_balance, "skip_balance"},
+	{Opt_check_integrity, "check_int"},
+	{Opt_check_integrity_including_extent_data, "check_int_data"},
+	{Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
+	{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
+	{Opt_fatal_errors, "fatal_errors=%s"},
+	{Opt_commit_interval, "commit=%d"},
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+	{Opt_synoacl, SYNO_ACL_MNT_OPT},
+	{Opt_nosynoacl, SYNO_ACL_NOT_MNT_OPT},
+#endif
 	{Opt_err, NULL},
 };
 
 /*
  * Regular mount options parser.  Everything that is needed only when
  * reading in a new superblock is parsed here.
+ * XXX JDM: This needs to be cleaned up for remount.
  */
 int btrfs_parse_options(struct btrfs_root *root, char *options)
 {
@@ -260,13 +445,22 @@
 			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
 		case Opt_nodatacow:
-			printk(KERN_INFO "btrfs: setting nodatacow\n");
+			if (!btrfs_test_opt(root, COMPRESS) ||
+				!btrfs_test_opt(root, FORCE_COMPRESS)) {
+					printk(KERN_INFO "btrfs: setting nodatacow, compression disabled\n");
+			} else {
+				printk(KERN_INFO "btrfs: setting nodatacow\n");
+			}
+			info->compress_type = BTRFS_COMPRESS_NONE;
+			btrfs_clear_opt(info->mount_opt, COMPRESS);
+			btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
 			btrfs_set_opt(info->mount_opt, NODATACOW);
 			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
 		case Opt_compress_force:
 		case Opt_compress_force_type:
 			compress_force = true;
+			/* Fallthrough */
 		case Opt_compress:
 		case Opt_compress_type:
 			if (token == Opt_compress ||
@@ -274,15 +468,27 @@
 			    strcmp(args[0].from, "zlib") == 0) {
 				compress_type = "zlib";
 				info->compress_type = BTRFS_COMPRESS_ZLIB;
+				btrfs_set_opt(info->mount_opt, COMPRESS);
+				btrfs_clear_opt(info->mount_opt, NODATACOW);
+				btrfs_clear_opt(info->mount_opt, NODATASUM);
 			} else if (strcmp(args[0].from, "lzo") == 0) {
 				compress_type = "lzo";
 				info->compress_type = BTRFS_COMPRESS_LZO;
+				btrfs_set_opt(info->mount_opt, COMPRESS);
+				btrfs_clear_opt(info->mount_opt, NODATACOW);
+				btrfs_clear_opt(info->mount_opt, NODATASUM);
+				btrfs_set_fs_incompat(info, COMPRESS_LZO);
+			} else if (strncmp(args[0].from, "no", 2) == 0) {
+				compress_type = "no";
+				info->compress_type = BTRFS_COMPRESS_NONE;
+				btrfs_clear_opt(info->mount_opt, COMPRESS);
+				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
+				compress_force = false;
 			} else {
 				ret = -EINVAL;
 				goto out;
 			}
 
-			btrfs_set_opt(info->mount_opt, COMPRESS);
 			if (compress_force) {
 				btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
 				pr_info("btrfs: force %s compression\n",
@@ -313,12 +519,14 @@
 			btrfs_set_opt(info->mount_opt, NOBARRIER);
 			break;
 		case Opt_thread_pool:
-			intarg = 0;
-			match_int(&args[0], &intarg);
-			if (intarg) {
+			ret = match_int(&args[0], &intarg);
+			if (ret) {
+				goto out;
+			} else if (intarg > 0) {
 				info->thread_pool_size = intarg;
-				printk(KERN_INFO "btrfs: thread pool %d\n",
-				       info->thread_pool_size);
+			} else {
+				ret = -EINVAL;
+				goto out;
 			}
 			break;
 		case Opt_max_inline:
@@ -333,22 +541,38 @@
 						root->sectorsize);
 				}
 				printk(KERN_INFO "btrfs: max_inline at %llu\n",
-					(unsigned long long)info->max_inline);
+					info->max_inline);
+			} else {
+				ret = -ENOMEM;
+				goto out;
 			}
 			break;
 		case Opt_alloc_start:
 			num = match_strdup(&args[0]);
 			if (num) {
+				mutex_lock(&info->chunk_mutex);
 				info->alloc_start = memparse(num, NULL);
+				mutex_unlock(&info->chunk_mutex);
 				kfree(num);
 				printk(KERN_INFO
 					"btrfs: allocations start at %llu\n",
-					(unsigned long long)info->alloc_start);
+					info->alloc_start);
+			} else {
+				ret = -ENOMEM;
+				goto out;
 			}
 			break;
 		case Opt_noacl:
 			root->fs_info->sb->s_flags &= ~MS_POSIXACL;
 			break;
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+		case Opt_synoacl:
+			btrfs_set_opt(info->mount_opt, SYNO_ACL);
+			break;
+		case Opt_nosynoacl:
+			btrfs_clear_opt(info->mount_opt, SYNO_ACL);
+			break;
+#endif
 		case Opt_notreelog:
 			printk(KERN_INFO "btrfs: disabling tree log\n");
 			btrfs_set_opt(info->mount_opt, NOTREELOG);
@@ -358,12 +582,16 @@
 			btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
 			break;
 		case Opt_ratio:
-			intarg = 0;
-			match_int(&args[0], &intarg);
-			if (intarg) {
+			ret = match_int(&args[0], &intarg);
+			if (ret) {
+				goto out;
+			} else if (intarg >= 0) {
 				info->metadata_ratio = intarg;
 				printk(KERN_INFO "btrfs: metadata ratio %d\n",
 				       info->metadata_ratio);
+			} else {
+				ret = -EINVAL;
+				goto out;
 			}
 			break;
 		case Opt_discard:
@@ -372,6 +600,9 @@
 		case Opt_space_cache:
 			btrfs_set_opt(info->mount_opt, SPACE_CACHE);
 			break;
+		case Opt_rescan_uuid_tree:
+			btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
+			break;
 		case Opt_no_space_cache:
 			printk(KERN_INFO "btrfs: disabling disk space caching\n");
 			btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
@@ -391,13 +622,86 @@
 			btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
 			break;
 		case Opt_defrag:
-			printk(KERN_INFO "btrfs: enabling auto defrag");
+			printk(KERN_INFO "btrfs: enabling auto defrag\n");
 			btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
 			break;
 		case Opt_recovery:
-			printk(KERN_INFO "btrfs: enabling auto recovery");
+			printk(KERN_INFO "btrfs: enabling auto recovery\n");
 			btrfs_set_opt(info->mount_opt, RECOVERY);
 			break;
+		case Opt_skip_balance:
+			btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+			break;
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+		case Opt_check_integrity_including_extent_data:
+			printk(KERN_INFO "btrfs: enabling check integrity"
+			       " including extent data\n");
+			btrfs_set_opt(info->mount_opt,
+				      CHECK_INTEGRITY_INCLUDING_EXTENT_DATA);
+			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+			break;
+		case Opt_check_integrity:
+			printk(KERN_INFO "btrfs: enabling check integrity\n");
+			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
+			break;
+		case Opt_check_integrity_print_mask:
+			ret = match_int(&args[0], &intarg);
+			if (ret) {
+				goto out;
+			} else if (intarg >= 0) {
+				info->check_integrity_print_mask = intarg;
+				printk(KERN_INFO "btrfs:"
+				       " check_integrity_print_mask 0x%x\n",
+				       info->check_integrity_print_mask);
+			} else {
+				ret = -EINVAL;
+				goto out;
+			}
+			break;
+#else
+		case Opt_check_integrity_including_extent_data:
+		case Opt_check_integrity:
+		case Opt_check_integrity_print_mask:
+			printk(KERN_ERR "btrfs: support for check_integrity*"
+			       " not compiled in!\n");
+			ret = -EINVAL;
+			goto out;
+#endif
+		case Opt_fatal_errors:
+			if (strcmp(args[0].from, "panic") == 0)
+				btrfs_set_opt(info->mount_opt,
+					      PANIC_ON_FATAL_ERROR);
+			else if (strcmp(args[0].from, "bug") == 0)
+				btrfs_clear_opt(info->mount_opt,
+					      PANIC_ON_FATAL_ERROR);
+			else {
+				ret = -EINVAL;
+				goto out;
+			}
+			break;
+		case Opt_commit_interval:
+			intarg = 0;
+			ret = match_int(&args[0], &intarg);
+			if (ret < 0) {
+				printk(KERN_ERR
+					"btrfs: invalid commit interval\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			if (intarg > 0) {
+				if (intarg > 300) {
+					printk(KERN_WARNING
+					    "btrfs: excessive commit interval %d\n",
+							intarg);
+				}
+				info->commit_interval = intarg;
+			} else {
+				printk(KERN_INFO
+				    "btrfs: using default commit interval %ds\n",
+				    BTRFS_DEFAULT_COMMIT_INTERVAL);
+				info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+			}
+			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);
@@ -422,12 +726,12 @@
  */
 static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		void *holder, char **subvol_name, u64 *subvol_objectid,
-		u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
+		struct btrfs_fs_devices **fs_devices)
 {
 	substring_t args[MAX_OPT_ARGS];
 	char *device_name, *opts, *orig, *p;
+	char *num = NULL;
 	int error = 0;
-	int intarg;
 
 	if (!options)
 		return 0;
@@ -451,30 +755,28 @@
 		case Opt_subvol:
 			kfree(*subvol_name);
 			*subvol_name = match_strdup(&args[0]);
+			if (!*subvol_name) {
+				error = -ENOMEM;
+				goto out;
+			}
 			break;
 		case Opt_subvolid:
-			intarg = 0;
-			error = match_int(&args[0], &intarg);
-			if (!error) {
+			num = match_strdup(&args[0]);
+			if (num) {
+				*subvol_objectid = memparse(num, NULL);
+				kfree(num);
 				/* we want the original fs_tree */
-				if (!intarg)
+				if (!*subvol_objectid)
 					*subvol_objectid =
 						BTRFS_FS_TREE_OBJECTID;
-				else
-					*subvol_objectid = intarg;
+			} else {
+				error = -EINVAL;
+				goto out;
 			}
 			break;
 		case Opt_subvolrootid:
-			intarg = 0;
-			error = match_int(&args[0], &intarg);
-			if (!error) {
-				/* we want the original fs_tree */
-				if (!intarg)
-					*subvol_rootid =
-						BTRFS_FS_TREE_OBJECTID;
-				else
-					*subvol_rootid = intarg;
-			}
+			printk(KERN_WARNING
+				"btrfs: 'subvolrootid' mount option is deprecated and has no effect\n");
 			break;
 		case Opt_device:
 			device_name = match_strdup(&args[0]);
@@ -501,7 +803,8 @@
 static struct dentry *get_default_root(struct super_block *sb,
 				       u64 subvol_objectid)
 {
-	struct btrfs_root *root = sb->s_fs_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_root *root = fs_info->tree_root;
 	struct btrfs_root *new_root;
 	struct btrfs_dir_item *di;
 	struct btrfs_path *path;
@@ -531,7 +834,7 @@
 	 * will mount by default if we haven't been given a specific subvolume
 	 * to mount.
 	 */
-	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+	dir_id = btrfs_super_root_dir(fs_info->super_copy);
 	di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
 	if (IS_ERR(di)) {
 		btrfs_free_path(path);
@@ -545,7 +848,7 @@
 		 */
 		btrfs_free_path(path);
 		dir_id = BTRFS_FIRST_FREE_OBJECTID;
-		new_root = root->fs_info->fs_root;
+		new_root = fs_info->fs_root;
 		goto setup_root;
 	}
 
@@ -553,13 +856,10 @@
 	btrfs_free_path(path);
 
 find_root:
-	new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
+	new_root = btrfs_read_fs_root_no_name(fs_info, &location);
 	if (IS_ERR(new_root))
 		return ERR_CAST(new_root);
 
-	if (btrfs_root_refs(&new_root->root_item) == 0)
-		return ERR_PTR(-ENOENT);
-
 	dir_id = btrfs_root_dirid(&new_root->root_item);
 setup_root:
 	location.objectid = dir_id;
@@ -589,7 +889,7 @@
 {
 	struct inode *inode;
 	struct dentry *root_dentry;
-	struct btrfs_root *tree_root;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
 	struct btrfs_key key;
 	int err;
 
@@ -603,19 +903,29 @@
 #ifdef CONFIG_BTRFS_FS_POSIX_ACL
 	sb->s_flags |= MS_POSIXACL;
 #endif
-
-	tree_root = open_ctree(sb, fs_devices, (char *)data);
-
-	if (IS_ERR(tree_root)) {
+	sb->s_flags |= MS_I_VERSION;
+	err = open_ctree(sb, fs_devices, (char *)data);
+	if (err) {
 		printk("btrfs: open_ctree failed\n");
-		return PTR_ERR(tree_root);
+		return err;
 	}
-	sb->s_fs_info = tree_root;
 
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+	if (btrfs_raw_test_opt(fs_info->mount_opt, SYNO_ACL)) {
+		int st = SYNOACLModuleStatusGet("synoacl_vfs");
+		if (MODULE_STATE_LIVE != st) {
+			btrfs_err(fs_info, "synoacl module has not been loaded. Unable to mount with synoacl, vfs_mod status=%d", st);
+			btrfs_clear_opt(fs_info->mount_opt, SYNO_ACL);
+		} else {
+			sb->s_flags |= MS_SYNOACL;
+			SYNOACLModuleGet("synoacl_vfs");
+		}
+	}
+#endif
 	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
+	inode = btrfs_iget(sb, &key, fs_info->fs_root, NULL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto fail_close;
@@ -632,40 +942,49 @@
 
 	save_mount_options(sb, data);
 	cleancache_init_fs(sb);
+#ifdef MY_ABC_HERE
+	if (!SYNOBtrfsGlobalLockInit) {
+		spin_lock_init(&SYNOBtrfsGlobalLock);
+		SYNOBtrfsGlobalLockInit=1;
+	}
+#endif
+	sb->s_flags |= MS_ACTIVE;
 	return 0;
 
 fail_close:
-	close_ctree(tree_root);
+	close_ctree(fs_info->tree_root);
 	return err;
 }
 
 int btrfs_sync_fs(struct super_block *sb, int wait)
 {
 	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = btrfs_sb(sb);
-	int ret;
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_root *root = fs_info->tree_root;
 
 	trace_btrfs_sync_fs(wait);
 
 	if (!wait) {
-		filemap_flush(root->fs_info->btree_inode->i_mapping);
+		filemap_flush(fs_info->btree_inode->i_mapping);
 		return 0;
 	}
 
-	btrfs_start_delalloc_inodes(root, 0);
-	btrfs_wait_ordered_extents(root, 0, 0);
+	btrfs_wait_all_ordered_extents(fs_info);
 
-	trans = btrfs_start_transaction(root, 0);
-	if (IS_ERR(trans))
+	trans = btrfs_attach_transaction_barrier(root);
+	if (IS_ERR(trans)) {
+		/* no transaction, don't bother */
+		if (PTR_ERR(trans) == -ENOENT)
+			return 0;
 		return PTR_ERR(trans);
-	ret = btrfs_commit_transaction(trans, root);
-	return ret;
+	}
+	return btrfs_commit_transaction(trans, root);
 }
 
 static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 {
-	struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
-	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_fs_info *info = btrfs_sb(vfs->mnt_sb);
+	struct btrfs_root *root = info->tree_root;
 	char *compress_type;
 
 	if (btrfs_test_opt(root, DEGRADED))
@@ -677,11 +996,9 @@
 	if (btrfs_test_opt(root, NOBARRIER))
 		seq_puts(seq, ",nobarrier");
 	if (info->max_inline != 8192 * 1024)
-		seq_printf(seq, ",max_inline=%llu",
-			   (unsigned long long)info->max_inline);
+		seq_printf(seq, ",max_inline=%llu", info->max_inline);
 	if (info->alloc_start != 0)
-		seq_printf(seq, ",alloc_start=%llu",
-			   (unsigned long long)info->alloc_start);
+		seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
 	if (info->thread_pool_size !=  min_t(unsigned long,
 					     num_online_cpus() + 2, 8))
 		seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
@@ -707,12 +1024,19 @@
 		seq_puts(seq, ",flushoncommit");
 	if (btrfs_test_opt(root, DISCARD))
 		seq_puts(seq, ",discard");
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+	if (btrfs_test_opt(root, SYNO_ACL))
+		seq_puts(seq, ","SYNO_ACL_MNT_OPT);
+#else
 	if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
 		seq_puts(seq, ",noacl");
+#endif
 	if (btrfs_test_opt(root, SPACE_CACHE))
 		seq_puts(seq, ",space_cache");
 	else
 		seq_puts(seq, ",nospace_cache");
+	if (btrfs_test_opt(root, RESCAN_UUID_TREE))
+		seq_puts(seq, ",rescan_uuid_tree");
 	if (btrfs_test_opt(root, CLEAR_CACHE))
 		seq_puts(seq, ",clear_cache");
 	if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -723,28 +1047,43 @@
 		seq_puts(seq, ",autodefrag");
 	if (btrfs_test_opt(root, INODE_MAP_CACHE))
 		seq_puts(seq, ",inode_cache");
+	if (btrfs_test_opt(root, SKIP_BALANCE))
+		seq_puts(seq, ",skip_balance");
+	if (btrfs_test_opt(root, RECOVERY))
+		seq_puts(seq, ",recovery");
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
+		seq_puts(seq, ",check_int_data");
+	else if (btrfs_test_opt(root, CHECK_INTEGRITY))
+		seq_puts(seq, ",check_int");
+	if (info->check_integrity_print_mask)
+		seq_printf(seq, ",check_int_print_mask=%d",
+				info->check_integrity_print_mask);
+#endif
+	if (info->metadata_ratio)
+		seq_printf(seq, ",metadata_ratio=%d",
+				info->metadata_ratio);
+	if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
+		seq_puts(seq, ",fatal_errors=panic");
+	if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
+		seq_printf(seq, ",commit=%d", info->commit_interval);
 	return 0;
 }
 
 static int btrfs_test_super(struct super_block *s, void *data)
 {
-	struct btrfs_root *test_root = data;
-	struct btrfs_root *root = btrfs_sb(s);
+	struct btrfs_fs_info *p = data;
+	struct btrfs_fs_info *fs_info = btrfs_sb(s);
 
-	/*
-	 * If this super block is going away, return false as it
-	 * can't match as an existing super block.
-	 */
-	if (!atomic_read(&s->s_active))
-		return 0;
-	return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
+	return fs_info->fs_devices == p->fs_devices;
 }
 
 static int btrfs_set_super(struct super_block *s, void *data)
 {
-	s->s_fs_info = data;
-
-	return set_anon_super(s, data);
+	int err = set_anon_super(s, data);
+	if (!err)
+		s->s_fs_info = data;
+	return err;
 }
 
 /*
@@ -764,63 +1103,48 @@
  */
 static char *setup_root_args(char *args)
 {
-	unsigned copied = 0;
-	unsigned len = strlen(args) + 2;
-	char *pos;
-	char *ret;
+	unsigned len = strlen(args) + 2 + 1;
+	char *src, *dst, *buf;
 
 	/*
-	 * We need the same args as before, but minus
-	 *
-	 * subvol=a
-	 *
-	 * and add
+	 * We need the same args as before, but with this substitution:
+	 * s!subvol=[^,]+!subvolid=0!
 	 *
-	 * subvolid=0
-	 *
-	 * which is a difference of 2 characters, so we allocate strlen(args) +
-	 * 2 characters.
+	 * Since the replacement string is up to 2 bytes longer than the
+	 * original, allocate strlen(args) + 2 + 1 bytes.
 	 */
-	ret = kzalloc(len * sizeof(char), GFP_NOFS);
-	if (!ret)
-		return NULL;
-	pos = strstr(args, "subvol=");
 
+	src = strstr(args, "subvol=");
 	/* This shouldn't happen, but just in case.. */
-	if (!pos) {
-		kfree(ret);
+	if (!src)
+		return NULL;
+
+	buf = dst = kmalloc(len, GFP_NOFS);
+	if (!buf)
 		return NULL;
-	}
 
 	/*
-	 * The subvol=<> arg is not at the front of the string, copy everybody
-	 * up to that into ret.
+	 * If the subvol= arg is not at the start of the string,
+	 * copy whatever precedes it into buf.
 	 */
-	if (pos != args) {
-		*pos = '\0';
-		strcpy(ret, args);
-		copied += strlen(args);
-		pos++;
+	if (src != args) {
+		*src++ = '\0';
+		strcpy(buf, args);
+		dst += strlen(args);
 	}
 
-	strncpy(ret + copied, "subvolid=0", len - copied);
-
-	/* Length of subvolid=0 */
-	copied += 10;
+	strcpy(dst, "subvolid=0");
+	dst += strlen("subvolid=0");
 
 	/*
-	 * If there is no , after the subvol= option then we know there's no
-	 * other options and we can just return.
+	 * If there is a "," after the original subvol=... string,
+	 * copy that suffix into our buffer.  Otherwise, we're done.
 	 */
-	pos = strchr(pos, ',');
-	if (!pos)
-		return ret;
+	src = strchr(src, ',');
+	if (src)
+		strcpy(dst, src);
 
-	/* Copy the rest of the arguments into our buffer */
-	strncpy(ret + copied, pos, len - copied);
-	copied += strlen(pos);
-
-	return ret;
+	return buf;
 }
 
 static struct dentry *mount_subvol(const char *subvol_name, int flags,
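
A minimal user-space sketch of the substitution performed by setup_root_args() above (hypothetical helper name, not part of the patch): "subvol=<name>" is swapped for "subvolid=0" while every surrounding mount option is preserved, using the same strlen(args) + 2 + 1 sizing argument.

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* replace the first "subvol=<name>" option with "subvolid=0" */
	static char *rewrite_subvol_arg(const char *args)
	{
		/* "subvolid=0" is at most 2 bytes longer than the shortest
		 * "subvol=X", plus 1 byte for the terminating NUL */
		size_t len = strlen(args) + 2 + 1;
		const char *src = strstr(args, "subvol=");
		char *buf, *dst;

		if (!src)
			return NULL;
		buf = dst = malloc(len);
		if (!buf)
			return NULL;
		memcpy(buf, args, src - args);	/* options before subvol= */
		dst += src - args;
		dst += sprintf(dst, "subvolid=0");
		src = strchr(src, ',');		/* options after subvol=<name> */
		strcpy(dst, src ? src : "");
		return buf;
	}

	int main(void)
	{
		char *s = rewrite_subvol_arg("noatime,subvol=home,compress=zlib");
		printf("%s\n", s);	/* noatime,subvolid=0,compress=zlib */
		free(s);
		return 0;
	}
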
@@ -870,7 +1194,6 @@
 	fmode_t mode = FMODE_READ;
 	char *subvol_name = NULL;
 	u64 subvol_objectid = 0;
-	u64 subvol_rootid = 0;
 	int error = 0;
 
 	if (!(flags & MS_RDONLY))
@@ -878,7 +1201,7 @@
 
 	error = btrfs_parse_early_options(data, mode, fs_type,
 					  &subvol_name, &subvol_objectid,
-					  &subvol_rootid, &fs_devices);
+					  &fs_devices);
 	if (error) {
 		kfree(subvol_name);
 		return ERR_PTR(error);
@@ -904,12 +1227,6 @@
 	if (!fs_info)
 		return ERR_PTR(-ENOMEM);
 
-	fs_info->tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
-	if (!fs_info->tree_root) {
-		error = -ENOMEM;
-		goto error_fs_info;
-	}
-	fs_info->tree_root->fs_info = fs_info;
 	fs_info->fs_devices = fs_devices;
 
 	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
@@ -929,43 +1246,30 @@
 	}
 
 	bdev = fs_devices->latest_bdev;
-	s = sget(fs_type, btrfs_test_super, btrfs_set_super,
-		 fs_info->tree_root);
+	s = sget(fs_type, btrfs_test_super, btrfs_set_super, fs_info);
 	if (IS_ERR(s)) {
 		error = PTR_ERR(s);
 		goto error_close_devices;
 	}
 
 	if (s->s_root) {
-		if ((flags ^ s->s_flags) & MS_RDONLY) {
-			deactivate_locked_super(s);
-			error = -EBUSY;
-			goto error_close_devices;
-		}
-
 		btrfs_close_devices(fs_devices);
 		free_fs_info(fs_info);
+		if ((flags ^ s->s_flags) & MS_RDONLY)
+			error = -EBUSY;
 	} else {
 		char b[BDEVNAME_SIZE];
 
 		s->s_flags = flags | MS_NOSEC;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
-		btrfs_sb(s)->fs_info->bdev_holder = fs_type;
+		btrfs_sb(s)->bdev_holder = fs_type;
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
-		if (error) {
-			deactivate_locked_super(s);
-			return ERR_PTR(error);
-		}
-
-		s->s_flags |= MS_ACTIVE;
 	}
 
-	root = get_default_root(s, subvol_objectid);
-	if (IS_ERR(root)) {
+	root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error);
+	if (IS_ERR(root))
 		deactivate_locked_super(s);
-		return root;
-	}
 
 	return root;
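
For readers following the error handling in this hunk: btrfs_mount() returns either a valid dentry or an errno encoded in the pointer value itself. A minimal re-implementation of the kernel's err.h helpers, for illustration only (the real ones live in include/linux/err.h):

	#define MAX_ERRNO	4095

	static inline void *ERR_PTR(long error)
	{
		return (void *)error;	/* negative errno stored in a pointer */
	}

	static inline long PTR_ERR(const void *ptr)
	{
		return (long)ptr;	/* recover the errno value */
	}

	static inline int IS_ERR(const void *ptr)
	{
		/* error values occupy the top 4095 addresses */
		return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
	}
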
 
@@ -976,41 +1280,209 @@
 	return ERR_PTR(error);
 }
 
+static void btrfs_set_max_workers(struct btrfs_workers *workers, int new_limit)
+{
+	spin_lock_irq(&workers->lock);
+	workers->max_workers = new_limit;
+	spin_unlock_irq(&workers->lock);
+}
+
+static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
+				     int new_pool_size, int old_pool_size)
+{
+	if (new_pool_size == old_pool_size)
+		return;
+
+	fs_info->thread_pool_size = new_pool_size;
+
+	printk(KERN_INFO "btrfs: resize thread pool %d -> %d\n",
+	       old_pool_size, new_pool_size);
+
+	btrfs_set_max_workers(&fs_info->generic_worker, new_pool_size);
+	btrfs_set_max_workers(&fs_info->workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->delalloc_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->submit_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->caching_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->fixup_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->endio_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->endio_meta_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->endio_meta_write_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->endio_write_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->endio_freespace_worker, new_pool_size);
+	btrfs_set_max_workers(&fs_info->delayed_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->readahead_workers, new_pool_size);
+	btrfs_set_max_workers(&fs_info->scrub_wr_completion_workers,
+			      new_pool_size);
+}
+
+static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info)
+{
+	set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
+static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,
+				       unsigned long old_opts, int flags)
+{
+	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+	     (flags & MS_RDONLY))) {
+		/* wait for any defraggers to finish */
+		wait_event(fs_info->transaction_wait,
+			   (atomic_read(&fs_info->defrag_running) == 0));
+		if (flags & MS_RDONLY)
+			sync_filesystem(fs_info->sb);
+	}
+}
+
+static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
+					 unsigned long old_opts)
+{
+	/*
+	 * We need to clean up all defraggable inodes if autodefrag has been
+	 * turned off or the fs is remounted R/O.
+	 */
+	if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) &&
+	    (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) ||
+	     (fs_info->sb->s_flags & MS_RDONLY))) {
+		btrfs_cleanup_defrag_inodes(fs_info);
+	}
+
+	clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+}
+
 static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 {
-	struct btrfs_root *root = btrfs_sb(sb);
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+	struct btrfs_root *root = fs_info->tree_root;
+	unsigned old_flags = sb->s_flags;
+	unsigned long old_opts = fs_info->mount_opt;
+	unsigned long old_compress_type = fs_info->compress_type;
+	u64 old_max_inline = fs_info->max_inline;
+	u64 old_alloc_start = fs_info->alloc_start;
+	int old_thread_pool_size = fs_info->thread_pool_size;
+	unsigned int old_metadata_ratio = fs_info->metadata_ratio;
 	int ret;
 
+	btrfs_remount_prepare(fs_info);
+
 	ret = btrfs_parse_options(root, data);
-	if (ret)
-		return -EINVAL;
+	if (ret) {
+		ret = -EINVAL;
+		goto restore;
+	}
+
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+	if ((sb->s_flags & MS_SYNOACL) && !btrfs_test_opt(root, SYNO_ACL)) {
+		sb->s_flags = sb->s_flags & ~MS_SYNOACL;
+		SYNOACLModulePut("synoacl_vfs");
+	} else if((!(sb->s_flags & MS_SYNOACL)) && btrfs_test_opt(root, SYNO_ACL)) {
+		int st = SYNOACLModuleStatusGet("synoacl_vfs");
+		if (MODULE_STATE_LIVE != st) {
+			btrfs_err(fs_info, "synoacl module has not been loaded. Unable to remount with synoacl, vfs_mod status=%d", st);
+			btrfs_clear_opt(fs_info->mount_opt, SYNO_ACL);
+		} else {
+			sb->s_flags |= MS_SYNOACL;
+			SYNOACLModuleGet("synoacl_vfs");
+		}
+	}
+#endif
+	btrfs_remount_begin(fs_info, old_opts, *flags);
+	btrfs_resize_thread_pool(fs_info,
+		fs_info->thread_pool_size, old_thread_pool_size);
 
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
-		return 0;
+		goto out;
 
 	if (*flags & MS_RDONLY) {
+		/*
+		 * this also happens on 'umount -rf' or on shutdown, when
+		 * the filesystem is busy.
+		 */
 		sb->s_flags |= MS_RDONLY;
 
-		ret =  btrfs_commit_super(root);
-		WARN_ON(ret);
+		btrfs_dev_replace_suspend_for_unmount(fs_info);
+		btrfs_scrub_cancel(fs_info);
+		btrfs_pause_balance(fs_info);
+
+		ret = btrfs_commit_super(root);
+		if (ret)
+			goto restore;
 	} else {
-		if (root->fs_info->fs_devices->rw_devices == 0)
-			return -EACCES;
+		if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
+			btrfs_err(fs_info,
+				"Remounting read-write after error is not allowed\n");
+			ret = -EINVAL;
+			goto restore;
+		}
+		if (fs_info->fs_devices->rw_devices == 0) {
+			ret = -EACCES;
+			goto restore;
+		}
 
-		if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
-			return -EINVAL;
+		if (fs_info->fs_devices->missing_devices >
+		     fs_info->num_tolerated_disk_barrier_failures &&
+		    !(*flags & MS_RDONLY)) {
+			printk(KERN_WARNING
+			       "Btrfs: too many missing devices, writeable remount is not allowed\n");
+			ret = -EACCES;
+			goto restore;
+		}
 
-		ret = btrfs_cleanup_fs_roots(root->fs_info);
-		WARN_ON(ret);
+		if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+			ret = -EINVAL;
+			goto restore;
+		}
+
+		ret = btrfs_cleanup_fs_roots(fs_info);
+		if (ret)
+			goto restore;
 
 		/* recover relocation */
 		ret = btrfs_recover_relocation(root);
-		WARN_ON(ret);
+		if (ret)
+			goto restore;
+
+		ret = btrfs_resume_balance_async(fs_info);
+		if (ret)
+			goto restore;
 
+		ret = btrfs_resume_dev_replace_async(fs_info);
+		if (ret) {
+			pr_warn("btrfs: failed to resume dev_replace\n");
+			goto restore;
+		}
+
+		if (!fs_info->uuid_root) {
+			pr_info("btrfs: creating UUID tree\n");
+			ret = btrfs_create_uuid_tree(fs_info);
+			if (ret) {
+				pr_warn("btrfs: failed to create the uuid tree "
+					"%d\n", ret);
+				goto restore;
+			}
+		}
 		sb->s_flags &= ~MS_RDONLY;
 	}
-
+out:
+	btrfs_remount_cleanup(fs_info, old_opts);
 	return 0;
+
+restore:
+	/* We've hit an error - don't reset MS_RDONLY */
+	if (sb->s_flags & MS_RDONLY)
+		old_flags |= MS_RDONLY;
+	sb->s_flags = old_flags;
+	fs_info->mount_opt = old_opts;
+	fs_info->compress_type = old_compress_type;
+	fs_info->max_inline = old_max_inline;
+	mutex_lock(&fs_info->chunk_mutex);
+	fs_info->alloc_start = old_alloc_start;
+	mutex_unlock(&fs_info->chunk_mutex);
+	btrfs_resize_thread_pool(fs_info,
+		old_thread_pool_size, fs_info->thread_pool_size);
+	fs_info->metadata_ratio = old_metadata_ratio;
+	btrfs_remount_cleanup(fs_info, old_opts);
+	return ret;
 }
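
The remount path above snapshots every tunable before parsing the new options and, on any failure, rolls all of them back through the single "restore" label. A stripped-down sketch of that idiom (illustrative names, not btrfs APIs):

	struct tunables {
		unsigned long mount_opt;
		int thread_pool_size;
	};

	/* pretend option parser: rejects non-positive pool sizes */
	static int parse_new_options(struct tunables *t, int new_pool)
	{
		if (new_pool <= 0)
			return -22;		/* -EINVAL */
		t->thread_pool_size = new_pool;
		return 0;
	}

	static int remount_like(struct tunables *t, int new_pool)
	{
		struct tunables old = *t;	/* snapshot before touching anything */
		int ret;

		ret = parse_new_options(t, new_pool);
		if (ret)
			goto restore;
		return 0;

	restore:
		*t = old;			/* one place undoes every change */
		return ret;
	}
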
 
 /* Used to sort the devices by max_avail(descending sort) */
@@ -1085,7 +1557,8 @@
 		min_stripe_size = BTRFS_STRIPE_LEN;
 
 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
-		if (!device->in_fs_metadata || !device->bdev)
+		if (!device->in_fs_metadata || !device->bdev ||
+		    device->is_tgtdev_for_dev_replace)
 			continue;
 
 		avail_space = device->total_bytes - device->bytes_used;
@@ -1169,18 +1642,18 @@
 
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
-	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
-	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
-	struct list_head *head = &root->fs_info->space_info;
+	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
+	struct btrfs_super_block *disk_super = fs_info->super_copy;
+	struct list_head *head = &fs_info->space_info;
 	struct btrfs_space_info *found;
 	u64 total_used = 0;
 	u64 total_free_data = 0;
 	int bits = dentry->d_sb->s_blocksize_bits;
-	__be32 *fsid = (__be32 *)root->fs_info->fsid;
+	__be32 *fsid = (__be32 *)fs_info->fsid;
 	int ret;
 
 	/* holding chunk_muext to avoid allocating new chunks */
-	mutex_lock(&root->fs_info->chunk_mutex);
+	mutex_lock(&fs_info->chunk_mutex);
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
 		if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
@@ -1199,14 +1672,14 @@
 	buf->f_bsize = dentry->d_sb->s_blocksize;
 	buf->f_type = BTRFS_SUPER_MAGIC;
 	buf->f_bavail = total_free_data;
-	ret = btrfs_calc_avail_data_space(root, &total_free_data);
+	ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data);
 	if (ret) {
-		mutex_unlock(&root->fs_info->chunk_mutex);
+		mutex_unlock(&fs_info->chunk_mutex);
 		return ret;
 	}
 	buf->f_bavail += total_free_data;
 	buf->f_bavail = buf->f_bavail >> bits;
-	mutex_unlock(&root->fs_info->chunk_mutex);
+	mutex_unlock(&fs_info->chunk_mutex);
 
 	/* We treat it as constant endianness (it doesn't matter _which_)
 	   because we want the fsid to come out the same whether mounted
@@ -1220,11 +1693,23 @@
 	return 0;
 }
 
+static void btrfs_kill_super(struct super_block *sb)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+	if (MS_SYNOACL & sb->s_flags) {
+		SYNOACLModulePut("synoacl_vfs");
+	}
+#endif
+	kill_anon_super(sb);
+	free_fs_info(fs_info);
+}
+
 static struct file_system_type btrfs_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "btrfs",
 	.mount		= btrfs_mount,
-	.kill_sb	= kill_anon_super,
+	.kill_sb	= btrfs_kill_super,
 	.fs_flags	= FS_REQUIRES_DEV,
 };
 
@@ -1250,6 +1735,13 @@
 		ret = btrfs_scan_one_device(vol->name, FMODE_READ,
 					    &btrfs_fs_type, &fs_devices);
 		break;
+	case BTRFS_IOC_DEVICES_READY:
+		ret = btrfs_scan_one_device(vol->name, FMODE_READ,
+					    &btrfs_fs_type, &fs_devices);
+		if (ret)
+			break;
+		ret = !(fs_devices->num_devices == fs_devices->total_devices);
+		break;
 	}
 
 	kfree(vol);
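
The new BTRFS_IOC_DEVICES_READY case above answers "has every device recorded for this filesystem been scanned yet?". Its return convention reduces to the comparison below (plain sketch with made-up counters):

	/* 0 = all devices present, 1 = still waiting for some to be scanned */
	static int devices_ready(unsigned long long num_devices,
				 unsigned long long total_devices)
	{
		return !(num_devices == total_devices);
	}
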
@@ -1258,38 +1750,103 @@
 
 static int btrfs_freeze(struct super_block *sb)
 {
-	struct btrfs_root *root = btrfs_sb(sb);
-	mutex_lock(&root->fs_info->transaction_kthread_mutex);
-	mutex_lock(&root->fs_info->cleaner_mutex);
-	return 0;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root = btrfs_sb(sb)->tree_root;
+
+	trans = btrfs_attach_transaction_barrier(root);
+	if (IS_ERR(trans)) {
+		/* no transaction, don't bother */
+		if (PTR_ERR(trans) == -ENOENT)
+			return 0;
+		return PTR_ERR(trans);
+	}
+	return btrfs_commit_transaction(trans, root);
 }
 
 static int btrfs_unfreeze(struct super_block *sb)
 {
-	struct btrfs_root *root = btrfs_sb(sb);
-	mutex_unlock(&root->fs_info->cleaner_mutex);
-	mutex_unlock(&root->fs_info->transaction_kthread_mutex);
 	return 0;
 }
 
-static void btrfs_fs_dirty_inode(struct inode *inode, int flags)
+#ifdef MY_ABC_HERE
+static int btrfs_show_devname(struct seq_file *m, struct vfsmount *vfs)
+#else
+static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
+#endif
+{
+#ifdef MY_ABC_HERE
+	struct btrfs_fs_info *fs_info = btrfs_sb(vfs->mnt_sb);
+#else
+	struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
+#endif
+	struct btrfs_fs_devices *cur_devices;
+	struct btrfs_device *dev, *first_dev = NULL;
+	struct list_head *head;
+	struct rcu_string *name;
+
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	cur_devices = fs_info->fs_devices;
+	while (cur_devices) {
+		head = &cur_devices->devices;
+		list_for_each_entry(dev, head, dev_list) {
+			if (dev->missing)
+				continue;
+			if (!first_dev || dev->devid < first_dev->devid)
+				first_dev = dev;
+		}
+		cur_devices = cur_devices->seed;
+	}
+
+	if (first_dev) {
+		rcu_read_lock();
+		name = rcu_dereference(first_dev->name);
+		seq_escape(m, name->str, " \t\n\\");
+		rcu_read_unlock();
+	} else {
+		WARN_ON(1);
+	}
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+	return 0;
+}
+
+#ifdef MY_ABC_HERE
+static int syno_btrfs_set_sb_archive_ver(struct super_block *sb, u32 archive_ver)
 {
+	struct btrfs_fs_info *fs_info = sb->s_fs_info;
+	struct btrfs_trans_handle *trans;
 	int ret;
 
-	ret = btrfs_dirty_inode(inode);
-	if (ret)
-		printk_ratelimited(KERN_ERR "btrfs: fail to dirty inode %Lu "
-				   "error %d\n", btrfs_ino(inode), ret);
+	trans = btrfs_start_transaction(fs_info->fs_root, 0);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	sb->s_archive_version = archive_ver;
+	fs_info->super_copy->archive_version = cpu_to_le32(archive_ver);
+
+	ret = btrfs_commit_transaction(trans, fs_info->fs_root);
+	WARN_ON(ret);
+	return 0;
 }
 
+static int syno_btrfs_get_sb_archive_ver(struct super_block *sb, u32 *version)
+{
+	*version = sb->s_archive_version;
+	return 0;
+}
+#endif
+
 static const struct super_operations btrfs_super_ops = {
+#ifdef MY_ABC_HERE
+	.syno_set_sb_archive_ver = syno_btrfs_set_sb_archive_ver,
+	.syno_get_sb_archive_ver = syno_btrfs_get_sb_archive_ver,
+#endif
 	.drop_inode	= btrfs_drop_inode,
 	.evict_inode	= btrfs_evict_inode,
 	.put_super	= btrfs_put_super,
 	.sync_fs	= btrfs_sync_fs,
 	.show_options	= btrfs_show_options,
+	.show_devname	= btrfs_show_devname,
 	.write_inode	= btrfs_write_inode,
-	.dirty_inode	= btrfs_fs_dirty_inode,
 	.alloc_inode	= btrfs_alloc_inode,
 	.destroy_inode	= btrfs_destroy_inode,
 	.statfs		= btrfs_statfs,
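
btrfs_show_devname() above reports the member device with the smallest devid so that /proc/mounts shows a stable name for a multi-device filesystem. A simplified version of the selection loop (no seed devices, RCU, or locking):

	struct member_dev {
		unsigned long long devid;
		int missing;
	};

	static const struct member_dev *pick_devname(const struct member_dev *devs,
						     int n)
	{
		const struct member_dev *first = NULL;
		int i;

		for (i = 0; i < n; i++) {
			if (devs[i].missing)
				continue;	/* cannot name a missing device */
			if (!first || devs[i].devid < first->devid)
				first = &devs[i];
		}
		return first;	/* NULL only if every device is missing */
	}
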
@@ -1322,7 +1879,27 @@
 static void btrfs_interface_exit(void)
 {
 	if (misc_deregister(&btrfs_misc) < 0)
-		printk(KERN_INFO "misc_deregister failed for control device");
+		printk(KERN_INFO "btrfs: misc_deregister failed for control device\n");
+}
+
+static void btrfs_print_info(void)
+{
+	printk(KERN_INFO "Btrfs loaded"
+#ifdef CONFIG_BTRFS_DEBUG
+			", debug=on"
+#endif
+#ifdef CONFIG_BTRFS_ASSERT
+			", assert=on"
+#endif
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+			", integrity-checker=on"
+#endif
+			"\n");
+}
+
+static int btrfs_run_sanity_tests(void)
+{
+	return btrfs_test_free_space_cache();
 }
 
 static int __init init_btrfs_fs(void)
@@ -1333,9 +1910,7 @@
 	if (err)
 		return err;
 
-	err = btrfs_init_compress();
-	if (err)
-		goto free_sysfs;
+	btrfs_init_compress();
 
 	err = btrfs_init_cachep();
 	if (err)
@@ -1349,25 +1924,56 @@
 	if (err)
 		goto free_extent_io;
 
-	err = btrfs_delayed_inode_init();
+	err = ordered_data_init();
 	if (err)
 		goto free_extent_map;
 
-	err = btrfs_interface_init();
+	err = btrfs_delayed_inode_init();
+	if (err)
+		goto free_ordered_data;
+
+	err = btrfs_auto_defrag_init();
 	if (err)
 		goto free_delayed_inode;
 
+	err = btrfs_delayed_ref_init();
+	if (err)
+		goto free_auto_defrag;
+
+	err = btrfs_prelim_ref_init();
+	if (err)
+		goto free_delayed_ref;
+
+	err = btrfs_interface_init();
+	if (err)
+		goto free_prelim_ref;
+
+	btrfs_init_lockdep();
+
+	btrfs_print_info();
+
+	err = btrfs_run_sanity_tests();
+	if (err)
+		goto unregister_ioctl;
+
 	err = register_filesystem(&btrfs_fs_type);
 	if (err)
 		goto unregister_ioctl;
 
-	printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION);
 	return 0;
 
 unregister_ioctl:
 	btrfs_interface_exit();
+free_prelim_ref:
+	btrfs_prelim_ref_exit();
+free_delayed_ref:
+	btrfs_delayed_ref_exit();
+free_auto_defrag:
+	btrfs_auto_defrag_exit();
 free_delayed_inode:
 	btrfs_delayed_inode_exit();
+free_ordered_data:
+	ordered_data_exit();
 free_extent_map:
 	extent_map_exit();
 free_extent_io:
@@ -1376,7 +1982,6 @@
 	btrfs_destroy_cachep();
 free_compress:
 	btrfs_exit_compress();
-free_sysfs:
 	btrfs_exit_sysfs();
 	return err;
 }
@@ -1384,7 +1989,11 @@
 static void __exit exit_btrfs_fs(void)
 {
 	btrfs_destroy_cachep();
+	btrfs_delayed_ref_exit();
+	btrfs_auto_defrag_exit();
 	btrfs_delayed_inode_exit();
+	btrfs_prelim_ref_exit();
+	ordered_data_exit();
 	extent_map_exit();
 	extent_io_exit();
 	btrfs_interface_exit();
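
The init_btrfs_fs() changes above extend the usual module-init unwind ladder: each failing step jumps to the label that tears down only what was already initialized, in reverse order. A generic sketch of the pattern (illustrative names, not btrfs functions):

	static int cache_a_init(void) { return 0; }
	static void cache_a_exit(void) { }
	static int cache_b_init(void) { return 0; }
	static void cache_b_exit(void) { }
	static int iface_init(void) { return -1; }	/* pretend this step fails */

	static int init_everything(void)
	{
		int err;

		err = cache_a_init();
		if (err)
			return err;

		err = cache_b_init();
		if (err)
			goto free_a;

		err = iface_init();
		if (err)
			goto free_b;

		return 0;

	free_b:
		cache_b_exit();		/* undo b, then fall through to undo a */
	free_a:
		cache_a_exit();
		return err;
	}
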
Only in b/fs/btrfs: syno_acl.c.
Only in b/fs/btrfs: syno_acl.h.
diff -ur a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
--- a/fs/btrfs/sysfs.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/sysfs.c	2014-02-17 11:56:58.000000000 +0100
@@ -21,7 +21,6 @@
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
-#include <linux/module.h>
 #include <linux/kobject.h>
 
 #include "ctree.h"
Only in b/fs/btrfs: tests.
diff -ur a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
--- a/fs/btrfs/transaction.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/transaction.c	2014-02-17 11:56:58.000000000 +0100
@@ -22,21 +22,55 @@
 #include <linux/writeback.h>
 #include <linux/pagemap.h>
 #include <linux/blkdev.h>
+#include <linux/uuid.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
 #include "locking.h"
 #include "tree-log.h"
 #include "inode-map.h"
+#include "volumes.h"
+#include "dev-replace.h"
 
 #define BTRFS_ROOT_TRANS_TAG 0
 
-static noinline void put_transaction(struct btrfs_transaction *transaction)
+static unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
+	[TRANS_STATE_RUNNING]		= 0U,
+	[TRANS_STATE_BLOCKED]		= (__TRANS_USERSPACE |
+					   __TRANS_START),
+	[TRANS_STATE_COMMIT_START]	= (__TRANS_USERSPACE |
+					   __TRANS_START |
+					   __TRANS_ATTACH),
+	[TRANS_STATE_COMMIT_DOING]	= (__TRANS_USERSPACE |
+					   __TRANS_START |
+					   __TRANS_ATTACH |
+					   __TRANS_JOIN),
+	[TRANS_STATE_UNBLOCKED]		= (__TRANS_USERSPACE |
+					   __TRANS_START |
+					   __TRANS_ATTACH |
+					   __TRANS_JOIN |
+					   __TRANS_JOIN_NOLOCK),
+	[TRANS_STATE_COMPLETED]		= (__TRANS_USERSPACE |
+					   __TRANS_START |
+					   __TRANS_ATTACH |
+					   __TRANS_JOIN |
+					   __TRANS_JOIN_NOLOCK),
+};
+
+static void put_transaction(struct btrfs_transaction *transaction)
 {
 	WARN_ON(atomic_read(&transaction->use_count) == 0);
 	if (atomic_dec_and_test(&transaction->use_count)) {
 		BUG_ON(!list_empty(&transaction->list));
-		memset(transaction, 0, sizeof(*transaction));
+		WARN_ON(transaction->delayed_refs.root.rb_node);
+		while (!list_empty(&transaction->pending_chunks)) {
+			struct extent_map *em;
+
+			em = list_first_entry(&transaction->pending_chunks,
+					      struct extent_map, list);
+			list_del_init(&em->list);
+			free_extent_map(em);
+		}
 		kmem_cache_free(btrfs_transaction_cachep, transaction);
 	}
 }
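
The btrfs_blocked_trans_types[] table introduced above maps each transaction state to the set of join types that may no longer enter once that state is reached; join_transaction() (further down in this diff) checks the caller's type against it and returns -EBUSY. A reduced sketch of the lookup:

	enum sketch_state { ST_RUNNING, ST_BLOCKED, ST_COMMIT_START, ST_NR };

	#define T_START		(1U << 0)
	#define T_JOIN		(1U << 1)
	#define T_ATTACH	(1U << 2)

	static const unsigned int blocked_types[ST_NR] = {
		[ST_RUNNING]		= 0U,			/* everyone may join */
		[ST_BLOCKED]		= T_START,
		[ST_COMMIT_START]	= T_START | T_ATTACH,
	};

	/* non-zero means the caller has to wait (mirrors the -EBUSY case) */
	static int must_wait(enum sketch_state s, unsigned int type)
	{
		return blocked_types[s] & type;
	}
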
@@ -47,59 +81,106 @@
 	root->commit_root = btrfs_root_node(root);
 }
 
+static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
+					 unsigned int type)
+{
+	if (type & TRANS_EXTWRITERS)
+		atomic_inc(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
+					 unsigned int type)
+{
+	if (type & TRANS_EXTWRITERS)
+		atomic_dec(&trans->num_extwriters);
+}
+
+static inline void extwriter_counter_init(struct btrfs_transaction *trans,
+					  unsigned int type)
+{
+	atomic_set(&trans->num_extwriters, ((type & TRANS_EXTWRITERS) ? 1 : 0));
+}
+
+static inline int extwriter_counter_read(struct btrfs_transaction *trans)
+{
+	return atomic_read(&trans->num_extwriters);
+}
+
 /*
  * either allocate a new transaction or hop into the existing one
  */
-static noinline int join_transaction(struct btrfs_root *root, int nofail)
+static noinline int join_transaction(struct btrfs_root *root, unsigned int type)
 {
 	struct btrfs_transaction *cur_trans;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 
-	spin_lock(&root->fs_info->trans_lock);
+	spin_lock(&fs_info->trans_lock);
 loop:
-	if (root->fs_info->trans_no_join) {
-		if (!nofail) {
-			spin_unlock(&root->fs_info->trans_lock);
-			return -EBUSY;
-		}
+	/* The file system has been taken offline. No new transactions. */
+	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+		spin_unlock(&fs_info->trans_lock);
+		return -EROFS;
 	}
 
-	cur_trans = root->fs_info->running_transaction;
+	cur_trans = fs_info->running_transaction;
 	if (cur_trans) {
+		if (cur_trans->aborted) {
+			spin_unlock(&fs_info->trans_lock);
+			return cur_trans->aborted;
+		}
+		if (btrfs_blocked_trans_types[cur_trans->state] & type) {
+			spin_unlock(&fs_info->trans_lock);
+			return -EBUSY;
+		}
 		atomic_inc(&cur_trans->use_count);
 		atomic_inc(&cur_trans->num_writers);
-		cur_trans->num_joined++;
-		spin_unlock(&root->fs_info->trans_lock);
+		extwriter_counter_inc(cur_trans, type);
+		spin_unlock(&fs_info->trans_lock);
 		return 0;
 	}
-	spin_unlock(&root->fs_info->trans_lock);
+	spin_unlock(&fs_info->trans_lock);
+
+	/*
+	 * If we are ATTACH, we just want to catch the current transaction,
+	 * and commit it. If there is no transaction, just return ENOENT.
+	 */
+	if (type == TRANS_ATTACH)
+		return -ENOENT;
+
+	/*
+	 * JOIN_NOLOCK only happens during the transaction commit, so
+	 * it is impossible that ->running_transaction is NULL
+	 */
+	BUG_ON(type == TRANS_JOIN_NOLOCK);
 
 	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
 	if (!cur_trans)
 		return -ENOMEM;
 
-	spin_lock(&root->fs_info->trans_lock);
-	if (root->fs_info->running_transaction) {
+	spin_lock(&fs_info->trans_lock);
+	if (fs_info->running_transaction) {
 		/*
 		 * someone started a transaction after we unlocked.  Make sure
-		 * to redo the trans_no_join checks above
+		 * to redo the checks above
 		 */
 		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
-		cur_trans = root->fs_info->running_transaction;
 		goto loop;
+	} else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
+		spin_unlock(&fs_info->trans_lock);
+		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
+		return -EROFS;
 	}
 
 	atomic_set(&cur_trans->num_writers, 1);
-	cur_trans->num_joined = 0;
+	extwriter_counter_init(cur_trans, type);
 	init_waitqueue_head(&cur_trans->writer_wait);
 	init_waitqueue_head(&cur_trans->commit_wait);
-	cur_trans->in_commit = 0;
-	cur_trans->blocked = 0;
+	cur_trans->state = TRANS_STATE_RUNNING;
 	/*
 	 * One for this trans handle, one so it will live on until we
 	 * commit the transaction.
 	 */
 	atomic_set(&cur_trans->use_count, 2);
-	cur_trans->commit_done = 0;
 	cur_trans->start_time = get_seconds();
 
 	cur_trans->delayed_refs.root = RB_ROOT;
@@ -108,17 +189,36 @@
 	cur_trans->delayed_refs.num_heads = 0;
 	cur_trans->delayed_refs.flushing = 0;
 	cur_trans->delayed_refs.run_delayed_start = 0;
-	spin_lock_init(&cur_trans->commit_lock);
+
+	/*
+	 * although the tree mod log is per file system and not per transaction,
+	 * the log must never go across transaction boundaries.
+	 */
+	smp_mb();
+	if (!list_empty(&fs_info->tree_mod_seq_list))
+		WARN(1, KERN_ERR "btrfs: tree_mod_seq_list not empty when "
+			"creating a fresh transaction\n");
+	if (!RB_EMPTY_ROOT(&fs_info->tree_mod_log))
+		WARN(1, KERN_ERR "btrfs: tree_mod_log rb tree not empty when "
+			"creating a fresh transaction\n");
+	atomic64_set(&fs_info->tree_mod_seq, 0);
+
 	spin_lock_init(&cur_trans->delayed_refs.lock);
+	atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0);
+	atomic_set(&cur_trans->delayed_refs.ref_seq, 0);
+	init_waitqueue_head(&cur_trans->delayed_refs.wait);
 
 	INIT_LIST_HEAD(&cur_trans->pending_snapshots);
-	list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+	INIT_LIST_HEAD(&cur_trans->ordered_operations);
+	INIT_LIST_HEAD(&cur_trans->pending_chunks);
+	list_add_tail(&cur_trans->list, &fs_info->trans_list);
 	extent_io_tree_init(&cur_trans->dirty_pages,
-			     root->fs_info->btree_inode->i_mapping);
-	root->fs_info->generation++;
-	cur_trans->transid = root->fs_info->generation;
-	root->fs_info->running_transaction = cur_trans;
-	spin_unlock(&root->fs_info->trans_lock);
+			     fs_info->btree_inode->i_mapping);
+	fs_info->generation++;
+	cur_trans->transid = fs_info->generation;
+	fs_info->running_transaction = cur_trans;
+	cur_trans->aborted = 0;
+	spin_unlock(&fs_info->trans_lock);
 
 	return 0;
 }
@@ -208,6 +308,13 @@
 	return 0;
 }
 
+static inline int is_transaction_blocked(struct btrfs_transaction *trans)
+{
+	return (trans->state >= TRANS_STATE_BLOCKED &&
+		trans->state < TRANS_STATE_UNBLOCKED &&
+		!trans->aborted);
+}
+
 /* wait for commit against the current transaction to become unblocked
  * when this is done, it is safe to start a new transaction, but the current
  * transaction might not be fully on disk.
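
is_transaction_blocked() above relies on the transaction states being declared in ascending commit order, so that a simple range check covers the whole "blocked" window. The assumed ordering, matching the state names used in this patch (the actual enum lives in transaction.h):

	enum sketch_trans_state {
		S_RUNNING,
		S_BLOCKED,
		S_COMMIT_START,
		S_COMMIT_DOING,
		S_UNBLOCKED,
		S_COMPLETED,
	};

	static int in_blocked_window(enum sketch_trans_state s, int aborted)
	{
		/* anywhere from BLOCKED up to (but not including) UNBLOCKED */
		return s >= S_BLOCKED && s < S_UNBLOCKED && !aborted;
	}
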
@@ -218,25 +325,19 @@
 
 	spin_lock(&root->fs_info->trans_lock);
 	cur_trans = root->fs_info->running_transaction;
-	if (cur_trans && cur_trans->blocked) {
+	if (cur_trans && is_transaction_blocked(cur_trans)) {
 		atomic_inc(&cur_trans->use_count);
 		spin_unlock(&root->fs_info->trans_lock);
 
 		wait_event(root->fs_info->transaction_wait,
-			   !cur_trans->blocked);
+			   cur_trans->state >= TRANS_STATE_UNBLOCKED ||
+			   cur_trans->aborted);
 		put_transaction(cur_trans);
 	} else {
 		spin_unlock(&root->fs_info->trans_lock);
 	}
 }
 
-enum btrfs_trans_type {
-	TRANS_START,
-	TRANS_JOIN,
-	TRANS_USERSPACE,
-	TRANS_JOIN_NOLOCK,
-};
-
 static int may_wait_transaction(struct btrfs_root *root, int type)
 {
 	if (root->fs_info->log_root_recovering)
@@ -252,21 +353,24 @@
 	return 0;
 }
 
-static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
-						    u64 num_items, int type)
+static struct btrfs_trans_handle *
+start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type,
+		  enum btrfs_reserve_flush_enum flush)
 {
 	struct btrfs_trans_handle *h;
 	struct btrfs_transaction *cur_trans;
 	u64 num_bytes = 0;
 	int ret;
+	u64 qgroup_reserved = 0;
 
-	if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
+	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
 		return ERR_PTR(-EROFS);
 
 	if (current->journal_info) {
-		WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
+		WARN_ON(type & TRANS_EXTWRITERS);
 		h = current->journal_info;
 		h->use_count++;
+		WARN_ON(h->use_count > 2);
 		h->orig_rsv = h->block_rsv;
 		h->block_rsv = NULL;
 		goto got_it;
@@ -277,30 +381,57 @@
 	 * the appropriate flushing if need be.
 	 */
 	if (num_items > 0 && root != root->fs_info->chunk_root) {
+		if (root->fs_info->quota_enabled &&
+		    is_fstree(root->root_key.objectid)) {
+			qgroup_reserved = num_items * root->leafsize;
+			ret = btrfs_qgroup_reserve(root, qgroup_reserved);
+			if (ret)
+				return ERR_PTR(ret);
+		}
+
 		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
 		ret = btrfs_block_rsv_add(root,
 					  &root->fs_info->trans_block_rsv,
-					  num_bytes);
+					  num_bytes, flush);
 		if (ret)
-			return ERR_PTR(ret);
+			goto reserve_fail;
 	}
 again:
 	h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
-	if (!h)
-		return ERR_PTR(-ENOMEM);
+	if (!h) {
+		ret = -ENOMEM;
+		goto alloc_fail;
+	}
+
+	/*
+	 * If we are JOIN_NOLOCK we're already committing a transaction and
+	 * waiting on this guy, so we don't need to do the sb_start_intwrite
+	 * because we're already holding a ref.  We need this because we could
+	 * have raced in and done an fsync() on a file which can kick a commit
+	 * and then we deadlock with somebody doing a freeze.
+	 *
+	 * If we are ATTACH, it means we just want to catch the current
+	 * transaction and commit it, so we needn't do sb_start_intwrite(). 
+	 */
+	if (type & __TRANS_FREEZABLE)
+		sb_start_intwrite(root->fs_info->sb);
 
 	if (may_wait_transaction(root, type))
 		wait_current_trans(root);
 
 	do {
-		ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
-		if (ret == -EBUSY)
+		ret = join_transaction(root, type);
+		if (ret == -EBUSY) {
 			wait_current_trans(root);
+			if (unlikely(type == TRANS_ATTACH))
+				ret = -ENOENT;
+		}
 	} while (ret == -EBUSY);
 
 	if (ret < 0) {
-		kmem_cache_free(btrfs_trans_handle_cachep, h);
-		return ERR_PTR(ret);
+		/* We must get the transaction if we are JOIN_NOLOCK. */
+		BUG_ON(type == TRANS_JOIN_NOLOCK);
+		goto join_fail;
 	}
 
 	cur_trans = root->fs_info->running_transaction;
@@ -309,21 +440,34 @@
 	h->transaction = cur_trans;
 	h->blocks_used = 0;
 	h->bytes_reserved = 0;
+	h->root = root;
 	h->delayed_ref_updates = 0;
 	h->use_count = 1;
+	h->adding_csums = 0;
 	h->block_rsv = NULL;
 	h->orig_rsv = NULL;
+	h->aborted = 0;
+	h->qgroup_reserved = 0;
+	h->delayed_ref_elem.seq = 0;
+	h->type = type;
+	h->allocating_chunk = false;
+	INIT_LIST_HEAD(&h->qgroup_ref_list);
+	INIT_LIST_HEAD(&h->new_bgs);
 
 	smp_mb();
-	if (cur_trans->blocked && may_wait_transaction(root, type)) {
+	if (cur_trans->state >= TRANS_STATE_BLOCKED &&
+	    may_wait_transaction(root, type)) {
 		btrfs_commit_transaction(h, root);
 		goto again;
 	}
 
 	if (num_bytes) {
+		trace_btrfs_space_reservation(root->fs_info, "transaction",
+					      h->transid, num_bytes, 1);
 		h->block_rsv = &root->fs_info->trans_block_rsv;
 		h->bytes_reserved = num_bytes;
 	}
+	h->qgroup_reserved = qgroup_reserved;
 
 got_it:
 	btrfs_record_root_in_trans(h, root);
@@ -331,67 +475,129 @@
 	if (!current->journal_info && type != TRANS_USERSPACE)
 		current->journal_info = h;
 	return h;
+
+join_fail:
+	if (type & __TRANS_FREEZABLE)
+		sb_end_intwrite(root->fs_info->sb);
+	kmem_cache_free(btrfs_trans_handle_cachep, h);
+alloc_fail:
+	if (num_bytes)
+		btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv,
+					num_bytes);
+reserve_fail:
+	if (qgroup_reserved)
+		btrfs_qgroup_free(root, qgroup_reserved);
+	return ERR_PTR(ret);
 }
 
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_items)
 {
-	return start_transaction(root, num_items, TRANS_START);
+	return start_transaction(root, num_items, TRANS_START,
+				 BTRFS_RESERVE_FLUSH_ALL);
 }
+
+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
+					struct btrfs_root *root, int num_items)
+{
+	return start_transaction(root, num_items, TRANS_START,
+				 BTRFS_RESERVE_FLUSH_LIMIT);
+}
+
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_JOIN);
+	return start_transaction(root, 0, TRANS_JOIN, 0);
 }
 
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
+	return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
 }
 
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_USERSPACE);
+	return start_transaction(root, 0, TRANS_USERSPACE, 0);
+}
+
+/*
+ * btrfs_attach_transaction() - catch the running transaction
+ *
+ * It is used when we want to commit the current transaction, but
+ * don't want to start a new one.
+ *
+ * Note: If this function returns -ENOENT, it just means there is no
+ * running transaction. But it is possible that the inactive transaction
+ * is still in memory, not fully on disk. If you need to make sure there
+ * is no inactive transaction in the fs when -ENOENT is returned, you
+ * should invoke
+ *     btrfs_attach_transaction_barrier()
+ */
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
+{
+	return start_transaction(root, 0, TRANS_ATTACH, 0);
+}
+
+/*
+ * btrfs_attach_transaction_barrier() - catch the running transaction
+ *
+ * It is similar to the function above; the difference is that this one
+ * will also wait for all inactive transactions until they fully
+ * complete.
+ */
+struct btrfs_trans_handle *
+btrfs_attach_transaction_barrier(struct btrfs_root *root)
+{
+	struct btrfs_trans_handle *trans;
+
+	trans = start_transaction(root, 0, TRANS_ATTACH, 0);
+	if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
+		btrfs_wait_for_commit(root, 0);
+
+	return trans;
 }
 
 /* wait for a transaction commit to be fully complete */
 static noinline void wait_for_commit(struct btrfs_root *root,
 				    struct btrfs_transaction *commit)
 {
-	wait_event(commit->commit_wait, commit->commit_done);
+	wait_event(commit->commit_wait, commit->state == TRANS_STATE_COMPLETED);
 }
 
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
 {
 	struct btrfs_transaction *cur_trans = NULL, *t;
-	int ret;
+	int ret = 0;
 
-	ret = 0;
 	if (transid) {
 		if (transid <= root->fs_info->last_trans_committed)
 			goto out;
 
+		ret = -EINVAL;
 		/* find specified transaction */
 		spin_lock(&root->fs_info->trans_lock);
 		list_for_each_entry(t, &root->fs_info->trans_list, list) {
 			if (t->transid == transid) {
 				cur_trans = t;
 				atomic_inc(&cur_trans->use_count);
+				ret = 0;
 				break;
 			}
-			if (t->transid > transid)
+			if (t->transid > transid) {
+				ret = 0;
 				break;
+			}
 		}
 		spin_unlock(&root->fs_info->trans_lock);
-		ret = -EINVAL;
+		/* The specified transaction doesn't exist */
 		if (!cur_trans)
-			goto out;  /* bad transid */
+			goto out;
 	} else {
 		/* find newest transaction that is committing | committed */
 		spin_lock(&root->fs_info->trans_lock);
 		list_for_each_entry_reverse(t, &root->fs_info->trans_list,
 					    list) {
-			if (t->in_commit) {
-				if (t->commit_done)
+			if (t->state >= TRANS_STATE_COMMIT_START) {
+				if (t->state == TRANS_STATE_COMPLETED)
 					break;
 				cur_trans = t;
 				atomic_inc(&cur_trans->use_count);
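
btrfs_attach_transaction_barrier() above is intended for callers that only want to push out whatever is already in flight; -ENOENT is the "nothing to commit" case. The caller pattern, as used by btrfs_sync_fs() and btrfs_freeze() earlier in this patch (kernel-context fragment, shown here only to summarize the convention):

	static int commit_running_transaction(struct btrfs_root *root)
	{
		struct btrfs_trans_handle *trans;

		trans = btrfs_attach_transaction_barrier(root);
		if (IS_ERR(trans)) {
			if (PTR_ERR(trans) == -ENOENT)
				return 0;	/* no transaction, nothing to do */
			return PTR_ERR(trans);
		}
		return btrfs_commit_transaction(trans, root);
	}
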
@@ -404,9 +610,7 @@
 	}
 
 	wait_for_commit(root, cur_trans);
-
 	put_transaction(cur_trans);
-	ret = 0;
 out:
 	return ret;
 }
@@ -420,80 +624,93 @@
 static int should_end_transaction(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root)
 {
-	int ret;
+	if (root->fs_info->global_block_rsv.space_info->full &&
+	    btrfs_should_throttle_delayed_refs(trans, root))
+		return 1;
 
-	ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
-	return ret ? 1 : 0;
+	return !!btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
 }
 
 int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
-	struct btrfs_block_rsv *rsv = trans->block_rsv;
 	int updates;
+	int err;
 
 	smp_mb();
-	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
+	if (cur_trans->state >= TRANS_STATE_BLOCKED ||
+	    cur_trans->delayed_refs.flushing)
 		return 1;
 
-	/*
-	 * We need to do this in case we're deleting csums so the global block
-	 * rsv get's used instead of the csum block rsv.
-	 */
-	trans->block_rsv = NULL;
-
 	updates = trans->delayed_ref_updates;
 	trans->delayed_ref_updates = 0;
-	if (updates)
-		btrfs_run_delayed_refs(trans, root, updates);
-
-	trans->block_rsv = rsv;
+	if (updates) {
+		err = btrfs_run_delayed_refs(trans, root, updates);
+		if (err) /* Error code will also eval true */
+			return err;
+	}
 
 	return should_end_transaction(trans, root);
 }
 
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, int throttle, int lock)
+			  struct btrfs_root *root, int throttle)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_fs_info *info = root->fs_info;
-	int count = 0;
+	unsigned long cur = trans->delayed_ref_updates;
+	int lock = (trans->type != TRANS_JOIN_NOLOCK);
+	int err = 0;
 
 	if (--trans->use_count) {
 		trans->block_rsv = trans->orig_rsv;
 		return 0;
 	}
 
+	/*
+	 * do the qgroup accounting as early as possible
+	 */
+	err = btrfs_delayed_refs_qgroup_accounting(trans, info);
+
 	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
-	while (count < 4) {
-		unsigned long cur = trans->delayed_ref_updates;
-		trans->delayed_ref_updates = 0;
-		if (cur &&
-		    trans->transaction->delayed_refs.num_heads_ready > 64) {
-			trans->delayed_ref_updates = 0;
 
-			/*
-			 * do a full flush if the transaction is trying
-			 * to close
-			 */
-			if (trans->transaction->delayed_refs.flushing)
-				cur = 0;
-			btrfs_run_delayed_refs(trans, root, cur);
-		} else {
-			break;
-		}
-		count++;
+	if (trans->qgroup_reserved) {
+		/*
+		 * the same root has to be passed here between start_transaction
+		 * and end_transaction. Subvolume quota depends on this.
+		 */
+		btrfs_qgroup_free(trans->root, trans->qgroup_reserved);
+		trans->qgroup_reserved = 0;
 	}
 
+	if (!list_empty(&trans->new_bgs))
+		btrfs_create_pending_block_groups(trans, root);
+
+	trans->delayed_ref_updates = 0;
+	if (btrfs_should_throttle_delayed_refs(trans, root)) {
+		cur = max_t(unsigned long, cur, 1);
+		trans->delayed_ref_updates = 0;
+		btrfs_run_delayed_refs(trans, root, cur);
+	}
+
+	btrfs_trans_release_metadata(trans, root);
+	trans->block_rsv = NULL;
+
+	if (!list_empty(&trans->new_bgs))
+		btrfs_create_pending_block_groups(trans, root);
+
 	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
-	    should_end_transaction(trans, root)) {
-		trans->transaction->blocked = 1;
-		smp_wmb();
+	    should_end_transaction(trans, root) &&
+	    ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) {
+		spin_lock(&info->trans_lock);
+		if (cur_trans->state == TRANS_STATE_RUNNING)
+			cur_trans->state = TRANS_STATE_BLOCKED;
+		spin_unlock(&info->trans_lock);
 	}
 
-	if (lock && cur_trans->blocked && !cur_trans->in_commit) {
+	if (lock && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
 		if (throttle) {
 			/*
 			 * We may race with somebody else here so end up having
@@ -507,9 +724,13 @@
 		}
 	}
 
+	if (trans->type & __TRANS_FREEZABLE)
+		sb_end_intwrite(root->fs_info->sb);
+
 	WARN_ON(cur_trans != info->running_transaction);
 	WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
 	atomic_dec(&cur_trans->num_writers);
+	extwriter_counter_dec(cur_trans, trans->type);
 
 	smp_mb();
 	if (waitqueue_active(&cur_trans->writer_wait))
@@ -518,52 +739,35 @@
 
 	if (current->journal_info == trans)
 		current->journal_info = NULL;
-	memset(trans, 0, sizeof(*trans));
-	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
 	if (throttle)
 		btrfs_run_delayed_iputs(root);
 
-	return 0;
+	if (trans->aborted ||
+	    test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))
+		err = -EIO;
+	assert_qgroups_uptodate(trans);
+
+	kmem_cache_free(btrfs_trans_handle_cachep, trans);
+	return err;
 }
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root)
 {
-	int ret;
-
-	ret = __btrfs_end_transaction(trans, root, 0, 1);
-	if (ret)
-		return ret;
-	return 0;
+	return __btrfs_end_transaction(trans, root, 0);
 }
 
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root)
 {
-	int ret;
-
-	ret = __btrfs_end_transaction(trans, root, 1, 1);
-	if (ret)
-		return ret;
-	return 0;
-}
-
-int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root)
-{
-	int ret;
-
-	ret = __btrfs_end_transaction(trans, root, 0, 0);
-	if (ret)
-		return ret;
-	return 0;
+	return __btrfs_end_transaction(trans, root, 1);
 }
 
 int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root)
 {
-	return __btrfs_end_transaction(trans, root, 1, 1);
+	return __btrfs_end_transaction(trans, root, 1);
 }
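
__btrfs_end_transaction() above uses a lockless ACCESS_ONCE() peek at the transaction state and only takes trans_lock to re-test and perform the RUNNING -> BLOCKED transition. A user-space analogue of that check-then-recheck idiom (pthread mutex standing in for the spinlock):

	#include <pthread.h>

	enum { RUNNING, BLOCKED };

	static int state = RUNNING;
	static pthread_mutex_t state_lock = PTHREAD_MUTEX_INITIALIZER;

	static void maybe_block(void)
	{
		/* cheap racy peek: skip the lock in the common case */
		if (__atomic_load_n(&state, __ATOMIC_RELAXED) != RUNNING)
			return;

		pthread_mutex_lock(&state_lock);
		if (state == RUNNING)	/* re-test now that we hold the lock */
			state = BLOCKED;
		pthread_mutex_unlock(&state_lock);
	}
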
 
 /*
@@ -577,13 +781,15 @@
 	int err = 0;
 	int werr = 0;
 	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
 
 	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
-				      mark)) {
-		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
-				   GFP_NOFS);
+				      mark, &cached_state)) {
+		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+				   mark, &cached_state, GFP_NOFS);
+		cached_state = NULL;
 		err = filemap_fdatawrite_range(mapping, start, end);
 		if (err)
 			werr = err;
@@ -607,12 +813,14 @@
 	int err = 0;
 	int werr = 0;
 	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
+	struct extent_state *cached_state = NULL;
 	u64 start = 0;
 	u64 end;
 
 	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
-				      EXTENT_NEED_WAIT)) {
-		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
+				      EXTENT_NEED_WAIT, &cached_state)) {
+		clear_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT,
+				 0, 0, &cached_state, GFP_NOFS);
 		err = filemap_fdatawait_range(mapping, start, end);
 		if (err)
 			werr = err;
@@ -629,13 +837,16 @@
  * them in one of two extent_io trees.  This is used to make sure all of
  * those extents are on disk for transaction or log commit
  */
-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 				struct extent_io_tree *dirty_pages, int mark)
 {
 	int ret;
 	int ret2;
+	struct blk_plug plug;
 
+	blk_start_plug(&plug);
 	ret = btrfs_write_marked_extents(root, dirty_pages, mark);
+	blk_finish_plug(&plug);
 	ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
 
 	if (ret)
@@ -689,11 +900,13 @@
 		ret = btrfs_update_root(trans, tree_root,
 					&root->root_key,
 					&root->root_item);
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 
 		old_root_used = btrfs_root_used(&root->root_item);
 		ret = btrfs_write_dirty_block_groups(trans, root);
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 	}
 
 	if (root != root->fs_info->extent_root)
@@ -704,6 +917,10 @@
 
 /*
  * update all the cowonly tree roots on disk
+ *
+ * The error handling in this function may not be obvious. Any of the
+ * failures will cause the file system to go offline. We still need
+ * to clean up the delayed refs.
  */
 static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
 					 struct btrfs_root *root)
@@ -714,13 +931,31 @@
 	int ret;
 
 	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-	BUG_ON(ret);
+	if (ret)
+		return ret;
 
 	eb = btrfs_lock_root_node(fs_info->tree_root);
-	btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
+	ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
+			      0, &eb);
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
 
+	if (ret)
+		return ret;
+
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	if (ret)
+		return ret;
+
+	ret = btrfs_run_dev_stats(trans, root->fs_info);
+	WARN_ON(ret);
+	ret = btrfs_run_dev_replace(trans, root->fs_info);
+	WARN_ON(ret);
+
+	ret = btrfs_run_qgroups(trans, root->fs_info);
+	BUG_ON(ret);
+
+	/* run_qgroups might have added some more refs */
 	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 	BUG_ON(ret);
 
@@ -729,13 +964,17 @@
 		list_del_init(next);
 		root = list_entry(next, struct btrfs_root, dirty_list);
 
-		update_cowonly_root(trans, root);
+		ret = update_cowonly_root(trans, root);
+		if (ret)
+			return ret;
 	}
 
 	down_write(&fs_info->extent_commit_sem);
 	switch_commit_root(fs_info->extent_root);
 	up_write(&fs_info->extent_commit_sem);
 
+	btrfs_after_dev_replace_commit(fs_info);
+
 	return 0;
 }
 
@@ -744,12 +983,12 @@
  * a dirty root struct and adds it into the list of dead roots that need to
  * be deleted
  */
-int btrfs_add_dead_root(struct btrfs_root *root)
+void btrfs_add_dead_root(struct btrfs_root *root)
 {
 	spin_lock(&root->fs_info->trans_lock);
-	list_add(&root->root_list, &root->fs_info->dead_roots);
+	if (list_empty(&root->root_list))
+		list_add_tail(&root->root_list, &root->fs_info->dead_roots);
 	spin_unlock(&root->fs_info->trans_lock);
-	return 0;
 }
 
 /*
@@ -812,15 +1051,14 @@
 }
 
 /*
- * defrag a given btree.  If cacheonly == 1, this won't read from the disk,
- * otherwise every leaf in the btree is read and defragged.
+ * defrag a given btree.
+ * Every leaf in the btree is read and defragged.
  */
-int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
+int btrfs_defrag_root(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_trans_handle *trans;
 	int ret;
-	unsigned long nr;
 
 	if (xchg(&root->defrag_running, 1))
 		return 0;
@@ -830,15 +1068,20 @@
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
 
-		ret = btrfs_defrag_leaves(trans, root, cacheonly);
+		ret = btrfs_defrag_leaves(trans, root);
 
-		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
-		btrfs_btree_balance_dirty(info->tree_root, nr);
+		btrfs_btree_balance_dirty(info->tree_root);
 		cond_resched();
 
 		if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
 			break;
+
+		if (btrfs_defrag_cancelled(root->fs_info)) {
+			printk(KERN_DEBUG "btrfs: defrag_root cancelled\n");
+			ret = -EAGAIN;
+			break;
+		}
 	}
 	root->defrag_running = 0;
 	return ret;
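
btrfs_defrag_root() above guards against concurrent defraggers with xchg(&root->defrag_running, 1): the atomic swap both sets the flag and reports whether someone else already held it. A user-space analogue using GCC atomic builtins:

	static int defrag_running;	/* 0 = idle, non-zero = defrag in progress */

	/* returns non-zero if we won the race and may start defragging */
	static int try_start_defrag(void)
	{
		return __atomic_exchange_n(&defrag_running, 1, __ATOMIC_ACQ_REL) == 0;
	}

	static void finish_defrag(void)
	{
		__atomic_store_n(&defrag_running, 0, __ATOMIC_RELEASE);
	}
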
@@ -846,7 +1089,12 @@
 
 /*
  * new snapshots need to be created at a very specific time in the
- * transaction commit.  This does the actual creation
+ * transaction commit.  This does the actual creation.
+ *
+ * Note:
+ * If an error occurs that may affect the commitment of the current
+ * transaction, we should return that error number. If an error only
+ * affects the creation of the pending snapshots, just return 0.
  */
 static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				   struct btrfs_fs_info *fs_info,
@@ -859,50 +1107,62 @@
 	struct btrfs_root *parent_root;
 	struct btrfs_block_rsv *rsv;
 	struct inode *parent_inode;
-	struct dentry *parent;
+	struct btrfs_path *path;
+	struct btrfs_dir_item *dir_item;
 	struct dentry *dentry;
 	struct extent_buffer *tmp;
 	struct extent_buffer *old;
-	int ret;
+	struct timespec cur_time = CURRENT_TIME;
+	int ret = 0;
 	u64 to_reserve = 0;
 	u64 index = 0;
 	u64 objectid;
 	u64 root_flags;
+	uuid_le new_uuid;
 
-	rsv = trans->block_rsv;
+	path = btrfs_alloc_path();
+	if (!path) {
+		pending->error = -ENOMEM;
+		return 0;
+	}
 
 	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
 	if (!new_root_item) {
 		pending->error = -ENOMEM;
-		goto fail;
+		goto root_item_alloc_fail;
 	}
 
-	ret = btrfs_find_free_objectid(tree_root, &objectid);
-	if (ret) {
-		pending->error = ret;
-		goto fail;
-	}
+	pending->error = btrfs_find_free_objectid(tree_root, &objectid);
+	if (pending->error)
+		goto no_free_objectid;
 
 	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
 
 	if (to_reserve > 0) {
-		ret = btrfs_block_rsv_add_noflush(root, &pending->block_rsv,
-						  to_reserve);
-		if (ret) {
-			pending->error = ret;
-			goto fail;
-		}
+		pending->error = btrfs_block_rsv_add(root,
+						     &pending->block_rsv,
+						     to_reserve,
+						     BTRFS_RESERVE_NO_FLUSH);
+		if (pending->error)
+			goto no_free_objectid;
 	}
 
+	pending->error = btrfs_qgroup_inherit(trans, fs_info,
+					      root->root_key.objectid,
+					      objectid, pending->inherit);
+	if (pending->error)
+		goto no_free_objectid;
+
 	key.objectid = objectid;
 	key.offset = (u64)-1;
 	key.type = BTRFS_ROOT_ITEM_KEY;
 
+	rsv = trans->block_rsv;
 	trans->block_rsv = &pending->block_rsv;
+	trans->bytes_reserved = trans->block_rsv->reserved;
 
 	dentry = pending->dentry;
-	parent = dget_parent(dentry);
-	parent_inode = parent->d_inode;
+	parent_inode = pending->dir;
 	parent_root = BTRFS_I(parent_inode)->root;
 	record_root_in_trans(trans, parent_root);
 
@@ -910,17 +1170,22 @@
 	 * insert the directory item
 	 */
 	ret = btrfs_set_inode_index(parent_inode, &index);
-	BUG_ON(ret);
-	ret = btrfs_insert_dir_item(trans, parent_root,
-				dentry->d_name.name, dentry->d_name.len,
-				parent_inode, &key,
-				BTRFS_FT_DIR, index);
-	BUG_ON(ret);
+	BUG_ON(ret); /* -ENOMEM */
 
-	btrfs_i_size_write(parent_inode, parent_inode->i_size +
-					 dentry->d_name.len * 2);
-	ret = btrfs_update_inode(trans, parent_root, parent_inode);
-	BUG_ON(ret);
+	/* check if there is a file/dir which has the same name. */
+	dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
+					 btrfs_ino(parent_inode),
+					 dentry->d_name.name,
+					 dentry->d_name.len, 0);
+	if (dir_item != NULL && !IS_ERR(dir_item)) {
+		pending->error = -EEXIST;
+		goto dir_item_existed;
+	} else if (IS_ERR(dir_item)) {
+		ret = PTR_ERR(dir_item);
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+	btrfs_release_path(path);
 
 	/*
 	 * pull in the delayed directory update
@@ -929,7 +1194,10 @@
 	 * snapshot
 	 */
 	ret = btrfs_run_delayed_items(trans, root);
-	BUG_ON(ret);
+	if (ret) {	/* Transaction aborted */
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	record_root_in_trans(trans, root);
 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
@@ -943,13 +1211,43 @@
 		root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
 	btrfs_set_root_flags(new_root_item, root_flags);
 
+	btrfs_set_root_generation_v2(new_root_item,
+			trans->transid);
+	uuid_le_gen(&new_uuid);
+	memcpy(new_root_item->uuid, new_uuid.b, BTRFS_UUID_SIZE);
+	memcpy(new_root_item->parent_uuid, root->root_item.uuid,
+			BTRFS_UUID_SIZE);
+	if (!(root_flags & BTRFS_ROOT_SUBVOL_RDONLY)) {
+		memset(new_root_item->received_uuid, 0,
+		       sizeof(new_root_item->received_uuid));
+		memset(&new_root_item->stime, 0, sizeof(new_root_item->stime));
+		memset(&new_root_item->rtime, 0, sizeof(new_root_item->rtime));
+		btrfs_set_root_stransid(new_root_item, 0);
+		btrfs_set_root_rtransid(new_root_item, 0);
+	}
+	btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
+	btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
+	btrfs_set_root_otransid(new_root_item, trans->transid);
+
 	old = btrfs_lock_root_node(root);
-	btrfs_cow_block(trans, root, old, NULL, 0, &old);
+	ret = btrfs_cow_block(trans, root, old, NULL, 0, &old);
+	if (ret) {
+		btrfs_tree_unlock(old);
+		free_extent_buffer(old);
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
 	btrfs_set_lock_blocking(old);
 
-	btrfs_copy_root(trans, root, old, &tmp, objectid);
+	ret = btrfs_copy_root(trans, root, old, &tmp, objectid);
+	/* clean up in any case */
 	btrfs_tree_unlock(old);
 	free_extent_buffer(old);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	/* see comments in should_cow_block() */
 	root->force_cow = 1;
@@ -961,7 +1259,10 @@
 	ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
 	btrfs_tree_unlock(tmp);
 	free_extent_buffer(tmp);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	/*
 	 * insert root back/forward references
@@ -970,19 +1271,76 @@
 				 parent_root->root_key.objectid,
 				 btrfs_ino(parent_inode), index,
 				 dentry->d_name.name, dentry->d_name.len);
-	BUG_ON(ret);
-	dput(parent);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
 	key.offset = (u64)-1;
 	pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
-	BUG_ON(IS_ERR(pending->snap));
+	if (IS_ERR(pending->snap)) {
+		ret = PTR_ERR(pending->snap);
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
 
-	btrfs_reloc_post_snapshot(trans, pending);
+	ret = btrfs_reloc_post_snapshot(trans, pending);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	ret = btrfs_insert_dir_item(trans, parent_root,
+				    dentry->d_name.name, dentry->d_name.len,
+				    parent_inode, &key,
+				    BTRFS_FT_DIR, index);
+	/* We have checked the name at the beginning, so it is impossible. */
+	BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+
+	btrfs_i_size_write(parent_inode, parent_inode->i_size +
+					 dentry->d_name.len * 2);
+	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
+	ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+	ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b,
+				  BTRFS_UUID_KEY_SUBVOL, objectid);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+	if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
+		ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+					  new_root_item->received_uuid,
+					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+					  objectid);
+		if (ret && ret != -EEXIST) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto fail;
+		}
+	}
 fail:
-	kfree(new_root_item);
+	pending->error = ret;
+dir_item_existed:
 	trans->block_rsv = rsv;
-	btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
-	return 0;
+	trans->bytes_reserved = 0;
+no_free_objectid:
+	kfree(new_root_item);
+root_item_alloc_fail:
+	btrfs_free_path(path);
+	return ret;
 }
 
 /*
@@ -991,15 +1349,17 @@
 static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans,
 					     struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_pending_snapshot *pending;
+	struct btrfs_pending_snapshot *pending, *next;
 	struct list_head *head = &trans->transaction->pending_snapshots;
-	int ret;
+	int ret = 0;
 
-	list_for_each_entry(pending, head, list) {
+	list_for_each_entry_safe(pending, next, head, list) {
+		list_del(&pending->list);
 		ret = create_pending_snapshot(trans, fs_info, pending);
-		BUG_ON(ret);
+		if (ret)
+			break;
 	}
-	return 0;
+	return ret;
 }
 
 static void update_super_roots(struct btrfs_root *root)
@@ -1020,24 +1380,32 @@
 	super->root_level = root_item->level;
 	if (btrfs_test_opt(root, SPACE_CACHE))
 		super->cache_generation = root_item->generation;
+	if (root->fs_info->update_uuid_tree_gen)
+		super->uuid_tree_generation = root_item->generation;
 }
 
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
 {
+	struct btrfs_transaction *trans;
 	int ret = 0;
+
 	spin_lock(&info->trans_lock);
-	if (info->running_transaction)
-		ret = info->running_transaction->in_commit;
+	trans = info->running_transaction;
+	if (trans)
+		ret = (trans->state >= TRANS_STATE_COMMIT_START);
 	spin_unlock(&info->trans_lock);
 	return ret;
 }
 
 int btrfs_transaction_blocked(struct btrfs_fs_info *info)
 {
+	struct btrfs_transaction *trans;
 	int ret = 0;
+
 	spin_lock(&info->trans_lock);
-	if (info->running_transaction)
-		ret = info->running_transaction->blocked;
+	trans = info->running_transaction;
+	if (trans)
+		ret = is_transaction_blocked(trans);
 	spin_unlock(&info->trans_lock);
 	return ret;
 }
@@ -1049,7 +1417,9 @@
 static void wait_current_trans_commit_start(struct btrfs_root *root,
 					    struct btrfs_transaction *trans)
 {
-	wait_event(root->fs_info->transaction_blocked_wait, trans->in_commit);
+	wait_event(root->fs_info->transaction_blocked_wait,
+		   trans->state >= TRANS_STATE_COMMIT_START ||
+		   trans->aborted);
 }
 
 /*
@@ -1060,7 +1430,8 @@
 					 struct btrfs_transaction *trans)
 {
 	wait_event(root->fs_info->transaction_wait,
-		   trans->commit_done || (trans->in_commit && !trans->blocked));
+		   trans->state >= TRANS_STATE_UNBLOCKED ||
+		   trans->aborted);
 }
 
 /*
@@ -1070,13 +1441,24 @@
 struct btrfs_async_commit {
 	struct btrfs_trans_handle *newtrans;
 	struct btrfs_root *root;
-	struct delayed_work work;
+	struct work_struct work;
 };
 
 static void do_async_commit(struct work_struct *work)
 {
 	struct btrfs_async_commit *ac =
-		container_of(work, struct btrfs_async_commit, work.work);
+		container_of(work, struct btrfs_async_commit, work);
+
+	/*
+	 * We've got freeze protection passed with the transaction.
+	 * Tell lockdep about it.
+	 */
+	if (ac->newtrans->type < TRANS_JOIN_NOLOCK)
+		rwsem_acquire_read(
+		     &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+		     0, 1, _THIS_IP_);
+
+	current->journal_info = ac->newtrans;
 
 	btrfs_commit_transaction(ac->newtrans, ac->root);
 	kfree(ac);
@@ -1093,7 +1475,7 @@
 	if (!ac)
 		return -ENOMEM;
 
-	INIT_DELAYED_WORK(&ac->work, do_async_commit);
+	INIT_WORK(&ac->work, do_async_commit);
 	ac->root = root;
 	ac->newtrans = btrfs_join_transaction(root);
 	if (IS_ERR(ac->newtrans)) {
@@ -1107,7 +1489,17 @@
 	atomic_inc(&cur_trans->use_count);
 
 	btrfs_end_transaction(trans, root);
-	schedule_delayed_work(&ac->work, 0);
+
+	/*
+	 * Tell lockdep we've released the freeze rwsem, since the
+	 * async commit thread will be the one to unlock it.
+	 */
+	if (trans->type < TRANS_JOIN_NOLOCK)
+		rwsem_release(
+			&root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1],
+			1, _THIS_IP_);
+
+	schedule_work(&ac->work);
 
 	/* wait for transaction to start and unblock */
 	if (wait_for_unblock)
@@ -1122,69 +1514,173 @@
 	return 0;
 }
 
-/*
- * btrfs_transaction state sequence:
- *    in_commit = 0, blocked = 0  (initial)
- *    in_commit = 1, blocked = 1
- *    blocked = 0
- *    commit_done = 1
- */
+
+static void cleanup_transaction(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root, int err)
+{
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	DEFINE_WAIT(wait);
+
+	WARN_ON(trans->use_count > 1);
+
+	btrfs_abort_transaction(trans, root, err);
+
+	spin_lock(&root->fs_info->trans_lock);
+
+	/*
+	 * If the transaction is removed from the list, it means this
+	 * transaction has been committed successfully, so it is impossible
+	 * to call the cleanup function.
+	 */
+	BUG_ON(list_empty(&cur_trans->list));
+
+	list_del_init(&cur_trans->list);
+	if (cur_trans == root->fs_info->running_transaction) {
+		cur_trans->state = TRANS_STATE_COMMIT_DOING;
+		spin_unlock(&root->fs_info->trans_lock);
+		wait_event(cur_trans->writer_wait,
+			   atomic_read(&cur_trans->num_writers) == 1);
+
+		spin_lock(&root->fs_info->trans_lock);
+	}
+	spin_unlock(&root->fs_info->trans_lock);
+
+	btrfs_cleanup_one_transaction(trans->transaction, root);
+
+	spin_lock(&root->fs_info->trans_lock);
+	if (cur_trans == root->fs_info->running_transaction)
+		root->fs_info->running_transaction = NULL;
+	spin_unlock(&root->fs_info->trans_lock);
+
+	put_transaction(cur_trans);
+	put_transaction(cur_trans);
+
+	trace_btrfs_transaction_commit(root);
+
+	btrfs_scrub_continue(root);
+
+	if (current->journal_info == trans)
+		current->journal_info = NULL;
+
+	kmem_cache_free(btrfs_trans_handle_cachep, trans);
+}
+
+static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root)
+{
+	int ret;
+
+	ret = btrfs_run_delayed_items(trans, root);
+	if (ret)
+		return ret;
+
+	/*
+	 * running the delayed items may have added new refs. account
+	 * them now so that they hinder processing of more delayed refs
+	 * as little as possible.
+	 */
+	btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
+
+	/*
+	 * rename doesn't use btrfs_join_transaction, so, once we
+	 * set the transaction to blocked above, we aren't going
+	 * to get any new ordered operations.  We can safely run
+	 * it here and know for sure that nothing new will be added
+	 * to the list.
+	 */
+	ret = btrfs_run_ordered_operations(trans, root, 1);
+
+	return ret;
+}
+
+static inline int btrfs_start_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+	if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+		return btrfs_start_all_delalloc_inodes(fs_info, 1);
+	return 0;
+}
+
+static inline void btrfs_wait_delalloc_flush(struct btrfs_fs_info *fs_info)
+{
+	if (btrfs_test_opt(fs_info->tree_root, FLUSHONCOMMIT))
+		btrfs_wait_all_ordered_extents(fs_info);
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
-	unsigned long joined = 0;
-	struct btrfs_transaction *cur_trans;
+	struct btrfs_transaction *cur_trans = trans->transaction;
 	struct btrfs_transaction *prev_trans = NULL;
-	DEFINE_WAIT(wait);
 	int ret;
-	int should_grow = 0;
-	unsigned long now = get_seconds();
-	int flush_on_commit = btrfs_test_opt(root, FLUSHONCOMMIT);
 
-	btrfs_run_ordered_operations(root, 0);
+	ret = btrfs_run_ordered_operations(trans, root, 0);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
 
-	btrfs_trans_release_metadata(trans, root);
-	trans->block_rsv = NULL;
+	/* Stop the commit early if ->aborted is set */
+	if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+		ret = cur_trans->aborted;
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
 
 	/* make a pass through all the delayed refs we have so far
 	 * any runnings procs may add more while we are here
 	 */
 	ret = btrfs_run_delayed_refs(trans, root, 0);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
+
+	btrfs_trans_release_metadata(trans, root);
+	trans->block_rsv = NULL;
+	if (trans->qgroup_reserved) {
+		btrfs_qgroup_free(root, trans->qgroup_reserved);
+		trans->qgroup_reserved = 0;
+	}
 
 	cur_trans = trans->transaction;
+
 	/*
 	 * set the flushing flag so procs in this transaction have to
 	 * start sending their work down.
 	 */
 	cur_trans->delayed_refs.flushing = 1;
+	smp_wmb();
+
+	if (!list_empty(&trans->new_bgs))
+		btrfs_create_pending_block_groups(trans, root);
 
 	ret = btrfs_run_delayed_refs(trans, root, 0);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_end_transaction(trans, root);
+		return ret;
+	}
 
-	spin_lock(&cur_trans->commit_lock);
-	if (cur_trans->in_commit) {
-		spin_unlock(&cur_trans->commit_lock);
+	spin_lock(&root->fs_info->trans_lock);
+	if (cur_trans->state >= TRANS_STATE_COMMIT_START) {
+		spin_unlock(&root->fs_info->trans_lock);
 		atomic_inc(&cur_trans->use_count);
-		btrfs_end_transaction(trans, root);
+		ret = btrfs_end_transaction(trans, root);
 
 		wait_for_commit(root, cur_trans);
 
 		put_transaction(cur_trans);
 
-		return 0;
+		return ret;
 	}
 
-	trans->transaction->in_commit = 1;
-	trans->transaction->blocked = 1;
-	spin_unlock(&cur_trans->commit_lock);
+	cur_trans->state = TRANS_STATE_COMMIT_START;
 	wake_up(&root->fs_info->transaction_blocked_wait);
 
-	spin_lock(&root->fs_info->trans_lock);
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
 		prev_trans = list_entry(cur_trans->list.prev,
 					struct btrfs_transaction, list);
-		if (!prev_trans->commit_done) {
+		if (prev_trans->state != TRANS_STATE_COMPLETED) {
 			atomic_inc(&prev_trans->use_count);
 			spin_unlock(&root->fs_info->trans_lock);
 
@@ -1198,59 +1694,41 @@
 		spin_unlock(&root->fs_info->trans_lock);
 	}
 
-	if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
-		should_grow = 1;
+	extwriter_counter_dec(cur_trans, trans->type);
 
-	do {
-		int snap_pending = 0;
-
-		joined = cur_trans->num_joined;
-		if (!list_empty(&trans->transaction->pending_snapshots))
-			snap_pending = 1;
-
-		WARN_ON(cur_trans != trans->transaction);
-
-		if (flush_on_commit || snap_pending) {
-			btrfs_start_delalloc_inodes(root, 1);
-			ret = btrfs_wait_ordered_extents(root, 0, 1);
-			BUG_ON(ret);
-		}
-
-		ret = btrfs_run_delayed_items(trans, root);
-		BUG_ON(ret);
+	ret = btrfs_start_delalloc_flush(root->fs_info);
+	if (ret)
+		goto cleanup_transaction;
 
-		/*
-		 * rename don't use btrfs_join_transaction, so, once we
-		 * set the transaction to blocked above, we aren't going
-		 * to get any new ordered operations.  We can safely run
-		 * it here and no for sure that nothing new will be added
-		 * to the list
-		 */
-		btrfs_run_ordered_operations(root, 1);
+	ret = btrfs_flush_all_pending_stuffs(trans, root);
+	if (ret)
+		goto cleanup_transaction;
 
-		prepare_to_wait(&cur_trans->writer_wait, &wait,
-				TASK_UNINTERRUPTIBLE);
+	wait_event(cur_trans->writer_wait,
+		   extwriter_counter_read(cur_trans) == 0);
 
-		if (atomic_read(&cur_trans->num_writers) > 1)
-			schedule_timeout(MAX_SCHEDULE_TIMEOUT);
-		else if (should_grow)
-			schedule_timeout(1);
-
-		finish_wait(&cur_trans->writer_wait, &wait);
-	} while (atomic_read(&cur_trans->num_writers) > 1 ||
-		 (should_grow && cur_trans->num_joined != joined));
+	/* some pending stuff might be added after the previous flush. */
+	ret = btrfs_flush_all_pending_stuffs(trans, root);
+	if (ret)
+		goto cleanup_transaction;
 
+	btrfs_wait_delalloc_flush(root->fs_info);
 	/*
 	 * Ok now we need to make sure to block out any other joins while we
 	 * commit the transaction.  We could have started a join before setting
-	 * no_join so make sure to wait for num_writers to == 1 again.
+	 * COMMIT_DOING so make sure to wait for num_writers to == 1 again.
 	 */
 	spin_lock(&root->fs_info->trans_lock);
-	root->fs_info->trans_no_join = 1;
+	cur_trans->state = TRANS_STATE_COMMIT_DOING;
 	spin_unlock(&root->fs_info->trans_lock);
 	wait_event(cur_trans->writer_wait,
 		   atomic_read(&cur_trans->num_writers) == 1);
 
+	/* ->aborted might be set after the previous check, so check it */
+	if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+		ret = cur_trans->aborted;
+		goto cleanup_transaction;
+	}
 	/*
 	 * the reloc mutex makes sure that we stop
 	 * the balancing code from coming in and moving
@@ -1258,14 +1736,38 @@
 	 */
 	mutex_lock(&root->fs_info->reloc_mutex);
 
-	ret = btrfs_run_delayed_items(trans, root);
-	BUG_ON(ret);
-
+	/*
+	 * We needn't worry about the delayed items because we will
+	 * deal with them in create_pending_snapshot(), which is the
+	 * core function of the snapshot creation.
+	 */
 	ret = create_pending_snapshots(trans, root->fs_info);
-	BUG_ON(ret);
+	if (ret) {
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto cleanup_transaction;
+	}
+
+	/*
+	 * We insert the dir indexes of the snapshots and update the inode
+	 * of the snapshots' parents after the snapshot creation, so there
+	 * are some delayed items which are not dealt with. Now deal with
+	 * them.
+	 *
+	 * We needn't worry that this operation will corrupt the snapshots,
+	 * because all the trees which are snapshotted will be forced to COW
+	 * the nodes and leaves.
+	 */
+	ret = btrfs_run_delayed_items(trans, root);
+	if (ret) {
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto cleanup_transaction;
+	}
 
 	ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
-	BUG_ON(ret);
+	if (ret) {
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto cleanup_transaction;
+	}
 
 	/*
 	 * make sure none of the code above managed to slip in a
@@ -1292,7 +1794,11 @@
 	mutex_lock(&root->fs_info->tree_log_mutex);
 
 	ret = commit_fs_roots(trans, root);
-	BUG_ON(ret);
+	if (ret) {
+		mutex_unlock(&root->fs_info->tree_log_mutex);
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto cleanup_transaction;
+	}
 
 	/* commit_fs_roots gets rid of all the tree log roots, it is now
 	 * safe to free the root of tree log roots
@@ -1300,7 +1806,22 @@
 	btrfs_free_log_root_tree(trans, root->fs_info);
 
 	ret = commit_cowonly_roots(trans, root);
-	BUG_ON(ret);
+	if (ret) {
+		mutex_unlock(&root->fs_info->tree_log_mutex);
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto cleanup_transaction;
+	}
+
+	/*
+	 * The tasks which save the space cache and inode cache may also
+	 * update ->aborted, check it.
+	 */
+	if (unlikely(ACCESS_ONCE(cur_trans->aborted))) {
+		ret = cur_trans->aborted;
+		mutex_unlock(&root->fs_info->tree_log_mutex);
+		mutex_unlock(&root->fs_info->reloc_mutex);
+		goto cleanup_transaction;
+	}
 
 	btrfs_prepare_extent_commit(trans, root);
 
@@ -1314,28 +1835,35 @@
 			    root->fs_info->chunk_root->node);
 	switch_commit_root(root->fs_info->chunk_root);
 
+	assert_qgroups_uptodate(trans);
 	update_super_roots(root);
 
-	if (!root->fs_info->log_root_recovering) {
-		btrfs_set_super_log_root(root->fs_info->super_copy, 0);
-		btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
-	}
-
+	btrfs_set_super_log_root(root->fs_info->super_copy, 0);
+	btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
 	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
 	       sizeof(*root->fs_info->super_copy));
 
-	trans->transaction->blocked = 0;
 	spin_lock(&root->fs_info->trans_lock);
+	cur_trans->state = TRANS_STATE_UNBLOCKED;
 	root->fs_info->running_transaction = NULL;
-	root->fs_info->trans_no_join = 0;
 	spin_unlock(&root->fs_info->trans_lock);
 	mutex_unlock(&root->fs_info->reloc_mutex);
 
 	wake_up(&root->fs_info->transaction_wait);
 
 	ret = btrfs_write_and_wait_transaction(trans, root);
-	BUG_ON(ret);
-	write_ctree_super(trans, root, 0);
+	if (ret) {
+		btrfs_error(root->fs_info, ret,
+			    "Error while writing out transaction");
+		mutex_unlock(&root->fs_info->tree_log_mutex);
+		goto cleanup_transaction;
+	}
+
+	ret = write_ctree_super(trans, root, 0);
+	if (ret) {
+		mutex_unlock(&root->fs_info->tree_log_mutex);
+		goto cleanup_transaction;
+	}
 
 	/*
 	 * the super is written, we can safely allow the tree-loggers
@@ -1345,10 +1873,12 @@
 
 	btrfs_finish_extent_commit(trans, root);
 
-	cur_trans->commit_done = 1;
-
 	root->fs_info->last_trans_committed = cur_trans->transid;
-
+	/*
+	 * We needn't acquire the lock here because there is no other task
+	 * which can change it.
+	 */
+	cur_trans->state = TRANS_STATE_COMPLETED;
 	wake_up(&cur_trans->commit_wait);
 
 	spin_lock(&root->fs_info->trans_lock);
@@ -1358,6 +1888,9 @@
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
+	if (trans->type & __TRANS_FREEZABLE)
+		sb_end_intwrite(root->fs_info->sb);
+
 	trace_btrfs_transaction_commit(root);
 
 	btrfs_scrub_continue(root);
@@ -1371,31 +1904,59 @@
 		btrfs_run_delayed_iputs(root);
 
 	return ret;
+
+cleanup_transaction:
+	btrfs_trans_release_metadata(trans, root);
+	trans->block_rsv = NULL;
+	if (trans->qgroup_reserved) {
+		btrfs_qgroup_free(root, trans->qgroup_reserved);
+		trans->qgroup_reserved = 0;
+	}
+	btrfs_warn(root->fs_info, "Skipping commit of aborted transaction.");
+	if (current->journal_info == trans)
+		current->journal_info = NULL;
+	cleanup_transaction(trans, root, ret);
+
+	return ret;
 }
 
 /*
- * interface function to delete all the snapshots we have scheduled for deletion
+ * return < 0 if error
+ * 0 if there are no more dead_roots at the time of call
+ * 1 if there are more to be processed, call me again
+ *
+ * The return value indicates there are certainly more snapshots to delete, but
+ * if a new one comes in during processing, it may return 0. We don't mind,
+ * because btrfs_commit_super will poke the cleaner thread and it will process
+ * it a few seconds later.
  */
-int btrfs_clean_old_snapshots(struct btrfs_root *root)
+int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
 {
-	LIST_HEAD(list);
+	int ret;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 
 	spin_lock(&fs_info->trans_lock);
-	list_splice_init(&fs_info->dead_roots, &list);
+	if (list_empty(&fs_info->dead_roots)) {
+		spin_unlock(&fs_info->trans_lock);
+		return 0;
+	}
+	root = list_first_entry(&fs_info->dead_roots,
+			struct btrfs_root, root_list);
+	list_del_init(&root->root_list);
 	spin_unlock(&fs_info->trans_lock);
 
-	while (!list_empty(&list)) {
-		root = list_entry(list.next, struct btrfs_root, root_list);
-		list_del(&root->root_list);
-
-		btrfs_kill_all_delayed_nodes(root);
-
-		if (btrfs_header_backref_rev(root->node) <
-		    BTRFS_MIXED_BACKREF_REV)
-			btrfs_drop_snapshot(root, NULL, 0);
-		else
-			btrfs_drop_snapshot(root, NULL, 1);
-	}
-	return 0;
+	pr_debug("btrfs: cleaner removing %llu\n", root->objectid);
+
+	btrfs_kill_all_delayed_nodes(root);
+
+	if (btrfs_header_backref_rev(root->node) <
+			BTRFS_MIXED_BACKREF_REV)
+		ret = btrfs_drop_snapshot(root, NULL, 0, 0);
+	else
+		ret = btrfs_drop_snapshot(root, NULL, 1, 0);
+	/*
+	 * If we encounter a transaction abort during snapshot cleaning, we
+	 * don't want to crash here
+	 */
+	return (ret < 0) ? 0 : 1;
 }
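
The rewritten btrfs_clean_one_deleted_snapshot() above returns 0 when fs_info->dead_roots is empty and 1 when another dead root may still be pending; its caller is not part of these hunks. A minimal sketch of a loop that drains the list under that convention (the function name below is hypothetical, not taken from this patch):

static void example_drain_dead_roots(struct btrfs_root *tree_root)
{
	int ret;

	do {
		/*
		 * 1: another dead root may still be queued, 0: list empty.
		 * Errors are folded into the "stop" case by the callee so a
		 * transaction abort during cleanup does not crash this path.
		 */
		ret = btrfs_clean_one_deleted_snapshot(tree_root);
	} while (ret > 0);
}
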
diff -ur a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
--- a/fs/btrfs/transaction.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/transaction.h	2014-02-17 11:56:58.000000000 +0100
@@ -20,34 +20,68 @@
 #define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
 #include "delayed-ref.h"
+#include "ctree.h"
+
+enum btrfs_trans_state {
+	TRANS_STATE_RUNNING		= 0,
+	TRANS_STATE_BLOCKED		= 1,
+	TRANS_STATE_COMMIT_START	= 2,
+	TRANS_STATE_COMMIT_DOING	= 3,
+	TRANS_STATE_UNBLOCKED		= 4,
+	TRANS_STATE_COMPLETED		= 5,
+	TRANS_STATE_MAX			= 6,
+};
 
 struct btrfs_transaction {
 	u64 transid;
 	/*
+	 * total external writers(USERSPACE/START/ATTACH) in this
+	 * transaction, it must be zero before the transaction is
+	 * being committed
+	 */
+	atomic_t num_extwriters;
+	/*
 	 * total writers in this transaction, it must be zero before the
 	 * transaction can end
 	 */
 	atomic_t num_writers;
 	atomic_t use_count;
 
-	unsigned long num_joined;
-
-	spinlock_t commit_lock;
-	int in_commit;
-	int commit_done;
-	int blocked;
+	/* Protected by fs_info->trans_lock when we want to change it. */
+	enum btrfs_trans_state state;
 	struct list_head list;
 	struct extent_io_tree dirty_pages;
 	unsigned long start_time;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 	struct list_head pending_snapshots;
+	struct list_head ordered_operations;
+	struct list_head pending_chunks;
 	struct btrfs_delayed_ref_root delayed_refs;
+	int aborted;
 };
 
+#define __TRANS_FREEZABLE	(1U << 0)
+
+#define __TRANS_USERSPACE	(1U << 8)
+#define __TRANS_START		(1U << 9)
+#define __TRANS_ATTACH		(1U << 10)
+#define __TRANS_JOIN		(1U << 11)
+#define __TRANS_JOIN_NOLOCK	(1U << 12)
+
+#define TRANS_USERSPACE		(__TRANS_USERSPACE | __TRANS_FREEZABLE)
+#define TRANS_START		(__TRANS_START | __TRANS_FREEZABLE)
+#define TRANS_ATTACH		(__TRANS_ATTACH)
+#define TRANS_JOIN		(__TRANS_JOIN | __TRANS_FREEZABLE)
+#define TRANS_JOIN_NOLOCK	(__TRANS_JOIN_NOLOCK)
+
+#define TRANS_EXTWRITERS	(__TRANS_USERSPACE | __TRANS_START |	\
+				 __TRANS_ATTACH)
+
 struct btrfs_trans_handle {
 	u64 transid;
 	u64 bytes_reserved;
+	u64 qgroup_reserved;
 	unsigned long use_count;
 	unsigned long blocks_reserved;
 	unsigned long blocks_used;
@@ -55,14 +89,30 @@
 	struct btrfs_transaction *transaction;
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_block_rsv *orig_rsv;
+	short aborted;
+	short adding_csums;
+	bool allocating_chunk;
+	unsigned int type;
+	/*
+	 * this root is only needed to validate that the root passed to
+	 * start_transaction is the same as the one passed to end_transaction.
+	 * Subvolume quota depends on this
+	 */
+	struct btrfs_root *root;
+	struct seq_list delayed_ref_elem;
+	struct list_head qgroup_ref_list;
+	struct list_head new_bgs;
 };
 
 struct btrfs_pending_snapshot {
 	struct dentry *dentry;
+	struct inode *dir;
 	struct btrfs_root *root;
 	struct btrfs_root *snap;
+	struct btrfs_qgroup_inherit *inherit;
 	/* block reservation for the operation */
 	struct btrfs_block_rsv block_rsv;
+	u64 qgroup_reserved;
 	/* extra metadata reseration for relocation */
 	int error;
 	bool readonly;
@@ -74,24 +124,28 @@
 {
 	BTRFS_I(inode)->last_trans = trans->transaction->transid;
 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
 }
 
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root);
-int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
 						   int num_items);
+struct btrfs_trans_handle *btrfs_start_transaction_lflush(
+					struct btrfs_root *root, int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root);
+struct btrfs_trans_handle *btrfs_attach_transaction_barrier(
+					struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
 int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root);
 
-int btrfs_add_dead_root(struct btrfs_root *root);
-int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
-int btrfs_clean_old_snapshots(struct btrfs_root *root);
+void btrfs_add_dead_root(struct btrfs_root *root);
+int btrfs_defrag_root(struct btrfs_root *root);
+int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
 int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
@@ -106,8 +160,6 @@
 void btrfs_throttle(struct btrfs_root *root);
 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
-				struct extent_io_tree *dirty_pages, int mark);
 int btrfs_write_marked_extents(struct btrfs_root *root,
 				struct extent_io_tree *dirty_pages, int mark);
 int btrfs_wait_marked_extents(struct btrfs_root *root,
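
The commit path earlier in this patch uses extwriter_counter_dec()/extwriter_counter_read() and is_transaction_blocked(), whose bodies are not included in these hunks. Given the num_extwriters field, the TRANS_EXTWRITERS mask and the TRANS_STATE_* enum added above, plausible sketches (reconstructions, not text from this patch) look like:

static inline void extwriter_counter_inc(struct btrfs_transaction *trans,
					 unsigned int type)
{
	/* only USERSPACE/START/ATTACH handles count as external writers */
	if (type & TRANS_EXTWRITERS)
		atomic_inc(&trans->num_extwriters);
}

static inline void extwriter_counter_dec(struct btrfs_transaction *trans,
					 unsigned int type)
{
	if (type & TRANS_EXTWRITERS)
		atomic_dec(&trans->num_extwriters);
}

static inline int extwriter_counter_read(struct btrfs_transaction *trans)
{
	return atomic_read(&trans->num_extwriters);
}

static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
	/* blocked covers BLOCKED..COMMIT_DOING, i.e. anything before UNBLOCKED */
	return (trans->state >= TRANS_STATE_BLOCKED &&
		trans->state < TRANS_STATE_UNBLOCKED &&
		!trans->aborted);
}
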
diff -ur a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
--- a/fs/btrfs/tree-defrag.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/tree-defrag.c	2014-02-17 11:56:58.000000000 +0100
@@ -23,13 +23,14 @@
 #include "transaction.h"
 #include "locking.h"
 
-/* defrag all the leaves in a given btree.  If cache_only == 1, don't read
- * things from disk, otherwise read all the leaves and try to get key order to
+/*
+ * Defrag all the leaves in a given btree.
+ * Read all the leaves and try to get key order to
  * better reflect disk order
  */
 
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, int cache_only)
+			struct btrfs_root *root)
 {
 	struct btrfs_path *path = NULL;
 	struct btrfs_key key;
@@ -41,9 +42,6 @@
 	u64 last_ret = 0;
 	u64 min_trans = 0;
 
-	if (cache_only)
-		goto out;
-
 	if (root->fs_info->extent_root == root) {
 		/*
 		 * there's recursion here right now in the tree locking,
@@ -86,11 +84,8 @@
 	}
 
 	path->keep_locks = 1;
-	if (cache_only)
-		min_trans = root->defrag_trans_start;
 
-	ret = btrfs_search_forward(root, &key, NULL, path,
-				   cache_only, min_trans);
+	ret = btrfs_search_forward(root, &key, NULL, path, min_trans);
 	if (ret < 0)
 		goto out;
 	if (ret > 0) {
@@ -109,11 +104,11 @@
 		goto out;
 	}
 	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
-	next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only,
+	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
 					   min_trans);
 	ret = btrfs_realloc_node(trans, root,
 				 path->nodes[1], 0,
-				 cache_only, &last_ret,
+				 &last_ret,
 				 &root->defrag_progress);
 	if (ret) {
 		WARN_ON(ret == -EAGAIN);
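
With the cache_only flag dropped, btrfs_defrag_leaves() now takes only the transaction and the root and always reads the leaves from disk. A hypothetical caller under the new prototype (the wrapper name is illustrative, not part of this patch):

static int example_defrag_subvolume(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root)
{
	/*
	 * the old cache_only argument is gone; defrag no longer restricts
	 * itself to already-cached blocks
	 */
	return btrfs_defrag_leaves(trans, root);
}
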
diff -ur a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
--- a/fs/btrfs/tree-log.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/tree-log.c	2014-02-17 11:56:58.000000000 +0100
@@ -18,13 +18,17 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/list_sort.h>
 #include "ctree.h"
 #include "transaction.h"
 #include "disk-io.h"
 #include "locking.h"
 #include "print-tree.h"
+#include "backref.h"
 #include "compat.h"
 #include "tree-log.h"
+#include "hash.h"
 
 /* magic values for the inode_only field in btrfs_log_inode:
  *
@@ -89,7 +93,8 @@
  */
 #define LOG_WALK_PIN_ONLY 0
 #define LOG_WALK_REPLAY_INODES 1
-#define LOG_WALK_REPLAY_ALL 2
+#define LOG_WALK_REPLAY_DIR_INDEX 2
+#define LOG_WALK_REPLAY_ALL 3
 
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, struct inode *inode,
@@ -146,7 +151,7 @@
 			root->log_multiple_pids = true;
 		}
 
-		root->log_batch++;
+		atomic_inc(&root->log_batch);
 		atomic_inc(&root->log_writers);
 		mutex_unlock(&root->log_mutex);
 		return 0;
@@ -165,7 +170,7 @@
 			err = ret;
 	}
 	mutex_unlock(&root->fs_info->tree_log_mutex);
-	root->log_batch++;
+	atomic_inc(&root->log_batch);
 	atomic_inc(&root->log_writers);
 	mutex_unlock(&root->log_mutex);
 	return err;
@@ -212,14 +217,13 @@
  * indicate we're done making changes to the log tree
  * and wake up anyone waiting to do a sync
  */
-int btrfs_end_log_trans(struct btrfs_root *root)
+void btrfs_end_log_trans(struct btrfs_root *root)
 {
 	if (atomic_dec_and_test(&root->log_writers)) {
 		smp_mb();
 		if (waitqueue_active(&root->log_writer_wait))
 			wake_up(&root->log_writer_wait);
 	}
-	return 0;
 }
 
 
@@ -275,18 +279,31 @@
 			      struct extent_buffer *eb,
 			      struct walk_control *wc, u64 gen)
 {
+	int ret = 0;
+
+	/*
+	 * If this fs is mixed then we need to be able to process the leaves to
+	 * pin down any logged extents, so we have to read the block.
+	 */
+	if (btrfs_fs_incompat(log->fs_info, MIXED_GROUPS)) {
+		ret = btrfs_read_buffer(eb, gen);
+		if (ret)
+			return ret;
+	}
+
 	if (wc->pin)
-		btrfs_pin_extent_for_log_replay(wc->trans,
-						log->fs_info->extent_root,
-						eb->start, eb->len);
+		ret = btrfs_pin_extent_for_log_replay(log->fs_info->extent_root,
+						      eb->start, eb->len);
 
-	if (btrfs_buffer_uptodate(eb, gen)) {
+	if (!ret && btrfs_buffer_uptodate(eb, gen, 0)) {
+		if (wc->pin && btrfs_header_level(eb) == 0)
+			ret = btrfs_exclude_logged_extents(log, eb);
 		if (wc->write)
 			btrfs_write_tree_block(eb);
 		if (wc->wait)
 			btrfs_wait_tree_block_writeback(eb);
 	}
-	return 0;
+	return ret;
 }
 
 /*
@@ -316,6 +333,7 @@
 	unsigned long src_ptr;
 	unsigned long dst_ptr;
 	int overwrite_root = 0;
+	bool inode_item = key->type == BTRFS_INODE_ITEM_KEY;
 
 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 		overwrite_root = 1;
@@ -325,6 +343,9 @@
 
 	/* look for the key in the destination tree */
 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
 	if (ret == 0) {
 		char *src_copy;
 		char *dst_copy;
@@ -366,6 +387,50 @@
 			return 0;
 		}
 
+		/*
+		 * We need to load the old nbytes into the inode so when we
+		 * replay the extents we've logged we get the right nbytes.
+		 */
+		if (inode_item) {
+			struct btrfs_inode_item *item;
+			u64 nbytes;
+			u32 mode;
+
+			item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+					      struct btrfs_inode_item);
+			nbytes = btrfs_inode_nbytes(path->nodes[0], item);
+			item = btrfs_item_ptr(eb, slot,
+					      struct btrfs_inode_item);
+			btrfs_set_inode_nbytes(eb, item, nbytes);
+
+			/*
+			 * If this is a directory we need to reset the i_size to
+			 * 0 so that we can set it up properly when replaying
+			 * the rest of the items in this log.
+			 */
+			mode = btrfs_inode_mode(eb, item);
+			if (S_ISDIR(mode))
+				btrfs_set_inode_size(eb, item, 0);
+		}
+	} else if (inode_item) {
+		struct btrfs_inode_item *item;
+		u32 mode;
+
+		/*
+		 * New inode, set nbytes to 0 so that the nbytes comes out
+		 * properly when we replay the extents.
+		 */
+		item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item);
+		btrfs_set_inode_nbytes(eb, item, 0);
+
+		/*
+		 * If this is a directory we need to reset the i_size to 0 so
+		 * that we can set it up properly when replaying the rest of
+		 * the items in this log.
+		 */
+		mode = btrfs_inode_mode(eb, item);
+		if (S_ISDIR(mode))
+			btrfs_set_inode_size(eb, item, 0);
 	}
 insert:
 	btrfs_release_path(path);
@@ -378,12 +443,11 @@
 		u32 found_size;
 		found_size = btrfs_item_size_nr(path->nodes[0],
 						path->slots[0]);
-		if (found_size > item_size) {
-			btrfs_truncate_item(trans, root, path, item_size, 1);
-		} else if (found_size < item_size) {
-			ret = btrfs_extend_item(trans, root, path,
-						item_size - found_size);
-		}
+		if (found_size > item_size)
+			btrfs_truncate_item(root, path, item_size, 1);
+		else if (found_size < item_size)
+			btrfs_extend_item(root, path,
+					  item_size - found_size);
 	} else if (ret) {
 		return ret;
 	}
@@ -484,11 +548,9 @@
 				      struct btrfs_key *key)
 {
 	int found_type;
-	u64 mask = root->sectorsize - 1;
 	u64 extent_end;
-	u64 alloc_hint;
 	u64 start = key->offset;
-	u64 saved_nbytes;
+	u64 nbytes = 0;
 	struct btrfs_file_extent_item *item;
 	struct inode *inode = NULL;
 	unsigned long size;
@@ -498,11 +560,20 @@
 	found_type = btrfs_file_extent_type(eb, item);
 
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
-	    found_type == BTRFS_FILE_EXTENT_PREALLOC)
-		extent_end = start + btrfs_file_extent_num_bytes(eb, item);
-	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
+	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+		nbytes = btrfs_file_extent_num_bytes(eb, item);
+		extent_end = start + nbytes;
+
+		/*
+		 * We don't add to the inode's nbytes if we are prealloc or a
+		 * hole.
+		 */
+		if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
+			nbytes = 0;
+	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		size = btrfs_file_extent_inline_len(eb, item);
-		extent_end = (start + size + mask) & ~mask;
+		nbytes = btrfs_file_extent_ram_bytes(eb, item);
+		extent_end = ALIGN(start + size, root->sectorsize);
 	} else {
 		ret = 0;
 		goto out;
@@ -550,11 +621,10 @@
 	}
 	btrfs_release_path(path);
 
-	saved_nbytes = inode_get_bytes(inode);
 	/* drop any overlapping extents */
-	ret = btrfs_drop_extents(trans, inode, start, extent_end,
-				 &alloc_hint, 1);
-	BUG_ON(ret);
+	ret = btrfs_drop_extents(trans, root, inode, start, extent_end, 1);
+	if (ret)
+		goto out;
 
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -564,7 +634,8 @@
 
 		ret = btrfs_insert_empty_item(trans, root, path, key,
 					      sizeof(*item));
-		BUG_ON(ret);
+		if (ret)
+			goto out;
 		dest_offset = btrfs_item_ptr_offset(path->nodes[0],
 						    path->slots[0]);
 		copy_extent_buffer(path->nodes[0], eb, dest_offset,
@@ -589,8 +660,9 @@
 				ret = btrfs_inc_extent_ref(trans, root,
 						ins.objectid, ins.offset,
 						0, root->root_key.objectid,
-						key->objectid, offset);
-				BUG_ON(ret);
+						key->objectid, offset, 0);
+				if (ret)
+					goto out;
 			} else {
 				/*
 				 * insert the extent pointer in the extent
@@ -599,7 +671,8 @@
 				ret = btrfs_alloc_logged_file_extent(trans,
 						root, root->root_key.objectid,
 						key->objectid, offset, &ins);
-				BUG_ON(ret);
+				if (ret)
+					goto out;
 			}
 			btrfs_release_path(path);
 
@@ -616,30 +689,34 @@
 			ret = btrfs_lookup_csums_range(root->log_root,
 						csum_start, csum_end - 1,
 						&ordered_sums, 0);
-			BUG_ON(ret);
+			if (ret)
+				goto out;
 			while (!list_empty(&ordered_sums)) {
 				struct btrfs_ordered_sum *sums;
 				sums = list_entry(ordered_sums.next,
 						struct btrfs_ordered_sum,
 						list);
-				ret = btrfs_csum_file_blocks(trans,
+				if (!ret)
+					ret = btrfs_csum_file_blocks(trans,
 						root->fs_info->csum_root,
 						sums);
-				BUG_ON(ret);
 				list_del(&sums->list);
 				kfree(sums);
 			}
+			if (ret)
+				goto out;
 		} else {
 			btrfs_release_path(path);
 		}
 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
 		/* inline extents are easy, we just overwrite them */
 		ret = overwrite_item(trans, root, path, eb, slot, key);
-		BUG_ON(ret);
+		if (ret)
+			goto out;
 	}
 
-	inode_set_bytes(inode, saved_nbytes);
-	btrfs_update_inode(trans, root, inode);
+	inode_add_bytes(inode, nbytes);
+	ret = btrfs_update_inode(trans, root, inode);
 out:
 	if (inode)
 		iput(inode);
@@ -680,20 +757,22 @@
 
 	inode = read_one_inode(root, location.objectid);
 	if (!inode) {
-		kfree(name);
-		return -EIO;
+		ret = -EIO;
+		goto out;
 	}
 
 	ret = link_to_fixup_dir(trans, root, path, location.objectid);
-	BUG_ON(ret);
+	if (ret)
+		goto out;
 
 	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
-	BUG_ON(ret);
+	if (ret)
+		goto out;
+	else
+		ret = btrfs_run_delayed_items(trans, root);
+out:
 	kfree(name);
-
 	iput(inode);
-
-	btrfs_run_delayed_items(trans, root);
 	return ret;
 }
 
@@ -746,6 +825,7 @@
  */
 static noinline int backref_in_log(struct btrfs_root *log,
 				   struct btrfs_key *key,
+				   u64 ref_objectid,
 				   char *name, int namelen)
 {
 	struct btrfs_path *path;
@@ -766,8 +846,17 @@
 	if (ret != 0)
 		goto out;
 
-	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
+
+	if (key->type == BTRFS_INODE_EXTREF_KEY) {
+		if (btrfs_find_name_in_ext_backref(path, ref_objectid,
+						   name, namelen, NULL))
+			match = 1;
+
+		goto out;
+	}
+
+	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
 	ptr_end = ptr + item_size;
 	while (ptr < ptr_end) {
 		ref = (struct btrfs_inode_ref *)ptr;
@@ -788,91 +877,42 @@
 	return match;
 }
 
-
-/*
- * replay one inode back reference item found in the log tree.
- * eb, slot and key refer to the buffer and key found in the log tree.
- * root is the destination we are replaying into, and path is for temp
- * use by this function.  (it should be released on return).
- */
-static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+static inline int __add_inode_ref(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
-				  struct btrfs_root *log,
 				  struct btrfs_path *path,
-				  struct extent_buffer *eb, int slot,
-				  struct btrfs_key *key)
+				  struct btrfs_root *log_root,
+				  struct inode *dir, struct inode *inode,
+				  struct extent_buffer *eb,
+				  u64 inode_objectid, u64 parent_objectid,
+				  u64 ref_index, char *name, int namelen,
+				  int *search_done)
 {
-	struct btrfs_inode_ref *ref;
-	struct btrfs_dir_item *di;
-	struct inode *dir;
-	struct inode *inode;
-	unsigned long ref_ptr;
-	unsigned long ref_end;
-	char *name;
-	int namelen;
 	int ret;
-	int search_done = 0;
-
-	/*
-	 * it is possible that we didn't log all the parent directories
-	 * for a given inode.  If we don't find the dir, just don't
-	 * copy the back ref in.  The link count fixup code will take
-	 * care of the rest
-	 */
-	dir = read_one_inode(root, key->offset);
-	if (!dir)
-		return -ENOENT;
-
-	inode = read_one_inode(root, key->objectid);
-	if (!inode) {
-		iput(dir);
-		return -EIO;
-	}
-
-	ref_ptr = btrfs_item_ptr_offset(eb, slot);
-	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+	char *victim_name;
+	int victim_name_len;
+	struct extent_buffer *leaf;
+	struct btrfs_dir_item *di;
+	struct btrfs_key search_key;
+	struct btrfs_inode_extref *extref;
 
 again:
-	ref = (struct btrfs_inode_ref *)ref_ptr;
-
-	namelen = btrfs_inode_ref_name_len(eb, ref);
-	name = kmalloc(namelen, GFP_NOFS);
-	BUG_ON(!name);
-
-	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
-
-	/* if we already have a perfect match, we're done */
-	if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
-			 btrfs_inode_ref_index(eb, ref),
-			 name, namelen)) {
-		goto out;
-	}
-
-	/*
-	 * look for a conflicting back reference in the metadata.
-	 * if we find one we have to unlink that name of the file
-	 * before we add our new link.  Later on, we overwrite any
-	 * existing back reference, and we don't want to create
-	 * dangling pointers in the directory.
-	 */
-
-	if (search_done)
-		goto insert;
-
-	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	/* Search old style refs */
+	search_key.objectid = inode_objectid;
+	search_key.type = BTRFS_INODE_REF_KEY;
+	search_key.offset = parent_objectid;
+	ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 	if (ret == 0) {
-		char *victim_name;
-		int victim_name_len;
 		struct btrfs_inode_ref *victim_ref;
 		unsigned long ptr;
 		unsigned long ptr_end;
-		struct extent_buffer *leaf = path->nodes[0];
+
+		leaf = path->nodes[0];
 
 		/* are we trying to overwrite a back ref for the root directory
 		 * if so, just jump out, we're done
 		 */
-		if (key->objectid == key->offset)
-			goto out_nowrite;
+		if (search_key.objectid == search_key.offset)
+			return 1;
 
 		/* check all the names in this back reference to see
 		 * if they are in the log.  if so, we allow them to stay
@@ -885,13 +925,16 @@
 			victim_name_len = btrfs_inode_ref_name_len(leaf,
 								   victim_ref);
 			victim_name = kmalloc(victim_name_len, GFP_NOFS);
-			BUG_ON(!victim_name);
+			if (!victim_name)
+				return -ENOMEM;
 
 			read_extent_buffer(leaf, victim_name,
 					   (unsigned long)(victim_ref + 1),
 					   victim_name_len);
 
-			if (!backref_in_log(log, key, victim_name,
+			if (!backref_in_log(log_root, &search_key,
+					    parent_objectid,
+					    victim_name,
 					    victim_name_len)) {
 				btrfs_inc_nlink(inode);
 				btrfs_release_path(path);
@@ -899,28 +942,106 @@
 				ret = btrfs_unlink_inode(trans, root, dir,
 							 inode, victim_name,
 							 victim_name_len);
-				btrfs_run_delayed_items(trans, root);
+				kfree(victim_name);
+				if (ret)
+					return ret;
+				ret = btrfs_run_delayed_items(trans, root);
+				if (ret)
+					return ret;
+				*search_done = 1;
+				goto again;
 			}
 			kfree(victim_name);
+
 			ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
 		}
-		BUG_ON(ret);
 
 		/*
 		 * NOTE: we have searched root tree and checked the
 		 * coresponding ref, it does not need to check again.
 		 */
-		search_done = 1;
+		*search_done = 1;
+	}
+	btrfs_release_path(path);
+
+	/* Same search but for extended refs */
+	extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
+					   inode_objectid, parent_objectid, 0,
+					   0);
+	if (!IS_ERR_OR_NULL(extref)) {
+		u32 item_size;
+		u32 cur_offset = 0;
+		unsigned long base;
+		struct inode *victim_parent;
+
+		leaf = path->nodes[0];
+
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		base = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+		while (cur_offset < item_size) {
+			extref = (struct btrfs_inode_extref *)base + cur_offset;
+
+			victim_name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+			if (btrfs_inode_extref_parent(leaf, extref) != parent_objectid)
+				goto next;
+
+			victim_name = kmalloc(victim_name_len, GFP_NOFS);
+			if (!victim_name)
+				return -ENOMEM;
+			read_extent_buffer(leaf, victim_name, (unsigned long)&extref->name,
+					   victim_name_len);
+
+			search_key.objectid = inode_objectid;
+			search_key.type = BTRFS_INODE_EXTREF_KEY;
+			search_key.offset = btrfs_extref_hash(parent_objectid,
+							      victim_name,
+							      victim_name_len);
+			ret = 0;
+			if (!backref_in_log(log_root, &search_key,
+					    parent_objectid, victim_name,
+					    victim_name_len)) {
+				ret = -ENOENT;
+				victim_parent = read_one_inode(root,
+							       parent_objectid);
+				if (victim_parent) {
+					btrfs_inc_nlink(inode);
+					btrfs_release_path(path);
+
+					ret = btrfs_unlink_inode(trans, root,
+								 victim_parent,
+								 inode,
+								 victim_name,
+								 victim_name_len);
+					if (!ret)
+						ret = btrfs_run_delayed_items(
+								  trans, root);
+				}
+				iput(victim_parent);
+				kfree(victim_name);
+				if (ret)
+					return ret;
+				*search_done = 1;
+				goto again;
+			}
+			kfree(victim_name);
+			if (ret)
+				return ret;
+next:
+			cur_offset += victim_name_len + sizeof(*extref);
+		}
+		*search_done = 1;
 	}
 	btrfs_release_path(path);
 
 	/* look for a conflicting sequence number */
 	di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
-					 btrfs_inode_ref_index(eb, ref),
-					 name, namelen, 0);
+					 ref_index, name, namelen, 0);
 	if (di && !IS_ERR(di)) {
 		ret = drop_one_dir_item(trans, root, path, dir, di);
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 	}
 	btrfs_release_path(path);
 
@@ -929,33 +1050,184 @@
 				   name, namelen, 0);
 	if (di && !IS_ERR(di)) {
 		ret = drop_one_dir_item(trans, root, path, dir, di);
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 	}
 	btrfs_release_path(path);
 
-insert:
-	/* insert our name */
-	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
-			     btrfs_inode_ref_index(eb, ref));
-	BUG_ON(ret);
+	return 0;
+}
 
-	btrfs_update_inode(trans, root, inode);
+static int extref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+			     u32 *namelen, char **name, u64 *index,
+			     u64 *parent_objectid)
+{
+	struct btrfs_inode_extref *extref;
 
-out:
-	ref_ptr = (unsigned long)(ref + 1) + namelen;
-	kfree(name);
-	if (ref_ptr < ref_end)
-		goto again;
+	extref = (struct btrfs_inode_extref *)ref_ptr;
+
+	*namelen = btrfs_inode_extref_name_len(eb, extref);
+	*name = kmalloc(*namelen, GFP_NOFS);
+	if (*name == NULL)
+		return -ENOMEM;
+
+	read_extent_buffer(eb, *name, (unsigned long)&extref->name,
+			   *namelen);
+
+	*index = btrfs_inode_extref_index(eb, extref);
+	if (parent_objectid)
+		*parent_objectid = btrfs_inode_extref_parent(eb, extref);
+
+	return 0;
+}
+
+static int ref_get_fields(struct extent_buffer *eb, unsigned long ref_ptr,
+			  u32 *namelen, char **name, u64 *index)
+{
+	struct btrfs_inode_ref *ref;
+
+	ref = (struct btrfs_inode_ref *)ref_ptr;
+
+	*namelen = btrfs_inode_ref_name_len(eb, ref);
+	*name = kmalloc(*namelen, GFP_NOFS);
+	if (*name == NULL)
+		return -ENOMEM;
+
+	read_extent_buffer(eb, *name, (unsigned long)(ref + 1), *namelen);
+
+	*index = btrfs_inode_ref_index(eb, ref);
+
+	return 0;
+}
+
+/*
+ * replay one inode back reference item found in the log tree.
+ * eb, slot and key refer to the buffer and key found in the log tree.
+ * root is the destination we are replaying into, and path is for temp
+ * use by this function.  (it should be released on return).
+ */
+static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_root *log,
+				  struct btrfs_path *path,
+				  struct extent_buffer *eb, int slot,
+				  struct btrfs_key *key)
+{
+	struct inode *dir;
+	struct inode *inode;
+	unsigned long ref_ptr;
+	unsigned long ref_end;
+	char *name;
+	int namelen;
+	int ret;
+	int search_done = 0;
+	int log_ref_ver = 0;
+	u64 parent_objectid;
+	u64 inode_objectid;
+	u64 ref_index = 0;
+	int ref_struct_size;
+
+	ref_ptr = btrfs_item_ptr_offset(eb, slot);
+	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
+
+	if (key->type == BTRFS_INODE_EXTREF_KEY) {
+		struct btrfs_inode_extref *r;
+
+		ref_struct_size = sizeof(struct btrfs_inode_extref);
+		log_ref_ver = 1;
+		r = (struct btrfs_inode_extref *)ref_ptr;
+		parent_objectid = btrfs_inode_extref_parent(eb, r);
+	} else {
+		ref_struct_size = sizeof(struct btrfs_inode_ref);
+		parent_objectid = key->offset;
+	}
+	inode_objectid = key->objectid;
+
+	/*
+	 * it is possible that we didn't log all the parent directories
+	 * for a given inode.  If we don't find the dir, just don't
+	 * copy the back ref in.  The link count fixup code will take
+	 * care of the rest
+	 */
+	dir = read_one_inode(root, parent_objectid);
+	if (!dir)
+		return -ENOENT;
+
+	inode = read_one_inode(root, inode_objectid);
+	if (!inode) {
+		iput(dir);
+		return -EIO;
+	}
+
+	while (ref_ptr < ref_end) {
+		if (log_ref_ver) {
+			ret = extref_get_fields(eb, ref_ptr, &namelen, &name,
+						&ref_index, &parent_objectid);
+			/*
+			 * parent object can change from one array
+			 * item to another.
+			 */
+			if (!dir)
+				dir = read_one_inode(root, parent_objectid);
+			if (!dir)
+				return -ENOENT;
+		} else {
+			ret = ref_get_fields(eb, ref_ptr, &namelen, &name,
+					     &ref_index);
+		}
+		if (ret)
+			return ret;
+
+		/* if we already have a perfect match, we're done */
+		if (!inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
+				  ref_index, name, namelen)) {
+			/*
+			 * look for a conflicting back reference in the
+			 * metadata. if we find one we have to unlink that name
+			 * of the file before we add our new link.  Later on, we
+			 * overwrite any existing back reference, and we don't
+			 * want to create dangling pointers in the directory.
+			 */
+
+			if (!search_done) {
+				ret = __add_inode_ref(trans, root, path, log,
+						      dir, inode, eb,
+						      inode_objectid,
+						      parent_objectid,
+						      ref_index, name, namelen,
+						      &search_done);
+				if (ret == 1) {
+					ret = 0;
+					goto out;
+				}
+				if (ret)
+					goto out;
+			}
+
+			/* insert our name */
+			ret = btrfs_add_link(trans, dir, inode, name, namelen,
+					     0, ref_index);
+			if (ret)
+				goto out;
+
+			btrfs_update_inode(trans, root, inode);
+		}
+
+		ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
+		kfree(name);
+		if (log_ref_ver) {
+			iput(dir);
+			dir = NULL;
+		}
+	}
 
 	/* finally write the back reference in the inode */
 	ret = overwrite_item(trans, root, path, eb, slot, key);
-	BUG_ON(ret);
-
-out_nowrite:
+out:
 	btrfs_release_path(path);
 	iput(dir);
 	iput(inode);
-	return 0;
+	return ret;
 }
 
 static int insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -968,25 +1240,55 @@
 	return ret;
 }
 
+static int count_inode_extrefs(struct btrfs_root *root,
+			       struct inode *inode, struct btrfs_path *path)
+{
+	int ret = 0;
+	int name_len;
+	unsigned int nlink = 0;
+	u32 item_size;
+	u32 cur_offset = 0;
+	u64 inode_objectid = btrfs_ino(inode);
+	u64 offset = 0;
+	unsigned long ptr;
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
 
-/*
- * There are a few corners where the link count of the file can't
- * be properly maintained during replay.  So, instead of adding
- * lots of complexity to the log code, we just scan the backrefs
- * for any file that has been through replay.
- *
- * The scan will update the link count on the inode to reflect the
- * number of back refs found.  If it goes down to zero, the iput
- * will free the inode.
- */
-static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
-					   struct btrfs_root *root,
-					   struct inode *inode)
+	while (1) {
+		ret = btrfs_find_one_extref(root, inode_objectid, offset, path,
+					    &extref, &offset);
+		if (ret)
+			break;
+
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+
+		while (cur_offset < item_size) {
+			extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
+			name_len = btrfs_inode_extref_name_len(leaf, extref);
+
+			nlink++;
+
+			cur_offset += name_len + sizeof(*extref);
+		}
+
+		offset++;
+		btrfs_release_path(path);
+	}
+	btrfs_release_path(path);
+
+	if (ret < 0)
+		return ret;
+	return nlink;
+}
+
+static int count_inode_refs(struct btrfs_root *root,
+			       struct inode *inode, struct btrfs_path *path)
 {
-	struct btrfs_path *path;
 	int ret;
 	struct btrfs_key key;
-	u64 nlink = 0;
+	unsigned int nlink = 0;
 	unsigned long ptr;
 	unsigned long ptr_end;
 	int name_len;
@@ -996,10 +1298,6 @@
 	key.type = BTRFS_INODE_REF_KEY;
 	key.offset = (u64)-1;
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
 	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
@@ -1033,6 +1331,50 @@
 		btrfs_release_path(path);
 	}
 	btrfs_release_path(path);
+
+	return nlink;
+}
+
+/*
+ * There are a few corners where the link count of the file can't
+ * be properly maintained during replay.  So, instead of adding
+ * lots of complexity to the log code, we just scan the backrefs
+ * for any file that has been through replay.
+ *
+ * The scan will update the link count on the inode to reflect the
+ * number of back refs found.  If it goes down to zero, the iput
+ * will free the inode.
+ */
+static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct inode *inode)
+{
+	struct btrfs_path *path;
+	int ret;
+	u64 nlink = 0;
+	u64 ino = btrfs_ino(inode);
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = count_inode_refs(root, inode, path);
+	if (ret < 0)
+		goto out;
+
+	nlink = ret;
+
+	ret = count_inode_extrefs(root, inode, path);
+	if (ret == -ENOENT)
+		ret = 0;
+
+	if (ret < 0)
+		goto out;
+
+	nlink += ret;
+
+	ret = 0;
+
 	if (nlink != inode->i_nlink) {
 		set_nlink(inode, nlink);
 		btrfs_update_inode(trans, root, inode);
@@ -1043,14 +1385,15 @@
 		if (S_ISDIR(inode->i_mode)) {
 			ret = replay_dir_deletes(trans, root, NULL, path,
 						 ino, 1);
-			BUG_ON(ret);
+			if (ret)
+				goto out;
 		}
 		ret = insert_orphan_item(trans, root, ino);
-		BUG_ON(ret);
 	}
-	btrfs_free_path(path);
 
-	return 0;
+out:
+	btrfs_free_path(path);
+	return ret;
 }
 
 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
@@ -1090,9 +1433,9 @@
 			return -EIO;
 
 		ret = fixup_inode_link_count(trans, root, inode);
-		BUG_ON(ret);
-
 		iput(inode);
+		if (ret)
+			goto out;
 
 		/*
 		 * fixup on a directory may create new entries,
@@ -1134,12 +1477,15 @@
 
 	btrfs_release_path(path);
 	if (ret == 0) {
-		btrfs_inc_nlink(inode);
-		btrfs_update_inode(trans, root, inode);
+		if (!inode->i_nlink)
+			set_nlink(inode, 1);
+		else
+			btrfs_inc_nlink(inode);
+		ret = btrfs_update_inode(trans, root, inode);
 	} else if (ret == -EEXIST) {
 		ret = 0;
 	} else {
-		BUG();
+		BUG(); /* Logic Error */
 	}
 	iput(inode);
 
@@ -1171,6 +1517,7 @@
 		iput(inode);
 		return -EIO;
 	}
+
 	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
 
 	/* FIXME, put inode into FIXUP list */
@@ -1208,7 +1555,8 @@
 	struct inode *dir;
 	u8 log_type;
 	int exists;
-	int ret;
+	int ret = 0;
+	bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
 
 	dir = read_one_inode(root, key->objectid);
 	if (!dir)
@@ -1216,8 +1564,10 @@
 
 	name_len = btrfs_dir_name_len(eb, di);
 	name = kmalloc(name_len, GFP_NOFS);
-	if (!name)
-		return -ENOMEM;
+	if (!name) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	log_type = btrfs_dir_type(eb, di);
 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
@@ -1240,7 +1590,9 @@
 						     key->offset, name,
 						     name_len, 1);
 	} else {
-		BUG();
+		/* Corruption */
+		ret = -EINVAL;
+		goto out;
 	}
 	if (IS_ERR_OR_NULL(dst_di)) {
 		/* we need a sequence number to insert, so we only
@@ -1268,22 +1620,29 @@
 		goto out;
 
 	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
-	BUG_ON(ret);
+	if (ret)
+		goto out;
 
 	if (key->type == BTRFS_DIR_INDEX_KEY)
 		goto insert;
 out:
 	btrfs_release_path(path);
+	if (!ret && update_size) {
+		btrfs_i_size_write(dir, dir->i_size + name_len * 2);
+		ret = btrfs_update_inode(trans, root, dir);
+	}
 	kfree(name);
 	iput(dir);
-	return 0;
+	return ret;
 
 insert:
 	btrfs_release_path(path);
 	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
 			      name, name_len, log_type, &log_key);
-
-	BUG_ON(ret && ret != -ENOENT);
+	if (ret && ret != -ENOENT)
+		goto out;
+	update_size = false;
+	ret = 0;
 	goto out;
 }
 
@@ -1314,7 +1673,8 @@
 			return -EIO;
 		name_len = btrfs_dir_name_len(eb, di);
 		ret = replay_one_name(trans, root, path, eb, di, key);
-		BUG_ON(ret);
+		if (ret)
+			return ret;
 		ptr = (unsigned long)(di + 1);
 		ptr += name_len;
 	}
@@ -1475,16 +1835,21 @@
 
 			ret = link_to_fixup_dir(trans, root,
 						path, location.objectid);
-			BUG_ON(ret);
+			if (ret) {
+				kfree(name);
+				iput(inode);
+				goto out;
+			}
+
 			btrfs_inc_nlink(inode);
 			ret = btrfs_unlink_inode(trans, root, dir, inode,
 						 name, name_len);
-			BUG_ON(ret);
-
-			btrfs_run_delayed_items(trans, root);
-
+			if (!ret)
+				ret = btrfs_run_delayed_items(trans, root);
 			kfree(name);
 			iput(inode);
+			if (ret)
+				goto out;
 
 			/* there might still be more names under this key
 			 * check and repeat if required
@@ -1588,7 +1953,8 @@
 			ret = check_item_in_log(trans, root, log, path,
 						log_path, dir,
 						&found_key);
-			BUG_ON(ret);
+			if (ret)
+				goto out;
 			if (found_key.offset == (u64)-1)
 				break;
 			dir_key.offset = found_key.offset + 1;
@@ -1636,7 +2002,9 @@
 	int i;
 	int ret;
 
-	btrfs_read_buffer(eb, gen);
+	ret = btrfs_read_buffer(eb, gen);
+	if (ret)
+		return ret;
 
 	level = btrfs_header_level(eb);
 
@@ -1663,11 +2031,13 @@
 			if (S_ISDIR(mode)) {
 				ret = replay_dir_deletes(wc->trans,
 					 root, log, path, key.objectid, 0);
-				BUG_ON(ret);
+				if (ret)
+					break;
 			}
 			ret = overwrite_item(wc->trans, root, path,
 					     eb, i, &key);
-			BUG_ON(ret);
+			if (ret)
+				break;
 
 			/* for regular files, make sure corresponding
 			 * orhpan item exist. extents past the new EOF
@@ -1676,13 +2046,24 @@
 			if (S_ISREG(mode)) {
 				ret = insert_orphan_item(wc->trans, root,
 							 key.objectid);
-				BUG_ON(ret);
+				if (ret)
+					break;
 			}
 
 			ret = link_to_fixup_dir(wc->trans, root,
 						path, key.objectid);
-			BUG_ON(ret);
+			if (ret)
+				break;
 		}
+
+		if (key.type == BTRFS_DIR_INDEX_KEY &&
+		    wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
+			ret = replay_one_dir_item(wc->trans, root, path,
+						  eb, i, &key);
+			if (ret)
+				break;
+		}
+
 		if (wc->stage < LOG_WALK_REPLAY_ALL)
 			continue;
 
@@ -1690,24 +2071,29 @@
 		if (key.type == BTRFS_XATTR_ITEM_KEY) {
 			ret = overwrite_item(wc->trans, root, path,
 					     eb, i, &key);
-			BUG_ON(ret);
-		} else if (key.type == BTRFS_INODE_REF_KEY) {
+			if (ret)
+				break;
+		} else if (key.type == BTRFS_INODE_REF_KEY ||
+			   key.type == BTRFS_INODE_EXTREF_KEY) {
 			ret = add_inode_ref(wc->trans, root, log, path,
 					    eb, i, &key);
-			BUG_ON(ret && ret != -ENOENT);
+			if (ret && ret != -ENOENT)
+				break;
+			ret = 0;
 		} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
 			ret = replay_one_extent(wc->trans, root, path,
 						eb, i, &key);
-			BUG_ON(ret);
-		} else if (key.type == BTRFS_DIR_ITEM_KEY ||
-			   key.type == BTRFS_DIR_INDEX_KEY) {
+			if (ret)
+				break;
+		} else if (key.type == BTRFS_DIR_ITEM_KEY) {
 			ret = replay_one_dir_item(wc->trans, root, path,
 						  eb, i, &key);
-			BUG_ON(ret);
+			if (ret)
+				break;
 		}
 	}
 	btrfs_free_path(path);
-	return 0;
+	return ret;
 }
 
 static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
@@ -1752,12 +2138,18 @@
 
 		if (*level == 1) {
 			ret = wc->process_func(root, next, wc, ptr_gen);
-			if (ret)
+			if (ret) {
+				free_extent_buffer(next);
 				return ret;
+			}
 
 			path->slots[*level]++;
 			if (wc->free) {
-				btrfs_read_buffer(next, ptr_gen);
+				ret = btrfs_read_buffer(next, ptr_gen);
+				if (ret) {
+					free_extent_buffer(next);
+					return ret;
+				}
 
 				btrfs_tree_lock(next);
 				btrfs_set_lock_blocking(next);
@@ -1769,12 +2161,19 @@
 					BTRFS_TREE_LOG_OBJECTID);
 				ret = btrfs_free_and_pin_reserved_extent(root,
 							 bytenr, blocksize);
-				BUG_ON(ret);
+				if (ret) {
+					free_extent_buffer(next);
+					return ret;
+				}
 			}
 			free_extent_buffer(next);
 			continue;
 		}
-		btrfs_read_buffer(next, ptr_gen);
+		ret = btrfs_read_buffer(next, ptr_gen);
+		if (ret) {
+			free_extent_buffer(next);
+			return ret;
+		}
 
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
@@ -1838,7 +2237,8 @@
 				ret = btrfs_free_and_pin_reserved_extent(root,
 						path->nodes[*level]->start,
 						path->nodes[*level]->len);
-				BUG_ON(ret);
+				if (ret)
+					return ret;
 			}
 			free_extent_buffer(path->nodes[*level]);
 			path->nodes[*level] = NULL;
@@ -1860,7 +2260,6 @@
 	int wret;
 	int level;
 	struct btrfs_path *path;
-	int i;
 	int orig_level;
 
 	path = btrfs_alloc_path();
@@ -1877,20 +2276,26 @@
 		wret = walk_down_log_tree(trans, log, path, &level, wc);
 		if (wret > 0)
 			break;
-		if (wret < 0)
+		if (wret < 0) {
 			ret = wret;
+			goto out;
+		}
 
 		wret = walk_up_log_tree(trans, log, path, &level, wc);
 		if (wret > 0)
 			break;
-		if (wret < 0)
+		if (wret < 0) {
 			ret = wret;
+			goto out;
+		}
 	}
 
 	/* was the root node processed? if not, catch it here */
 	if (path->nodes[orig_level]) {
-		wc->process_func(log, path->nodes[orig_level], wc,
+		ret = wc->process_func(log, path->nodes[orig_level], wc,
 			 btrfs_header_generation(path->nodes[orig_level]));
+		if (ret)
+			goto out;
 		if (wc->free) {
 			struct extent_buffer *next;
 
@@ -1906,16 +2311,12 @@
 				BTRFS_TREE_LOG_OBJECTID);
 			ret = btrfs_free_and_pin_reserved_extent(log, next->start,
 							 next->len);
-			BUG_ON(ret);
+			if (ret)
+				goto out;
 		}
 	}
 
-	for (i = 0; i <= orig_level; i++) {
-		if (path->nodes[i]) {
-			free_extent_buffer(path->nodes[i]);
-			path->nodes[i] = NULL;
-		}
-	}
+out:
 	btrfs_free_path(path);
 	return ret;
 }
@@ -1963,16 +2364,18 @@
 
 		finish_wait(&root->log_commit_wait[index], &wait);
 		mutex_lock(&root->log_mutex);
-	} while (root->log_transid < transid + 2 &&
+	} while (root->fs_info->last_trans_log_full_commit !=
+		 trans->transid && root->log_transid < transid + 2 &&
 		 atomic_read(&root->log_commit[index]));
 	return 0;
 }
 
-static int wait_for_writer(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root)
+static void wait_for_writer(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root)
 {
 	DEFINE_WAIT(wait);
-	while (atomic_read(&root->log_writers)) {
+	while (root->fs_info->last_trans_log_full_commit !=
+	       trans->transid && atomic_read(&root->log_writers)) {
 		prepare_to_wait(&root->log_writer_wait,
 				&wait, TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&root->log_mutex);
@@ -1982,7 +2385,6 @@
 		mutex_lock(&root->log_mutex);
 		finish_wait(&root->log_writer_wait, &wait);
 	}
-	return 0;
 }
 
 /*
@@ -2007,8 +2409,10 @@
 	struct btrfs_root *log = root->log_root;
 	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
 	unsigned long log_transid = 0;
+	struct blk_plug plug;
 
 	mutex_lock(&root->log_mutex);
+	log_transid = root->log_transid;
 	index1 = root->log_transid % 2;
 	if (atomic_read(&root->log_commit[index1])) {
 		wait_log_commit(trans, root, root->log_transid);
@@ -2021,7 +2425,7 @@
 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
 		wait_log_commit(trans, root, root->log_transid - 1);
 	while (1) {
-		unsigned long batch = root->log_batch;
+		int batch = atomic_read(&root->log_batch);
 		/* when we're on an ssd, just kick the log commit out */
 		if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
 			mutex_unlock(&root->log_mutex);
@@ -2029,18 +2433,18 @@
 			mutex_lock(&root->log_mutex);
 		}
 		wait_for_writer(trans, root);
-		if (batch == root->log_batch)
+		if (batch == atomic_read(&root->log_batch))
 			break;
 	}
 
 	/* bail out if we need to do a full commit */
 	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
 		ret = -EAGAIN;
+		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&root->log_mutex);
 		goto out;
 	}
 
-	log_transid = root->log_transid;
 	if (log_transid % 2 == 0)
 		mark = EXTENT_DIRTY;
 	else
@@ -2049,12 +2453,18 @@
 	/* we start IO on  all the marked extents here, but we don't actually
 	 * wait for them until later.
 	 */
+	blk_start_plug(&plug);
 	ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark);
-	BUG_ON(ret);
+	if (ret) {
+		blk_finish_plug(&plug);
+		btrfs_abort_transaction(trans, root, ret);
+		btrfs_free_logged_extents(log, log_transid);
+		mutex_unlock(&root->log_mutex);
+		goto out;
+	}
 
 	btrfs_set_root_node(&log->root_item, log->node);
 
-	root->log_batch = 0;
 	root->log_transid++;
 	log->log_transid = root->log_transid;
 	root->log_start_pid = 0;
@@ -2067,7 +2477,7 @@
 	mutex_unlock(&root->log_mutex);
 
 	mutex_lock(&log_root_tree->log_mutex);
-	log_root_tree->log_batch++;
+	atomic_inc(&log_root_tree->log_batch);
 	atomic_inc(&log_root_tree->log_writers);
 	mutex_unlock(&log_root_tree->log_mutex);
 
@@ -2081,9 +2491,15 @@
 	}
 
 	if (ret) {
-		BUG_ON(ret != -ENOSPC);
+		blk_finish_plug(&plug);
+		if (ret != -ENOSPC) {
+			btrfs_abort_transaction(trans, root, ret);
+			mutex_unlock(&log_root_tree->log_mutex);
+			goto out;
+		}
 		root->fs_info->last_trans_log_full_commit = trans->transid;
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = -EAGAIN;
 		goto out;
@@ -2091,9 +2507,11 @@
 
 	index2 = log_root_tree->log_transid % 2;
 	if (atomic_read(&log_root_tree->log_commit[index2])) {
+		blk_finish_plug(&plug);
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 		wait_log_commit(trans, log_root_tree,
 				log_root_tree->log_transid);
+		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = 0;
 		goto out;
@@ -2112,24 +2530,35 @@
 	 * check the full commit flag again
 	 */
 	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+		blk_finish_plug(&plug);
 		btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+		btrfs_free_logged_extents(log, log_transid);
 		mutex_unlock(&log_root_tree->log_mutex);
 		ret = -EAGAIN;
 		goto out_wake_log_root;
 	}
 
-	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
-				&log_root_tree->dirty_log_pages,
-				EXTENT_DIRTY | EXTENT_NEW);
-	BUG_ON(ret);
+	ret = btrfs_write_marked_extents(log_root_tree,
+					 &log_root_tree->dirty_log_pages,
+					 EXTENT_DIRTY | EXTENT_NEW);
+	blk_finish_plug(&plug);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		btrfs_free_logged_extents(log, log_transid);
+		mutex_unlock(&log_root_tree->log_mutex);
+		goto out_wake_log_root;
+	}
 	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+	btrfs_wait_marked_extents(log_root_tree,
+				  &log_root_tree->dirty_log_pages,
+				  EXTENT_NEW | EXTENT_DIRTY);
+	btrfs_wait_logged_extents(log, log_transid);
 
 	btrfs_set_super_log_root(root->fs_info->super_for_commit,
 				log_root_tree->node->start);
 	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
 				btrfs_header_level(log_root_tree->node));
 
-	log_root_tree->log_batch = 0;
 	log_root_tree->log_transid++;
 	smp_mb();
 
@@ -2143,9 +2572,12 @@
 	 * in and cause problems either.
 	 */
 	btrfs_scrub_pause_super(root);
-	write_ctree_super(trans, root->fs_info->tree_root, 1);
+	ret = write_ctree_super(trans, root->fs_info->tree_root, 1);
 	btrfs_scrub_continue_super(root);
-	ret = 0;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out_wake_log_root;
+	}
 
 	mutex_lock(&root->log_mutex);
 	if (root->last_log_commit < log_transid)
@@ -2176,12 +2608,18 @@
 		.process_func = process_one_buffer
 	};
 
-	ret = walk_log_tree(trans, log, &wc);
-	BUG_ON(ret);
+	if (trans) {
+		ret = walk_log_tree(trans, log, &wc);
+
+		/* I don't think this can happen but just in case */
+		if (ret)
+			btrfs_abort_transaction(trans, log, ret);
+	}
 
 	while (1) {
 		ret = find_first_extent_bit(&log->dirty_log_pages,
-				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
+				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW,
+				NULL);
 		if (ret)
 			break;
 
@@ -2189,6 +2627,14 @@
 				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
 	}
 
+	/*
+	 * We may have short-circuited the log tree with the full commit logic
+	 * and left ordered extents on our list, so clear these out to keep us
+	 * from leaking inodes and memory.
+	 */
+	btrfs_free_logged_extents(log, 0);
+	btrfs_free_logged_extents(log, 1);
+
 	free_extent_buffer(log->node);
 	kfree(log);
 }
@@ -2275,7 +2721,10 @@
 	if (di) {
 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
 		bytes_del += name_len;
-		BUG_ON(ret);
+		if (ret) {
+			err = ret;
+			goto fail;
+		}
 	}
 	btrfs_release_path(path);
 	di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
@@ -2287,7 +2736,10 @@
 	if (di) {
 		ret = btrfs_delete_one_dir_name(trans, log, path, di);
 		bytes_del += name_len;
-		BUG_ON(ret);
+		if (ret) {
+			err = ret;
+			goto fail;
+		}
 	}
 
 	/* update the directory size in the log to reflect the names
@@ -2330,7 +2782,9 @@
 	if (ret == -ENOSPC) {
 		root->fs_info->last_trans_log_full_commit = trans->transid;
 		ret = 0;
-	}
+	} else if (ret < 0)
+		btrfs_abort_transaction(trans, root, ret);
+
 	btrfs_end_log_trans(root);
 
 	return err;
@@ -2361,7 +2815,8 @@
 	if (ret == -ENOSPC) {
 		root->fs_info->last_trans_log_full_commit = trans->transid;
 		ret = 0;
-	}
+	} else if (ret < 0 && ret != -ENOENT)
+		btrfs_abort_transaction(trans, root, ret);
 	btrfs_end_log_trans(root);
 
 	return ret;
@@ -2435,7 +2890,7 @@
 	path->keep_locks = 1;
 
 	ret = btrfs_search_forward(root, &min_key, &max_key,
-				   path, 0, trans->transid);
+				   path, trans->transid);
 
 	/*
 	 * we didn't find anything from this transaction, see if there
@@ -2615,6 +3070,7 @@
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
+	int start_slot;
 
 	key.objectid = objectid;
 	key.type = max_key_type;
@@ -2622,7 +3078,7 @@
 
 	while (1) {
 		ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
-		BUG_ON(ret == 0);
+		BUG_ON(ret == 0); /* Logic error */
 		if (ret < 0)
 			break;
 
@@ -2636,23 +3092,110 @@
 		if (found_key.objectid != objectid)
 			break;
 
-		ret = btrfs_del_item(trans, log, path);
-		if (ret)
+		found_key.offset = 0;
+		found_key.type = 0;
+		ret = btrfs_bin_search(path->nodes[0], &found_key, 0,
+				       &start_slot);
+
+		ret = btrfs_del_items(trans, log, path, start_slot,
+				      path->slots[0] - start_slot + 1);
+		/*
+		 * If start slot isn't 0 then we don't need to re-search, we've
+		 * found the last guy with the objectid in this tree.
+		 */
+		if (ret || start_slot != 0)
 			break;
 		btrfs_release_path(path);
 	}
 	btrfs_release_path(path);
+	if (ret > 0)
+		ret = 0;
 	return ret;
 }
 
+static void fill_inode_item(struct btrfs_trans_handle *trans,
+			    struct extent_buffer *leaf,
+			    struct btrfs_inode_item *item,
+			    struct inode *inode, int log_inode_only)
+{
+	struct btrfs_map_token token;
+
+	btrfs_init_map_token(&token);
+
+	if (log_inode_only) {
+		/* set the generation to zero so the recovery code
+		 * can tell the difference between a logging
+		 * just to say 'this inode exists' and a logging
+		 * to say 'update this inode with these values'
+		 */
+		btrfs_set_token_inode_generation(leaf, item, 0, &token);
+		btrfs_set_token_inode_size(leaf, item, 0, &token);
+	} else {
+		btrfs_set_token_inode_generation(leaf, item,
+						 BTRFS_I(inode)->generation,
+						 &token);
+		btrfs_set_token_inode_size(leaf, item, inode->i_size, &token);
+	}
+
+	btrfs_set_token_inode_uid(leaf, item, inode->i_uid, &token);
+	btrfs_set_token_inode_gid(leaf, item, inode->i_gid, &token);
+	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
+	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
+
+	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
+				     inode->i_atime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
+				      inode->i_atime.tv_nsec, &token);
+
+	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
+				     inode->i_mtime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
+				      inode->i_mtime.tv_nsec, &token);
+
+	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
+				     inode->i_ctime.tv_sec, &token);
+	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
+				      inode->i_ctime.tv_nsec, &token);
+
+	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
+				     &token);
+
+	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
+	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
+	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
+	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
+	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
+}
+
+static int log_inode_item(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *log, struct btrfs_path *path,
+			  struct inode *inode)
+{
+	struct btrfs_inode_item *inode_item;
+	struct btrfs_key key;
+	int ret;
+
+	memcpy(&key, &BTRFS_I(inode)->location, sizeof(key));
+	ret = btrfs_insert_empty_item(trans, log, path, &key,
+				      sizeof(*inode_item));
+	if (ret && ret != -EEXIST)
+		return ret;
+	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_inode_item);
+	fill_inode_item(trans, path->nodes[0], inode_item, inode, 0);
+	btrfs_release_path(path);
+	return 0;
+}
+
 static noinline int copy_items(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *log,
+			       struct inode *inode,
 			       struct btrfs_path *dst_path,
 			       struct extent_buffer *src,
 			       int start_slot, int nr, int inode_only)
 {
 	unsigned long src_offset;
 	unsigned long dst_offset;
+	struct btrfs_root *log = BTRFS_I(inode)->root->log_root;
 	struct btrfs_file_extent_item *extent;
 	struct btrfs_inode_item *inode_item;
 	int ret;
@@ -2661,6 +3204,7 @@
 	char *ins_data;
 	int i;
 	struct list_head ordered_sums;
+	int skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
 	INIT_LIST_HEAD(&ordered_sums);
 
@@ -2689,29 +3233,23 @@
 
 		src_offset = btrfs_item_ptr_offset(src, start_slot + i);
 
-		copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
-				   src_offset, ins_sizes[i]);
-
-		if (inode_only == LOG_INODE_EXISTS &&
-		    ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
+		if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
 			inode_item = btrfs_item_ptr(dst_path->nodes[0],
 						    dst_path->slots[0],
 						    struct btrfs_inode_item);
-			btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
-
-			/* set the generation to zero so the recover code
-			 * can tell the difference between an logging
-			 * just to say 'this inode exists' and a logging
-			 * to say 'update this inode with these values'
-			 */
-			btrfs_set_inode_generation(dst_path->nodes[0],
-						   inode_item, 0);
+			fill_inode_item(trans, dst_path->nodes[0], inode_item,
+					inode, inode_only == LOG_INODE_EXISTS);
+		} else {
+			copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
+					   src_offset, ins_sizes[i]);
 		}
+
 		/* take a reference on file data extents so that truncates
 		 * or deletes of this inode don't have to relog the inode
 		 * again
 		 */
-		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
+		if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY &&
+		    !skip_csum) {
 			int found_type;
 			extent = btrfs_item_ptr(src, start_slot + i,
 						struct btrfs_file_extent_item);
@@ -2720,8 +3258,7 @@
 				continue;
 
 			found_type = btrfs_file_extent_type(src, extent);
-			if (found_type == BTRFS_FILE_EXTENT_REG ||
-			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+			if (found_type == BTRFS_FILE_EXTENT_REG) {
 				u64 ds, dl, cs, cl;
 				ds = btrfs_file_extent_disk_bytenr(src,
 								extent);
@@ -2744,7 +3281,11 @@
 						log->fs_info->csum_root,
 						ds + cs, ds + cs + cl - 1,
 						&ordered_sums, 0);
-				BUG_ON(ret);
+				if (ret) {
+					btrfs_release_path(dst_path);
+					kfree(ins_data);
+					return ret;
+				}
 			}
 		}
 	}
@@ -2770,6 +3311,297 @@
 	return ret;
 }
 
+static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+	struct extent_map *em1, *em2;
+
+	em1 = list_entry(a, struct extent_map, list);
+	em2 = list_entry(b, struct extent_map, list);
+
+	if (em1->start < em2->start)
+		return -1;
+	else if (em1->start > em2->start)
+		return 1;
+	return 0;
+}
+
+static int log_one_extent(struct btrfs_trans_handle *trans,
+			  struct inode *inode, struct btrfs_root *root,
+			  struct extent_map *em, struct btrfs_path *path)
+{
+	struct btrfs_root *log = root->log_root;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf;
+	struct btrfs_ordered_extent *ordered;
+	struct list_head ordered_sums;
+	struct btrfs_map_token token;
+	struct btrfs_key key;
+	u64 mod_start = em->mod_start;
+	u64 mod_len = em->mod_len;
+	u64 csum_offset;
+	u64 csum_len;
+	u64 extent_offset = em->start - em->orig_start;
+	u64 block_len;
+	int ret;
+	int index = log->log_transid % 2;
+	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+
+	ret = __btrfs_drop_extents(trans, log, inode, path, em->start,
+				   em->start + em->len, NULL, 0);
+	if (ret)
+		return ret;
+
+	INIT_LIST_HEAD(&ordered_sums);
+	btrfs_init_map_token(&token);
+	key.objectid = btrfs_ino(inode);
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = em->start;
+
+	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*fi));
+	if (ret)
+		return ret;
+	leaf = path->nodes[0];
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+
+	btrfs_set_token_file_extent_generation(leaf, fi, em->generation,
+					       &token);
+	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+		skip_csum = true;
+		btrfs_set_token_file_extent_type(leaf, fi,
+						 BTRFS_FILE_EXTENT_PREALLOC,
+						 &token);
+	} else {
+		btrfs_set_token_file_extent_type(leaf, fi,
+						 BTRFS_FILE_EXTENT_REG,
+						 &token);
+		if (em->block_start == 0)
+			skip_csum = true;
+	}
+
+	block_len = max(em->block_len, em->orig_block_len);
+	if (em->compress_type != BTRFS_COMPRESS_NONE) {
+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+							em->block_start,
+							&token);
+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+							   &token);
+	} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi,
+							em->block_start -
+							extent_offset, &token);
+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len,
+							   &token);
+	} else {
+		btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token);
+		btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0,
+							   &token);
+	}
+
+	btrfs_set_token_file_extent_offset(leaf, fi,
+					   em->start - em->orig_start,
+					   &token);
+	btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token);
+	btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token);
+	btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type,
+						&token);
+	btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token);
+	btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token);
+	btrfs_mark_buffer_dirty(leaf);
+
+	btrfs_release_path(path);
+	if (ret) {
+		return ret;
+	}
+
+	if (skip_csum)
+		return 0;
+
+	if (em->compress_type) {
+		csum_offset = 0;
+		csum_len = block_len;
+	}
+
+	/*
+	 * First check and see if our csums are on our outstanding ordered
+	 * extents.
+	 */
+again:
+	spin_lock_irq(&log->log_extents_lock[index]);
+	list_for_each_entry(ordered, &log->logged_list[index], log_list) {
+		struct btrfs_ordered_sum *sum;
+
+		if (!mod_len)
+			break;
+
+		if (ordered->inode != inode)
+			continue;
+
+		if (ordered->file_offset + ordered->len <= mod_start ||
+		    mod_start + mod_len <= ordered->file_offset)
+			continue;
+
+		/*
+		 * We are going to copy all the csums on this ordered extent, so
+		 * go ahead and adjust mod_start and mod_len in case this
+		 * ordered extent has already been logged.
+		 */
+		if (ordered->file_offset > mod_start) {
+			if (ordered->file_offset + ordered->len >=
+			    mod_start + mod_len)
+				mod_len = ordered->file_offset - mod_start;
+			/*
+			 * If we have this case
+			 *
+			 * |--------- logged extent ---------|
+			 *       |----- ordered extent ----|
+			 *
+			 * Just don't mess with mod_start and mod_len, we'll
+			 * just end up logging more csums than we need and it
+			 * will be ok.
+			 */
+		} else {
+			if (ordered->file_offset + ordered->len <
+			    mod_start + mod_len) {
+				mod_len = (mod_start + mod_len) -
+					(ordered->file_offset + ordered->len);
+				mod_start = ordered->file_offset +
+					ordered->len;
+			} else {
+				mod_len = 0;
+			}
+		}
+
+		/*
+		 * To keep us from looping for the above case of an ordered
+		 * extent that falls inside of the logged extent.
+		 */
+		if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM,
+				     &ordered->flags))
+			continue;
+		atomic_inc(&ordered->refs);
+		spin_unlock_irq(&log->log_extents_lock[index]);
+		/*
+		 * we've dropped the lock, we must either break or
+		 * start over after this.
+		 */
+
+		wait_event(ordered->wait, ordered->csum_bytes_left == 0);
+
+		list_for_each_entry(sum, &ordered->list, list) {
+			ret = btrfs_csum_file_blocks(trans, log, sum);
+			if (ret) {
+				btrfs_put_ordered_extent(ordered);
+				goto unlocked;
+			}
+		}
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+
+	}
+	spin_unlock_irq(&log->log_extents_lock[index]);
+unlocked:
+
+	if (!mod_len || ret)
+		return ret;
+
+	csum_offset = mod_start - em->start;
+	csum_len = mod_len;
+
+	/* block start is already adjusted for the file extent offset. */
+	ret = btrfs_lookup_csums_range(log->fs_info->csum_root,
+				       em->block_start + csum_offset,
+				       em->block_start + csum_offset +
+				       csum_len - 1, &ordered_sums, 0);
+	if (ret)
+		return ret;
+
+	while (!list_empty(&ordered_sums)) {
+		struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
+						   struct btrfs_ordered_sum,
+						   list);
+		if (!ret)
+			ret = btrfs_csum_file_blocks(trans, log, sums);
+		list_del(&sums->list);
+		kfree(sums);
+	}
+
+	return ret;
+}
+
+static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     struct inode *inode,
+				     struct btrfs_path *path)
+{
+	struct extent_map *em, *n;
+	struct list_head extents;
+	struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+	u64 test_gen;
+	int ret = 0;
+	int num = 0;
+
+	INIT_LIST_HEAD(&extents);
+
+	write_lock(&tree->lock);
+	test_gen = root->fs_info->last_trans_committed;
+
+	list_for_each_entry_safe(em, n, &tree->modified_extents, list) {
+		list_del_init(&em->list);
+
+		/*
+		 * Just an arbitrary number, this can be really CPU intensive
+		 * once we start getting a lot of extents, and really once we
+		 * have a bunch of extents we just want to commit since it will
+		 * be faster.
+		 */
+		if (++num > 32768) {
+			list_del_init(&tree->modified_extents);
+			ret = -EFBIG;
+			goto process;
+		}
+
+		if (em->generation <= test_gen)
+			continue;
+		/* Need a ref to keep it from getting evicted from cache */
+		atomic_inc(&em->refs);
+		set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+		list_add_tail(&em->list, &extents);
+		num++;
+	}
+
+	list_sort(NULL, &extents, extent_cmp);
+
+process:
+	while (!list_empty(&extents)) {
+		em = list_entry(extents.next, struct extent_map, list);
+
+		list_del_init(&em->list);
+
+		/*
+		 * If we had an error we just need to delete everybody from our
+		 * private list.
+		 */
+		if (ret) {
+			clear_em_logging(tree, em);
+			free_extent_map(em);
+			continue;
+		}
+
+		write_unlock(&tree->lock);
+
+		ret = log_one_extent(trans, inode, root, em, path);
+		write_lock(&tree->lock);
+		clear_em_logging(tree, em);
+		free_extent_map(em);
+	}
+	WARN_ON(!list_empty(&extents));
+	write_unlock(&tree->lock);
+
+	btrfs_release_path(path);
+	return ret;
+}
+
 /* log a single inode in the tree log.
  * At least one parent directory for this inode must exist in the tree
  * or be logged already.
@@ -2799,10 +3631,9 @@
 	int nritems;
 	int ins_start_slot = 0;
 	int ins_nr;
+	bool fast_search = false;
 	u64 ino = btrfs_ino(inode);
 
-	log = root->log_root;
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -2818,25 +3649,32 @@
 
 	max_key.objectid = ino;
 
-	/* today the code can only do partial logging of directories */
-	if (!S_ISDIR(inode->i_mode))
-	    inode_only = LOG_INODE_ALL;
 
-	if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
+	/* today the code can only do partial logging of directories */
+	if (S_ISDIR(inode->i_mode) ||
+	    (!test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+		       &BTRFS_I(inode)->runtime_flags) &&
+	     inode_only == LOG_INODE_EXISTS))
 		max_key.type = BTRFS_XATTR_ITEM_KEY;
 	else
 		max_key.type = (u8)-1;
 	max_key.offset = (u64)-1;
 
-	ret = btrfs_commit_inode_delayed_items(trans, inode);
-	if (ret) {
-		btrfs_free_path(path);
-		btrfs_free_path(dst_path);
-		return ret;
+	/* Only run delayed items if we are a dir or a new file */
+	if (S_ISDIR(inode->i_mode) ||
+	    BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) {
+		ret = btrfs_commit_inode_delayed_items(trans, inode);
+		if (ret) {
+			btrfs_free_path(path);
+			btrfs_free_path(dst_path);
+			return ret;
+		}
 	}
 
 	mutex_lock(&BTRFS_I(inode)->log_mutex);
 
+	btrfs_get_logged_extents(log, inode);
+
 	/*
 	 * a brute force approach to making sure we get the most uptodate
 	 * copies of everything.
@@ -2848,7 +3686,30 @@
 			max_key_type = BTRFS_XATTR_ITEM_KEY;
 		ret = drop_objectid_items(trans, log, path, ino, max_key_type);
 	} else {
-		ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
+		if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
+				       &BTRFS_I(inode)->runtime_flags)) {
+			clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+				  &BTRFS_I(inode)->runtime_flags);
+			ret = btrfs_truncate_inode_items(trans, log,
+							 inode, 0, 0);
+		} else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING,
+					      &BTRFS_I(inode)->runtime_flags)) {
+			if (inode_only == LOG_INODE_ALL)
+				fast_search = true;
+			max_key.type = BTRFS_XATTR_ITEM_KEY;
+			ret = drop_objectid_items(trans, log, path, ino,
+						  max_key.type);
+		} else {
+			if (inode_only == LOG_INODE_ALL)
+				fast_search = true;
+			ret = log_inode_item(trans, log, dst_path, inode);
+			if (ret) {
+				err = ret;
+				goto out_unlock;
+			}
+			goto log_extents;
+		}
+
 	}
 	if (ret) {
 		err = ret;
@@ -2859,7 +3720,7 @@
 	while (1) {
 		ins_nr = 0;
 		ret = btrfs_search_forward(root, &min_key, &max_key,
-					   path, 0, trans->transid);
+					   path, trans->transid);
 		if (ret != 0)
 			break;
 again:
@@ -2879,7 +3740,7 @@
 			goto next_slot;
 		}
 
-		ret = copy_items(trans, log, dst_path, src, ins_start_slot,
+		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
 				 ins_nr, inode_only);
 		if (ret) {
 			err = ret;
@@ -2897,7 +3758,7 @@
 			goto again;
 		}
 		if (ins_nr) {
-			ret = copy_items(trans, log, dst_path, src,
+			ret = copy_items(trans, inode, dst_path, src,
 					 ins_start_slot,
 					 ins_nr, inode_only);
 			if (ret) {
@@ -2918,8 +3779,7 @@
 			break;
 	}
 	if (ins_nr) {
-		ret = copy_items(trans, log, dst_path, src,
-				 ins_start_slot,
+		ret = copy_items(trans, inode, dst_path, src, ins_start_slot,
 				 ins_nr, inode_only);
 		if (ret) {
 			err = ret;
@@ -2927,10 +3787,27 @@
 		}
 		ins_nr = 0;
 	}
-	WARN_ON(ins_nr);
+
+log_extents:
+	btrfs_release_path(path);
+	btrfs_release_path(dst_path);
+	if (fast_search) {
+		ret = btrfs_log_changed_extents(trans, root, inode, dst_path);
+		if (ret) {
+			err = ret;
+			goto out_unlock;
+		}
+	} else {
+		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
+		struct extent_map *em, *n;
+
+		write_lock(&tree->lock);
+		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
+			list_del_init(&em->list);
+		write_unlock(&tree->lock);
+	}
+
 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
-		btrfs_release_path(path);
-		btrfs_release_path(dst_path);
 		ret = log_directory_changes(trans, root, inode, path, dst_path);
 		if (ret) {
 			err = ret;
@@ -2938,7 +3815,10 @@
 		}
 	}
 	BTRFS_I(inode)->logged_trans = trans->transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
 out_unlock:
+	if (err)
+		btrfs_free_logged_extents(log, log->log_transid);
 	mutex_unlock(&BTRFS_I(inode)->log_mutex);
 
 	btrfs_free_path(path);
@@ -2961,6 +3841,7 @@
 	int ret = 0;
 	struct btrfs_root *root;
 	struct dentry *old_parent = NULL;
+	struct inode *orig_inode = inode;
 
 	/*
 	 * for regular files, if its inode is already on disk, we don't
@@ -2980,7 +3861,14 @@
 	}
 
 	while (1) {
-		BTRFS_I(inode)->logged_trans = trans->transid;
+		/*
+		 * If we are logging a directory then we start with our inode,
+		 * not our parent's inode, so we need to skip setting the
+		 * logged_trans so that further down in the log code we don't
+		 * think this inode has already been logged.
+		 */
+		if (inode != orig_inode)
+			BTRFS_I(inode)->logged_trans = trans->transid;
 		smp_mb();
 
 		if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
@@ -3013,30 +3901,15 @@
 	return ret;
 }
 
-static int inode_in_log(struct btrfs_trans_handle *trans,
-		 struct inode *inode)
-{
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret = 0;
-
-	mutex_lock(&root->log_mutex);
-	if (BTRFS_I(inode)->logged_trans == trans->transid &&
-	    BTRFS_I(inode)->last_sub_trans <= root->last_log_commit)
-		ret = 1;
-	mutex_unlock(&root->log_mutex);
-	return ret;
-}
-
-
 /*
  * helper function around btrfs_log_inode to make sure newly created
  * parent directories also end up in the log.  A minimal inode and backref
  * only logging is done of any parent directories that are older than
  * the last committed transaction
  */
-int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
-		    struct btrfs_root *root, struct inode *inode,
-		    struct dentry *parent, int exists_only)
+static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+			    	  struct btrfs_root *root, struct inode *inode,
+			    	  struct dentry *parent, int exists_only)
 {
 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
 	struct super_block *sb;
@@ -3068,7 +3941,7 @@
 	if (ret)
 		goto end_no_trans;
 
-	if (inode_in_log(trans, inode)) {
+	if (btrfs_inode_in_log(inode, trans->transid)) {
 		ret = BTRFS_NO_LOG_SYNC;
 		goto end_no_trans;
 	}
@@ -3120,7 +3993,6 @@
 end_trans:
 	dput(old_parent);
 	if (ret < 0) {
-		BUG_ON(ret != -ENOSPC);
 		root->fs_info->last_trans_log_full_commit = trans->transid;
 		ret = 1;
 	}
@@ -3173,13 +4045,20 @@
 	fs_info->log_root_recovering = 1;
 
 	trans = btrfs_start_transaction(fs_info->tree_root, 0);
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto error;
+	}
 
 	wc.trans = trans;
 	wc.pin = 1;
 
 	ret = walk_log_tree(trans, log_root_tree, &wc);
-	BUG_ON(ret);
+	if (ret) {
+		btrfs_error(fs_info, ret, "Failed to pin buffers while "
+			    "recovering log root tree.");
+		goto error;
+	}
 
 again:
 	key.objectid = BTRFS_TREE_LOG_OBJECTID;
@@ -3188,8 +4067,12 @@
 
 	while (1) {
 		ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
-		if (ret < 0)
-			break;
+
+		if (ret < 0) {
+			btrfs_error(fs_info, ret,
+				    "Couldn't find tree log root.");
+			goto error;
+		}
 		if (ret > 0) {
 			if (path->slots[0] == 0)
 				break;
@@ -3201,26 +4084,36 @@
 		if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 			break;
 
-		log = btrfs_read_fs_root_no_radix(log_root_tree,
-						  &found_key);
-		BUG_ON(IS_ERR(log));
+		log = btrfs_read_fs_root(log_root_tree, &found_key);
+		if (IS_ERR(log)) {
+			ret = PTR_ERR(log);
+			btrfs_error(fs_info, ret,
+				    "Couldn't read tree log root.");
+			goto error;
+		}
 
 		tmp_key.objectid = found_key.offset;
 		tmp_key.type = BTRFS_ROOT_ITEM_KEY;
 		tmp_key.offset = (u64)-1;
 
 		wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
-		BUG_ON(IS_ERR_OR_NULL(wc.replay_dest));
+		if (IS_ERR(wc.replay_dest)) {
+			ret = PTR_ERR(wc.replay_dest);
+			free_extent_buffer(log->node);
+			free_extent_buffer(log->commit_root);
+			kfree(log);
+			btrfs_error(fs_info, ret, "Couldn't read target root "
+				    "for tree log recovery.");
+			goto error;
+		}
 
 		wc.replay_dest->log_root = log;
 		btrfs_record_root_in_trans(trans, wc.replay_dest);
 		ret = walk_log_tree(trans, log, &wc);
-		BUG_ON(ret);
 
-		if (wc.stage == LOG_WALK_REPLAY_ALL) {
+		if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
 			ret = fixup_inode_link_counts(trans, wc.replay_dest,
 						      path);
-			BUG_ON(ret);
 		}
 
 		key.offset = found_key.offset - 1;
@@ -3229,6 +4122,9 @@
 		free_extent_buffer(log->commit_root);
 		kfree(log);
 
+		if (ret)
+			goto error;
+
 		if (found_key.offset == 0)
 			break;
 	}
@@ -3249,15 +4145,22 @@
 
 	btrfs_free_path(path);
 
+	/* step 4: commit the transaction, which also unpins the blocks */
+	ret = btrfs_commit_transaction(trans, fs_info->tree_root);
+	if (ret)
+		return ret;
+
 	free_extent_buffer(log_root_tree->node);
 	log_root_tree->log_root = NULL;
 	fs_info->log_root_recovering = 0;
-
-	/* step 4: commit the transaction, which also unpins the blocks */
-	btrfs_commit_transaction(trans, fs_info->tree_root);
-
 	kfree(log_root_tree);
+
 	return 0;
+error:
+	if (wc.trans)
+		btrfs_end_transaction(wc.trans, fs_info->tree_root);
+	btrfs_free_path(path);
+	return ret;
 }
 
 /*
diff -ur a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
--- a/fs/btrfs/tree-log.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/tree-log.h	2014-02-17 11:56:58.000000000 +0100
@@ -38,11 +38,8 @@
 			       struct btrfs_root *root,
 			       const char *name, int name_len,
 			       struct inode *inode, u64 dirid);
-int btrfs_end_log_trans(struct btrfs_root *root);
+void btrfs_end_log_trans(struct btrfs_root *root);
 int btrfs_pin_log_trans(struct btrfs_root *root);
-int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
-		    struct btrfs_root *root, struct inode *inode,
-		    struct dentry *parent, int exists_only);
 void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
 			     struct inode *dir, struct inode *inode,
 			     int for_rename);
Only in b/fs/btrfs: ulist.c.
Only in b/fs/btrfs: ulist.h.
Only in b/fs/btrfs: uuid-tree.c.
Only in a/fs/btrfs: version.h.
diff -ur a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
--- a/fs/btrfs/volumes.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/volumes.c	2014-02-17 11:56:58.000000000 +0100
@@ -23,6 +23,10 @@
 #include <linux/random.h>
 #include <linux/iocontext.h>
 #include <linux/capability.h>
+#include <linux/ratelimit.h>
+#include <linux/kthread.h>
+#include <linux/raid/pq.h>
+#include <linux/semaphore.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -31,12 +35,20 @@
 #include "transaction.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "raid56.h"
 #include "async-thread.h"
+#include "check-integrity.h"
+#include "rcu-string.h"
+#include "math.h"
+#include "dev-replace.h"
 
 static int init_first_rw_device(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root,
 				struct btrfs_device *device);
 static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 
 static DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
@@ -51,6 +63,48 @@
 	mutex_unlock(&root->fs_info->chunk_mutex);
 }
 
+static struct btrfs_fs_devices *__alloc_fs_devices(void)
+{
+	struct btrfs_fs_devices *fs_devs;
+
+	fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
+	if (!fs_devs)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&fs_devs->device_list_mutex);
+
+	INIT_LIST_HEAD(&fs_devs->devices);
+	INIT_LIST_HEAD(&fs_devs->alloc_list);
+	INIT_LIST_HEAD(&fs_devs->list);
+
+	return fs_devs;
+}
+
+/**
+ * alloc_fs_devices - allocate struct btrfs_fs_devices
+ * @fsid:	a pointer to UUID for this FS.  If NULL a new UUID is
+ *		generated.
+ *
+ * Return: a pointer to a new &struct btrfs_fs_devices on success;
+ * ERR_PTR() on error.  Returned struct is not linked onto any lists and
+ * can be destroyed with kfree() right away.
+ */
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
+{
+	struct btrfs_fs_devices *fs_devs;
+
+	fs_devs = __alloc_fs_devices();
+	if (IS_ERR(fs_devs))
+		return fs_devs;
+
+	if (fsid)
+		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
+	else
+		generate_random_uuid(fs_devs->fsid);
+
+	return fs_devs;
+}
+
 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct btrfs_device *device;
@@ -59,13 +113,26 @@
 		device = list_entry(fs_devices->devices.next,
 				    struct btrfs_device, dev_list);
 		list_del(&device->dev_list);
-		kfree(device->name);
+		rcu_string_free(device->name);
 		kfree(device);
 	}
 	kfree(fs_devices);
 }
 
-int btrfs_cleanup_fs_uuids(void)
+static void btrfs_kobject_uevent(struct block_device *bdev,
+				 enum kobject_action action)
+{
+	int ret;
+
+	ret = kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, action);
+	if (ret)
+		pr_warn("Sending event '%d' to kobject: '%s' (%p): failed\n",
+			action,
+			kobject_name(&disk_to_dev(bdev->bd_disk)->kobj),
+			&disk_to_dev(bdev->bd_disk)->kobj);
+}
+
+void btrfs_cleanup_fs_uuids(void)
 {
 	struct btrfs_fs_devices *fs_devices;
 
@@ -75,7 +142,27 @@
 		list_del(&fs_devices->list);
 		free_fs_devices(fs_devices);
 	}
-	return 0;
+}
+
+static struct btrfs_device *__alloc_device(void)
+{
+	struct btrfs_device *dev;
+
+	dev = kzalloc(sizeof(*dev), GFP_NOFS);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&dev->dev_list);
+	INIT_LIST_HEAD(&dev->dev_alloc_list);
+
+	spin_lock_init(&dev->io_lock);
+
+	spin_lock_init(&dev->reada_lock);
+	atomic_set(&dev->reada_in_flight, 0);
+	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
+	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+
+	return dev;
 }
 
 static noinline struct btrfs_device *__find_device(struct list_head *head,
@@ -103,6 +190,44 @@
 	return NULL;
 }
 
+static int
+btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
+		      int flush, struct block_device **bdev,
+		      struct buffer_head **bh)
+{
+	int ret;
+
+	*bdev = blkdev_get_by_path(device_path, flags, holder);
+
+	if (IS_ERR(*bdev)) {
+		ret = PTR_ERR(*bdev);
+		printk(KERN_INFO "btrfs: open %s failed\n", device_path);
+		goto error;
+	}
+
+	if (flush)
+		filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
+	ret = set_blocksize(*bdev, 4096);
+	if (ret) {
+		blkdev_put(*bdev, flags);
+		goto error;
+	}
+	invalidate_bdev(*bdev);
+	*bh = btrfs_read_dev_super(*bdev);
+	if (!*bh) {
+		ret = -EINVAL;
+		blkdev_put(*bdev, flags);
+		goto error;
+	}
+
+	return 0;
+
+error:
+	*bdev = NULL;
+	*bh = NULL;
+	return ret;
+}
+
 static void requeue_list(struct btrfs_pending_bios *pending_bios,
 			struct bio *head, struct bio *tail)
 {
@@ -128,7 +253,7 @@
  * the list if the block device is congested.  This way, multiple devices
  * can make progress from a single worker thread.
  */
-static noinline int run_scheduled_bios(struct btrfs_device *device)
+static noinline void run_scheduled_bios(struct btrfs_device *device)
 {
 	struct bio *pending;
 	struct backing_dev_info *bdi;
@@ -222,9 +347,8 @@
 		cur = pending;
 		pending = pending->bi_next;
 		cur->bi_next = NULL;
-		atomic_dec(&fs_info->nr_async_bios);
 
-		if (atomic_read(&fs_info->nr_async_bios) < limit &&
+		if (atomic_dec_return(&fs_info->nr_async_bios) < limit &&
 		    waitqueue_active(&fs_info->async_submit_wait))
 			wake_up(&fs_info->async_submit_wait);
 
@@ -246,7 +370,7 @@
 			sync_pending = 0;
 		}
 
-		submit_bio(cur->bi_rw, cur);
+		btrfsic_submit_bio(cur->bi_rw, cur);
 		num_run++;
 		batch_run++;
 		if (need_resched())
@@ -314,7 +438,6 @@
 
 done:
 	blk_finish_plug(&plug);
-	return 0;
 }
 
 static void pending_bios_fn(struct btrfs_work *work)
@@ -331,21 +454,19 @@
 {
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *fs_devices;
+	struct rcu_string *name;
 	u64 found_transid = btrfs_super_generation(disk_super);
-	char *name;
 
 	fs_devices = find_fsid(disk_super->fsid);
 	if (!fs_devices) {
-		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
-		if (!fs_devices)
-			return -ENOMEM;
-		INIT_LIST_HEAD(&fs_devices->devices);
-		INIT_LIST_HEAD(&fs_devices->alloc_list);
+		fs_devices = alloc_fs_devices(disk_super->fsid);
+		if (IS_ERR(fs_devices))
+			return PTR_ERR(fs_devices);
+
 		list_add(&fs_devices->list, &fs_uuids);
-		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
-		mutex_init(&fs_devices->device_list_mutex);
+
 		device = NULL;
 	} else {
 		device = __find_device(&fs_devices->devices, devid,
@@ -355,43 +476,32 @@
 		if (fs_devices->opened)
 			return -EBUSY;
 
-		device = kzalloc(sizeof(*device), GFP_NOFS);
-		if (!device) {
+		device = btrfs_alloc_device(NULL, &devid,
+					    disk_super->dev_item.uuid);
+		if (IS_ERR(device)) {
 			/* we can safely leave the fs_devices entry around */
-			return -ENOMEM;
+			return PTR_ERR(device);
 		}
-		device->devid = devid;
-		device->work.func = pending_bios_fn;
-		memcpy(device->uuid, disk_super->dev_item.uuid,
-		       BTRFS_UUID_SIZE);
-		spin_lock_init(&device->io_lock);
-		device->name = kstrdup(path, GFP_NOFS);
-		if (!device->name) {
+
+		name = rcu_string_strdup(path, GFP_NOFS);
+		if (!name) {
 			kfree(device);
 			return -ENOMEM;
 		}
-		INIT_LIST_HEAD(&device->dev_alloc_list);
-
-		/* init readahead state */
-		spin_lock_init(&device->reada_lock);
-		device->reada_curr_zone = NULL;
-		atomic_set(&device->reada_in_flight, 0);
-		device->reada_next = 0;
-		INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
-		INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+		rcu_assign_pointer(device->name, name);
 
 		mutex_lock(&fs_devices->device_list_mutex);
 		list_add_rcu(&device->dev_list, &fs_devices->devices);
+		fs_devices->num_devices++;
 		mutex_unlock(&fs_devices->device_list_mutex);
 
 		device->fs_devices = fs_devices;
-		fs_devices->num_devices++;
-	} else if (!device->name || strcmp(device->name, path)) {
-		name = kstrdup(path, GFP_NOFS);
+	} else if (!device->name || strcmp(device->name->str, path)) {
+		name = rcu_string_strdup(path, GFP_NOFS);
 		if (!name)
 			return -ENOMEM;
-		kfree(device->name);
-		device->name = name;
+		rcu_string_free(device->name);
+		rcu_assign_pointer(device->name, name);
 		if (device->missing) {
 			fs_devices->missing_devices--;
 			device->missing = 0;
@@ -412,36 +522,33 @@
 	struct btrfs_device *device;
 	struct btrfs_device *orig_dev;
 
-	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
-	if (!fs_devices)
-		return ERR_PTR(-ENOMEM);
+	fs_devices = alloc_fs_devices(orig->fsid);
+	if (IS_ERR(fs_devices))
+		return fs_devices;
 
-	INIT_LIST_HEAD(&fs_devices->devices);
-	INIT_LIST_HEAD(&fs_devices->alloc_list);
-	INIT_LIST_HEAD(&fs_devices->list);
-	mutex_init(&fs_devices->device_list_mutex);
 	fs_devices->latest_devid = orig->latest_devid;
 	fs_devices->latest_trans = orig->latest_trans;
-	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
+	fs_devices->total_devices = orig->total_devices;
 
 	/* We have held the volume lock, it is safe to get the devices. */
 	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
-		device = kzalloc(sizeof(*device), GFP_NOFS);
-		if (!device)
+		struct rcu_string *name;
+
+		device = btrfs_alloc_device(NULL, &orig_dev->devid,
+					    orig_dev->uuid);
+		if (IS_ERR(device))
 			goto error;
 
-		device->name = kstrdup(orig_dev->name, GFP_NOFS);
-		if (!device->name) {
+		/*
+		 * This is ok to do without the RCU read lock held because we hold
+		 * the uuid mutex, so nothing we touch in here is going to disappear.
+		 */
+		name = rcu_string_strdup(orig_dev->name->str, GFP_NOFS);
+		if (!name) {
 			kfree(device);
 			goto error;
 		}
-
-		device->devid = orig_dev->devid;
-		device->work.func = pending_bios_fn;
-		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
-		spin_lock_init(&device->io_lock);
-		INIT_LIST_HEAD(&device->dev_list);
-		INIT_LIST_HEAD(&device->dev_alloc_list);
+		rcu_assign_pointer(device->name, name);
 
 		list_add(&device->dev_list, &fs_devices->devices);
 		device->fs_devices = fs_devices;
@@ -453,17 +560,45 @@
 	return ERR_PTR(-ENOMEM);
 }
 
-int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+			       struct btrfs_fs_devices *fs_devices, int step)
 {
 	struct btrfs_device *device, *next;
 
+	struct block_device *latest_bdev = NULL;
+	u64 latest_devid = 0;
+	u64 latest_transid = 0;
+
 	mutex_lock(&uuid_mutex);
 again:
 	/* This is the initialized path, it is safe to release the devices. */
 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
-		if (device->in_fs_metadata)
+		if (device->in_fs_metadata) {
+			if (!device->is_tgtdev_for_dev_replace &&
+			    (!latest_transid ||
+			     device->generation > latest_transid)) {
+				latest_devid = device->devid;
+				latest_transid = device->generation;
+				latest_bdev = device->bdev;
+			}
 			continue;
+		}
 
+		if (device->devid == BTRFS_DEV_REPLACE_DEVID) {
+			/*
+			 * In the first step, keep the device which has
+			 * the correct fsid and the devid that is used
+			 * for the dev_replace procedure.
+			 * In the second step, the dev_replace state is
+			 * read from the device tree and it is known
+			 * whether the procedure is really active or
+			 * not, which means whether this device is
+			 * used or whether it should be removed.
+			 */
+			if (step == 0 || device->is_tgtdev_for_dev_replace) {
+				continue;
+			}
+		}
 		if (device->bdev) {
 			blkdev_put(device->bdev, device->mode);
 			device->bdev = NULL;
@@ -472,11 +607,12 @@
 		if (device->writeable) {
 			list_del_init(&device->dev_alloc_list);
 			device->writeable = 0;
-			fs_devices->rw_devices--;
+			if (!device->is_tgtdev_for_dev_replace)
+				fs_devices->rw_devices--;
 		}
 		list_del_init(&device->dev_list);
 		fs_devices->num_devices--;
-		kfree(device->name);
+		rcu_string_free(device->name);
 		kfree(device);
 	}
 
@@ -485,8 +621,11 @@
 		goto again;
 	}
 
+	fs_devices->latest_bdev = latest_bdev;
+	fs_devices->latest_devid = latest_devid;
+	fs_devices->latest_trans = latest_transid;
+
 	mutex_unlock(&uuid_mutex);
-	return 0;
 }
 
 static void __free_device(struct work_struct *work)
@@ -498,7 +637,7 @@
 	if (device->bdev)
 		blkdev_put(device->bdev, device->mode);
 
-	kfree(device->name);
+	rcu_string_free(device->name);
 	kfree(device);
 }
 
@@ -522,28 +661,34 @@
 	mutex_lock(&fs_devices->device_list_mutex);
 	list_for_each_entry(device, &fs_devices->devices, dev_list) {
 		struct btrfs_device *new_device;
+		struct rcu_string *name;
 
 		if (device->bdev)
 			fs_devices->open_devices--;
 
-		if (device->writeable) {
+		if (device->writeable && !device->is_tgtdev_for_dev_replace) {
 			list_del_init(&device->dev_alloc_list);
 			fs_devices->rw_devices--;
 		}
 
 		if (device->can_discard)
 			fs_devices->num_can_discard--;
+		if (device->missing)
+			fs_devices->missing_devices--;
+
+		new_device = btrfs_alloc_device(NULL, &device->devid,
+						device->uuid);
+		BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
+
+		/* Safe because we are under uuid_mutex */
+		if (device->name) {
+			name = rcu_string_strdup(device->name->str, GFP_NOFS);
+			BUG_ON(!name); /* -ENOMEM */
+			rcu_assign_pointer(new_device->name, name);
+		}
 
-		new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
-		BUG_ON(!new_device);
-		memcpy(new_device, device, sizeof(*new_device));
-		new_device->name = kstrdup(device->name, GFP_NOFS);
-		BUG_ON(device->name && !new_device->name);
-		new_device->bdev = NULL;
-		new_device->writeable = 0;
-		new_device->in_fs_metadata = 0;
-		new_device->can_discard = 0;
 		list_replace_rcu(&device->dev_list, &new_device->dev_list);
+		new_device->fs_devices = device->fs_devices;
 
 		call_rcu(&device->rcu, free_device);
 	}
@@ -576,6 +721,12 @@
 		__btrfs_close_devices(fs_devices);
 		free_fs_devices(fs_devices);
 	}
+	/*
+	 * Wait for rcu kworkers under __btrfs_close_devices
+	 * to finish all blkdev_puts so device is really
+	 * free when umount is done.
+	 */
+	rcu_barrier();
 	return ret;
 }
 
@@ -603,16 +754,10 @@
 		if (!device->name)
 			continue;
 
-		bdev = blkdev_get_by_path(device->name, flags, holder);
-		if (IS_ERR(bdev)) {
-			printk(KERN_INFO "open %s failed\n", device->name);
-			goto error;
-		}
-		set_blocksize(bdev, 4096);
-
-		bh = btrfs_read_dev_super(bdev);
-		if (!bh)
-			goto error_close;
+		/* Just open everything we can; ignore failures here */
+		if (btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
+					    &bdev, &bh))
+			continue;
 
 		disk_super = (struct btrfs_super_block *)bh->b_data;
 		devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -651,7 +796,8 @@
 			fs_devices->rotating = 1;
 
 		fs_devices->open_devices++;
-		if (device->writeable) {
+		if (device->writeable &&
+		    device->devid != BTRFS_DEV_REPLACE_DEVID) {
 			fs_devices->rw_devices++;
 			list_add(&device->dev_alloc_list,
 				 &fs_devices->alloc_list);
@@ -661,9 +807,7 @@
 
 error_brelse:
 		brelse(bh);
-error_close:
 		blkdev_put(bdev, flags);
-error:
 		continue;
 	}
 	if (fs_devices->open_devices == 0) {
@@ -696,19 +840,35 @@
 	return ret;
 }
 
+/*
+ * Look for a btrfs signature on a device. This may be called out of the mount path
+ * and we are not allowed to call set_blocksize during the scan. The superblock
+ * is read via pagecache
+ */
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret)
 {
 	struct btrfs_super_block *disk_super;
 	struct block_device *bdev;
-	struct buffer_head *bh;
-	int ret;
+	struct page *page;
+	void *p;
+	int ret = -EINVAL;
 	u64 devid;
 	u64 transid;
+	u64 total_devices;
+	u64 bytenr;
+	pgoff_t index;
 
+	/*
+	 * we would like to check all the supers, but that would make
+	 * a btrfs mount succeed after a mkfs from a different FS.
+	 * So, we need to add a special mount option to scan for
+	 * later supers, using BTRFS_SUPER_MIRROR_MAX instead
+	 */
+	bytenr = btrfs_sb_offset(0);
+	flags |= FMODE_EXCL;
 	mutex_lock(&uuid_mutex);
 
-	flags |= FMODE_EXCL;
 	bdev = blkdev_get_by_path(path, flags, holder);
 
 	if (IS_ERR(bdev)) {
@@ -716,27 +876,58 @@
 		goto error;
 	}
 
-	ret = set_blocksize(bdev, 4096);
-	if (ret)
-		goto error_close;
-	bh = btrfs_read_dev_super(bdev);
-	if (!bh) {
-		ret = -EINVAL;
-		goto error_close;
-	}
-	disk_super = (struct btrfs_super_block *)bh->b_data;
+	/* make sure our super fits in the device */
+	if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode))
+		goto error_bdev_put;
+
+	/* make sure our super fits in the page */
+	if (sizeof(*disk_super) > PAGE_CACHE_SIZE)
+		goto error_bdev_put;
+
+	/* make sure our super doesn't straddle pages on disk */
+	index = bytenr >> PAGE_CACHE_SHIFT;
+	if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index)
+		goto error_bdev_put;
+
+	/* pull in the page with our super */
+	page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
+				   index, GFP_NOFS);
+
+	if (IS_ERR_OR_NULL(page))
+		goto error_bdev_put;
+
+	p = kmap(page);
+
+	/* align our pointer to the offset of the super block */
+	disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
+
+	if (btrfs_super_bytenr(disk_super) != bytenr ||
+	    btrfs_super_magic(disk_super) != BTRFS_MAGIC)
+		goto error_unmap;
+
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
 	transid = btrfs_super_generation(disk_super);
-	if (disk_super->label[0])
-		printk(KERN_INFO "device label %s ", disk_super->label);
-	else
-		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
-	printk(KERN_CONT "devid %llu transid %llu %s\n",
-	       (unsigned long long)devid, (unsigned long long)transid, path);
+	total_devices = btrfs_super_num_devices(disk_super);
+
+	if (disk_super->label[0]) {
+		if (disk_super->label[BTRFS_LABEL_SIZE - 1])
+			disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0';
+		printk(KERN_INFO "btrfs: device label %s ", disk_super->label);
+	} else {
+		printk(KERN_INFO "btrfs: device fsid %pU ", disk_super->fsid);
+	}
+
+	printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
+
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
+	if (!ret && fs_devices_ret)
+		(*fs_devices_ret)->total_devices = total_devices;
 
-	brelse(bh);
-error_close:
+error_unmap:
+	kunmap(page);
+	page_cache_release(page);
+
+error_bdev_put:
 	blkdev_put(bdev, flags);
 error:
 	mutex_unlock(&uuid_mutex);
@@ -758,7 +949,7 @@
 
 	*length = 0;
 
-	if (start >= device->total_bytes)
+	if (start >= device->total_bytes || device->is_tgtdev_for_dev_replace)
 		return 0;
 
 	path = btrfs_alloc_path();
@@ -827,9 +1018,37 @@
 	return ret;
 }
 
+static int contains_pending_extent(struct btrfs_trans_handle *trans,
+				   struct btrfs_device *device,
+				   u64 *start, u64 len)
+{
+	struct extent_map *em;
+	int ret = 0;
+
+	list_for_each_entry(em, &trans->transaction->pending_chunks, list) {
+		struct map_lookup *map;
+		int i;
+
+		map = (struct map_lookup *)em->bdev;
+		for (i = 0; i < map->num_stripes; i++) {
+			if (map->stripes[i].dev != device)
+				continue;
+			if (map->stripes[i].physical >= *start + len ||
+			    map->stripes[i].physical + em->orig_block_len <=
+			    *start)
+				continue;
+			*start = map->stripes[i].physical +
+				em->orig_block_len;
+			ret = 1;
+		}
+	}
+
+	return ret;
+}
+
+
 /*
  * find_free_dev_extent - find free space in the specified device
- * @trans:	transaction handler
  * @device:	the device which we search the free space in
  * @num_bytes:	the size of the free space that we need
  * @start:	store the start of the free space.
@@ -873,27 +1092,28 @@
 	 */
 	search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
 
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+again:
 	max_hole_start = search_start;
 	max_hole_size = 0;
 	hole_size = 0;
 
-	if (search_start >= search_end) {
+	if (search_start >= search_end || device->is_tgtdev_for_dev_replace) {
 		ret = -ENOSPC;
-		goto error;
+		goto out;
 	}
 
-	path = btrfs_alloc_path();
-	if (!path) {
-		ret = -ENOMEM;
-		goto error;
-	}
 	path->reada = 2;
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
 
 	key.objectid = device->devid;
 	key.offset = search_start;
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
-	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
 	if (ret > 0) {
@@ -928,6 +1148,15 @@
 		if (key.offset > search_start) {
 			hole_size = key.offset - search_start;
 
+			/*
+			 * Have to check before we set max_hole_start, otherwise
+			 * we could end up sending back this offset anyway.
+			 */
+			if (contains_pending_extent(trans, device,
+						    &search_start,
+						    hole_size))
+				hole_size = 0;
+
 			if (hole_size > max_hole_size) {
 				max_hole_start = search_start;
 				max_hole_size = hole_size;
@@ -971,6 +1200,11 @@
 		max_hole_size = hole_size;
 	}
 
+	if (contains_pending_extent(trans, device, &search_start, hole_size)) {
+		btrfs_release_path(path);
+		goto again;
+	}
+
 	/* See above. */
 	if (hole_size < num_bytes)
 		ret = -ENOSPC;
@@ -979,7 +1213,6 @@
 
 out:
 	btrfs_free_path(path);
-error:
 	*start = max_hole_start;
 	if (len)
 		*len = max_hole_size;
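Note on the find_free_dev_extent hunks above: the new contains_pending_extent() helper and the switch to path->search_commit_root/skip_locking appear to go together. Searching the commit root means device extents allocated earlier in the still-running transaction are not visible, so the helper walks transaction->pending_chunks and bumps search_start past any stripe of this device that intersects the candidate hole, and the caller retries via the again: label. The intersection test is the usual half-open range overlap; a standalone sketch for reference, not part of the patch:

	/* [a, a + alen) and [b, b + blen) intersect iff each range starts
	 * before the other one ends. */
	static inline bool ranges_overlap(u64 a, u64 alen, u64 b, u64 blen)
	{
		return a < b + blen && b < a + alen;
	}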
@@ -1025,8 +1258,10 @@
 		leaf = path->nodes[0];
 		extent = btrfs_item_ptr(leaf, path->slots[0],
 					struct btrfs_dev_extent);
+	} else {
+		btrfs_error(root->fs_info, ret, "Slot search failed");
+		goto out;
 	}
-	BUG_ON(ret);
 
 	if (device->bytes_used > 0) {
 		u64 len = btrfs_dev_extent_length(leaf, extent);
@@ -1036,16 +1271,19 @@
 		spin_unlock(&root->fs_info->free_chunk_lock);
 	}
 	ret = btrfs_del_item(trans, root, path);
-
+	if (ret) {
+		btrfs_error(root->fs_info, ret,
+			    "Failed to remove dev extent item");
+	}
 out:
 	btrfs_free_path(path);
 	return ret;
 }
 
-int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
-			   struct btrfs_device *device,
-			   u64 chunk_tree, u64 chunk_objectid,
-			   u64 chunk_offset, u64 start, u64 num_bytes)
+static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+				  struct btrfs_device *device,
+				  u64 chunk_tree, u64 chunk_objectid,
+				  u64 chunk_offset, u64 start, u64 num_bytes)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1055,6 +1293,7 @@
 	struct btrfs_key key;
 
 	WARN_ON(!device->in_fs_metadata);
+	WARN_ON(device->is_tgtdev_for_dev_replace);
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -1064,7 +1303,8 @@
 	key.type = BTRFS_DEV_EXTENT_KEY;
 	ret = btrfs_insert_empty_item(trans, root, path, &key,
 				      sizeof(*extent));
-	BUG_ON(ret);
+	if (ret)
+		goto out;
 
 	leaf = path->nodes[0];
 	extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -1074,68 +1314,42 @@
 	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
 
 	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
-		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
-		    BTRFS_UUID_SIZE);
+		    btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE);
 
 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
 	btrfs_mark_buffer_dirty(leaf);
+out:
 	btrfs_free_path(path);
 	return ret;
 }
 
-static noinline int find_next_chunk(struct btrfs_root *root,
-				    u64 objectid, u64 *offset)
+static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
 {
-	struct btrfs_path *path;
-	int ret;
-	struct btrfs_key key;
-	struct btrfs_chunk *chunk;
-	struct btrfs_key found_key;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	key.objectid = objectid;
-	key.offset = (u64)-1;
-	key.type = BTRFS_CHUNK_ITEM_KEY;
-
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-	if (ret < 0)
-		goto error;
-
-	BUG_ON(ret == 0);
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct rb_node *n;
+	u64 ret = 0;
 
-	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
-	if (ret) {
-		*offset = 0;
-	} else {
-		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-				      path->slots[0]);
-		if (found_key.objectid != objectid)
-			*offset = 0;
-		else {
-			chunk = btrfs_item_ptr(path->nodes[0], path->slots[0],
-					       struct btrfs_chunk);
-			*offset = found_key.offset +
-				btrfs_chunk_length(path->nodes[0], chunk);
-		}
+	em_tree = &fs_info->mapping_tree.map_tree;
+	read_lock(&em_tree->lock);
+	n = rb_last(&em_tree->map);
+	if (n) {
+		em = rb_entry(n, struct extent_map, rb_node);
+		ret = em->start + em->len;
 	}
-	ret = 0;
-error:
-	btrfs_free_path(path);
+	read_unlock(&em_tree->lock);
+
 	return ret;
 }
 
-static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
+static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
+				    u64 *devid_ret)
 {
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_path *path;
 
-	root = root->fs_info->chunk_root;
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -1144,20 +1358,21 @@
 	key.type = BTRFS_DEV_ITEM_KEY;
 	key.offset = (u64)-1;
 
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto error;
 
-	BUG_ON(ret == 0);
+	BUG_ON(ret == 0); /* Corruption */
 
-	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
+	ret = btrfs_previous_item(fs_info->chunk_root, path,
+				  BTRFS_DEV_ITEMS_OBJECTID,
 				  BTRFS_DEV_ITEM_KEY);
 	if (ret) {
-		*objectid = 1;
+		*devid_ret = 1;
 	} else {
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 				      path->slots[0]);
-		*objectid = found_key.offset + 1;
+		*devid_ret = found_key.offset + 1;
 	}
 	ret = 0;
 error:
@@ -1169,9 +1384,9 @@
  * the device information is stored in the chunk root
  * the btrfs_device struct should be fully filled in
  */
-int btrfs_add_device(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root,
-		     struct btrfs_device *device)
+static int btrfs_add_device(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    struct btrfs_device *device)
 {
 	int ret;
 	struct btrfs_path *path;
@@ -1211,9 +1426,9 @@
 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
 	btrfs_set_device_start_offset(leaf, dev_item, 0);
 
-	ptr = (unsigned long)btrfs_device_uuid(dev_item);
+	ptr = btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
-	ptr = (unsigned long)btrfs_device_fsid(dev_item);
+	ptr = btrfs_device_fsid(dev_item);
 	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
 
@@ -1278,29 +1493,46 @@
 	u64 devid;
 	u64 num_devices;
 	u8 *dev_uuid;
+	unsigned seq;
 	int ret = 0;
 	bool clear_super = false;
 
 	mutex_lock(&uuid_mutex);
-	mutex_lock(&root->fs_info->volume_mutex);
 
-	all_avail = root->fs_info->avail_data_alloc_bits |
-		root->fs_info->avail_system_alloc_bits |
-		root->fs_info->avail_metadata_alloc_bits;
-
-	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) &&
-	    root->fs_info->fs_devices->num_devices <= 4) {
-		printk(KERN_ERR "btrfs: unable to go below four devices "
-		       "on raid10\n");
-		ret = -EINVAL;
+	do {
+		seq = read_seqbegin(&root->fs_info->profiles_lock);
+
+		all_avail = root->fs_info->avail_data_alloc_bits |
+			    root->fs_info->avail_system_alloc_bits |
+			    root->fs_info->avail_metadata_alloc_bits;
+	} while (read_seqretry(&root->fs_info->profiles_lock, seq));
+
+	num_devices = root->fs_info->fs_devices->num_devices;
+	btrfs_dev_replace_lock(&root->fs_info->dev_replace);
+	if (btrfs_dev_replace_is_ongoing(&root->fs_info->dev_replace)) {
+		WARN_ON(num_devices < 1);
+		num_devices--;
+	}
+	btrfs_dev_replace_unlock(&root->fs_info->dev_replace);
+
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && num_devices <= 4) {
+		ret = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET;
 		goto out;
 	}
 
-	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) &&
-	    root->fs_info->fs_devices->num_devices <= 2) {
-		printk(KERN_ERR "btrfs: unable to go below two "
-		       "devices on raid1\n");
-		ret = -EINVAL;
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && num_devices <= 2) {
+		ret = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET;
+		goto out;
+	}
+
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) &&
+	    root->fs_info->fs_devices->rw_devices <= 2) {
+		ret = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET;
+		goto out;
+	}
+	if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) &&
+	    root->fs_info->fs_devices->rw_devices <= 3) {
+		ret = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET;
 		goto out;
 	}
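Note on the do/while at the top of this hunk: it is the standard seqlock read idiom from <linux/seqlock.h>, snapshotting the avail_*_alloc_bits without taking a lock and retrying if a writer updated profiles_lock in between. A generic sketch of the pattern, assuming only the fields shown above:

	unsigned seq;
	u64 data_bits;

	do {
		seq = read_seqbegin(&fs_info->profiles_lock);
		data_bits = fs_info->avail_data_alloc_bits;	/* lock-free snapshot */
	} while (read_seqretry(&fs_info->profiles_lock, seq));	/* writer raced: retry */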
 
@@ -1315,7 +1547,9 @@
 		 * is held.
 		 */
 		list_for_each_entry(tmp, devices, dev_list) {
-			if (tmp->in_fs_metadata && !tmp->bdev) {
+			if (tmp->in_fs_metadata &&
+			    !tmp->is_tgtdev_for_dev_replace &&
+			    !tmp->bdev) {
 				device = tmp;
 				break;
 			}
@@ -1324,28 +1558,20 @@
 		bh = NULL;
 		disk_super = NULL;
 		if (!device) {
-			printk(KERN_ERR "btrfs: no missing devices found to "
-			       "remove\n");
+			ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
 			goto out;
 		}
 	} else {
-		bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
-					  root->fs_info->bdev_holder);
-		if (IS_ERR(bdev)) {
-			ret = PTR_ERR(bdev);
+		ret = btrfs_get_bdev_and_sb(device_path,
+					    FMODE_WRITE | FMODE_EXCL,
+					    root->fs_info->bdev_holder, 0,
+					    &bdev, &bh);
+		if (ret)
 			goto out;
-		}
-
-		set_blocksize(bdev, 4096);
-		bh = btrfs_read_dev_super(bdev);
-		if (!bh) {
-			ret = -EINVAL;
-			goto error_close;
-		}
 		disk_super = (struct btrfs_super_block *)bh->b_data;
 		devid = btrfs_stack_device_id(&disk_super->dev_item);
 		dev_uuid = disk_super->dev_item.uuid;
-		device = btrfs_find_device(root, devid, dev_uuid,
+		device = btrfs_find_device(root->fs_info, devid, dev_uuid,
 					   disk_super->fsid);
 		if (!device) {
 			ret = -ENOENT;
@@ -1353,10 +1579,13 @@
 		}
 	}
 
+	if (device->is_tgtdev_for_dev_replace) {
+		ret = BTRFS_ERROR_DEV_TGT_REPLACE;
+		goto error_brelse;
+	}
+
 	if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) {
-		printk(KERN_ERR "btrfs: unable to remove the only writeable "
-		       "device\n");
-		ret = -EINVAL;
+		ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
 		goto error_brelse;
 	}
 
@@ -1368,10 +1597,17 @@
 		clear_super = true;
 	}
 
+	mutex_unlock(&uuid_mutex);
 	ret = btrfs_shrink_device(device, 0);
+	mutex_lock(&uuid_mutex);
 	if (ret)
 		goto error_undo;
 
+	/*
+	 * TODO: the superblock still includes this device in its num_devices
+	 * counter although write_all_supers() is not locked out. This
+	 * could give a filesystem state which requires a degraded mount.
+	 */
 	ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
 	if (ret)
 		goto error_undo;
@@ -1382,12 +1618,16 @@
 	spin_unlock(&root->fs_info->free_chunk_lock);
 
 	device->in_fs_metadata = 0;
-	btrfs_scrub_cancel_dev(root, device);
+	btrfs_scrub_cancel_dev(root->fs_info, device);
 
 	/*
 	 * the device list mutex makes sure that we don't change
 	 * the device list while someone else is writing out all
-	 * the device supers.
+	 * the device supers. Whoever is writing all supers, should
+	 * lock the device list mutex before getting the number of
+	 * devices in the super block (super_copy). Conversely,
+	 * whoever updates the number of devices in the super block
+	 * (super_copy) should hold the device list mutex.
 	 */
 
 	cur_devices = device->fs_devices;
@@ -1395,6 +1635,7 @@
 	list_del_rcu(&device->dev_list);
 
 	device->fs_devices->num_devices--;
+	device->fs_devices->total_devices--;
 
 	if (device->missing)
 		root->fs_info->fs_devices->missing_devices--;
@@ -1410,10 +1651,10 @@
 		device->fs_devices->open_devices--;
 
 	call_rcu(&device->rcu, free_device);
-	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
 	num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
 	btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
 	if (cur_devices->open_devices == 0) {
 		struct btrfs_fs_devices *fs_devices;
@@ -1431,11 +1672,14 @@
 		free_fs_devices(cur_devices);
 	}
 
+	root->fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+
 	/*
 	 * at this point, the device is zero sized.  We want to
 	 * remove it from the devices list and zero out the old super
 	 */
-	if (clear_super) {
+	if (clear_super && disk_super) {
 		/* make sure this device isn't detected as part of
 		 * the FS anymore
 		 */
@@ -1446,13 +1690,15 @@
 
 	ret = 0;
 
+	/* Notify udev that device has changed */
+	if (bdev)
+		btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
+
 error_brelse:
 	brelse(bh);
-error_close:
 	if (bdev)
 		blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
 out:
-	mutex_unlock(&root->fs_info->volume_mutex);
 	mutex_unlock(&uuid_mutex);
 	return ret;
 error_undo:
@@ -1466,11 +1712,121 @@
 	goto error_brelse;
 }
 
+void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
+				 struct btrfs_device *srcdev)
+{
+	WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex));
+
+	list_del_rcu(&srcdev->dev_list);
+	list_del_rcu(&srcdev->dev_alloc_list);
+	fs_info->fs_devices->num_devices--;
+	if (srcdev->missing) {
+		fs_info->fs_devices->missing_devices--;
+		fs_info->fs_devices->rw_devices++;
+	}
+	if (srcdev->can_discard)
+		fs_info->fs_devices->num_can_discard--;
+	if (srcdev->bdev) {
+		fs_info->fs_devices->open_devices--;
+
+		/* zero out the old super */
+		btrfs_scratch_superblock(srcdev);
+	}
+
+	call_rcu(&srcdev->rcu, free_device);
+}
+
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+				      struct btrfs_device *tgtdev)
+{
+	struct btrfs_device *next_device;
+
+	WARN_ON(!tgtdev);
+	mutex_lock(&fs_info->fs_devices->device_list_mutex);
+	if (tgtdev->bdev) {
+		btrfs_scratch_superblock(tgtdev);
+		fs_info->fs_devices->open_devices--;
+	}
+	fs_info->fs_devices->num_devices--;
+	if (tgtdev->can_discard)
+		fs_info->fs_devices->num_can_discard++;
+
+	next_device = list_entry(fs_info->fs_devices->devices.next,
+				 struct btrfs_device, dev_list);
+	if (tgtdev->bdev == fs_info->sb->s_bdev)
+		fs_info->sb->s_bdev = next_device->bdev;
+	if (tgtdev->bdev == fs_info->fs_devices->latest_bdev)
+		fs_info->fs_devices->latest_bdev = next_device->bdev;
+	list_del_rcu(&tgtdev->dev_list);
+
+	call_rcu(&tgtdev->rcu, free_device);
+
+	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+}
+
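Note on the two dev-replace helpers above: both unlink the device with list_del_rcu() and defer the free to call_rcu(), because elsewhere in btrfs the fs_devices->devices list is also walked under rcu_read_lock() rather than device_list_mutex. Sketch of that reader side, for illustration only:

	struct btrfs_device *device;

	rcu_read_lock();
	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
		/* an entry unlinked by list_del_rcu() stays valid here until
		 * the following grace period, so no lock is needed to walk */
	}
	rcu_read_unlock();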
+static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path,
+				     struct btrfs_device **device)
+{
+	int ret = 0;
+	struct btrfs_super_block *disk_super;
+	u64 devid;
+	u8 *dev_uuid;
+	struct block_device *bdev;
+	struct buffer_head *bh;
+
+	*device = NULL;
+	ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
+				    root->fs_info->bdev_holder, 0, &bdev, &bh);
+	if (ret)
+		return ret;
+	disk_super = (struct btrfs_super_block *)bh->b_data;
+	devid = btrfs_stack_device_id(&disk_super->dev_item);
+	dev_uuid = disk_super->dev_item.uuid;
+	*device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+				    disk_super->fsid);
+	brelse(bh);
+	if (!*device)
+		ret = -ENOENT;
+	blkdev_put(bdev, FMODE_READ);
+	return ret;
+}
+
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+					 char *device_path,
+					 struct btrfs_device **device)
+{
+	*device = NULL;
+	if (strcmp(device_path, "missing") == 0) {
+		struct list_head *devices;
+		struct btrfs_device *tmp;
+
+		devices = &root->fs_info->fs_devices->devices;
+		/*
+		 * It is safe to read the devices since the volume_mutex
+		 * is held by the caller.
+		 */
+		list_for_each_entry(tmp, devices, dev_list) {
+			if (tmp->in_fs_metadata && !tmp->bdev) {
+				*device = tmp;
+				break;
+			}
+		}
+
+		if (!*device) {
+			pr_err("btrfs: no missing device found\n");
+			return -ENOENT;
+		}
+
+		return 0;
+	} else {
+		return btrfs_find_device_by_path(root, device_path, device);
+	}
+}
+
 /*
  * does all the dirty work required for changing file system's UUID.
  */
-static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root)
+static int btrfs_prepare_sprout(struct btrfs_root *root)
 {
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 	struct btrfs_fs_devices *old_devices;
@@ -1483,9 +1839,9 @@
 	if (!fs_devices->seeding)
 		return -EINVAL;
 
-	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
-	if (!seed_devices)
-		return -ENOMEM;
+	seed_devices = __alloc_fs_devices();
+	if (IS_ERR(seed_devices))
+		return PTR_ERR(seed_devices);
 
 	old_devices = clone_fs_devices(fs_devices);
 	if (IS_ERR(old_devices)) {
@@ -1504,7 +1860,6 @@
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
 			      synchronize_rcu);
-	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
 	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
 	list_for_each_entry(device, &seed_devices->devices, dev_list) {
@@ -1514,11 +1869,14 @@
 	fs_devices->seeding = 0;
 	fs_devices->num_devices = 0;
 	fs_devices->open_devices = 0;
+	fs_devices->total_devices = 0;
 	fs_devices->seed = seed_devices;
 
 	generate_random_uuid(fs_devices->fsid);
 	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
 	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
 	super_flags = btrfs_super_flags(disk_super) &
 		      ~BTRFS_SUPER_FLAG_SEEDING;
 	btrfs_set_super_flags(disk_super, super_flags);
@@ -1578,14 +1936,13 @@
 		dev_item = btrfs_item_ptr(leaf, path->slots[0],
 					  struct btrfs_dev_item);
 		devid = btrfs_device_id(leaf, dev_item);
-		read_extent_buffer(leaf, dev_uuid,
-				   (unsigned long)btrfs_device_uuid(dev_item),
+		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
 				   BTRFS_UUID_SIZE);
-		read_extent_buffer(leaf, fs_uuid,
-				   (unsigned long)btrfs_device_fsid(dev_item),
+		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
 				   BTRFS_UUID_SIZE);
-		device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
-		BUG_ON(!device);
+		device = btrfs_find_device(root->fs_info, devid, dev_uuid,
+					   fs_uuid);
+		BUG_ON(!device); /* Logic error */
 
 		if (device->fs_devices->seeding) {
 			btrfs_set_device_generation(leaf, dev_item,
@@ -1610,12 +1967,13 @@
 	struct block_device *bdev;
 	struct list_head *devices;
 	struct super_block *sb = root->fs_info->sb;
+	struct rcu_string *name;
 	u64 total_bytes;
 	int seeding_dev = 0;
 	int ret = 0;
 
 	if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
-		return -EINVAL;
+		return -EROFS;
 
 	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
 				  root->fs_info->bdev_holder);
@@ -1629,44 +1987,38 @@
 	}
 
 	filemap_write_and_wait(bdev->bd_inode->i_mapping);
-	mutex_lock(&root->fs_info->volume_mutex);
 
 	devices = &root->fs_info->fs_devices->devices;
-	/*
-	 * we have the volume lock, so we don't need the extra
-	 * device list mutex while reading the list here.
-	 */
+
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	list_for_each_entry(device, devices, dev_list) {
 		if (device->bdev == bdev) {
 			ret = -EEXIST;
+			mutex_unlock(
+				&root->fs_info->fs_devices->device_list_mutex);
 			goto error;
 		}
 	}
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
-	device = kzalloc(sizeof(*device), GFP_NOFS);
-	if (!device) {
+	device = btrfs_alloc_device(root->fs_info, NULL, NULL);
+	if (IS_ERR(device)) {
 		/* we can safely leave the fs_devices entry around */
-		ret = -ENOMEM;
+		ret = PTR_ERR(device);
 		goto error;
 	}
 
-	device->name = kstrdup(device_path, GFP_NOFS);
-	if (!device->name) {
+	name = rcu_string_strdup(device_path, GFP_NOFS);
+	if (!name) {
 		kfree(device);
 		ret = -ENOMEM;
 		goto error;
 	}
-
-	ret = find_next_devid(root, &device->devid);
-	if (ret) {
-		kfree(device->name);
-		kfree(device);
-		goto error;
-	}
+	rcu_assign_pointer(device->name, name);
 
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
-		kfree(device->name);
+		rcu_string_free(device->name);
 		kfree(device);
 		ret = PTR_ERR(trans);
 		goto error;
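Note on this hunk: device->name becomes an RCU-protected string (rcu_string_strdup()/rcu_assign_pointer()/rcu_string_free()), so readers can report the path without holding a device lock. Presumably the reader side uses the helper from fs/btrfs/rcu-string.h; a hedged sketch, not part of the patch:

	rcu_read_lock();
	printk(KERN_INFO "btrfs: device path %s\n",
	       rcu_str_deref(device->name));	/* rcu_dereference() + ->str */
	rcu_read_unlock();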
@@ -1678,9 +2030,6 @@
 	if (blk_queue_discard(q))
 		device->can_discard = 1;
 	device->writeable = 1;
-	device->work.func = pending_bios_fn;
-	generate_random_uuid(device->uuid);
-	spin_lock_init(&device->io_lock);
 	device->generation = trans->transid;
 	device->io_width = root->sectorsize;
 	device->io_align = root->sectorsize;
@@ -1690,21 +2039,18 @@
 	device->dev_root = root->fs_info->dev_root;
 	device->bdev = bdev;
 	device->in_fs_metadata = 1;
+	device->is_tgtdev_for_dev_replace = 0;
 	device->mode = FMODE_EXCL;
 	set_blocksize(device->bdev, 4096);
 
 	if (seeding_dev) {
 		sb->s_flags &= ~MS_RDONLY;
-		ret = btrfs_prepare_sprout(trans, root);
-		BUG_ON(ret);
+		ret = btrfs_prepare_sprout(root);
+		BUG_ON(ret); /* -ENOMEM */
 	}
 
 	device->fs_devices = root->fs_info->fs_devices;
 
-	/*
-	 * we don't want write_supers to jump in here with our device
-	 * half setup
-	 */
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
 	list_add(&device->dev_alloc_list,
@@ -1712,6 +2058,7 @@
 	root->fs_info->fs_devices->num_devices++;
 	root->fs_info->fs_devices->open_devices++;
 	root->fs_info->fs_devices->rw_devices++;
+	root->fs_info->fs_devices->total_devices++;
 	if (device->can_discard)
 		root->fs_info->fs_devices->num_can_discard++;
 	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
@@ -1734,11 +2081,21 @@
 
 	if (seeding_dev) {
 		ret = init_first_rw_device(trans, root, device);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto error_trans;
+		}
 		ret = btrfs_finish_sprout(trans, root);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto error_trans;
+		}
 	} else {
 		ret = btrfs_add_device(trans, root, device);
+		if (ret) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto error_trans;
+		}
 	}
 
 	/*
@@ -1748,25 +2105,135 @@
 	btrfs_clear_space_info_full(root->fs_info);
 
 	unlock_chunks(root);
-	btrfs_commit_transaction(trans, root);
+	root->fs_info->num_tolerated_disk_barrier_failures =
+		btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info);
+	ret = btrfs_commit_transaction(trans, root);
 
 	if (seeding_dev) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
 
+		if (ret) /* transaction commit */
+			return ret;
+
 		ret = btrfs_relocate_sys_chunks(root);
-		BUG_ON(ret);
+		if (ret < 0)
+			btrfs_error(root->fs_info, ret,
+				    "Failed to relocate sys chunks after "
+				    "device initialization. This can be fixed "
+				    "using the \"btrfs balance\" command.");
+		trans = btrfs_attach_transaction(root);
+		if (IS_ERR(trans)) {
+			if (PTR_ERR(trans) == -ENOENT)
+				return 0;
+			return PTR_ERR(trans);
+		}
+		ret = btrfs_commit_transaction(trans, root);
 	}
-out:
-	mutex_unlock(&root->fs_info->volume_mutex);
+
 	return ret;
+
+error_trans:
+	unlock_chunks(root);
+	btrfs_end_transaction(trans, root);
+	rcu_string_free(device->name);
+	kfree(device);
 error:
 	blkdev_put(bdev, FMODE_EXCL);
 	if (seeding_dev) {
 		mutex_unlock(&uuid_mutex);
 		up_write(&sb->s_umount);
 	}
-	goto out;
+	return ret;
+}
+
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+				  struct btrfs_device **device_out)
+{
+	struct request_queue *q;
+	struct btrfs_device *device;
+	struct block_device *bdev;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct list_head *devices;
+	struct rcu_string *name;
+	u64 devid = BTRFS_DEV_REPLACE_DEVID;
+	int ret = 0;
+
+	*device_out = NULL;
+	if (fs_info->fs_devices->seeding)
+		return -EINVAL;
+
+	bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
+				  fs_info->bdev_holder);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+
+	filemap_write_and_wait(bdev->bd_inode->i_mapping);
+
+	devices = &fs_info->fs_devices->devices;
+	list_for_each_entry(device, devices, dev_list) {
+		if (device->bdev == bdev) {
+			ret = -EEXIST;
+			goto error;
+		}
+	}
+
+	device = btrfs_alloc_device(NULL, &devid, NULL);
+	if (IS_ERR(device)) {
+		ret = PTR_ERR(device);
+		goto error;
+	}
+
+	name = rcu_string_strdup(device_path, GFP_NOFS);
+	if (!name) {
+		kfree(device);
+		ret = -ENOMEM;
+		goto error;
+	}
+	rcu_assign_pointer(device->name, name);
+
+	q = bdev_get_queue(bdev);
+	if (blk_queue_discard(q))
+		device->can_discard = 1;
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	device->writeable = 1;
+	device->generation = 0;
+	device->io_width = root->sectorsize;
+	device->io_align = root->sectorsize;
+	device->sector_size = root->sectorsize;
+	device->total_bytes = i_size_read(bdev->bd_inode);
+	device->disk_total_bytes = device->total_bytes;
+	device->dev_root = fs_info->dev_root;
+	device->bdev = bdev;
+	device->in_fs_metadata = 1;
+	device->is_tgtdev_for_dev_replace = 1;
+	device->mode = FMODE_EXCL;
+	set_blocksize(device->bdev, 4096);
+	device->fs_devices = fs_info->fs_devices;
+	list_add(&device->dev_list, &fs_info->fs_devices->devices);
+	fs_info->fs_devices->num_devices++;
+	fs_info->fs_devices->open_devices++;
+	if (device->can_discard)
+		fs_info->fs_devices->num_can_discard++;
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
+	*device_out = device;
+	return ret;
+
+error:
+	blkdev_put(bdev, FMODE_EXCL);
+	return ret;
+}
+
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+					      struct btrfs_device *tgtdev)
+{
+	WARN_ON(fs_info->fs_devices->rw_devices == 0);
+	tgtdev->io_width = fs_info->dev_root->sectorsize;
+	tgtdev->io_align = fs_info->dev_root->sectorsize;
+	tgtdev->sector_size = fs_info->dev_root->sectorsize;
+	tgtdev->dev_root = fs_info->dev_root;
+	tgtdev->in_fs_metadata = 1;
 }
 
 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
@@ -1825,7 +2292,8 @@
 
 	if (!device->writeable)
 		return -EACCES;
-	if (new_size <= device->total_bytes)
+	if (new_size <= device->total_bytes ||
+	    device->is_tgtdev_for_dev_replace)
 		return -EINVAL;
 
 	btrfs_set_super_total_bytes(super_copy, old_total + diff);
@@ -1867,10 +2335,20 @@
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	BUG_ON(ret);
+	if (ret < 0)
+		goto out;
+	else if (ret > 0) { /* Logic error or corruption */
+		btrfs_error(root->fs_info, -ENOENT,
+			    "Failed lookup while freeing chunk.");
+		ret = -ENOENT;
+		goto out;
+	}
 
 	ret = btrfs_del_item(trans, root, path);
-
+	if (ret < 0)
+		btrfs_error(root->fs_info, ret,
+			    "Failed to delete chunk item.");
+out:
 	btrfs_free_path(path);
 	return ret;
 }
@@ -1947,7 +2425,11 @@
 		return ret;
 
 	trans = btrfs_start_transaction(root, 0);
-	BUG_ON(IS_ERR(trans));
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		btrfs_std_error(root->fs_info, ret);
+		return ret;
+	}
 
 	lock_chunks(root);
 
@@ -1959,7 +2441,7 @@
 	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
 	read_unlock(&em_tree->lock);
 
-	BUG_ON(em->start > chunk_offset ||
+	BUG_ON(!em || em->start > chunk_offset ||
 	       em->start + em->len < chunk_offset);
 	map = (struct map_lookup *)em->bdev;
 
@@ -2032,7 +2514,7 @@
 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
 		if (ret < 0)
 			goto error;
-		BUG_ON(ret == 0);
+		BUG_ON(ret == 0); /* Corruption */
 
 		ret = btrfs_previous_item(chunk_root, path, key.objectid,
 					  key.type);
@@ -2077,44 +2559,386 @@
 	return ret;
 }
 
-static u64 div_factor(u64 num, int factor)
+static int insert_balance_item(struct btrfs_root *root,
+			       struct btrfs_balance_control *bctl)
 {
-	if (factor == 10)
-		return num;
-	num *= factor;
-	do_div(num, 10);
-	return num;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_balance_item *item;
+	struct btrfs_disk_balance_args disk_bargs;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	int ret, err;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
+
+	key.objectid = BTRFS_BALANCE_OBJECTID;
+	key.type = BTRFS_BALANCE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(*item));
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+
+	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
+	btrfs_set_balance_data(leaf, item, &disk_bargs);
+	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
+	btrfs_set_balance_meta(leaf, item, &disk_bargs);
+	btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
+	btrfs_set_balance_sys(leaf, item, &disk_bargs);
+
+	btrfs_set_balance_flags(leaf, item, bctl->flags);
+
+	btrfs_mark_buffer_dirty(leaf);
+out:
+	btrfs_free_path(path);
+	err = btrfs_commit_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+	return ret;
 }
 
-int btrfs_balance(struct btrfs_root *dev_root)
+static int del_balance_item(struct btrfs_root *root)
 {
-	int ret;
-	struct list_head *devices = &dev_root->fs_info->fs_devices->devices;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret, err;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		btrfs_free_path(path);
+		return PTR_ERR(trans);
+	}
+
+	key.objectid = BTRFS_BALANCE_OBJECTID;
+	key.type = BTRFS_BALANCE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = btrfs_del_item(trans, root, path);
+out:
+	btrfs_free_path(path);
+	err = btrfs_commit_transaction(trans, root);
+	if (err && !ret)
+		ret = err;
+	return ret;
+}
+
+/*
+ * This is a heuristic used to reduce the number of chunks balanced on
+ * resume after balance was interrupted.
+ */
+static void update_balance_args(struct btrfs_balance_control *bctl)
+{
+	/*
+	 * Turn on soft mode for chunk types that were being converted.
+	 */
+	if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
+		bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
+		bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
+	if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
+		bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
+
+	/*
+	 * Turn on usage filter if is not already used.  The idea is
+	 * that chunks that we have already balanced should be
+	 * reasonably full.  Don't do it for chunks that are being
+	 * converted - that will keep us from relocating unconverted
+	 * (albeit full) chunks.
+	 */
+	if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+		bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
+		bctl->data.usage = 90;
+	}
+	if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+		bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
+		bctl->sys.usage = 90;
+	}
+	if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
+		bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
+		bctl->meta.usage = 90;
+	}
+}
+
+/*
+ * Should be called with both balance and volume mutexes held to
+ * serialize other volume operations (add_dev/rm_dev/resize) with
+ * restriper.  Same goes for unset_balance_control.
+ */
+static void set_balance_control(struct btrfs_balance_control *bctl)
+{
+	struct btrfs_fs_info *fs_info = bctl->fs_info;
+
+	BUG_ON(fs_info->balance_ctl);
+
+	spin_lock(&fs_info->balance_lock);
+	fs_info->balance_ctl = bctl;
+	spin_unlock(&fs_info->balance_lock);
+}
+
+static void unset_balance_control(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+
+	BUG_ON(!fs_info->balance_ctl);
+
+	spin_lock(&fs_info->balance_lock);
+	fs_info->balance_ctl = NULL;
+	spin_unlock(&fs_info->balance_lock);
+
+	kfree(bctl);
+}
+
+/*
+ * Balance filters.  Return 1 if chunk should be filtered out
+ * (should not be balanced).
+ */
+static int chunk_profiles_filter(u64 chunk_type,
+				 struct btrfs_balance_args *bargs)
+{
+	chunk_type = chunk_to_extended(chunk_type) &
+				BTRFS_EXTENDED_PROFILE_MASK;
+
+	if (bargs->profiles & chunk_type)
+		return 0;
+
+	return 1;
+}
+
+static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
+			      struct btrfs_balance_args *bargs)
+{
+	struct btrfs_block_group_cache *cache;
+	u64 chunk_used, user_thresh;
+	int ret = 1;
+
+	cache = btrfs_lookup_block_group(fs_info, chunk_offset);
+	chunk_used = btrfs_block_group_used(&cache->item);
+
+	if (bargs->usage == 0)
+		user_thresh = 1;
+	else if (bargs->usage > 100)
+		user_thresh = cache->key.offset;
+	else
+		user_thresh = div_factor_fine(cache->key.offset,
+					      bargs->usage);
+
+	if (chunk_used < user_thresh)
+		ret = 0;
+
+	btrfs_put_block_group(cache);
+	return ret;
+}
+
+static int chunk_devid_filter(struct extent_buffer *leaf,
+			      struct btrfs_chunk *chunk,
+			      struct btrfs_balance_args *bargs)
+{
+	struct btrfs_stripe *stripe;
+	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+	int i;
+
+	for (i = 0; i < num_stripes; i++) {
+		stripe = btrfs_stripe_nr(chunk, i);
+		if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
+			return 0;
+	}
+
+	return 1;
+}
+
+/* [pstart, pend) */
+static int chunk_drange_filter(struct extent_buffer *leaf,
+			       struct btrfs_chunk *chunk,
+			       u64 chunk_offset,
+			       struct btrfs_balance_args *bargs)
+{
+	struct btrfs_stripe *stripe;
+	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+	u64 stripe_offset;
+	u64 stripe_length;
+	int factor;
+	int i;
+
+	if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
+		return 0;
+
+	if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP |
+	     BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) {
+		factor = num_stripes / 2;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) {
+		factor = num_stripes - 1;
+	} else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) {
+		factor = num_stripes - 2;
+	} else {
+		factor = num_stripes;
+	}
+
+	for (i = 0; i < num_stripes; i++) {
+		stripe = btrfs_stripe_nr(chunk, i);
+		if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
+			continue;
+
+		stripe_offset = btrfs_stripe_offset(leaf, stripe);
+		stripe_length = btrfs_chunk_length(leaf, chunk);
+		do_div(stripe_length, factor);
+
+		if (stripe_offset < bargs->pend &&
+		    stripe_offset + stripe_length > bargs->pstart)
+			return 0;
+	}
+
+	return 1;
+}
+
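Note on chunk_drange_filter() above: the factor converts the chunk's logical length into the length of one device extent, so the [pstart, pend) comparison happens in per-device physical terms. A worked example under that reading (values are hypothetical): a RAID10 chunk with num_stripes = 4 and a 4 GiB logical length gives factor = 4 / 2 = 2, i.e. 2 GiB per stripe:

	u64 stripe_length = 4ULL << 30;		/* logical chunk length       */
	int factor = 4 / 2;			/* RAID10: mirrored pairs     */
	do_div(stripe_length, factor);		/* -> 2 GiB per device extent */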
+/* [vstart, vend) */
+static int chunk_vrange_filter(struct extent_buffer *leaf,
+			       struct btrfs_chunk *chunk,
+			       u64 chunk_offset,
+			       struct btrfs_balance_args *bargs)
+{
+	if (chunk_offset < bargs->vend &&
+	    chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
+		/* at least part of the chunk is inside this vrange */
+		return 0;
+
+	return 1;
+}
+
+static int chunk_soft_convert_filter(u64 chunk_type,
+				     struct btrfs_balance_args *bargs)
+{
+	if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
+		return 0;
+
+	chunk_type = chunk_to_extended(chunk_type) &
+				BTRFS_EXTENDED_PROFILE_MASK;
+
+	if (bargs->target == chunk_type)
+		return 1;
+
+	return 0;
+}
+
+static int should_balance_chunk(struct btrfs_root *root,
+				struct extent_buffer *leaf,
+				struct btrfs_chunk *chunk, u64 chunk_offset)
+{
+	struct btrfs_balance_control *bctl = root->fs_info->balance_ctl;
+	struct btrfs_balance_args *bargs = NULL;
+	u64 chunk_type = btrfs_chunk_type(leaf, chunk);
+
+	/* type filter */
+	if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
+	      (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
+		return 0;
+	}
+
+	if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
+		bargs = &bctl->data;
+	else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
+		bargs = &bctl->sys;
+	else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
+		bargs = &bctl->meta;
+
+	/* profiles filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
+	    chunk_profiles_filter(chunk_type, bargs)) {
+		return 0;
+	}
+
+	/* usage filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
+	    chunk_usage_filter(bctl->fs_info, chunk_offset, bargs)) {
+		return 0;
+	}
+
+	/* devid filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
+	    chunk_devid_filter(leaf, chunk, bargs)) {
+		return 0;
+	}
+
+	/* drange filter, makes sense only with devid filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
+	    chunk_drange_filter(leaf, chunk, chunk_offset, bargs)) {
+		return 0;
+	}
+
+	/* vrange filter */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
+	    chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
+		return 0;
+	}
+
+	/* soft profile changing mode */
+	if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
+	    chunk_soft_convert_filter(chunk_type, bargs)) {
+		return 0;
+	}
+
+	return 1;
+}
+
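Note on should_balance_chunk() above: every enabled filter acts as a veto, so a chunk is relocated only if none of the applicable filters returns 1. A hypothetical argument set, using only fields and flags that appear in the filters above, that limits a balance to chunks owning a stripe in the first 10 GiB of devid 2 (drange only makes sense together with devid, as the comment above notes):

	struct btrfs_balance_args args = {
		.flags	= BTRFS_BALANCE_ARGS_DEVID | BTRFS_BALANCE_ARGS_DRANGE,
		.devid	= 2,
		.pstart	= 0,
		.pend	= 10ULL << 30,		/* [pstart, pend) on that device */
	};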
+static int __btrfs_balance(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_balance_control *bctl = fs_info->balance_ctl;
+	struct btrfs_root *chunk_root = fs_info->chunk_root;
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct list_head *devices;
 	struct btrfs_device *device;
 	u64 old_size;
 	u64 size_to_free;
+	struct btrfs_chunk *chunk;
 	struct btrfs_path *path;
 	struct btrfs_key key;
-	struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
-	struct btrfs_trans_handle *trans;
 	struct btrfs_key found_key;
-
-	if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
-		return -EROFS;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	mutex_lock(&dev_root->fs_info->volume_mutex);
-	dev_root = dev_root->fs_info->dev_root;
+	struct btrfs_trans_handle *trans;
+	struct extent_buffer *leaf;
+	int slot;
+	int ret;
+	int enospc_errors = 0;
+	bool counting = true;
 
 	/* step one make some room on all the devices */
+	devices = &fs_info->fs_devices->devices;
 	list_for_each_entry(device, devices, dev_list) {
 		old_size = device->total_bytes;
 		size_to_free = div_factor(old_size, 1);
 		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
 		if (!device->writeable ||
-		    device->total_bytes - device->bytes_used > size_to_free)
+		    device->total_bytes - device->bytes_used > size_to_free ||
+		    device->is_tgtdev_for_dev_replace)
 			continue;
 
 		ret = btrfs_shrink_device(device, old_size - size_to_free);
@@ -2137,11 +2961,23 @@
 		ret = -ENOMEM;
 		goto error;
 	}
+
+	/* zero out stat counters */
+	spin_lock(&fs_info->balance_lock);
+	memset(&bctl->stat, 0, sizeof(bctl->stat));
+	spin_unlock(&fs_info->balance_lock);
+again:
 	key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
 	key.offset = (u64)-1;
 	key.type = BTRFS_CHUNK_ITEM_KEY;
 
 	while (1) {
+		if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
+		    atomic_read(&fs_info->balance_cancel_req)) {
+			ret = -ECANCELED;
+			goto error;
+		}
+
 		ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
 		if (ret < 0)
 			goto error;
@@ -2151,38 +2987,734 @@
 		 * failed
 		 */
 		if (ret == 0)
-			break;
+			BUG(); /* FIXME break ? */
 
 		ret = btrfs_previous_item(chunk_root, path, 0,
 					  BTRFS_CHUNK_ITEM_KEY);
-		if (ret)
+		if (ret) {
+			ret = 0;
 			break;
+		}
+
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
 
-		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-				      path->slots[0]);
 		if (found_key.objectid != key.objectid)
 			break;
 
-		/* chunk zero is special */
-		if (found_key.offset == 0)
-			break;
+		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+
+		if (!counting) {
+			spin_lock(&fs_info->balance_lock);
+			bctl->stat.considered++;
+			spin_unlock(&fs_info->balance_lock);
+		}
 
+		ret = should_balance_chunk(chunk_root, leaf, chunk,
+					   found_key.offset);
 		btrfs_release_path(path);
+		if (!ret)
+			goto loop;
+
+		if (counting) {
+			spin_lock(&fs_info->balance_lock);
+			bctl->stat.expected++;
+			spin_unlock(&fs_info->balance_lock);
+			goto loop;
+		}
+
 		ret = btrfs_relocate_chunk(chunk_root,
 					   chunk_root->root_key.objectid,
 					   found_key.objectid,
 					   found_key.offset);
 		if (ret && ret != -ENOSPC)
 			goto error;
+		if (ret == -ENOSPC) {
+			enospc_errors++;
+		} else {
+			spin_lock(&fs_info->balance_lock);
+			bctl->stat.completed++;
+			spin_unlock(&fs_info->balance_lock);
+		}
+loop:
+		if (found_key.offset == 0)
+			break;
 		key.offset = found_key.offset - 1;
 	}
-	ret = 0;
+
+	if (counting) {
+		btrfs_release_path(path);
+		counting = false;
+		goto again;
+	}
 error:
 	btrfs_free_path(path);
-	mutex_unlock(&dev_root->fs_info->volume_mutex);
+	if (enospc_errors) {
+		printk(KERN_INFO "btrfs: %d enospc errors during balance\n",
+		       enospc_errors);
+		if (!ret)
+			ret = -ENOSPC;
+	}
+
+	return ret;
+}
+
+/**
+ * alloc_profile_is_valid - see if a given profile is valid and reduced
+ * @flags: profile to validate
+ * @extended: if true @flags is treated as an extended profile
+ */
+static int alloc_profile_is_valid(u64 flags, int extended)
+{
+	u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
+			       BTRFS_BLOCK_GROUP_PROFILE_MASK);
+
+	flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
+
+	/* 1) check that all other bits are zeroed */
+	if (flags & ~mask)
+		return 0;
+
+	/* 2) see if profile is reduced */
+	if (flags == 0)
+		return !extended; /* "0" is valid for usual profiles */
+
+	/* true if exactly one bit set */
+	return (flags & (flags - 1)) == 0;
+}
+
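Note on alloc_profile_is_valid() above: the final return relies on the classic bit trick that x & (x - 1) clears the lowest set bit, so the expression is zero exactly when at most one bit remains set (the flags == 0 case was handled separately). Two compile-time checks of the raw test, for reference (BUILD_BUG_ON() would sit inside a function):

	BUILD_BUG_ON((0x08 & (0x08 - 1)) != 0);	/* one bit set  -> reduced profile */
	BUILD_BUG_ON((0x0a & (0x0a - 1)) == 0);	/* two bits set -> not reduced     */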
+static inline int balance_need_close(struct btrfs_fs_info *fs_info)
+{
+	/* cancel requested || normal exit path */
+	return atomic_read(&fs_info->balance_cancel_req) ||
+		(atomic_read(&fs_info->balance_pause_req) == 0 &&
+		 atomic_read(&fs_info->balance_cancel_req) == 0);
+}
+
+static void __cancel_balance(struct btrfs_fs_info *fs_info)
+{
+	int ret;
+
+	unset_balance_control(fs_info);
+	ret = del_balance_item(fs_info->tree_root);
+	if (ret)
+		btrfs_std_error(fs_info, ret);
+
+	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+}
+
+/*
+ * Should be called with both balance and volume mutexes held
+ */
+int btrfs_balance(struct btrfs_balance_control *bctl,
+		  struct btrfs_ioctl_balance_args *bargs)
+{
+	struct btrfs_fs_info *fs_info = bctl->fs_info;
+	u64 allowed;
+	int mixed = 0;
+	int ret;
+	u64 num_devices;
+	unsigned seq;
+
+	if (btrfs_fs_closing(fs_info) ||
+	    atomic_read(&fs_info->balance_pause_req) ||
+	    atomic_read(&fs_info->balance_cancel_req)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	allowed = btrfs_super_incompat_flags(fs_info->super_copy);
+	if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
+		mixed = 1;
+
+	/*
+	 * In case of mixed groups both data and meta should be picked,
+	 * and identical options should be given for both of them.
+	 */
+	allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
+	if (mixed && (bctl->flags & allowed)) {
+		if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
+		    !(bctl->flags & BTRFS_BALANCE_METADATA) ||
+		    memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
+			printk(KERN_ERR "btrfs: with mixed groups data and "
+			       "metadata balance options must be the same\n");
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	num_devices = fs_info->fs_devices->num_devices;
+	btrfs_dev_replace_lock(&fs_info->dev_replace);
+	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+		BUG_ON(num_devices < 1);
+		num_devices--;
+	}
+	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+	allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
+	if (num_devices == 1)
+		allowed |= BTRFS_BLOCK_GROUP_DUP;
+	else if (num_devices > 1)
+		allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
+	if (num_devices > 2)
+		allowed |= BTRFS_BLOCK_GROUP_RAID5;
+	if (num_devices > 3)
+		allowed |= (BTRFS_BLOCK_GROUP_RAID10 |
+			    BTRFS_BLOCK_GROUP_RAID6);
+	if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+	    (!alloc_profile_is_valid(bctl->data.target, 1) ||
+	     (bctl->data.target & ~allowed))) {
+		printk(KERN_ERR "btrfs: unable to start balance with target "
+		       "data profile %llu\n",
+		       bctl->data.target);
+		ret = -EINVAL;
+		goto out;
+	}
+	if ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+	    (!alloc_profile_is_valid(bctl->meta.target, 1) ||
+	     (bctl->meta.target & ~allowed))) {
+		printk(KERN_ERR "btrfs: unable to start balance with target "
+		       "metadata profile %llu\n",
+		       bctl->meta.target);
+		ret = -EINVAL;
+		goto out;
+	}
+	if ((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+	    (!alloc_profile_is_valid(bctl->sys.target, 1) ||
+	     (bctl->sys.target & ~allowed))) {
+		printk(KERN_ERR "btrfs: unable to start balance with target "
+		       "system profile %llu\n",
+		       bctl->sys.target);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* allow dup'ed data chunks only in mixed mode */
+	if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+	    (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
+		printk(KERN_ERR "btrfs: dup for data is not allowed\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* allow to reduce meta or sys integrity only if force set */
+	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+			BTRFS_BLOCK_GROUP_RAID10 |
+			BTRFS_BLOCK_GROUP_RAID5 |
+			BTRFS_BLOCK_GROUP_RAID6;
+	do {
+		seq = read_seqbegin(&fs_info->profiles_lock);
+
+		if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_system_alloc_bits & allowed) &&
+		     !(bctl->sys.target & allowed)) ||
+		    ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
+		     (fs_info->avail_metadata_alloc_bits & allowed) &&
+		     !(bctl->meta.target & allowed))) {
+			if (bctl->flags & BTRFS_BALANCE_FORCE) {
+				printk(KERN_INFO "btrfs: force reducing metadata "
+				       "integrity\n");
+			} else {
+				printk(KERN_ERR "btrfs: balance will reduce metadata "
+				       "integrity, use force if you want this\n");
+				ret = -EINVAL;
+				goto out;
+			}
+		}
+	} while (read_seqretry(&fs_info->profiles_lock, seq));
+
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		int num_tolerated_disk_barrier_failures;
+		u64 target = bctl->sys.target;
+
+		num_tolerated_disk_barrier_failures =
+			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+		if (num_tolerated_disk_barrier_failures > 0 &&
+		    (target &
+		     (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+		      BTRFS_AVAIL_ALLOC_BIT_SINGLE)))
+			num_tolerated_disk_barrier_failures = 0;
+		else if (num_tolerated_disk_barrier_failures > 1 &&
+			 (target &
+			  (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)))
+			num_tolerated_disk_barrier_failures = 1;
+
+		fs_info->num_tolerated_disk_barrier_failures =
+			num_tolerated_disk_barrier_failures;
+	}
+
+	ret = insert_balance_item(fs_info->tree_root, bctl);
+	if (ret && ret != -EEXIST)
+		goto out;
+
+	if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
+		BUG_ON(ret == -EEXIST);
+		set_balance_control(bctl);
+	} else {
+		BUG_ON(ret != -EEXIST);
+		spin_lock(&fs_info->balance_lock);
+		update_balance_args(bctl);
+		spin_unlock(&fs_info->balance_lock);
+	}
+
+	atomic_inc(&fs_info->balance_running);
+	mutex_unlock(&fs_info->balance_mutex);
+
+	ret = __btrfs_balance(fs_info);
+
+	mutex_lock(&fs_info->balance_mutex);
+	atomic_dec(&fs_info->balance_running);
+
+	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
+		fs_info->num_tolerated_disk_barrier_failures =
+			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info);
+	}
+
+	if (bargs) {
+		memset(bargs, 0, sizeof(*bargs));
+		update_ioctl_balance_args(fs_info, 0, bargs);
+	}
+
+	if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
+	    balance_need_close(fs_info)) {
+		__cancel_balance(fs_info);
+	}
+
+	wake_up(&fs_info->balance_wait_q);
+
+	return ret;
+out:
+	if (bctl->flags & BTRFS_BALANCE_RESUME)
+		__cancel_balance(fs_info);
+	else {
+		kfree(bctl);
+		atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
+	}
+	return ret;
+}
+
+static int balance_kthread(void *data)
+{
+	struct btrfs_fs_info *fs_info = data;
+	int ret = 0;
+
+	mutex_lock(&fs_info->volume_mutex);
+	mutex_lock(&fs_info->balance_mutex);
+
+	if (fs_info->balance_ctl) {
+		printk(KERN_INFO "btrfs: continuing balance\n");
+		ret = btrfs_balance(fs_info->balance_ctl, NULL);
+	}
+
+	mutex_unlock(&fs_info->balance_mutex);
+	mutex_unlock(&fs_info->volume_mutex);
+
+	return ret;
+}
+
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
+{
+	struct task_struct *tsk;
+
+	spin_lock(&fs_info->balance_lock);
+	if (!fs_info->balance_ctl) {
+		spin_unlock(&fs_info->balance_lock);
+		return 0;
+	}
+	spin_unlock(&fs_info->balance_lock);
+
+	if (btrfs_test_opt(fs_info->tree_root, SKIP_BALANCE)) {
+		printk(KERN_INFO "btrfs: force skipping balance\n");
+		return 0;
+	}
+
+	tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
+	return PTR_RET(tsk);
+}
+
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_balance_control *bctl;
+	struct btrfs_balance_item *item;
+	struct btrfs_disk_balance_args disk_bargs;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_BALANCE_OBJECTID;
+	key.type = BTRFS_BALANCE_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) { /* ret = -ENOENT; */
+		ret = 0;
+		goto out;
+	}
+
+	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+	if (!bctl) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
+
+	bctl->fs_info = fs_info;
+	bctl->flags = btrfs_balance_flags(leaf, item);
+	bctl->flags |= BTRFS_BALANCE_RESUME;
+
+	btrfs_balance_data(leaf, item, &disk_bargs);
+	btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
+	btrfs_balance_meta(leaf, item, &disk_bargs);
+	btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
+	btrfs_balance_sys(leaf, item, &disk_bargs);
+	btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
+
+	WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1));
+
+	mutex_lock(&fs_info->volume_mutex);
+	mutex_lock(&fs_info->balance_mutex);
+
+	set_balance_control(bctl);
+
+	mutex_unlock(&fs_info->balance_mutex);
+	mutex_unlock(&fs_info->volume_mutex);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
+{
+	int ret = 0;
+
+	mutex_lock(&fs_info->balance_mutex);
+	if (!fs_info->balance_ctl) {
+		mutex_unlock(&fs_info->balance_mutex);
+		return -ENOTCONN;
+	}
+
+	if (atomic_read(&fs_info->balance_running)) {
+		atomic_inc(&fs_info->balance_pause_req);
+		mutex_unlock(&fs_info->balance_mutex);
+
+		wait_event(fs_info->balance_wait_q,
+			   atomic_read(&fs_info->balance_running) == 0);
+
+		mutex_lock(&fs_info->balance_mutex);
+		/* we are good with balance_ctl ripped off from under us */
+		BUG_ON(atomic_read(&fs_info->balance_running));
+		atomic_dec(&fs_info->balance_pause_req);
+	} else {
+		ret = -ENOTCONN;
+	}
+
+	mutex_unlock(&fs_info->balance_mutex);
+	return ret;
+}
+
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
+{
+	mutex_lock(&fs_info->balance_mutex);
+	if (!fs_info->balance_ctl) {
+		mutex_unlock(&fs_info->balance_mutex);
+		return -ENOTCONN;
+	}
+
+	atomic_inc(&fs_info->balance_cancel_req);
+	/*
+	 * if we are running just wait and return, balance item is
+	 * deleted in btrfs_balance in this case
+	 */
+	if (atomic_read(&fs_info->balance_running)) {
+		mutex_unlock(&fs_info->balance_mutex);
+		wait_event(fs_info->balance_wait_q,
+			   atomic_read(&fs_info->balance_running) == 0);
+		mutex_lock(&fs_info->balance_mutex);
+	} else {
+		/* __cancel_balance needs volume_mutex */
+		mutex_unlock(&fs_info->balance_mutex);
+		mutex_lock(&fs_info->volume_mutex);
+		mutex_lock(&fs_info->balance_mutex);
+
+		if (fs_info->balance_ctl)
+			__cancel_balance(fs_info);
+
+		mutex_unlock(&fs_info->volume_mutex);
+	}
+
+	BUG_ON(fs_info->balance_ctl || atomic_read(&fs_info->balance_running));
+	atomic_dec(&fs_info->balance_cancel_req);
+	mutex_unlock(&fs_info->balance_mutex);
+	return 0;
+}
+
+static int btrfs_uuid_scan_kthread(void *data)
+{
+	struct btrfs_fs_info *fs_info = data;
+	struct btrfs_root *root = fs_info->tree_root;
+	struct btrfs_key key;
+	struct btrfs_key max_key;
+	struct btrfs_path *path = NULL;
+	int ret = 0;
+	struct extent_buffer *eb;
+	int slot;
+	struct btrfs_root_item root_item;
+	u32 item_size;
+	struct btrfs_trans_handle *trans = NULL;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = 0;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = 0;
+
+	max_key.objectid = (u64)-1;
+	max_key.type = BTRFS_ROOT_ITEM_KEY;
+	max_key.offset = (u64)-1;
+
+	path->keep_locks = 1;
+
+	while (1) {
+		ret = btrfs_search_forward(root, &key, &max_key, path, 0);
+		if (ret) {
+			if (ret > 0)
+				ret = 0;
+			break;
+		}
+
+		if (key.type != BTRFS_ROOT_ITEM_KEY ||
+		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
+		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
+		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
+			goto skip;
+
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		item_size = btrfs_item_size_nr(eb, slot);
+		if (item_size < sizeof(root_item))
+			goto skip;
+
+		read_extent_buffer(eb, &root_item,
+				   btrfs_item_ptr_offset(eb, slot),
+				   (int)sizeof(root_item));
+		if (btrfs_root_refs(&root_item) == 0)
+			goto skip;
+
+		if (!btrfs_is_empty_uuid(root_item.uuid) ||
+		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
+			if (trans)
+				goto update_tree;
+
+			btrfs_release_path(path);
+			/*
+			 * 1 - subvol uuid item
+			 * 1 - received_subvol uuid item
+			 */
+			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				break;
+			}
+			continue;
+		} else {
+			goto skip;
+		}
+update_tree:
+		if (!btrfs_is_empty_uuid(root_item.uuid)) {
+			ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+						  root_item.uuid,
+						  BTRFS_UUID_KEY_SUBVOL,
+						  key.objectid);
+			if (ret < 0) {
+				pr_warn("btrfs: uuid_tree_add failed %d\n",
+					ret);
+				break;
+			}
+		}
+
+		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
+			ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+						  root_item.received_uuid,
+						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+						  key.objectid);
+			if (ret < 0) {
+				pr_warn("btrfs: uuid_tree_add failed %d\n",
+					ret);
+				break;
+			}
+		}
+
+skip:
+		if (trans) {
+			ret = btrfs_end_transaction(trans, fs_info->uuid_root);
+			trans = NULL;
+			if (ret)
+				break;
+		}
+
+		btrfs_release_path(path);
+		if (key.offset < (u64)-1) {
+			key.offset++;
+		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
+			key.offset = 0;
+			key.type = BTRFS_ROOT_ITEM_KEY;
+		} else if (key.objectid < (u64)-1) {
+			key.offset = 0;
+			key.type = BTRFS_ROOT_ITEM_KEY;
+			key.objectid++;
+		} else {
+			break;
+		}
+		cond_resched();
+	}
+
+out:
+	btrfs_free_path(path);
+	if (trans && !IS_ERR(trans))
+		btrfs_end_transaction(trans, fs_info->uuid_root);
+	if (ret)
+		pr_warn("btrfs: btrfs_uuid_scan_kthread failed %d\n", ret);
+	else
+		fs_info->update_uuid_tree_gen = 1;
+	up(&fs_info->uuid_tree_rescan_sem);
+	return 0;
+}
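The scan loop above advances its search cursor field by field: offset first, then type, then objectid, stopping once all three have reached their maxima. A minimal user-space sketch of that cursor advance, assuming only the value of BTRFS_ROOT_ITEM_KEY from the on-disk format (all demo_* names are made up):

/*
 * Cursor advance used by the scan loop: bump offset, then type, then
 * objectid; report 0 once the whole key space has been walked.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_ROOT_ITEM_KEY 132	/* value of BTRFS_ROOT_ITEM_KEY upstream */

struct demo_key {
	uint64_t objectid;
	uint8_t type;
	uint64_t offset;
};

static int demo_advance_key(struct demo_key *key)
{
	if (key->offset < UINT64_MAX) {
		key->offset++;
	} else if (key->type < DEMO_ROOT_ITEM_KEY) {
		key->offset = 0;
		key->type = DEMO_ROOT_ITEM_KEY;
	} else if (key->objectid < UINT64_MAX) {
		key->offset = 0;
		key->type = DEMO_ROOT_ITEM_KEY;
		key->objectid++;
	} else {
		return 0;	/* nothing left to scan */
	}
	return 1;
}

int main(void)
{
	struct demo_key key = { .objectid = 256, .type = DEMO_ROOT_ITEM_KEY,
				.offset = UINT64_MAX };
	int i;

	/* first step rolls over to objectid 257, later steps bump offset */
	for (i = 0; i < 3 && demo_advance_key(&key); i++)
		printf("objectid=%llu type=%u offset=%llu\n",
		       (unsigned long long)key.objectid, (unsigned)key.type,
		       (unsigned long long)key.offset);
	return 0;
}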
+
+/*
+ * Callback for btrfs_uuid_tree_iterate().
+ * returns:
+ * 0	check succeeded, the entry is not outdated.
+ * < 0	if an error occurred.
+ * > 0	if the check failed, which means the caller shall remove the entry.
+ */
+static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
+				       u8 *uuid, u8 type, u64 subid)
+{
+	struct btrfs_key key;
+	int ret = 0;
+	struct btrfs_root *subvol_root;
+
+	if (type != BTRFS_UUID_KEY_SUBVOL &&
+	    type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+		goto out;
+
+	key.objectid = subid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(subvol_root)) {
+		ret = PTR_ERR(subvol_root);
+		if (ret == -ENOENT)
+			ret = 1;
+		goto out;
+	}
+
+	switch (type) {
+	case BTRFS_UUID_KEY_SUBVOL:
+		if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
+			ret = 1;
+		break;
+	case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+		if (memcmp(uuid, subvol_root->root_item.received_uuid,
+			   BTRFS_UUID_SIZE))
+			ret = 1;
+		break;
+	}
+
+out:
 	return ret;
 }
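The tri-state return value documented for this callback (0 keep, < 0 abort, > 0 delete) is what btrfs_uuid_tree_iterate() acts on. A small stand-alone sketch of a consumer of that contract, with demo_* names standing in for the real iterator and entries:

/*
 * Consumer of the 0 / <0 / >0 contract above: keep the entry, abort the
 * walk, or delete the entry.  Everything here is a user-space stand-in.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_entry {
	uint64_t subid;
	int stale;		/* pretend the subvolume lookup disagreed */
};

typedef int (*demo_check_fn)(const struct demo_entry *e);

static int demo_check(const struct demo_entry *e)
{
	if (e->subid == 0)
		return -22;	/* hard error, like -EINVAL */
	return e->stale ? 1 : 0;
}

static int demo_iterate(struct demo_entry *tab, int n, demo_check_fn check)
{
	int i, ret;

	for (i = 0; i < n; i++) {
		ret = check(&tab[i]);
		if (ret < 0)
			return ret;		/* stop the whole walk */
		if (ret > 0)
			printf("drop stale entry for subvol %llu\n",
			       (unsigned long long)tab[i].subid);
	}
	return 0;
}

int main(void)
{
	struct demo_entry tab[] = {
		{ .subid = 257, .stale = 0 },	/* kept */
		{ .subid = 258, .stale = 1 },	/* reported as droppable */
	};

	return demo_iterate(tab, 2, demo_check) < 0;
}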
 
+static int btrfs_uuid_rescan_kthread(void *data)
+{
+	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
+	int ret;
+
+	/*
+	 * 1st step is to iterate through the existing UUID tree and
+	 * to delete all entries that contain outdated data.
+	 * 2nd step is to add all missing entries to the UUID tree.
+	 */
+	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
+	if (ret < 0) {
+		pr_warn("btrfs: iterating uuid_tree failed %d\n", ret);
+		up(&fs_info->uuid_tree_rescan_sem);
+		return ret;
+	}
+	return btrfs_uuid_scan_kthread(data);
+}
+
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *uuid_root;
+	struct task_struct *task;
+	int ret;
+
+	/*
+	 * 1 - root node
+	 * 1 - root item
+	 */
+	trans = btrfs_start_transaction(tree_root, 2);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	uuid_root = btrfs_create_tree(trans, fs_info,
+				      BTRFS_UUID_TREE_OBJECTID);
+	if (IS_ERR(uuid_root)) {
+		btrfs_abort_transaction(trans, tree_root,
+					PTR_ERR(uuid_root));
+		return PTR_ERR(uuid_root);
+	}
+
+	fs_info->uuid_root = uuid_root;
+
+	ret = btrfs_commit_transaction(trans, tree_root);
+	if (ret)
+		return ret;
+
+	down(&fs_info->uuid_tree_rescan_sem);
+	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
+	if (IS_ERR(task)) {
+		/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
+		pr_warn("btrfs: failed to start uuid_scan task\n");
+		up(&fs_info->uuid_tree_rescan_sem);
+		return PTR_ERR(task);
+	}
+
+	return 0;
+}
+
+int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+	struct task_struct *task;
+
+	down(&fs_info->uuid_tree_rescan_sem);
+	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
+	if (IS_ERR(task)) {
+		/* fs_info->update_uuid_tree_gen remains 0 in all error cases */
+		pr_warn("btrfs: failed to start uuid_rescan task\n");
+		up(&fs_info->uuid_tree_rescan_sem);
+		return PTR_ERR(task);
+	}
+
+	return 0;
+}
+
 /*
  * shrinking a device means finding all of the device extents past
  * the new size, and then following the back refs to the chunks.
@@ -2209,7 +3741,7 @@
 	u64 old_size = device->total_bytes;
 	u64 diff = device->total_bytes - new_size;
 
-	if (new_size >= device->total_bytes)
+	if (device->is_tgtdev_for_dev_replace)
 		return -EINVAL;
 
 	path = btrfs_alloc_path();
@@ -2234,7 +3766,7 @@
 	key.offset = (u64)-1;
 	key.type = BTRFS_DEV_EXTENT_KEY;
 
-	while (1) {
+	do {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto done;
@@ -2276,8 +3808,7 @@
 			goto done;
 		if (ret == -ENOSPC)
 			failed++;
-		key.offset -= 1;
-	}
+	} while (key.offset-- > 0);
 
 	if (failed && !retried) {
 		failed = 0;
@@ -2323,8 +3854,7 @@
 	return ret;
 }
 
-static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root,
+static int btrfs_add_system_chunk(struct btrfs_root *root,
 			   struct btrfs_key *key,
 			   struct btrfs_chunk *chunk, int item_size)
 {
@@ -2366,11 +3896,82 @@
 	return 0;
 }
 
+static struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
+	[BTRFS_RAID_RAID10] = {
+		.sub_stripes	= 2,
+		.dev_stripes	= 1,
+		.devs_max	= 0,	/* 0 == as many as possible */
+		.devs_min	= 4,
+		.devs_increment	= 2,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID1] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 2,
+		.devs_min	= 2,
+		.devs_increment	= 2,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_DUP] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 2,
+		.devs_max	= 1,
+		.devs_min	= 1,
+		.devs_increment	= 1,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID0] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 2,
+		.devs_increment	= 1,
+		.ncopies	= 1,
+	},
+	[BTRFS_RAID_SINGLE] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 1,
+		.devs_min	= 1,
+		.devs_increment	= 1,
+		.ncopies	= 1,
+	},
+	[BTRFS_RAID_RAID5] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 2,
+		.devs_increment	= 1,
+		.ncopies	= 2,
+	},
+	[BTRFS_RAID_RAID6] = {
+		.sub_stripes	= 1,
+		.dev_stripes	= 1,
+		.devs_max	= 0,
+		.devs_min	= 3,
+		.devs_increment	= 1,
+		.ncopies	= 3,
+	},
+};
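The table above replaces the open-coded per-profile branches with indexed parameters. A stand-alone sketch of that table-driven style, showing how the number of data-bearing stripes falls out of ncopies (mirroring) or, for RAID5/6, out of the parity count; the nparity field is an illustrative simplification and not a field of the kernel's btrfs_raid_attr:

/* Table-driven profile parameters; everything demo_* is made up. */
#include <stdio.h>

enum { DEMO_RAID10, DEMO_RAID1, DEMO_DUP, DEMO_RAID0, DEMO_SINGLE,
       DEMO_RAID5, DEMO_RAID6, DEMO_NR_TYPES };

struct demo_raid_attr {
	int devs_min;
	int ncopies;
	int nparity;
};

static const struct demo_raid_attr demo_raid[DEMO_NR_TYPES] = {
	[DEMO_RAID10] = { .devs_min = 4, .ncopies = 2, .nparity = 0 },
	[DEMO_RAID1]  = { .devs_min = 2, .ncopies = 2, .nparity = 0 },
	[DEMO_DUP]    = { .devs_min = 1, .ncopies = 2, .nparity = 0 },
	[DEMO_RAID0]  = { .devs_min = 2, .ncopies = 1, .nparity = 0 },
	[DEMO_SINGLE] = { .devs_min = 1, .ncopies = 1, .nparity = 0 },
	[DEMO_RAID5]  = { .devs_min = 2, .ncopies = 2, .nparity = 1 },
	[DEMO_RAID6]  = { .devs_min = 3, .ncopies = 3, .nparity = 2 },
};

/* stripes that hold file data rather than copies or parity */
static int demo_data_stripes(int index, int num_stripes)
{
	if (demo_raid[index].nparity)
		return num_stripes - demo_raid[index].nparity;
	return num_stripes / demo_raid[index].ncopies;
}

int main(void)
{
	printf("RAID6, 6 stripes: %d data stripes\n",
	       demo_data_stripes(DEMO_RAID6, 6));	/* 4 */
	printf("RAID10, 4 stripes: %d data stripes\n",
	       demo_data_stripes(DEMO_RAID10, 4));	/* 2 */
	return 0;
}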
+
+static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
+{
+	/* TODO allow them to set a preferred stripe size */
+	return 64 * 1024;
+}
+
+static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
+{
+	if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)))
+		return;
+
+	btrfs_set_fs_incompat(info, RAID56);
+}
+
 static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *extent_root,
-			       struct map_lookup **map_ret,
-			       u64 *num_bytes_out, u64 *stripe_size_out,
-			       u64 start, u64 type)
+			       struct btrfs_root *extent_root, u64 start,
+			       u64 type)
 {
 	struct btrfs_fs_info *info = extent_root->fs_info;
 	struct btrfs_fs_devices *fs_devices = info->fs_devices;
@@ -2381,6 +3982,8 @@
 	struct btrfs_device_info *devices_info = NULL;
 	u64 total_avail;
 	int num_stripes;	/* total number of stripes to allocate */
+	int data_stripes;	/* number of stripes that count for
+				   block group size */
 	int sub_stripes;	/* sub_stripes info for map */
 	int dev_stripes;	/* stripes per dev */
 	int devs_max;		/* max devs to use */
@@ -2392,59 +3995,38 @@
 	u64 max_chunk_size;
 	u64 stripe_size;
 	u64 num_bytes;
+	u64 raid_stripe_len = BTRFS_STRIPE_LEN;
 	int ndevs;
 	int i;
 	int j;
+	int index;
 
-	if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
-	    (type & BTRFS_BLOCK_GROUP_DUP)) {
-		WARN_ON(1);
-		type &= ~BTRFS_BLOCK_GROUP_DUP;
-	}
+	BUG_ON(!alloc_profile_is_valid(type, 0));
 
 	if (list_empty(&fs_devices->alloc_list))
 		return -ENOSPC;
 
-	sub_stripes = 1;
-	dev_stripes = 1;
-	devs_increment = 1;
-	ncopies = 1;
-	devs_max = 0;	/* 0 == as many as possible */
-	devs_min = 1;
-
-	/*
-	 * define the properties of each RAID type.
-	 * FIXME: move this to a global table and use it in all RAID
-	 * calculation code
-	 */
-	if (type & (BTRFS_BLOCK_GROUP_DUP)) {
-		dev_stripes = 2;
-		ncopies = 2;
-		devs_max = 1;
-	} else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
-		devs_min = 2;
-	} else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
-		devs_increment = 2;
-		ncopies = 2;
-		devs_max = 2;
-		devs_min = 2;
-	} else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
-		sub_stripes = 2;
-		devs_increment = 2;
-		ncopies = 2;
-		devs_min = 4;
-	} else {
-		devs_max = 1;
-	}
+	index = __get_raid_index(type);
+
+	sub_stripes = btrfs_raid_array[index].sub_stripes;
+	dev_stripes = btrfs_raid_array[index].dev_stripes;
+	devs_max = btrfs_raid_array[index].devs_max;
+	devs_min = btrfs_raid_array[index].devs_min;
+	devs_increment = btrfs_raid_array[index].devs_increment;
+	ncopies = btrfs_raid_array[index].ncopies;
 
 	if (type & BTRFS_BLOCK_GROUP_DATA) {
 		max_stripe_size = 1024 * 1024 * 1024;
 		max_chunk_size = 10 * max_stripe_size;
 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
-		max_stripe_size = 256 * 1024 * 1024;
+		/* for larger filesystems, use larger metadata chunks */
+		if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
+			max_stripe_size = 1024 * 1024 * 1024;
+		else
+			max_stripe_size = 256 * 1024 * 1024;
 		max_chunk_size = max_stripe_size;
 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		max_stripe_size = 8 * 1024 * 1024;
+		max_stripe_size = 32 * 1024 * 1024;
 		max_chunk_size = 2 * max_stripe_size;
 	} else {
 		printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
@@ -2478,13 +4060,13 @@
 		cur = cur->next;
 
 		if (!device->writeable) {
-			printk(KERN_ERR
+			WARN(1, KERN_ERR
 			       "btrfs: read-only device in alloc_list\n");
-			WARN_ON(1);
 			continue;
 		}
 
-		if (!device->in_fs_metadata)
+		if (!device->in_fs_metadata ||
+		    device->is_tgtdev_for_dev_replace)
 			continue;
 
 		if (device->total_bytes > device->bytes_used)
@@ -2508,6 +4090,11 @@
 		if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
 			continue;
 
+		if (ndevs == fs_devices->rw_devices) {
+			WARN(1, "%s: found more than %llu devices\n",
+			     __func__, fs_devices->rw_devices);
+			break;
+		}
 		devices_info[ndevs].dev_offset = dev_offset;
 		devices_info[ndevs].max_avail = max_avail;
 		devices_info[ndevs].total_avail = total_avail;
@@ -2538,14 +4125,48 @@
 	stripe_size = devices_info[ndevs-1].max_avail;
 	num_stripes = ndevs * dev_stripes;
 
-	if (stripe_size * num_stripes > max_chunk_size * ncopies) {
-		stripe_size = max_chunk_size * ncopies;
-		do_div(stripe_size, num_stripes);
+	/*
+	 * this will have to be fixed for RAID1 and RAID10 over
+	 * more drives
+	 */
+	data_stripes = num_stripes / ncopies;
+
+	if (type & BTRFS_BLOCK_GROUP_RAID5) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 1,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 1;
+	}
+	if (type & BTRFS_BLOCK_GROUP_RAID6) {
+		raid_stripe_len = find_raid56_stripe_len(ndevs - 2,
+				 btrfs_super_stripesize(info->super_copy));
+		data_stripes = num_stripes - 2;
+	}
+
+	/*
+	 * Use the number of data stripes to figure out how big this chunk
+	 * is really going to be in terms of logical address space,
+	 * and compare that answer with the max chunk size
+	 */
+	if (stripe_size * data_stripes > max_chunk_size) {
+		u64 mask = (1ULL << 24) - 1;
+		stripe_size = max_chunk_size;
+		do_div(stripe_size, data_stripes);
+
+		/* bump the answer up to a 16MB boundary */
+		stripe_size = (stripe_size + mask) & ~mask;
+
+		/* but don't go higher than the limits we found
+		 * while searching for free extents
+		 */
+		if (stripe_size > devices_info[ndevs-1].max_avail)
+			stripe_size = devices_info[ndevs-1].max_avail;
 	}
 
 	do_div(stripe_size, dev_stripes);
-	do_div(stripe_size, BTRFS_STRIPE_LEN);
-	stripe_size *= BTRFS_STRIPE_LEN;
+
+	/* align to BTRFS_STRIPE_LEN */
+	do_div(stripe_size, raid_stripe_len);
+	stripe_size *= raid_stripe_len;
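The sizing logic above caps stripe_size so that stripe_size * data_stripes stays within max_chunk_size, rounds the quotient up to a 16MB boundary, refuses to exceed the smallest free area found, and finally aligns down to the raid stripe length. The same arithmetic as a worked user-space example with made-up numbers (dev_stripes is assumed to be 1 and left out):

/* Worked example of the chunk-size clamp and 16MB rounding above. */
#include <stdio.h>
#include <stdint.h>

static uint64_t demo_stripe_size(uint64_t smallest_free, int data_stripes,
				 uint64_t max_chunk_size, uint64_t stripe_len)
{
	uint64_t mask = (1ULL << 24) - 1;		/* 16MB - 1 */
	uint64_t stripe_size = smallest_free;

	if (stripe_size * data_stripes > max_chunk_size) {
		stripe_size = max_chunk_size / data_stripes;
		stripe_size = (stripe_size + mask) & ~mask;	/* round up */
		if (stripe_size > smallest_free)
			stripe_size = smallest_free;
	}
	/* align down to the raid stripe length (64K in the common case) */
	stripe_size -= stripe_size % stripe_len;
	return stripe_size;
}

int main(void)
{
	/* 3 data stripes, 10GB chunk limit, 4GB free on the smallest device */
	uint64_t sz = demo_stripe_size(4ULL << 30, 3, 10ULL << 30, 64 * 1024);

	/* 10GB / 3, rounded up to a 16MB boundary, gives 3424 MiB */
	printf("stripe_size = %llu MiB\n", (unsigned long long)(sz >> 20));
	return 0;
}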
 
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
@@ -2563,17 +4184,13 @@
 		}
 	}
 	map->sector_size = extent_root->sectorsize;
-	map->stripe_len = BTRFS_STRIPE_LEN;
-	map->io_align = BTRFS_STRIPE_LEN;
-	map->io_width = BTRFS_STRIPE_LEN;
+	map->stripe_len = raid_stripe_len;
+	map->io_align = raid_stripe_len;
+	map->io_width = raid_stripe_len;
 	map->type = type;
 	map->sub_stripes = sub_stripes;
 
-	*map_ret = map;
-	num_bytes = stripe_size * (num_stripes / ncopies);
-
-	*stripe_size_out = stripe_size;
-	*num_bytes_out = num_bytes;
+	num_bytes = stripe_size * data_stripes;
 
 	trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
 
@@ -2587,68 +4204,110 @@
 	em->len = num_bytes;
 	em->block_start = 0;
 	em->block_len = em->len;
+	em->orig_block_len = stripe_size;
 
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
 	write_lock(&em_tree->lock);
-	ret = add_extent_mapping(em_tree, em);
+	ret = add_extent_mapping(em_tree, em, 0);
+	if (!ret) {
+		list_add_tail(&em->list, &trans->transaction->pending_chunks);
+		atomic_inc(&em->refs);
+	}
 	write_unlock(&em_tree->lock);
-	BUG_ON(ret);
-	free_extent_map(em);
+	if (ret) {
+		free_extent_map(em);
+		goto error;
+	}
 
 	ret = btrfs_make_block_group(trans, extent_root, 0, type,
 				     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 				     start, num_bytes);
-	BUG_ON(ret);
-
-	for (i = 0; i < map->num_stripes; ++i) {
-		struct btrfs_device *device;
-		u64 dev_offset;
-
-		device = map->stripes[i].dev;
-		dev_offset = map->stripes[i].physical;
+	if (ret)
+		goto error_del_extent;
 
-		ret = btrfs_alloc_dev_extent(trans, device,
-				info->chunk_root->root_key.objectid,
-				BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-				start, dev_offset, stripe_size);
-		BUG_ON(ret);
-	}
+	free_extent_map(em);
+	check_raid56_incompat_flag(extent_root->fs_info, type);
 
 	kfree(devices_info);
 	return 0;
 
+error_del_extent:
+	write_lock(&em_tree->lock);
+	remove_extent_mapping(em_tree, em);
+	write_unlock(&em_tree->lock);
+
+	/* One for our allocation */
+	free_extent_map(em);
+	/* One for the tree reference */
+	free_extent_map(em);
 error:
 	kfree(map);
 	kfree(devices_info);
 	return ret;
 }
 
-static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 				struct btrfs_root *extent_root,
-				struct map_lookup *map, u64 chunk_offset,
-				u64 chunk_size, u64 stripe_size)
+				u64 chunk_offset, u64 chunk_size)
 {
-	u64 dev_offset;
 	struct btrfs_key key;
 	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
 	struct btrfs_device *device;
 	struct btrfs_chunk *chunk;
 	struct btrfs_stripe *stripe;
-	size_t item_size = btrfs_chunk_item_size(map->num_stripes);
-	int index = 0;
+	struct extent_map_tree *em_tree;
+	struct extent_map *em;
+	struct map_lookup *map;
+	size_t item_size;
+	u64 dev_offset;
+	u64 stripe_size;
+	int i = 0;
 	int ret;
 
+	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size);
+	read_unlock(&em_tree->lock);
+
+	if (!em) {
+		btrfs_crit(extent_root->fs_info, "unable to find logical "
+			   "%Lu len %Lu", chunk_offset, chunk_size);
+		return -EINVAL;
+	}
+
+	if (em->start != chunk_offset || em->len != chunk_size) {
+		btrfs_crit(extent_root->fs_info, "found a bad mapping, wanted"
+			  " %Lu-%Lu, found %Lu-%Lu\n", chunk_offset,
+			  chunk_size, em->start, em->len);
+		free_extent_map(em);
+		return -EINVAL;
+	}
+
+	map = (struct map_lookup *)em->bdev;
+	item_size = btrfs_chunk_item_size(map->num_stripes);
+	stripe_size = em->orig_block_len;
+
 	chunk = kzalloc(item_size, GFP_NOFS);
-	if (!chunk)
-		return -ENOMEM;
+	if (!chunk) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < map->num_stripes; i++) {
+		device = map->stripes[i].dev;
+		dev_offset = map->stripes[i].physical;
 
-	index = 0;
-	while (index < map->num_stripes) {
-		device = map->stripes[index].dev;
 		device->bytes_used += stripe_size;
 		ret = btrfs_update_device(trans, device);
-		BUG_ON(ret);
-		index++;
+		if (ret)
+			goto out;
+		ret = btrfs_alloc_dev_extent(trans, device,
+					     chunk_root->root_key.objectid,
+					     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
+					     chunk_offset, dev_offset,
+					     stripe_size);
+		if (ret)
+			goto out;
 	}
 
 	spin_lock(&extent_root->fs_info->free_chunk_lock);
@@ -2656,17 +4315,15 @@
 						   map->num_stripes);
 	spin_unlock(&extent_root->fs_info->free_chunk_lock);
 
-	index = 0;
 	stripe = &chunk->stripe;
-	while (index < map->num_stripes) {
-		device = map->stripes[index].dev;
-		dev_offset = map->stripes[index].physical;
+	for (i = 0; i < map->num_stripes; i++) {
+		device = map->stripes[i].dev;
+		dev_offset = map->stripes[i].physical;
 
 		btrfs_set_stack_stripe_devid(stripe, device->devid);
 		btrfs_set_stack_stripe_offset(stripe, dev_offset);
 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
 		stripe++;
-		index++;
 	}
 
 	btrfs_set_stack_chunk_length(chunk, chunk_size);
@@ -2684,16 +4341,19 @@
 	key.offset = chunk_offset;
 
 	ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
-	BUG_ON(ret);
-
-	if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk,
+	if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
+		/*
+		 * TODO: Cleanup of inserted chunk root in case of
+		 * failure.
+		 */
+		ret = btrfs_add_system_chunk(chunk_root, &key, chunk,
 					     item_size);
-		BUG_ON(ret);
 	}
 
+out:
 	kfree(chunk);
-	return 0;
+	free_extent_map(em);
+	return ret;
 }
 
 /*
@@ -2707,26 +4367,9 @@
 		      struct btrfs_root *extent_root, u64 type)
 {
 	u64 chunk_offset;
-	u64 chunk_size;
-	u64 stripe_size;
-	struct map_lookup *map;
-	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
-	int ret;
-
-	ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID,
-			      &chunk_offset);
-	if (ret)
-		return ret;
 
-	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
-				  &stripe_size, chunk_offset, type);
-	if (ret)
-		return ret;
-
-	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
-				   chunk_size, stripe_size);
-	BUG_ON(ret);
-	return 0;
+	chunk_offset = find_next_chunk(extent_root->fs_info);
+	return __btrfs_alloc_chunk(trans, extent_root, chunk_offset, type);
 }
 
 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans,
@@ -2735,61 +4378,32 @@
 {
 	u64 chunk_offset;
 	u64 sys_chunk_offset;
-	u64 chunk_size;
-	u64 sys_chunk_size;
-	u64 stripe_size;
-	u64 sys_stripe_size;
 	u64 alloc_profile;
-	struct map_lookup *map;
-	struct map_lookup *sys_map;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	int ret;
 
-	ret = find_next_chunk(fs_info->chunk_root,
-			      BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset);
+	chunk_offset = find_next_chunk(fs_info);
+	alloc_profile = btrfs_get_alloc_profile(extent_root, 0);
+	ret = __btrfs_alloc_chunk(trans, extent_root, chunk_offset,
+				  alloc_profile);
 	if (ret)
 		return ret;
 
-	alloc_profile = BTRFS_BLOCK_GROUP_METADATA |
-			(fs_info->metadata_alloc_profile &
-			 fs_info->avail_metadata_alloc_bits);
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
-	ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size,
-				  &stripe_size, chunk_offset, alloc_profile);
-	BUG_ON(ret);
-
-	sys_chunk_offset = chunk_offset + chunk_size;
-
-	alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM |
-			(fs_info->system_alloc_profile &
-			 fs_info->avail_system_alloc_bits);
-	alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile);
-
-	ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map,
-				  &sys_chunk_size, &sys_stripe_size,
-				  sys_chunk_offset, alloc_profile);
-	BUG_ON(ret);
+	sys_chunk_offset = find_next_chunk(root->fs_info);
+	alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0);
+	ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset,
+				  alloc_profile);
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
 
 	ret = btrfs_add_device(trans, fs_info->chunk_root, device);
-	BUG_ON(ret);
-
-	/*
-	 * Modifying chunk tree needs allocating new blocks from both
-	 * system block group and metadata block group. So we only can
-	 * do operations require modifying the chunk tree after both
-	 * block groups were created.
-	 */
-	ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset,
-				   chunk_size, stripe_size);
-	BUG_ON(ret);
-
-	ret = __finish_chunk_alloc(trans, extent_root, sys_map,
-				   sys_chunk_offset, sys_chunk_size,
-				   sys_stripe_size);
-	BUG_ON(ret);
-	return 0;
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+out:
+	return ret;
 }
 
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
@@ -2847,8 +4461,9 @@
 	}
 }
 
-int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len)
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 {
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct extent_map *em;
 	struct map_lookup *map;
 	struct extent_map_tree *em_tree = &map_tree->map_tree;
@@ -2857,43 +4472,166 @@
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, len);
 	read_unlock(&em_tree->lock);
-	BUG_ON(!em);
 
-	BUG_ON(em->start > logical || em->start + em->len < logical);
+	/*
+	 * We could return errors for these cases, but that could get ugly and
+	 * we would probably end up doing the same thing anyway, namely nothing
+	 * but exit, so return 1 so the callers don't try to use other copies.
+	 */
+	if (!em) {
+		btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical,
+			    logical+len);
+		return 1;
+	}
+
+	if (em->start > logical || em->start + em->len < logical) {
+		btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
+			    "%Lu-%Lu\n", logical, logical+len, em->start,
+			    em->start + em->len);
+		return 1;
+	}
+
 	map = (struct map_lookup *)em->bdev;
 	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
 		ret = map->num_stripes;
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		ret = map->sub_stripes;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
+		ret = 2;
+	else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+		ret = 3;
 	else
 		ret = 1;
 	free_extent_map(em);
+
+	btrfs_dev_replace_lock(&fs_info->dev_replace);
+	if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))
+		ret++;
+	btrfs_dev_replace_unlock(&fs_info->dev_replace);
+
+	return ret;
+}
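btrfs_num_copies() above maps a block group profile to the number of ways a block can be read back: every stripe for DUP/RAID1, the sub-stripe count for RAID10, and 2 or 3 reconstruction paths for RAID5/6. A stand-alone sketch of that mapping; the DEMO_BG_* flag values are placeholders and do not match the real on-disk bits:

/* Copies-per-profile mapping; num_stripes/sub_stripes come from the map. */
#include <stdio.h>

#define DEMO_BG_DUP	(1ULL << 0)
#define DEMO_BG_RAID1	(1ULL << 1)
#define DEMO_BG_RAID10	(1ULL << 2)
#define DEMO_BG_RAID5	(1ULL << 3)
#define DEMO_BG_RAID6	(1ULL << 4)

static int demo_num_copies(unsigned long long type, int num_stripes,
			   int sub_stripes)
{
	if (type & (DEMO_BG_DUP | DEMO_BG_RAID1))
		return num_stripes;	/* every stripe is a full copy */
	if (type & DEMO_BG_RAID10)
		return sub_stripes;	/* copies inside each stripe pair */
	if (type & DEMO_BG_RAID5)
		return 2;		/* data plus one reconstruction path */
	if (type & DEMO_BG_RAID6)
		return 3;		/* data plus two reconstruction paths */
	return 1;			/* SINGLE or RAID0 */
}

int main(void)
{
	printf("RAID1 x2: %d\n", demo_num_copies(DEMO_BG_RAID1, 2, 1));
	printf("RAID10 x4: %d\n", demo_num_copies(DEMO_BG_RAID10, 4, 2));
	printf("RAID6: %d\n", demo_num_copies(DEMO_BG_RAID6, 6, 1));
	return 0;
}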
+
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+				    struct btrfs_mapping_tree *map_tree,
+				    u64 logical)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	unsigned long len = root->sectorsize;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6)) {
+		len = map->stripe_len * nr_data_stripes(map);
+	}
+	free_extent_map(em);
+	return len;
+}
+
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+			   u64 logical, u64 len, int mirror_num)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	int ret = 0;
+
+	read_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, len);
+	read_unlock(&em_tree->lock);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			 BTRFS_BLOCK_GROUP_RAID6))
+		ret = 1;
+	free_extent_map(em);
 	return ret;
 }
 
-static int find_live_mirror(struct map_lookup *map, int first, int num,
-			    int optimal)
+static int find_live_mirror(struct btrfs_fs_info *fs_info,
+			    struct map_lookup *map, int first, int num,
+			    int optimal, int dev_replace_is_ongoing)
 {
 	int i;
-	if (map->stripes[optimal].dev->bdev)
-		return optimal;
-	for (i = first; i < first + num; i++) {
-		if (map->stripes[i].dev->bdev)
-			return i;
+	int tolerance;
+	struct btrfs_device *srcdev;
+
+	if (dev_replace_is_ongoing &&
+	    fs_info->dev_replace.cont_reading_from_srcdev_mode ==
+	     BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
+		srcdev = fs_info->dev_replace.srcdev;
+	else
+		srcdev = NULL;
+
+	/*
+	 * try to avoid the drive that is the source drive for a
+	 * dev-replace procedure, only choose it if no other non-missing
+	 * mirror is available
+	 */
+	for (tolerance = 0; tolerance < 2; tolerance++) {
+		if (map->stripes[optimal].dev->bdev &&
+		    (tolerance || map->stripes[optimal].dev != srcdev))
+			return optimal;
+		for (i = first; i < first + num; i++) {
+			if (map->stripes[i].dev->bdev &&
+			    (tolerance || map->stripes[i].dev != srcdev))
+				return i;
+		}
 	}
+
 	/* we couldn't find one that doesn't fail.  Just return something
 	 * and the io error handling code will clean up eventually
 	 */
 	return optimal;
 }
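find_live_mirror() now makes two passes: on the first it refuses the dev-replace source device, on the second it accepts it rather than return a missing mirror. A user-space sketch of that tolerance loop, with demo_* stand-ins for the stripe array and device state:

/* Two-pass mirror selection: avoid the replace source if at all possible. */
#include <stdio.h>

struct demo_stripe {
	int present;		/* device has a usable bdev */
	int is_replace_src;	/* device is the dev-replace source */
};

static int demo_find_live_mirror(const struct demo_stripe *s, int first,
				 int num, int optimal, int avoid_src)
{
	int tolerance, i;

	for (tolerance = 0; tolerance < 2; tolerance++) {
		int skip_src = avoid_src && !tolerance;

		if (s[optimal].present &&
		    !(skip_src && s[optimal].is_replace_src))
			return optimal;
		for (i = first; i < first + num; i++)
			if (s[i].present &&
			    !(skip_src && s[i].is_replace_src))
				return i;
	}
	return optimal;		/* nothing better; let the I/O error path cope */
}

int main(void)
{
	struct demo_stripe map[2] = {
		{ .present = 1, .is_replace_src = 1 },	/* preferred mirror */
		{ .present = 1, .is_replace_src = 0 },
	};

	/* picks mirror 1: mirror 0 is the replace source and is avoided */
	printf("chose mirror %d\n", demo_find_live_mirror(map, 0, 2, 0, 1));
	return 0;
}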
 
-static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+static inline int parity_smaller(u64 a, u64 b)
+{
+	return a > b;
+}
+
+/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
+static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
+{
+	struct btrfs_bio_stripe s;
+	int i;
+	u64 l;
+	int again = 1;
+
+	while (again) {
+		again = 0;
+		for (i = 0; i < bbio->num_stripes - 1; i++) {
+			if (parity_smaller(raid_map[i], raid_map[i+1])) {
+				s = bbio->stripes[i];
+				l = raid_map[i];
+				bbio->stripes[i] = bbio->stripes[i+1];
+				raid_map[i] = raid_map[i+1];
+				bbio->stripes[i+1] = s;
+				raid_map[i+1] = l;
+				again = 1;
+			}
+		}
+	}
+}
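sort_parity_stripes() relies on the P/Q placeholders being numerically larger than any real logical address, so a plain bubble sort pushes them to the end while keeping the two parallel arrays in step. A stand-alone version of the same co-sort (demo_* names are illustrative; the sentinel here is simply a huge u64):

/* Bubble-sort stripes and raid_map together so parity lands last. */
#include <stdio.h>
#include <stdint.h>

struct demo_stripe { int devid; };

static void demo_sort_parity(struct demo_stripe *stripes, uint64_t *raid_map,
			     int n)
{
	int i, again = 1;

	while (again) {
		again = 0;
		for (i = 0; i < n - 1; i++) {
			if (raid_map[i] > raid_map[i + 1]) {
				struct demo_stripe s = stripes[i];
				uint64_t l = raid_map[i];

				stripes[i] = stripes[i + 1];
				raid_map[i] = raid_map[i + 1];
				stripes[i + 1] = s;
				raid_map[i + 1] = l;
				again = 1;
			}
		}
	}
}

int main(void)
{
	/* two data stripes plus a parity sentinel in slot 0 */
	uint64_t raid_map[3] = { UINT64_MAX - 1, 4096, 8192 };
	struct demo_stripe stripes[3] = { { 3 }, { 1 }, { 2 } };
	int i;

	demo_sort_parity(stripes, raid_map, 3);
	for (i = 0; i < 3; i++)
		printf("slot %d -> devid %d\n", i, stripes[i].devid);
	/* devid 1 and 2 (data) come first, devid 3 (parity) last */
	return 0;
}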
+
+static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 			     u64 logical, u64 *length,
 			     struct btrfs_bio **bbio_ret,
-			     int mirror_num)
+			     int mirror_num, u64 **raid_map_ret)
 {
 	struct extent_map *em;
 	struct map_lookup *map;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct extent_map_tree *em_tree = &map_tree->map_tree;
 	u64 offset;
 	u64 stripe_offset;
@@ -2901,126 +4639,213 @@
 	u64 stripe_nr;
 	u64 stripe_nr_orig;
 	u64 stripe_nr_end;
-	int stripes_allocated = 8;
-	int stripes_required = 1;
+	u64 stripe_len;
+	u64 *raid_map = NULL;
 	int stripe_index;
 	int i;
+	int ret = 0;
 	int num_stripes;
 	int max_errors = 0;
 	struct btrfs_bio *bbio = NULL;
-
-	if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
-		stripes_allocated = 1;
-again:
-	if (bbio_ret) {
-		bbio = kzalloc(btrfs_bio_size(stripes_allocated),
-				GFP_NOFS);
-		if (!bbio)
-			return -ENOMEM;
-
-		atomic_set(&bbio->error, 0);
-	}
+	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
+	int dev_replace_is_ongoing = 0;
+	int num_alloc_stripes;
+	int patch_the_first_stripe_for_dev_replace = 0;
+	u64 physical_to_patch_in_first_stripe = 0;
+	u64 raid56_full_stripe_start = (u64)-1;
 
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, logical, *length);
 	read_unlock(&em_tree->lock);
 
 	if (!em) {
-		printk(KERN_CRIT "unable to find logical %llu len %llu\n",
-		       (unsigned long long)logical,
-		       (unsigned long long)*length);
-		BUG();
+		btrfs_crit(fs_info, "unable to find logical %llu len %llu",
+			logical, *length);
+		return -EINVAL;
+	}
+
+	if (em->start > logical || em->start + em->len < logical) {
+		btrfs_crit(fs_info, "found a bad mapping, wanted %Lu, "
+			   "found %Lu-%Lu\n", logical, em->start,
+			   em->start + em->len);
+		return -EINVAL;
 	}
 
-	BUG_ON(em->start > logical || em->start + em->len < logical);
 	map = (struct map_lookup *)em->bdev;
 	offset = logical - em->start;
 
-	if (mirror_num > map->num_stripes)
-		mirror_num = 0;
-
-	/* if our btrfs_bio struct is too small, back off and try again */
-	if (rw & REQ_WRITE) {
-		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
-				 BTRFS_BLOCK_GROUP_DUP)) {
-			stripes_required = map->num_stripes;
-			max_errors = 1;
-		} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-			stripes_required = map->sub_stripes;
-			max_errors = 1;
-		}
-	}
-	if (rw & REQ_DISCARD) {
-		if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-				 BTRFS_BLOCK_GROUP_RAID1 |
-				 BTRFS_BLOCK_GROUP_DUP |
-				 BTRFS_BLOCK_GROUP_RAID10)) {
-			stripes_required = map->num_stripes;
-		}
-	}
-	if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
-	    stripes_allocated < stripes_required) {
-		stripes_allocated = map->num_stripes;
-		free_extent_map(em);
-		kfree(bbio);
-		goto again;
-	}
+	stripe_len = map->stripe_len;
 	stripe_nr = offset;
 	/*
 	 * stripe_nr counts the total number of stripes we have to stride
 	 * to get to this block
 	 */
-	do_div(stripe_nr, map->stripe_len);
+	do_div(stripe_nr, stripe_len);
 
-	stripe_offset = stripe_nr * map->stripe_len;
+	stripe_offset = stripe_nr * stripe_len;
 	BUG_ON(offset < stripe_offset);
 
 	/* stripe_offset is the offset of this block in its stripe*/
 	stripe_offset = offset - stripe_offset;
 
-	if (rw & REQ_DISCARD)
+	/* if we're here for raid56, we need to know the stripe aligned start */
+	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+		unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
+		raid56_full_stripe_start = offset;
+
+		/* allow a write of a full stripe, but make sure we don't
+		 * allow straddling of stripes
+		 */
+		do_div(raid56_full_stripe_start, full_stripe_len);
+		raid56_full_stripe_start *= full_stripe_len;
+	}
+
+	if (rw & REQ_DISCARD) {
+		/* we don't discard raid56 yet */
+		if (map->type &
+		    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
+			ret = -EOPNOTSUPP;
+			goto out;
+		}
 		*length = min_t(u64, em->len - offset, *length);
-	else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
-			      BTRFS_BLOCK_GROUP_RAID1 |
-			      BTRFS_BLOCK_GROUP_RAID10 |
-			      BTRFS_BLOCK_GROUP_DUP)) {
-		/* we limit the length of each bio to what fits in a stripe */
-		*length = min_t(u64, em->len - offset,
-				map->stripe_len - stripe_offset);
+	} else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+		u64 max_len;
+		/* For writes to RAID[56], allow a full stripeset across all disks.
+		   For other RAID types and for RAID[56] reads, just allow a single
+		   stripe (on a single disk). */
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) &&
+		    (rw & REQ_WRITE)) {
+			max_len = stripe_len * nr_data_stripes(map) -
+				(offset - raid56_full_stripe_start);
+		} else {
+			/* we limit the length of each bio to what fits in a stripe */
+			max_len = stripe_len - stripe_offset;
+		}
+		*length = min_t(u64, em->len - offset, max_len);
 	} else {
 		*length = em->len - offset;
 	}
 
+	/* This is for when we're called from btrfs_merge_bio_hook() and all
+	   it cares about is the length */
 	if (!bbio_ret)
 		goto out;
 
+	btrfs_dev_replace_lock(dev_replace);
+	dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
+	if (!dev_replace_is_ongoing)
+		btrfs_dev_replace_unlock(dev_replace);
+
+	if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
+	    !(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) &&
+	    dev_replace->tgtdev != NULL) {
+		/*
+		 * in dev-replace case, for repair case (that's the only
+		 * case where the mirror is selected explicitly when
+		 * calling btrfs_map_block), blocks left of the left cursor
+		 * can also be read from the target drive.
+		 * For REQ_GET_READ_MIRRORS, the target drive is added as
+		 * the last one to the array of stripes. For READ, it also
+		 * needs to be supported using the same mirror number.
+		 * If the requested block is not left of the left cursor,
+		 * EIO is returned. This can happen because btrfs_num_copies()
+		 * returns one more in the dev-replace case.
+		 */
+		u64 tmp_length = *length;
+		struct btrfs_bio *tmp_bbio = NULL;
+		int tmp_num_stripes;
+		u64 srcdev_devid = dev_replace->srcdev->devid;
+		int index_srcdev = 0;
+		int found = 0;
+		u64 physical_of_found = 0;
+
+		ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
+			     logical, &tmp_length, &tmp_bbio, 0, NULL);
+		if (ret) {
+			WARN_ON(tmp_bbio != NULL);
+			goto out;
+		}
+
+		tmp_num_stripes = tmp_bbio->num_stripes;
+		if (mirror_num > tmp_num_stripes) {
+			/*
+			 * REQ_GET_READ_MIRRORS does not contain this
+			 * mirror, that means that the requested area
+			 * is not left of the left cursor
+			 */
+			ret = -EIO;
+			kfree(tmp_bbio);
+			goto out;
+		}
+
+		/*
+		 * process the rest of the function using the mirror_num
+		 * of the source drive. Therefore look it up first.
+		 * At the end, patch the device pointer to the one of the
+		 * target drive.
+		 */
+		for (i = 0; i < tmp_num_stripes; i++) {
+			if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
+				/*
+				 * In case of DUP, in order to keep it
+				 * simple, only add the mirror with the
+				 * lowest physical address
+				 */
+				if (found &&
+				    physical_of_found <=
+				     tmp_bbio->stripes[i].physical)
+					continue;
+				index_srcdev = i;
+				found = 1;
+				physical_of_found =
+					tmp_bbio->stripes[i].physical;
+			}
+		}
+
+		if (found) {
+			mirror_num = index_srcdev + 1;
+			patch_the_first_stripe_for_dev_replace = 1;
+			physical_to_patch_in_first_stripe = physical_of_found;
+		} else {
+			WARN_ON(1);
+			ret = -EIO;
+			kfree(tmp_bbio);
+			goto out;
+		}
+
+		kfree(tmp_bbio);
+	} else if (mirror_num > map->num_stripes) {
+		mirror_num = 0;
+	}
+
 	num_stripes = 1;
 	stripe_index = 0;
 	stripe_nr_orig = stripe_nr;
-	stripe_nr_end = (offset + *length + map->stripe_len - 1) &
-			(~(map->stripe_len - 1));
+	stripe_nr_end = ALIGN(offset + *length, map->stripe_len);
 	do_div(stripe_nr_end, map->stripe_len);
 	stripe_end_offset = stripe_nr_end * map->stripe_len -
 			    (offset + *length);
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 		if (rw & REQ_DISCARD)
 			num_stripes = min_t(u64, map->num_stripes,
 					    stripe_nr_end - stripe_nr_orig);
 		stripe_index = do_div(stripe_nr, map->num_stripes);
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
-		if (rw & (REQ_WRITE | REQ_DISCARD))
+		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))
 			num_stripes = map->num_stripes;
 		else if (mirror_num)
 			stripe_index = mirror_num - 1;
 		else {
-			stripe_index = find_live_mirror(map, 0,
+			stripe_index = find_live_mirror(fs_info, map, 0,
 					    map->num_stripes,
-					    current->pid % map->num_stripes);
+					    current->pid % map->num_stripes,
+					    dev_replace_is_ongoing);
 			mirror_num = stripe_index + 1;
 		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		if (rw & (REQ_WRITE | REQ_DISCARD)) {
+		if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) {
 			num_stripes = map->num_stripes;
 		} else if (mirror_num) {
 			stripe_index = mirror_num - 1;
@@ -3034,7 +4859,7 @@
 		stripe_index = do_div(stripe_nr, factor);
 		stripe_index *= map->sub_stripes;
 
-		if (rw & REQ_WRITE)
+		if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
 			num_stripes = map->sub_stripes;
 		else if (rw & REQ_DISCARD)
 			num_stripes = min_t(u64, map->sub_stripes *
@@ -3043,10 +4868,72 @@
 		else if (mirror_num)
 			stripe_index += mirror_num - 1;
 		else {
-			stripe_index = find_live_mirror(map, stripe_index,
+			int old_stripe_index = stripe_index;
+			stripe_index = find_live_mirror(fs_info, map,
+					      stripe_index,
 					      map->sub_stripes, stripe_index +
-					      current->pid % map->sub_stripes);
-			mirror_num = stripe_index + 1;
+					      current->pid % map->sub_stripes,
+					      dev_replace_is_ongoing);
+			mirror_num = stripe_index - old_stripe_index + 1;
+		}
+
+	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+				BTRFS_BLOCK_GROUP_RAID6)) {
+		u64 tmp;
+
+		if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1)
+		    && raid_map_ret) {
+			int i, rot;
+
+			/* push stripe_nr back to the start of the full stripe */
+			stripe_nr = raid56_full_stripe_start;
+			do_div(stripe_nr, stripe_len);
+
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+
+			/* RAID[56] write or recovery. Return all stripes */
+			num_stripes = map->num_stripes;
+			max_errors = nr_parity_stripes(map);
+
+			raid_map = kmalloc(sizeof(u64) * num_stripes,
+					   GFP_NOFS);
+			if (!raid_map) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			/* Work out the disk rotation on this stripe-set */
+			tmp = stripe_nr;
+			rot = do_div(tmp, num_stripes);
+
+			/* Fill in the logical address of each stripe */
+			tmp = stripe_nr * nr_data_stripes(map);
+			for (i = 0; i < nr_data_stripes(map); i++)
+				raid_map[(i+rot) % num_stripes] =
+					em->start + (tmp + i) * map->stripe_len;
+
+			raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
+			if (map->type & BTRFS_BLOCK_GROUP_RAID6)
+				raid_map[(i+rot+1) % num_stripes] =
+					RAID6_Q_STRIPE;
+
+			*length = map->stripe_len;
+			stripe_index = 0;
+			stripe_offset = 0;
+		} else {
+			/*
+			 * Mirror #0 or #1 means the original data block.
+			 * Mirror #2 is RAID5 parity block.
+			 * Mirror #3 is RAID6 Q block.
+			 */
+			stripe_index = do_div(stripe_nr, nr_data_stripes(map));
+			if (mirror_num > 1)
+				stripe_index = nr_data_stripes(map) +
+						mirror_num - 2;
+
+			/* We distribute the parity blocks across stripes */
+			tmp = stripe_nr + stripe_index;
+			stripe_index = do_div(tmp, map->num_stripes);
 		}
 	} else {
 		/*
@@ -3059,81 +4946,79 @@
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
 
+	num_alloc_stripes = num_stripes;
+	if (dev_replace_is_ongoing) {
+		if (rw & (REQ_WRITE | REQ_DISCARD))
+			num_alloc_stripes <<= 1;
+		if (rw & REQ_GET_READ_MIRRORS)
+			num_alloc_stripes++;
+	}
+	bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
+	if (!bbio) {
+		kfree(raid_map);
+		ret = -ENOMEM;
+		goto out;
+	}
+	atomic_set(&bbio->error, 0);
+
 	if (rw & REQ_DISCARD) {
+		int factor = 0;
+		int sub_stripes = 0;
+		u64 stripes_per_dev = 0;
+		u32 remaining_stripes = 0;
+		u32 last_stripe = 0;
+
+		if (map->type &
+		    (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
+			if (map->type & BTRFS_BLOCK_GROUP_RAID0)
+				sub_stripes = 1;
+			else
+				sub_stripes = map->sub_stripes;
+
+			factor = map->num_stripes / sub_stripes;
+			stripes_per_dev = div_u64_rem(stripe_nr_end -
+						      stripe_nr_orig,
+						      factor,
+						      &remaining_stripes);
+			div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
+			last_stripe *= sub_stripes;
+		}
+
 		for (i = 0; i < num_stripes; i++) {
 			bbio->stripes[i].physical =
 				map->stripes[stripe_index].physical +
 				stripe_offset + stripe_nr * map->stripe_len;
 			bbio->stripes[i].dev = map->stripes[stripe_index].dev;
 
-			if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
-				u64 stripes;
-				u32 last_stripe = 0;
-				int j;
-
-				div_u64_rem(stripe_nr_end - 1,
-					    map->num_stripes,
-					    &last_stripe);
-
-				for (j = 0; j < map->num_stripes; j++) {
-					u32 test;
-
-					div_u64_rem(stripe_nr_end - 1 - j,
-						    map->num_stripes, &test);
-					if (test == stripe_index)
-						break;
-				}
-				stripes = stripe_nr_end - 1 - j;
-				do_div(stripes, map->num_stripes);
-				bbio->stripes[i].length = map->stripe_len *
-					(stripes - stripe_nr + 1);
+			if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+					 BTRFS_BLOCK_GROUP_RAID10)) {
+				bbio->stripes[i].length = stripes_per_dev *
+							  map->stripe_len;
+
+				if (i / sub_stripes < remaining_stripes)
+					bbio->stripes[i].length +=
+						map->stripe_len;
 
-				if (i == 0) {
+				/*
+				 * Special for the first stripe and
+				 * the last stripe:
+				 *
+				 * |-------|...|-------|
+				 *     |----------|
+				 *    off     end_off
+				 */
+				if (i < sub_stripes)
 					bbio->stripes[i].length -=
 						stripe_offset;
-					stripe_offset = 0;
-				}
-				if (stripe_index == last_stripe)
-					bbio->stripes[i].length -=
-						stripe_end_offset;
-			} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
-				u64 stripes;
-				int j;
-				int factor = map->num_stripes /
-					     map->sub_stripes;
-				u32 last_stripe = 0;
-
-				div_u64_rem(stripe_nr_end - 1,
-					    factor, &last_stripe);
-				last_stripe *= map->sub_stripes;
-
-				for (j = 0; j < factor; j++) {
-					u32 test;
-
-					div_u64_rem(stripe_nr_end - 1 - j,
-						    factor, &test);
-
-					if (test ==
-					    stripe_index / map->sub_stripes)
-						break;
-				}
-				stripes = stripe_nr_end - 1 - j;
-				do_div(stripes, factor);
-				bbio->stripes[i].length = map->stripe_len *
-					(stripes - stripe_nr + 1);
 
-				if (i < map->sub_stripes) {
-					bbio->stripes[i].length -=
-						stripe_offset;
-					if (i == map->sub_stripes - 1)
-						stripe_offset = 0;
-				}
 				if (stripe_index >= last_stripe &&
 				    stripe_index <= (last_stripe +
-						     map->sub_stripes - 1)) {
+						     sub_stripes - 1))
 					bbio->stripes[i].length -=
 						stripe_end_offset;
-				}
+
+				if (i == sub_stripes - 1)
+					stripe_offset = 0;
 			} else
 				bbio->stripes[i].length = *length;
 
@@ -3155,23 +5040,132 @@
 			stripe_index++;
 		}
 	}
-	if (bbio_ret) {
-		*bbio_ret = bbio;
-		bbio->num_stripes = num_stripes;
-		bbio->max_errors = max_errors;
-		bbio->mirror_num = mirror_num;
+
+	if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) {
+		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
+				 BTRFS_BLOCK_GROUP_RAID10 |
+				 BTRFS_BLOCK_GROUP_RAID5 |
+				 BTRFS_BLOCK_GROUP_DUP)) {
+			max_errors = 1;
+		} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
+			max_errors = 2;
+		}
+	}
+
+	if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+	    dev_replace->tgtdev != NULL) {
+		int index_where_to_add;
+		u64 srcdev_devid = dev_replace->srcdev->devid;
+
+		/*
+		 * duplicate the write operations while the dev replace
+		 * procedure is running. Since the copying of the old disk
+		 * to the new disk takes place at run time while the
+		 * filesystem is mounted writable, the regular write
+		 * operations to the old disk have to be duplicated to go
+		 * to the new disk as well.
+		 * Note that device->missing is handled by the caller, and
+		 * that the write to the old disk is already set up in the
+		 * stripes array.
+		 */
+		index_where_to_add = num_stripes;
+		for (i = 0; i < num_stripes; i++) {
+			if (bbio->stripes[i].dev->devid == srcdev_devid) {
+				/* write to new disk, too */
+				struct btrfs_bio_stripe *new =
+					bbio->stripes + index_where_to_add;
+				struct btrfs_bio_stripe *old =
+					bbio->stripes + i;
+
+				new->physical = old->physical;
+				new->length = old->length;
+				new->dev = dev_replace->tgtdev;
+				index_where_to_add++;
+				max_errors++;
+			}
+		}
+		num_stripes = index_where_to_add;
+	} else if (dev_replace_is_ongoing && (rw & REQ_GET_READ_MIRRORS) &&
+		   dev_replace->tgtdev != NULL) {
+		u64 srcdev_devid = dev_replace->srcdev->devid;
+		int index_srcdev = 0;
+		int found = 0;
+		u64 physical_of_found = 0;
+
+		/*
+		 * During the dev-replace procedure, the target drive can
+		 * also be used to read data in case it is needed to repair
+		 * a corrupt block elsewhere. This is possible if the
+		 * requested area is left of the left cursor. In this area,
+		 * the target drive is a full copy of the source drive.
+		 */
+		for (i = 0; i < num_stripes; i++) {
+			if (bbio->stripes[i].dev->devid == srcdev_devid) {
+				/*
+				 * In case of DUP, in order to keep it
+				 * simple, only add the mirror with the
+				 * lowest physical address
+				 */
+				if (found &&
+				    physical_of_found <=
+				     bbio->stripes[i].physical)
+					continue;
+				index_srcdev = i;
+				found = 1;
+				physical_of_found = bbio->stripes[i].physical;
+			}
+		}
+		if (found) {
+			u64 length = map->stripe_len;
+
+			if (physical_of_found + length <=
+			    dev_replace->cursor_left) {
+				struct btrfs_bio_stripe *tgtdev_stripe =
+					bbio->stripes + num_stripes;
+
+				tgtdev_stripe->physical = physical_of_found;
+				tgtdev_stripe->length =
+					bbio->stripes[index_srcdev].length;
+				tgtdev_stripe->dev = dev_replace->tgtdev;
+
+				num_stripes++;
+			}
+		}
+	}
+
+	*bbio_ret = bbio;
+	bbio->num_stripes = num_stripes;
+	bbio->max_errors = max_errors;
+	bbio->mirror_num = mirror_num;
+
+	/*
+	 * this is the case that REQ_READ && dev_replace_is_ongoing &&
+	 * mirror_num == num_stripes + 1 && dev_replace target drive is
+	 * available as a mirror
+	 */
+	if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
+		WARN_ON(num_stripes > 1);
+		bbio->stripes[0].dev = dev_replace->tgtdev;
+		bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
+		bbio->mirror_num = map->num_stripes + 1;
+	}
+	if (raid_map) {
+		sort_parity_stripes(bbio, raid_map);
+		*raid_map_ret = raid_map;
 	}
 out:
+	if (dev_replace_is_ongoing)
+		btrfs_dev_replace_unlock(dev_replace);
 	free_extent_map(em);
-	return 0;
+	return ret;
 }
 
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		      u64 logical, u64 *length,
 		      struct btrfs_bio **bbio_ret, int mirror_num)
 {
-	return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
-				 mirror_num);
+	return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
+				 mirror_num, NULL);
 }
 
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -3185,23 +5179,42 @@
 	u64 bytenr;
 	u64 length;
 	u64 stripe_nr;
+	u64 rmap_len;
 	int i, j, nr = 0;
 
 	read_lock(&em_tree->lock);
 	em = lookup_extent_mapping(em_tree, chunk_start, 1);
 	read_unlock(&em_tree->lock);
 
-	BUG_ON(!em || em->start != chunk_start);
+	if (!em) {
+		printk(KERN_ERR "btrfs: couldn't find em for chunk %Lu\n",
+		       chunk_start);
+		return -EIO;
+	}
+
+	if (em->start != chunk_start) {
+		printk(KERN_ERR "btrfs: bad chunk start, em=%Lu, wanted=%Lu\n",
+		       em->start, chunk_start);
+		free_extent_map(em);
+		return -EIO;
+	}
 	map = (struct map_lookup *)em->bdev;
 
 	length = em->len;
+	rmap_len = map->stripe_len;
+
 	if (map->type & BTRFS_BLOCK_GROUP_RAID10)
 		do_div(length, map->num_stripes / map->sub_stripes);
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
 		do_div(length, map->num_stripes);
+	else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
+			      BTRFS_BLOCK_GROUP_RAID6)) {
+		do_div(length, nr_data_stripes(map));
+		rmap_len = map->stripe_len * nr_data_stripes(map);
+	}
 
 	buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS);
-	BUG_ON(!buf);
+	BUG_ON(!buf); /* -ENOMEM */
 
 	for (i = 0; i < map->num_stripes; i++) {
 		if (devid && map->stripes[i].dev->devid != devid)
@@ -3218,8 +5231,11 @@
 			do_div(stripe_nr, map->sub_stripes);
 		} else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 			stripe_nr = stripe_nr * map->num_stripes + i;
-		}
-		bytenr = chunk_start + stripe_nr * map->stripe_len;
+		} /* else if RAID[56], multiply by nr_data_stripes().
+		   * Alternatively, just use rmap_len below instead of
+		   * map->stripe_len */
+
+		bytenr = chunk_start + stripe_nr * rmap_len;
 		WARN_ON(nr >= map->num_stripes);
 		for (j = 0; j < nr; j++) {
 			if (buf[j] == bytenr)
@@ -3233,7 +5249,7 @@
 
 	*logical = buf;
 	*naddrs = nr;
-	*stripe_len = map->stripe_len;
+	*stripe_len = rmap_len;
 
 	free_extent_map(em);
 	return 0;
@@ -3244,8 +5260,29 @@
 	struct btrfs_bio *bbio = bio->bi_private;
 	int is_orig_bio = 0;
 
-	if (err)
+	if (err) {
 		atomic_inc(&bbio->error);
+		if (err == -EIO || err == -EREMOTEIO) {
+			unsigned int stripe_index =
+				btrfs_io_bio(bio)->stripe_index;
+			struct btrfs_device *dev;
+
+			BUG_ON(stripe_index >= bbio->num_stripes);
+			dev = bbio->stripes[stripe_index].dev;
+			if (dev->bdev) {
+				if (bio->bi_rw & WRITE)
+					btrfs_dev_stat_inc(dev,
+						BTRFS_DEV_STAT_WRITE_ERRS);
+				else
+					btrfs_dev_stat_inc(dev,
+						BTRFS_DEV_STAT_READ_ERRS);
+				if ((bio->bi_rw & WRITE_FLUSH) == WRITE_FLUSH)
+					btrfs_dev_stat_inc(dev,
+						BTRFS_DEV_STAT_FLUSH_ERRS);
+				btrfs_dev_stat_print_on_error(dev);
+			}
+		}
+	}
 
 	if (bio == bbio->orig_bio)
 		is_orig_bio = 1;
@@ -3257,10 +5294,9 @@
 		}
 		bio->bi_private = bbio->private;
 		bio->bi_end_io = bbio->end_io;
-		bio->bi_bdev = (struct block_device *)
-					(unsigned long)bbio->mirror_num;
+		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
 		/* only send an error to the higher layers if it is
-		 * beyond the tolerance of the multi-bio
+		 * beyond the tolerance of the btrfs bio
 		 */
 		if (atomic_read(&bbio->error) > bbio->max_errors) {
 			err = -EIO;
@@ -3294,19 +5330,24 @@
  * This will add one bio to the pending list for a device and make sure
  * the work struct is scheduled.
  */
-static noinline int schedule_bio(struct btrfs_root *root,
-				 struct btrfs_device *device,
-				 int rw, struct bio *bio)
+static noinline void btrfs_schedule_bio(struct btrfs_root *root,
+					struct btrfs_device *device,
+					int rw, struct bio *bio)
 {
 	int should_queue = 1;
 	struct btrfs_pending_bios *pending_bios;
 
+	if (device->missing || !device->bdev) {
+		bio_endio(bio, -EIO);
+		return;
+	}
+
 	/* don't bother with additional async steps for reads, right now */
 	if (!(rw & REQ_WRITE)) {
 		bio_get(bio);
-		submit_bio(rw, bio);
+		btrfsic_submit_bio(rw, bio);
 		bio_put(bio);
-		return 0;
+		return;
 	}
 
 	/*
@@ -3340,83 +5381,203 @@
 	if (should_queue)
 		btrfs_queue_worker(&root->fs_info->submit_workers,
 				   &device->work);
+}
+
+static int bio_size_ok(struct block_device *bdev, struct bio *bio,
+		       sector_t sector)
+{
+	struct bio_vec *prev;
+	struct request_queue *q = bdev_get_queue(bdev);
+	unsigned short max_sectors = queue_max_sectors(q);
+	struct bvec_merge_data bvm = {
+		.bi_bdev = bdev,
+		.bi_sector = sector,
+		.bi_rw = bio->bi_rw,
+	};
+
+	if (bio->bi_vcnt == 0) {
+		WARN_ON(1);
+		return 1;
+	}
+
+	prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
+	if ((bio->bi_size >> 9) > max_sectors)
+		return 0;
+
+	if (!q->merge_bvec_fn)
+		return 1;
+
+	bvm.bi_size = bio->bi_size - prev->bv_len;
+	if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len)
+		return 0;
+	return 1;
+}
+
+static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+			      struct bio *bio, u64 physical, int dev_nr,
+			      int rw, int async)
+{
+	struct btrfs_device *dev = bbio->stripes[dev_nr].dev;
+
+	bio->bi_private = bbio;
+	btrfs_io_bio(bio)->stripe_index = dev_nr;
+	bio->bi_end_io = btrfs_end_bio;
+	bio->bi_sector = physical >> 9;
+#ifdef DEBUG
+	{
+		struct rcu_string *name;
+
+		rcu_read_lock();
+		name = rcu_dereference(dev->name);
+		pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
+			 "(%s id %llu), size=%u\n", rw,
+			 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
+			 name->str, dev->devid, bio->bi_size);
+		rcu_read_unlock();
+	}
+#endif
+	bio->bi_bdev = dev->bdev;
+	if (async)
+		btrfs_schedule_bio(root, dev, rw, bio);
+	else
+		btrfsic_submit_bio(rw, bio);
+}
+
+static int breakup_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio,
+			      struct bio *first_bio, struct btrfs_device *dev,
+			      int dev_nr, int rw, int async)
+{
+	struct bio_vec *bvec = first_bio->bi_io_vec;
+	struct bio *bio;
+	int nr_vecs = bio_get_nr_vecs(dev->bdev);
+	u64 physical = bbio->stripes[dev_nr].physical;
+
+again:
+	bio = btrfs_bio_alloc(dev->bdev, physical >> 9, nr_vecs, GFP_NOFS);
+	if (!bio)
+		return -ENOMEM;
+
+	while (bvec <= (first_bio->bi_io_vec + first_bio->bi_vcnt - 1)) {
+		if (bio_add_page(bio, bvec->bv_page, bvec->bv_len,
+				 bvec->bv_offset) < bvec->bv_len) {
+			u64 len = bio->bi_size;
+
+			atomic_inc(&bbio->stripes_pending);
+			submit_stripe_bio(root, bbio, bio, physical, dev_nr,
+					  rw, async);
+			physical += len;
+			goto again;
+		}
+		bvec++;
+	}
+
+	submit_stripe_bio(root, bbio, bio, physical, dev_nr, rw, async);
 	return 0;
 }
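breakup_stripe_bio() keeps adding pages to the current bio until the device (via bio_add_page() and the queue's merge callback) refuses the next one, submits what it has, and starts a fresh bio at the following physical offset. The same split-and-resubmit shape reduced to user space, with a fixed size limit standing in for the device's constraints:

/* Fill a "bio" up to a limit, submit it, continue at the next offset. */
#include <stdio.h>
#include <stdint.h>

#define DEMO_MAX_BIO_BYTES	(128 * 1024)
#define DEMO_SEG_BYTES		(32 * 1024)

static void demo_submit(uint64_t physical, uint32_t bytes)
{
	printf("submit bio: physical=%llu size=%u\n",
	       (unsigned long long)physical, bytes);
}

int main(void)
{
	uint32_t total = 5 * DEMO_SEG_BYTES;	/* 160K over 5 segments */
	uint64_t physical = 1 << 20;		/* starting device offset */
	uint32_t filled = 0, done = 0;

	while (done < total) {
		if (filled + DEMO_SEG_BYTES > DEMO_MAX_BIO_BYTES) {
			/* current bio is full: submit, continue after it */
			demo_submit(physical, filled);
			physical += filled;
			filled = 0;
		}
		filled += DEMO_SEG_BYTES;	/* "bio_add_page" succeeded */
		done += DEMO_SEG_BYTES;
	}
	if (filled)
		demo_submit(physical, filled);	/* final partial bio */
	return 0;
}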
 
+static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
+{
+	atomic_inc(&bbio->error);
+	if (atomic_dec_and_test(&bbio->stripes_pending)) {
+		bio->bi_private = bbio->private;
+		bio->bi_end_io = bbio->end_io;
+		btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
+		bio->bi_sector = logical >> 9;
+		kfree(bbio);
+		bio_endio(bio, -EIO);
+	}
+}
+
 int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		  int mirror_num, int async_submit)
 {
-	struct btrfs_mapping_tree *map_tree;
 	struct btrfs_device *dev;
 	struct bio *first_bio = bio;
 	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
+	u64 *raid_map = NULL;
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
 	struct btrfs_bio *bbio = NULL;
 
 	length = bio->bi_size;
-	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 
-	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
-			      mirror_num);
-	BUG_ON(ret);
+	ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
+			      mirror_num, &raid_map);
+	if (ret) /* -ENOMEM */
+		return ret;
 
 	total_devs = bbio->num_stripes;
-	if (map_length < length) {
-		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
-		       "len %llu\n", (unsigned long long)logical,
-		       (unsigned long long)length,
-		       (unsigned long long)map_length);
-		BUG();
-	}
-
 	bbio->orig_bio = first_bio;
 	bbio->private = first_bio->bi_private;
 	bbio->end_io = first_bio->bi_end_io;
 	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
-	while (dev_nr < total_devs) {
-		if (dev_nr < total_devs - 1) {
-			bio = bio_clone(first_bio, GFP_NOFS);
-			BUG_ON(!bio);
+	if (raid_map) {
+		/* In this case, map_length has been set to the length of
+		   a single stripe; not the whole write */
+		if (rw & WRITE) {
+			return raid56_parity_write(root, bio, bbio,
+						   raid_map, map_length);
 		} else {
-			bio = first_bio;
+			return raid56_parity_recover(root, bio, bbio,
+						     raid_map, map_length,
+						     mirror_num);
 		}
-		bio->bi_private = bbio;
-		bio->bi_end_io = btrfs_end_bio;
-		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
+	}
+
+	if (map_length < length) {
+		btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
+			logical, length, map_length);
+		BUG();
+	}
+
+	while (dev_nr < total_devs) {
 		dev = bbio->stripes[dev_nr].dev;
-		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
-			pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
-				 "(%s id %llu), size=%u\n", rw,
-				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
-				 dev->name, dev->devid, bio->bi_size);
-			bio->bi_bdev = dev->bdev;
-			if (async_submit)
-				schedule_bio(root, dev, rw, bio);
-			else
-				submit_bio(rw, bio);
+		if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) {
+			bbio_error(bbio, first_bio, logical);
+			dev_nr++;
+			continue;
+		}
+
+		/*
+		 * Check and see if we're ok with this bio based on its size
+		 * and offset with the given device.
+		 */
+		if (!bio_size_ok(dev->bdev, first_bio,
+				 bbio->stripes[dev_nr].physical >> 9)) {
+			ret = breakup_stripe_bio(root, bbio, first_bio, dev,
+						 dev_nr, rw, async_submit);
+			BUG_ON(ret);
+			dev_nr++;
+			continue;
+		}
+
+		if (dev_nr < total_devs - 1) {
+			bio = btrfs_bio_clone(first_bio, GFP_NOFS);
+			BUG_ON(!bio); /* -ENOMEM */
 		} else {
-			bio->bi_bdev = root->fs_info->fs_devices->latest_bdev;
-			bio->bi_sector = logical >> 9;
-			bio_endio(bio, -EIO);
+			bio = first_bio;
 		}
+
+		submit_stripe_bio(root, bbio, bio,
+				  bbio->stripes[dev_nr].physical, dev_nr, rw,
+				  async_submit);
 		dev_nr++;
 	}
 	return 0;
 }
 
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
 				       u8 *uuid, u8 *fsid)
 {
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *cur_devices;
 
-	cur_devices = root->fs_info->fs_devices;
+	cur_devices = fs_info->fs_devices;
 	while (cur_devices) {
 		if (!fsid ||
 		    !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) {
@@ -3436,24 +5597,72 @@
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 
-	device = kzalloc(sizeof(*device), GFP_NOFS);
-	if (!device)
+	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+	if (IS_ERR(device))
 		return NULL;
-	list_add(&device->dev_list,
-		 &fs_devices->devices);
-	device->dev_root = root->fs_info->dev_root;
-	device->devid = devid;
-	device->work.func = pending_bios_fn;
+
+	list_add(&device->dev_list, &fs_devices->devices);
 	device->fs_devices = fs_devices;
-	device->missing = 1;
 	fs_devices->num_devices++;
+
+	device->missing = 1;
 	fs_devices->missing_devices++;
-	spin_lock_init(&device->io_lock);
-	INIT_LIST_HEAD(&device->dev_alloc_list);
-	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
+
 	return device;
 }
 
+/**
+ * btrfs_alloc_device - allocate struct btrfs_device
+ * @fs_info:	used only for generating a new devid, can be NULL if
+ *		devid is provided (i.e. @devid != NULL).
+ * @devid:	a pointer to devid for this device.  If NULL a new devid
+ *		is generated.
+ * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
+ *		is generated.
+ *
+ * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
+ * on error.  Returned struct is not linked onto any lists and can be
+ * destroyed with kfree() right away.
+ */
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+					const u64 *devid,
+					const u8 *uuid)
+{
+	struct btrfs_device *dev;
+	u64 tmp;
+
+	if (!devid && !fs_info) {
+		WARN_ON(1);
+		return ERR_PTR(-EINVAL);
+	}
+
+	dev = __alloc_device();
+	if (IS_ERR(dev))
+		return dev;
+
+	if (devid)
+		tmp = *devid;
+	else {
+		int ret;
+
+		ret = find_next_devid(fs_info, &tmp);
+		if (ret) {
+			kfree(dev);
+			return ERR_PTR(ret);
+		}
+	}
+	dev->devid = tmp;
+
+	if (uuid)
+		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
+	else
+		generate_random_uuid(dev->uuid);
+
+	dev->work.func = pending_bios_fn;
+
+	return dev;
+}
+
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 			  struct extent_buffer *leaf,
 			  struct btrfs_chunk *chunk)
@@ -3497,6 +5706,7 @@
 	em->bdev = (struct block_device *)map;
 	em->start = logical;
 	em->len = length;
+	em->orig_start = 0;
 	em->block_start = 0;
 	em->block_len = em->len;
 
@@ -3514,8 +5724,8 @@
 		read_extent_buffer(leaf, uuid, (unsigned long)
 				   btrfs_stripe_dev_uuid_nr(chunk, i),
 				   BTRFS_UUID_SIZE);
-		map->stripes[i].dev = btrfs_find_device(root, devid, uuid,
-							NULL);
+		map->stripes[i].dev = btrfs_find_device(root->fs_info, devid,
+							uuid, NULL);
 		if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) {
 			kfree(map);
 			free_extent_map(em);
@@ -3534,15 +5744,15 @@
 	}
 
 	write_lock(&map_tree->map_tree.lock);
-	ret = add_extent_mapping(&map_tree->map_tree, em);
+	ret = add_extent_mapping(&map_tree->map_tree, em, 0);
 	write_unlock(&map_tree->map_tree.lock);
-	BUG_ON(ret);
+	BUG_ON(ret); /* Tree corruption */
 	free_extent_map(em);
 
 	return 0;
 }
 
-static int fill_device_from_item(struct extent_buffer *leaf,
+static void fill_device_from_item(struct extent_buffer *leaf,
 				 struct btrfs_dev_item *dev_item,
 				 struct btrfs_device *device)
 {
@@ -3556,11 +5766,11 @@
 	device->io_align = btrfs_device_io_align(leaf, dev_item);
 	device->io_width = btrfs_device_io_width(leaf, dev_item);
 	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
+	device->is_tgtdev_for_dev_replace = 0;
 
-	ptr = (unsigned long)btrfs_device_uuid(dev_item);
+	ptr = btrfs_device_uuid(dev_item);
 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
-
-	return 0;
 }
 
 static int open_seed_devices(struct btrfs_root *root, u8 *fsid)
@@ -3568,7 +5778,7 @@
 	struct btrfs_fs_devices *fs_devices;
 	int ret;
 
-	mutex_lock(&uuid_mutex);
+	BUG_ON(!mutex_is_locked(&uuid_mutex));
 
 	fs_devices = root->fs_info->fs_devices->seed;
 	while (fs_devices) {
@@ -3593,8 +5803,10 @@
 
 	ret = __btrfs_open_devices(fs_devices, FMODE_READ,
 				   root->fs_info->bdev_holder);
-	if (ret)
+	if (ret) {
+		free_fs_devices(fs_devices);
 		goto out;
+	}
 
 	if (!fs_devices->seeding) {
 		__btrfs_close_devices(fs_devices);
@@ -3606,7 +5818,6 @@
 	fs_devices->seed = root->fs_info->fs_devices->seed;
 	root->fs_info->fs_devices->seed = fs_devices;
 out:
-	mutex_unlock(&uuid_mutex);
 	return ret;
 }
 
@@ -3621,11 +5832,9 @@
 	u8 dev_uuid[BTRFS_UUID_SIZE];
 
 	devid = btrfs_device_id(leaf, dev_item);
-	read_extent_buffer(leaf, dev_uuid,
-			   (unsigned long)btrfs_device_uuid(dev_item),
+	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
 			   BTRFS_UUID_SIZE);
-	read_extent_buffer(leaf, fs_uuid,
-			   (unsigned long)btrfs_device_fsid(dev_item),
+	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
 			   BTRFS_UUID_SIZE);
 
 	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
@@ -3634,14 +5843,13 @@
 			return ret;
 	}
 
-	device = btrfs_find_device(root, devid, dev_uuid, fs_uuid);
+	device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid);
 	if (!device || !device->bdev) {
 		if (!btrfs_test_opt(root, DEGRADED))
 			return -EIO;
 
 		if (!device) {
-			printk(KERN_WARNING "warning devid %llu missing\n",
-			       (unsigned long long)devid);
+			btrfs_warn(root->fs_info, "devid %llu missing", devid);
 			device = add_missing_dev(root, devid, dev_uuid);
 			if (!device)
 				return -ENOMEM;
@@ -3665,9 +5873,8 @@
 	}
 
 	fill_device_from_item(leaf, dev_item, device);
-	device->dev_root = root->fs_info->dev_root;
 	device->in_fs_metadata = 1;
-	if (device->writeable) {
+	if (device->writeable && !device->is_tgtdev_for_dev_replace) {
 		device->fs_devices->total_rw_bytes += device->total_bytes;
 		spin_lock(&root->fs_info->free_chunk_lock);
 		root->fs_info->free_chunk_space += device->total_bytes -
@@ -3699,6 +5906,20 @@
 		return -ENOMEM;
 	btrfs_set_buffer_uptodate(sb);
 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
+	/*
+	 * The sb extent buffer is artificial and just used to read the system array.
+	 * btrfs_set_buffer_uptodate() call does not properly mark all its
+	 * pages up-to-date when the page is larger: extent does not cover the
+	 * whole page and consequently check_page_uptodate does not find all
+	 * the page's extents up-to-date (the hole beyond sb),
+	 * write_extent_buffer then triggers a WARN_ON.
+	 *
+	 * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
+	 * but sb spans only this function. Add an explicit SetPageUptodate call
+	 * to silence the warning e.g. on PowerPC 64.
+	 */
+	if (PAGE_CACHE_SIZE > BTRFS_SUPER_INFO_SIZE)
+		SetPageUptodate(sb->pages[0]);
 
 	write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
 	array_size = btrfs_super_sys_array_size(super_copy);
@@ -3749,14 +5970,18 @@
 	if (!path)
 		return -ENOMEM;
 
-	/* first we search for all of the device items, and then we
-	 * read in all of the chunk items.  This way we can create chunk
-	 * mappings that reference all of the devices that are afound
+	mutex_lock(&uuid_mutex);
+	lock_chunks(root);
+
+	/*
+	 * Read all device items, and then all the chunk items. All
+	 * device items are found before any chunk item (their object id
+	 * is smaller than the lowest possible object id for a chunk
+	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
 	 */
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.offset = 0;
 	key.type = 0;
-again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto error;
@@ -3772,17 +5997,13 @@
 			break;
 		}
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
-		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
-			if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
-				break;
-			if (found_key.type == BTRFS_DEV_ITEM_KEY) {
-				struct btrfs_dev_item *dev_item;
-				dev_item = btrfs_item_ptr(leaf, slot,
+		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
+			struct btrfs_dev_item *dev_item;
+			dev_item = btrfs_item_ptr(leaf, slot,
 						  struct btrfs_dev_item);
-				ret = read_one_dev(root, leaf, dev_item);
-				if (ret)
-					goto error;
-			}
+			ret = read_one_dev(root, leaf, dev_item);
+			if (ret)
+				goto error;
 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
 			struct btrfs_chunk *chunk;
 			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
@@ -3792,13 +6013,272 @@
 		}
 		path->slots[0]++;
 	}
-	if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+	ret = 0;
+error:
+	unlock_chunks(root);
+	mutex_unlock(&uuid_mutex);
+
+	btrfs_free_path(path);
+	return ret;
+}
+
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list)
+		device->dev_root = fs_info->dev_root;
+	mutex_unlock(&fs_devices->device_list_mutex);
+}
+
+static void __btrfs_reset_dev_stats(struct btrfs_device *dev)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+		btrfs_dev_stat_reset(dev, i);
+}
+
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct extent_buffer *eb;
+	int slot;
+	int ret = 0;
+	struct btrfs_device *device;
+	struct btrfs_path *path = NULL;
+	int i;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		int item_size;
+		struct btrfs_dev_stats_item *ptr;
+
 		key.objectid = 0;
+		key.type = BTRFS_DEV_STATS_KEY;
+		key.offset = device->devid;
+		ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0);
+		if (ret) {
+			__btrfs_reset_dev_stats(device);
+			device->dev_stats_valid = 1;
+			btrfs_release_path(path);
+			continue;
+		}
+		slot = path->slots[0];
+		eb = path->nodes[0];
+		btrfs_item_key_to_cpu(eb, &found_key, slot);
+		item_size = btrfs_item_size_nr(eb, slot);
+
+		ptr = btrfs_item_ptr(eb, slot,
+				     struct btrfs_dev_stats_item);
+
+		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+			if (item_size >= (1 + i) * sizeof(__le64))
+				btrfs_dev_stat_set(device, i,
+					btrfs_dev_stats_value(eb, ptr, i));
+			else
+				btrfs_dev_stat_reset(device, i);
+		}
+
+		device->dev_stats_valid = 1;
+		btrfs_dev_stat_print_on_load(device);
 		btrfs_release_path(path);
-		goto again;
 	}
-	ret = 0;
-error:
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+out:
+	btrfs_free_path(path);
+	return ret < 0 ? ret : 0;
+}
+
+static int update_dev_stat_item(struct btrfs_trans_handle *trans,
+				struct btrfs_root *dev_root,
+				struct btrfs_device *device)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	struct btrfs_dev_stats_item *ptr;
+	int ret;
+	int i;
+
+	key.objectid = 0;
+	key.type = BTRFS_DEV_STATS_KEY;
+	key.offset = device->devid;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+	ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
+	if (ret < 0) {
+		printk_in_rcu(KERN_WARNING "btrfs: error %d while searching for dev_stats item for device %s!\n",
+			      ret, rcu_str_deref(device->name));
+		goto out;
+	}
+
+	if (ret == 0 &&
+	    btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
+		/* need to delete old one and insert a new one */
+		ret = btrfs_del_item(trans, dev_root, path);
+		if (ret != 0) {
+			printk_in_rcu(KERN_WARNING "btrfs: delete too small dev_stats item for device %s failed %d!\n",
+				      rcu_str_deref(device->name), ret);
+			goto out;
+		}
+		ret = 1;
+	}
+
+	if (ret == 1) {
+		/* need to insert a new item */
+		btrfs_release_path(path);
+		ret = btrfs_insert_empty_item(trans, dev_root, path,
+					      &key, sizeof(*ptr));
+		if (ret < 0) {
+			printk_in_rcu(KERN_WARNING "btrfs: insert dev_stats item for device %s failed %d!\n",
+				      rcu_str_deref(device->name), ret);
+			goto out;
+		}
+	}
+
+	eb = path->nodes[0];
+	ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
+	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+		btrfs_set_dev_stats_value(eb, ptr, i,
+					  btrfs_dev_stat_read(device, i));
+	btrfs_mark_buffer_dirty(eb);
+
+out:
 	btrfs_free_path(path);
 	return ret;
 }
+
+/*
+ * called from commit_transaction. Writes all changed device stats to disk.
+ */
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *dev_root = fs_info->dev_root;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	struct btrfs_device *device;
+	int ret = 0;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		if (!device->dev_stats_valid || !device->dev_stats_dirty)
+			continue;
+
+		ret = update_dev_stat_item(trans, dev_root, device);
+		if (!ret)
+			device->dev_stats_dirty = 0;
+	}
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+	return ret;
+}
+
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
+{
+	btrfs_dev_stat_inc(dev, index);
+	btrfs_dev_stat_print_on_error(dev);
+}
+
+static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
+{
+	if (!dev->dev_stats_valid)
+		return;
+	printk_ratelimited_in_rcu(KERN_ERR
+			   "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+			   rcu_str_deref(dev->name),
+			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+			   btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+			   btrfs_dev_stat_read(dev,
+					       BTRFS_DEV_STAT_CORRUPTION_ERRS),
+			   btrfs_dev_stat_read(dev,
+					       BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
+{
+	int i;
+
+	for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+		if (btrfs_dev_stat_read(dev, i) != 0)
+			break;
+	if (i == BTRFS_DEV_STAT_VALUES_MAX)
+		return; /* all values == 0, suppress message */
+
+	printk_in_rcu(KERN_INFO "btrfs: bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u\n",
+	       rcu_str_deref(dev->name),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
+	       btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
+}
+
+int btrfs_get_dev_stats(struct btrfs_root *root,
+			struct btrfs_ioctl_get_dev_stats *stats)
+{
+	struct btrfs_device *dev;
+	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
+	int i;
+
+	mutex_lock(&fs_devices->device_list_mutex);
+	dev = btrfs_find_device(root->fs_info, stats->devid, NULL, NULL);
+	mutex_unlock(&fs_devices->device_list_mutex);
+
+	if (!dev) {
+		printk(KERN_WARNING
+		       "btrfs: get dev_stats failed, device not found\n");
+		return -ENODEV;
+	} else if (!dev->dev_stats_valid) {
+		printk(KERN_WARNING
+		       "btrfs: get dev_stats failed, not yet valid\n");
+		return -ENODEV;
+	} else if (stats->flags & BTRFS_DEV_STATS_RESET) {
+		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
+			if (stats->nr_items > i)
+				stats->values[i] =
+					btrfs_dev_stat_read_and_reset(dev, i);
+			else
+				btrfs_dev_stat_reset(dev, i);
+		}
+	} else {
+		for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
+			if (stats->nr_items > i)
+				stats->values[i] = btrfs_dev_stat_read(dev, i);
+	}
+	if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
+		stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
+	return 0;
+}
+
+int btrfs_scratch_superblock(struct btrfs_device *device)
+{
+	struct buffer_head *bh;
+	struct btrfs_super_block *disk_super;
+
+	bh = btrfs_read_dev_super(device->bdev);
+	if (!bh)
+		return -EINVAL;
+	disk_super = (struct btrfs_super_block *)bh->b_data;
+
+	memset(&disk_super->magic, 0, sizeof(disk_super->magic));
+	set_buffer_dirty(bh);
+	sync_dirty_buffer(bh);
+	brelse(bh);
+
+	return 0;
+}
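
The kernel-doc added for btrfs_alloc_device() above pins down the calling convention: at least one of @fs_info and @devid must be supplied, failures come back as ERR_PTR() rather than NULL, and the unlinked result can simply be kfree()'d. A minimal caller sketch following that convention (the wrapper name is illustrative, not part of the patch):

static struct btrfs_device *alloc_device_example(struct btrfs_fs_info *fs_info,
						 const u8 *uuid)
{
	struct btrfs_device *dev;

	/* NULL devid: btrfs_alloc_device() picks the next free devid itself */
	dev = btrfs_alloc_device(fs_info, NULL, uuid);
	if (IS_ERR(dev))
		return dev;	/* propagate -EINVAL/-ENOMEM to the caller */

	/* not linked onto any list yet, so kfree(dev) is still a valid undo */
	return dev;
}
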
diff -ur a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
--- a/fs/btrfs/volumes.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/volumes.h	2014-02-17 11:56:58.000000000 +0100
@@ -21,6 +21,7 @@
 
 #include <linux/bio.h>
 #include <linux/sort.h>
+#include <linux/btrfs.h>
 #include "async-thread.h"
 
 #define BTRFS_STRIPE_LEN	(64 * 1024)
@@ -49,6 +50,7 @@
 	int in_fs_metadata;
 	int missing;
 	int can_discard;
+	int is_tgtdev_for_dev_replace;
 
 	spinlock_t io_lock;
 
@@ -57,7 +59,7 @@
 	/* the mode sent to blkdev_get */
 	fmode_t mode;
 
-	char *name;
+	struct rcu_string *name;
 
 	/* the internal btrfs device id */
 	u64 devid;
@@ -87,7 +89,7 @@
 	u8 uuid[BTRFS_UUID_SIZE];
 
 	/* per-device scrub information */
-	struct scrub_dev *scrub_device;
+	struct scrub_ctx *scrub_device;
 
 	struct btrfs_work work;
 	struct rcu_head rcu;
@@ -106,6 +108,11 @@
 	struct completion flush_wait;
 	int nobarriers;
 
+	/* disk I/O failure stats. For detailed description refer to
+	 * enum btrfs_dev_stat_values in ioctl.h */
+	int dev_stats_valid;
+	int dev_stats_dirty; /* counters need to be written to disk */
+	atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
 };
 
 struct btrfs_fs_devices {
@@ -120,6 +127,7 @@
 	u64 missing_devices;
 	u64 total_rw_bytes;
 	u64 num_can_discard;
+	u64 total_devices;
 	struct block_device *latest_bdev;
 
 	/* all of the devices in the FS, protected by a mutex
@@ -144,6 +152,33 @@
 	int rotating;
 };
 
+#define BTRFS_BIO_INLINE_CSUM_SIZE	64
+
+/*
+ * we need the mirror number and stripe index to be passed around
+ * the call chain while we are processing end_io (especially errors).
+ * Really, what we need is a btrfs_bio structure that has this info
+ * and is properly sized with its stripe array, but we're not there
+ * quite yet.  We have our own btrfs bioset, and all of the bios
+ * we allocate are actually btrfs_io_bios.  We'll cram as much of
+ * struct btrfs_bio as we can into this over time.
+ */
+typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
+struct btrfs_io_bio {
+	unsigned long mirror_num;
+	unsigned long stripe_index;
+	u8 *csum;
+	u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
+	u8 *csum_allocated;
+	btrfs_io_bio_end_io_t *end_io;
+	struct bio bio;
+};
+
+static inline struct btrfs_io_bio *btrfs_io_bio(struct bio *bio)
+{
+	return container_of(bio, struct btrfs_io_bio, bio);
+}
+
 struct btrfs_bio_stripe {
 	struct btrfs_device *dev;
 	u64 physical;
@@ -172,6 +207,15 @@
 	u64 total_avail;
 };
 
+struct btrfs_raid_attr {
+	int sub_stripes;	/* sub_stripes info for map */
+	int dev_stripes;	/* stripes per dev */
+	int devs_max;		/* max devs to use */
+	int devs_min;		/* min devs needed */
+	int devs_increment;	/* ndevs has to be a multiple of this */
+	int ncopies;		/* how many copies of the data there are */
+};
+
 struct map_lookup {
 	u64 type;
 	int io_align;
@@ -186,17 +230,58 @@
 #define map_lookup_size(n) (sizeof(struct map_lookup) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
+/*
+ * Restriper's general type filter
+ */
+#define BTRFS_BALANCE_DATA		(1ULL << 0)
+#define BTRFS_BALANCE_SYSTEM		(1ULL << 1)
+#define BTRFS_BALANCE_METADATA		(1ULL << 2)
+
+#define BTRFS_BALANCE_TYPE_MASK		(BTRFS_BALANCE_DATA |	    \
+					 BTRFS_BALANCE_SYSTEM |	    \
+					 BTRFS_BALANCE_METADATA)
+
+#define BTRFS_BALANCE_FORCE		(1ULL << 3)
+#define BTRFS_BALANCE_RESUME		(1ULL << 4)
+
+/*
+ * Balance filters
+ */
+#define BTRFS_BALANCE_ARGS_PROFILES	(1ULL << 0)
+#define BTRFS_BALANCE_ARGS_USAGE	(1ULL << 1)
+#define BTRFS_BALANCE_ARGS_DEVID	(1ULL << 2)
+#define BTRFS_BALANCE_ARGS_DRANGE	(1ULL << 3)
+#define BTRFS_BALANCE_ARGS_VRANGE	(1ULL << 4)
+
+/*
+ * Profile changing flags.  When SOFT is set we won't relocate chunk if
+ * it already has the target profile (even though it may be
+ * half-filled).
+ */
+#define BTRFS_BALANCE_ARGS_CONVERT	(1ULL << 8)
+#define BTRFS_BALANCE_ARGS_SOFT		(1ULL << 9)
+
+struct btrfs_balance_args;
+struct btrfs_balance_progress;
+struct btrfs_balance_control {
+	struct btrfs_fs_info *fs_info;
+
+	struct btrfs_balance_args data;
+	struct btrfs_balance_args meta;
+	struct btrfs_balance_args sys;
+
+	u64 flags;
+
+	struct btrfs_balance_progress stat;
+};
+
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 				   u64 end, u64 *length);
 
 #define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
-int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
-			   struct btrfs_device *device,
-			   u64 chunk_tree, u64 chunk_objectid,
-			   u64 chunk_offset, u64 start, u64 num_bytes);
-int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
+int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		    u64 logical, u64 *length,
 		    struct btrfs_bio **bbio_ret, int mirror_num);
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -215,22 +300,92 @@
 int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 			  struct btrfs_fs_devices **fs_devices_ret);
 int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
-int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices);
-int btrfs_add_device(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root,
-		     struct btrfs_device *device);
+void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
+			       struct btrfs_fs_devices *fs_devices, int step);
+int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
+					 char *device_path,
+					 struct btrfs_device **device);
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+					const u64 *devid,
+					const u8 *uuid);
 int btrfs_rm_device(struct btrfs_root *root, char *device_path);
-int btrfs_cleanup_fs_uuids(void);
-int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
+void btrfs_cleanup_fs_uuids(void);
+int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
 int btrfs_grow_device(struct btrfs_trans_handle *trans,
 		      struct btrfs_device *device, u64 new_size);
-struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
+struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid,
 				       u8 *uuid, u8 *fsid);
 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
 int btrfs_init_new_device(struct btrfs_root *root, char *path);
-int btrfs_balance(struct btrfs_root *dev_root);
+int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
+				  struct btrfs_device **device_out);
+int btrfs_balance(struct btrfs_balance_control *bctl,
+		  struct btrfs_ioctl_balance_args *bargs);
+int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
+int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
+int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
+int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_device *device, u64 num_bytes,
 			 u64 *start, u64 *max_avail);
+void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index);
+int btrfs_get_dev_stats(struct btrfs_root *root,
+			struct btrfs_ioctl_get_dev_stats *stats);
+void btrfs_init_devices_late(struct btrfs_fs_info *fs_info);
+int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info);
+int btrfs_run_dev_stats(struct btrfs_trans_handle *trans,
+			struct btrfs_fs_info *fs_info);
+void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info,
+				 struct btrfs_device *srcdev);
+void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
+				      struct btrfs_device *tgtdev);
+void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info,
+					      struct btrfs_device *tgtdev);
+int btrfs_scratch_superblock(struct btrfs_device *device);
+int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
+			   u64 logical, u64 len, int mirror_num);
+unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
+				    struct btrfs_mapping_tree *map_tree,
+				    u64 logical);
+int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
+				struct btrfs_root *extent_root,
+				u64 chunk_offset, u64 chunk_size);
+static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
+				      int index)
+{
+	atomic_inc(dev->dev_stat_values + index);
+	dev->dev_stats_dirty = 1;
+}
+
+static inline int btrfs_dev_stat_read(struct btrfs_device *dev,
+				      int index)
+{
+	return atomic_read(dev->dev_stat_values + index);
+}
+
+static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev,
+						int index)
+{
+	int ret;
+
+	ret = atomic_xchg(dev->dev_stat_values + index, 0);
+	dev->dev_stats_dirty = 1;
+	return ret;
+}
+
+static inline void btrfs_dev_stat_set(struct btrfs_device *dev,
+				      int index, unsigned long val)
+{
+	atomic_set(dev->dev_stat_values + index, val);
+	dev->dev_stats_dirty = 1;
+}
+
+static inline void btrfs_dev_stat_reset(struct btrfs_device *dev,
+					int index)
+{
+	btrfs_dev_stat_set(dev, index, 0);
+}
 #endif
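
The btrfs_io_bio comment in the volumes.h hunk above describes the usual container_of trick: every bio allocated from the btrfs bioset is embedded in a struct btrfs_io_bio, so the extra per-bio fields can be recovered from a plain struct bio pointer. A sketch of an end_io-style consumer (the handler name is made up for illustration):

static void example_end_io(struct bio *bio, int err)
{
	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);

	/* mirror_num and stripe_index travel with the bio inside the larger struct */
	if (err)
		pr_debug("btrfs: error %d on mirror %lu\n", err, io_bio->mirror_num);

	/* hand off to the stashed completion, if one was set */
	if (io_bio->end_io)
		io_bio->end_io(io_bio, err);
}
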
diff -ur a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
--- a/fs/btrfs/xattr.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/xattr.c	2014-02-17 11:56:58.000000000 +0100
@@ -122,6 +122,16 @@
 		 */
 		if (!value)
 			goto out;
+	} else {
+		di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
+					name, name_len, 0);
+		if (IS_ERR(di)) {
+			ret = PTR_ERR(di);
+			goto out;
+		}
+		if (!di && !value)
+			goto out;
+		btrfs_release_path(path);
 	}
 
 again:
@@ -196,11 +206,13 @@
 	if (ret)
 		goto out;
 
+	inode_inc_iversion(inode);
 	inode->i_ctime = CURRENT_TIME;
+	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
 out:
-	btrfs_end_transaction_throttle(trans, root);
+	btrfs_end_transaction(trans, root);
 	return ret;
 }
 
@@ -264,7 +276,7 @@
 
 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
 		if (verify_dir_item(root, leaf, di))
-			continue;
+			goto next;
 
 		name_len = btrfs_dir_name_len(leaf, di);
 		total_size += name_len + 1;
@@ -304,6 +316,10 @@
 	&btrfs_xattr_acl_access_handler,
 	&btrfs_xattr_acl_default_handler,
 #endif
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+	&btrfs_xattr_synoacl_access_handler,
+	&btrfs_xattr_synoacl_noperm_access_handler,
+#endif
 	NULL,
 };
 
@@ -319,6 +335,9 @@
 			XATTR_SECURITY_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) ||
 	       !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
+#ifdef MY_ABC_HERE
+	       !strncmp(name, XATTR_SYNO_PREFIX, XATTR_SYNO_PREFIX_LEN) ||
+#endif
 	       !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
 }
 
@@ -394,8 +413,8 @@
 				XATTR_REPLACE);
 }
 
-int btrfs_initxattrs(struct inode *inode, const struct xattr *xattr_array,
-		     void *fs_info)
+static int btrfs_initxattrs(struct inode *inode,
+			    const struct xattr *xattr_array, void *fs_info)
 {
 	const struct xattr *xattr;
 	struct btrfs_trans_handle *trans = fs_info;
diff -ur a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
--- a/fs/btrfs/xattr.h	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/btrfs/xattr.h	2014-02-17 11:56:58.000000000 +0100
@@ -23,6 +23,10 @@
 
 extern const struct xattr_handler btrfs_xattr_acl_access_handler;
 extern const struct xattr_handler btrfs_xattr_acl_default_handler;
+#ifdef CONFIG_BTRFS_FS_SYNO_ACL
+extern const struct xattr_handler btrfs_xattr_synoacl_access_handler;
+extern const struct xattr_handler btrfs_xattr_synoacl_noperm_access_handler;
+#endif
 extern const struct xattr_handler *btrfs_xattr_handlers[];
 
 extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
diff -ur a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
--- a/fs/btrfs/zlib.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/btrfs/zlib.c	2014-02-17 11:56:58.000000000 +0100
@@ -97,7 +97,7 @@
 	*total_in = 0;
 
 	if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
-		printk(KERN_WARNING "deflateInit failed\n");
+		printk(KERN_WARNING "btrfs: deflateInit failed\n");
 		ret = -1;
 		goto out;
 	}
@@ -125,7 +125,7 @@
 	while (workspace->def_strm.total_in < len) {
 		ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
 		if (ret != Z_OK) {
-			printk(KERN_DEBUG "btrfs deflate in loop returned %d\n",
+			printk(KERN_DEBUG "btrfs: deflate in loop returned %d\n",
 			       ret);
 			zlib_deflateEnd(&workspace->def_strm);
 			ret = -1;
@@ -252,7 +252,7 @@
 	}
 
 	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
-		printk(KERN_WARNING "inflateInit failed\n");
+		printk(KERN_WARNING "btrfs: inflateInit failed\n");
 		return -1;
 	}
 	while (workspace->inf_strm.total_in < srclen) {
@@ -336,7 +336,7 @@
 	}
 
 	if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
-		printk(KERN_WARNING "inflateInit failed\n");
+		printk(KERN_WARNING "btrfs: inflateInit failed\n");
 		return -1;
 	}
 
@@ -370,9 +370,9 @@
 			    PAGE_CACHE_SIZE - buf_offset);
 		bytes = min(bytes, bytes_left);
 
-		kaddr = kmap_atomic(dest_page, KM_USER0);
+		kaddr = kmap_atomic(dest_page);
 		memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes);
-		kunmap_atomic(kaddr, KM_USER0);
+		kunmap_atomic(kaddr);
 
 		pg_offset += bytes;
 		bytes_left -= bytes;
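
The kmap_atomic() changes above are part of the tree-wide removal of the KM_USER* slot argument: kmap_atomic() now takes only the page, and kunmap_atomic() only the mapped address. A stand-alone sketch of the new calling convention (the helper itself is illustrative only):

static void copy_into_page(struct page *page, size_t offset,
			   const void *src, size_t len)
{
	char *kaddr = kmap_atomic(page);	/* was: kmap_atomic(page, KM_USER0) */

	memcpy(kaddr + offset, src, len);
	kunmap_atomic(kaddr);			/* was: kunmap_atomic(kaddr, KM_USER0) */
}
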
diff -ur a/fs/buffer.c b/fs/buffer.c
--- a/fs/buffer.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/buffer.c	2014-02-17 11:57:00.000000000 +0100
@@ -2363,8 +2363,8 @@
  * beyond EOF, then the page is guaranteed safe against truncation until we
  * unlock the page.
  *
- * Direct callers of this function should call vfs_check_frozen() so that page
- * fault does not busyloop until the fs is thawed.
+ * Direct callers of this function should protect against filesystem freezing
+ * using sb_start_write() - sb_end_write() functions.
  */
 int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
 			 get_block_t get_block)
@@ -2396,18 +2396,7 @@
 
 	if (unlikely(ret < 0))
 		goto out_unlock;
-	/*
-	 * Freezing in progress? We check after the page is marked dirty and
-	 * with page lock held so if the test here fails, we are sure freezing
-	 * code will wait during syncing until the page fault is done - at that
-	 * point page will be dirty and unlocked so freezing code will write it
-	 * and writeprotect it again.
-	 */
 	set_page_dirty(page);
-	if (inode->i_sb->s_frozen != SB_UNFROZEN) {
-		ret = -EAGAIN;
-		goto out_unlock;
-	}
 	wait_on_page_writeback(page);
 	return 0;
 out_unlock:
@@ -2422,12 +2411,9 @@
 	int ret;
 	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
 
-	/*
-	 * This check is racy but catches the common case. The check in
-	 * __block_page_mkwrite() is reliable.
-	 */
-	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+	sb_start_pagefault(sb);
 	ret = __block_page_mkwrite(vma, vmf, get_block);
+	sb_end_pagefault(sb);
 	return block_page_mkwrite_return(ret);
 }
 EXPORT_SYMBOL(block_page_mkwrite);
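
The block_page_mkwrite() hunk above drops the racy s_frozen/vfs_check_frozen() checks in favour of the sb_start_pagefault()/sb_end_pagefault() freeze bracket. A hedged sketch of how a filesystem's own ->page_mkwrite handler would use the same bracket ("myfs" and myfs_get_block are placeholders, not code from this patch):

/* myfs_get_block is assumed to be the filesystem's get_block_t callback */
static int myfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
	struct super_block *sb = vma->vm_file->f_path.dentry->d_inode->i_sb;
	int ret;

	sb_start_pagefault(sb);	/* waits here while a freeze is in progress */
	ret = __block_page_mkwrite(vma, vmf, myfs_get_block);
	sb_end_pagefault(sb);

	return block_page_mkwrite_return(ret);
}
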
diff -ur a/fs/ceph/super.c b/fs/ceph/super.c
--- a/fs/ceph/super.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ceph/super.c	2014-02-17 11:56:56.000000000 +0100
@@ -576,6 +576,11 @@
 
 static void destroy_caches(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ceph_inode_cachep);
 	kmem_cache_destroy(ceph_cap_cachep);
 	kmem_cache_destroy(ceph_dentry_cachep);
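
The rcu_barrier() added to ceph's destroy_caches() above, and the identical additions in the cifs, coda, ecryptfs, efs and exofs hunks below, all close the same race: ->destroy_inode() frees inodes through call_rcu(), so the backing kmem_cache must outlive any callbacks still queued. A generic sketch of the pattern (every name here is a placeholder):

static struct kmem_cache *example_inode_cachep;

static void example_i_callback(struct rcu_head *head)
{
	struct inode *inode = container_of(head, struct inode, i_rcu);

	/* a real filesystem frees its own inode container here */
	kmem_cache_free(example_inode_cachep, inode);
}

static void example_destroy_inode(struct inode *inode)
{
	call_rcu(&inode->i_rcu, example_i_callback);
}

static void example_destroy_inodecache(void)
{
	rcu_barrier();	/* wait for every example_i_callback() still pending */
	kmem_cache_destroy(example_inode_cachep);
}
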
diff -ur a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
--- a/fs/cifs/cifsfs.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/cifs/cifsfs.c	2014-02-17 11:56:59.000000000 +0100
@@ -973,6 +973,11 @@
 static void
 cifs_destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(cifs_inode_cachep);
 }
 
diff -ur a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
--- a/fs/cifs/cifsglob.h	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/cifs/cifsglob.h	2014-02-17 11:56:59.000000000 +0100
@@ -782,6 +782,7 @@
 #define CIFS_FATTR_DELETE_PENDING	0x2
 #define CIFS_FATTR_NEED_REVAL		0x4
 #define CIFS_FATTR_INO_COLLISION	0x8
+#define CIFS_FATTR_UNKNOWN_NLINK	0x10
 
 struct cifs_fattr {
 	u32		cf_flags;
diff -ur a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
--- a/fs/cifs/cifs_unicode.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/cifs/cifs_unicode.c	2014-02-17 11:56:59.000000000 +0100
@@ -200,6 +200,32 @@
 	return outlen;
 }
 
+#ifdef MY_ABC_HERE
+int
+cifs_strtoUCS_NoSpecialChar(__le16 *to, const char *from, int len,
+	      const struct nls_table *codepage)
+{
+	int charlen;
+	int i;
+	wchar_t wchar_to; /* needed to quiet sparse */
+
+	for (i = 0; len && *from; i++, from += charlen, len -= charlen) {
+		charlen = codepage->char2uni(from, len, &wchar_to);
+		if (charlen < 1) {
+			cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
+				*from, charlen);
+			/* A question mark */
+			wchar_to = 0x003f;
+			charlen = 1;
+		}
+		put_unaligned_le16(wchar_to, &to[i]);
+	}
+
+	put_unaligned_le16(0, &to[i]);
+	return i;
+}
+#endif
+
 /*
  * NAME:	cifs_strtoUCS()
  *
@@ -247,7 +273,7 @@
 #endif
 		charlen = codepage->char2uni(from, len, &wchar_to);
 		if (charlen < 1) {
-#ifdef MY_ABC_HERE
+#ifndef MY_ABC_HERE
 			cERROR(1, "strtoUCS: char2uni of 0x%x returned %d",
 				*from, charlen);
 #endif
diff -ur a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
--- a/fs/cifs/cifs_unicode.h	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/cifs/cifs_unicode.h	2014-02-17 11:56:59.000000000 +0100
@@ -84,6 +84,9 @@
 int cifs_ucs2_bytes(const __le16 *from, int maxbytes,
 		    const struct nls_table *codepage);
 int cifs_strtoUCS(__le16 *, const char *, int, const struct nls_table *);
+#ifdef MY_ABC_HERE
+int cifs_strtoUCS_NoSpecialChar(__le16 *, const char *, int, const struct nls_table *);
+#endif
 char *cifs_strndup_from_ucs(const char *src, const int maxlen,
 			    const bool is_unicode,
 			    const struct nls_table *codepage);
diff -ur a/fs/cifs/inode.c b/fs/cifs/inode.c
--- a/fs/cifs/inode.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/cifs/inode.c	2014-02-17 11:56:59.000000000 +0100
@@ -118,6 +118,33 @@
 	cifs_i->invalid_mapping = true;
 }
 
+/*
+ * copy nlink to the inode, unless it wasn't provided.  Provide
+ * sane values if we don't have an existing one and none was provided
+ */
+static void
+cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
+{
+	/*
+	 * if we're in a situation where we can't trust what we
+	 * got from the server (readdir, some non-unix cases)
+	 * fake reasonable values
+	 */
+	if (fattr->cf_flags & CIFS_FATTR_UNKNOWN_NLINK) {
+		/* only provide fake values on a new inode */
+		if (inode->i_state & I_NEW) {
+			if (fattr->cf_cifsattrs & ATTR_DIRECTORY)
+				set_nlink(inode, 2);
+			else
+				set_nlink(inode, 1);
+		}
+		return;
+	}
+
+	/* we trust the server, so update it */
+	set_nlink(inode, fattr->cf_nlink);
+}
+
 /* populate an inode with info from a cifs_fattr struct */
 void
 cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
@@ -132,7 +159,7 @@
 	inode->i_mtime = fattr->cf_mtime;
 	inode->i_ctime = fattr->cf_ctime;
 	inode->i_rdev = fattr->cf_rdev;
-	set_nlink(inode, fattr->cf_nlink);
+	cifs_nlink_fattr_to_inode(inode, fattr);
 	inode->i_uid = fattr->cf_uid;
 	inode->i_gid = fattr->cf_gid;
 #ifdef MY_ABC_HERE
@@ -534,9 +561,20 @@
 	fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
 	fattr->cf_createtime = le64_to_cpu(info->CreationTime);
 
+	fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
 	if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
 		fattr->cf_mode = S_IFDIR | cifs_sb->mnt_dir_mode;
 		fattr->cf_dtype = DT_DIR;
+		/*
+		 * Server can return wrong NumberOfLinks value for directories
+		 * when Unix extensions are disabled - fake it.
+		 */
+		if (!tcon->unix_ext)
+			fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
+	} else if (fattr->cf_cifsattrs & ATTR_REPARSE) {
+		fattr->cf_mode = S_IFLNK;
+		fattr->cf_dtype = DT_LNK;
+		fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
 	} else {
 		fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
 		fattr->cf_dtype = DT_REG;
@@ -544,9 +582,18 @@
 		/* clear write bits if ATTR_READONLY is set */
 		if (fattr->cf_cifsattrs & ATTR_READONLY)
 			fattr->cf_mode &= ~(S_IWUGO);
-	}
 
-	fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+		/*
+		 * Don't accept zero nlink from non-unix servers unless
+		 * delete is pending.  Instead mark it as unknown.
+		 */
+		if ((fattr->cf_nlink < 1) && !tcon->unix_ext &&
+		    !info->DeletePending) {
+			cFYI(1, "bogus file nlink value %u\n",
+				fattr->cf_nlink);
+			fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
+		}
+	}
 
 	fattr->cf_uid = cifs_sb->mnt_uid;
 	fattr->cf_gid = cifs_sb->mnt_gid;
diff -ur a/fs/cifs/readdir.c b/fs/cifs/readdir.c
--- a/fs/cifs/readdir.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/cifs/readdir.c	2014-02-17 11:56:59.000000000 +0100
@@ -131,6 +131,9 @@
 		fattr->cf_dtype = DT_REG;
 	}
 
+	/* non-unix readdir doesn't provide nlink */
+	fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
+
 	if (fattr->cf_cifsattrs & ATTR_READONLY)
 		fattr->cf_mode &= ~S_IWUGO;
 
diff -ur a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
--- a/fs/cifs/smbencrypt.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/cifs/smbencrypt.c	2014-02-17 11:56:59.000000000 +0100
@@ -213,7 +213,11 @@
 
 	/* Password cannot be longer than 128 characters */
 	if (passwd) /* Password must be converted to NT unicode */
+#ifdef MY_ABC_HERE
+		len = cifs_strtoUCS_NoSpecialChar(wpwd, passwd, 128, codepage);
+#else
 		len = cifs_strtoUCS(wpwd, passwd, 128, codepage);
+#endif
 	else {
 		len = 0;
 		*wpwd = 0; /* Ensure string is null terminated */
diff -ur a/fs/coda/inode.c b/fs/coda/inode.c
--- a/fs/coda/inode.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/coda/inode.c	2014-02-17 11:56:56.000000000 +0100
@@ -86,6 +86,11 @@
 
 void coda_destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(coda_inode_cachep);
 }
 
diff -ur a/fs/compat.c b/fs/compat.c
--- a/fs/compat.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/compat.c	2014-02-17 11:56:59.000000000 +0100
@@ -60,6 +60,10 @@
 extern int syno_hibernation_log_sec;
 #endif
 
+#ifdef CONFIG_FS_SYNO_ACL
+#include "synoacl_int.h"
+#endif
+
 int compat_log = 1;
 
 int compat_printk(const char *fmt, ...)
@@ -1857,45 +1861,51 @@
 	int error;
 	struct path path;
 	struct inode *inode = NULL;
-	struct iattr newattrs;
-	compat_time_t   tv_sec;
-	s32     tv_nsec;
+	compat_time_t tv_sec;
+	s32 tv_nsec;
+	struct timespec crtime;
 
 	if (!pCtime) {
 		return -EINVAL;
 	}
-
-	error = user_path_at(AT_FDCWD, filename, LOOKUP_FOLLOW, &path);
+	error = get_user(tv_sec, &pCtime->tv_sec);
+	if (error)
+		goto out;
+	error = get_user(tv_nsec, &pCtime->tv_nsec);
 	if (error)
 		goto out;
-	inode = path.dentry->d_inode;
 
-	error = -EROFS;
-	if (IS_RDONLY(inode))
-		goto dput_and_out;
+	crtime.tv_sec = tv_sec;
+	crtime.tv_nsec = tv_nsec;
 
-	error = get_user(tv_sec, &pCtime->tv_sec);
+	error = user_path_at(AT_FDCWD, filename, LOOKUP_FOLLOW, &path);
 	if (error)
-		goto dput_and_out;
-	error = get_user(tv_nsec, &pCtime->tv_nsec);
+		goto out;
+
+	error = mnt_want_write(path.mnt);
 	if (error)
 		goto dput_and_out;
 
-	newattrs.ia_ctime.tv_sec = tv_sec;
-	newattrs.ia_ctime.tv_nsec = tv_nsec;
-	newattrs.ia_valid = ATTR_CREATE_TIME;
-	mutex_lock(&inode->i_mutex);
-	if (inode->i_op && inode->i_op->setattr)  {
-		error = inode->i_op->setattr(path.dentry, &newattrs);
-	} else {
-		error = inode_change_ok(inode, &newattrs);
-		if (!error)
-			setattr_copy(inode, &newattrs);
-			mark_inode_dirty(inode);
-			error = 0;
+	inode = path.dentry->d_inode;
+	if (!inode_owner_or_capable(inode)) {
+#ifdef CONFIG_FS_SYNO_ACL
+		if (IS_SYNOACL(path.dentry)) {
+			error = synoacl_op_perm(path.dentry, MAY_WRITE_ATTR | MAY_WRITE_EXT_ATTR);
+			if (error) 
+				goto drop_write;
+		} else {
+#endif
+			error = -EPERM;
+			goto drop_write;
+#ifdef CONFIG_FS_SYNO_ACL
+		}
+#endif
 	}
-	mutex_unlock(&inode->i_mutex);
 
+	error = syno_op_set_crtime(path.dentry, &crtime);
+
+drop_write:
+	mnt_drop_write(path.mnt);
 dput_and_out:
 	path_put(&path);
 out:
@@ -1903,46 +1913,6 @@
 }
 #endif
 
-#ifdef MY_ABC_HERE
-asmlinkage long compat_sys_SYNOmmap(compat_SYNO_MMAP_ARG __user *arg)
-{
-	long error = -EFAULT;
-	SYNO_MMAP_ARG arg64;
-	mm_segment_t oldfs = get_fs();
-
-	if (!arg) {
-		return -EFAULT;
-	}
-
-	if (unlikely(get_user(arg64.addr, &arg->addr)) ||
-		unlikely(get_user(arg64.len, &arg->len)) ||
-		unlikely(get_user(arg64.prot, &arg->prot)) ||
-		unlikely(get_user(arg64.flags, &arg->flags)) ||
-		unlikely(get_user(arg64.fd, &arg->fd)) ||
-		unlikely(get_user(arg64.pgoff, &arg->pgoff))) {
-		return -EFAULT;
-	}
-
-	set_fs(KERNEL_DS);
-	error = sys_mmap((unsigned long)arg64.addr, (unsigned long)arg64.len,
-					 (unsigned long)arg64.prot, (unsigned long)arg64.flags,
-					 (unsigned long)arg64.fd, (unsigned long)(arg64.pgoff << PAGE_SHIFT));
-	set_fs(oldfs);
-
-	if (unlikely(put_user((u32)arg64.addr, &arg->addr)) ||
-		unlikely(put_user((u32)arg64.len, &arg->len)) ||
-		unlikely(put_user((u32)arg64.prot, &arg->prot)) ||
-		unlikely(put_user((u32)arg64.flags, &arg->flags)) ||
-		unlikely(put_user((u32)arg64.fd, &arg->fd)) ||
-		unlikely(put_user((u32)arg64.pgoff, &arg->pgoff))) {
-		return -EFAULT;
-	}
-
-	return error;
-}
-#endif
-
-
 #ifdef CONFIG_FHANDLE
 /*
  * Exactly like fs/open.c:sys_open_by_handle_at(), except that it
diff -ur a/fs/cramfs/inode.c b/fs/cramfs/inode.c
--- a/fs/cramfs/inode.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/cramfs/inode.c	2014-02-17 11:57:01.000000000 +0100
@@ -378,7 +378,7 @@
 		unsigned long nextoffset;
 		char *name;
 		ino_t ino;
-		mode_t mode;
+		umode_t mode;
 		int namelen, error;
 
 		mutex_lock(&read_mutex);
diff -ur a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h
--- a/fs/ecryptfs/ecryptfs_kernel.h	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ecryptfs/ecryptfs_kernel.h	2014-02-17 11:56:59.000000000 +0100
@@ -330,6 +330,9 @@
 #define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK   0x00000020
 #define ECRYPTFS_GLOBAL_ENCFN_USE_FEK          0x00000040
 #define ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY    0x00000080
+#ifdef MY_ABC_HERE
+#define ECRYPTFS_SYNO_ERROR_REPORT             0x10000000
+#endif
 	u32 flags;
 	struct list_head global_auth_tok_list;
 	struct mutex global_auth_tok_list_mutex;
diff -ur a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
--- a/fs/ecryptfs/inode.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ecryptfs/inode.c	2014-02-17 11:56:59.000000000 +0100
@@ -36,8 +36,8 @@
 #include <asm/unaligned.h>
 #include "ecryptfs_kernel.h"
 
-#ifdef MY_ABC_HERE
-extern long __SYNOArchiveSet(struct dentry *dentry, unsigned int cmd);
+#ifdef CONFIG_FS_SYNO_ACL
+#include "../synoacl_int.h"
 #endif
 
 static struct dentry *lock_parent(struct dentry *dentry)
@@ -205,7 +205,7 @@
  */
 static struct inode *
 ecryptfs_do_create(struct inode *directory_inode,
-		   struct dentry *ecryptfs_dentry, int mode)
+		   struct dentry *ecryptfs_dentry, umode_t mode)
 {
 	int rc;
 	struct dentry *lower_dentry;
@@ -664,7 +664,7 @@
 static void CopySynoArchive(struct dentry *ecrypt_entry, struct dentry *lower_entry)
 {
 	if (ecrypt_entry && ecrypt_entry->d_inode && lower_entry && lower_entry->d_inode) {
-		fsstack_copy_syno_archive(ecrypt_entry->d_inode, lower_entry->d_inode);
+		ecrypt_entry->d_inode->i_mode2 = lower_entry->d_inode->i_mode2;
 	}
 }
 #endif
@@ -1020,61 +1020,84 @@
 }
 
 #ifdef MY_ABC_HERE
-static int
-ecryptfs_set_archive(struct dentry *dentry, int cmd)
+static int ecryptfs_syno_set_crtime(struct dentry *dentry, struct timespec *time)
 {
+	int error;
 	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	int err = __SYNOArchiveSet(lower_dentry, cmd);
 
-	if (!err) {
-		mutex_lock(&dentry->d_inode->i_syno_mutex);
-		fsstack_copy_syno_archive(dentry->d_inode, lower_dentry->d_inode);
-		mutex_unlock(&dentry->d_inode->i_syno_mutex);
+	error = syno_op_set_crtime(lower_dentry, time);
+	if (!error) {
+		dentry->d_inode->i_CreateTime = *time;
 	}
+	return error;
+}
+#endif
+
+#ifdef MY_ABC_HERE
+static int ecryptfs_syno_set_archive_bit(struct dentry *dentry, unsigned int arbit)
+{
+	int error;
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
 
-	return err;
+	error = syno_op_set_archive_bit(lower_dentry, arbit);
+	if (!error) {
+		dentry->d_inode->i_mode2 = arbit;
+	}
+	return error;
+}
+#endif //MY_ABC_HERE
+
+#ifdef MY_ABC_HERE
+static int ecryptfs_syno_set_archive_ver(struct dentry *dentry, u32 version)
+{
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+
+	if (!lower_dentry->d_inode->i_op->syno_set_archive_ver)
+		return -EINVAL;
+	return lower_dentry->d_inode->i_op->syno_set_archive_ver(lower_dentry, version);
+}
+
+static int ecryptfs_syno_get_archive_ver(struct dentry *dentry, u32 *version)
+{
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+
+	if (!lower_dentry->d_inode->i_op->syno_get_archive_ver)
+		return -EINVAL;
+	return lower_dentry->d_inode->i_op->syno_get_archive_ver(lower_dentry, version);
 }
 #endif
 
 #ifdef CONFIG_FS_SYNO_ACL
-#define IS_IOP_READY(x) (i_op && i_op->x)
-#define DO_IOP(x, ...) i_op->x(__VA_ARGS__)
-
-static int ecryptfs_get_syno_acl(struct dentry *dentry, int cmd, void *value, size_t size)
+static int ecryptfs_get_syno_acl_xattr(struct dentry *dentry, int cmd, void *value, size_t size)
 {
 	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	const struct inode_operations *i_op = lower_dentry->d_inode->i_op;
 
-	if (IS_IOP_READY(syno_acl_get)) {
-		return DO_IOP(syno_acl_get, lower_dentry, cmd, value, size);
-	}
-	return -EOPNOTSUPP;
+	return synoacl_mod_get_acl_xattr(lower_dentry, cmd, value, size);
 }
 
 static int
-ecryptfs_get_syno_permission(struct dentry *dentry, unsigned int *pPermAllow, unsigned int *pPermDeny)
+ecryptfs_syno_inode_change_ok(struct dentry *dentry, struct iattr *attr)
 {
 	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	const struct inode_operations *i_op = lower_dentry->d_inode->i_op;
 
-	if (IS_IOP_READY(syno_permission_get)) {
-		return DO_IOP(syno_permission_get, lower_dentry, pPermAllow, pPermDeny);
-	}
-	return -EOPNOTSUPP;
+	return synoacl_mod_inode_change_ok(lower_dentry, attr);
 }
 
 static int
-ecryptfs_syno_inode_change_ok(struct dentry *dentry, struct iattr *attr)
+ecryptfs_syno_arbit_chg_ok(struct dentry *dentry, unsigned int cmd, int tag, int mask)
 {
 	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	const struct inode_operations *i_op = lower_dentry->d_inode->i_op;
 
-	if (IS_IOP_READY(syno_inode_change_ok)) {
-		return DO_IOP(syno_inode_change_ok, lower_dentry, attr);
-	}
-	return inode_change_ok(lower_dentry->d_inode, attr);
+	return synoacl_mod_archive_change_ok(lower_dentry, cmd, tag, mask);
 }
 
+static int
+ecryptfs_syno_setattr_post(struct dentry *dentry, struct iattr *attr)
+{
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+
+	return synoacl_mod_setattr_post(lower_dentry, attr);
+}
 /*
  * Check Only 1 time.
  */
@@ -1082,26 +1105,25 @@
 ecryptfs_syno_exec_permission(struct dentry *dentry)
 {
 	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	const struct inode_operations *i_op = lower_dentry->d_inode->i_op;
 
-	if (IS_IOP_READY(syno_exec_permission)) {
-		return DO_IOP(syno_exec_permission, lower_dentry);
-	}
-	return 0;
+	return synoacl_mod_exec_permission(lower_dentry);
 }
 
 static int
-ecryptfs_syno_access(struct dentry *dentry, int mask)
+ecryptfs_syno_acl_access(struct dentry *dentry, int mask)
 {
 	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	const struct inode_operations *i_op = lower_dentry->d_inode->i_op;
 
-	if (IS_IOP_READY(syno_access)) {
-		return DO_IOP(syno_access, lower_dentry, mask);
-	}
-	return inode_permission(lower_dentry->d_inode, mask);
+	return synoacl_mod_access(lower_dentry, mask);
 }
 
+static void
+ecryptfs_syno_acl_to_mode(struct dentry *dentry, struct kstat *stat)
+{
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+
+	synoacl_mod_to_mode(lower_dentry, stat);
+}
 /*
  * For some operations(like vfs_create ), it checks 2 times.
  * For some operations(like openat() or SYNOACLPermCheck()), it checks only 1 times.
@@ -1110,15 +1132,33 @@
 ecryptfs_syno_permission(struct dentry *dentry, int mask)
 {
 	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
-	const struct inode_operations *i_op = lower_dentry->d_inode->i_op;
 
-	if (IS_IOP_READY(syno_permission)) {
-		return DO_IOP(syno_permission, lower_dentry, mask);
-	}
-	return inode_permission(lower_dentry->d_inode, mask);
+	return synoacl_mod_permission(lower_dentry, mask);
+}
+static int
+ecryptfs_syno_acl_init(struct dentry *dentry, struct inode *inode)
+{
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+
+	return synoacl_mod_init_acl(lower_dentry, lower_dentry->d_inode);
 }
 #endif //CONFIG_FS_SYNO_ACL
 
+#ifdef MY_ABC_HERE
+static int 
+ecryptfs_syno_getattr(struct dentry *dentry, struct kstat *st, int flags)
+{
+	struct inode *lower_inode = NULL;
+	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
+
+	lower_inode = lower_dentry->d_inode;
+	if (lower_inode->i_op->syno_getattr) {
+		return lower_inode->i_op->syno_getattr(lower_dentry, st, flags);
+	}
+	return -EOPNOTSUPP;
+}
+#endif //MY_ABC_HERE
+
 static int
 ecryptfs_permission(struct inode *inode, int mask)
 {
@@ -1188,9 +1228,17 @@
 	}
 	mutex_unlock(&crypt_stat->cs_mutex);
 
+#ifdef CONFIG_FS_SYNO_ACL
+	if (!IS_SYNOACL(lower_dentry)) {
+		rc = inode_change_ok(inode, ia);
+		if (rc)
+			goto out;
+	}
+#else
 	rc = inode_change_ok(inode, ia);
 	if (rc)
 		goto out;
+#endif
 	if (ia->ia_valid & ATTR_SIZE) {
 		rc = ecryptfs_inode_newsize_ok(inode, ia->ia_size);
 		if (rc)
@@ -1256,12 +1304,6 @@
 				      ecryptfs_inode_to_lower(dentry->d_inode));
 		generic_fillattr(dentry->d_inode, stat);
 		stat->blocks = lower_stat.blocks;
-#ifdef MY_ABC_HERE
-		stat->SynoMode = lower_stat.SynoMode;
-#endif
-#ifdef CONFIG_FS_SYNO_ACL
-		stat->mode = lower_stat.mode;
-#endif
 	}
 	return rc;
 }
@@ -1284,7 +1326,7 @@
 		 * Copy synoarchive since synoacl archive may be changed after setxattr 
 		 * No need to lock dentry because lock has done by vfs_setxattr().
 		 */
-		fsstack_copy_syno_archive(dentry->d_inode, lower_dentry->d_inode);
+		dentry->d_inode->i_mode2 = lower_dentry->d_inode->i_mode2;
 	}
 #endif
 
@@ -1305,6 +1347,7 @@
 		rc = -EOPNOTSUPP;
 		goto out;
 	}
+
 	mutex_lock(&lower_dentry->d_inode->i_mutex);
 	rc = lower_dentry->d_inode->i_op->getxattr(lower_dentry, name, value,
 						   size);
@@ -1345,6 +1388,7 @@
 	struct dentry *lower_dentry;
 
 	lower_dentry = ecryptfs_dentry_to_lower(dentry);
+
 	if (!lower_dentry->d_inode->i_op->removexattr) {
 		rc = -EOPNOTSUPP;
 		goto out;
@@ -1357,6 +1401,19 @@
 }
 
 const struct inode_operations ecryptfs_symlink_iops = {
+#ifdef MY_ABC_HERE
+	.syno_getattr = ecryptfs_syno_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_crtime = ecryptfs_syno_set_crtime,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_archive_bit = ecryptfs_syno_set_archive_bit,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_ver = ecryptfs_syno_get_archive_ver,
+	.syno_set_archive_ver = ecryptfs_syno_set_archive_ver,
+#endif
 	.readlink = ecryptfs_readlink,
 	.follow_link = ecryptfs_follow_link,
 	.put_link = ecryptfs_put_link,
@@ -1380,16 +1437,29 @@
 	.mknod = ecryptfs_mknod,
 	.rename = ecryptfs_rename,
 #ifdef CONFIG_FS_SYNO_ACL
+	.getattr = ecryptfs_getattr,
 	.syno_permission = ecryptfs_syno_permission,
-	.syno_access = ecryptfs_syno_access,
-	.syno_acl_get = ecryptfs_get_syno_acl,
+	.syno_acl_access = ecryptfs_syno_acl_access,
+	.syno_acl_xattr_get = ecryptfs_get_syno_acl_xattr,
 	.syno_exec_permission = ecryptfs_syno_exec_permission,
-	.getattr = ecryptfs_getattr,
-	.syno_permission_get = ecryptfs_get_syno_permission,
 	.syno_inode_change_ok = ecryptfs_syno_inode_change_ok,
+	.syno_arbit_chg_ok = ecryptfs_syno_arbit_chg_ok,
+	.syno_setattr_post = ecryptfs_syno_setattr_post,
+	.syno_acl_to_mode = ecryptfs_syno_acl_to_mode,
+	.syno_acl_init = ecryptfs_syno_acl_init,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_getattr = ecryptfs_syno_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_crtime = ecryptfs_syno_set_crtime,
 #endif
 #ifdef MY_ABC_HERE
-	.set_archive = ecryptfs_set_archive,
+	.syno_set_archive_bit = ecryptfs_syno_set_archive_bit,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_ver = ecryptfs_syno_get_archive_ver,
+	.syno_set_archive_ver = ecryptfs_syno_set_archive_ver,
 #endif
 	.permission = ecryptfs_permission,
 	.setattr = ecryptfs_setattr,
@@ -1401,15 +1471,28 @@
 
 const struct inode_operations ecryptfs_main_iops = {
 #ifdef CONFIG_FS_SYNO_ACL
-	.syno_acl_get = ecryptfs_get_syno_acl,
-	.syno_access = ecryptfs_syno_access,
+	.syno_acl_xattr_get = ecryptfs_get_syno_acl_xattr,
+	.syno_acl_access = ecryptfs_syno_acl_access,
 	.syno_permission = ecryptfs_syno_permission,
 	.syno_exec_permission = ecryptfs_syno_exec_permission,
-	.syno_permission_get = ecryptfs_get_syno_permission,
 	.syno_inode_change_ok = ecryptfs_syno_inode_change_ok,
+	.syno_arbit_chg_ok = ecryptfs_syno_arbit_chg_ok,
+	.syno_setattr_post = ecryptfs_syno_setattr_post,
+	.syno_acl_to_mode = ecryptfs_syno_acl_to_mode,
+	.syno_acl_init = ecryptfs_syno_acl_init,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_getattr = ecryptfs_syno_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_crtime = ecryptfs_syno_set_crtime,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_archive_bit = ecryptfs_syno_set_archive_bit,
 #endif
 #ifdef MY_ABC_HERE
-	.set_archive = ecryptfs_set_archive,
+	.syno_get_archive_ver = ecryptfs_syno_get_archive_ver,
+	.syno_set_archive_ver = ecryptfs_syno_set_archive_ver,
 #endif
 	.permission = ecryptfs_permission,
 	.setattr = ecryptfs_setattr,
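
The ecryptfs_syno_* operations added above all follow one stackable-filesystem recipe: translate the upper dentry to the lower one with ecryptfs_dentry_to_lower(), delegate the real work to the lower filesystem (or the synoacl module), and on success copy whatever attribute the upper inode caches back up. A condensed sketch with made-up names for the lower operation and the cached field:

static int ecryptfs_example_set_flag(struct dentry *dentry, u32 flag)
{
	struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
	int rc;

	rc = lower_set_flag(lower_dentry, flag);	/* placeholder lower-fs op */
	if (!rc) {
		/* keep the stacked inode's cached copy in sync */
		dentry->d_inode->i_example_flag =	/* placeholder field */
			lower_dentry->d_inode->i_example_flag;
	}
	return rc;
}
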
diff -ur a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
--- a/fs/ecryptfs/keystore.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ecryptfs/keystore.c	2014-02-17 11:56:59.000000000 +0100
@@ -649,6 +649,9 @@
 		printk(KERN_ERR "%s: Error attempting to find auth tok for "
 		       "fnek sig [%s]; rc = [%d]\n", __func__,
 		       mount_crypt_stat->global_default_fnek_sig, rc);
+#ifdef MY_ABC_HERE
+		mount_crypt_stat->flags |= ECRYPTFS_SYNO_ERROR_REPORT;
+#endif
 		goto out;
 	}
 	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(
@@ -999,6 +1002,9 @@
 		printk(KERN_ERR "%s: Error attempting to find auth tok for "
 		       "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex,
 		       rc);
+#ifdef MY_ABC_HERE
+		mount_crypt_stat->flags |= ECRYPTFS_SYNO_ERROR_REPORT;
+#endif
 		goto out;
 	}
 	rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm,
diff -ur a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
--- a/fs/ecryptfs/main.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ecryptfs/main.c	2014-02-17 11:56:59.000000000 +0100
@@ -728,6 +728,12 @@
 {
 	int i;
 
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+
 	for (i = 0; i < ARRAY_SIZE(ecryptfs_cache_infos); i++) {
 		struct ecryptfs_cache_info *info;
 
diff -ur a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
--- a/fs/ecryptfs/super.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ecryptfs/super.c	2014-02-17 11:56:59.000000000 +0100
@@ -177,11 +177,59 @@
 		seq_printf(m, ",ecryptfs_unlink_sigs");
 	if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_MOUNT_AUTH_TOK_ONLY)
 		seq_printf(m, ",ecryptfs_mount_auth_tok_only");
+#ifdef MY_ABC_HERE
+	if (mount_crypt_stat->flags & ECRYPTFS_SYNO_ERROR_REPORT)
+		seq_printf(m, ",syno_error_report");
+#endif
 
 	return 0;
 }
 
+#ifdef MY_ABC_HERE
+static int ecryptfs_get_sb_archive_ver(struct super_block *sb, u32 *archive_ver)
+{
+	struct super_block *lower_sb = ecryptfs_superblock_to_lower(sb);
+	if (!lower_sb->s_op->syno_get_sb_archive_ver)
+		return -EINVAL;
+	return lower_sb->s_op->syno_get_sb_archive_ver(lower_sb, archive_ver);
+}
+
+static int ecryptfs_set_sb_archive_ver(struct super_block *sb, u32 archive_ver)
+{
+	struct super_block *lower_sb = ecryptfs_superblock_to_lower(sb);
+	if (!lower_sb->s_op->syno_set_sb_archive_ver)
+		return -EINVAL;
+	return lower_sb->s_op->syno_set_sb_archive_ver(lower_sb, archive_ver);
+}
+
+#ifdef MY_ABC_HERE
+static int ecryptfs_get_sb_archive_ver1(struct super_block *sb, u32 *archive_ver)
+{
+	struct super_block *lower_sb = ecryptfs_superblock_to_lower(sb);
+	if (!lower_sb->s_op->syno_get_sb_archive_ver1)
+		return -EINVAL;
+	return lower_sb->s_op->syno_get_sb_archive_ver1(lower_sb, archive_ver);
+}
+
+static int ecryptfs_set_sb_archive_ver1(struct super_block *sb, u32 archive_ver)
+{
+	struct super_block *lower_sb = ecryptfs_superblock_to_lower(sb);
+	if (!lower_sb->s_op->syno_set_sb_archive_ver1)
+		return -EINVAL;
+	return lower_sb->s_op->syno_set_sb_archive_ver1(lower_sb, archive_ver);
+}
+#endif /* MY_ABC_HERE */
+#endif /* MY_ABC_HERE */
+
 const struct super_operations ecryptfs_sops = {
+#ifdef MY_ABC_HERE
+	.syno_get_sb_archive_ver = ecryptfs_get_sb_archive_ver,
+	.syno_set_sb_archive_ver = ecryptfs_set_sb_archive_ver,
+#ifdef MY_ABC_HERE
+	.syno_get_sb_archive_ver1 = ecryptfs_get_sb_archive_ver1,
+	.syno_set_sb_archive_ver1 = ecryptfs_set_sb_archive_ver1,
+#endif
+#endif
 	.alloc_inode = ecryptfs_alloc_inode,
 	.destroy_inode = ecryptfs_destroy_inode,
 	.drop_inode = generic_delete_inode,
diff -ur a/fs/efs/super.c b/fs/efs/super.c
--- a/fs/efs/super.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/efs/super.c	2014-02-17 11:56:56.000000000 +0100
@@ -96,6 +96,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(efs_inode_cachep);
 }
 
diff -ur a/fs/eventpoll.c b/fs/eventpoll.c
--- a/fs/eventpoll.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/eventpoll.c	2014-02-17 11:57:01.000000000 +0100
@@ -699,9 +699,12 @@
 			       void *priv)
 {
 	struct epitem *epi, *tmp;
+	poll_table pt;
 
+	init_poll_funcptr(&pt, NULL);
 	list_for_each_entry_safe(epi, tmp, head, rdllink) {
-		if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+		pt._key = epi->event.events;
+		if (epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
 		    epi->event.events)
 			return POLLIN | POLLRDNORM;
 		else {
@@ -1097,6 +1100,7 @@
 	/* Initialize the poll table using the queue callback */
 	epq.epi = epi;
 	init_poll_funcptr(&epq.pt, ep_ptable_queue_proc);
+	epq.pt._key = event->events;
 
 	/*
 	 * Attach the item to the poll hooks and get current event bits.
@@ -1191,13 +1195,17 @@
 {
 	int pwake = 0;
 	unsigned int revents;
+	poll_table pt;
+
+	init_poll_funcptr(&pt, NULL);
 
 	/*
 	 * Set the new event interest mask before calling f_op->poll();
 	 * otherwise we might miss an event that happens between the
 	 * f_op->poll() call and the new event set registering.
 	 */
-	epi->event.events = event->events; /* need barrier below */
+	epi->event.events = event->events;
+	pt._key = event->events;
 	epi->event.data = event->data; /* protected by mtx */
 
 	/*
@@ -1224,7 +1232,7 @@
 	 * Get current event bits. We can safely use the file* here because
 	 * its usage count has been increased by the caller of this function.
 	 */
-	revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
+	revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt);
 
 	/*
 	 * If the item is "hot" and it is not registered inside the ready
@@ -1259,6 +1267,9 @@
 	unsigned int revents;
 	struct epitem *epi;
 	struct epoll_event __user *uevent;
+	poll_table pt;
+
+	init_poll_funcptr(&pt, NULL);
 
 	/*
 	 * We can loop without lock because we are passed a task private list.
@@ -1271,7 +1282,8 @@
 
 		list_del_init(&epi->rdllink);
 
-		revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+		pt._key = epi->event.events;
+		revents = epi->ffd.file->f_op->poll(epi->ffd.file, &pt) &
 			epi->event.events;
 
 		/*
diff -ur a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/exec.c	2014-02-17 11:56:58.000000000 +0100
@@ -624,7 +624,11 @@
 		 * when the old and new regions overlap clear from new_end.
 		 */
 		free_pgd_range(&tlb, new_end, old_end, new_end,
+#if defined(CONFIG_SYNO_COMCERTO)
+			vma->vm_next ? vma->vm_next->vm_start : mm->task_size);
+#else
 			vma->vm_next ? vma->vm_next->vm_start : 0);
+#endif
 	} else {
 		/*
 		 * otherwise, clean from old_start; this is done to not touch
@@ -633,7 +637,11 @@
 		 * for the others its just a little faster.
 		 */
 		free_pgd_range(&tlb, old_start, old_end, new_end,
+#if defined(CONFIG_SYNO_COMCERTO)
+			vma->vm_next ? vma->vm_next->vm_start : mm->task_size);
+#else
 			vma->vm_next ? vma->vm_next->vm_start : 0);
+#endif
 	}
 	tlb_finish_mmu(&tlb, new_end, old_end);
 
diff -ur a/fs/exofs/super.c b/fs/exofs/super.c
--- a/fs/exofs/super.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/exofs/super.c	2014-02-17 11:57:00.000000000 +0100
@@ -206,6 +206,11 @@
  */
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(exofs_inode_cachep);
 }
 
diff -ur a/fs/ext2/inode.c b/fs/ext2/inode.c
--- a/fs/ext2/inode.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ext2/inode.c	2014-02-17 11:56:58.000000000 +0100
@@ -84,6 +84,7 @@
 	truncate_inode_pages(&inode->i_data, 0);
 
 	if (want_delete) {
+		sb_start_intwrite(inode->i_sb);
 		/* set dtime */
 		EXT2_I(inode)->i_dtime	= get_seconds();
 		mark_inode_dirty(inode);
@@ -103,8 +104,10 @@
 	if (unlikely(rsv))
 		kfree(rsv);
 
-	if (want_delete)
+	if (want_delete) {
 		ext2_free_inode(inode);
+		sb_end_intwrite(inode->i_sb);
+	}
 }
 
 typedef struct {
diff -ur a/fs/ext2/super.c b/fs/ext2/super.c
--- a/fs/ext2/super.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ext2/super.c	2014-02-17 11:56:58.000000000 +0100
@@ -42,6 +42,8 @@
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext2_sync_fs(struct super_block *sb, int wait);
+static int ext2_freeze(struct super_block *sb);
+static int ext2_unfreeze(struct super_block *sb);
 
 void ext2_error(struct super_block *sb, const char *function,
 		const char *fmt, ...)
@@ -207,6 +209,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ext2_inode_cachep);
 }
 
@@ -307,6 +314,8 @@
 	.put_super	= ext2_put_super,
 	.write_super	= ext2_write_super,
 	.sync_fs	= ext2_sync_fs,
+	.freeze_fs	= ext2_freeze,
+	.unfreeze_fs	= ext2_unfreeze,
 	.statfs		= ext2_statfs,
 	.remount_fs	= ext2_remount,
 	.show_options	= ext2_show_options,
@@ -1191,6 +1200,35 @@
 	return 0;
 }
 
+static int ext2_freeze(struct super_block *sb)
+{
+	struct ext2_sb_info *sbi = EXT2_SB(sb);
+
+	/*
+	 * Open but unlinked files present? Keep EXT2_VALID_FS flag cleared
+	 * because we have unattached inodes and thus filesystem is not fully
+	 * consistent.
+	 */
+	if (atomic_long_read(&sb->s_remove_count)) {
+		ext2_sync_fs(sb, 1);
+		return 0;
+	}
+	/* Set EXT2_VALID_FS flag */
+	spin_lock(&sbi->s_lock);
+	sbi->s_es->s_state = cpu_to_le16(sbi->s_mount_state);
+	spin_unlock(&sbi->s_lock);
+	ext2_sync_super(sb, sbi->s_es, 1);
+
+	return 0;
+}
+
+static int ext2_unfreeze(struct super_block *sb)
+{
+	/* Just write sb to clear EXT2_VALID_FS flag */
+	ext2_write_super(sb);
+
+	return 0;
+}
 
 void ext2_write_super(struct super_block *sb)
 {
diff -ur a/fs/ext3/file.c b/fs/ext3/file.c
--- a/fs/ext3/file.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ext3/file.c	2014-02-17 11:56:56.000000000 +0100
@@ -71,6 +71,13 @@
 };
 
 const struct inode_operations ext3_file_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_ext3_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_ver = syno_ext3_get_archive_ver,
+	.syno_set_archive_ver = syno_ext3_set_archive_ver,
+#endif
 	.setattr	= ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
 	.setxattr	= generic_setxattr,
@@ -78,9 +85,6 @@
 	.listxattr	= ext3_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-#ifdef MY_ABC_HERE
-	.synosetxattr	= syno_generic_setxattr,
-#endif
 	.get_acl	= ext3_get_acl,
 	.fiemap		= ext3_fiemap,
 };
diff -ur a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
--- a/fs/ext3/ialloc.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ext3/ialloc.c	2014-02-17 11:56:56.000000000 +0100
@@ -574,9 +574,6 @@
 #ifdef MY_ABC_HERE
 	inode->i_mode2 = ALL_SYNO_ARCHIVE;   /* set archive bit on creation */
 #endif
-#ifdef MY_ABC_HERE
-	inode->i_archive_version = inode->i_sb->s_archive_version;
-#endif
 
 	memset(ei->i_data, 0, sizeof(ei->i_data));
 	ei->i_dir_start_lookup = 0;
diff -ur a/fs/ext3/inode.c b/fs/ext3/inode.c
--- a/fs/ext3/inode.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ext3/inode.c	2014-02-17 11:56:56.000000000 +0100
@@ -1277,29 +1277,18 @@
 	to = from + len;
 
 retry:
-#ifndef MY_ABC_HERE
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page)
 		return -ENOMEM;
 	*pagep = page;
-#endif
 
 	handle = ext3_journal_start(inode, needed_blocks);
 	if (IS_ERR(handle)) {
-#ifndef MY_ABC_HERE
 		unlock_page(page);
 		page_cache_release(page);
-#endif
 		ret = PTR_ERR(handle);
 		goto out;
 	}
-#ifdef MY_ABC_HERE
-	flags |= AOP_FLAG_NOFS;
-	page = grab_cache_page_write_begin(mapping, index, flags);
-	if (!page)
-		return -ENOMEM;
-	*pagep = page;
-#endif
 
 	ret = __block_write_begin(page, pos, len, ext3_get_block);
 	if (ret)
@@ -2925,10 +2914,6 @@
 	struct ext3_inode_info *ei;
 	struct buffer_head *bh;
 	struct inode *inode;
-#ifdef MY_ABC_HERE
-	struct syno_xattr_archive_version value;
-	int retval;
-#endif
 
 	journal_t *journal = EXT3_SB(sb)->s_journal;
 	transaction_t *transaction;
@@ -3095,14 +3080,6 @@
 	}
 	brelse (iloc.bh);
 	ext3_set_inode_flags(inode);
-#ifdef MY_ABC_HERE
-	retval = ext3_xattr_get(inode, EXT3_XATTR_INDEX_SYNO, XATTR_SYNO_ARCHIVE_VERSION, &value, sizeof(value));
-	if(retval>0) {
-		inode->i_archive_version = le32_to_cpu(value.v_archive_version);
-	} else {
-		inode->i_archive_version = 0;
-	}
-#endif
 	unlock_new_inode(inode);
 	return inode;
 
@@ -3423,6 +3400,74 @@
 	return error;
 }
 
+#ifdef MY_ABC_HERE
+int syno_ext3_getattr(struct dentry *d, struct kstat *stat, int flags)
+{
+	struct inode *inode = d->d_inode;
+	int err = 0;
+
+#ifdef MY_ABC_HERE
+	if (flags & SYNOST_CREATIME) {
+		stat->SynoCreateTime = inode->i_CreateTime;
+	}
+#endif
+#ifdef MY_ABC_HERE
+	if (flags & SYNOST_ARBIT) {
+		stat->SynoMode = inode->i_mode2;
+	}
+#endif
+#ifdef MY_ABC_HERE
+	if (flags & SYNOST_BKPVER) {
+		err = syno_ext3_get_archive_ver(d, &stat->syno_archive_version);
+	}
+#endif
+	return err;
+}
+#endif
+
+#ifdef MY_ABC_HERE
+int syno_ext3_set_archive_ver(struct dentry *dentry, u32 version)
+{
+	struct inode *inode = dentry->d_inode;
+	struct syno_xattr_archive_version value;
+	int err;
+
+	value.v_magic = cpu_to_le16(0x2552);
+	value.v_struct_version = cpu_to_le16(1);
+	value.v_archive_version = cpu_to_le32(version);
+	err = ext3_xattr_set(inode, EXT3_XATTR_INDEX_SYNO, XATTR_SYNO_ARCHIVE_VERSION, &value, sizeof(value), 0);
+	if (!err) {
+		inode->i_archive_version = version;
+		inode->i_flags |= S_ARCHIVE_VERSION_CACHED;
+	}
+	return err;
+}
+
+int syno_ext3_get_archive_ver(struct dentry *dentry, u32 *version)
+{
+	struct inode *inode = dentry->d_inode;
+	struct syno_xattr_archive_version value;
+	int err;
+
+	if (IS_ARCHIVE_VERSION_CACHED(inode)) {
+		*version = inode->i_archive_version;
+		return 0;
+	}
+
+	err = ext3_xattr_get(inode, EXT3_XATTR_INDEX_SYNO, XATTR_SYNO_ARCHIVE_VERSION, &value, sizeof(value));
+	if (0 < err) {
+		inode->i_archive_version = le32_to_cpu(value.v_archive_version);
+	} else if (-ENODATA == err) {
+		inode->i_archive_version = 0;
+	} else {
+		*version = 0;
+		return err;
+	}
+	*version = inode->i_archive_version;
+	inode->i_flags |= S_ARCHIVE_VERSION_CACHED;
+	return 0;
+}
+#endif
 
 /*
  * How many blocks doth make a writepage()?
diff -ur a/fs/ext3/namei.c b/fs/ext3/namei.c
--- a/fs/ext3/namei.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ext3/namei.c	2014-02-17 11:56:56.000000000 +0100
@@ -2463,7 +2463,7 @@
 			err = PTR_ERR(handle);
 			goto err_drop_inode;
 		}
-		inc_nlink(inode);
+		set_nlink(inode, 1);
 		err = ext3_orphan_del(handle, inode);
 		if (err) {
 			ext3_journal_stop(handle);
@@ -2723,6 +2723,13 @@
  * directories can handle most operations...
  */
 const struct inode_operations ext3_dir_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_ext3_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_ver = syno_ext3_get_archive_ver,
+	.syno_set_archive_ver = syno_ext3_set_archive_ver,
+#endif
 	.create		= ext3_create,
 	.lookup		= ext3_lookup,
 	.link		= ext3_link,
@@ -2739,13 +2746,17 @@
 	.listxattr	= ext3_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
-#ifdef MY_ABC_HERE
-	.synosetxattr	= syno_generic_setxattr,
-#endif
 	.get_acl	= ext3_get_acl,
 };
 
 const struct inode_operations ext3_special_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_ext3_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_ver = syno_ext3_get_archive_ver,
+	.syno_set_archive_ver = syno_ext3_set_archive_ver,
+#endif
 	.setattr	= ext3_setattr,
 #ifdef CONFIG_EXT3_FS_XATTR
 	.setxattr	= generic_setxattr,
diff -ur a/fs/ext3/super.c b/fs/ext3/super.c
--- a/fs/ext3/super.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ext3/super.c	2014-02-17 11:56:56.000000000 +0100
@@ -95,13 +95,6 @@
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);
 
-#ifdef MY_ABC_HERE
-	/* strengthen freezing fs as ext4 */
-	if (!journal_current_handle()) {
-		vfs_check_frozen(sb, SB_FREEZE_TRANS);
-	}
-#endif
-
 	/* Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly. */
@@ -568,6 +561,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ext3_inode_cachep);
 }
 
@@ -783,6 +781,42 @@
 	return try_to_free_buffers(page);
 }
 
+#ifdef MY_ABC_HERE
+static int syno_ext3_set_sb_archive_ver(struct super_block *sb, u32 archive_ver)
+{
+	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
+	handle_t *handle;
+	int err, err2;
+
+	sb->s_archive_version = archive_ver;
+	es->s_archive_version = cpu_to_le32(sb->s_archive_version);
+
+	handle = ext3_journal_start_sb(sb, 1);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto exit;
+	}
+	err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh);
+	if (err) {
+		goto exit_journal;
+	}
+	err = ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
+exit_journal:
+	err2 = ext3_journal_stop(handle);
+	if (!err) {
+		err = err2;
+	}
+exit:
+	return err;
+}
+
+static int syno_ext3_get_sb_archive_ver(struct super_block *sb, u32 *archive_ver)
+{
+	*archive_ver = sb->s_archive_version;
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group")
 #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -822,6 +856,10 @@
 #endif
 
 static const struct super_operations ext3_sops = {
+#ifdef MY_ABC_HERE
+	.syno_set_sb_archive_ver = syno_ext3_set_sb_archive_ver,
+	.syno_get_sb_archive_ver = syno_ext3_get_sb_archive_ver,
+#endif
 	.alloc_inode	= ext3_alloc_inode,
 	.destroy_inode	= ext3_destroy_inode,
 	.write_inode	= ext3_write_inode,
@@ -2483,9 +2521,6 @@
 		es->s_wtime = cpu_to_le32(get_seconds());
 	es->s_free_blocks_count = cpu_to_le32(ext3_count_free_blocks(sb));
 	es->s_free_inodes_count = cpu_to_le32(ext3_count_free_inodes(sb));
-#ifdef MY_ABC_HERE
-	es->s_archive_version = cpu_to_le32(sb->s_archive_version);
-#endif
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
 	if (sync) {
diff -ur a/fs/ext3/symlink.c b/fs/ext3/symlink.c
--- a/fs/ext3/symlink.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ext3/symlink.c	2014-02-17 11:56:56.000000000 +0100
@@ -31,6 +31,13 @@
 }
 
 const struct inode_operations ext3_symlink_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_ext3_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_ver = syno_ext3_get_archive_ver,
+	.syno_set_archive_ver = syno_ext3_set_archive_ver,
+#endif
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
@@ -44,6 +51,13 @@
 };
 
 const struct inode_operations ext3_fast_symlink_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_ext3_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_ver = syno_ext3_get_archive_ver,
+	.syno_set_archive_ver = syno_ext3_set_archive_ver,
+#endif
 	.readlink	= generic_readlink,
 	.follow_link	= ext3_follow_link,
 	.setattr	= ext3_setattr,
diff -ur a/fs/ext3/xattr.c b/fs/ext3/xattr.c
--- a/fs/ext3/xattr.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ext3/xattr.c	2014-02-17 11:56:56.000000000 +0100
@@ -1377,22 +1377,10 @@
 			      value, size, flags);
 }
 
-static int ext3_xattr_syno_set_compact(struct inode *inode, const char *name,
-			  const void *value, size_t size, int flags, int handler_flags)
-{
-	if (strcmp(name, "") == 0){
-		return -EINVAL;
-	}
-
-	return ext3_xattr_set(inode, EXT3_XATTR_INDEX_SYNO, name,
-			      value, size, flags);
-}
-
 struct xattr_handler ext3_xattr_syno_handler = {
 	.prefix	= XATTR_SYNO_PREFIX,
 	.list	= ext3_xattr_syno_list,
 	.get	= ext3_xattr_syno_get,
 	.set	= ext3_xattr_syno_set,
-	.set_compact_syno	= ext3_xattr_syno_set_compact,
 };
 #endif
diff -ur a/fs/ext4/balloc.c b/fs/ext4/balloc.c
--- a/fs/ext4/balloc.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ext4/balloc.c	2014-02-17 11:57:00.000000000 +0100
@@ -486,9 +486,6 @@
 
 	return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
 }
-#ifdef CONFIG_EXT4_FS_SYNO_ACL
-EXPORT_SYMBOL(ext4_should_retry_alloc);
-#endif
 
 /*
  * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks
diff -ur a/fs/ext4/ext4.h b/fs/ext4/ext4.h
--- a/fs/ext4/ext4.h	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ext4/ext4.h	2014-02-17 11:56:59.000000000 +0100
@@ -1123,7 +1123,7 @@
 #if defined(MY_ABC_HERE) || defined(MY_ABC_HERE)
 	__le32  s_reserved[106];        /* Padding to the end of the block */
 	__le32  s_archive_version;      /* Last archived version */
-	__le32  s_syno_reserved;
+	__le32  s_archive_version_obsoleted;
 	__le32  s_syno_hash_magic;      /* Enable Htree if the magic is given */
 #else
 	__le32	s_reserved[109];        /* Padding to the end of the block */
@@ -1263,6 +1263,7 @@
 	struct flex_groups *s_flex_groups;
 #ifdef MY_ABC_HERE
 	int s_new_error_fs_event_flag;
+	char *s_mount_path;
 #endif
 #ifdef MY_DEF_HERE
 	int s_swap_create_time;
@@ -1934,6 +1935,13 @@
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
+#ifdef MY_ABC_HERE
+extern int syno_ext4_getattr(struct dentry *d, struct kstat *stat, int flags);
+#endif
+#ifdef MY_ABC_HERE
+extern int syno_ext4_get_archive_ver(struct dentry *d, u32 *);
+extern int syno_ext4_set_archive_ver(struct dentry *d, u32);
+#endif
 
 /* indirect.c */
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
diff -ur a/fs/ext4/file.c b/fs/ext4/file.c
--- a/fs/ext4/file.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ext4/file.c	2014-02-17 11:57:00.000000000 +0100
@@ -30,7 +30,7 @@
 #include "acl.h"
 
 #ifdef CONFIG_EXT4_FS_SYNO_ACL
-#include "synoacl_int.h"
+#include "syno_acl.h"
 #endif
 
 /*
@@ -247,7 +247,11 @@
 	.release	= ext4_release_file,
 	.fsync		= ext4_sync_file,
 	.splice_read	= generic_file_splice_read,
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_IMPROVED_SPLICE)
+	.splice_write	= comcerto_file_splice_write,
+#else
 	.splice_write	= generic_file_splice_write,
+#endif
 #if defined(CONFIG_SYNO_ARMADA)
 	.splice_from_socket = generic_splice_from_socket,
 #endif
@@ -255,6 +259,13 @@
 };
 
 const struct inode_operations ext4_file_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_ext4_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_ver = syno_ext4_get_archive_ver,
+	.syno_set_archive_ver = syno_ext4_set_archive_ver,
+#endif
 	.setattr	= ext4_setattr,
 	.getattr	= ext4_getattr,
 #ifdef CONFIG_EXT4_FS_XATTR
@@ -262,19 +273,13 @@
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
-#ifdef MY_ABC_HERE
-	.synosetxattr	= syno_generic_setxattr,
-#endif
 #endif
 #ifdef CONFIG_EXT4_FS_SYNO_ACL
-	.syno_acl_get = ext4_mod_get_syno_acl_inherit,
-	.syno_access = ext4_mod_syno_access,
-	.syno_permission = ext4_mod_syno_permission,
-	.syno_exec_permission = ext4_mod_syno_exec_permission,
-	.syno_permission_get = ext4_mod_get_syno_permission,
-	.syno_inode_change_ok = ext4_mod_syno_inode_change_ok,
-#endif
+	.syno_acl_get   = ext4_get_syno_acl,
+	.syno_acl_set	= ext4_set_syno_acl,
+#else
 	.get_acl	= ext4_get_acl,
+#endif
 	.fiemap		= ext4_fiemap,
 };
 
diff -ur a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
--- a/fs/ext4/ialloc.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ext4/ialloc.c	2014-02-17 11:56:59.000000000 +0100
@@ -1072,9 +1072,6 @@
 #ifdef MY_ABC_HERE
 	inode->i_mode2 = ALL_SYNO_ARCHIVE;   /* set archive bit on creation */
 #endif
-#ifdef MY_ABC_HERE
-	inode->i_archive_version = inode->i_sb->s_archive_version;
-#endif
 
 	memset(ei->i_data, 0, sizeof(ei->i_data));
 	ei->i_dir_start_lookup = 0;
diff -ur a/fs/ext4/inode.c b/fs/ext4/inode.c
--- a/fs/ext4/inode.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ext4/inode.c	2014-02-17 11:57:00.000000000 +0100
@@ -48,7 +48,7 @@
 #include "truncate.h"
 
 #ifdef CONFIG_EXT4_FS_SYNO_ACL
-#include "synoacl_int.h"
+#include "syno_acl.h"
 #endif
 
 #include <trace/events/ext4.h>
@@ -174,6 +174,11 @@
 	if (is_bad_inode(inode))
 		goto no_delete;
 
+	/*
+	 * Protect us against freezing - iput() caller didn't have to have any
+	 * protection against it
+	 */
+	sb_start_intwrite(inode->i_sb);
 	handle = ext4_journal_start(inode, ext4_blocks_for_truncate(inode)+3);
 	if (IS_ERR(handle)) {
 		ext4_std_error(inode->i_sb, PTR_ERR(handle));
@@ -183,6 +188,7 @@
 		 * cleaned up.
 		 */
 		ext4_orphan_del(NULL, inode);
+		sb_end_intwrite(inode->i_sb);
 		goto no_delete;
 	}
 
@@ -214,6 +220,7 @@
 		stop_handle:
 			ext4_journal_stop(handle);
 			ext4_orphan_del(NULL, inode);
+			sb_end_intwrite(inode->i_sb);
 			goto no_delete;
 		}
 	}
@@ -242,6 +249,7 @@
 	else
 		ext4_free_inode(handle, inode);
 	ext4_journal_stop(handle);
+	sb_end_intwrite(inode->i_sb);
 	return;
 no_delete:
 	ext4_clear_inode(inode);	/* We must guarantee clearing of inode... */
@@ -3698,10 +3706,6 @@
 	journal_t *journal = EXT4_SB(sb)->s_journal;
 	long ret;
 	int block;
-#ifdef MY_ABC_HERE
-	struct syno_xattr_archive_version value;
-	int retval;
-#endif
 
 	inode = iget_locked(sb, ino);
 	if (!inode)
@@ -3896,14 +3900,6 @@
 	}
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
-#ifdef MY_ABC_HERE
-	retval = ext4_xattr_get(inode, EXT4_XATTR_INDEX_SYNO, XATTR_SYNO_ARCHIVE_VERSION, &value, sizeof(value));
-	if(retval>0) {
-		inode->i_archive_version = le32_to_cpu(value.v_archive_version);
-	} else {
-		inode->i_archive_version = 0;
-	}
-#endif
 	unlock_new_inode(inode);
 	return inode;
 
@@ -4200,13 +4196,14 @@
 	const unsigned int ia_valid = attr->ia_valid;
 
 #ifdef CONFIG_EXT4_FS_SYNO_ACL
-	if (IS_SYNOACL(inode)) {
-		error = inode->i_op->syno_inode_change_ok(dentry, attr);
-	} else 
+	if (!IS_EXT4_SYNOACL(inode)) {
 #endif
 	error = inode_change_ok(inode, attr);
 	if (error)
 		return error;
+#ifdef CONFIG_EXT4_FS_SYNO_ACL
+	}
+#endif
 
 	if (is_quota_modification(inode, attr))
 		dquot_initialize(inode);
@@ -4230,16 +4227,8 @@
 		/* Update corresponding info in inode so that everything is in
 		 * one transaction */
 		if (attr->ia_valid & ATTR_UID)
-#if defined(MY_ABC_HERE) && defined(CONFIG_EXT4_FS_SYNO_ACL)
-		{
-			inode->i_uid = attr->ia_uid;
-			if (IS_SYNOACL_OWNER_IS_GROUP(inode)) {
-				ext4_mod_syno_archive_safe_clean(inode, S2_SYNO_ACL_IS_OWNER_GROUP);
-			}
-		}
-#else
 			inode->i_uid = attr->ia_uid;
-#endif
+
 		if (attr->ia_valid & ATTR_GID)
 			inode->i_gid = attr->ia_gid;
 		error = ext4_mark_inode_dirty(handle, inode);
@@ -4316,17 +4305,7 @@
 		ext4_orphan_del(NULL, inode);
 
 	if (!rc && (ia_valid & ATTR_MODE))
-#ifdef CONFIG_EXT4_FS_SYNO_ACL
-	{
-		if (IS_SYNOACL(inode)) {
-			rc = ext4_mod_syno_acl_chmod(inode);
-		} else {
-			rc = ext4_acl_chmod(inode);
-		}
-	}
-#else
-	rc = ext4_acl_chmod(inode);
-#endif
+		rc = ext4_acl_chmod(inode);
 
 err_out:
 	ext4_std_error(inode->i_sb, error);
@@ -4344,11 +4323,6 @@
 	inode = dentry->d_inode;
 	generic_fillattr(inode, stat);
 
-#ifdef CONFIG_EXT4_FS_SYNO_ACL
-	if (IS_SYNOACL(inode)) {
-		ext4_mod_synoacl_to_mode(dentry, inode, stat);
-	}
-#endif //CONFIG_EXT4_FS_SYNO_ACL
 	/*
 	 * We can't update i_blocks if the block allocation is delayed
 	 * otherwise in the case of system crash before the real block
@@ -4365,6 +4339,75 @@
 	return 0;
 }
 
+#ifdef MY_ABC_HERE
+int syno_ext4_getattr(struct dentry *d, struct kstat *stat, int flags)
+{
+	struct inode *inode = d->d_inode;
+	int err = 0;
+
+#ifdef MY_ABC_HERE
+	if (flags & SYNOST_CREATIME) {
+		stat->SynoCreateTime = inode->i_CreateTime;
+	}
+#endif
+#ifdef MY_ABC_HERE
+	if (flags & SYNOST_ARBIT) {
+		stat->SynoMode = inode->i_mode2;
+	}
+#endif
+#ifdef MY_ABC_HERE
+	if (flags & SYNOST_BKPVER) {
+		err = syno_ext4_get_archive_ver(d, &stat->syno_archive_version);
+	}
+#endif
+	return err;
+}
+#endif
+
+#ifdef MY_ABC_HERE
+int syno_ext4_set_archive_ver(struct dentry *dentry, u32 version)
+{
+	struct inode *inode = dentry->d_inode;
+	struct syno_xattr_archive_version value;
+	int err;
+
+	value.v_magic = cpu_to_le16(0x2552);
+	value.v_struct_version = cpu_to_le16(1);
+	value.v_archive_version = cpu_to_le32(version);
+	err = ext4_xattr_set(inode, EXT4_XATTR_INDEX_SYNO, XATTR_SYNO_ARCHIVE_VERSION, &value, sizeof(value), 0);
+	if (!err) {
+		inode->i_archive_version = version;
+		inode->i_flags |= S_ARCHIVE_VERSION_CACHED;
+	}
+	return err;
+}
+
+int syno_ext4_get_archive_ver(struct dentry *dentry, u32 *version)
+{
+	struct inode *inode = dentry->d_inode;
+	struct syno_xattr_archive_version value;
+	int err;
+
+	if (IS_ARCHIVE_VERSION_CACHED(inode)) {
+		*version = inode->i_archive_version;
+		return 0;
+	}
+
+	err = ext4_xattr_get(inode, EXT4_XATTR_INDEX_SYNO, XATTR_SYNO_ARCHIVE_VERSION, &value, sizeof(value));
+	if (0 < err) {
+		inode->i_archive_version = le32_to_cpu(value.v_archive_version);
+	} else if (-ENODATA == err) {
+		inode->i_archive_version = 0;
+	} else {
+		*version = 0;
+		return err;
+	}
+	*version = inode->i_archive_version;
+	inode->i_flags |= S_ARCHIVE_VERSION_CACHED;
+	return 0;
+}
+#endif
+
 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
 	if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
@@ -4752,11 +4795,7 @@
 	get_block_t *get_block;
 	int retries = 0;
 
-	/*
-	 * This check is racy but catches the common case. We rely on
-	 * __block_page_mkwrite() to do a reliable check.
-	 */
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+	sb_start_pagefault(inode->i_sb);
 	/* Delalloc case is easy... */
 	if (test_opt(inode->i_sb, DELALLOC) &&
 	    !ext4_should_journal_data(inode) &&
@@ -4824,5 +4863,6 @@
 out_ret:
 	ret = block_page_mkwrite_return(ret);
 out:
+	sb_end_pagefault(inode->i_sb);
 	return ret;
 }
diff -ur a/fs/ext4/Makefile b/fs/ext4/Makefile
--- a/fs/ext4/Makefile	2013-08-03 09:59:51.000000000 +0200
+++ b/fs/ext4/Makefile	2014-01-21 09:37:25.000000000 +0100
@@ -11,5 +11,5 @@
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
-ext4-$(CONFIG_EXT4_FS_SYNO_ACL)		+= synoacl_api.o
+ext4-$(CONFIG_EXT4_FS_SYNO_ACL)		+= syno_acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
diff -ur a/fs/ext4/mmp.c b/fs/ext4/mmp.c
--- a/fs/ext4/mmp.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ext4/mmp.c	2014-02-17 11:57:00.000000000 +0100
@@ -10,14 +10,20 @@
  * Write the MMP block using WRITE_SYNC to try to get the block on-disk
  * faster.
  */
-static int write_mmp_block(struct buffer_head *bh)
+static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
 {
+	/*
+	 * We protect against freezing so that we don't create dirty buffers
+	 * on frozen filesystem.
+	 */
+	sb_start_write(sb);
 	mark_buffer_dirty(bh);
 	lock_buffer(bh);
 	bh->b_end_io = end_buffer_write_sync;
 	get_bh(bh);
 	submit_bh(WRITE_SYNC, bh);
 	wait_on_buffer(bh);
+	sb_end_write(sb);
 	if (unlikely(!buffer_uptodate(bh)))
 		return 1;
 
@@ -122,7 +128,7 @@
 		mmp->mmp_time = cpu_to_le64(get_seconds());
 		last_update_time = jiffies;
 
-		retval = write_mmp_block(bh);
+		retval = write_mmp_block(sb, bh);
 		/*
 		 * Don't spew too many error messages. Print one every
 		 * (s_mmp_update_interval * 60) seconds.
@@ -202,7 +208,7 @@
 	mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
 	mmp->mmp_time = cpu_to_le64(get_seconds());
 
-	retval = write_mmp_block(bh);
+	retval = write_mmp_block(sb, bh);
 
 failed:
 	kfree(data);
@@ -301,7 +307,7 @@
 	seq = mmp_new_seq();
 	mmp->mmp_seq = cpu_to_le32(seq);
 
-	retval = write_mmp_block(bh);
+	retval = write_mmp_block(sb, bh);
 	if (retval)
 		goto failed;
 
diff -ur a/fs/ext4/namei.c b/fs/ext4/namei.c
--- a/fs/ext4/namei.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ext4/namei.c	2014-02-17 11:56:59.000000000 +0100
@@ -45,7 +45,7 @@
 #include "acl.h"
 
 #ifdef CONFIG_EXT4_FS_SYNO_ACL
-#include "synoacl_int.h"
+#include "syno_acl.h"
 #endif
 
 #include <trace/events/ext4.h>
@@ -1960,12 +1960,6 @@
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
 
-#ifdef CONFIG_EXT4_FS_SYNO_ACL
-	if (!err && IS_SYNOACL(dir)) {
-		ext4_mod_init_syno_acl(inode, dentry);
-	}
-#endif
-
 	return err;
 }
 
@@ -2087,11 +2081,6 @@
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
 
-#ifdef CONFIG_EXT4_FS_SYNO_ACL
-	if (!err && IS_SYNOACL(dir)) {
-		ext4_mod_init_syno_acl(inode, dentry);
-	}
-#endif
 	return err;
 }
 
@@ -2527,7 +2516,7 @@
 			err = PTR_ERR(handle);
 			goto err_drop_inode;
 		}
-		inc_nlink(inode);
+		set_nlink(inode, 1);
 		err = ext4_orphan_del(handle, inode);
 		if (err) {
 			ext4_journal_stop(handle);
@@ -2786,13 +2775,7 @@
 	ext4_journal_stop(handle);
 	if (retval == 0 && force_da_alloc)
 		ext4_alloc_da_blocks(old_inode);
-#ifdef CONFIG_EXT4_FS_SYNO_ACL
-	if (0 == retval) {
-		if (IS_SYNOACL(old_dir) != IS_SYNOACL(new_dir)) {
-			ext4_mod_rename_syno_acl(old_inode, new_dir);
-		}
-	}
-#endif
+
 	return retval;
 }
 
@@ -2800,6 +2783,13 @@
  * directories can handle most operations...
  */
 const struct inode_operations ext4_dir_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_ext4_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_ver = syno_ext4_get_archive_ver,
+	.syno_set_archive_ver = syno_ext4_set_archive_ver,
+#endif
 	.create		= ext4_create,
 	.lookup		= ext4_lookup,
 	.link		= ext4_link,
@@ -2815,18 +2805,10 @@
 	.getxattr	= generic_getxattr,
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
-#ifdef MY_ABC_HERE
-	.synosetxattr	= syno_generic_setxattr,
-#endif
 #endif
 #ifdef CONFIG_EXT4_FS_SYNO_ACL
-	.getattr	= ext4_mod_dir_getattr,
-	.syno_acl_get = ext4_mod_get_syno_acl_inherit,
-	.syno_access	= ext4_mod_syno_access,
-	.syno_permission = ext4_mod_syno_permission,
-	.syno_exec_permission = ext4_mod_syno_exec_permission,
-	.syno_permission_get = ext4_mod_get_syno_permission,
-	.syno_inode_change_ok = ext4_mod_syno_inode_change_ok,
+	.syno_acl_get   = ext4_get_syno_acl,
+	.syno_acl_set	= ext4_set_syno_acl,
 #else
 	.get_acl	= ext4_get_acl,
 #endif
@@ -2834,6 +2816,13 @@
 };
 
 const struct inode_operations ext4_special_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_ext4_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_ver = syno_ext4_get_archive_ver,
+	.syno_set_archive_ver = syno_ext4_set_archive_ver,
+#endif
 	.setattr	= ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
@@ -2841,5 +2830,7 @@
 	.listxattr	= ext4_listxattr,
 	.removexattr	= generic_removexattr,
 #endif
+#ifndef CONFIG_EXT4_FS_SYNO_ACL
 	.get_acl	= ext4_get_acl,
+#endif
 };
diff -ur a/fs/ext4/resize.c b/fs/ext4/resize.c
--- a/fs/ext4/resize.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ext4/resize.c	2014-02-17 11:57:00.000000000 +0100
@@ -1759,11 +1759,10 @@
 	int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
 		le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
 	struct inode *inode = NULL;
-	int gdb_off, gdb_num;
+	int gdb_off;
 	int err;
 	__u16 bg_flags = 0;
 
-	gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
 	gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
 
 	if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
diff -ur a/fs/ext4/super.c b/fs/ext4/super.c
--- a/fs/ext4/super.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ext4/super.c	2014-02-17 11:57:00.000000000 +0100
@@ -52,7 +52,7 @@
 #include "mballoc.h"
 
 #ifdef CONFIG_EXT4_FS_SYNO_ACL
-#include <linux/syno_acl_xattr_ds.h>
+#include <linux/syno_acl.h>
 #endif
 
 #define CREATE_TRACE_POINTS
@@ -302,33 +302,17 @@
  * journal_end calls result in the superblock being marked dirty, so
  * that sync() will call the filesystem's write_super callback if
  * appropriate.
- *
- * To avoid j_barrier hold in userspace when a user calls freeze(),
- * ext4 prevents a new handle from being started by s_frozen, which
- * is in an upper layer.
  */
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 {
 	journal_t *journal;
-	handle_t  *handle;
 
 	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);
 
+	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
 	journal = EXT4_SB(sb)->s_journal;
-	handle = ext4_journal_current_handle();
-
-	/*
-	 * If a handle has been started, it should be allowed to
-	 * finish, otherwise deadlock could happen between freeze
-	 * and others(e.g. truncate) due to the restart of the
-	 * journal handle if the filesystem is forzen and active
-	 * handles are not stopped.
-	 */
-	if (!handle)
-		vfs_check_frozen(sb, SB_FREEZE_TRANS);
-
 	if (!journal)
 		return ext4_get_nojournal();
 	/*
@@ -342,9 +326,6 @@
 	}
 	return jbd2_journal_start(journal, nblocks);
 }
-#ifdef CONFIG_EXT4_FS_SYNO_ACL
-EXPORT_SYMBOL(ext4_journal_start_sb);
-#endif
 
 /*
  * The only special thing we need to do here is to make sure that all
@@ -372,9 +353,6 @@
 		__ext4_std_error(sb, where, line, err);
 	return err;
 }
-#ifdef CONFIG_EXT4_FS_SYNO_ACL
-EXPORT_SYMBOL(__ext4_journal_stop);
-#endif
 
 void ext4_journal_abort_handle(const char *caller, unsigned int line,
 			       const char *err_fn, struct buffer_head *bh,
@@ -915,6 +893,11 @@
 	for (i = 0; i < MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
 #endif
+#ifdef MY_ABC_HERE
+	if (sbi->s_mount_path) {
+		kfree(sbi->s_mount_path);
+	}
+#endif
 
 	/* Debugging code just in case the in-memory inode orphan list
 	 * isn't empty.  The on-disk one can be non-empty if we've
@@ -1042,6 +1025,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ext4_inode_cachep);
 }
 
@@ -1167,10 +1155,8 @@
 #endif
 
 #ifdef CONFIG_EXT4_FS_SYNO_ACL
-	if (test_opt(sb, SYNO_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
+	if (test_opt(sb, SYNO_ACL))
 		seq_puts(seq, ","SYNO_ACL_MNT_OPT);
-	if (!test_opt(sb, SYNO_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
-		seq_puts(seq, ","SYNO_ACL_NOT_MNT_OPT);
 #elif defined(CONFIG_EXT4_FS_POSIX_ACL)
 	if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
 		seq_puts(seq, ",acl");
@@ -1319,6 +1305,88 @@
 	return try_to_free_buffers(page);
 }
 
+#ifdef MY_ABC_HERE
+static int syno_ext4_set_sb_archive_ver(struct super_block *sb, u32 archive_ver)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	handle_t *handle;
+	int err = 0;
+	int err2;
+
+	sb->s_archive_version = archive_ver;
+	es->s_archive_version = cpu_to_le32(sb->s_archive_version);
+
+	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+		err = ext4_commit_super(sb, 1);
+		goto exit;
+	}
+	handle = ext4_journal_start_sb(sb, 1);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto exit;
+	}
+	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+	if (err) {
+		goto exit_journal;
+	}
+	err = ext4_handle_dirty_super(handle, sb);
+
+exit_journal:
+	if ((err2 = ext4_journal_stop(handle)) && !err) {
+		err = err2;
+	}
+exit:
+	return err;
+}
+
+static int syno_ext4_get_sb_archive_ver(struct super_block *sb, u32 *version)
+{
+	*version = sb->s_archive_version;
+	return 0;
+}
+
+#ifdef MY_ABC_HERE
+static int syno_ext4_set_sb_archive_ver1(struct super_block *sb, u32 archive_ver1)
+{
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	handle_t *handle;
+	int err = 0;
+	int err2;
+
+	sb->s_archive_version1 = archive_ver1;
+	es->s_archive_version1 = cpu_to_le32(sb->s_archive_version1);
+
+	if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
+		err = ext4_commit_super(sb, 1);
+		goto exit;
+	}
+	handle = ext4_journal_start_sb(sb, 1);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto exit;
+	}
+	err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+	if (err) {
+		goto exit_journal;
+	}
+	err = ext4_handle_dirty_super(handle, sb);
+
+exit_journal:
+	if ((err2 = ext4_journal_stop(handle)) && !err) {
+		err = err2;
+	}
+exit:
+	return err;
+}
+
+static int syno_ext4_get_sb_archive_ver1(struct super_block *sb, u32 *version)
+{
+	*version = sb->s_archive_version1;
+	return 0;
+}
+#endif
+#endif
+
 #ifdef CONFIG_QUOTA
 #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
 #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
@@ -1360,6 +1428,14 @@
 #endif
 
 static const struct super_operations ext4_sops = {
+#ifdef MY_ABC_HERE
+	.syno_set_sb_archive_ver = syno_ext4_set_sb_archive_ver,
+	.syno_get_sb_archive_ver = syno_ext4_get_sb_archive_ver,
+#ifdef MY_ABC_HERE
+	.syno_set_sb_archive_ver1 = syno_ext4_set_sb_archive_ver1,
+	.syno_get_sb_archive_ver1 = syno_ext4_get_sb_archive_ver1,
+#endif
+#endif
 	.alloc_inode	= ext4_alloc_inode,
 	.destroy_inode	= ext4_destroy_inode,
 	.write_inode	= ext4_write_inode,
@@ -1381,6 +1457,14 @@
 };
 
 static const struct super_operations ext4_nojournal_sops = {
+#ifdef MY_ABC_HERE
+	.syno_set_sb_archive_ver = syno_ext4_set_sb_archive_ver,
+	.syno_get_sb_archive_ver = syno_ext4_get_sb_archive_ver,
+#ifdef MY_ABC_HERE
+	.syno_set_sb_archive_ver1 = syno_ext4_set_sb_archive_ver1,
+	.syno_get_sb_archive_ver1 = syno_ext4_get_sb_archive_ver1,
+#endif
+#endif
 	.alloc_inode	= ext4_alloc_inode,
 	.destroy_inode	= ext4_destroy_inode,
 	.write_inode	= ext4_write_inode,
@@ -2683,7 +2767,7 @@
 static ssize_t syno_fs_error_mounted_show(struct ext4_attr *a,
 				       struct ext4_sb_info *sbi, char *buf)
 {
-	return snprintf(buf, PAGE_SIZE, "%s\n", sbi->s_es->s_last_mounted);
+	return snprintf(buf, PAGE_SIZE, "%s\n", sbi->s_mount_path);
 }
 
 static ssize_t syno_fs_error_count_show(struct ext4_attr *a,
@@ -2907,75 +2991,6 @@
 	return 1;
 }
 
-#ifdef CONFIG_EXT4_FS_SYNO_ACL
-static int SYNOACLModuleStatusGet(const char *szModName)
-{
-	int st = -1;
-	struct module *mod = NULL;
-
-	mutex_lock(&module_mutex);
-
-	if (NULL == (mod = find_module(szModName))){
-		goto Err;
-	}
-
-	st = mod->state;
-Err:
-	mutex_unlock(&module_mutex);
-
-	return st;
-}
-
-static void UseACLModule(const char *szModName, int isGet)
-{
-	struct module *mod = NULL;
-
-	mutex_lock(&module_mutex);
-
-	if (NULL == (mod = find_module(szModName))){
-		printk("synoacl module [%s] is not loaded \n", szModName);
-		goto Err;
-	}
-
-	if (isGet) {
-		try_module_get(mod);
-	} else {
-		module_put(mod);
-	}
-Err:
-	mutex_unlock(&module_mutex);
-}
-
-static void SYNOACLModuleGet(const char *szModName)
-{
-	UseACLModule(szModName, 1);
-}
-static void SYNOACLModulePut(const char *szModName)
-{
-	UseACLModule(szModName, 0);
-}
-
-static void SYNOACLFlagSet(struct super_block *psb, unsigned long *ps_flags, unsigned int *ps_mount_opt)
-{
-	if (!psb || !ps_flags || !ps_mount_opt) {
-		return;
-	}
-
-	*ps_flags &= ~MS_SYNOACL;
-	if (*ps_mount_opt & EXT4_MOUNT_SYNO_ACL) {
-		if (MODULE_STATE_LIVE != SYNOACLModuleStatusGet("synoacl_ext4") ||
-			MODULE_STATE_LIVE != SYNOACLModuleStatusGet("synoacl_vfs")) {
-			ext4_msg(psb, KERN_ERR, "synoacl module has not been loaded. Unable to mount with synoacl, vfs_mod status=%d, ext4_mod status=%d", SYNOACLModuleStatusGet("synoacl_vfs"), SYNOACLModuleStatusGet("synoacl_ext4"));
-			*ps_mount_opt &= ~EXT4_MOUNT_SYNO_ACL;
-		} else {
-			*ps_flags |= MS_SYNOACL;
-			SYNOACLModuleGet("synoacl_ext4");
-			SYNOACLModuleGet("synoacl_vfs");
-		}
-	}
-}
-#endif
-
 /*
  * This function is called once a day if we have errors logged
  * on the file system
@@ -3035,6 +3050,7 @@
 	sb = elr->lr_super;
 	ngroups = EXT4_SB(sb)->s_groups_count;
 
+	sb_start_write(sb);
 	for (group = elr->lr_next_group; group < ngroups; group++) {
 		gdp = ext4_get_group_desc(sb, group, NULL);
 		if (!gdp) {
@@ -3065,6 +3081,7 @@
 		elr->lr_next_sched = jiffies + elr->lr_timeout;
 		elr->lr_next_group = group + 1;
 	}
+	sb_end_write(sb);
 
 	return ret;
 }
@@ -3148,8 +3165,7 @@
 		}
 		mutex_unlock(&eli->li_list_mtx);
 
-		if (freezing(current))
-			refrigerator();
+		try_to_freeze();
 
 		cur = jiffies;
 		if ((time_after_eq(cur, next_wakeup)) ||
@@ -3574,10 +3590,7 @@
 #ifdef CONFIG_EXT4_FS_XATTR
 	set_opt(sb, XATTR_USER);
 #endif
-#ifdef CONFIG_EXT4_FS_SYNO_ACL
-	if (def_mount_opts & EXT4_DEFM_ACL)
-		set_opt(sb, SYNO_ACL);
-#elif defined(CONFIG_EXT4_FS_POSIX_ACL)
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
 	set_opt(sb, POSIX_ACL);
 #endif
 	set_opt(sb, MBLK_IO_SUBMIT);
@@ -3654,7 +3667,16 @@
 			clear_opt(sb, DELALLOC);
 	}
 #ifdef CONFIG_EXT4_FS_SYNO_ACL
-	SYNOACLFlagSet(sb, &sb->s_flags, &sbi->s_mount_opt);
+	if (test_opt(sb, SYNO_ACL)) {
+		int st = SYNOACLModuleStatusGet("synoacl_vfs");
+		if (MODULE_STATE_LIVE != st) {
+			ext4_msg(sb, KERN_ERR, "synoacl module has not been loaded. Unable to mount with synoacl, vfs_mod status=%d", st);
+			clear_opt(sb, SYNO_ACL);
+		} else {
+			sb->s_flags |= MS_SYNOACL;
+			SYNOACLModuleGet("synoacl_vfs");
+		}
+	}
 #else
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
 		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -4144,10 +4166,10 @@
 #endif
 #ifdef MY_ABC_HERE
 	sb->s_archive_version = le32_to_cpu(es->s_archive_version);
-#endif
 #ifdef MY_ABC_HERE
 	sb->s_archive_version1 = le32_to_cpu(es->s_archive_version1);
 #endif
+#endif
 	if (ext4_setup_super(sb, es, sb->s_flags & MS_RDONLY))
 		sb->s_flags |= MS_RDONLY;
 
@@ -4602,12 +4624,6 @@
 	else
 		es->s_kbytes_written =
 			cpu_to_le64(EXT4_SB(sb)->s_kbytes_written);
-#ifdef MY_ABC_HERE
-	es->s_archive_version = cpu_to_le32(sb->s_archive_version);
-#endif
-#ifdef MY_ABC_HERE
-	es->s_archive_version1 = cpu_to_le32(sb->s_archive_version1);
-#endif
 	ext4_free_blocks_count_set(es,
 			EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive(
 				&EXT4_SB(sb)->s_freeclusters_counter)));
@@ -4712,10 +4728,8 @@
 		return 0;
 
 	journal = EXT4_SB(sb)->s_journal;
-	if (journal) {
-		vfs_check_frozen(sb, SB_FREEZE_TRANS);
+	if (journal)
 		ret = ext4_journal_force_commit(journal);
-	}
 
 	return ret;
 }
@@ -4747,9 +4761,8 @@
  * gives us a chance to flush the journal completely and mark the fs clean.
  *
  * Note that only this function cannot bring a filesystem to be in a clean
- * state independently, because ext4 prevents a new handle from being started
- * by @sb->s_frozen, which stays in an upper layer.  It thus needs help from
- * the upper layer.
+ * state independently. It relies on upper layer to stop all data & metadata
+ * modifications.
  */
 static int ext4_freeze(struct super_block *sb)
 {
@@ -4776,7 +4789,7 @@
 	EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 	error = ext4_commit_super(sb, 1);
 out:
-	/* we rely on s_frozen to stop further updates */
+	/* we rely on upper layer to stop further updates */
 	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 	return error;
 }
@@ -4860,8 +4873,20 @@
 	if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
 		ext4_abort(sb, "Abort forced by user");
 
-#ifdef CONFIG_EXT4_FS_SYNO_ACL  
-	SYNOACLFlagSet(sb, &sb->s_flags, &sbi->s_mount_opt);
+#ifdef CONFIG_EXT4_FS_SYNO_ACL
+	if ((sb->s_flags & MS_SYNOACL) && !test_opt(sb, SYNO_ACL)) {
+		sb->s_flags = sb->s_flags & ~MS_SYNOACL;
+		SYNOACLModulePut("synoacl_vfs");
+	} else if (!(sb->s_flags & MS_SYNOACL) && test_opt(sb, SYNO_ACL)) {
+		int st = SYNOACLModuleStatusGet("synoacl_vfs");
+		if (MODULE_STATE_LIVE != st) {
+			ext4_msg(sb, KERN_ERR, "synoacl module has not been loaded. Unable to remount with synoacl, vfs_mod status=%d", st);
+			clear_opt(sb, SYNO_ACL);
+		} else {
+			sb->s_flags |= MS_SYNOACL;
+			SYNOACLModuleGet("synoacl_vfs");
+		}
+	}
 #else
 	sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
 		(test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0);
@@ -5339,9 +5364,26 @@
 static void ext4_kill_sb(struct super_block *sb)
 {
 	kill_block_super(sb);
-	SYNOACLModulePut("synoacl_ext4");
-	SYNOACLModulePut("synoacl_vfs");
+
+	if (MS_SYNOACL & sb->s_flags) {
+		SYNOACLModulePut("synoacl_vfs");
+	}
+}
+#endif
+
+#ifdef MY_ABC_HERE
+void ext4_fill_mount_path(struct super_block *sb, char *szPath)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (sbi->s_mount_path) {
+		strncpy(sbi->s_mount_path, szPath, SYNO_EXT4_MOUNT_PATH_LEN);
+	} else {
+		sbi->s_mount_path = kmemdup(szPath, SYNO_EXT4_MOUNT_PATH_LEN, GFP_KERNEL);
+	}
+	sbi->s_mount_path[SYNO_EXT4_MOUNT_PATH_LEN - 1] = '\0';
 }
+EXPORT_SYMBOL(ext4_fill_mount_path);
 #endif
 
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
diff -ur a/fs/ext4/symlink.c b/fs/ext4/symlink.c
--- a/fs/ext4/symlink.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ext4/symlink.c	2014-02-17 11:56:59.000000000 +0100
@@ -31,6 +31,13 @@
 }
 
 const struct inode_operations ext4_symlink_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_ext4_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_ver = syno_ext4_get_archive_ver,
+	.syno_set_archive_ver = syno_ext4_set_archive_ver,
+#endif
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
@@ -44,6 +51,13 @@
 };
 
 const struct inode_operations ext4_fast_symlink_inode_operations = {
+#ifdef MY_ABC_HERE
+	.syno_getattr	= syno_ext4_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_ver = syno_ext4_get_archive_ver,
+	.syno_set_archive_ver = syno_ext4_set_archive_ver,
+#endif
 	.readlink	= generic_readlink,
 	.follow_link	= ext4_follow_link,
 	.setattr	= ext4_setattr,
Only in a/fs/ext4: synoacl_api.c.
Only in b/fs/ext4: syno_acl.c.
Only in b/fs/ext4: syno_acl.h.
Only in a/fs/ext4: synoacl_int.h.
diff -ur a/fs/ext4/xattr.c b/fs/ext4/xattr.c
--- a/fs/ext4/xattr.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ext4/xattr.c	2014-02-17 11:57:00.000000000 +0100
@@ -61,9 +61,6 @@
 #include "xattr.h"
 #include "acl.h"
 
-#ifdef CONFIG_EXT4_FS_SYNO_ACL
-#include <linux/export.h>
-#endif
 
 #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data))
 #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr))
@@ -105,6 +102,7 @@
 	[EXT4_XATTR_INDEX_USER]		     = &ext4_xattr_user_handler,
 #ifdef CONFIG_EXT4_FS_SYNO_ACL
 	[EXT4_XATTR_INDEX_SYNO_ACL_ACCESS]  = &ext4_xattr_synoacl_access_handler,
+	[EXT4_XATTR_INDEX_SYNO_ACL_ACCESS_NOPERM]  = &ext4_xattr_synoacl_noperm_access_handler,
 #elif defined(CONFIG_EXT4_FS_POSIX_ACL)
 	[EXT4_XATTR_INDEX_POSIX_ACL_ACCESS]  = &ext4_xattr_acl_access_handler,
 	[EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler,
@@ -123,6 +121,7 @@
 	&ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_SYNO_ACL
 	&ext4_xattr_synoacl_access_handler,
+	&ext4_xattr_synoacl_noperm_access_handler,
 #elif defined(CONFIG_EXT4_FS_POSIX_ACL)
 	&ext4_xattr_acl_access_handler,
 	&ext4_xattr_acl_default_handler,
@@ -337,9 +336,6 @@
 	up_read(&EXT4_I(inode)->xattr_sem);
 	return error;
 }
-#ifdef CONFIG_EXT4_FS_SYNO_ACL
-EXPORT_SYMBOL(ext4_xattr_get);
-#endif
 
 static int
 ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
@@ -1089,9 +1085,6 @@
 	up_write(&EXT4_I(inode)->xattr_sem);
 	return error;
 }
-#ifdef CONFIG_EXT4_FS_SYNO_ACL
-EXPORT_SYMBOL(ext4_xattr_set_handle);
-#endif
 
 /*
  * ext4_xattr_set()
@@ -1666,23 +1659,11 @@
 			      value, size, flags);
 }
 
-static int ext4_xattr_syno_set_compact(struct inode *inode, const char *name,
-			  const void *value, size_t size, int flags, int handler_flags)
-{
-	if (strcmp(name, "") == 0){
-		return -EINVAL;
-	}
-
-	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_SYNO, name,
-			      value, size, flags);
-}
-
 struct xattr_handler ext4_xattr_syno_handler = {
 	.prefix	= XATTR_SYNO_PREFIX,
 	.list	= ext4_xattr_syno_list,
 	.get	= ext4_xattr_syno_get,
 	.set	= ext4_xattr_syno_set,
-	.set_compact_syno	= ext4_xattr_syno_set_compact,
 };
 
 #endif
diff -ur a/fs/ext4/xattr.h b/fs/ext4/xattr.h
--- a/fs/ext4/xattr.h	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ext4/xattr.h	2014-02-17 11:57:00.000000000 +0100
@@ -27,6 +27,9 @@
 #ifdef MY_ABC_HERE
 #define EXT4_XATTR_INDEX_SYNO	8
 #endif
+#ifdef CONFIG_EXT4_FS_SYNO_ACL
+#define EXT4_XATTR_INDEX_SYNO_ACL_ACCESS_NOPERM	9
+#endif
 
 struct ext4_xattr_header {
 	__le32	h_magic;	/* magic number for identification */
@@ -72,7 +75,8 @@
 # ifdef CONFIG_EXT4_FS_XATTR
 
 #ifdef CONFIG_EXT4_FS_SYNO_ACL
-extern struct xattr_handler ext4_xattr_synoacl_access_handler;
+extern const struct xattr_handler ext4_xattr_synoacl_access_handler;
+extern const struct xattr_handler ext4_xattr_synoacl_noperm_access_handler;
 #endif
 #ifdef MY_ABC_HERE
 extern struct xattr_handler ext4_xattr_syno_handler;
diff -ur a/fs/fat/inode.c b/fs/fat/inode.c
--- a/fs/fat/inode.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/fat/inode.c	2014-02-17 11:56:58.000000000 +0100
@@ -564,6 +564,11 @@
 
 static void __exit fat_destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(fat_inode_cachep);
 }
 
diff -ur a/fs/fcntl.c b/fs/fcntl.c
--- a/fs/fcntl.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/fcntl.c	2014-02-17 11:56:55.000000000 +0100
@@ -33,49 +33,74 @@
 #define ACL_MASK_NONE 0
 
 struct syno_archive_map {
-	unsigned int cmd; 	//set archive command
 	unsigned int sAr;	//syno archive
 	int isSetCmd;
+};
+
+static struct syno_archive_map rgSynoAr[] = {
+	{S2_IARCHIVE, 0},               /* F_CLEAR_ARCHIVE */
+	{S2_SMB_ARCHIVE, 1},            /* F_SETSMB_ARCHIVE */
+	{S2_SMB_HIDDEN, 1},             /* F_SETSMB_HIDDEN */
+	{S2_SMB_SYSTEM, 1},             /* F_SETSMB_SYSTEM */
+	{S2_SMB_ARCHIVE, 0},            /* F_CLRSMB_ARCHIVE */
+	{S2_SMB_HIDDEN, 0},             /* F_CLRSMB_HIDDEN */
+	{S2_SMB_SYSTEM, 0},             /* F_CLRSMB_SYSTEM */
+	{S3_IARCHIVE, 0},               /* F_CLEAR_S3_ARCHIVE */
 #ifdef CONFIG_FS_SYNO_ACL
-	int tag;
-	int mask;
+	{S2_SMB_READONLY, 0},           /* F_CLRSMB_READONLY */
+	{S2_SMB_READONLY, 1},           /* F_SETSMB_READONLY */
+	{S2_SYNO_ACL_INHERIT, 0},       /* F_CLRACL_INHERIT */
+	{S2_SYNO_ACL_INHERIT, 1},       /* F_SETACL_INHERIT */
+	{S2_SYNO_ACL_EXIST, 0},         /* F_CLRACL_HAS_ACL */
+	{S2_SYNO_ACL_EXIST, 1},         /* F_SETACL_HAS_ACL */
+	{S2_SYNO_ACL_SUPPORT, 0},       /* F_CLRACL_SUPPORT */
+	{S2_SYNO_ACL_SUPPORT, 1},       /* F_SETACL_SUPPORT */
+	{S2_SYNO_ACL_IS_OWNER_GROUP, 0},/* F_CLRACL_OWNER_IS_GROUP */
+	{S2_SYNO_ACL_IS_OWNER_GROUP, 1},/* F_SETACL_OWNER_IS_GROUP */
 #endif
 };
-static struct syno_archive_map rgSynoAr[] = {
+
 #ifdef CONFIG_FS_SYNO_ACL
-	/* General archive */
-	{F_CLEAR_ARCHIVE, S2_IARCHIVE, 0, PROTECT_BY_ACL, MAY_WRITE_ATTR},
-	{F_CLEAR_S3_ARCHIVE, S3_IARCHIVE, 0, PROTECT_BY_ACL, MAY_WRITE_ATTR},
-	{F_SETSMB_ARCHIVE, S2_SMB_ARCHIVE, 1, PROTECT_BY_ACL, MAY_WRITE_ATTR},
-	{F_CLRSMB_ARCHIVE, S2_SMB_ARCHIVE, 0, PROTECT_BY_ACL, MAY_WRITE_ATTR},
-	{F_SETSMB_HIDDEN, S2_SMB_HIDDEN, 1, PROTECT_BY_ACL, MAY_WRITE_ATTR},
-	{F_CLRSMB_HIDDEN, S2_SMB_HIDDEN, 0, PROTECT_BY_ACL, MAY_WRITE_ATTR},
-	{F_SETSMB_SYSTEM, S2_SMB_SYSTEM, 1, PROTECT_BY_ACL, MAY_WRITE_ATTR},
-	{F_CLRSMB_SYSTEM, S2_SMB_SYSTEM, 0, PROTECT_BY_ACL, MAY_WRITE_ATTR},
+const int rgSynoArAclTag[] = {
+	PROTECT_BY_ACL,                 /* F_CLEAR_ARCHIVE */
+	PROTECT_BY_ACL,                 /* F_SETSMB_ARCHIVE */
+	PROTECT_BY_ACL,                 /* F_SETSMB_HIDDEN */
+	PROTECT_BY_ACL,                 /* F_SETSMB_SYSTEM */
+	PROTECT_BY_ACL,                 /* F_CLRSMB_ARCHIVE */
+	PROTECT_BY_ACL,                 /* F_CLRSMB_HIDDEN */
+	PROTECT_BY_ACL,                 /* F_CLRSMB_SYSTEM */
+	PROTECT_BY_ACL,                 /* F_CLEAR_S3_ARCHIVE */
+	PROTECT_BY_ACL | NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT, /* F_CLRSMB_READONLY */
+	PROTECT_BY_ACL | NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT, /* F_SETSMB_READONLY */
+	PROTECT_BY_ACL | NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT, /* F_CLRACL_INHERIT */
+	PROTECT_BY_ACL | NEED_FS_ACL_SUPPORT,                          /* F_SETACL_INHERIT */
+	NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT,                  /* F_CLRACL_HAS_ACL */
+	NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT,                  /* F_SETACL_HAS_ACL */
+	NEED_FS_ACL_SUPPORT,                                           /* F_CLRACL_SUPPORT */
+	NEED_FS_ACL_SUPPORT,                                           /* F_SETACL_SUPPORT */
+	PROTECT_BY_ACL | NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT, /* F_CLRACL_OWNER_IS_GROUP */
+	PROTECT_BY_ACL | NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT, /* F_SETACL_OWNER_IS_GROUP */
+};
 
-	/* ACL archive */
-	{F_SETSMB_READONLY, S2_SMB_READONLY, 1, PROTECT_BY_ACL | NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT, MAY_WRITE_ATTR},
-	{F_CLRSMB_READONLY, S2_SMB_READONLY, 0, PROTECT_BY_ACL | NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT, MAY_WRITE_ATTR},
-	{F_SETACL_OWNER_IS_GROUP, S2_SYNO_ACL_IS_OWNER_GROUP, 1, PROTECT_BY_ACL | NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT, MAY_GET_OWNER_SHIP},
-	{F_CLRACL_OWNER_IS_GROUP, S2_SYNO_ACL_IS_OWNER_GROUP, 0, PROTECT_BY_ACL | NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT, MAY_GET_OWNER_SHIP},
-	{F_SETACL_INHERIT, S2_SYNO_ACL_INHERIT, 1, PROTECT_BY_ACL | NEED_FS_ACL_SUPPORT, MAY_WRITE_PERMISSION},
-	{F_CLRACL_INHERIT, S2_SYNO_ACL_INHERIT, 0, PROTECT_BY_ACL | NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT, MAY_WRITE_PERMISSION},
-	{F_SETACL_HAS_ACL, S2_SYNO_ACL_EXIST, 1, NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT, ACL_MASK_NONE},
-	{F_CLRACL_HAS_ACL, S2_SYNO_ACL_EXIST, 0, NEED_INODE_ACL_SUPPORT | NEED_FS_ACL_SUPPORT, ACL_MASK_NONE},
-	{F_SETACL_SUPPORT, S2_SYNO_ACL_SUPPORT, 1, NEED_FS_ACL_SUPPORT, MAY_WRITE_PERMISSION},
-	{F_CLRACL_SUPPORT, S2_SYNO_ACL_SUPPORT, 0, NEED_FS_ACL_SUPPORT, MAY_WRITE_PERMISSION},
-	{0, 0, -1, -1, -1}
-#else //CONFIG_FS_SYNO_ACL
-	{F_CLEAR_ARCHIVE, S2_IARCHIVE, 0},
-	{F_CLEAR_S3_ARCHIVE, S3_IARCHIVE, 0},
-	{F_SETSMB_ARCHIVE, S2_SMB_ARCHIVE, 1},
-	{F_CLRSMB_ARCHIVE, S2_SMB_ARCHIVE, 0},
-	{F_SETSMB_HIDDEN, S2_SMB_HIDDEN, 1},
-	{F_CLRSMB_HIDDEN, S2_SMB_HIDDEN, 0},
-	{F_SETSMB_SYSTEM, S2_SMB_SYSTEM, 1},
-	{F_CLRSMB_SYSTEM, S2_SMB_SYSTEM, 0},
-	{0, 0, -1}
-#endif //CONFIG_FS_SYNO_ACL
+const int rgSynoArAclMask[] = {
+	MAY_WRITE_ATTR,       /* F_CLEAR_ARCHIVE */
+	MAY_WRITE_ATTR,       /* F_SETSMB_ARCHIVE */
+	MAY_WRITE_ATTR,       /* F_SETSMB_HIDDEN */
+	MAY_WRITE_ATTR,       /* F_SETSMB_SYSTEM */
+	MAY_WRITE_ATTR,       /* F_CLRSMB_ARCHIVE */
+	MAY_WRITE_ATTR,       /* F_CLRSMB_HIDDEN */
+	MAY_WRITE_ATTR,       /* F_CLRSMB_SYSTEM */
+	MAY_WRITE_ATTR,       /* F_CLEAR_S3_ARCHIVE */
+	MAY_WRITE_ATTR,       /* F_CLRSMB_READONLY */
+	MAY_WRITE_ATTR,       /* F_SETSMB_READONLY */
+	MAY_WRITE_PERMISSION, /* F_CLRACL_INHERIT */
+	MAY_WRITE_PERMISSION, /* F_SETACL_INHERIT */
+	ACL_MASK_NONE,        /* F_CLRACL_HAS_ACL */
+	ACL_MASK_NONE,        /* F_SETACL_HAS_ACL */
+	ACL_MASK_NONE,        /* F_CLRACL_SUPPORT */
+	ACL_MASK_NONE,        /* F_SETACL_SUPPORT */
+	MAY_GET_OWNER_SHIP,   /* F_CLRACL_OWNER_IS_GROUP */
+	MAY_GET_OWNER_SHIP,   /* F_SETACL_OWNER_IS_GROUP */
 };
 
 struct syno_archive_permission_mapping {
@@ -83,7 +108,6 @@
 	int permission;
 };
 static struct syno_archive_permission_mapping rgSynoArPermission[] = {
-#ifdef CONFIG_FS_SYNO_ACL
 	/* General archive */
 	{S2_IARCHIVE, MAY_WRITE_ATTR},
 	{S2_SMB_ARCHIVE, MAY_WRITE_ATTR},
@@ -96,111 +120,111 @@
 	{S2_SYNO_ACL_INHERIT, MAY_WRITE_PERMISSION},
 	{S2_SYNO_ACL_EXIST, MAY_WRITE_PERMISSION},
 	{S2_SYNO_ACL_SUPPORT, MAY_WRITE_PERMISSION},
-#endif //CONFIG_FS_SYNO_ACL
 	{0, -1}
 };
+#endif //CONFIG_FS_SYNO_ACL
 
 #ifdef MY_ABC_HERE
 long __SYNOArchiveOverwrite(struct dentry *dentry, unsigned int flags)
 {
 	struct inode *inode = dentry->d_inode;
-	int markInodeDirty = 0;
+	int err = 0;
+	u32 archive_bit;
+#ifdef CONFIG_FS_SYNO_ACL
 	int permissionCheck = 0;
 	int i = 0;
+#endif
+	mutex_lock(&inode->i_syno_mutex);
+	err = syno_op_get_archive_bit(dentry, &archive_bit);
+	if (err)
+		goto unlock;
 
 #ifdef CONFIG_FS_SYNO_ACL
-	if (IS_FS_SYNOACL(inode) && IS_INODE_SYNOACL(inode)) {
+	if (IS_SYNOACL(dentry)) {
 		for (i = 0; -1 != rgSynoArPermission[i].permission; i++) {
-			if ((inode->i_mode2 & rgSynoArPermission[i].sAr) == (flags & rgSynoArPermission[i].sAr)) {
+			if ((archive_bit & rgSynoArPermission[i].sAr) == (flags & rgSynoArPermission[i].sAr)) {
 				continue;
 			}
 			permissionCheck |= rgSynoArPermission[i].permission;
 		}
-		if (inode->i_op && inode->i_op->syno_permission(dentry, permissionCheck)) {
-			return -EPERM;
+		err = synoacl_op_perm(dentry, permissionCheck);
+		if (err) {
+			goto unlock;
 		}
 	} else {
 		if (!inode_owner_or_capable(inode)) {
-			return -EPERM;
+			err = -EPERM;
+			goto unlock;
 		}
 	}
 	if (ALL_SYNO_ACL_ARCHIVE & flags) {
 		if (!IS_FS_SYNOACL(inode)) {
-			return -EOPNOTSUPP;
+			err = -EOPNOTSUPP;
+			goto unlock;
 		}
 		// S2_SYNO_ACL_SUPPORT should be set if you want to set ACL archive bit.
 		if (!(S2_SYNO_ACL_SUPPORT & flags)) {
-			return -EINVAL;
+			err = -EINVAL;
+			goto unlock;
 		}
 	}
 #else
 	if (!inode_owner_or_capable(inode)) {
-		return -EPERM;
+		err = -EPERM;
+		goto unlock;
 	}
 #endif
 	if ((~ALL_ARCHIVE_BIT) & flags) {
-		return -EINVAL;
-	}
-	mutex_lock(&inode->i_syno_mutex);
-	if (flags != inode->i_mode2) {
-		inode->i_mode2 = flags;
-		markInodeDirty = 1;
-	}
-	mutex_unlock(&inode->i_syno_mutex);
-	if (markInodeDirty) {
-		mark_inode_dirty_sync(inode);
+		err = -EINVAL;
+		goto unlock;
 	}
 
-	return 0;
+	if (flags == archive_bit)
+		goto unlock;
+
+	err = syno_op_set_archive_bit_nolock(dentry, flags);
+unlock:
+	mutex_unlock(&inode->i_syno_mutex);
+	return err;
 }
 EXPORT_SYMBOL(__SYNOArchiveOverwrite);
 #endif
 
 long __SYNOArchiveSet(struct dentry *dentry, unsigned int cmd)
 {
-	int i = 0;
-	int found = 0;
+	int i = cmd - SYNO_FCNTL_BASE;
 	struct inode *inode = dentry->d_inode;
-	long err = -EINVAL;
+	long err;
+	u32 archive_bit;
 
-	for (i = 0; -1 != rgSynoAr[i].isSetCmd; i++) {
-		if (cmd == rgSynoAr[i].cmd) {
-			if ((rgSynoAr[i].isSetCmd == ((inode->i_mode2 & rgSynoAr[i].sAr)?1:0))){
-				found = 1;
-				break;
-			}
+	mutex_lock(&inode->i_syno_mutex);
+	err = syno_op_get_archive_bit(dentry, &archive_bit);
+	if (err)
+		goto unlock;
+
+	if ((rgSynoAr[i].isSetCmd == ((archive_bit & rgSynoAr[i].sAr)?1:0))){
+		err = 0;
+		goto unlock;
+	}
 #ifdef CONFIG_FS_SYNO_ACL
-			if (0 > (err = synoacl_mod_archive_change_ok(dentry, cmd, rgSynoAr[i].tag, rgSynoAr[i].mask))){
-				goto Err;
-			}
+	if (0 > (err = synoacl_op_arbit_chg_ok(dentry, cmd, rgSynoArAclTag[i], rgSynoArAclMask[i]))){
+		goto unlock;
+	}
 #endif //CONFIG_FS_SYNO_ACL
-			mutex_lock(&inode->i_syno_mutex);
-			if (rgSynoAr[i].isSetCmd) {
-				inode->i_mode2 |= rgSynoAr[i].sAr;
+	if (rgSynoAr[i].isSetCmd) {
+		archive_bit |= rgSynoAr[i].sAr;
 #ifdef CONFIG_FS_SYNO_ACL
-				if (S2_SYNO_ACL_INHERIT == rgSynoAr[i].sAr) {
-					inode->i_mode2 |= S2_SYNO_ACL_SUPPORT;
-				}
-#endif //CONFIG_FS_SYNO_ACL
-			} else {
-				inode->i_mode2 &= ~rgSynoAr[i].sAr;
-			}
-			mutex_unlock(&inode->i_syno_mutex);
-			mark_inode_dirty_sync(inode);
-
-			found = 1;
-			break;
+		if (S2_SYNO_ACL_INHERIT == rgSynoAr[i].sAr) {
+			archive_bit |= S2_SYNO_ACL_SUPPORT;
 		}
-	}
-	if (!found) {
-		printk("Archive bit cmd:%x not implement.\n", cmd);
-		err = -EINVAL;
+#endif //CONFIG_FS_SYNO_ACL
 	} else {
-		err = 0;
+		archive_bit &= ~rgSynoAr[i].sAr;
 	}
-#ifdef CONFIG_FS_SYNO_ACL
-Err:
-#endif
+
+	err = syno_op_set_archive_bit_nolock(dentry, archive_bit);
+unlock:
+	mutex_unlock(&inode->i_syno_mutex);
 	return err;
 }
 EXPORT_SYMBOL(__SYNOArchiveSet);
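
Note: the rewritten __SYNOArchiveSet() above drops the linear scan over a {cmd, bit} table and instead indexes parallel per-command arrays with cmd - SYNO_FCNTL_BASE. That only works if every array (rgSynoAr, rgSynoArAclTag, rgSynoArAclMask) is laid out in exactly command-number order; in this patch the case range added to do_fcntl() further down already guarantees cmd lies inside the window, so the helper does not re-check it. A minimal sketch of that contract, with illustrative command names and tables rather than the real Synology ones:

#include <linux/errno.h>

/* Sketch: direct-indexed command tables; ordering must mirror the numbering. */
enum {
	EX_CMD_BASE = 0x9000,
	EX_CMD_SET_A = EX_CMD_BASE,
	EX_CMD_CLR_A,
	EX_CMD_LAST = EX_CMD_CLR_A,
};

static const unsigned int ex_bit[]    = { 0x1, 0x1 };	/* SET_A, CLR_A */
static const int          ex_is_set[] = { 1,   0   };

static int ex_apply(unsigned int cmd, unsigned int *flags)
{
	int i;

	if (cmd < EX_CMD_BASE || cmd > EX_CMD_LAST)
		return -EINVAL;
	i = cmd - EX_CMD_BASE;		/* valid index by construction */

	if (ex_is_set[i])
		*flags |= ex_bit[i];
	else
		*flags &= ~ex_bit[i];
	return 0;
}
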
@@ -604,17 +628,16 @@
 	case F_GETPIPE_SZ:
 		err = pipe_fcntl(filp, cmd, arg);
 		break;
-	default:
 #ifdef MY_ABC_HERE
-		{
-			const struct inode_operations *i_op = filp->f_dentry->d_inode->i_op;
-			if (i_op && i_op->set_archive) {
-				err = i_op->set_archive(filp->f_dentry, cmd);
-			} else {
-				err = __SYNOArchiveSet(filp->f_dentry, cmd);
-			}
-		}
-#endif //MY_ABC_HERE
+	case SYNO_FCNTL_BASE ... SYNO_FCNTL_LAST:
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			break;
+		err = __SYNOArchiveSet(filp->f_dentry, cmd);
+		mnt_drop_write(filp->f_path.mnt);
+		break;
+#endif
+	default:
 		break;
 	}
 	return err;
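
Note: the fcntl.c hunk above moves the Synology archive-bit commands out of the catch-all default: branch into an explicit case range, and unlike the replaced code it takes write access on the vfsmount before touching inode metadata. A reduced sketch of that dispatch shape; the command window and helper are placeholders, not the real Synology symbols:

#include <linux/fs.h>
#include <linux/mount.h>

#define EX_FCNTL_FIRST	0x8000		/* illustrative command window */
#define EX_FCNTL_LAST	(EX_FCNTL_FIRST + 16)

static long ex_archive_cmd(struct dentry *dentry, unsigned int cmd);

static long ex_do_fcntl(struct file *filp, unsigned int cmd)
{
	long err = 0;

	switch (cmd) {
	case EX_FCNTL_FIRST ... EX_FCNTL_LAST:	/* GCC case-range extension */
		/* pin write access on the mount across the metadata update */
		err = mnt_want_write(filp->f_path.mnt);
		if (err)
			break;
		err = ex_archive_cmd(filp->f_path.dentry, cmd);
		mnt_drop_write(filp->f_path.mnt);
		break;
	default:
		break;
	}
	return err;
}

Calling mnt_want_write() also makes the operation respect read-only and frozen mounts, which the replaced default: handler never did.
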
diff -ur a/fs/file.c b/fs/file.c
--- a/fs/file.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/file.c	2014-02-17 11:57:01.000000000 +0100
@@ -268,6 +268,9 @@
 	/* All good, so we try */
 	return expand_fdtable(files, nr);
 }
+#if defined(CONFIG_SYNO_COMCERTO)
+EXPORT_SYMBOL_GPL(expand_files);
+#endif
 
 static int count_open_files(struct fdtable *fdt)
 {
diff -ur a/fs/file_table.c b/fs/file_table.c
--- a/fs/file_table.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/file_table.c	2014-02-17 11:56:58.000000000 +0100
@@ -216,7 +216,7 @@
 		return;
 	if (file_check_writeable(file) != 0)
 		return;
-	mnt_drop_write(mnt);
+	__mnt_drop_write(mnt);
 	file_release_write(file);
 }
 EXPORT_SYMBOL_GPL(drop_file_write_access);
@@ -354,6 +354,9 @@
 
 	return file;
 }
+#ifdef MY_ABC_HERE
+EXPORT_SYMBOL(fget_light);
+#endif
 
 struct file *fget_raw_light(unsigned int fd, int *fput_needed)
 {
diff -ur a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
--- a/fs/freevxfs/vxfs_inode.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/freevxfs/vxfs_inode.c	2014-02-17 11:56:58.000000000 +0100
@@ -187,10 +187,10 @@
  *  vxfs_transmod returns a Linux mode_t for a given
  *  VxFS inode structure.
  */
-static __inline__ mode_t
+static __inline__ umode_t
 vxfs_transmod(struct vxfs_inode_info *vip)
 {
-	mode_t			ret = vip->vii_mode & ~VXFS_TYPE_MASK;
+	umode_t			ret = vip->vii_mode & ~VXFS_TYPE_MASK;
 
 	if (VXFS_ISFIFO(vip))
 		ret |= S_IFIFO;
diff -ur a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
--- a/fs/freevxfs/vxfs_super.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/freevxfs/vxfs_super.c	2014-02-17 11:56:58.000000000 +0100
@@ -280,6 +280,11 @@
 vxfs_cleanup(void)
 {
 	unregister_filesystem(&vxfs_fs_type);
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(vxfs_inode_cachep);
 }
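
Note: the one-line vxfs change above is the standard fix for filesystems that free inodes through call_rcu(): the inode cache must not be destroyed while delayed RCU frees are still queued, so the exit path waits for them first. The general pattern, with an illustrative cache name:

#include <linux/slab.h>
#include <linux/rcupdate.h>

static struct kmem_cache *ex_inode_cachep;	/* illustrative */

static void ex_fs_cleanup(void)
{
	/*
	 * Inodes are freed via call_rcu(); wait for every pending RCU
	 * callback so none of them touches the cache after it is gone.
	 */
	rcu_barrier();
	kmem_cache_destroy(ex_inode_cachep);
}
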
 
diff -ur a/fs/fuse/control.c b/fs/fuse/control.c
--- a/fs/fuse/control.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/fuse/control.c	2014-02-17 11:57:00.000000000 +0100
@@ -75,19 +75,13 @@
 				     unsigned global_limit)
 {
 	unsigned long t;
-	char tmp[32];
 	unsigned limit = (1 << 16) - 1;
 	int err;
 
-	if (*ppos || count >= sizeof(tmp) - 1)
+	if (*ppos)
 		return -EINVAL;
 
-	if (copy_from_user(tmp, buf, count))
-		return -EINVAL;
-
-	tmp[count] = '\0';
-
-	err = strict_strtoul(tmp, 0, &t);
+	err = kstrtoul_from_user(buf, count, 0, &t);
 	if (err)
 		return err;
 
@@ -123,7 +117,7 @@
 					      const char __user *buf,
 					      size_t count, loff_t *ppos)
 {
-	unsigned val;
+	unsigned uninitialized_var(val);
 	ssize_t ret;
 
 	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -160,7 +154,7 @@
 						    const char __user *buf,
 						    size_t count, loff_t *ppos)
 {
-	unsigned val;
+	unsigned uninitialized_var(val);
 	ssize_t ret;
 
 	ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
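
Note: the fuse control-file hunks above replace a stack buffer, copy_from_user() and strict_strtoul() with a single kstrtoul_from_user() call, which bounds, copies and parses the user string itself. A minimal sketch of a write handler built on it (the surrounding handler is illustrative):

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/uaccess.h>

static ssize_t ex_limit_write(const char __user *buf, size_t count,
			      loff_t *ppos, unsigned long *limit)
{
	unsigned long t;
	int err;

	if (*ppos)			/* accept only whole writes at offset 0 */
		return -EINVAL;

	/* copy from user space and parse; base 0 auto-detects 0x/0 prefixes */
	err = kstrtoul_from_user(buf, count, 0, &t);
	if (err)
		return err;

	*limit = t;
	return count;
}
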
diff -ur a/fs/fuse/cuse.c b/fs/fuse/cuse.c
--- a/fs/fuse/cuse.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/fuse/cuse.c	2014-02-17 11:57:00.000000000 +0100
@@ -45,7 +45,6 @@
 #include <linux/miscdevice.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/spinlock.h>
 #include <linux/stat.h>
 #include <linux/module.h>
 
@@ -63,7 +62,7 @@
 	bool			unrestricted_ioctl;
 };
 
-static DEFINE_SPINLOCK(cuse_lock);		/* protects cuse_conntbl */
+static DEFINE_MUTEX(cuse_lock);		/* protects registration */
 static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];
 static struct class *cuse_class;
 
@@ -92,19 +91,24 @@
 			 loff_t *ppos)
 {
 	loff_t pos = 0;
+	struct iovec iov = { .iov_base = buf, .iov_len = count };
+	struct fuse_io_priv io = { .async = 0, .file = file };
 
-	return fuse_direct_io(file, buf, count, &pos, 0);
+	return fuse_direct_io(&io, &iov, 1, count, &pos, 0);
 }
 
 static ssize_t cuse_write(struct file *file, const char __user *buf,
 			  size_t count, loff_t *ppos)
 {
 	loff_t pos = 0;
+	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
+	struct fuse_io_priv io = { .async = 0, .file = file };
+
 	/*
 	 * No locking or generic_write_checks(), the server is
 	 * responsible for locking and sanity checks.
 	 */
-	return fuse_direct_io(file, buf, count, &pos, 1);
+	return fuse_direct_io(&io, &iov, 1, count, &pos, 1);
 }
 
 static int cuse_open(struct inode *inode, struct file *file)
@@ -114,14 +118,14 @@
 	int rc;
 
 	/* look up and get the connection */
-	spin_lock(&cuse_lock);
+	mutex_lock(&cuse_lock);
 	list_for_each_entry(pos, cuse_conntbl_head(devt), list)
 		if (pos->dev->devt == devt) {
 			fuse_conn_get(&pos->fc);
 			cc = pos;
 			break;
 		}
-	spin_unlock(&cuse_lock);
+	mutex_unlock(&cuse_lock);
 
 	/* dead? */
 	if (!cc)
@@ -267,7 +271,7 @@
 static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
 {
 	char *end = p + len;
-	char *key, *val;
+	char *uninitialized_var(key), *uninitialized_var(val);
 	int rc;
 
 	while (true) {
@@ -305,14 +309,14 @@
  */
 static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 {
-	struct cuse_conn *cc = fc_to_cc(fc);
+	struct cuse_conn *cc = fc_to_cc(fc), *pos;
 	struct cuse_init_out *arg = req->out.args[0].value;
 	struct page *page = req->pages[0];
 	struct cuse_devinfo devinfo = { };
 	struct device *dev;
 	struct cdev *cdev;
 	dev_t devt;
-	int rc;
+	int rc, i;
 
 	if (req->out.h.error ||
 	    arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
@@ -356,15 +360,24 @@
 	dev_set_drvdata(dev, cc);
 	dev_set_name(dev, "%s", devinfo.name);
 
+	mutex_lock(&cuse_lock);
+
+	/* make sure the device-name is unique */
+	for (i = 0; i < CUSE_CONNTBL_LEN; ++i) {
+		list_for_each_entry(pos, &cuse_conntbl[i], list)
+			if (!strcmp(dev_name(pos->dev), dev_name(dev)))
+				goto err_unlock;
+	}
+
 	rc = device_add(dev);
 	if (rc)
-		goto err_device;
+		goto err_unlock;
 
 	/* register cdev */
 	rc = -ENOMEM;
 	cdev = cdev_alloc();
 	if (!cdev)
-		goto err_device;
+		goto err_unlock;
 
 	cdev->owner = THIS_MODULE;
 	cdev->ops = &cuse_frontend_fops;
@@ -377,9 +390,8 @@
 	cc->cdev = cdev;
 
 	/* make the device available */
-	spin_lock(&cuse_lock);
 	list_add(&cc->list, cuse_conntbl_head(devt));
-	spin_unlock(&cuse_lock);
+	mutex_unlock(&cuse_lock);
 
 	/* announce device availability */
 	dev_set_uevent_suppress(dev, 0);
@@ -391,12 +403,13 @@
 
 err_cdev:
 	cdev_del(cdev);
-err_device:
+err_unlock:
+	mutex_unlock(&cuse_lock);
 	put_device(dev);
 err_region:
 	unregister_chrdev_region(devt, 1);
 err:
-	fc->conn_error = 1;
+	fuse_conn_kill(fc);
 	goto out;
 }
 
@@ -411,7 +424,7 @@
 
 	BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_for_background(fc, 1);
 	if (IS_ERR(req)) {
 		rc = PTR_ERR(req);
 		goto err;
@@ -492,7 +505,7 @@
 	cc->fc.release = cuse_fc_release;
 
 	cc->fc.connected = 1;
-	cc->fc.blocked = 0;
+	cc->fc.initialized = 1;
 	rc = cuse_send_init(cc);
 	if (rc) {
 		fuse_conn_put(&cc->fc);
@@ -520,9 +533,9 @@
 	int rc;
 
 	/* remove from the conntbl, no more access from this point on */
-	spin_lock(&cuse_lock);
+	mutex_lock(&cuse_lock);
 	list_del_init(&cc->list);
-	spin_unlock(&cuse_lock);
+	mutex_unlock(&cuse_lock);
 
 	/* remove device */
 	if (cc->dev)
@@ -532,8 +545,6 @@
 		cdev_del(cc->cdev);
 	}
 
-	/* kill connection and shutdown channel */
-	fuse_conn_kill(&cc->fc);
 	rc = fuse_dev_release(inode, file);	/* puts the base reference */
 
 	return rc;
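
Note: the cuse.c changes above turn cuse_lock into a mutex so the whole registration path (name-uniqueness scan, device_add(), cdev setup, list insertion) can run under one lock even though parts of it may sleep. A condensed sketch of the check-then-register shape; the types and helpers are stand-ins, not the real cuse structures:

#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/string.h>
#include <linux/errno.h>

struct ex_conn {			/* stand-in for struct cuse_conn */
	struct list_head list;
	const char *name;
};

static DEFINE_MUTEX(ex_lock);		/* serializes registration and the table */
static LIST_HEAD(ex_table);

static int ex_register(struct ex_conn *cc)
{
	struct ex_conn *pos;
	int err = 0;

	mutex_lock(&ex_lock);

	/* reject duplicate device names while the table cannot change */
	list_for_each_entry(pos, &ex_table, list) {
		if (!strcmp(pos->name, cc->name)) {
			err = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * device_add()/cdev_add() would run here, still under the mutex --
	 * which is why the lock had to become a sleeping mutex.
	 */
	list_add(&cc->list, &ex_table);
out_unlock:
	mutex_unlock(&ex_lock);
	return err;
}
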
diff -ur a/fs/fuse/dev.c b/fs/fuse/dev.c
--- a/fs/fuse/dev.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/fuse/dev.c	2014-02-17 11:57:00.000000000 +0100
@@ -34,34 +34,67 @@
 	return file->private_data;
 }
 
-static void fuse_request_init(struct fuse_req *req)
+static void fuse_request_init(struct fuse_req *req, struct page **pages,
+			      struct fuse_page_desc *page_descs,
+			      unsigned npages)
 {
 	memset(req, 0, sizeof(*req));
+	memset(pages, 0, sizeof(*pages) * npages);
+	memset(page_descs, 0, sizeof(*page_descs) * npages);
 	INIT_LIST_HEAD(&req->list);
 	INIT_LIST_HEAD(&req->intr_entry);
 	init_waitqueue_head(&req->waitq);
 	atomic_set(&req->count, 1);
+	req->pages = pages;
+	req->page_descs = page_descs;
+	req->max_pages = npages;
 }
 
-struct fuse_req *fuse_request_alloc(void)
+static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
 {
-	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL);
-	if (req)
-		fuse_request_init(req);
+	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags);
+	if (req) {
+		struct page **pages;
+		struct fuse_page_desc *page_descs;
+
+		if (npages <= FUSE_REQ_INLINE_PAGES) {
+			pages = req->inline_pages;
+			page_descs = req->inline_page_descs;
+		} else {
+			pages = kmalloc(sizeof(struct page *) * npages, flags);
+			page_descs = kmalloc(sizeof(struct fuse_page_desc) *
+					     npages, flags);
+		}
+
+		if (!pages || !page_descs) {
+			kfree(pages);
+			kfree(page_descs);
+			kmem_cache_free(fuse_req_cachep, req);
+			return NULL;
+		}
+
+		fuse_request_init(req, pages, page_descs, npages);
+	}
 	return req;
 }
+
+struct fuse_req *fuse_request_alloc(unsigned npages)
+{
+	return __fuse_request_alloc(npages, GFP_KERNEL);
+}
 EXPORT_SYMBOL_GPL(fuse_request_alloc);
 
-struct fuse_req *fuse_request_alloc_nofs(void)
+struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
 {
-	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS);
-	if (req)
-		fuse_request_init(req);
-	return req;
+	return __fuse_request_alloc(npages, GFP_NOFS);
 }
 
 void fuse_request_free(struct fuse_req *req)
 {
+	if (req->pages != req->inline_pages) {
+		kfree(req->pages);
+		kfree(req->page_descs);
+	}
 	kmem_cache_free(fuse_req_cachep, req);
 }
 
@@ -78,7 +111,7 @@
 	sigprocmask(SIG_SETMASK, oldset, NULL);
 }
 
-static void __fuse_get_request(struct fuse_req *req)
+void __fuse_get_request(struct fuse_req *req)
 {
 	atomic_inc(&req->count);
 }
@@ -97,40 +130,66 @@
 	req->in.h.pid = current->pid;
 }
 
-struct fuse_req *fuse_get_req(struct fuse_conn *fc)
+static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
+{
+	return !fc->initialized || (for_background && fc->blocked);
+}
+
+static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
+				       bool for_background)
 {
 	struct fuse_req *req;
-	sigset_t oldset;
-	int intr;
 	int err;
-
 	atomic_inc(&fc->num_waiting);
-	block_sigs(&oldset);
-	intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked);
-	restore_sigs(&oldset);
-	err = -EINTR;
-	if (intr)
-		goto out;
+
+	if (fuse_block_alloc(fc, for_background)) {
+		sigset_t oldset;
+		int intr;
+
+		block_sigs(&oldset);
+		intr = wait_event_interruptible_exclusive(fc->blocked_waitq,
+				!fuse_block_alloc(fc, for_background));
+		restore_sigs(&oldset);
+		err = -EINTR;
+		if (intr)
+			goto out;
+	}
 
 	err = -ENOTCONN;
 	if (!fc->connected)
 		goto out;
 
-	req = fuse_request_alloc();
+	req = fuse_request_alloc(npages);
 	err = -ENOMEM;
-	if (!req)
+	if (!req) {
+		if (for_background)
+			wake_up(&fc->blocked_waitq);
 		goto out;
+	}
 
 	fuse_req_init_context(req);
 	req->waiting = 1;
+	req->background = for_background;
 	return req;
 
  out:
 	atomic_dec(&fc->num_waiting);
 	return ERR_PTR(err);
 }
+
+struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages)
+{
+	return __fuse_get_req(fc, npages, false);
+}
 EXPORT_SYMBOL_GPL(fuse_get_req);
 
+struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
+					     unsigned npages)
+{
+	return __fuse_get_req(fc, npages, true);
+}
+EXPORT_SYMBOL_GPL(fuse_get_req_for_background);
+
 /*
  * Return request in fuse_file->reserved_req.  However that may
  * currently be in use.  If that is the case, wait for it to become
@@ -166,7 +225,7 @@
 	struct fuse_file *ff = file->private_data;
 
 	spin_lock(&fc->lock);
-	fuse_request_init(req);
+	fuse_request_init(req, req->pages, req->page_descs, req->max_pages);
 	BUG_ON(ff->reserved_req);
 	ff->reserved_req = req;
 	wake_up_all(&fc->reserved_req_waitq);
@@ -187,24 +246,37 @@
  * filesystem should not have it's own file open.  If deadlock is
  * intentional, it can still be broken by "aborting" the filesystem.
  */
-struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file)
+struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
+					     struct file *file)
 {
 	struct fuse_req *req;
 
 	atomic_inc(&fc->num_waiting);
-	wait_event(fc->blocked_waitq, !fc->blocked);
-	req = fuse_request_alloc();
+	wait_event(fc->blocked_waitq, fc->initialized);
+	req = fuse_request_alloc(0);
 	if (!req)
 		req = get_reserved_req(fc, file);
 
 	fuse_req_init_context(req);
 	req->waiting = 1;
+	req->background = 0;
 	return req;
 }
 
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	if (atomic_dec_and_test(&req->count)) {
+		if (unlikely(req->background)) {
+			/*
+			 * We get here in the unlikely case that a background
+			 * request was allocated but not sent
+			 */
+			spin_lock(&fc->lock);
+			if (!fc->blocked)
+				wake_up(&fc->blocked_waitq);
+			spin_unlock(&fc->lock);
+		}
+
 		if (req->waiting)
 			atomic_dec(&fc->num_waiting);
 
@@ -302,10 +374,15 @@
 	list_del(&req->intr_entry);
 	req->state = FUSE_REQ_FINISHED;
 	if (req->background) {
-		if (fc->num_background == fc->max_background) {
+		req->background = 0;
+
+		if (fc->num_background == fc->max_background)
 			fc->blocked = 0;
-			wake_up_all(&fc->blocked_waitq);
-		}
+
+		/* Wake up next waiter, if any */
+		if (!fc->blocked && waitqueue_active(&fc->blocked_waitq))
+			wake_up(&fc->blocked_waitq);
+
 		if (fc->num_background == fc->congestion_threshold &&
 		    fc->connected && fc->bdi_initialized) {
 			clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
@@ -407,9 +484,9 @@
 	}
 }
 
-void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
-	req->isreply = 1;
+	BUG_ON(req->background);
 	spin_lock(&fc->lock);
 	if (!fc->connected)
 		req->out.h.error = -ENOTCONN;
@@ -426,12 +503,18 @@
 	}
 	spin_unlock(&fc->lock);
 }
+
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+{
+	req->isreply = 1;
+	__fuse_request_send(fc, req);
+}
 EXPORT_SYMBOL_GPL(fuse_request_send);
 
 static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
 					    struct fuse_req *req)
 {
-	req->background = 1;
+	BUG_ON(!req->background);
 	fc->num_background++;
 	if (fc->num_background == fc->max_background)
 		fc->blocked = 1;
@@ -492,6 +575,27 @@
 	fuse_request_send_nowait_locked(fc, req);
 }
 
+void fuse_force_forget(struct file *file, u64 nodeid)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req;
+	struct fuse_forget_in inarg;
+
+	memset(&inarg, 0, sizeof(inarg));
+	inarg.nlookup = 1;
+	req = fuse_get_req_nofail_nopages(fc, file);
+	req->in.h.opcode = FUSE_FORGET;
+	req->in.h.nodeid = nodeid;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	req->isreply = 0;
+	__fuse_request_send(fc, req);
+	/* ignore errors */
+	fuse_put_request(fc, req);
+}
+
 /*
  * Lock the request.  Up to the next unlock_request() there mustn't be
  * anything that could cause a page-fault.  If the request was already
@@ -693,8 +797,6 @@
 	struct page *oldpage = *pagep;
 	struct page *newpage;
 	struct pipe_buffer *buf = cs->pipebufs;
-	struct address_space *mapping;
-	pgoff_t index;
 
 	unlock_request(cs->fc, cs->req);
 	fuse_copy_finish(cs);
@@ -725,9 +827,6 @@
 	if (fuse_check_page(newpage) != 0)
 		goto out_fallback_unlock;
 
-	mapping = oldpage->mapping;
-	index = oldpage->index;
-
 	/*
 	 * This is a new and locked page, it shouldn't be mapped or
 	 * have any special flags on it
@@ -834,10 +933,10 @@
 			}
 		}
 		if (page) {
-			void *mapaddr = kmap_atomic(page, KM_USER0);
+			void *mapaddr = kmap_atomic(page);
 			void *buf = mapaddr + offset;
 			offset += fuse_copy_do(cs, &buf, &count);
-			kunmap_atomic(mapaddr, KM_USER0);
+			kunmap_atomic(mapaddr);
 		} else
 			offset += fuse_copy_do(cs, NULL, &count);
 	}
@@ -852,7 +951,7 @@
 {
 	unsigned i;
 	struct fuse_req *req = cs->req;
-	unsigned offset = req->page_offset;
+	unsigned offset = req->page_descs[0].offset;
 	unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
 
 	for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
@@ -1374,7 +1473,59 @@
 	down_read(&fc->killsb);
 	err = -ENOENT;
 	if (fc->sb)
-		err = fuse_reverse_inval_entry(fc->sb, outarg.parent, &name);
+		err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
+	up_read(&fc->killsb);
+	kfree(buf);
+	return err;
+
+err:
+	kfree(buf);
+	fuse_copy_finish(cs);
+	return err;
+}
+
+static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
+			      struct fuse_copy_state *cs)
+{
+	struct fuse_notify_delete_out outarg;
+	int err = -ENOMEM;
+	char *buf;
+	struct qstr name;
+
+	buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
+	if (!buf)
+		goto err;
+
+	err = -EINVAL;
+	if (size < sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+
+	err = -ENAMETOOLONG;
+	if (outarg.namelen > FUSE_NAME_MAX)
+		goto err;
+
+	err = -EINVAL;
+	if (size != sizeof(outarg) + outarg.namelen + 1)
+		goto err;
+
+	name.name = buf;
+	name.len = outarg.namelen;
+	err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+	if (err)
+		goto err;
+	fuse_copy_finish(cs);
+	buf[outarg.namelen] = 0;
+	name.hash = full_name_hash(name.name, name.len);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (fc->sb)
+		err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
+					       outarg.child, &name);
 	up_read(&fc->killsb);
 	kfree(buf);
 	return err;
@@ -1486,29 +1637,34 @@
 	unsigned int num;
 	unsigned int offset;
 	size_t total_len = 0;
+	int num_pages;
+
+	offset = outarg->offset & ~PAGE_CACHE_MASK;
+	file_size = i_size_read(inode);
 
-	req = fuse_get_req(fc);
+	num = outarg->size;
+	if (outarg->offset > file_size)
+		num = 0;
+	else if (outarg->offset + num > file_size)
+		num = file_size - outarg->offset;
+
+	num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ);
+
+	req = fuse_get_req(fc, num_pages);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	offset = outarg->offset & ~PAGE_CACHE_MASK;
-
 	req->in.h.opcode = FUSE_NOTIFY_REPLY;
 	req->in.h.nodeid = outarg->nodeid;
 	req->in.numargs = 2;
 	req->in.argpages = 1;
-	req->page_offset = offset;
+	req->page_descs[0].offset = offset;
 	req->end = fuse_retrieve_end;
 
 	index = outarg->offset >> PAGE_CACHE_SHIFT;
-	file_size = i_size_read(inode);
-	num = outarg->size;
-	if (outarg->offset > file_size)
-		num = 0;
-	else if (outarg->offset + num > file_size)
-		num = file_size - outarg->offset;
 
-	while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) {
+	while (num && req->num_pages < num_pages) {
 		struct page *page;
 		unsigned int this_num;
 
@@ -1594,6 +1750,9 @@
 	case FUSE_NOTIFY_RETRIEVE:
 		return fuse_notify_retrieve(fc, size, cs);
 
+	case FUSE_NOTIFY_DELETE:
+		return fuse_notify_delete(fc, size, cs);
+
 	default:
 		fuse_copy_finish(cs);
 		return -EINVAL;
@@ -1603,11 +1762,9 @@
 /* Look up request on processing list by unique ID */
 static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
 {
-	struct list_head *entry;
+	struct fuse_req *req;
 
-	list_for_each(entry, &fc->processing) {
-		struct fuse_req *req;
-		req = list_entry(entry, struct fuse_req, list);
+	list_for_each_entry(req, &fc->processing, list) {
 		if (req->in.h.unique == unique || req->intr_unique == unique)
 			return req;
 	}
@@ -1954,6 +2111,7 @@
 	if (fc->connected) {
 		fc->connected = 0;
 		fc->blocked = 0;
+		fc->initialized = 1;
 		end_io_requests(fc);
 		end_queued_requests(fc);
 		end_polls(fc);
@@ -1972,6 +2130,7 @@
 		spin_lock(&fc->lock);
 		fc->connected = 0;
 		fc->blocked = 0;
+		fc->initialized = 1;
 		end_queued_requests(fc);
 		end_polls(fc);
 		wake_up_all(&fc->blocked_waitq);
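
Note: several dev.c hunks above split the old single fc->blocked gate in two: fc->initialized holds every allocation back until the INIT reply (or an abort) arrives, while fc->blocked only throttles background requests, and waiters now sleep exclusively so each wake_up() releases exactly one of them. A hedged sketch of that two-flag gate, with illustrative names:

#include <linux/types.h>
#include <linux/wait.h>
#include <linux/sched.h>

struct ex_conn {
	unsigned initialized:1;		/* INIT reply processed (or conn aborted) */
	unsigned blocked:1;		/* background queue is congested */
	wait_queue_head_t blocked_waitq;
};

static bool ex_block_alloc(struct ex_conn *fc, bool for_background)
{
	/* foreground waits only for INIT; background also for congestion */
	return !fc->initialized || (for_background && fc->blocked);
}

static int ex_wait_for_slot(struct ex_conn *fc, bool for_background)
{
	if (!ex_block_alloc(fc, for_background))
		return 0;

	/* exclusive wait: one wake_up() releases one waiter, no thundering herd */
	return wait_event_interruptible_exclusive(fc->blocked_waitq,
				!ex_block_alloc(fc, for_background));
}
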
diff -ur a/fs/fuse/dir.c b/fs/fuse/dir.c
--- a/fs/fuse/dir.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/fuse/dir.c	2014-02-17 11:57:00.000000000 +0100
@@ -14,6 +14,36 @@
 #include <linux/namei.h>
 #include <linux/slab.h>
 
+#ifdef MY_ABC_HERE
+#include <linux/xattr.h>
+#endif
+
+#ifdef SYNO_GLUSTER_FS
+#include "../synoacl_int.h"
+#endif
+static bool fuse_use_readdirplus(struct inode *dir, struct file *filp)
+{
+	struct fuse_conn *fc = get_fuse_conn(dir);
+	struct fuse_inode *fi = get_fuse_inode(dir);
+
+	if (!fc->do_readdirplus)
+		return false;
+	if (!fc->readdirplus_auto)
+		return true;
+	if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
+		return true;
+	if (filp->f_pos == 0)
+		return true;
+	return false;
+}
+
+static void fuse_advise_use_readdirplus(struct inode *dir)
+{
+	struct fuse_inode *fi = get_fuse_inode(dir);
+
+	set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
+}
+
 #if BITS_PER_LONG >= 64
 static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
 {
@@ -157,35 +187,40 @@
 static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 {
 	struct inode *inode;
+	struct dentry *parent;
+	struct fuse_conn *fc;
+	struct fuse_inode *fi;
+	int ret;
 
 	inode = ACCESS_ONCE(entry->d_inode);
 	if (inode && is_bad_inode(inode))
-		return 0;
+		goto invalid;
 	else if (fuse_dentry_time(entry) < get_jiffies_64()) {
 		int err;
 		struct fuse_entry_out outarg;
-		struct fuse_conn *fc;
 		struct fuse_req *req;
 		struct fuse_forget_link *forget;
-		struct dentry *parent;
 		u64 attr_version;
 
 		/* For negative dentries, always do a fresh lookup */
 		if (!inode)
-			return 0;
+			goto invalid;
 
+		ret = -ECHILD;
 		if (nd && (nd->flags & LOOKUP_RCU))
-			return -ECHILD;
+			goto out;
 
 		fc = get_fuse_conn(inode);
-		req = fuse_get_req(fc);
+		req = fuse_get_req_nopages(fc);
+		ret = PTR_ERR(req);
 		if (IS_ERR(req))
-			return 0;
+			goto out;
 
 		forget = fuse_alloc_forget();
 		if (!forget) {
 			fuse_put_request(fc, req);
-			return 0;
+			ret = -ENOMEM;
+			goto out;
 		}
 
 		attr_version = fuse_get_attr_version(fc);
@@ -201,10 +236,10 @@
 		if (!err && !outarg.nodeid)
 			err = -ENOENT;
 		if (!err) {
-			struct fuse_inode *fi = get_fuse_inode(inode);
+			fi = get_fuse_inode(inode);
 			if (outarg.nodeid != get_node_id(inode)) {
 				fuse_queue_forget(fc, forget, outarg.nodeid, 1);
-				return 0;
+				goto invalid;
 			}
 			spin_lock(&fc->lock);
 			fi->nlookup++;
@@ -212,14 +247,34 @@
 		}
 		kfree(forget);
 		if (err || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
-			return 0;
+			goto invalid;
 
 		fuse_change_attributes(inode, &outarg.attr,
 				       entry_attr_timeout(&outarg),
 				       attr_version);
 		fuse_change_entry_timeout(entry, &outarg);
+	} else if (inode) {
+		fi = get_fuse_inode(inode);
+#ifdef MY_ABC_HERE
+		if (nd && (nd->flags & LOOKUP_RCU)) {
+#else
+		if (flags & LOOKUP_RCU) {
+#endif
+			if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state))
+				return -ECHILD;
+		} else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) {
+			parent = dget_parent(entry);
+			fuse_advise_use_readdirplus(parent->d_inode);
+			dput(parent);
+		}
 	}
-	return 1;
+	ret = 1;
+out:
+	return ret;
+
+invalid:
+	ret = 0;
+	goto out;
 }
 
 static int invalid_nodeid(u64 nodeid)
@@ -237,26 +292,6 @@
 		S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
 }
 
-/*
- * Add a directory inode to a dentry, ensuring that no other dentry
- * refers to this inode.  Called with fc->inst_mutex.
- */
-static struct dentry *fuse_d_add_directory(struct dentry *entry,
-					   struct inode *inode)
-{
-	struct dentry *alias = d_find_alias(inode);
-	if (alias && !(alias->d_flags & DCACHE_DISCONNECTED)) {
-		/* This tries to shrink the subtree below alias */
-		fuse_invalidate_entry(alias);
-		dput(alias);
-		if (!list_empty(&inode->i_dentry))
-			return ERR_PTR(-EBUSY);
-	} else {
-		dput(alias);
-	}
-	return d_splice_alias(inode, entry);
-}
-
 int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 		     struct fuse_entry_out *outarg, struct inode **inode)
 {
@@ -271,7 +306,7 @@
 	if (name->len > FUSE_NAME_MAX)
 		goto out;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	err = PTR_ERR(req);
 	if (IS_ERR(req))
 		goto out;
@@ -315,6 +350,24 @@
 	return err;
 }
 
+static struct dentry *fuse_materialise_dentry(struct dentry *dentry,
+					      struct inode *inode)
+{
+	struct dentry *newent;
+
+	if (inode && S_ISDIR(inode->i_mode)) {
+		struct fuse_conn *fc = get_fuse_conn(inode);
+
+		mutex_lock(&fc->inst_mutex);
+		newent = d_materialise_unique(dentry, inode);
+		mutex_unlock(&fc->inst_mutex);
+	} else {
+		newent = d_materialise_unique(dentry, inode);
+	}
+
+	return newent;
+}
+
 static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 				  struct nameidata *nd)
 {
@@ -322,7 +375,6 @@
 	struct fuse_entry_out outarg;
 	struct inode *inode;
 	struct dentry *newent;
-	struct fuse_conn *fc = get_fuse_conn(dir);
 	bool outarg_valid = true;
 
 	err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
@@ -338,16 +390,10 @@
 	if (inode && get_node_id(inode) == FUSE_ROOT_ID)
 		goto out_iput;
 
-	if (inode && S_ISDIR(inode->i_mode)) {
-		mutex_lock(&fc->inst_mutex);
-		newent = fuse_d_add_directory(entry, inode);
-		mutex_unlock(&fc->inst_mutex);
-		err = PTR_ERR(newent);
-		if (IS_ERR(newent))
-			goto out_iput;
-	} else {
-		newent = d_splice_alias(inode, entry);
-	}
+	newent = fuse_materialise_dentry(entry, inode);
+	err = PTR_ERR(newent);
+	if (IS_ERR(newent))
+		goto out_err;
 
 	entry = newent ? newent : entry;
 	if (outarg_valid)
@@ -355,6 +401,7 @@
 	else
 		fuse_invalidate_entry_cache(entry);
 
+	fuse_advise_use_readdirplus(dir);
 	return newent;
 
  out_iput:
@@ -369,8 +416,8 @@
  * If the filesystem doesn't support this, then fall back to separate
  * 'mknod' + 'open' requests.
  */
-static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
-			    struct nameidata *nd)
+static int fuse_create_open(struct inode *dir, struct dentry *entry,
+			    umode_t mode, struct nameidata *nd)
 {
 	int err;
 	struct inode *inode;
@@ -387,14 +434,14 @@
 	if (fc->no_create)
 		return -ENOSYS;
 
-	if (flags & O_DIRECT)
-		return -EINVAL;
+	/* Userspace expects S_IFREG in create mode */
+	BUG_ON((mode & S_IFMT) != S_IFREG);
 
 	forget = fuse_alloc_forget();
 	if (!forget)
 		return -ENOMEM;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	err = PTR_ERR(req);
 	if (IS_ERR(req))
 		goto out_put_forget_req;
@@ -480,7 +527,7 @@
  */
 static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 			    struct inode *dir, struct dentry *entry,
-			    int mode)
+			    umode_t mode)
 {
 	struct fuse_entry_out outarg;
 	struct inode *inode;
@@ -552,7 +599,7 @@
 {
 	struct fuse_mknod_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -589,7 +636,7 @@
 {
 	struct fuse_mkdir_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -613,7 +660,7 @@
 {
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	unsigned len = strlen(link) + 1;
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -630,7 +677,7 @@
 {
 	int err;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -644,13 +691,19 @@
 	fuse_put_request(fc, req);
 	if (!err) {
 		struct inode *inode = entry->d_inode;
+		struct fuse_inode *fi = get_fuse_inode(inode);
 
+		spin_lock(&fc->lock);
+		fi->attr_version = ++fc->attr_version;
 		/*
-		 * Set nlink to zero so the inode can be cleared, if the inode
-		 * does have more links this will be discovered at the next
-		 * lookup/getattr.
+		 * If i_nlink == 0 then unlink doesn't make sense, yet this can
+		 * happen if userspace filesystem is careless.  It would be
+		 * difficult to enforce correct nlink usage so just ignore this
+		 * condition here
 		 */
-		clear_nlink(inode);
+		if (inode->i_nlink > 0)
+			drop_nlink(inode);
+		spin_unlock(&fc->lock);
 		fuse_invalidate_attr(inode);
 		fuse_invalidate_attr(dir);
 		fuse_invalidate_entry_cache(entry);
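
Note: the unlink and link hunks above stop force-zeroing i_nlink. The count is adjusted under fc->lock together with a bump of the attribute version, so attribute replies generated before the change are discarded, and an already-zero count from a careless userspace filesystem is simply left alone. A reduced sketch of that locked adjustment; the connection and per-inode structs are stand-ins:

#include <linux/types.h>
#include <linux/fs.h>
#include <linux/spinlock.h>

struct ex_conn {
	spinlock_t lock;
	u64 attr_version;
};

struct ex_inode_info {
	u64 attr_version;
};

static void ex_account_unlink(struct ex_conn *fc, struct ex_inode_info *fi,
			      struct inode *inode)
{
	spin_lock(&fc->lock);
	/* stale getattr replies now carry an older version and are ignored */
	fi->attr_version = ++fc->attr_version;
	/*
	 * A buggy server may already report nlink == 0; don't underflow,
	 * the next attribute refresh will resynchronize the real count.
	 */
	if (inode->i_nlink > 0)
		drop_nlink(inode);
	spin_unlock(&fc->lock);
}
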
@@ -663,7 +716,7 @@
 {
 	int err;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -690,7 +743,7 @@
 	int err;
 	struct fuse_rename_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(olddir);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 
 	if (IS_ERR(req))
 		return PTR_ERR(req);
@@ -743,7 +796,7 @@
 	struct fuse_link_in inarg;
 	struct inode *inode = entry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -762,14 +815,25 @@
 	   will reflect changes in the backing inode (link count,
 	   etc.)
 	*/
-	if (!err || err == -EINTR)
+	if (!err) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+
+		spin_lock(&fc->lock);
+		fi->attr_version = ++fc->attr_version;
+		inc_nlink(inode);
+		spin_unlock(&fc->lock);
+		fuse_invalidate_attr(inode);
+	} else if (err == -EINTR) {
 		fuse_invalidate_attr(inode);
+	}
 	return err;
 }
 
 static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
 			  struct kstat *stat)
 {
+	unsigned int blkbits;
+
 	stat->dev = inode->i_sb->s_dev;
 	stat->ino = attr->ino;
 	stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
@@ -785,18 +849,13 @@
 	stat->ctime.tv_nsec = attr->ctimensec;
 	stat->size = attr->size;
 	stat->blocks = attr->blocks;
-	stat->blksize = (1 << inode->i_blkbits);
-#ifdef MY_ABC_HERE
-	stat->SynoMode = 0;
-#endif
-#ifdef MY_ABC_HERE
-	/* we don't support syno archive version in fuse by now */
-	stat->syno_archive_version = 0;
-#endif
-#ifdef MY_ABC_HERE
-	stat->SynoCreateTime.tv_sec = 0;
-	stat->SynoCreateTime.tv_nsec = 0;
-#endif
+
+	if (attr->blksize != 0)
+		blkbits = ilog2(attr->blksize);
+	else
+		blkbits = inode->i_sb->s_blocksize_bits;
+
+	stat->blksize = 1 << blkbits;
 }
 
 static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
@@ -809,7 +868,7 @@
 	struct fuse_req *req;
 	u64 attr_version;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -880,7 +939,7 @@
 }
 
 int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
-			     struct qstr *name)
+			     u64 child_nodeid, struct qstr *name)
 {
 	int err = -ENOTDIR;
 	struct inode *parent;
@@ -907,8 +966,36 @@
 
 	fuse_invalidate_attr(parent);
 	fuse_invalidate_entry(entry);
+
+	if (child_nodeid != 0 && entry->d_inode) {
+		mutex_lock(&entry->d_inode->i_mutex);
+		if (get_node_id(entry->d_inode) != child_nodeid) {
+			err = -ENOENT;
+			goto badentry;
+		}
+		if (d_mountpoint(entry)) {
+			err = -EBUSY;
+			goto badentry;
+		}
+		if (S_ISDIR(entry->d_inode->i_mode)) {
+			shrink_dcache_parent(entry);
+			if (!simple_empty(entry)) {
+				err = -ENOTEMPTY;
+				goto badentry;
+			}
+			entry->d_inode->i_flags |= S_DEAD;
+		}
+		dont_mount(entry);
+		clear_nlink(entry->d_inode);
+		err = 0;
+ badentry:
+		mutex_unlock(&entry->d_inode->i_mutex);
+		if (!err)
+			d_delete(entry);
+	} else {
+		err = 0;
+	}
 	dput(entry);
-	err = 0;
 
  unlock:
 	mutex_unlock(&parent->i_mutex);
@@ -918,7 +1005,7 @@
 
 /*
  * Calling into a user-controlled filesystem gives the filesystem
- * daemon ptrace-like capabilities over the requester process.  This
+ * daemon ptrace-like capabilities over the current process.  This
  * means, that the filesystem daemon is able to record the exact
  * filesystem operations performed, and can also control the behavior
  * of the requester process in otherwise impossible ways.  For example
@@ -929,27 +1016,23 @@
  * for which the owner of the mount has ptrace privilege.  This
  * excludes processes started by other users, suid or sgid processes.
  */
-int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task)
+int fuse_allow_current_process(struct fuse_conn *fc)
 {
 	const struct cred *cred;
-	int ret;
 
 	if (fc->flags & FUSE_ALLOW_OTHER)
 		return 1;
 
-	rcu_read_lock();
-	ret = 0;
-	cred = __task_cred(task);
+	cred = current_cred();
 	if (cred->euid == fc->user_id &&
 	    cred->suid == fc->user_id &&
 	    cred->uid  == fc->user_id &&
 	    cred->egid == fc->group_id &&
 	    cred->sgid == fc->group_id &&
 	    cred->gid  == fc->group_id)
-		ret = 1;
-	rcu_read_unlock();
+		return 1;
 
-	return ret;
+	return 0;
 }
 
 static int fuse_access(struct inode *inode, int mask)
@@ -959,10 +1042,12 @@
 	struct fuse_access_in inarg;
 	int err;
 
+	BUG_ON(mask & MAY_NOT_BLOCK);
+
 	if (fc->no_access)
 		return 0;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1010,7 +1095,7 @@
 	bool refreshed = false;
 	int err = 0;
 
-	if (!fuse_allow_task(fc, current))
+	if (!fuse_allow_current_process(fc))
 		return -EACCES;
 
 	/*
@@ -1046,9 +1131,6 @@
 		   noticed immediately, only after the attribute
 		   timeout has expired */
 	} else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
-		if (mask & MAY_NOT_BLOCK)
-			return -ECHILD;
-
 		err = fuse_access(inode, mask);
 	} else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
 		if (!(inode->i_mode & S_IXUGO)) {
@@ -1074,6 +1156,8 @@
 			return -EIO;
 		if (reclen > nbytes)
 			break;
+		if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+			return -EIO;
 
 		over = filldir(dstbuf, dirent->name, dirent->namelen,
 			       file->f_pos, dirent->ino, dirent->type);
@@ -1088,19 +1172,178 @@
 	return 0;
 }
 
-static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
+static int fuse_direntplus_link(struct file *file,
+				struct fuse_direntplus *direntplus,
+				u64 attr_version)
 {
 	int err;
+	struct fuse_entry_out *o = &direntplus->entry_out;
+	struct fuse_dirent *dirent = &direntplus->dirent;
+	struct dentry *parent = file->f_path.dentry;
+#ifdef MY_ABC_HERE
+	struct qstr name = { .len = dirent->namelen, .name = dirent->name};
+#else
+	struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
+#endif
+	struct dentry *dentry;
+	struct dentry *alias;
+	struct inode *dir = parent->d_inode;
+	struct fuse_conn *fc;
+	struct inode *inode;
+
+	if (!o->nodeid) {
+		/*
+		 * Unlike in the case of fuse_lookup, zero nodeid does not mean
+		 * ENOENT. Instead, it only means the userspace filesystem did
+		 * not want to return attributes/handle for this entry.
+		 *
+		 * So do nothing.
+		 */
+		return 0;
+	}
+
+	if (name.name[0] == '.') {
+		/*
+		 * We could potentially refresh the attributes of the directory
+		 * and its parent?
+		 */
+		if (name.len == 1)
+			return 0;
+		if (name.name[1] == '.' && name.len == 2)
+			return 0;
+	}
+
+	if (invalid_nodeid(o->nodeid))
+		return -EIO;
+	if (!fuse_valid_type(o->attr.mode))
+		return -EIO;
+
+	fc = get_fuse_conn(dir);
+
+	name.hash = full_name_hash(name.name, name.len);
+	dentry = d_lookup(parent, &name);
+	if (dentry) {
+		inode = dentry->d_inode;
+		if (!inode) {
+			d_drop(dentry);
+		} else if (get_node_id(inode) != o->nodeid ||
+			   ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
+			err = d_invalidate(dentry);
+			if (err)
+				goto out;
+		} else if (is_bad_inode(inode)) {
+			err = -EIO;
+			goto out;
+		} else {
+			struct fuse_inode *fi;
+			fi = get_fuse_inode(inode);
+			spin_lock(&fc->lock);
+			fi->nlookup++;
+			spin_unlock(&fc->lock);
+
+			fuse_change_attributes(inode, &o->attr,
+					       entry_attr_timeout(o),
+					       attr_version);
+
+			/*
+			 * The other branch to 'found' comes via fuse_iget()
+			 * which bumps nlookup inside
+			 */
+			goto found;
+		}
+		dput(dentry);
+	}
+
+	dentry = d_alloc(parent, &name);
+	err = -ENOMEM;
+	if (!dentry)
+		goto out;
+
+	inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
+			  &o->attr, entry_attr_timeout(o), attr_version);
+	if (!inode)
+		goto out;
+
+	alias = fuse_materialise_dentry(dentry, inode);
+	err = PTR_ERR(alias);
+	if (IS_ERR(alias))
+		goto out;
+
+	if (alias) {
+		dput(dentry);
+		dentry = alias;
+	}
+
+found:
+	if (fc->readdirplus_auto)
+		set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
+	fuse_change_entry_timeout(dentry, o);
+
+	err = 0;
+out:
+	dput(dentry);
+	return err;
+}
+
+static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
+			     void *dstbuf, filldir_t filldir, u64 attr_version)
+{
+	struct fuse_direntplus *direntplus;
+	struct fuse_dirent *dirent;
+	size_t reclen;
+	int over = 0;
+	int ret;
+
+	while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
+		direntplus = (struct fuse_direntplus *) buf;
+		dirent = &direntplus->dirent;
+		reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
+
+		if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+			return -EIO;
+		if (reclen > nbytes)
+			break;
+		if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+			return -EIO;
+
+		if (!over) {
+			/* We fill entries into dstbuf only as much as
+			   it can hold. But we still continue iterating
+			   over remaining entries to link them. If not,
+			   we need to send a FORGET for each of those
+			   which we did not link.
+			*/
+			over = filldir(dstbuf, dirent->name, dirent->namelen,
+				       file->f_pos, dirent->ino,
+				       dirent->type);
+			file->f_pos = dirent->off;
+		}
+
+		buf += reclen;
+		nbytes -= reclen;
+
+		ret = fuse_direntplus_link(file, direntplus, attr_version);
+		if (ret)
+			fuse_force_forget(file, direntplus->entry_out.nodeid);
+	}
+
+	return 0;
+}
+
+static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
+{
+	int plus, err;
 	size_t nbytes;
 	struct page *page;
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req;
+	u64 attr_version = 0;
 
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req(fc, 1);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1109,17 +1352,34 @@
 		fuse_put_request(fc, req);
 		return -ENOMEM;
 	}
+
+	plus = fuse_use_readdirplus(inode, file);
 	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
-	fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR);
+	req->page_descs[0].length = PAGE_SIZE;
+	if (plus) {
+		attr_version = fuse_get_attr_version(fc);
+		fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
+			       FUSE_READDIRPLUS);
+	} else {
+		fuse_read_fill(req, file, file->f_pos, PAGE_SIZE,
+			       FUSE_READDIR);
+	}
 	fuse_request_send(fc, req);
 	nbytes = req->out.args[0].size;
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
-	if (!err)
-		err = parse_dirfile(page_address(page), nbytes, file, dstbuf,
-				    filldir);
+	if (!err) {
+		if (plus) {
+			err = parse_dirplusfile(page_address(page), nbytes,
+						file, dstbuf, filldir,
+						attr_version);
+		} else {
+			err = parse_dirfile(page_address(page), nbytes, file,
+					    dstbuf, filldir);
+		}
+	}
 
 	__free_page(page);
 	fuse_invalidate_attr(inode); /* atime changed */
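
Note: both parse_dirfile() and the new parse_dirplusfile() above validate every record before it reaches filldir: the name length must be non-zero and bounded, the record must fit in what is left of the buffer, and names containing '/' are rejected outright, since the server controls the buffer contents. A stripped-down sketch of that defensive walk over a flat record buffer; the record layout here is illustrative, not the on-wire FUSE format:

#include <linux/types.h>
#include <linux/string.h>
#include <linux/errno.h>

struct ex_dirent {			/* illustrative record header */
	unsigned int namelen;
	unsigned int reclen;		/* header + name, already padded */
	char name[];
};

#define EX_NAME_MAX 255

static int ex_parse(char *buf, size_t nbytes)
{
	while (nbytes >= sizeof(struct ex_dirent)) {
		struct ex_dirent *d = (struct ex_dirent *)buf;

		if (!d->namelen || d->namelen > EX_NAME_MAX)
			return -EIO;		/* corrupt length */
		if (d->reclen > nbytes)
			break;			/* partial record: stop cleanly */
		if (sizeof(*d) + d->namelen > d->reclen)
			return -EIO;		/* name overruns its record */
		if (memchr(d->name, '/', d->namelen))
			return -EIO;		/* names must never contain '/' */

		/* ...emitting the entry (filldir in the code above) goes here... */

		buf += d->reclen;
		nbytes -= d->reclen;
	}
	return 0;
}
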
@@ -1130,7 +1390,7 @@
 {
 	struct inode *inode = dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_req(fc);
+	struct fuse_req *req = fuse_get_req_nopages(fc);
 	char *link;
 
 	if (IS_ERR(req))
@@ -1194,6 +1454,30 @@
 	return fuse_fsync_common(file, start, end, datasync, 1);
 }
 
+static long fuse_dir_ioctl(struct file *file, unsigned int cmd,
+			    unsigned long arg)
+{
+	struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
+
+	/* FUSE_IOCTL_DIR only supported for API version >= 7.18 */
+	if (fc->minor < 18)
+		return -ENOTTY;
+
+	return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR);
+}
+
+static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
+				   unsigned long arg)
+{
+	struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
+
+	if (fc->minor < 18)
+		return -ENOTTY;
+
+	return fuse_ioctl_common(file, cmd, arg,
+				 FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
+}
+
 static bool update_mtime(unsigned ivalid)
 {
 	/* Always update if mtime is explicitly set  */
@@ -1288,11 +1572,11 @@
  * vmtruncate() doesn't allow for this case, so do the rlimit checking
  * and the actual truncation by hand.
  */
-static int fuse_do_setattr(struct dentry *entry, struct iattr *attr,
-			   struct file *file)
+int fuse_do_setattr(struct inode *inode, struct iattr *attr,
+		    struct file *file)
 {
-	struct inode *inode = entry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_req *req;
 	struct fuse_setattr_in inarg;
 	struct fuse_attr_out outarg;
@@ -1300,9 +1584,6 @@
 	loff_t oldsize;
 	int err;
 
-	if (!fuse_allow_task(fc, current))
-		return -EACCES;
-
 	if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
 		attr->ia_valid |= ATTR_FORCE;
 
@@ -1319,12 +1600,14 @@
 	if (attr->ia_valid & ATTR_SIZE)
 		is_truncate = true;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	if (is_truncate)
+	if (is_truncate) {
 		fuse_set_nowrite(inode);
+		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+	}
 
 	memset(&inarg, 0, sizeof(inarg));
 	memset(&outarg, 0, sizeof(outarg));
@@ -1386,21 +1669,28 @@
 		invalidate_inode_pages2(inode->i_mapping);
 	}
 
+	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 	return 0;
 
 error:
 	if (is_truncate)
 		fuse_release_nowrite(inode);
 
+	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 	return err;
 }
 
 static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 {
+	struct inode *inode = entry->d_inode;
+
+	if (!fuse_allow_current_process(get_fuse_conn(inode)))
+		return -EACCES;
+
 	if (attr->ia_valid & ATTR_FILE)
-		return fuse_do_setattr(entry, attr, attr->ia_file);
+		return fuse_do_setattr(inode, attr, attr->ia_file);
 	else
-		return fuse_do_setattr(entry, attr, NULL);
+		return fuse_do_setattr(inode, attr, NULL);
 }
 
 static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
@@ -1409,7 +1699,7 @@
 	struct inode *inode = entry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
-	if (!fuse_allow_task(fc, current))
+	if (!fuse_allow_current_process(fc))
 		return -EACCES;
 
 	return fuse_update_attributes(inode, stat, NULL, NULL);
@@ -1427,7 +1717,11 @@
 	if (fc->no_setxattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_req(fc);
+#ifdef SYNO_GLUSTER_FS
+	if (IS_GLUSTER_FS(inode))
+		SYNOACL_XATTR_CHGNAME(name)
+#endif
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1450,6 +1744,8 @@
 		fc->no_setxattr = 1;
 		err = -EOPNOTSUPP;
 	}
+	if (!err)
+		fuse_invalidate_attr(inode);
 	return err;
 }
 
@@ -1466,7 +1762,11 @@
 	if (fc->no_getxattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_req(fc);
+#ifdef SYNO_GLUSTER_FS
+	if (IS_GLUSTER_FS(inode))
+		SYNOACL_XATTR_CHGNAME(name)
+#endif
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1512,13 +1812,13 @@
 	struct fuse_getxattr_out outarg;
 	ssize_t ret;
 
-	if (!fuse_allow_task(fc, current))
+	if (!fuse_allow_current_process(fc))
 		return -EACCES;
 
 	if (fc->no_listxattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1563,7 +1863,11 @@
 	if (fc->no_removexattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_req(fc);
+#ifdef SYNO_GLUSTER_FS
+	if (IS_GLUSTER_FS(inode))
+		SYNOACL_XATTR_CHGNAME(name)
+#endif
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1579,9 +1883,133 @@
 		fc->no_removexattr = 1;
 		err = -EOPNOTSUPP;
 	}
+	if (!err)
+		fuse_invalidate_attr(inode);
 	return err;
 }
 
+#ifdef MY_ABC_HERE
+static int fuse_syno_arbit_get(struct dentry *dentry, unsigned int *pArbit)
+{
+	unsigned int arVal = 0;
+	struct inode *inode;
+	ssize_t ret;
+
+	if (!dentry || !pArbit)
+		return -EINVAL;
+
+	inode = dentry->d_inode;
+#ifdef MY_ABC_HERE
+	if (!IS_SYNO_META_XATTR(inode))
+		return -EOPNOTSUPP;
+#endif
+
+	ret = fuse_getxattr(dentry, XATTR_SYNO_PREFIX""XATTR_SYNO_ARCHIVE_BIT_NOPERM, &arVal, sizeof(arVal));
+	if (0 > ret)
+		return ret;
+
+	*pArbit = arVal;
+
+	return 0;
+}
+
+static int fuse_syno_arbit_set(struct dentry *dentry, unsigned int arbit)
+{
+	int err = -EINVAL;
+	struct inode *inode = dentry->d_inode;
+
+	if (!inode->i_op->setxattr)
+		return -EOPNOTSUPP;
+
+#ifdef MY_ABC_HERE
+	if (!IS_SYNO_META_XATTR(inode))
+		return -EOPNOTSUPP;
+#endif
+
+	err = fuse_setxattr(dentry, XATTR_SYNO_PREFIX""XATTR_SYNO_ARCHIVE_BIT_NOPERM, &arbit, sizeof(arbit), 0);
+	//printk(KERN_ERR "fuse_arbit_set: [%s] xattr name: ["XATTR_SYNO_PREFIX""XATTR_SYNO_ARCHIVE_BIT_NOPERM"], value: [%u] err: [%d] \n", d->d_name.name, arbit, err);
+	if (0 > err)
+		goto Err;
+
+	err = 0;
+Err:
+	return err;
+}
+#endif //MY_ABC_HERE
+
+#ifdef MY_ABC_HERE
+static int fuse_create_time_set(struct dentry *dentry, struct timespec *t)
+{
+	long err = -EINVAL;
+	struct inode *inode = dentry->d_inode;
+
+	if (!inode->i_op->setxattr)
+		return -EOPNOTSUPP;
+
+#ifdef MY_ABC_HERE
+	if (!IS_SYNO_META_XATTR(inode))
+		return -EOPNOTSUPP;
+#endif
+
+	err = fuse_setxattr(dentry, XATTR_SYNO_PREFIX""XATTR_SYNO_CREATE_TIME, t, sizeof(struct timespec), 0);
+	if (0 > err)
+		goto Err;
+
+	err = 0;
+Err:
+	return err;
+}
+
+static int fuse_create_time_get(struct dentry *dentry, struct timespec *t)
+{
+	long err = -EINVAL;
+	struct inode *inode = dentry->d_inode;
+
+	if (!inode->i_op->getxattr)
+		return -EOPNOTSUPP;
+
+	err = fuse_getxattr(dentry, XATTR_SYNO_PREFIX""XATTR_SYNO_CREATE_TIME, t, sizeof(struct timespec));
+	if (0 > err)
+		goto Err;
+
+	err = 0;
+Err:
+	return err;
+}
+#endif //MY_ABC_HERE 
+
+#ifdef MY_ABC_HERE
+static int fuse_syno_getattr(struct dentry *dentry, struct kstat *stat, int flags)
+{
+	int err = 0;
+	struct inode * inode = dentry->d_inode;
+
+#ifdef MY_ABC_HERE
+	if (!IS_SYNO_META_XATTR(inode))
+		return -EOPNOTSUPP;
+#endif
+
+#ifdef MY_ABC_HERE
+	if (flags & SYNOST_CREATIME) {
+		err = fuse_create_time_get(dentry, &stat->SynoCreateTime);
+		if (0 > err) {
+			stat->SynoCreateTime.tv_sec = 0;
+			stat->SynoCreateTime.tv_nsec = 0;
+		}
+	}
+#endif
+#ifdef MY_ABC_HERE
+	if (flags & SYNOST_ARBIT) {
+		err = fuse_syno_arbit_get(dentry, &stat->SynoMode);
+		if (0 > err) {
+			stat->SynoMode = 0;
+		}
+	}
+#endif
+	return 0;
+}
+#endif //MY_ABC_HERE
+
 static const struct inode_operations fuse_dir_inode_operations = {
 	.lookup		= fuse_lookup,
 	.mkdir		= fuse_mkdir,
@@ -1599,6 +2027,16 @@
 	.getxattr	= fuse_getxattr,
 	.listxattr	= fuse_listxattr,
 	.removexattr	= fuse_removexattr,
+#ifdef MY_ABC_HERE
+	.syno_getattr	= fuse_syno_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_bit = fuse_syno_arbit_get,
+	.syno_set_archive_bit = fuse_syno_arbit_set,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_crtime = fuse_create_time_set,
+#endif
 };
 
 static const struct file_operations fuse_dir_operations = {
@@ -1608,6 +2046,8 @@
 	.open		= fuse_dir_open,
 	.release	= fuse_dir_release,
 	.fsync		= fuse_dir_fsync,
+	.unlocked_ioctl	= fuse_dir_ioctl,
+	.compat_ioctl	= fuse_dir_compat_ioctl,
 };
 
 static const struct inode_operations fuse_common_inode_operations = {
@@ -1618,6 +2058,16 @@
 	.getxattr	= fuse_getxattr,
 	.listxattr	= fuse_listxattr,
 	.removexattr	= fuse_removexattr,
+#ifdef MY_ABC_HERE
+	.syno_getattr	= fuse_syno_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_bit = fuse_syno_arbit_get,
+	.syno_set_archive_bit = fuse_syno_arbit_set,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_crtime = fuse_create_time_set,
+#endif
 };
 
 static const struct inode_operations fuse_symlink_inode_operations = {
@@ -1630,6 +2080,16 @@
 	.getxattr	= fuse_getxattr,
 	.listxattr	= fuse_listxattr,
 	.removexattr	= fuse_removexattr,
+#ifdef MY_ABC_HERE
+	.syno_getattr	= fuse_syno_getattr,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_get_archive_bit = fuse_syno_arbit_get,
+	.syno_set_archive_bit = fuse_syno_arbit_set,
+#endif
+#ifdef MY_ABC_HERE
+	.syno_set_crtime = fuse_create_time_set,
+#endif
 };
 
 void fuse_init_common(struct inode *inode)
diff -ur a/fs/fuse/file.c b/fs/fuse/file.c
--- a/fs/fuse/file.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/fuse/file.c	2014-02-17 11:57:00.000000000 +0100
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/compat.h>
 #include <linux/swap.h>
+#include <linux/falloc.h>
 
 static const struct file_operations fuse_direct_io_file_operations;
 
@@ -25,7 +26,7 @@
 	struct fuse_req *req;
 	int err;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -57,7 +58,7 @@
 		return NULL;
 
 	ff->fc = fc;
-	ff->reserved_req = fuse_request_alloc();
+	ff->reserved_req = fuse_request_alloc(0);
 	if (unlikely(!ff->reserved_req)) {
 		kfree(ff);
 		return NULL;
@@ -126,11 +127,13 @@
 		struct fuse_req *req = ff->reserved_req;
 
 		if (sync) {
+			req->background = 0;
 			fuse_request_send(ff->fc, req);
 			path_put(&req->misc.release.path);
 			fuse_put_request(ff->fc, req);
 		} else {
 			req->end = fuse_release_end;
+			req->background = 1;
 			fuse_request_send_background(ff->fc, req);
 		}
 		kfree(ff);
@@ -194,10 +197,6 @@
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	int err;
 
-	/* VFS checks this, but only _after_ ->open() */
-	if (file->f_flags & O_DIRECT)
-		return -EINVAL;
-
 	err = generic_file_open(inode, file);
 	if (err)
 		return err;
@@ -286,6 +285,7 @@
 	WARN_ON(atomic_read(&ff->count) > 1);
 	fuse_prepare_release(ff, flags, FUSE_RELEASE);
 	ff->reserved_req->force = 1;
+	ff->reserved_req->background = 0;
 	fuse_request_send(ff->fc, ff->reserved_req);
 	fuse_put_request(ff->fc, ff->reserved_req);
 	kfree(ff);
@@ -372,7 +372,7 @@
 	if (fc->no_flush)
 		return 0;
 
-	req = fuse_get_req_nofail(fc, file);
+	req = fuse_get_req_nofail_nopages(fc, file);
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
 	inarg.lock_owner = fuse_lock_owner_id(fc, id);
@@ -440,7 +440,7 @@
 
 	fuse_sync_writes(inode);
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		goto out;
@@ -495,9 +495,115 @@
 	req->out.args[0].size = count;
 }
 
-static size_t fuse_send_read(struct fuse_req *req, struct file *file,
+static void fuse_release_user_pages(struct fuse_req *req, int write)
+{
+	unsigned i;
+
+	for (i = 0; i < req->num_pages; i++) {
+		struct page *page = req->pages[i];
+		if (write)
+			set_page_dirty_lock(page);
+		put_page(page);
+	}
+}
+
+/**
+ * In case of short read, the caller sets 'pos' to the position of
+ * actual end of fuse request in IO request. Otherwise, if bytes_requested
+ * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
+ *
+ * An example:
+ * User requested DIO read of 64K. It was split into two 32K fuse requests,
+ * both submitted asynchronously. The first of them was ACKed by userspace as
+ * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
+ * second request was ACKed as short, e.g. only 1K was read, resulting in
+ * pos == 33K.
+ *
+ * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
+ * will be equal to the length of the longest contiguous fragment of
+ * transferred data starting from the beginning of IO request.
+ */
+static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
+{
+	int left;
+
+	spin_lock(&io->lock);
+	if (err)
+		io->err = io->err ? : err;
+	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
+		io->bytes = pos;
+
+	left = --io->reqs;
+	spin_unlock(&io->lock);
+
+	if (!left) {
+		long res;
+
+		if (io->err)
+			res = io->err;
+		else if (io->bytes >= 0 && io->write)
+			res = -EIO;
+		else {
+			res = io->bytes < 0 ? io->size : io->bytes;
+
+			if (!is_sync_kiocb(io->iocb)) {
+				struct path *path = &io->iocb->ki_filp->f_path;
+				struct inode *inode = path->dentry->d_inode;
+				struct fuse_conn *fc = get_fuse_conn(inode);
+				struct fuse_inode *fi = get_fuse_inode(inode);
+
+				spin_lock(&fc->lock);
+				fi->attr_version = ++fc->attr_version;
+				spin_unlock(&fc->lock);
+			}
+		}
+
+		aio_complete(io->iocb, res, 0);
+		kfree(io);
+	}
+}
+
+static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
+{
+	struct fuse_io_priv *io = req->io;
+	ssize_t pos = -1;
+
+	fuse_release_user_pages(req, !io->write);
+
+	if (io->write) {
+		if (req->misc.write.in.size != req->misc.write.out.size)
+			pos = req->misc.write.in.offset - io->offset +
+				req->misc.write.out.size;
+	} else {
+		if (req->misc.read.in.size != req->out.args[0].size)
+			pos = req->misc.read.in.offset - io->offset +
+				req->out.args[0].size;
+	}
+
+	fuse_aio_complete(io, req->out.h.error, pos);
+}
+
+static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
+		size_t num_bytes, struct fuse_io_priv *io)
+{
+	spin_lock(&io->lock);
+	io->size += num_bytes;
+	io->reqs++;
+	spin_unlock(&io->lock);
+
+	req->io = io;
+	req->end = fuse_aio_complete_req;
+
+	__fuse_get_request(req);
+	fuse_request_send_background(fc, req);
+
+	return num_bytes;
+}
+
+static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
 			     loff_t pos, size_t count, fl_owner_t owner)
 {
+	struct file *file = io->file;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
 
@@ -508,6 +614,10 @@
 		inarg->read_flags |= FUSE_READ_LOCKOWNER;
 		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
 	}
+
+	if (io->async)
+		return fuse_async_req_send(fc, req, count, io);
+
 	fuse_request_send(fc, req);
 	return req->out.args[0].size;
 }
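
Note: fuse_aio_complete() above keeps one shared fuse_io_priv per user IO and folds each asynchronous sub-request back into it: errors stick, short completions record the smallest end position, and only the last decrement of io->reqs reports a result, so the returned length is the longest contiguous prefix that every sub-request actually transferred. A hedged sketch of just that accounting, with a stand-in struct:

#include <linux/types.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct ex_io {				/* stand-in for struct fuse_io_priv */
	spinlock_t lock;
	int reqs;			/* sub-requests still in flight */
	int err;			/* first error seen, if any */
	ssize_t bytes;			/* smallest short-completion end, -1 if none */
	size_t size;			/* total bytes submitted */
};

static void ex_io_complete(struct ex_io *io, int err, ssize_t pos)
{
	bool last;
	ssize_t res;

	spin_lock(&io->lock);
	if (err)
		io->err = io->err ?: err;
	else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
		io->bytes = pos;	/* remember the earliest short completion */
	last = (--io->reqs == 0);
	spin_unlock(&io->lock);

	if (!last)
		return;

	/* report the contiguous prefix that really completed */
	res = io->err ? io->err : (io->bytes < 0 ? (ssize_t)io->size : io->bytes);
	(void)res;			/* the real code hands res to aio_complete() */
	kfree(io);
}
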
@@ -519,7 +629,8 @@
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	spin_lock(&fc->lock);
-	if (attr_ver == fi->attr_version && size < inode->i_size) {
+	if (attr_ver == fi->attr_version && size < inode->i_size &&
+	    !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
 		fi->attr_version = ++fc->attr_version;
 		i_size_write(inode, size);
 	}
@@ -528,6 +639,7 @@
 
 static int fuse_readpage(struct file *file, struct page *page)
 {
+	struct fuse_io_priv io = { .async = 0, .file = file };
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req;
@@ -548,7 +660,7 @@
 	 */
 	fuse_wait_on_page_writeback(inode, page->index);
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req(fc, 1);
 	err = PTR_ERR(req);
 	if (IS_ERR(req))
 		goto out;
@@ -559,7 +671,8 @@
 	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
-	num_read = fuse_send_read(req, file, pos, count, NULL);
+	req->page_descs[0].length = count;
+	num_read = fuse_send_read(req, &io, pos, count, NULL);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 
@@ -645,6 +758,7 @@
 	struct fuse_req *req;
 	struct file *file;
 	struct inode *inode;
+	unsigned nr_pages;
 };
 
 static int fuse_readpages_fill(void *_data, struct page *page)
@@ -660,16 +774,30 @@
 	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
 	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
 	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
+		int nr_alloc = min_t(unsigned, data->nr_pages,
+				     FUSE_MAX_PAGES_PER_REQ);
 		fuse_send_readpages(req, data->file);
-		data->req = req = fuse_get_req(fc);
+		if (fc->async_read)
+			req = fuse_get_req_for_background(fc, nr_alloc);
+		else
+			req = fuse_get_req(fc, nr_alloc);
+
+		data->req = req;
 		if (IS_ERR(req)) {
 			unlock_page(page);
 			return PTR_ERR(req);
 		}
 	}
+
+	if (WARN_ON(req->num_pages >= req->max_pages)) {
+		fuse_put_request(fc, req);
+		return -EIO;
+	}
+
 	page_cache_get(page);
 	req->pages[req->num_pages] = page;
 	req->num_pages++;
+	data->nr_pages--;
 	return 0;
 }
 
@@ -680,6 +808,7 @@
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_fill_data data;
 	int err;
+	int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ);
 
 	err = -EIO;
 	if (is_bad_inode(inode))
@@ -687,7 +816,11 @@
 
 	data.file = file;
 	data.inode = inode;
-	data.req = fuse_get_req(fc);
+	if (fc->async_read)
+		data.req = fuse_get_req_for_background(fc, nr_alloc);
+	else
+		data.req = fuse_get_req(fc, nr_alloc);
+	data.nr_pages = nr_pages;
 	err = PTR_ERR(data.req);
 	if (IS_ERR(data.req))
 		goto out;
@@ -707,13 +840,16 @@
 				  unsigned long nr_segs, loff_t pos)
 {
 	struct inode *inode = iocb->ki_filp->f_mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
 
-	if (pos + iov_length(iov, nr_segs) > i_size_read(inode)) {
+	/*
+	 * In auto invalidate mode, always update attributes on read.
+	 * Otherwise, only update if we attempt to read past EOF (to ensure
+	 * i_size is up to date).
+	 */
+	if (fc->auto_inval_data ||
+	    (pos + iov_length(iov, nr_segs) > i_size_read(inode))) {
 		int err;
-		/*
-		 * If trying to read past EOF, make sure the i_size
-		 * attribute is up-to-date.
-		 */
 		err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
 		if (err)
 			return err;
@@ -745,9 +881,10 @@
 	req->out.args[0].value = outarg;
 }
 
-static size_t fuse_send_write(struct fuse_req *req, struct file *file,
+static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
 			      loff_t pos, size_t count, fl_owner_t owner)
 {
+	struct file *file = io->file;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
 	struct fuse_write_in *inarg = &req->misc.write.in;
@@ -758,6 +895,10 @@
 		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
 		inarg->lock_owner = fuse_lock_owner_id(fc, owner);
 	}
+
+	if (io->async)
+		return fuse_async_req_send(fc, req, count, io);
+
 	fuse_request_send(fc, req);
 	return req->misc.write.out.size;
 }
@@ -807,14 +948,14 @@
        */
        fuse_wait_on_page_writeback(inode, page->index);
 
-       req = fuse_get_req(fc);
+       req = fuse_get_req(fc, FUSE_MAX_PAGES_PER_REQ);
        if (IS_ERR(req))
                return PTR_ERR(req);
 
        req->in.argpages = 1;
        req->num_pages = 1;
        req->pages[0] = page;
-       req->page_offset = offset;
+       req->page_descs[0].offset = offset;
        nres = fuse_send_write(req, file, pos, count, NULL);
        err = req->out.h.error;
        fuse_put_request(fc, req);
@@ -846,7 +987,6 @@
 }
 #endif /* MY_ABC_HERE */
 
-
 static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
 				    struct inode *inode, loff_t pos,
 				    size_t count)
@@ -854,13 +994,14 @@
 	size_t res;
 	unsigned offset;
 	unsigned i;
+	struct fuse_io_priv io = { .async = 0, .file = file };
 
 	for (i = 0; i < req->num_pages; i++)
 		fuse_wait_on_page_writeback(inode, req->pages[i]->index);
 
-	res = fuse_send_write(req, file, pos, count, NULL);
+	res = fuse_send_write(req, &io, pos, count, NULL);
 
-	offset = req->page_offset;
+	offset = req->page_descs[0].offset;
 	count = res;
 	for (i = 0; i < req->num_pages; i++) {
 		struct page *page = req->pages[i];
@@ -891,7 +1032,7 @@
 	int err;
 
 	req->in.argpages = 1;
-	req->page_offset = offset;
+	req->page_descs[0].offset = offset;
 
 	do {
 		size_t tmp;
@@ -943,28 +1084,41 @@
 		if (!fc->big_writes)
 			break;
 	} while (iov_iter_count(ii) && count < fc->max_write &&
-		 req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0);
+		 req->num_pages < req->max_pages && offset == 0);
 
 	return count > 0 ? count : err;
 }
 
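+/*
+ * Number of page-cache pages touched by the byte range [pos, pos + len),
+ * capped at FUSE_MAX_PAGES_PER_REQ.  E.g., assuming 4K pages, pos = 1000
+ * and len = 5000 touch bytes 1000..5999, i.e. pages 0 and 1, so this
+ * returns 2.
+ */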
+static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
+{
+	return min_t(unsigned,
+		     ((pos + len - 1) >> PAGE_CACHE_SHIFT) -
+		     (pos >> PAGE_CACHE_SHIFT) + 1,
+		     FUSE_MAX_PAGES_PER_REQ);
+}
+
 static ssize_t fuse_perform_write(struct file *file,
 				  struct address_space *mapping,
 				  struct iov_iter *ii, loff_t pos)
 {
 	struct inode *inode = mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	int err = 0;
 	ssize_t res = 0;
 
 	if (is_bad_inode(inode))
 		return -EIO;
 
+	if (inode->i_size < pos + iov_iter_count(ii))
+		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
 	do {
 		struct fuse_req *req;
 		ssize_t count;
+		unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii));
 
-		req = fuse_get_req(fc);
+		req = fuse_get_req(fc, nr_pages);
 		if (IS_ERR(req)) {
 			err = PTR_ERR(req);
 			break;
@@ -994,6 +1148,7 @@
 	if (res > 0)
 		fuse_write_update_size(inode, pos);
 
+	clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 	fuse_invalidate_attr(inode);
 
 	return res > 0 ? res : err;
@@ -1005,19 +1160,24 @@
 	struct file *file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
 	size_t count = 0;
+	size_t ocount = 0;
 	ssize_t written = 0;
+	ssize_t written_buffered = 0;
 	struct inode *inode = mapping->host;
 	ssize_t err;
 	struct iov_iter i;
+	loff_t endbyte = 0;
 
 	WARN_ON(iocb->ki_pos != pos);
 
-	err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
+	ocount = 0;
+	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
 	if (err)
 		return err;
 
+	count = ocount;
+	sb_start_write(inode->i_sb);
 	mutex_lock(&inode->i_mutex);
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = mapping->backing_dev_info;
@@ -1033,82 +1193,173 @@
 	if (err)
 		goto out;
 
-	file_update_time(file);
+	err = file_update_time(file);
+	if (err)
+		goto out;
+
+	if (file->f_flags & O_DIRECT) {
+		written = generic_file_direct_write(iocb, iov, &nr_segs,
+						    pos, &iocb->ki_pos,
+						    count, ocount);
+		if (written < 0 || written == count)
+			goto out;
+
+		pos += written;
+		count -= written;
+
+		iov_iter_init(&i, iov, nr_segs, count, written);
+		written_buffered = fuse_perform_write(file, mapping, &i, pos);
+		if (written_buffered < 0) {
+			err = written_buffered;
+			goto out;
+		}
+		endbyte = pos + written_buffered - 1;
+
+		err = filemap_write_and_wait_range(file->f_mapping, pos,
+						   endbyte);
+		if (err)
+			goto out;
 
-	iov_iter_init(&i, iov, nr_segs, count, 0);
-	written = fuse_perform_write(file, mapping, &i, pos);
-	if (written >= 0)
-		iocb->ki_pos = pos + written;
+		invalidate_mapping_pages(file->f_mapping,
+					 pos >> PAGE_CACHE_SHIFT,
+					 endbyte >> PAGE_CACHE_SHIFT);
 
+		written += written_buffered;
+		iocb->ki_pos = pos + written_buffered;
+	} else {
+		iov_iter_init(&i, iov, nr_segs, count, 0);
+		written = fuse_perform_write(file, mapping, &i, pos);
+		if (written >= 0)
+			iocb->ki_pos = pos + written;
+	}
 out:
 	current->backing_dev_info = NULL;
 	mutex_unlock(&inode->i_mutex);
+	sb_end_write(inode->i_sb);
 
 	return written ? written : err;
 }
 
-static void fuse_release_user_pages(struct fuse_req *req, int write)
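+/*
+ * Set the length of nr_pages page descriptors, starting at 'index', to
+ * cover the remainder of their page; e.g., assuming 4K pages, a descriptor
+ * with offset 256 gets length 3840.
+ */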
+static inline void fuse_page_descs_length_init(struct fuse_req *req,
+		unsigned index, unsigned nr_pages)
 {
-	unsigned i;
+	int i;
 
-	for (i = 0; i < req->num_pages; i++) {
-		struct page *page = req->pages[i];
-		if (write)
-			set_page_dirty_lock(page);
-		put_page(page);
-	}
+	for (i = index; i < index + nr_pages; i++)
+		req->page_descs[i].length = PAGE_SIZE -
+			req->page_descs[i].offset;
+}
+
+static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
+{
+	return (unsigned long)ii->iov->iov_base + ii->iov_offset;
+}
+
+static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
+					size_t max_size)
+{
+	return min(iov_iter_single_seg_count(ii), max_size);
 }
 
-static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
+static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
 			       size_t *nbytesp, int write)
 {
-	size_t nbytes = *nbytesp;
-	unsigned long user_addr = (unsigned long) buf;
-	unsigned offset = user_addr & ~PAGE_MASK;
-	int npages;
+	size_t nbytes = 0;  /* # bytes already packed in req */
 
 	/* Special case for kernel I/O: can copy directly into the buffer */
 	if (segment_eq(get_fs(), KERNEL_DS)) {
+		unsigned long user_addr = fuse_get_user_addr(ii);
+		size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
+
 		if (write)
 			req->in.args[1].value = (void *) user_addr;
 		else
 			req->out.args[0].value = (void *) user_addr;
 
+		iov_iter_advance(ii, frag_size);
+		*nbytesp = frag_size;
 		return 0;
 	}
 
-	nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
-	npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
-	npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
-	if (npages < 0)
-		return npages;
+	while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
+		unsigned npages;
+		unsigned long user_addr = fuse_get_user_addr(ii);
+		unsigned offset = user_addr & ~PAGE_MASK;
+		size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes);
+		int ret;
+
+		unsigned n = req->max_pages - req->num_pages;
+		frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT);
+
+		npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		npages = clamp(npages, 1U, n);
+
+		ret = get_user_pages_fast(user_addr, npages, !write,
+					  &req->pages[req->num_pages]);
+		if (ret < 0)
+			return ret;
+
+		npages = ret;
+		frag_size = min_t(size_t, frag_size,
+				  (npages << PAGE_SHIFT) - offset);
+		iov_iter_advance(ii, frag_size);
+
+		req->page_descs[req->num_pages].offset = offset;
+		fuse_page_descs_length_init(req, req->num_pages, npages);
+
+		req->num_pages += npages;
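+		/*
+		 * Trim the last descriptor so the descriptors cover exactly
+		 * frag_size bytes.  E.g., assuming 4K pages, a 6000-byte
+		 * fragment starting at page offset 256 spans two pages with
+		 * initial lengths 3840 and 4096; the second is cut back by
+		 * 8192 - 256 - 6000 = 1936 bytes, leaving 6000 in total.
+		 */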
+		req->page_descs[req->num_pages - 1].length -=
+			(npages << PAGE_SHIFT) - offset - frag_size;
 
-	req->num_pages = npages;
-	req->page_offset = offset;
+		nbytes += frag_size;
+	}
 
 	if (write)
 		req->in.argpages = 1;
 	else
 		req->out.argpages = 1;
 
-	nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset;
-	*nbytesp = min(*nbytesp, nbytes);
+	*nbytesp = nbytes;
 
 	return 0;
 }
 
-ssize_t fuse_direct_io(struct file *file, const char __user *buf,
-		       size_t count, loff_t *ppos, int write)
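+/*
+ * Upper bound on the number of request pages needed for the remaining
+ * iovec segments, capped at FUSE_MAX_PAGES_PER_REQ.  E.g., assuming 4K
+ * pages, two page-aligned 5000-byte segments count as two pages each, so
+ * this returns 4.
+ */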
+static inline int fuse_iter_npages(const struct iov_iter *ii_p)
+{
+	struct iov_iter ii = *ii_p;
+	int npages = 0;
+
+	while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) {
+		unsigned long user_addr = fuse_get_user_addr(&ii);
+		unsigned offset = user_addr & ~PAGE_MASK;
+		size_t frag_size = iov_iter_single_seg_count(&ii);
+
+		npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+		iov_iter_advance(&ii, frag_size);
+	}
+
+	return min(npages, FUSE_MAX_PAGES_PER_REQ);
+}
+
+ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
+		       unsigned long nr_segs, size_t count, loff_t *ppos,
+		       int write)
 {
+	struct file *file = io->file;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
 	size_t nmax = write ? fc->max_write : fc->max_read;
 	loff_t pos = *ppos;
 	ssize_t res = 0;
 	struct fuse_req *req;
+	struct iov_iter ii;
+
+	iov_iter_init(&ii, iov, nr_segs, count, 0);
 
-	req = fuse_get_req(fc);
+	if (io->async)
+		req = fuse_get_req_for_background(fc, fuse_iter_npages(&ii));
+	else
+		req = fuse_get_req(fc, fuse_iter_npages(&ii));
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1116,18 +1367,19 @@
 		size_t nres;
 		fl_owner_t owner = current->files;
 		size_t nbytes = min(count, nmax);
-		int err = fuse_get_user_pages(req, buf, &nbytes, write);
+		int err = fuse_get_user_pages(req, &ii, &nbytes, write);
 		if (err) {
 			res = err;
 			break;
 		}
 
 		if (write)
-			nres = fuse_send_write(req, file, pos, nbytes, owner);
+			nres = fuse_send_write(req, io, pos, nbytes, owner);
 		else
-			nres = fuse_send_read(req, file, pos, nbytes, owner);
+			nres = fuse_send_read(req, io, pos, nbytes, owner);
 
-		fuse_release_user_pages(req, !write);
+		if (!io->async)
+			fuse_release_user_pages(req, !write);
 		if (req->out.h.error) {
 			if (!res)
 				res = req->out.h.error;
@@ -1139,12 +1391,15 @@
 		count -= nres;
 		res += nres;
 		pos += nres;
-		buf += nres;
 		if (nres != nbytes)
 			break;
 		if (count) {
 			fuse_put_request(fc, req);
-			req = fuse_get_req(fc);
+			if (io->async)
+				req = fuse_get_req_for_background(fc,
+					fuse_iter_npages(&ii));
+			else
+				req = fuse_get_req(fc, fuse_iter_npages(&ii));
 			if (IS_ERR(req))
 				break;
 		}
@@ -1158,16 +1413,45 @@
 }
 EXPORT_SYMBOL_GPL(fuse_direct_io);
 
-static ssize_t fuse_direct_read(struct file *file, char __user *buf,
-				     size_t count, loff_t *ppos)
+static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
+				  const struct iovec *iov,
+				  unsigned long nr_segs, loff_t *ppos,
+				  size_t count)
 {
 	ssize_t res;
+	struct file *file = io->file;
 	struct inode *inode = file->f_path.dentry->d_inode;
 
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	res = fuse_direct_io(file, buf, count, ppos, 0);
+	res = fuse_direct_io(io, iov, nr_segs, count, ppos, 0);
+
+	fuse_invalidate_attr(inode);
+
+	return res;
+}
+
+static ssize_t fuse_direct_read(struct file *file, char __user *buf,
+				     size_t count, loff_t *ppos)
+{
+	struct fuse_io_priv io = { .async = 0, .file = file };
+	struct iovec iov = { .iov_base = buf, .iov_len = count };
+	return __fuse_direct_read(&io, &iov, 1, ppos, count);
+}
+
+static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
+				   const struct iovec *iov,
+				   unsigned long nr_segs, loff_t *ppos)
+{
+	struct file *file = io->file;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	size_t count = iov_length(iov, nr_segs);
+	ssize_t res;
+
+	res = generic_write_checks(file, ppos, &count, 0);
+	if (!res)
+		res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1);
 
 	fuse_invalidate_attr(inode);
 
@@ -1177,24 +1461,21 @@
 static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
 				 size_t count, loff_t *ppos)
 {
+	struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count };
 	struct inode *inode = file->f_path.dentry->d_inode;
 	ssize_t res;
+	struct fuse_io_priv io = { .async = 0, .file = file };
 
 	if (is_bad_inode(inode))
 		return -EIO;
 
 	/* Don't allow parallel writes to the same file */
 	mutex_lock(&inode->i_mutex);
-	res = generic_write_checks(file, ppos, &count, 0);
-	if (!res) {
-		res = fuse_direct_io(file, buf, count, ppos, 1);
-		if (res > 0)
-			fuse_write_update_size(inode, *ppos);
-	}
+	res = __fuse_direct_write(&io, &iov, 1, ppos);
+	if (res > 0)
+		fuse_write_update_size(inode, *ppos);
 	mutex_unlock(&inode->i_mutex);
 
-	fuse_invalidate_attr(inode);
-
 	return res;
 }
 
@@ -1297,10 +1578,11 @@
 
 	set_page_writeback(page);
 
-	req = fuse_request_alloc_nofs();
+	req = fuse_request_alloc_nofs(1);
 	if (!req)
 		goto err;
 
+	req->background = 1; /* writeback always goes to bg_queue */
 	tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 	if (!tmp_page)
 		goto err_free;
@@ -1318,13 +1600,12 @@
 	req->in.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = tmp_page;
-	req->page_offset = 0;
+	req->page_descs[0].offset = 0;
 	req->end = fuse_writepage_end;
 	req->inode = inode;
 
 	inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK);
 	inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
-	end_page_writeback(page);
 
 	spin_lock(&fc->lock);
 	list_add(&req->writepages_entry, &fi->writepages);
@@ -1332,6 +1613,8 @@
 	fuse_flush_writepages(inode);
 	spin_unlock(&fc->lock);
 
+	end_page_writeback(page);
+
 	return 0;
 
 err_free:
@@ -1495,7 +1778,7 @@
 	struct fuse_lk_out outarg;
 	int err;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1530,7 +1813,7 @@
 	if (fl->fl_flags & FL_CLOSE)
 		return 0;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -1599,7 +1882,7 @@
 	if (!inode->i_sb->s_bdev || fc->no_bmap)
 		return 0;
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return 0;
 
@@ -1628,48 +1911,16 @@
 	loff_t retval;
 	struct inode *inode = file->f_path.dentry->d_inode;
 
-	mutex_lock(&inode->i_mutex);
-	if (origin != SEEK_CUR && origin != SEEK_SET) {
-		retval = fuse_update_attributes(inode, NULL, file, NULL);
-		if (retval)
-			goto exit;
-	}
+	/* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
+	if (origin == SEEK_CUR || origin == SEEK_SET)
+		return generic_file_llseek(file, offset, origin);
 
-	switch (origin) {
-	case SEEK_END:
-		offset += i_size_read(inode);
-		break;
-	case SEEK_CUR:
-		if (offset == 0) {
-			retval = file->f_pos;
-			goto exit;
-		}
-		offset += file->f_pos;
-		break;
-	case SEEK_DATA:
-		if (offset >= i_size_read(inode)) {
-			retval = -ENXIO;
-			goto exit;
-		}
-		break;
-	case SEEK_HOLE:
-		if (offset >= i_size_read(inode)) {
-			retval = -ENXIO;
-			goto exit;
-		}
-		offset = i_size_read(inode);
-		break;
-	}
-	retval = -EINVAL;
-	if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
-		if (offset != file->f_pos) {
-			file->f_pos = offset;
-			file->f_version = 0;
-		}
-		retval = offset;
-	}
-exit:
+	mutex_lock(&inode->i_mutex);
+	retval = fuse_update_attributes(inode, NULL, file, NULL);
+	if (!retval)
+		retval = generic_file_llseek(file, offset, origin);
 	mutex_unlock(&inode->i_mutex);
+
 	return retval;
 }
 
@@ -1881,7 +2132,7 @@
 	BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
 
 	err = -ENOMEM;
-	pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL);
+	pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
 	iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
 	if (!pages || !iov_page)
 		goto out;
@@ -1929,7 +2180,7 @@
 		num_pages++;
 	}
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req(fc, num_pages);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		req = NULL;
@@ -1937,6 +2188,7 @@
 	}
 	memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
 	req->num_pages = num_pages;
+	fuse_page_descs_length_init(req, 0, req->num_pages);
 
 	/* okay, let's send it to the client */
 	req->in.h.opcode = FUSE_IOCTL;
@@ -1992,11 +2244,11 @@
 		    in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
 			goto out;
 
-		vaddr = kmap_atomic(pages[0], KM_USER0);
+		vaddr = kmap_atomic(pages[0]);
 		err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
 					    transferred, in_iovs + out_iovs,
 					    (flags & FUSE_IOCTL_COMPAT) != 0);
-		kunmap_atomic(vaddr, KM_USER0);
+		kunmap_atomic(vaddr);
 		if (err)
 			goto out;
 
@@ -2031,13 +2283,13 @@
 }
 EXPORT_SYMBOL_GPL(fuse_do_ioctl);
 
-static long fuse_file_ioctl_common(struct file *file, unsigned int cmd,
-				   unsigned long arg, unsigned int flags)
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+		       unsigned long arg, unsigned int flags)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
-	if (!fuse_allow_task(fc, current))
+	if (!fuse_allow_current_process(fc))
 		return -EACCES;
 
 	if (is_bad_inode(inode))
@@ -2049,13 +2301,13 @@
 static long fuse_file_ioctl(struct file *file, unsigned int cmd,
 			    unsigned long arg)
 {
-	return fuse_file_ioctl_common(file, cmd, arg, 0);
+	return fuse_ioctl_common(file, cmd, arg, 0);
 }
 
 static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
 				   unsigned long arg)
 {
-	return fuse_file_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
+	return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
 }
 
 /*
@@ -2122,6 +2374,7 @@
 		return DEFAULT_POLLMASK;
 
 	poll_wait(file, &ff->poll_wait, wait);
+	inarg.events = (__u32)poll_requested_events(wait);
 
 	/*
 	 * Ask for notification iff there's someone waiting for it.
@@ -2132,7 +2385,7 @@
 		fuse_register_polled_file(fc, ff);
 	}
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return POLLERR;
 
@@ -2182,6 +2435,185 @@
 	return 0;
 }
 
+static void fuse_do_truncate(struct file *file)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct iattr attr;
+
+	attr.ia_valid = ATTR_SIZE;
+	attr.ia_size = i_size_read(inode);
+
+	attr.ia_file = file;
+	attr.ia_valid |= ATTR_FILE;
+
+	fuse_do_setattr(inode, &attr, file);
+}
+
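+/*
+ * Round up to a whole number of maximally-sized requests; e.g., assuming
+ * 4K pages and FUSE_MAX_PAGES_PER_REQ == 32, this rounds 'off' up to the
+ * next 128K boundary.
+ */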
+static inline loff_t fuse_round_up(loff_t off)
+{
+	return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+}
+
+static ssize_t
+fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
+			loff_t offset, unsigned long nr_segs)
+{
+	ssize_t ret = 0;
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
+	bool async_dio = ff->fc->async_dio;
+	loff_t pos = 0;
+	struct inode *inode;
+	loff_t i_size;
+	size_t count = iov_length(iov, nr_segs);
+	struct fuse_io_priv *io;
+
+	pos = offset;
+	inode = file->f_mapping->host;
+	i_size = i_size_read(inode);
+
+	/* optimization for short read */
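+	/*
+	 * E.g., assuming 4K pages and 32 pages per request: with i_size = 10K,
+	 * offset = 4K and count = 1M, count is clipped to
+	 * round_up(6K, 128K) = 128K instead of reading far past EOF.
+	 */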
+	if (async_dio && rw != WRITE && offset + count > i_size) {
+		if (offset >= i_size)
+			return 0;
+		count = min_t(loff_t, count, fuse_round_up(i_size - offset));
+	}
+
+	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
+	if (!io)
+		return -ENOMEM;
+	spin_lock_init(&io->lock);
+	io->reqs = 1;
+	io->bytes = -1;
+	io->size = 0;
+	io->offset = offset;
+	io->write = (rw == WRITE);
+	io->err = 0;
+	io->file = file;
+	/*
+	 * By default, we want to optimize all I/Os with async request
+	 * submission to the client filesystem if supported.
+	 */
+	io->async = async_dio;
+	io->iocb = iocb;
+
+	/*
+	 * We cannot asynchronously extend the size of a file. We have no method
+	 * to wait on real async I/O requests, so we must submit this request
+	 * synchronously.
+	 */
+	if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE)
+		io->async = false;
+
+	if (rw == WRITE)
+		ret = __fuse_direct_write(io, iov, nr_segs, &pos);
+	else
+		ret = __fuse_direct_read(io, iov, nr_segs, &pos, count);
+
+	if (io->async) {
+		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
+
+		/* we have a non-extending, async request, so return */
+		if (!is_sync_kiocb(iocb))
+			return -EIOCBQUEUED;
+
+		ret = wait_on_sync_kiocb(iocb);
+	} else {
+		kfree(io);
+	}
+
+	if (rw == WRITE) {
+		if (ret > 0)
+			fuse_write_update_size(inode, pos);
+		else if (ret < 0 && offset + count > i_size)
+			fuse_do_truncate(file);
+	}
+
+	return ret;
+}
+
+static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
+				loff_t length)
+{
+	struct fuse_file *ff = file->private_data;
+#ifdef MY_ABC_HERE
+	struct inode *inode = file->f_path.dentry->d_inode;
+#else
+	struct inode *inode = file->f_inode;
+#endif
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = ff->fc;
+	struct fuse_req *req;
+	struct fuse_fallocate_in inarg = {
+		.fh = ff->fh,
+		.offset = offset,
+		.length = length,
+		.mode = mode
+	};
+	int err;
+	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
+			   (mode & FALLOC_FL_PUNCH_HOLE);
+
+	if (fc->no_fallocate)
+		return -EOPNOTSUPP;
+
+	if (lock_inode) {
+		mutex_lock(&inode->i_mutex);
+		if (mode & FALLOC_FL_PUNCH_HOLE) {
+			loff_t endbyte = offset + length - 1;
+			err = filemap_write_and_wait_range(inode->i_mapping,
+							   offset, endbyte);
+			if (err)
+				goto out;
+
+			fuse_sync_writes(inode);
+		}
+	}
+
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+	req = fuse_get_req_nopages(fc);
+	if (IS_ERR(req)) {
+		err = PTR_ERR(req);
+		goto out;
+	}
+
+	req->in.h.opcode = FUSE_FALLOCATE;
+	req->in.h.nodeid = ff->nodeid;
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].value = &inarg;
+	fuse_request_send(fc, req);
+	err = req->out.h.error;
+	if (err == -ENOSYS) {
+		fc->no_fallocate = 1;
+		err = -EOPNOTSUPP;
+	}
+	fuse_put_request(fc, req);
+
+	if (err)
+		goto out;
+
+	/* we could have extended the file */
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		fuse_write_update_size(inode, offset + length);
+
+	if (mode & FALLOC_FL_PUNCH_HOLE)
+		truncate_pagecache_range(inode, offset, offset + length - 1);
+
+	fuse_invalidate_attr(inode);
+
+out:
+	if (!(mode & FALLOC_FL_KEEP_SIZE))
+		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+	if (lock_inode)
+		mutex_unlock(&inode->i_mutex);
+
+	return err;
+}
+
 static const struct file_operations fuse_file_operations = {
 	.llseek		= fuse_file_llseek,
 	.read		= do_sync_read,
@@ -2199,6 +2631,7 @@
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
+	.fallocate	= fuse_file_fallocate,
 };
 
 static const struct file_operations fuse_direct_io_file_operations = {
@@ -2215,6 +2648,7 @@
 	.unlocked_ioctl	= fuse_file_ioctl,
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
+	.fallocate	= fuse_file_fallocate,
 	/* no splice_read */
 };
 
@@ -2229,6 +2663,7 @@
 	.readpages	= fuse_readpages,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 	.bmap		= fuse_bmap,
+	.direct_IO	= fuse_direct_IO,
 };
 
 void fuse_init_file_inode(struct inode *inode)
diff -ur a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
--- a/fs/fuse/fuse_i.h	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/fuse/fuse_i.h	2014-02-17 11:57:00.000000000 +0100
@@ -44,6 +44,13 @@
     doing the mount will be allowed to access the filesystem */
 #define FUSE_ALLOW_OTHER         (1 << 1)
 
+#ifdef MY_ABC_HERE
+#define SYNOMETA_XATTR_MNT_OPT "synometa_xattr"
+#endif
+
+/** Number of page pointers embedded in fuse_req */
+#define FUSE_REQ_INLINE_PAGES 1
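+/* Requests that need at most this many pages can use the inline_pages and
+ * inline_page_descs arrays embedded in struct fuse_req, so small requests
+ * avoid a separate allocation for their page vectors. */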
+
 /** List of active connections */
 extern struct list_head fuse_conn_list;
 
@@ -80,7 +87,7 @@
 
 	/** The sticky bit in inode->i_mode may have been removed, so
 	    preserve the original mode */
-	mode_t orig_i_mode;
+	umode_t orig_i_mode;
 
 	/** 64 bit inode number */
 	u64 orig_ino;
@@ -103,6 +110,19 @@
 
 	/** List of writepage requests (pending or sent) */
 	struct list_head writepages;
+
+	/** Miscellaneous bits describing inode state */
+	unsigned long state;
+};
+
+/** FUSE inode state bits */
+enum {
+	/** Advise readdirplus  */
+	FUSE_I_ADVISE_RDPLUS,
+	/** Initialized with readdirplus */
+	FUSE_I_INIT_RDPLUS,
+	/** An operation changing file size is in progress  */
+	FUSE_I_SIZE_UNSTABLE,
 };
 
 struct fuse_conn;
@@ -200,6 +220,12 @@
 	struct fuse_arg args[3];
 };
 
+/** FUSE page descriptor */
+struct fuse_page_desc {
+	unsigned int length;
+	unsigned int offset;
+};
+
 /** The request state */
 enum fuse_req_state {
 	FUSE_REQ_INIT = 0,
@@ -210,6 +236,20 @@
 	FUSE_REQ_FINISHED
 };
 
+/** The request IO state (for asynchronous processing) */
+struct fuse_io_priv {
+	int async;		/* asynchronous request submission enabled? */
+	spinlock_t lock;	/* protects the counters and error state below */
+	unsigned reqs;		/* number of outstanding fuse requests */
+	ssize_t bytes;		/* position of earliest short completion, or -1 */
+	size_t size;		/* total bytes submitted so far */
+	__u64 offset;		/* file offset where this IO started */
+	bool write;		/* is this a write? */
+	int err;		/* first error encountered, if any */
+	struct kiocb *iocb;
+	struct file *file;
+};
+
 /**
  * A request to the client
  */
@@ -291,20 +331,32 @@
 	} misc;
 
 	/** page vector */
-	struct page *pages[FUSE_MAX_PAGES_PER_REQ];
+	struct page **pages;
+
+	/** page-descriptor vector */
+	struct fuse_page_desc *page_descs;
+
+	/** size of the 'pages' array */
+	unsigned max_pages;
+
+	/** inline page vector */
+	struct page *inline_pages[FUSE_REQ_INLINE_PAGES];
+
+	/** inline page-descriptor vector */
+	struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES];
 
 	/** number of pages in vector */
 	unsigned num_pages;
 
-	/** offset of data on first page */
-	unsigned page_offset;
-
 	/** File used in the request (or NULL) */
 	struct fuse_file *ff;
 
 	/** Inode used in the request or NULL */
 	struct inode *inode;
 
+	/** AIO control block */
+	struct fuse_io_priv *io;
+
 	/** Link on fi->writepages */
 	struct list_head writepages_entry;
 
@@ -390,6 +442,10 @@
 	/** Batching of FORGET requests (positive indicates FORGET batch) */
 	int forget_batch;
 
+	/** Flag indicating that INIT reply has been received. Allocation of
+	 * any fuse request is suspended until the flag is set */
+	int initialized;
+
 	/** Flag indicating if connection is blocked.  This will be
 	    the case before the INIT reply is received, and if there
 	    are too many outstanding background requests */
@@ -481,6 +537,21 @@
 	/** Are BSD file locking primitives not implemented by fs? */
 	unsigned no_flock:1;
 
+	/** Is fallocate not implemented by fs? */
+	unsigned no_fallocate:1;
+
+	/** Use enhanced/automatic page cache invalidation. */
+	unsigned auto_inval_data:1;
+
+	/** Does the filesystem support readdirplus? */
+	unsigned do_readdirplus:1;
+
+	/** Does the filesystem want adaptive readdirplus? */
+	unsigned readdirplus_auto:1;
+
+	/** Does the filesystem support asynchronous direct-IO submission? */
+	unsigned async_dio:1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
@@ -572,6 +643,9 @@
 
 struct fuse_forget_link *fuse_alloc_forget(void);
 
+/* Used by READDIRPLUS */
+void fuse_force_forget(struct file *file, u64 nodeid);
+
 /**
  * Initialize READ or READDIR request
  */
@@ -652,9 +726,9 @@
 /**
  * Allocate a request
  */
-struct fuse_req *fuse_request_alloc(void);
+struct fuse_req *fuse_request_alloc(unsigned npages);
 
-struct fuse_req *fuse_request_alloc_nofs(void);
+struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
 
 /**
  * Free a request
@@ -662,14 +736,32 @@
 void fuse_request_free(struct fuse_req *req);
 
 /**
- * Get a request, may fail with -ENOMEM
+ * Get a request, may fail with -ENOMEM,
+ * caller should specify # elements in req->pages[] explicitly
+ */
+struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages);
+struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
+					     unsigned npages);
+
+/*
+ * Increment reference count on request
+ */
+void __fuse_get_request(struct fuse_req *req);
+
+/**
+ * Get a request, may fail with -ENOMEM,
+ * useful for callers that don't use req->pages[]
  */
-struct fuse_req *fuse_get_req(struct fuse_conn *fc);
+static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc)
+{
+	return fuse_get_req(fc, 0);
+}
 
 /**
  * Gets a requests for a file operation, always succeeds
  */
-struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file);
+struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
+					     struct file *file);
 
 /**
  * Decrement reference count of a request.  If count goes to zero free
@@ -733,9 +825,9 @@
 int fuse_valid_type(int m);
 
 /**
- * Is task allowed to perform filesystem operation?
+ * Is current process allowed to perform filesystem operation?
  */
-int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task);
+int fuse_allow_current_process(struct fuse_conn *fc);
 
 u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
 
@@ -758,19 +850,31 @@
 /**
  * File-system tells the kernel to invalidate parent attributes and
  * the dentry matching parent/name.
+ *
+ * If the child_nodeid is non-zero and:
+ *    - matches the inode number for the dentry matching parent/name,
+ *    - is not a mount point
+ *    - is a file or an empty directory
+ * then the dentry is unhashed (d_delete()).
  */
 int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
-			     struct qstr *name);
+			     u64 child_nodeid, struct qstr *name);
 
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir);
-ssize_t fuse_direct_io(struct file *file, const char __user *buf,
-		       size_t count, loff_t *ppos, int write);
+ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
+		       unsigned long nr_segs, size_t count, loff_t *ppos,
+		       int write);
 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 		   unsigned int flags);
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+		       unsigned long arg, unsigned int flags);
 unsigned fuse_file_poll(struct file *file, poll_table *wait);
 int fuse_dev_release(struct inode *inode, struct file *file);
 
 void fuse_write_update_size(struct inode *inode, loff_t pos);
 
+int fuse_do_setattr(struct inode *inode, struct iattr *attr,
+		    struct file *file);
+
 #endif /* _FS_FUSE_I_H */
diff -ur a/fs/fuse/inode.c b/fs/fuse/inode.c
--- a/fs/fuse/inode.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/fuse/inode.c	2014-02-17 11:57:00.000000000 +0100
@@ -20,6 +20,9 @@
 #include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/exportfs.h>
+#ifdef CONFIG_FS_SYNO_ACL
+#include <linux/syno_acl.h>
+#endif
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -92,6 +95,7 @@
 	fi->attr_version = 0;
 	fi->writectr = 0;
 	fi->orig_ino = 0;
+	fi->state = 0;
 	INIT_LIST_HEAD(&fi->write_files);
 	INIT_LIST_HEAD(&fi->queued_writes);
 	INIT_LIST_HEAD(&fi->writepages);
@@ -197,22 +201,44 @@
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	loff_t oldsize;
+	struct timespec old_mtime;
 
 	spin_lock(&fc->lock);
-	if (attr_version != 0 && fi->attr_version > attr_version) {
+	if ((attr_version != 0 && fi->attr_version > attr_version) ||
+	    test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
 		spin_unlock(&fc->lock);
 		return;
 	}
 
+	old_mtime = inode->i_mtime;
 	fuse_change_attributes_common(inode, attr, attr_valid);
 
 	oldsize = inode->i_size;
 	i_size_write(inode, attr->size);
 	spin_unlock(&fc->lock);
 
-	if (S_ISREG(inode->i_mode) && oldsize != attr->size) {
-		truncate_pagecache(inode, oldsize, attr->size);
-		invalidate_inode_pages2(inode->i_mapping);
+	if (S_ISREG(inode->i_mode)) {
+		bool inval = false;
+
+		if (oldsize != attr->size) {
+			truncate_pagecache(inode, oldsize, attr->size);
+			inval = true;
+		} else if (fc->auto_inval_data) {
+			struct timespec new_mtime = {
+				.tv_sec = attr->mtime,
+				.tv_nsec = attr->mtimensec,
+			};
+
+			/*
+			 * Auto inval mode also checks and invalidates if mtime
+			 * has changed.
+			 */
+			if (!timespec_equal(&old_mtime, &new_mtime))
+				inval = true;
+		}
+
+		if (inval)
+			invalidate_inode_pages2(inode->i_mapping);
 	}
 }
 
@@ -325,6 +351,7 @@
 		fc->destroy_req = NULL;
 		req->in.h.opcode = FUSE_DESTROY;
 		req->force = 1;
+		req->background = 0;
 		fuse_request_send(fc, req);
 		fuse_put_request(fc, req);
 	}
@@ -341,17 +368,13 @@
 	spin_lock(&fc->lock);
 	fc->connected = 0;
 	fc->blocked = 0;
+	fc->initialized = 1;
 	spin_unlock(&fc->lock);
 	/* Flush all readers on this fs */
 	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	wake_up_all(&fc->waitq);
 	wake_up_all(&fc->blocked_waitq);
 	wake_up_all(&fc->reserved_req_waitq);
-	mutex_lock(&fuse_mutex);
-	list_del(&fc->entry);
-	fuse_ctl_remove_conn(fc);
-	mutex_unlock(&fuse_mutex);
-	fuse_bdi_destroy(fc);
 }
 EXPORT_SYMBOL_GPL(fuse_conn_kill);
 
@@ -360,7 +383,14 @@
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
 	fuse_send_destroy(fc);
+
 	fuse_conn_kill(fc);
+	mutex_lock(&fuse_mutex);
+	list_del(&fc->entry);
+	fuse_ctl_remove_conn(fc);
+	mutex_unlock(&fuse_mutex);
+	fuse_bdi_destroy(fc);
+
 	fuse_conn_put(fc);
 }
 
@@ -386,12 +416,12 @@
 	struct fuse_statfs_out outarg;
 	int err;
 
-	if (!fuse_allow_task(fc, current)) {
+	if (!fuse_allow_current_process(fc)) {
 		buf->f_type = FUSE_SUPER_MAGIC;
 		return 0;
 	}
 
-	req = fuse_get_req(fc);
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
@@ -420,6 +450,12 @@
 	OPT_ALLOW_OTHER,
 	OPT_MAX_READ,
 	OPT_BLKSIZE,
+#ifdef CONFIG_FS_SYNO_ACL
+	OPT_SYNOACL,
+#endif
+#ifdef MY_ABC_HERE
+	OPT_SYNOMETA_XATTR,
+#endif
 	OPT_ERR
 };
 
@@ -432,10 +468,20 @@
 	{OPT_ALLOW_OTHER,		"allow_other"},
 	{OPT_MAX_READ,			"max_read=%u"},
 	{OPT_BLKSIZE,			"blksize=%u"},
+#ifdef CONFIG_FS_SYNO_ACL
+	{OPT_SYNOACL, 			SYNO_ACL_MNT_OPT},
+#endif
+#ifdef MY_ABC_HERE
+	{OPT_SYNOMETA_XATTR, 		SYNOMETA_XATTR_MNT_OPT},
+#endif
 	{OPT_ERR,			NULL}
 };
 
+#if defined(CONFIG_FS_SYNO_ACL) || defined(MY_ABC_HERE)
+static int parse_fuse_opt(char *opt, struct super_block *sb, struct fuse_mount_data *d, int is_bdev)
+#else
 static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
+#endif
 {
 	char *p;
 	memset(d, 0, sizeof(struct fuse_mount_data));
@@ -501,6 +547,16 @@
 			d->blksize = value;
 			break;
 
+#ifdef CONFIG_FS_SYNO_ACL
+		case OPT_SYNOACL:
+			sb->s_flags |= MS_SYNOACL;
+			break;
+#endif
+#ifdef MY_ABC_HERE
+		case OPT_SYNOMETA_XATTR:
+			sb->s_syno_opt |= SYNO_MS_META_XATTR;
+			break;
+#endif //MY_ABC_HERE
 		default:
 			return 0;
 		}
@@ -528,6 +584,14 @@
 	if (mnt->mnt_sb->s_bdev &&
 	    mnt->mnt_sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
 		seq_printf(m, ",blksize=%lu", mnt->mnt_sb->s_blocksize);
+#ifdef CONFIG_FS_SYNO_ACL
+	if (mnt->mnt_sb->s_flags & MS_SYNOACL)
+		seq_puts(m, ","SYNO_ACL_MNT_OPT);
+#endif
+#ifdef MY_ABC_HERE
+	if (mnt->mnt_sb->s_syno_opt & SYNO_MS_META_XATTR)
+		seq_puts(m, ","SYNOMETA_XATTR_MNT_OPT);
+#endif
 	return 0;
 }
 
@@ -554,7 +618,8 @@
 	fc->khctr = 0;
 	fc->polled_files = RB_ROOT;
 	fc->reqctr = 0;
-	fc->blocked = 1;
+	fc->blocked = 0;
+	fc->initialized = 0;
 	fc->attr_version = 1;
 	get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
 }
@@ -842,6 +907,15 @@
 				fc->big_writes = 1;
 			if (arg->flags & FUSE_DONT_MASK)
 				fc->dont_mask = 1;
+			if (arg->flags & FUSE_AUTO_INVAL_DATA)
+				fc->auto_inval_data = 1;
+			if (arg->flags & FUSE_DO_READDIRPLUS) {
+				fc->do_readdirplus = 1;
+				if (arg->flags & FUSE_READDIRPLUS_AUTO)
+					fc->readdirplus_auto = 1;
+			}
+			if (arg->flags & FUSE_ASYNC_DIO)
+				fc->async_dio = 1;
 		} else {
 			ra_pages = fc->max_read / PAGE_CACHE_SIZE;
 			fc->no_lock = 1;
@@ -854,7 +928,7 @@
 		fc->max_write = max_t(unsigned, 4096, fc->max_write);
 		fc->conn_init = 1;
 	}
-	fc->blocked = 0;
+	fc->initialized = 1;
 	wake_up_all(&fc->blocked_waitq);
 }
 
@@ -867,7 +941,9 @@
 	arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
 	arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
 		FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
-		FUSE_FLOCK_LOCKS;
+		FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
+		FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
+		FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO;
 	req->in.h.opcode = FUSE_INIT;
 	req->in.numargs = 1;
 	req->in.args[0].size = sizeof(*arg);
@@ -947,7 +1023,11 @@
 
 	sb->s_flags &= ~MS_NOSEC;
 
+#if defined(CONFIG_FS_SYNO_ACL) || defined(MY_ABC_HERE)
+	if (!parse_fuse_opt((char *) data, sb, &d, is_bdev))
+#else
 	if (!parse_fuse_opt((char *) data, &d, is_bdev))
+#endif
 		goto err;
 
 	if (is_bdev) {
@@ -963,6 +1043,7 @@
 	sb->s_magic = FUSE_SUPER_MAGIC;
 	sb->s_op = &fuse_super_operations;
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_time_gran = 1;
 	sb->s_export_op = &fuse_export_operations;
 
 	file = fget(d.fd);
@@ -989,10 +1070,20 @@
 	sb->s_bdi = &fc->bdi;
 
 	/* Handle umasking inside the fuse code */
+#ifdef CONFIG_FS_SYNO_ACL
+	if (sb->s_flags & MS_SYNOACL) {
+		int st = SYNOACLModuleStatusGet("synoacl_vfs");
+		if (MODULE_STATE_LIVE != st) {
+			sb->s_flags &= ~MS_SYNOACL;
+			printk(KERN_ERR "synoacl module has not been loaded. Unable to mount with synoacl, vfs_mod status=%d\n", st);
+		} else
+			SYNOACLModuleGet("synoacl_vfs");
+	}
+#else
 	if (sb->s_flags & MS_POSIXACL)
 		fc->dont_mask = 1;
 	sb->s_flags |= MS_POSIXACL;
-
+#endif
 	fc->release = fuse_free_conn;
 	fc->flags = d.flags;
 	fc->user_id = d.user_id;
@@ -1015,12 +1106,13 @@
 	/* only now - we want root dentry with NULL ->d_op */
 	sb->s_d_op = &fuse_dentry_operations;
 
-	init_req = fuse_request_alloc();
+	init_req = fuse_request_alloc(0);
 	if (!init_req)
 		goto err_put_root;
+	init_req->background = 1;
 
 	if (is_bdev) {
-		fc->destroy_req = fuse_request_alloc();
+		fc->destroy_req = fuse_request_alloc(0);
 		if (!fc->destroy_req)
 			goto err_free_init_req;
 	}
@@ -1082,6 +1174,11 @@
 		up_write(&fc->killsb);
 	}
 
+#ifdef CONFIG_FS_SYNO_ACL
+	if (MS_SYNOACL & sb->s_flags) {
+		SYNOACLModulePut("synoacl_vfs");
+	}
+#endif
 	kill_anon_super(sb);
 }
 
@@ -1111,6 +1208,12 @@
 		up_write(&fc->killsb);
 	}
 
+#ifdef CONFIG_FS_SYNO_ACL
+	if (MS_SYNOACL & sb->s_flags) {
+		SYNOACLModulePut("synoacl_vfs");
+	}
+#endif
+
 	kill_block_super(sb);
 }
 
@@ -1183,6 +1286,12 @@
 {
 	unregister_filesystem(&fuse_fs_type);
 	unregister_fuseblk();
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(fuse_inode_cachep);
 }
 
diff -ur a/fs/fuse/Kconfig b/fs/fuse/Kconfig
--- a/fs/fuse/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/fs/fuse/Kconfig	2014-01-21 09:37:25.000000000 +0100
@@ -4,12 +4,24 @@
 	  With FUSE it is possible to implement a fully functional filesystem
 	  in a userspace program.
 
-	  There's also companion library: libfuse.  This library along with
-	  utilities is available from the FUSE homepage:
+	  There's also a companion library: libfuse2.  This library is available
+	  from the FUSE homepage:
 	  <http://fuse.sourceforge.net/>
+	  although chances are your distribution already has that library
+	  installed if you've installed the "fuse" package itself.
 
 	  See <file:Documentation/filesystems/fuse.txt> for more information.
 	  See <file:Documentation/Changes> for needed library/utility version.
 
 	  If you want to develop a userspace FS, or if you want to use
 	  a filesystem based on FUSE, answer Y or M.
+
+config CUSE
+	tristate "Character device in Userspace support"
+	depends on FUSE_FS
+	help
+	  This FUSE extension allows character devices to be
+	  implemented in userspace.
+
+	  If you want to develop or use a userspace character device
+	  based on CUSE, answer Y or M.
diff -ur a/fs/gfs2/file.c b/fs/gfs2/file.c
--- a/fs/gfs2/file.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/gfs2/file.c	2014-02-17 11:57:00.000000000 +0100
@@ -369,11 +369,7 @@
 	loff_t size;
 	int ret;
 
-	/* Wait if fs is frozen. This is racy so we check again later on
-	 * and retry if the fs has been frozen after the page lock has
-	 * been acquired
-	 */
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+	sb_start_pagefault(inode->i_sb);
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
@@ -455,14 +451,9 @@
 	gfs2_holder_uninit(&gh);
 	if (ret == 0) {
 		set_page_dirty(page);
-		/* This check must be post dropping of transaction lock */
-		if (inode->i_sb->s_frozen == SB_UNFROZEN) {
-			wait_on_page_writeback(page);
-		} else {
-			ret = -EAGAIN;
-			unlock_page(page);
-		}
+		wait_on_page_writeback(page);
 	}
+	sb_end_pagefault(inode->i_sb);
 	return block_page_mkwrite_return(ret);
 }
 
diff -ur a/fs/gfs2/inode.c b/fs/gfs2/inode.c
--- a/fs/gfs2/inode.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/gfs2/inode.c	2014-02-17 11:57:00.000000000 +0100
@@ -333,7 +333,7 @@
  */
 
 static int create_ok(struct gfs2_inode *dip, const struct qstr *name,
-		     unsigned int mode)
+		     umode_t mode)
 {
 	int error;
 
@@ -364,7 +364,7 @@
 	return 0;
 }
 
-static void munge_mode_uid_gid(struct gfs2_inode *dip, unsigned int *mode,
+static void munge_mode_uid_gid(struct gfs2_inode *dip, umode_t *mode,
 			       unsigned int *uid, unsigned int *gid)
 {
 	if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir &&
@@ -447,7 +447,7 @@
  */
 
 static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
-			const struct gfs2_inum_host *inum, unsigned int mode,
+			const struct gfs2_inum_host *inum, umode_t mode,
 			unsigned int uid, unsigned int gid,
 			const u64 *generation, dev_t dev, const char *symname,
 			unsigned size, struct buffer_head **bhp)
@@ -516,7 +516,7 @@
 }
 
 static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
-		       unsigned int mode, const struct gfs2_inum_host *inum,
+		       umode_t mode, const struct gfs2_inum_host *inum,
 		       const u64 *generation, dev_t dev, const char *symname,
 		       unsigned int size, struct buffer_head **bhp)
 {
@@ -659,7 +659,7 @@
  */
 
 static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
-			     unsigned int mode, dev_t dev, const char *symname,
+			     umode_t mode, dev_t dev, const char *symname,
 			     unsigned int size, int excl)
 {
 	const struct qstr *name = &dentry->d_name;
diff -ur a/fs/gfs2/log.c b/fs/gfs2/log.c
--- a/fs/gfs2/log.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/gfs2/log.c	2014-02-17 11:57:00.000000000 +0100
@@ -951,8 +951,8 @@
 			wake_up(&sdp->sd_log_waitq);
 
 		t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
-		if (freezing(current))
-			refrigerator();
+
+		try_to_freeze();
 
 		do {
 			prepare_to_wait(&sdp->sd_logd_waitq, &wait,
diff -ur a/fs/gfs2/quota.c b/fs/gfs2/quota.c
--- a/fs/gfs2/quota.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/gfs2/quota.c	2014-02-17 11:57:00.000000000 +0100
@@ -1427,8 +1427,8 @@
 		/* Check for & recover partially truncated inodes */
 		quotad_check_trunc_list(sdp);
 
-		if (freezing(current))
-			refrigerator();
+		try_to_freeze();
+
 		t = min(quotad_timeo, statfs_timeo);
 
 		prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_INTERRUPTIBLE);
diff -ur a/fs/gfs2/trans.c b/fs/gfs2/trans.c
--- a/fs/gfs2/trans.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/gfs2/trans.c	2014-02-17 11:57:00.000000000 +0100
@@ -52,6 +52,7 @@
 						   sizeof(u64));
 	INIT_LIST_HEAD(&tr->tr_list_buf);
 
+	sb_start_intwrite(sdp->sd_vfs);
 	gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
 
 	error = gfs2_glock_nq(&tr->tr_t_gh);
@@ -70,6 +71,7 @@
 	gfs2_glock_dq(&tr->tr_t_gh);
 
 fail_holder_uninit:
+	sb_end_intwrite(sdp->sd_vfs);
 	gfs2_holder_uninit(&tr->tr_t_gh);
 	kfree(tr);
 
@@ -107,6 +109,7 @@
 			gfs2_holder_uninit(&tr->tr_t_gh);
 			kfree(tr);
 		}
+		sb_end_intwrite(sdp->sd_vfs);
 		return;
 	}
 
@@ -130,6 +133,7 @@
 
 	if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
 		gfs2_log_flush(sdp, NULL);
+	sb_end_intwrite(sdp->sd_vfs);
 }
 
 /**
diff -ur a/fs/hfs/super.c b/fs/hfs/super.c
--- a/fs/hfs/super.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/hfs/super.c	2014-02-17 11:56:58.000000000 +0100
@@ -485,6 +485,12 @@
 static void __exit exit_hfs_fs(void)
 {
 	unregister_filesystem(&hfs_fs_type);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(hfs_inode_cachep);
 }
 
diff -ur a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
--- a/fs/hfsplus/attributes.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/hfsplus/attributes.c	2014-02-17 11:56:58.000000000 +0100
@@ -10,25 +10,87 @@
 #include "hfsplus_raw.h"
 
 static struct kmem_cache *hfsplus_attr_tree_cachep;
+#ifdef MY_ABC_HERE
+static size_t hfsplus_attr_cached_size = sizeof(hfsplus_attr_entry);
+#endif
 
 int hfsplus_create_attr_tree_cache(void)
 {
+#ifdef MY_ABC_HERE
+	size_t cached_size = hfsplus_get_attr_tree_cache_size();
+#endif
 	if (hfsplus_attr_tree_cachep)
 		return -EEXIST;
 
+#ifdef MY_ABC_HERE
+	hfsplus_attr_tree_cachep =
+		kmem_cache_create("hfsplus_attr_cache",
+			cached_size, 0,
+			SLAB_HWCACHE_ALIGN, NULL);
+#else
 	hfsplus_attr_tree_cachep =
 		kmem_cache_create("hfsplus_attr_cache",
 			sizeof(hfsplus_attr_entry), 0,
 			SLAB_HWCACHE_ALIGN, NULL);
+#endif
 	if (!hfsplus_attr_tree_cachep)
 		return -ENOMEM;
 
 	return 0;
 }
 
+#ifdef MY_ABC_HERE
+void hfsplus_set_attr_tree_cache_size(size_t record_size)
+{
+	if (record_size < sizeof(hfsplus_attr_entry)) {
+		hfsplus_attr_cached_size = sizeof(hfsplus_attr_entry);
+	} else {
+		hfsplus_attr_cached_size = record_size;
+	}
+}
+
+size_t hfsplus_get_attr_tree_cache_size()
+{
+	return hfsplus_attr_cached_size;
+}
+
+int hfsplus_recreate_attr_tree_cache(size_t record_size)
+{
+	int err = -1;
+	size_t ori_cached_size = hfsplus_get_attr_tree_cache_size();
+
+	hfsplus_destroy_attr_tree_cache();
+
+	hfsplus_set_attr_tree_cache_size(record_size);
+
+	err = hfsplus_create_attr_tree_cache();
+	if (!err) {
+		err = 0;
+		goto END;
+	} else {
+		hfsplus_set_attr_tree_cache_size(ori_cached_size);
+		if (-ENOMEM != err) {
+			goto END;
+		}
+		/* Try to allocate again */
+		err = hfsplus_create_attr_tree_cache();
+		if (err) {
+			kmem_cache_destroy(hfsplus_attr_tree_cachep);
+			goto END;
+		}
+		err = -ENOMEM;
+	}
+END:
+	return err;
+}
+#endif
+
 void hfsplus_destroy_attr_tree_cache(void)
 {
 	kmem_cache_destroy(hfsplus_attr_tree_cachep);
+#ifdef MY_ABC_HERE
+	hfsplus_attr_tree_cachep = NULL;
+#endif
 }
 
 int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *k1,
diff -ur a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
--- a/fs/hfsplus/hfsplus_fs.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/hfsplus/hfsplus_fs.h	2014-02-17 11:56:58.000000000 +0100
@@ -366,6 +366,10 @@
 /* attributes.c */
 int hfsplus_create_attr_tree_cache(void);
 void hfsplus_destroy_attr_tree_cache(void);
+#ifdef MY_ABC_HERE
+int hfsplus_recreate_attr_tree_cache(size_t);
+size_t hfsplus_get_attr_tree_cache_size(void);
+#endif
 hfsplus_attr_entry *hfsplus_alloc_attr_entry(void);
 void hfsplus_destroy_attr_entry(hfsplus_attr_entry *entry_p);
 int hfsplus_attr_bin_cmp_key(const hfsplus_btree_key *,
diff -ur a/fs/hfsplus/hfsplus_raw.h b/fs/hfsplus/hfsplus_raw.h
--- a/fs/hfsplus/hfsplus_raw.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/hfsplus/hfsplus_raw.h	2014-02-17 11:56:58.000000000 +0100
@@ -401,6 +401,10 @@
  * 
  * !!! XNU kernel use the following define.
  * inline structure is outdated & been replaced.
+ *
+ * If the default hfsplus_attr_data size is too small for a volume's
+ * attribute B-tree node size, hfsplus_attr_tree_cachep is recreated
+ * with a larger object size at mount time.
+ *
  */
 struct hfsplus_attr_data {
 	__be32 record_type;
diff -ur a/fs/hfsplus/super.c b/fs/hfsplus/super.c
--- a/fs/hfsplus/super.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/hfsplus/super.c	2014-02-17 11:56:58.000000000 +0100
@@ -58,6 +58,22 @@
 	return 0;
 }
 
+#ifdef MY_ABC_HERE
+// 3802 bytes for an 8K node_size
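+// With the usual on-disk structure sizes (14-byte node descriptor, 268-byte
+// attr key, 18-byte data header) an 8K node gives:
+// (8192 - 14 - 6) / 2 = 4086; 4086 - 268 - 16 = 3802 (already even).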
+static inline size_t hfsplus_get_maxinline_attrsize(struct hfs_btree *btree)
+{
+       unsigned int maxsize = btree->node_size;
+       // Copied from Apple open source /xnu-1699.26.8/bsd/hfs/hfs_xattr.c:2169
+       maxsize -= sizeof(struct hfs_bnode_desc);       /* minus node descriptor */
+       maxsize -= 3 * sizeof(u16);                     /* minus 3 index slots */
+       maxsize /= 2;                            /* 2 key/rec pairs minimum */
+       maxsize -= sizeof(struct hfsplus_attr_key);       /* minus maximum key size */
+       maxsize -= sizeof(struct hfsplus_attr_data) - 2;  /* minus data header */
+       maxsize &= 0xFFFFFFFE;                   /* multiple of 2 bytes */
+       return maxsize;
+}
+#endif
+
 struct inode *hfsplus_iget(struct super_block *sb, unsigned long ino)
 {
 	struct hfs_find_data fd;
@@ -392,6 +408,10 @@
 	struct nls_table *nls = NULL;
 	u64 last_fs_block, last_fs_page;
 	int err;
+#ifdef MY_ABC_HERE
+	size_t max_attr_size = 0;
+	size_t cached_size = 0;
+#endif
 
 	err = -ENOMEM;
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
@@ -505,6 +525,19 @@
 		}
 	}
 	sb->s_xattr = hfsplus_xattr_handlers;
+#ifdef MY_ABC_HERE
+	if (sbi->attr_tree) {
+		max_attr_size = hfsplus_get_maxinline_attrsize(sbi->attr_tree);
+		cached_size = offsetof(struct hfsplus_attr_inline_data, raw_bytes) + max_attr_size;
+		if (cached_size > hfsplus_get_attr_tree_cache_size()) {
+			err = hfsplus_recreate_attr_tree_cache(cached_size);
+			if (err) {
+				goto out_close_attr_tree;
+			}
+		}
+		err = -EINVAL;
+	}
+#endif
 
 	inode = hfsplus_iget(sb, HFSPLUS_ALLOC_CNID);
 	if (IS_ERR(inode)) {
diff -ur a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
--- a/fs/hfsplus/xattr.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/hfsplus/xattr.c	2014-02-17 11:56:58.000000000 +0100
@@ -412,17 +412,23 @@
 				fd.entryoffset +
 				offsetof(struct hfsplus_attr_inline_data,
 				length));
+		if ((offsetof(struct hfsplus_attr_inline_data,
+					raw_bytes) + record_length) > hfsplus_get_attr_tree_cache_size()) {
+			pr_err("invalid xattr record size\n");
+			res = -EIO;
+			goto out;
+		}
 #else
 		record_length = hfs_bnode_read_u16(fd.bnode,
 				fd.entryoffset +
 				offsetof(struct hfsplus_attr_inline_data,
 				length));
-#endif
 		if (record_length > HFSPLUS_MAX_INLINE_DATA_SIZE) {
 			pr_err("invalid xattr record size\n");
 			res = -EIO;
 			goto out;
 		}
+#endif
 	} else if (record_type == HFSPLUS_ATTR_FORK_DATA ||
 			record_type == HFSPLUS_ATTR_EXTENTS) {
 		pr_err("only inline data xattr are supported\n");
diff -ur a/fs/hpfs/super.c b/fs/hpfs/super.c
--- a/fs/hpfs/super.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/hpfs/super.c	2014-02-17 11:56:58.000000000 +0100
@@ -210,6 +210,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(hpfs_inode_cachep);
 }
 
diff -ur a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
--- a/fs/hugetlbfs/inode.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/hugetlbfs/inode.c	2014-02-17 11:57:01.000000000 +0100
@@ -1011,6 +1011,11 @@
 
 static void __exit exit_hugetlbfs_fs(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(hugetlbfs_inode_cachep);
 	kern_unmount(hugetlbfs_vfsmount);
 	unregister_filesystem(&hugetlbfs_fs_type);
diff -ur a/fs/inode.c b/fs/inode.c
--- a/fs/inode.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/inode.c	2014-02-17 11:57:00.000000000 +0100
@@ -26,6 +26,7 @@
 #include <linux/ima.h>
 #include <linux/cred.h>
 #include <linux/buffer_head.h> /* for inode_has_buffers */
+#include <linux/ratelimit.h>
 #include "internal.h"
 
 /*
@@ -170,6 +171,10 @@
 #ifdef MY_ABC_HERE
 	inode->i_mode2 = 0;   /* set archive bit on creation */
 #endif
+#ifdef MY_ABC_HERE
+	inode->i_CreateTime.tv_sec = 0;
+	inode->i_CreateTime.tv_nsec = 0;
+#endif
 	if (security_inode_alloc(inode))
 		goto out;
 	spin_lock_init(&inode->i_lock);
@@ -258,9 +263,14 @@
 	BUG_ON(inode_has_buffers(inode));
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
+	if (!inode->i_nlink) {
+		WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
+		atomic_long_dec(&inode->i_sb->s_remove_count);
+	}
+
 #ifdef CONFIG_FS_SYNO_ACL
 	if (inode->i_syno_acl && inode->i_syno_acl != ACL_NOT_CACHED)
-		synoacl_mod_release(inode->i_syno_acl);
+		syno_acl_release(inode->i_syno_acl);
 #elif defined(CONFIG_FS_POSIX_ACL)
 	if (inode->i_acl && inode->i_acl != ACL_NOT_CACHED)
 		posix_acl_release(inode->i_acl);
@@ -287,6 +297,85 @@
 		call_rcu(&inode->i_rcu, i_callback);
 }
 
+/**
+ * drop_nlink - directly drop an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink.  In cases
+ * where we are attempting to track writes to the
+ * filesystem, a decrement to zero means an imminent
+ * write when the file is truncated and actually unlinked
+ * on the filesystem.
+ */
+void drop_nlink(struct inode *inode)
+{
+	WARN_ON(inode->i_nlink == 0);
+	inode->__i_nlink--;
+	if (!inode->i_nlink)
+		atomic_long_inc(&inode->i_sb->s_remove_count);
+}
+EXPORT_SYMBOL(drop_nlink);
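+
+/*
+ * Typical use in a filesystem's ->unlink() path (illustrative sketch):
+ *
+ *	inode->i_ctime = dir->i_ctime;
+ *	drop_nlink(inode);
+ *	mark_inode_dirty(inode);
+ */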
+
+/**
+ * clear_nlink - directly zero an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink.  See
+ * drop_nlink() for why we care about i_nlink hitting zero.
+ */
+void clear_nlink(struct inode *inode)
+{
+	if (inode->i_nlink) {
+		inode->__i_nlink = 0;
+		atomic_long_inc(&inode->i_sb->s_remove_count);
+	}
+}
+EXPORT_SYMBOL(clear_nlink);
+
+/**
+ * set_nlink - directly set an inode's link count
+ * @inode: inode
+ * @nlink: new nlink (should be non-zero)
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink.
+ */
+void set_nlink(struct inode *inode, unsigned int nlink)
+{
+	if (!nlink) {
+		printk_ratelimited(KERN_INFO
+			"set_nlink() clearing i_nlink on %s inode %li\n",
+			inode->i_sb->s_type->name, inode->i_ino);
+		clear_nlink(inode);
+	} else {
+		/* Yes, some filesystems do change nlink from zero to one */
+		if (inode->i_nlink == 0)
+			atomic_long_dec(&inode->i_sb->s_remove_count);
+
+		inode->__i_nlink = nlink;
+	}
+}
+EXPORT_SYMBOL(set_nlink);
+
+/**
+ * inc_nlink - directly increment an inode's link count
+ * @inode: inode
+ *
+ * This is a low-level filesystem helper to replace any
+ * direct filesystem manipulation of i_nlink.  Currently,
+ * it is only here for parity with dec_nlink().
+ */
+void inc_nlink(struct inode *inode)
+{
+	if (WARN_ON(inode->i_nlink == 0))
+		atomic_long_dec(&inode->i_sb->s_remove_count);
+
+	inode->__i_nlink++;
+}
+EXPORT_SYMBOL(inc_nlink);
+
 void address_space_init_once(struct address_space *mapping)
 {
 	memset(mapping, 0, sizeof(*mapping));
@@ -1443,6 +1532,27 @@
 	return 0;
 }
 
+/*
+ * This does the actual work of updating an inode's time or version.  The
+ * caller must have called mnt_want_write() before calling this.
+ */
+static int update_time(struct inode *inode, struct timespec *time, int flags)
+{
+	if (inode->i_op->update_time)
+		return inode->i_op->update_time(inode, time, flags);
+
+	if (flags & S_ATIME)
+		inode->i_atime = *time;
+	if (flags & S_VERSION)
+		inode_inc_iversion(inode);
+	if (flags & S_CTIME)
+		inode->i_ctime = *time;
+	if (flags & S_MTIME)
+		inode->i_mtime = *time;
+	mark_inode_dirty_sync(inode);
+	return 0;
+}
+
 /**
  *	touch_atime	-	update the access time
  *	@mnt: mount the inode is accessed on
@@ -1477,12 +1587,24 @@
 	if (timespec_equal(&inode->i_atime, &now))
 		return;
 
-	if (mnt_want_write(mnt))
+	if (!sb_start_write_trylock(inode->i_sb))
 		return;
 
-	inode->i_atime = now;
-	mark_inode_dirty_sync(inode);
-	mnt_drop_write(mnt);
+	if (__mnt_want_write(mnt))
+		goto skip_update;
+	/*
+	 * File systems can error out when updating inodes if they need to
+	 * allocate new space to modify an inode (such is the case for
+	 * Btrfs), but since we touch atime while walking down the path we
+	 * really don't care if we failed to update the atime of the file,
+	 * so just ignore the return value.
+	 * We may also fail on filesystems that have the ability to make parts
+	 * of the fs read only, e.g. subvolumes in Btrfs.
+	 */
+	update_time(inode, &now, S_ATIME);
+	__mnt_drop_write(mnt);
+skip_update:
+	sb_end_write(inode->i_sb);
 }
 EXPORT_SYMBOL(touch_atime);
 
@@ -1495,18 +1617,20 @@
  *	usage in the file write path of filesystems, and filesystems may
  *	choose to explicitly ignore update via this function with the
  *	S_NOCMTIME inode flag, e.g. for network filesystem where these
- *	timestamps are handled by the server.
+ *	timestamps are handled by the server.  This can return an error for
+ *	file systems that need to allocate space in order to update an inode.
  */
 
-void file_update_time(struct file *file)
+int file_update_time(struct file *file)
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct timespec now;
-	enum { S_MTIME = 1, S_CTIME = 2, S_VERSION = 4 } sync_it = 0;
+	int sync_it = 0;
+	int ret;
 
 	/* First try to exhaust all avenues to not sync */
 	if (IS_NOCMTIME(inode))
-		return;
+		return 0;
 
 	now = current_fs_time(inode->i_sb);
 	if (!timespec_equal(&inode->i_mtime, &now))
@@ -1519,21 +1643,16 @@
 		sync_it |= S_VERSION;
 
 	if (!sync_it)
-		return;
+		return 0;
 
 	/* Finally allowed to write? Takes lock. */
-	if (mnt_want_write_file(file))
-		return;
+	if (__mnt_want_write_file(file))
+		return 0;
 
-	/* Only change inode inside the lock region */
-	if (sync_it & S_VERSION)
-		inode_inc_iversion(inode);
-	if (sync_it & S_CTIME)
-		inode->i_ctime = now;
-	if (sync_it & S_MTIME)
-		inode->i_mtime = now;
-	mark_inode_dirty_sync(inode);
-	mnt_drop_write_file(file);
+	ret = update_time(inode, &now, sync_it);
+	__mnt_drop_write_file(file);
+
+	return ret;
 }
 EXPORT_SYMBOL(file_update_time);
 
diff -ur a/fs/internal.h b/fs/internal.h
--- a/fs/internal.h	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/internal.h	2014-02-17 11:56:59.000000000 +0100
@@ -79,6 +79,10 @@
 
 DECLARE_BRLOCK(vfsmount_lock);
 
+extern int __mnt_want_write(struct vfsmount *);
+extern int __mnt_want_write_file(struct file *);
+extern void __mnt_drop_write(struct vfsmount *);
+extern void __mnt_drop_write_file(struct file *);
 
 /*
  * fs_struct.c
diff -ur a/fs/ioctl.c b/fs/ioctl.c
--- a/fs/ioctl.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/ioctl.c	2014-02-17 11:57:01.000000000 +0100
@@ -467,191 +467,192 @@
 
 
 #ifdef MY_ABC_HERE
-static int ioctl_get_version(struct file *filp, unsigned int *p_ver)
+static int archive_check_capable(struct inode *inode)
 {
-	struct super_block *sb;
-
-	if((!filp)||(!filp->f_path.dentry)||(!filp->f_path.dentry->d_inode)||
-		(!filp->f_path.dentry->d_inode->i_sb))
-		return -EPERM;
-
-	if((!S_ISDIR(filp->f_path.dentry->d_inode->i_mode)) &&(!S_ISREG(filp->f_path.dentry->d_inode->i_mode)))
+	if((!S_ISDIR(inode->i_mode)) && (!S_ISREG(inode->i_mode)))
 		return -EPERM;
 	
-	sb = filp->f_path.dentry->d_inode->i_sb;
-
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	/* If a blockdevice-backed filesystem isn't specified, return. */
-	if (sb->s_bdev == NULL)
+	if (!inode->i_sb->s_op->syno_set_sb_archive_ver)
+		return -EINVAL;
+	if (!inode->i_sb->s_op->syno_get_sb_archive_ver)
 		return -EINVAL;
 
-	*p_ver = sb->s_archive_version;
 	return 0;
 }
 
-static int ioctl_set_version(struct file * filp, unsigned int version)
+static int ioctl_get_version(struct inode *inode, unsigned int *p_ver)
 {
-	struct super_block *sb;
+	int error;
+	struct super_block *sb = inode->i_sb;
 
-	if((!filp)||(!filp->f_path.dentry)||(!filp->f_path.dentry->d_inode)||
-		(!filp->f_path.dentry->d_inode->i_sb))
-		return -EPERM;
+	error = archive_check_capable(inode);
+	if (error)
+		return error;
 
-	if((!S_ISDIR(filp->f_path.dentry->d_inode->i_mode)) &&(!S_ISREG(filp->f_path.dentry->d_inode->i_mode)))
-		return -EPERM;
+	error = sb->s_op->syno_get_sb_archive_ver(sb, p_ver);
+	return error;
+}
 
-	sb = filp->f_path.dentry->d_inode->i_sb;
+static int ioctl_set_version(struct file *filp, unsigned int version)
+{
+	int error;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
 
-	if (!capable(CAP_SYS_ADMIN))
+	error = archive_check_capable(inode);
+	if (error)
+		return error;
+	if ((UINT_MAX - 1) <= version) {
 		return -EPERM;
+	}
+	error = mnt_want_write(filp->f_vfsmnt);
+	if (error)
+		return error;
 
-	/* If a blockdevice-backed filesystem isn't specified, return. */
-	if (sb->s_bdev == NULL)
-		return -EINVAL;
-
-	sb->s_archive_version = version;
-	return 0;
+	mutex_lock(&sb->s_archive_mutex);
+	error = sb->s_op->syno_set_sb_archive_ver(sb, version);
+	mutex_unlock(&sb->s_archive_mutex);
+	mnt_drop_write(filp->f_vfsmnt);
+	return error;
 }
 
 static int ioctl_inc_version(struct file *filp)
 {
-	struct super_block *sb;
 	unsigned int ver;
 	int error;
-	
-	if((!filp)||(!filp->f_path.dentry)||(!filp->f_path.dentry->d_inode)||
-		(!filp->f_path.dentry->d_inode->i_sb))
-		return -EPERM;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
 
-	if((!S_ISDIR(filp->f_path.dentry->d_inode->i_mode)) &&(!S_ISREG(filp->f_path.dentry->d_inode->i_mode)))
-		return -EPERM;
+	error = archive_check_capable(inode);
+	if (error)
+		return error;
+	error = mnt_want_write(filp->f_vfsmnt);
+	if (error)
+		return error;
 
-	sb = filp->f_path.dentry->d_inode->i_sb;
 	mutex_lock(&sb->s_archive_mutex);
+	error = sb->s_op->syno_get_sb_archive_ver(sb, &ver);
+	if (error)
+		goto unlock;
 
-	error = ioctl_get_version(filp, &ver);
-	if (error) {
-		goto out;
-	}
-	if (ver+1 < ver) {
-		/* overflow */
+	/* archive ver of inode = archive ver of sb + 1 */
+	if ((UINT_MAX - 1) <= (ver + 1)) {
 		error = -EPERM;
-		goto out;
+		goto unlock;
 	}
-	error = ioctl_set_version(filp, ver+1);
-out:
+	error = sb->s_op->syno_set_sb_archive_ver(sb, ver + 1);
+unlock:
 	mutex_unlock(&sb->s_archive_mutex);
+	mnt_drop_write(filp->f_vfsmnt);
 	return error;
 }
 
-static int ioctl_set_file_version(struct file * filp, unsigned int version)
+static int ioctl_set_file_version(struct file *filp, unsigned int version)
 {
-	struct inode *inode = NULL;
-	unsigned int old_version;
+	struct inode *inode = filp->f_dentry->d_inode;
+	int error;
 
-	if(!filp || !filp->f_path.dentry || !filp->f_path.dentry->d_inode || 0 > version)
-		return -EPERM;
+	error = archive_check_capable(inode);
+	if (error)
+		return error;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-	inode = filp->f_path.dentry->d_inode;
+	if (!inode->i_op->syno_set_archive_ver)
+		return -EINVAL;
 
-	old_version = inode->i_archive_version;
-	if (old_version != version) {
-		inode->i_archive_version = version;
-		if (inode->i_op->synosetxattr) {
-			struct syno_xattr_archive_version value;
-			value.v_magic = cpu_to_le16(0x2552);
-			value.v_struct_version = cpu_to_le16(1);
-			value.v_archive_version = cpu_to_le32(version);
-			inode->i_op->synosetxattr(inode, XATTR_SYNO_PREFIX XATTR_SYNO_ARCHIVE_VERSION, &value, sizeof(value), 0);
-		}
-	}
+	error = mnt_want_write(filp->f_vfsmnt);
+	if (error)
+		return error;
 
-	mark_inode_dirty_sync(inode);
-	return 0;
+	error = inode->i_op->syno_set_archive_ver(filp->f_dentry, version);
+	mnt_drop_write(filp->f_vfsmnt);
+	return error;
 }
 
 #ifdef MY_ABC_HERE
-static int ioctl_get_bad_version(struct file *filp, unsigned int *p_ver)
+static int ioctl_get_bad_version(struct inode *inode, unsigned int *p_ver)
 {
-	struct super_block *sb = NULL;
-
-	if((!filp)||(!filp->f_path.dentry)||(!filp->f_path.dentry->d_inode)||
-		(!filp->f_path.dentry->d_inode->i_sb))
-		return -EPERM;
-
-	if((!S_ISDIR(filp->f_path.dentry->d_inode->i_mode)) &&(!S_ISREG(filp->f_path.dentry->d_inode->i_mode)))
-		return -EPERM;
-
-	sb = filp->f_path.dentry->d_inode->i_sb;
+	int error;
+	struct super_block *sb = inode->i_sb;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	error = archive_check_capable(inode);
+	if (error)
+		return error;
 
-	/* If a blockdevice-backed filesystem isn't specified, return. */
-	if (sb->s_bdev == NULL)
+	if (!sb->s_op->syno_get_sb_archive_ver1)
 		return -EINVAL;
-	*p_ver = sb->s_archive_version1;
 
-	return 0;
+	error = inode->i_sb->s_op->syno_get_sb_archive_ver1(sb, p_ver);
+	return error;
 }
 
 static int ioctl_clear_bad_version(struct file *filp)
 {
-	struct super_block *sb = NULL;
+	int error;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
+	unsigned int ver, ver1;
 
-	if((!filp)||(!filp->f_path.dentry)||(!filp->f_path.dentry->d_inode)||
-		(!filp->f_path.dentry->d_inode->i_sb))
-		return -EPERM;
+	error = archive_check_capable(inode);
+	if (error)
+		return error;
 
-	if((!S_ISDIR(filp->f_path.dentry->d_inode->i_mode)) &&(!S_ISREG(filp->f_path.dentry->d_inode->i_mode)))
-		return -EPERM;
+	if (!sb->s_op->syno_get_sb_archive_ver1)
+		return -EINVAL;
+	if (!sb->s_op->syno_set_sb_archive_ver1)
+		return -EINVAL;
 
-	sb = filp->f_path.dentry->d_inode->i_sb;
+	error = mnt_want_write(filp->f_vfsmnt);
+	if (error)
+		return error;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	mutex_lock(&sb->s_archive_mutex);
+	error = sb->s_op->syno_get_sb_archive_ver(sb, &ver);
+	if (error)
+		goto unlock;
 
-	/* If a blockdevice-backed filesystem isn't specified, return. */
-	if (sb->s_bdev == NULL)
-		return -EINVAL;
+	error = sb->s_op->syno_get_sb_archive_ver1(sb, &ver1);
+	if (error)
+		goto unlock;
 
-	sb->s_archive_version = max(sb->s_archive_version, sb->s_archive_version1) + 1;
-	sb->s_archive_version1 = 0;
+	error = sb->s_op->syno_set_sb_archive_ver(sb, max(ver, ver1) + 1);
+	if (error)
+		goto unlock;
 
-	return 0;
+	error = sb->s_op->syno_set_sb_archive_ver1(sb, 0);
+unlock:
+	mutex_unlock(&sb->s_archive_mutex);
+	mnt_drop_write(filp->f_vfsmnt);
+	return error;
 }
 
-static int ioctl_set_bad_version(struct file * filp, unsigned int version)
+static int ioctl_set_bad_version(struct file *filp, unsigned int version)
 {
-	struct super_block *sb;
-
-	if((!filp)||(!filp->f_path.dentry)||(!filp->f_path.dentry->d_inode)||
-		(!filp->f_path.dentry->d_inode->i_sb))
-		return -EPERM;
-
-	if((!S_ISDIR(filp->f_path.dentry->d_inode->i_mode)) &&(!S_ISREG(filp->f_path.dentry->d_inode->i_mode)))
-		return -EPERM;
-
-	sb = filp->f_path.dentry->d_inode->i_sb;
+	int error;
+	struct inode *inode = filp->f_dentry->d_inode;
+	struct super_block *sb = inode->i_sb;
 
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
+	error = archive_check_capable(inode);
+	if (error)
+		return error;
 
-	/* If a blockdevice-backed filesystem isn't specified, return. */
-	if (sb->s_bdev == NULL)
+	if (!sb->s_op->syno_set_sb_archive_ver1)
 		return -EINVAL;
 
-	sb->s_archive_version1 = version;
+	error = mnt_want_write(filp->f_vfsmnt);
+	if (error)
+		return error;
 
-	return 0;
+	mutex_lock(&sb->s_archive_mutex);
+	error = sb->s_op->syno_set_sb_archive_ver1(sb, version);
+	mutex_unlock(&sb->s_archive_mutex);
+	mnt_drop_write(filp->f_vfsmnt);
+	return error;
 }
-#endif
-
-#endif
+#endif /* MY_ABC_HERE */
+#endif /* MY_ABC_HERE */
 
 static int ioctl_fionbio(struct file *filp, int __user *argp)
 {
@@ -783,40 +784,40 @@
 		return put_user(inode->i_sb->s_blocksize, argp);
 
 #ifdef MY_ABC_HERE
-		case FIGETVERSION:
-			error = ioctl_get_version(filp, &ver);
-			if (!error) {
-				error = put_user(ver, (unsigned int __user *)arg) ? -EFAULT : 0;
-			}
-			break;
-		case FISETVERSION:
-			if ((error = get_user(ver, (unsigned int __user *)arg)) != 0)
-				break;
-			error = ioctl_set_version(filp, ver);
-			break;
-		case FIINCVERSION:
-			error = ioctl_inc_version(filp);
+	case FIGETVERSION:
+		error = ioctl_get_version(inode, &ver);
+		if (!error) {
+			error = put_user(ver, (unsigned int __user *)arg) ? -EFAULT : 0;
+		}
+		break;
+	case FISETVERSION:
+		if ((error = get_user(ver, (unsigned int __user *)arg)) != 0)
 			break;
-		case FISETFILEVERSION:
-			if ((error = get_user(ver, (unsigned int __user *)arg)) != 0)
-				break;
-			error = ioctl_set_file_version(filp, ver);
+		error = ioctl_set_version(filp, ver);
+		break;
+	case FIINCVERSION:
+		error = ioctl_inc_version(filp);
+		break;
+	case FISETFILEVERSION:
+		if ((error = get_user(ver, (unsigned int __user *)arg)) != 0)
 			break;
+		error = ioctl_set_file_version(filp, ver);
+		break;
 #ifdef MY_ABC_HERE
-		case FIGETBADVERSION:
-			error = ioctl_get_bad_version(filp, &ver);
-			if (!error) {
-				error = put_user(ver, (unsigned int __user *)arg) ? -EFAULT : 0;
-			}
-			break;
-		case FICLEARBADVERSION:
-			error = ioctl_clear_bad_version(filp);
-			break;
-		case FISETBADVERSION:
-			if ((error = get_user(ver, (unsigned int __user *)arg)) != 0)
-				break;
-			error = ioctl_set_bad_version(filp, ver);
+	case FIGETBADVERSION:
+		error = ioctl_get_bad_version(inode, &ver);
+		if (!error) {
+			error = put_user(ver, (unsigned int __user *)arg) ? -EFAULT : 0;
+		}
+		break;
+	case FICLEARBADVERSION:
+		error = ioctl_clear_bad_version(filp);
+		break;
+	case FISETBADVERSION:
+		if ((error = get_user(ver, (unsigned int __user *)arg)) != 0)
 			break;
+		error = ioctl_set_bad_version(filp, ver);
+		break;
 #endif
 #endif
 
diff -ur a/fs/isofs/inode.c b/fs/isofs/inode.c
--- a/fs/isofs/inode.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/isofs/inode.c	2014-02-17 11:56:57.000000000 +0100
@@ -114,6 +114,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(isofs_inode_cachep);
 }
 
diff -ur a/fs/jbd/journal.c b/fs/jbd/journal.c
--- a/fs/jbd/journal.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/jbd/journal.c	2014-02-17 11:57:00.000000000 +0100
@@ -166,7 +166,7 @@
 		 */
 		jbd_debug(1, "Now suspending kjournald\n");
 		spin_unlock(&journal->j_state_lock);
-		refrigerator();
+		try_to_freeze();
 		spin_lock(&journal->j_state_lock);
 	} else {
 		/*
diff -ur a/fs/jbd2/journal.c b/fs/jbd2/journal.c
--- a/fs/jbd2/journal.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/jbd2/journal.c	2014-02-17 11:56:59.000000000 +0100
@@ -173,7 +173,7 @@
 		 */
 		jbd_debug(1, "Now suspending kjournald2\n");
 		write_unlock(&journal->j_state_lock);
-		refrigerator();
+		try_to_freeze();
 		write_lock(&journal->j_state_lock);
 	} else {
 		/*
diff -ur a/fs/jffs2/build.c b/fs/jffs2/build.c
--- a/fs/jffs2/build.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/jffs2/build.c	2014-02-17 11:57:00.000000000 +0100
@@ -112,6 +112,19 @@
 	dbg_fsbuild("scanned flash completely\n");
 	jffs2_dbg_dump_block_lists_nolock(c);
 
+#if defined(CONFIG_SYNO_COMCERTO)
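+	/* Flag bit 7 is set by the scan code in scan.c when the 0xdeadc0de
+	 * end-of-filesystem marker is found; unlock and erase everything
+	 * after that point. */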
+	if (c->flags & (1 << 7)) {
+		printk("%s(): unlocking the mtd device... ", __func__);
+		if (c->mtd->unlock)
+			c->mtd->unlock(c->mtd, 0, c->mtd->size);
+		printk("done.\n");
+
+		printk("%s(): erasing all blocks after the end marker... ", __func__);
+		jffs2_erase_pending_blocks(c, -1);
+		printk("done.\n");
+	}
+#endif
+
 	dbg_fsbuild("pass 1 starting\n");
 	c->flags |= JFFS2_SB_FLAG_BUILDING;
 	/* Now scan the directory tree, increasing nlink according to every dirent found. */
diff -ur a/fs/jffs2/scan.c b/fs/jffs2/scan.c
--- a/fs/jffs2/scan.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/jffs2/scan.c	2014-02-17 11:57:00.000000000 +0100
@@ -22,6 +22,10 @@
 
 #define DEFAULT_EMPTY_SCAN_SIZE 256
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_MTD_NAND_COMCERTO)
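+/* Maximum number of zero (bit-flipped) bits tolerated before an apparently
+ * empty NAND block is treated as dirty rather than clean. */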
+#define BIT_FLIP_TOLERENCE	7
+#endif
+
 #define noisy_printk(noise, args...) do { \
 	if (*(noise)) { \
 		printk(KERN_NOTICE args); \
@@ -64,6 +68,17 @@
 		return DEFAULT_EMPTY_SCAN_SIZE;
 }
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_MTD_NAND_COMCERTO)
+static inline uint32_t count_zero_bits(uint32_t value)
+{
+	uint32_t num_zeros = 0;
+	size_t i;
+
+	for (i = 0; i < sizeof(value) * 8; ++i, value >>= 1) {
+		if ((value & 1) == 0)
+			++num_zeros;
+	}
+	return num_zeros;
+}
+#endif
+
 static int file_dirty(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb)
 {
 	int ret;
@@ -72,7 +87,7 @@
 		return ret;
 	if ((ret = jffs2_scan_dirty_space(c, jeb, jeb->free_size)))
 		return ret;
-	/* Turned wasted size into dirty, since we apparently 
+	/* Turned wasted size into dirty, since we apparently
 	   think it's recoverable now. */
 	jeb->dirty_size += jeb->wasted_size;
 	c->dirty_size += jeb->wasted_size;
@@ -147,8 +162,16 @@
 		/* reset summary info for next eraseblock scan */
 		jffs2_sum_reset_collected(s);
 
+#if defined(CONFIG_SYNO_COMCERTO)
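+		/* Once the end-of-filesystem marker has been seen, treat all
+		 * remaining blocks as empty instead of scanning them. */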
+		if (c->flags & (1 << 7))
+			ret = BLK_STATE_ALLFF;
+		else
+			ret = jffs2_scan_eraseblock(c, jeb, buf_size?flashbuf:(flashbuf+jeb->offset),
+							buf_size, s);
+#else
 		ret = jffs2_scan_eraseblock(c, jeb, buf_size?flashbuf:(flashbuf+jeb->offset),
 						buf_size, s);
+#endif
 
 		if (ret < 0)
 			goto out;
@@ -403,7 +426,7 @@
 	if (!ref)
 		return -ENOMEM;
 
-	/* BEFORE jffs2_build_xattr_subsystem() called, 
+	/* BEFORE jffs2_build_xattr_subsystem() called,
 	 * and AFTER xattr_ref is marked as a dead xref,
 	 * ref->xid is used to store 32bit xid, xd is not used
 	 * ref->ino is used to store 32bit inode-number, ic is not used
@@ -476,7 +499,7 @@
 		struct jffs2_sum_marker *sm;
 		void *sumptr = NULL;
 		uint32_t sumlen;
-	      
+
 		if (!buf_size) {
 			/* XIP case. Just look, point at the summary if it's there */
 			sm = (void *)buf + c->sector_size - sizeof(*sm);
@@ -492,9 +515,9 @@
 				buf_len = sizeof(*sm);
 
 			/* Read as much as we want into the _end_ of the preallocated buffer */
-			err = jffs2_fill_scan_buf(c, buf + buf_size - buf_len, 
+			err = jffs2_fill_scan_buf(c, buf + buf_size - buf_len,
 						  jeb->offset + c->sector_size - buf_len,
-						  buf_len);				
+						  buf_len);
 			if (err)
 				return err;
 
@@ -513,9 +536,9 @@
 				}
 				if (buf_len < sumlen) {
 					/* Need to read more so that the entire summary node is present */
-					err = jffs2_fill_scan_buf(c, sumptr, 
+					err = jffs2_fill_scan_buf(c, sumptr,
 								  jeb->offset + c->sector_size - sumlen,
-								  sumlen - buf_len);				
+								  sumlen - buf_len);
 					if (err)
 						return err;
 				}
@@ -528,7 +551,7 @@
 
 			if (buf_size && sumlen > buf_size)
 				kfree(sumptr);
-			/* If it returns with a real error, bail. 
+			/* If it returns with a real error, bail.
 			   If it returns positive, that's a block classification
 			   (i.e. BLK_STATE_xxx) so return that too.
 			   If it returns zero, fall through to full scan. */
@@ -549,6 +572,19 @@
 			return err;
 	}
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	if ((buf[0] == 0xde) &&
+		(buf[1] == 0xad) &&
+		(buf[2] == 0xc0) &&
+		(buf[3] == 0xde)) {
+		/* end of filesystem. erase everything after this point */
+		printk("%s(): End of filesystem marker found at 0x%x\n", __func__, jeb->offset);
+		c->flags |= (1 << 7);
+
+		return BLK_STATE_ALLFF;
+	}
+#endif
+
 	/* We temporarily use 'ofs' as a pointer into the buffer/jeb */
 	ofs = 0;
 	max_ofs = EMPTY_SCAN_SIZE(c->sector_size);
@@ -575,6 +611,34 @@
 		else
 			return BLK_STATE_ALLFF;	/* OK to erase if all blocks are like this */
 	}
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_MTD_NAND_COMCERTO)
+	else if (cleanmarkerfound) {
+		uint32_t num_zeros = 0;
+
+		ofs = 0;
+		while ((ofs < max_ofs) && (num_zeros < BIT_FLIP_TOLERENCE)) {
+			if (*(uint32_t *)(&buf[ofs]) != 0xFFFFFFFF)
+				num_zeros = count_zero_bits(*(uint32_t *)(&buf[ofs]));
+			ofs += 4;
+		}
+#ifdef CONFIG_JFFS2_FS_WRITEBUFFER
+		if ((num_zeros < BIT_FLIP_TOLERENCE) && jffs2_cleanmarker_oob(c)) {
+			/* scan oob, take care of cleanmarker */
+			int ret = jffs2_check_oob_empty(c, jeb, cleanmarkerfound);
+			D2(printk(KERN_NOTICE "jffs2_check_oob_empty returned %d\n",ret));
+			switch (ret) {
+			case 0:		return cleanmarkerfound ? BLK_STATE_CLEANMARKER : BLK_STATE_ALLFF;
+			case 1: 	return BLK_STATE_ALLDIRTY;
+			default: 	return ret;
+			}
+		} else {
+			return BLK_STATE_ALLDIRTY;
+		}
+#endif
+
+	}
+#endif
+
 	if (ofs) {
 		D1(printk(KERN_DEBUG "Free space at %08x ends at %08x\n", jeb->offset,
 			  jeb->offset + ofs));
@@ -674,7 +738,7 @@
 				scan_end = buf_len;
 				goto more_empty;
 			}
-			
+
 			/* See how much more there is to read in this eraseblock... */
 			buf_len = min_t(uint32_t, buf_size, jeb->offset + c->sector_size - ofs);
 			if (!buf_len) {
@@ -910,7 +974,7 @@
 
 	D1(printk(KERN_DEBUG "Block at 0x%08x: free 0x%08x, dirty 0x%08x, unchecked 0x%08x, used 0x%08x, wasted 0x%08x\n",
 		  jeb->offset,jeb->free_size, jeb->dirty_size, jeb->unchecked_size, jeb->used_size, jeb->wasted_size));
-	
+
 	/* mark_node_obsolete can add to wasted !! */
 	if (jeb->wasted_size) {
 		jeb->dirty_size += jeb->wasted_size;
diff -ur a/fs/jffs2/super.c b/fs/jffs2/super.c
--- a/fs/jffs2/super.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/jffs2/super.c	2014-02-17 11:57:00.000000000 +0100
@@ -422,6 +422,12 @@
 	unregister_filesystem(&jffs2_fs_type);
 	jffs2_destroy_slab_caches();
 	jffs2_compressors_exit();
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(jffs2_inode_cachep);
 }
 
diff -ur a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
--- a/fs/jfs/jfs_logmgr.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/jfs/jfs_logmgr.c	2014-02-17 11:56:58.000000000 +0100
@@ -2349,7 +2349,7 @@
 
 		if (freezing(current)) {
 			spin_unlock_irq(&log_redrive_lock);
-			refrigerator();
+			try_to_freeze();
 		} else {
 			set_current_state(TASK_INTERRUPTIBLE);
 			spin_unlock_irq(&log_redrive_lock);
diff -ur a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
--- a/fs/jfs/jfs_txnmgr.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/jfs/jfs_txnmgr.c	2014-02-17 11:56:58.000000000 +0100
@@ -2800,7 +2800,7 @@
 
 		if (freezing(current)) {
 			LAZY_UNLOCK(flags);
-			refrigerator();
+			try_to_freeze();
 		} else {
 			DECLARE_WAITQUEUE(wq, current);
 
@@ -2994,7 +2994,7 @@
 
 		if (freezing(current)) {
 			TXN_UNLOCK();
-			refrigerator();
+			try_to_freeze();
 		} else {
 			set_current_state(TASK_INTERRUPTIBLE);
 			TXN_UNLOCK();
diff -ur a/fs/jfs/super.c b/fs/jfs/super.c
--- a/fs/jfs/super.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/jfs/super.c	2014-02-17 11:56:58.000000000 +0100
@@ -892,6 +892,12 @@
 	jfs_proc_clean();
 #endif
 	unregister_filesystem(&jfs_fs_type);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(jfs_inode_cachep);
 }
 
diff -ur a/fs/Kconfig b/fs/Kconfig
--- a/fs/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/fs/Kconfig	2014-01-21 09:37:24.000000000 +0100
@@ -80,16 +80,6 @@
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
 
-config CUSE
-	tristate "Character device in Userspace support"
-	depends on FUSE_FS
-	help
-	  This FUSE extension allows character devices to be
-	  implemented in userspace.
-
-	  If you want to develop or use userspace character device
-	  based on CUSE, answer Y or M.
-
 config GENERIC_ACL
 	bool
 	select FS_POSIX_ACL
diff -ur a/fs/logfs/inode.c b/fs/logfs/inode.c
--- a/fs/logfs/inode.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/logfs/inode.c	2014-02-17 11:57:01.000000000 +0100
@@ -399,5 +399,10 @@
 
 void logfs_destroy_inode_cache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(logfs_inode_cache);
 }
diff -ur a/fs/Makefile b/fs/Makefile
--- a/fs/Makefile	2013-08-16 08:07:18.000000000 +0200
+++ b/fs/Makefile	2014-01-21 09:37:24.000000000 +0100
@@ -47,7 +47,7 @@
 
 obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
 obj-$(CONFIG_FS_POSIX_ACL)	+= posix_acl.o xattr_acl.o
-obj-$(CONFIG_FS_SYNO_ACL)	+= synoacl_api.o 
+obj-$(CONFIG_FS_SYNO_ACL)	+= synoacl_api.o syno_acl.o
 obj-$(CONFIG_NFS_COMMON)	+= nfs_common/
 obj-$(CONFIG_GENERIC_ACL)	+= generic_acl.o
 
diff -ur a/fs/minix/inode.c b/fs/minix/inode.c
--- a/fs/minix/inode.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/minix/inode.c	2014-02-17 11:57:01.000000000 +0100
@@ -100,6 +100,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(minix_inode_cachep);
 }
 
diff -ur a/fs/namei.c b/fs/namei.c
--- a/fs/namei.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/namei.c	2014-02-17 11:56:57.000000000 +0100
@@ -41,15 +41,11 @@
 #include "internal.h"
 
 #ifdef MY_ABC_HERE
-inline int SYNOUnicodeUTF8ChrToUTF16Chr(u_int16_t *p, const u_int8_t *s, int n);
-int SYNOUnicodeUTF8StrToUTF16Str(u_int16_t *pwcs, const u_int8_t *s, int n);
-inline int SYNOUnicodeUTF16ChrToUTF8Chr(u_int8_t *s, u_int16_t wc, int maxlen);
-int SYNOUnicodeUTF16StrToUTF8Str(u_int8_t *s, const u_int16_t *pwcs, int maxlen);
+int SYNOUnicodeUTF8ChrToUTF16Chr(u_int16_t *p, const u_int8_t *s, int n);
+int SYNOUnicodeUTF16ChrToUTF8Chr(u_int8_t *s, u_int16_t wc, int maxlen);
 u_int16_t *SYNOUnicodeGenerateDefaultUpcaseTable(void);
 u_int16_t *DefUpcaseTable(void);
 
-static u_int16_t UTF16NameiStrBuf1[UNICODE_UTF16_BUFSIZE];
-extern spinlock_t Namei_buf_lock_1;  /* init at alloc_super() */
 
 /*
  * Sample implementation from Unicode home page.
@@ -74,7 +70,7 @@
     {0,						       /* end of table    */}
 };
 
-inline int SYNOUnicodeUTF8ChrToUTF16Chr(u_int16_t *p, const u_int8_t *s, int n)
+int SYNOUnicodeUTF8ChrToUTF16Chr(u_int16_t *p, const u_int8_t *s, int n)
 {
 	long l;
 	int c0, c, nc;
@@ -103,40 +99,7 @@
 	return -1;
 }
 
-int SYNOUnicodeUTF8StrToUTF16Str(u_int16_t *pwcs, const u_int8_t *s, int n)
-{
-	u_int16_t *op;
-	const u_int8_t *ip;
-	int size;
-
-	op = pwcs;
-	ip = s;
-	while (n > 0 && *ip) {
-		if (*ip & 0x80) {
-			size = SYNOUnicodeUTF8ChrToUTF16Chr(op, ip, n);
-			if (size == -1) {
-				/* Ignore character and move on */
-				ip++;
-				n--;
-			} else {
-				op++;
-				ip += size;
-				n -= size;
-			}
-		} else {
-			*op++ = *ip++;
-			n--;
-		}
-	}
-	*op = 0;
-#ifdef SYNO_DEBUG_BUILD
-	if((op - pwcs) >= UNICODE_UTF16_BUFSIZE)
-		panic("SYNOUnicodeUTF8StrToUTF16Str: UTF8 string too long\n");
-#endif
-	return (op - pwcs);
-}
-
-inline int SYNOUnicodeUTF16ChrToUTF8Chr(u_int8_t *s, u_int16_t wc, int maxlen)
+int SYNOUnicodeUTF16ChrToUTF8Chr(u_int8_t *s, u_int16_t wc, int maxlen)
 {
 	long l;
 	int c, nc;
@@ -163,34 +126,6 @@
 	return -1;
 }
 
-int SYNOUnicodeUTF16StrToUTF8Str(u_int8_t *s, const u_int16_t *pwcs, int maxlen)
-{
-	const u_int16_t *ip;
-	u_int8_t *op;
-	int size;
-
-	op = s;
-	ip = pwcs;
-	while (*ip && maxlen > 0) {
-		if (*ip > 0x7f) {
-			size = SYNOUnicodeUTF16ChrToUTF8Chr(op, *ip, maxlen);
-			if (size == -1) {
-				/* Ignore character and move on */
-				maxlen--;
-			} else {
-				op += size;
-				maxlen -= size;
-			}
-		} else {
-			*op++ = (u_int8_t) *ip;
-		}
-		ip++;
-	}
-	*op = 0;
-	return (op - s);
-}
-
-
 
 /*
  * upcase.c - Generate the full NTFS Unicode upcase table in little endian.
@@ -324,23 +259,33 @@
 int SYNOUnicodeUTF8toUpper(u_int8_t *to,const u_int8_t *from, int maxlen, int clenfrom, u_int16_t *upcasetable)
 {
 	u_int16_t *UpcaseTbl;
-	int clenUtf16;
-	int i;
-	int err;
-
-	spin_lock(&Namei_buf_lock_1);
+	u_int16_t wc;
+	u_int8_t *op;
+	int size;
 
 	UpcaseTbl = (upcasetable==NULL) ? DefUpcaseTable() : upcasetable;
-	clenUtf16 = SYNOUnicodeUTF8StrToUTF16Str(UTF16NameiStrBuf1, from, clenfrom);
-
-	for(i = 0; i < clenUtf16; i++)
-		UTF16NameiStrBuf1[i] = UpcaseTbl[UTF16NameiStrBuf1[i]];
-
-	UTF16NameiStrBuf1[clenUtf16] = 0;
-	err = SYNOUnicodeUTF16StrToUTF8Str(to, UTF16NameiStrBuf1, maxlen);
-	spin_unlock(&Namei_buf_lock_1);
-	return err;
 
+	op = to;
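+	/* Convert one UTF-8 character at a time to UTF-16, upcase it through
+	 * the table, and re-encode it as UTF-8; invalid sequences are skipped.
+	 * This replaces the old shared UTF16NameiStrBuf1 buffer and its
+	 * spinlock. */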
+	while (clenfrom && maxlen) {
+		size = SYNOUnicodeUTF8ChrToUTF16Chr(&wc, from, clenfrom);
+		if (size == -1) {
+			from++;
+			clenfrom--;
+			continue;
+		} else {
+			from += size;
+			clenfrom -= size;
+		}
+		size = SYNOUnicodeUTF16ChrToUTF8Chr(op, UpcaseTbl[wc], maxlen);
+		if (size == -1) {
+			continue;
+		} else {
+			op += size;
+			maxlen -= size;
+		}
+	}
+	*op = 0;
+	return (op - to);
 }
 EXPORT_SYMBOL(SYNOUnicodeUTF8toUpper);
 
@@ -1690,13 +1635,14 @@
 static inline int may_lookup(struct nameidata *nd)
 {
 #ifdef CONFIG_FS_SYNO_ACL
-		int err;
+	int err;
+	int is_synoacl = IS_SYNOACL_INODE(nd->inode, nd->path.dentry);
 #endif
 
 	if (nd->flags & LOOKUP_RCU) {
 #ifdef CONFIG_FS_SYNO_ACL
-		if (IS_SYNOACL(nd->inode) && (NULL != nd)) {
-			err = nd->inode->i_op->syno_exec_permission(nd->path.dentry);
+		if (is_synoacl) {
+			err = synoacl_op_exec_perm(nd->path.dentry, nd->inode);
 		} else {
 			err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
 		}
@@ -1710,8 +1656,8 @@
 	}
 
 #ifdef CONFIG_FS_SYNO_ACL
-	if (IS_SYNOACL(nd->inode) && (NULL != nd)) {
-		err = nd->inode->i_op->syno_exec_permission(nd->path.dentry);
+	if (is_synoacl) {
+		err = synoacl_op_exec_perm(nd->path.dentry, nd->inode);
 	} else {
 		err = inode_permission(nd->inode, MAY_EXEC);
 	}
@@ -2153,10 +2099,9 @@
 				goto fput_fail;
 
 #ifdef CONFIG_FS_SYNO_ACL
-			if (IS_SYNOACL(dentry->d_inode)) {
-				if (dentry->d_inode->i_op->syno_permission(dentry, MAY_EXEC))
-					goto fput_fail;
-			}
+			if (IS_SYNOACL(dentry)) {
+				retval = synoacl_op_perm(dentry, MAY_EXEC);
+			} else 
 #endif
 			retval = inode_permission(dentry->d_inode, MAY_EXEC);
 			if (retval)
@@ -2345,8 +2290,8 @@
 	int err;
 
 #ifdef CONFIG_FS_SYNO_ACL
-	if (IS_SYNOACL(inode) && NULL != nd) {
-		err = inode->i_op->syno_exec_permission(nd->path.dentry);
+	if (IS_SYNOACL(base)) {
+		err = synoacl_op_exec_perm(base, inode);
 	} else {
 		err = inode_permission(inode, MAY_EXEC);
 	}
@@ -2592,7 +2537,7 @@
 	if (IS_APPEND(dir))
 		return -EPERM;
 #ifdef CONFIG_FS_SYNO_ACL
-	if (!IS_SYNOACL(dir) && check_sticky(dir, victim->d_inode)) {
+	if (!IS_SYNOACL(victim->d_parent) && check_sticky(dir, victim->d_inode)) {
 		return -EPERM;
 	}
 	if (IS_APPEND(victim->d_inode) || IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)){
@@ -2637,8 +2582,8 @@
 		return -ENOENT;
 
 #ifdef CONFIG_FS_SYNO_ACL
-	if (IS_SYNOACL(dir)) {
-		return dir->i_op->syno_permission(child->d_parent, (S_ISDIR(mode)?MAY_APPEND:MAY_WRITE) | MAY_EXEC);
+	if (IS_SYNOACL(child->d_parent)) {
+		return synoacl_op_perm(child->d_parent, (S_ISDIR(mode)?MAY_APPEND:MAY_WRITE) | MAY_EXEC);
 	} 
 #endif /* CONFIG_FS_SYNO_ACL */
 	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
@@ -2708,6 +2653,14 @@
 	error = dir->i_op->create(dir, dentry, mode, nd);
 	if (!error)
 		fsnotify_create(dir, dentry);
+
+#ifdef CONFIG_FS_SYNO_ACL
+	if (!error && IS_SYNOACL(dentry->d_parent)) {
+		/* We assume that inode has been attached to dentry by d_instantiate(). */
+		synoacl_op_init(dentry);
+	}
+#endif
+
 	return error;
 }
 
@@ -2743,8 +2696,8 @@
 	}
 
 #ifdef CONFIG_FS_SYNO_ACL
-	if (IS_SYNOACL(inode)) {
-		error = inode->i_op->syno_permission(dentry, acc_mode);
+	if (IS_SYNOACL(dentry)) {
+		error = synoacl_op_perm(dentry, acc_mode);
 	} else
 #endif /* CONFIG_FS_SYNO_ACL */
 	error = inode_permission(inode, acc_mode);
@@ -3297,6 +3250,14 @@
 	error = dir->i_op->mkdir(dir, dentry, mode);
 	if (!error)
 		fsnotify_mkdir(dir, dentry);
+
+#ifdef CONFIG_FS_SYNO_ACL
+	if (!error && IS_SYNOACL(dentry->d_parent)) {
+		/* We assume that inode has been attached to dentry by d_instantiate(). */
+		synoacl_op_init(dentry);
+	}
+#endif
+
 	return error;
 }
 
@@ -3778,7 +3739,7 @@
 	 */
 	if (new_dir != old_dir) {
 #ifdef CONFIG_FS_SYNO_ACL
-		if (!IS_SYNOACL(old_dentry->d_inode)) {
+		if (!IS_SYNOACL(old_dentry)) {
 			error = inode_permission(old_dentry->d_inode, MAY_WRITE);
 		}
 #else
diff -ur a/fs/namespace.c b/fs/namespace.c
--- a/fs/namespace.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/namespace.c	2014-02-17 11:57:00.000000000 +0100
@@ -43,6 +43,14 @@
 extern int gSynoHasDynModule;
 #endif
 
+#ifdef MY_ABC_HERE
+extern void ext4_fill_mount_path(struct super_block *sb, const char *szPath);
+#endif
+
+#ifdef CONFIG_SYNO_DUAL_HEAD
+extern int gSynoDualHead;
+#endif
+
 static int event;
 static DEFINE_IDA(mnt_id_ida);
 static DEFINE_IDA(mnt_group_ida);
@@ -316,36 +324,34 @@
 }
 
 /*
- * Most r/o checks on a fs are for operations that take
- * discrete amounts of time, like a write() or unlink().
- * We must keep track of when those operations start
- * (for permission checks) and when they end, so that
- * we can determine when writes are able to occur to
- * a filesystem.
+ * Most r/o & frozen checks on a fs are for operations that take discrete
+ * amounts of time, like a write() or unlink().  We must keep track of when
+ * those operations start (for permission checks) and when they end, so that we
+ * can determine when writes are able to occur to a filesystem.
  */
 /**
- * mnt_want_write - get write access to a mount
- * @mnt: the mount on which to take a write
+ * __mnt_want_write - get write access to a mount without freeze protection
+ * @m: the mount on which to take a write
  *
- * This tells the low-level filesystem that a write is
- * about to be performed to it, and makes sure that
- * writes are allowed before returning success.  When
- * the write operation is finished, mnt_drop_write()
- * must be called.  This is effectively a refcount.
+ * This tells the low-level filesystem that a write is about to be performed to
+ * it, and makes sure that writes are allowed (mount is read-write) before
+ * returning success. This operation does not protect against filesystem being
+ * frozen. When the write operation is finished, __mnt_drop_write() must be
+ * called. This is effectively a refcount.
  */
-int mnt_want_write(struct vfsmount *mnt)
+int __mnt_want_write(struct vfsmount *m)
 {
 	int ret = 0;
 
 	preempt_disable();
-	mnt_inc_writers(mnt);
+	mnt_inc_writers(m);
 	/*
 	 * The store to mnt_inc_writers must be visible before we pass
 	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
 	 * incremented count after it has set MNT_WRITE_HOLD.
 	 */
 	smp_mb();
-	while (mnt->mnt_flags & MNT_WRITE_HOLD)
+	while (m->mnt_flags & MNT_WRITE_HOLD)
 		cpu_relax();
 	/*
 	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
@@ -353,13 +359,34 @@
 	 * MNT_WRITE_HOLD is cleared.
 	 */
 	smp_rmb();
-	if (__mnt_is_readonly(mnt)) {
-		mnt_dec_writers(mnt);
+	if (__mnt_is_readonly(m)) {
+		mnt_dec_writers(m);
 		ret = -EROFS;
 		goto out;
 	}
 out:
 	preempt_enable();
+
+	return ret;
+}
+
+/**
+ * mnt_want_write - get write access to a mount
+ * @m: the mount on which to take a write
+ *
+ * This tells the low-level filesystem that a write is about to be performed to
+ * it, and makes sure that writes are allowed (mount is read-write, filesystem
+ * is not frozen) before returning success.  When the write operation is
+ * finished, mnt_drop_write() must be called.  This is effectively a refcount.
+ */
+int mnt_want_write(struct vfsmount *m)
+{
+	int ret;
+
+	sb_start_write(m->mnt_sb);
+	ret = __mnt_want_write(m);
+	if (ret)
+		sb_end_write(m->mnt_sb);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
@@ -389,38 +416,76 @@
 EXPORT_SYMBOL_GPL(mnt_clone_write);
 
 /**
- * mnt_want_write_file - get write access to a file's mount
+ * __mnt_want_write_file - get write access to a file's mount
  * @file: the file who's mount on which to take a write
  *
- * This is like mnt_want_write, but it takes a file and can
+ * This is like __mnt_want_write, but it takes a file and can
  * do some optimisations if the file is open for write already
  */
-int mnt_want_write_file(struct file *file)
+int __mnt_want_write_file(struct file *file)
 {
 	struct inode *inode = file->f_dentry->d_inode;
+
 	if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode))
-		return mnt_want_write(file->f_path.mnt);
+		return __mnt_want_write(file->f_path.mnt);
 	else
 		return mnt_clone_write(file->f_path.mnt);
 }
+
+/**
+ * mnt_want_write_file - get write access to a file's mount
+ * @file: the file who's mount on which to take a write
+ *
+ * This is like mnt_want_write, but it takes a file and can
+ * do some optimisations if the file is open for write already
+ */
+int mnt_want_write_file(struct file *file)
+{
+	int ret;
+
+	sb_start_write(file->f_path.mnt->mnt_sb);
+	ret = __mnt_want_write_file(file);
+	if (ret)
+		sb_end_write(file->f_path.mnt->mnt_sb);
+	return ret;
+}
 EXPORT_SYMBOL_GPL(mnt_want_write_file);
 
 /**
- * mnt_drop_write - give up write access to a mount
+ * __mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
  *
  * Tells the low-level filesystem that we are done
  * performing writes to it.  Must be matched with
- * mnt_want_write() call above.
+ * __mnt_want_write() call above.
  */
-void mnt_drop_write(struct vfsmount *mnt)
+void __mnt_drop_write(struct vfsmount *mnt)
 {
 	preempt_disable();
 	mnt_dec_writers(mnt);
 	preempt_enable();
 }
+
+/**
+ * mnt_drop_write - give up write access to a mount
+ * @mnt: the mount on which to give up write access
+ *
+ * Tells the low-level filesystem that we are done performing writes to it and
+ * also allows the filesystem to be frozen again.  Must be matched with
+ * mnt_want_write() call above.
+ */
+void mnt_drop_write(struct vfsmount *mnt)
+{
+	__mnt_drop_write(mnt);
+	sb_end_write(mnt->mnt_sb);
+}
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
+void __mnt_drop_write_file(struct file *file)
+{
+	__mnt_drop_write(file->f_path.mnt);
+}
+
 void mnt_drop_write_file(struct file *file)
 {
 	mnt_drop_write(file->f_path.mnt);
@@ -2043,6 +2108,13 @@
 		return PTR_ERR(mnt);
 
 	err = do_add_mount(mnt, path, mnt_flags);
+#ifdef MY_ABC_HERE
+	if (!strcmp(type, "ext4")) {
+		char buf[SYNO_EXT4_MOUNT_PATH_LEN] = {'\0'};
+		ext4_fill_mount_path(mnt->mnt_sb, d_path(path, buf, sizeof(buf)));
+	}
+#endif
+
 	if (err)
 		mntput(mnt);
 	return err;
@@ -2309,6 +2381,9 @@
 	if ( 0 == gSynoInstallFlag &&
 			NULL != dev_name &&
 			strstr(dev_name, SYNO_USB_FLASH_DEVICE_PATH) &&
+#ifdef CONFIG_SYNO_DUAL_HEAD
+			!(1 == gSynoDualHead && strstr(dev_name, SYNO_DUALHEAD_SYSTEM_DEVICE_PATH)) &&
+#endif
 			gSynoHasDynModule) {
 		return -EINVAL;
 	}
diff -ur a/fs/ncpfs/file.c b/fs/ncpfs/file.c
--- a/fs/ncpfs/file.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ncpfs/file.c	2014-02-17 11:57:00.000000000 +0100
@@ -222,6 +222,10 @@
 
 	already_written = 0;
 
+	errno = file_update_time(file);
+	if (errno)
+		goto outrel;
+
 	bouncebuffer = vmalloc(bufsize);
 	if (!bouncebuffer) {
 		errno = -EIO;	/* -ENOMEM */
@@ -253,8 +257,6 @@
 	}
 	vfree(bouncebuffer);
 
-	file_update_time(file);
-
 	*ppos = pos;
 
 	if (pos > i_size_read(inode)) {
diff -ur a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
--- a/fs/ncpfs/inode.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/ncpfs/inode.c	2014-02-17 11:57:00.000000000 +0100
@@ -90,6 +90,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ncp_inode_cachep);
 }
 
diff -ur a/fs/nfs/inode.c b/fs/nfs/inode.c
--- a/fs/nfs/inode.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/nfs/inode.c	2014-02-17 11:56:59.000000000 +0100
@@ -1516,6 +1516,11 @@
 
 static void nfs_destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(nfs_inode_cachep);
 }
 
diff -ur a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
--- a/fs/nfsd/nfsfh.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/nfsd/nfsfh.c	2014-02-17 11:57:01.000000000 +0100
@@ -14,6 +14,10 @@
 #include "vfs.h"
 #include "auth.h"
 
+#ifdef CONFIG_FS_SYNO_ACL
+#include "../synoacl_int.h"
+#endif
+
 #define NFSDDBG_FACILITY		NFSDDBG_FH
 
 
@@ -39,8 +43,8 @@
 		int err;
 		parent = dget_parent(tdentry);
 #ifdef CONFIG_FS_SYNO_ACL
-		if (IS_SYNOACL(parent->d_inode)) {
-			err = parent->d_inode->i_op->syno_permission(parent, MAY_EXEC);
+		if (IS_SYNOACL(parent)) {
+			err = synoacl_op_perm(parent, MAY_EXEC);
 		} else 
 #endif /* CONFIG_FS_SYNO_ACL */
 		err = inode_permission(parent->d_inode, MAY_EXEC);
@@ -64,7 +68,7 @@
  * the write call).
  */
 static inline __be32
-nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, int requested)
+nfsd_mode_check(struct svc_rqst *rqstp, umode_t mode, umode_t requested)
 {
 	mode &= S_IFMT;
 
@@ -311,7 +315,7 @@
  * include/linux/nfsd/nfsd.h.
  */
 __be32
-fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access)
+fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, int access)
 {
 	struct svc_export *exp;
 	struct dentry	*dentry;
@@ -377,7 +381,7 @@
 skip_pseudoflavor_check:
 	/* Finally, check access permissions. */
 #ifdef CONFIG_FS_SYNO_ACL
-	if (IS_SYNOACL(dentry->d_inode) && CheckPermInFileSystem(access)) {
+	if (IS_SYNOACL(dentry) && CheckPermInFileSystem(access)) {
 		access |= NFSD_MAY_SYNO_NOP;
 	}
 #endif
diff -ur a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
--- a/fs/nfsd/nfsfh.h	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/nfsd/nfsfh.h	2014-02-17 11:57:01.000000000 +0100
@@ -102,7 +102,7 @@
 /*
  * Function prototypes
  */
-__be32	fh_verify(struct svc_rqst *, struct svc_fh *, int, int);
+__be32	fh_verify(struct svc_rqst *, struct svc_fh *, umode_t, int);
 __be32	fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *);
 __be32	fh_update(struct svc_fh *);
 void	fh_put(struct svc_fh *);
diff -ur a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
--- a/fs/nfsd/vfs.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/nfsd/vfs.c	2014-02-17 11:57:01.000000000 +0100
@@ -310,7 +310,7 @@
 	struct dentry	*dentry;
 	struct inode	*inode;
 	int		accmode = NFSD_MAY_SATTR;
-	int		ftype = 0;
+	umode_t		ftype = 0;
 	__be32		err;
 	int		host_err;
 	int		size_change = 0;
@@ -363,8 +363,8 @@
 			delta = -delta;
 #ifdef CONFIG_FS_SYNO_ACL
 		if (delta < MAX_TOUCH_TIME_ERROR) {
-			if (IS_SYNOACL(inode)) {
-				if (0 > inode->i_op->syno_inode_change_ok(dentry, iap)) {
+			if (IS_SYNOACL(dentry)) {
+				if (0 > synoacl_op_inode_chg_ok(dentry, iap)) {
 					iap->ia_valid &= ~BOTH_TIME_SET;
 				}
 			} else if (0 > inode_change_ok(inode, iap)){
@@ -712,7 +712,7 @@
 #ifdef CONFIG_FS_SYNO_ACL
 	inode = dentry->d_inode;
 	isFSInACLMode = IS_FS_SYNOACL(inode);
-	isInodeInACLMode = IS_INODE_SYNOACL(inode) && isFSInACLMode;
+	isInodeInACLMode = IS_INODE_SYNOACL(inode, dentry) && isFSInACLMode;
 
 	if (isInodeInACLMode) {
 		if (S_ISREG(inode->i_mode))
@@ -741,7 +741,7 @@
 #ifdef CONFIG_FS_SYNO_ACL
 			if (isInodeInACLMode){
 				if (inode->i_op) {
-					err2 = nfserrno(inode->i_op->syno_permission(dentry, map->how));
+					err2 = nfserrno(synoacl_op_perm(dentry, map->how));
 				} else {//impossible case
 					printk(KERN_WARNING "nfsd: (%s) is in acl mode but has no operator \n", dentry->d_iname);
 					err2 = nfs_ok;
@@ -794,7 +794,7 @@
  * N.B. After this call fhp needs an fh_put
  */
 __be32
-nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
+nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type,
 			int access, struct file **filp)
 {
 	struct dentry	*dentry;
@@ -2234,13 +2234,13 @@
 	 * with NFSv3.
 	 */
 #ifdef CONFIG_FS_SYNO_ACL
-	if (IS_SYNOACL(inode)) {
-		if ((acc & NFSD_MAY_OWNER_OVERRIDE) && is_synoacl_owner(inode))
+	if (IS_SYNOACL(dentry)) {
+		if ((acc & NFSD_MAY_OWNER_OVERRIDE) && is_synoacl_owner(dentry))
 			return 0;
 		if (acc & NFSD_MAY_SYNO_NOP) {
 			return 0;
 		}
-		err = inode->i_op->syno_permission(dentry, syno_acl_nfs_perm_switch(inode, acc));
+		err = synoacl_op_perm(dentry, syno_acl_nfs_perm_switch(inode, acc));
 	} else {
 #endif /* CONFIG_FS_SYNO_ACL */
 
@@ -2260,9 +2260,9 @@
 		acc == (NFSD_MAY_READ | NFSD_MAY_READ_IF_EXEC)))
 #ifdef CONFIG_FS_SYNO_ACL
 	{
-		if (IS_SYNOACL(inode))
-			err = inode->i_op->syno_permission(dentry, MAY_EXEC);
-		else {
+		if (IS_SYNOACL(dentry)){
+			err = synoacl_op_perm(dentry, MAY_EXEC);
+		} else {
 			err = inode_permission(inode, MAY_EXEC);
 		}
 	}
diff -ur a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
--- a/fs/nfsd/vfs.h	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/nfsd/vfs.h	2014-02-17 11:57:01.000000000 +0100
@@ -70,7 +70,7 @@
 __be32		nfsd_commit(struct svc_rqst *, struct svc_fh *,
 				loff_t, unsigned long);
 #endif /* CONFIG_NFSD_V3 */
-__be32		nfsd_open(struct svc_rqst *, struct svc_fh *, int,
+__be32		nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t,
 				int, struct file **);
 void		nfsd_close(struct file *);
 __be32 		nfsd_read(struct svc_rqst *, struct svc_fh *,
diff -ur a/fs/nilfs2/file.c b/fs/nilfs2/file.c
--- a/fs/nilfs2/file.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/nilfs2/file.c	2014-02-17 11:57:01.000000000 +0100
@@ -65,16 +65,18 @@
 	struct page *page = vmf->page;
 	struct inode *inode = vma->vm_file->f_dentry->d_inode;
 	struct nilfs_transaction_info ti;
-	int ret;
+	int ret = 0;
 
 	if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
 		return VM_FAULT_SIGBUS; /* -ENOSPC */
 
+	sb_start_pagefault(inode->i_sb);
 	lock_page(page);
 	if (page->mapping != inode->i_mapping ||
 	    page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
 		unlock_page(page);
-		return VM_FAULT_NOPAGE; /* make the VM retry the fault */
+		ret = -EFAULT;	/* make the VM retry the fault */
+		goto out;
 	}
 
 	/*
@@ -108,19 +110,21 @@
 	ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
 	/* never returns -ENOMEM, but may return -ENOSPC */
 	if (unlikely(ret))
-		return VM_FAULT_SIGBUS;
+		goto out;
 
-	ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
-	if (ret != VM_FAULT_LOCKED) {
+	ret = __block_page_mkwrite(vma, vmf, nilfs_get_block);
+	if (ret) {
 		nilfs_transaction_abort(inode->i_sb);
-		return ret;
+		goto out;
 	}
 	nilfs_set_file_dirty(inode, 1 << (PAGE_SHIFT - inode->i_blkbits));
 	nilfs_transaction_commit(inode->i_sb);
 
  mapped:
 	wait_on_page_writeback(page);
-	return VM_FAULT_LOCKED;
+ out:
+	sb_end_pagefault(inode->i_sb);
+	return block_page_mkwrite_return(ret);
 }
 
 static const struct vm_operations_struct nilfs_file_vm_ops = {
diff -ur a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
--- a/fs/nilfs2/ioctl.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/nilfs2/ioctl.c	2014-02-17 11:57:01.000000000 +0100
@@ -658,8 +658,6 @@
 		goto out_free;
 	}
 
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-
 	ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
 	if (ret < 0)
 		printk(KERN_ERR "NILFS: GC failed during preparation: "
diff -ur a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
--- a/fs/nilfs2/segment.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/nilfs2/segment.c	2014-02-17 11:57:01.000000000 +0100
@@ -189,7 +189,7 @@
 	if (ret > 0)
 		return 0;
 
-	vfs_check_frozen(sb, SB_FREEZE_WRITE);
+	sb_start_intwrite(sb);
 
 	nilfs = sb->s_fs_info;
 	down_read(&nilfs->ns_segctor_sem);
@@ -205,6 +205,7 @@
 	current->journal_info = ti->ti_save;
 	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
 		kmem_cache_free(nilfs_transaction_cachep, ti);
+	sb_end_intwrite(sb);
 	return ret;
 }
 
@@ -246,6 +247,7 @@
 		err = nilfs_construct_segment(sb);
 	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
 		kmem_cache_free(nilfs_transaction_cachep, ti);
+	sb_end_intwrite(sb);
 	return err;
 }
 
@@ -264,6 +266,7 @@
 	current->journal_info = ti->ti_save;
 	if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
 		kmem_cache_free(nilfs_transaction_cachep, ti);
+	sb_end_intwrite(sb);
 }
 
 void nilfs_relax_pressure_in_lock(struct super_block *sb)
@@ -2472,7 +2475,7 @@
 
 	if (freezing(current)) {
 		spin_unlock(&sci->sc_state_lock);
-		refrigerator();
+		try_to_freeze();
 		spin_lock(&sci->sc_state_lock);
 	} else {
 		DEFINE_WAIT(wait);
diff -ur a/fs/nilfs2/super.c b/fs/nilfs2/super.c
--- a/fs/nilfs2/super.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/nilfs2/super.c	2014-02-17 11:57:01.000000000 +0100
@@ -1389,6 +1389,12 @@
 
 static void nilfs_destroy_cachep(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
+
 	if (nilfs_inode_cachep)
 		kmem_cache_destroy(nilfs_inode_cachep);
 	if (nilfs_transaction_cachep)
diff -ur a/fs/ntfs/file.c b/fs/ntfs/file.c
--- a/fs/ntfs/file.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ntfs/file.c	2014-02-17 11:56:57.000000000 +0100
@@ -2084,7 +2084,6 @@
 	if (err)
 		return err;
 	pos = *ppos;
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
 	/* We can write back this queue in page reclaim. */
 	current->backing_dev_info = mapping->backing_dev_info;
 	written = 0;
@@ -2096,7 +2095,9 @@
 	err = file_remove_suid(file);
 	if (err)
 		goto out;
-	file_update_time(file);
+	err = file_update_time(file);
+	if (err)
+		goto out;
 	written = ntfs_file_buffered_write(iocb, iov, nr_segs, pos, ppos,
 			count);
 out:
@@ -2117,6 +2118,7 @@
 
 	BUG_ON(iocb->ki_pos != pos);
 
+	sb_start_write(inode->i_sb);
 	mutex_lock(&inode->i_mutex);
 	ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos);
 	mutex_unlock(&inode->i_mutex);
@@ -2125,6 +2127,7 @@
 		if (err < 0)
 			ret = err;
 	}
+	sb_end_write(inode->i_sb);
 	return ret;
 }
 
diff -ur a/fs/ntfs/super.c b/fs/ntfs/super.c
--- a/fs/ntfs/super.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ntfs/super.c	2014-02-17 11:56:57.000000000 +0100
@@ -3184,6 +3184,12 @@
 	ntfs_debug("Unregistering NTFS driver.");
 
 	unregister_filesystem(&ntfs_fs_type);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ntfs_big_inode_cache);
 	kmem_cache_destroy(ntfs_inode_cache);
 	kmem_cache_destroy(ntfs_name_cache);
diff -ur a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
--- a/fs/ocfs2/dlmfs/dlmfs.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ocfs2/dlmfs/dlmfs.c	2014-02-17 11:56:56.000000000 +0100
@@ -712,6 +712,11 @@
 	flush_workqueue(user_dlm_worker);
 	destroy_workqueue(user_dlm_worker);
 
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(dlmfs_inode_cache);
 
 	bdi_destroy(&dlmfs_backing_dev_info);
diff -ur a/fs/ocfs2/file.c b/fs/ocfs2/file.c
--- a/fs/ocfs2/file.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ocfs2/file.c	2014-02-17 11:56:56.000000000 +0100
@@ -1971,6 +1971,7 @@
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int ret;
 
 	if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
 	    !ocfs2_writes_unwritten_extents(osb))
@@ -1985,7 +1986,12 @@
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
 
-	return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+	ret = __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0);
+	mnt_drop_write_file(file);
+	return ret;
 }
 
 static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
@@ -2261,7 +2267,7 @@
 	if (iocb->ki_left == 0)
 		return 0;
 
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+	sb_start_write(inode->i_sb);
 
 	appending = file->f_flags & O_APPEND ? 1 : 0;
 	direct_io = file->f_flags & O_DIRECT ? 1 : 0;
@@ -2436,6 +2442,7 @@
 		ocfs2_iocb_clear_sem_locked(iocb);
 
 	mutex_unlock(&inode->i_mutex);
+	sb_end_write(inode->i_sb);
 
 	if (written)
 		ret = written;
@@ -2508,18 +2515,15 @@
 		ret = sd.num_spliced;
 
 	if (ret > 0) {
-		unsigned long nr_pages;
 		int err;
 
-		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
 		err = generic_write_sync(out, *ppos, ret);
 		if (err)
 			ret = err;
 		else
 			*ppos += ret;
 
-		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
+		balance_dirty_pages_ratelimited(mapping);
 	}
 
 	return ret;
diff -ur a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
--- a/fs/ocfs2/ioctl.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ocfs2/ioctl.c	2014-02-17 11:56:56.000000000 +0100
@@ -928,7 +928,12 @@
 		if (get_user(new_clusters, (int __user *)arg))
 			return -EFAULT;
 
-		return ocfs2_group_extend(inode, new_clusters);
+		status = mnt_want_write_file(filp);
+		if (status)
+			return status;
+		status = ocfs2_group_extend(inode, new_clusters);
+		mnt_drop_write_file(filp);
+		return status;
 	case OCFS2_IOC_GROUP_ADD:
 	case OCFS2_IOC_GROUP_ADD64:
 		if (!capable(CAP_SYS_RESOURCE))
@@ -937,7 +942,12 @@
 		if (copy_from_user(&input, (int __user *) arg, sizeof(input)))
 			return -EFAULT;
 
-		return ocfs2_group_add(inode, &input);
+		status = mnt_want_write_file(filp);
+		if (status)
+			return status;
+		status = ocfs2_group_add(inode, &input);
+		mnt_drop_write_file(filp);
+		return status;
 	case OCFS2_IOC_REFLINK:
 		if (copy_from_user(&args, (struct reflink_arguments *)arg,
 				   sizeof(args)))
diff -ur a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
--- a/fs/ocfs2/journal.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ocfs2/journal.c	2014-02-17 11:56:56.000000000 +0100
@@ -355,11 +355,14 @@
 	if (journal_current_handle())
 		return jbd2_journal_start(journal, max_buffs);
 
+	sb_start_intwrite(osb->sb);
+
 	down_read(&osb->journal->j_trans_barrier);
 
 	handle = jbd2_journal_start(journal, max_buffs);
 	if (IS_ERR(handle)) {
 		up_read(&osb->journal->j_trans_barrier);
+		sb_end_intwrite(osb->sb);
 
 		mlog_errno(PTR_ERR(handle));
 
@@ -388,8 +391,10 @@
 	if (ret < 0)
 		mlog_errno(ret);
 
-	if (!nested)
+	if (!nested) {
 		up_read(&journal->j_trans_barrier);
+		sb_end_intwrite(osb->sb);
+	}
 
 	return ret;
 }
diff -ur a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
--- a/fs/ocfs2/mmap.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ocfs2/mmap.c	2014-02-17 11:56:56.000000000 +0100
@@ -136,6 +136,7 @@
 	sigset_t oldset;
 	int ret;
 
+	sb_start_pagefault(inode->i_sb);
 	ocfs2_block_signals(&oldset);
 
 	/*
@@ -165,6 +166,7 @@
 
 out:
 	ocfs2_unblock_signals(&oldset);
+	sb_end_pagefault(inode->i_sb);
 	return ret;
 }
 
diff -ur a/fs/ocfs2/super.c b/fs/ocfs2/super.c
--- a/fs/ocfs2/super.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ocfs2/super.c	2014-02-17 11:56:56.000000000 +0100
@@ -1820,6 +1820,11 @@
 
 static void ocfs2_free_mem_caches(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	if (ocfs2_inode_cachep)
 		kmem_cache_destroy(ocfs2_inode_cachep);
 	ocfs2_inode_cachep = NULL;
diff -ur a/fs/open.c b/fs/open.c
--- a/fs/open.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/open.c	2014-02-17 11:56:58.000000000 +0100
@@ -32,8 +32,8 @@
 #include <linux/dnotify.h>
 #include "internal.h"
 
-#ifdef MY_ABC_HERE
-extern long __SYNOArchiveSet(struct dentry *dentry, unsigned int cmd);
+#ifdef CONFIG_FS_SYNO_ACL
+#include "synoacl_int.h"
 #endif
 
 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
@@ -92,8 +92,8 @@
 	if (error)
 		goto dput_and_out;
 #ifdef CONFIG_FS_SYNO_ACL
-	if (IS_SYNOACL(inode)) {
-		error = inode->i_op->syno_permission(path.dentry, MAY_WRITE);
+	if (IS_SYNOACL(path.dentry)) {
+		error = synoacl_op_perm(path.dentry, MAY_WRITE);
 	} else
 #endif
 	error = inode_permission(inode, MAY_WRITE);
@@ -171,11 +171,13 @@
 	if (IS_APPEND(inode))
 		goto out_putf;
 
+	sb_start_write(inode->i_sb);
 	error = locks_verify_truncate(inode, file, length);
 	if (!error)
 		error = security_path_truncate(&file->f_path);
 	if (!error)
 		error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
+	sb_end_write(inode->i_sb);
 out_putf:
 	fput(file);
 out:
@@ -273,7 +275,10 @@
 	if (!file->f_op->fallocate)
 		return -EOPNOTSUPP;
 
-	return file->f_op->fallocate(file, mode, offset, len);
+	sb_start_write(inode->i_sb);
+	ret = file->f_op->fallocate(file, mode, offset, len);
+	sb_end_write(inode->i_sb);
+	return ret;
 }
 
 SYSCALL_DEFINE(fallocate)(int fd, int mode, loff_t offset, loff_t len)
@@ -351,8 +356,8 @@
 			goto out_path_release;
 	}
 #ifdef CONFIG_FS_SYNO_ACL
-	if (IS_SYNOACL(inode)) {
-		res = inode->i_op->syno_access(path.dentry, mode | MAY_ACCESS);
+	if (IS_SYNOACL(path.dentry)) {
+		res = synoacl_op_access(path.dentry, mode | MAY_ACCESS);
 	} else
 #endif
 	res = inode_permission(inode, mode | MAY_ACCESS);
@@ -399,8 +404,8 @@
 
 #ifdef CONFIG_FS_SYNO_ACL
 	inode = path.dentry->d_inode;
-	if (IS_SYNOACL(inode)) {
-		error = inode->i_op->syno_permission(path.dentry, MAY_EXEC);
+	if (IS_SYNOACL(path.dentry)) {
+		error = synoacl_op_perm(path.dentry, MAY_EXEC);
 	} else {
 		error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
 	}
@@ -436,8 +441,8 @@
 		goto out_putf;
 
 #ifdef CONFIG_FS_SYNO_ACL
-	if (IS_SYNOACL(inode)) {
-		error = inode->i_op->syno_permission(file->f_path.dentry, MAY_EXEC);
+	if (IS_SYNOACL(file->f_path.dentry)) {
+		error = synoacl_op_perm(file->f_path.dentry, MAY_EXEC);
 	} else
 #endif
 	error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
@@ -463,8 +468,8 @@
 
 #ifdef CONFIG_FS_SYNO_ACL
 	inode = path.dentry->d_inode;
-	if (IS_SYNOACL(inode)) {
-		error = inode->i_op->syno_permission(path.dentry, MAY_EXEC);
+	if (IS_SYNOACL(path.dentry)) {
+		error = synoacl_op_perm(path.dentry, MAY_EXEC);
 	} else {
 		error = inode_permission(inode, MAY_EXEC | MAY_CHDIR);
 	}
@@ -571,39 +576,30 @@
 }
 
 #ifdef	MY_ABC_HERE
+extern long __SYNOArchiveSet(struct dentry *, unsigned int cmd);
+
 asmlinkage long sys_SYNOArchiveBit(const char * filename, int cmd)
 {
-	int isPathGet = 0;
 	struct path path;
-	struct inode * inode = NULL;
-	long error = -EINVAL;
+	long error;
+
+	if (SYNO_FCNTL_BASE > cmd || SYNO_FCNTL_LAST < cmd) {
+		printk(KERN_WARNING "Archive bit cmd:%x not implemented.\n", cmd);
+		return -EINVAL;
+	}
 
 	error = user_path_at(AT_FDCWD, filename, LOOKUP_FOLLOW, &path);
 	if (error)
-		goto out;
-
-	isPathGet = 1;
-
-	if (path.dentry && path.dentry->d_inode) {
-		inode = path.dentry->d_inode;
-	} else {
-		goto out;
-	}
+		return error;
 
-	if (inode->i_op && inode->i_op->set_archive) {
-		error = inode->i_op->set_archive(path.dentry, cmd);
-	} else {
-		error = __SYNOArchiveSet(path.dentry, cmd);
-	}
-	if (error) {
-		goto out;
-	}
+	error = mnt_want_write(path.mnt);
+	if (error)
+		goto out_release;
 
-	error = 0;
-out:
-	if (isPathGet) {
-		path_put(&path);
-	}
+	error = __SYNOArchiveSet(path.dentry, cmd);
+	mnt_drop_write(path.mnt);
+out_release:
+	path_put(&path);
 	return error;
 }
 #endif //MY_ABC_HERE
@@ -718,7 +714,7 @@
 		/*
 		 * Balanced in __fput()
 		 */
-		error = mnt_want_write(mnt);
+		error = __mnt_want_write(mnt);
 		if (error)
 			put_write_access(inode);
 	}
diff -ur a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
--- a/fs/openpromfs/inode.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/openpromfs/inode.c	2014-02-17 11:56:59.000000000 +0100
@@ -464,6 +464,11 @@
 static void __exit exit_openprom_fs(void)
 {
 	unregister_filesystem(&openprom_fs_type);
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(op_inode_cachep);
 }
 
diff -ur a/fs/partitions/check.c b/fs/partitions/check.c
--- a/fs/partitions/check.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/partitions/check.c	2014-02-17 11:57:00.000000000 +0100
@@ -136,6 +136,12 @@
 
 const char *bdevname(struct block_device *bdev, char *buf)
 {
+#ifdef MY_ABC_HERE
+	if (!bdev || !bdev->bd_part) {
+		printk(KERN_ERR "bdevname: bdev data should not be NULL\n");
+		return buf;
+	}
+#endif
 	return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
 }
 
diff -ur a/fs/pipe.c b/fs/pipe.c
--- a/fs/pipe.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/pipe.c	2014-02-17 11:57:00.000000000 +0100
@@ -653,8 +653,11 @@
 		wake_up_interruptible_sync_poll(&pipe->wait, POLLIN | POLLRDNORM);
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	}
-	if (ret > 0)
-		file_update_time(filp);
+	if (ret > 0) {
+		int err = file_update_time(filp);
+		if (err)
+			ret = err;
+	}
 	return ret;
 }
 
diff -ur a/fs/qnx4/inode.c b/fs/qnx4/inode.c
--- a/fs/qnx4/inode.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/qnx4/inode.c	2014-02-17 11:57:00.000000000 +0100
@@ -456,6 +456,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(qnx4_inode_cachep);
 }
 
Only in b/fs: qnx6.
diff -ur a/fs/reiserfs/super.c b/fs/reiserfs/super.c
--- a/fs/reiserfs/super.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/reiserfs/super.c	2014-02-17 11:56:56.000000000 +0100
@@ -569,6 +569,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(reiserfs_inode_cachep);
 }
 
diff -ur a/fs/romfs/super.c b/fs/romfs/super.c
--- a/fs/romfs/super.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/romfs/super.c	2014-02-17 11:57:01.000000000 +0100
@@ -650,6 +650,11 @@
 static void __exit exit_romfs_fs(void)
 {
 	unregister_filesystem(&romfs_fs_type);
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(romfs_inode_cachep);
 }
 
diff -ur a/fs/select.c b/fs/select.c
--- a/fs/select.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/select.c	2014-02-17 11:57:01.000000000 +0100
@@ -223,7 +223,7 @@
 	get_file(filp);
 	entry->filp = filp;
 	entry->wait_address = wait_address;
-	entry->key = p->key;
+	entry->key = p->_key;
 	init_waitqueue_func_entry(&entry->wait, pollwake);
 	entry->wait.private = pwq;
 	add_wait_queue(wait_address, &entry->wait);
@@ -386,13 +386,11 @@
 static inline void wait_key_set(poll_table *wait, unsigned long in,
 				unsigned long out, unsigned long bit)
 {
-	if (wait) {
-		wait->key = POLLEX_SET;
-		if (in & bit)
-			wait->key |= POLLIN_SET;
-		if (out & bit)
-			wait->key |= POLLOUT_SET;
-	}
+	wait->_key = POLLEX_SET;
+	if (in & bit)
+		wait->_key |= POLLIN_SET;
+	if (out & bit)
+		wait->_key |= POLLOUT_SET;
 }
 
 int do_select(int n, fd_set_bits *fds, struct timespec *end_time)
@@ -414,7 +412,7 @@
 	poll_initwait(&table);
 	wait = &table.pt;
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
-		wait = NULL;
+		wait->_qproc = NULL;
 		timed_out = 1;
 	}
 
@@ -459,17 +457,17 @@
 					if ((mask & POLLIN_SET) && (in & bit)) {
 						res_in |= bit;
 						retval++;
-						wait = NULL;
+						wait->_qproc = NULL;
 					}
 					if ((mask & POLLOUT_SET) && (out & bit)) {
 						res_out |= bit;
 						retval++;
-						wait = NULL;
+						wait->_qproc = NULL;
 					}
 					if ((mask & POLLEX_SET) && (ex & bit)) {
 						res_ex |= bit;
 						retval++;
-						wait = NULL;
+						wait->_qproc = NULL;
 					}
 				}
 			}
@@ -481,7 +479,7 @@
 				*rexp = res_ex;
 			cond_resched();
 		}
-		wait = NULL;
+		wait->_qproc = NULL;
 		if (retval || timed_out || signal_pending(current))
 			break;
 		if (table.error) {
@@ -720,7 +718,7 @@
  * interested in events matching the pollfd->events mask, and the result
  * matching that mask is both recorded in pollfd->revents and returned. The
  * pwait poll_table will be used by the fd-provided poll handler for waiting,
- * if non-NULL.
+ * if pwait->_qproc is non-NULL.
  */
 static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait)
 {
@@ -738,9 +736,7 @@
 		if (file != NULL) {
 			mask = DEFAULT_POLLMASK;
 			if (file->f_op && file->f_op->poll) {
-				if (pwait)
-					pwait->key = pollfd->events |
-							POLLERR | POLLHUP;
+				pwait->_key = pollfd->events|POLLERR|POLLHUP;
 				mask = file->f_op->poll(file, pwait);
 			}
 			/* Mask out unneeded events. */
@@ -763,7 +759,7 @@
 
 	/* Optimise the no-wait case */
 	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
-		pt = NULL;
+		pt->_qproc = NULL;
 		timed_out = 1;
 	}
 
@@ -781,22 +777,22 @@
 			for (; pfd != pfd_end; pfd++) {
 				/*
 				 * Fish for events. If we found one, record it
-				 * and kill the poll_table, so we don't
+				 * and kill poll_table->_qproc, so we don't
 				 * needlessly register any other waiters after
 				 * this. They'll get immediately deregistered
 				 * when we break out and return.
 				 */
 				if (do_pollfd(pfd, pt)) {
 					count++;
-					pt = NULL;
+					pt->_qproc = NULL;
 				}
 			}
 		}
 		/*
 		 * All waiters have already been registered, so don't provide
-		 * a poll_table to them on the next loop iteration.
+		 * a poll_table->_qproc to them on the next loop iteration.
 		 */
-		pt = NULL;
+		pt->_qproc = NULL;
 		if (!count) {
 			count = wait->error;
 			if (signal_pending(current))
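
The select/poll hunks above stop discarding the poll_table pointer once a ready descriptor is found and instead clear only its _qproc callback, while do_pollfd() and do_select() pass the requested events through _key. Driver poll methods need no change, because after this rework poll_wait() skips registration whenever _qproc is NULL. A sketch of an unchanged driver-side poll method, with a hypothetical example_dev:

struct example_dev {
	wait_queue_head_t wq;
	int data_ready;
};

static unsigned int example_poll(struct file *file, poll_table *wait)
{
	struct example_dev *dev = file->private_data;
	unsigned int mask = 0;

	/* registers on dev->wq only if wait && wait->_qproc are set */
	poll_wait(file, &dev->wq, wait);

	if (dev->data_ready)
		mask |= POLLIN | POLLRDNORM;
	return mask;
}
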
diff -ur a/fs/splice.c b/fs/splice.c
--- a/fs/splice.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/splice.c	2014-02-17 11:57:00.000000000 +0100
@@ -32,6 +32,9 @@
 #include <linux/security.h>
 #include <linux/gfp.h>
 #include <linux/socket.h>
+#if defined(CONFIG_SYNO_COMCERTO)
+#include <linux/time.h>
+#endif
 #if defined(CONFIG_SYNO_ARMADA)
 #include <net/sock.h>
 #include <linux/net.h>
@@ -873,6 +876,380 @@
 }
 EXPORT_SYMBOL(splice_from_pipe_feed);
 
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_IMPROVED_SPLICE)
+#if !defined(CONFIG_COMCERTO_SPLICE_USE_MDMA)
+#define MSPD_SPLICE_NUM_DMA		100
+#else
+#define MSPD_SPLICE_NUM_DMA		MDMA_OUTBOUND_BUF_DESC
+#endif
+
+#if defined(CONFIG_COMCERTO_SPLICE_PROF)
+unsigned int enable_splice_prof = 0;
+#endif
+
+int comcerto_splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd)
+{
+	struct page **mspd_splice_pages;
+	void **mspd_splice_fsdata;
+	struct pipe_buffer *buf;
+	const struct pipe_buf_operations *ops;
+	int ret, ret2 = 0, remaining;
+	unsigned int curbuf, nrbufs, len, nrbufs_len, done;
+	loff_t pos, offset;
+	struct file *file = sd->u.file;
+	struct address_space *mapping = file->f_mapping;
+	struct page **page;
+	void **fsdata;
+	unsigned int size;
+#if !defined(CONFIG_COMCERTO_SPLICE_USE_MDMA)
+	unsigned int buf_len, buf_offset;
+	char *src, *dst;
+#else
+	struct comcerto_dma_sg *sg;
+#endif
+
+	size = (sizeof(struct page *) + sizeof(void *)) * MSPD_SPLICE_NUM_DMA;
+
+#if defined(CONFIG_COMCERTO_SPLICE_USE_MDMA)
+	size = ALIGN(size, 8) + sizeof(struct comcerto_dma_sg);
+#endif
+
+	mspd_splice_pages = kmalloc(size, GFP_KERNEL);
+	if (!mspd_splice_pages)
+		return -ENOMEM;
+
+	mspd_splice_fsdata = (void **)(mspd_splice_pages + MSPD_SPLICE_NUM_DMA);
+
+#if defined(CONFIG_COMCERTO_SPLICE_USE_MDMA)
+	sg = (struct comcerto_dma_sg *)(mspd_splice_fsdata + MSPD_SPLICE_NUM_DMA);
+	sg = PTR_ALIGN(sg, 8);
+#endif
+
+start:
+#if defined(CONFIG_COMCERTO_SPLICE_USE_MDMA)
+	comcerto_dma_sg_init(sg);
+#endif
+
+	//Compute length to transfer (in bytes), and make sure data is there
+	nrbufs_len = 0;
+	nrbufs = pipe->nrbufs;
+	curbuf = pipe->curbuf;
+	while (nrbufs) {
+		buf = pipe->bufs + curbuf;
+
+		ret = buf->ops->confirm(pipe, buf);
+		if (unlikely(ret)) {
+			printk(KERN_WARNING "%s: buf->ops->confirm() failed(%d)\n", __func__, ret);
+			if (ret == -ENODATA)
+				ret = 0;
+			goto err;
+		}
+
+#if defined(CONFIG_COMCERTO_SPLICE_USE_MDMA)
+		// Is there a risk of getting the same page more than once (several buffers in a single page)?
+		ret = comcerto_dma_sg_add_input(sg, buf->page, buf->offset, buf->len, 0);
+		if (unlikely(ret)) {
+			printk(KERN_WARNING "%s: out of input bdescs\n", __func__);
+			break; //We will transfer what we could up to the previous buffer, based on nrbufs_len
+		}
+#endif
+
+		nrbufs_len += buf->len;
+
+		if (nrbufs_len > sd->total_len) {
+			nrbufs_len = sd->total_len;
+			break;
+		}
+
+		// - 2 because first and last pages could be almost empty depending on alignment
+		if (nrbufs_len > (MSPD_SPLICE_NUM_DMA - 2)*PAGE_CACHE_SIZE) {
+			nrbufs_len = (MSPD_SPLICE_NUM_DMA - 2)*PAGE_CACHE_SIZE;
+			break;
+		}
+		curbuf = (curbuf + 1) & (pipe->buffers - 1);
+		nrbufs--;
+	}
+
+	if (unlikely(nrbufs_len == 0)) {
+		printk(KERN_WARNING "%s: nrbufs_len == 0\n", __func__);
+		ret = 0;
+		goto err;
+	}
+
+//	printk("BLA nrbufs_len: %d\n", nrbufs_len);
+
+	/* Allocate as many destination pages as needed.
+	 * First and last pages are likely not to be filled, but the ones in-between will.
+	 * If some allocations fail, finish the work on the allocated pages.
+	 */
+	page = &mspd_splice_pages[0];
+	fsdata = &mspd_splice_fsdata[0];
+
+	pos = sd->pos;
+	offset = pos & ~PAGE_CACHE_MASK;
+	len = nrbufs_len;
+
+	if (likely(len + offset > PAGE_CACHE_SIZE))
+		len = PAGE_CACHE_SIZE - offset;
+
+	ret = pagecache_write_begin(file, mapping, pos, len,
+			AOP_FLAG_UNINTERRUPTIBLE, page, fsdata);
+	if (unlikely(ret))
+		goto err;		// We failed early, so we still have an easy way out
+
+#if defined(CONFIG_COMCERTO_SPLICE_USE_MDMA)
+	comcerto_dma_sg_add_output(sg, *page, offset, len, 1); //Don't check result since we should have at least one entry at this point
+#endif
+
+	pos += len;
+	remaining = nrbufs_len - len;
+	page++;
+	fsdata++;
+
+	while (remaining > PAGE_CACHE_SIZE) {
+		ret = pagecache_write_begin(file, mapping, pos, PAGE_CACHE_SIZE,
+				AOP_FLAG_UNINTERRUPTIBLE, page, fsdata);
+
+		if (unlikely(ret))
+			goto write_begin_done;
+
+#if defined(CONFIG_COMCERTO_SPLICE_USE_MDMA)
+		ret = comcerto_dma_sg_add_output(sg, *page, 0, PAGE_CACHE_SIZE, 1);
+		if (unlikely(ret)) {
+			pagecache_write_end(file, mapping, pos, PAGE_CACHE_SIZE, 0, *page, *fsdata);
+			goto write_begin_done;
+		}
+#endif
+		pos += PAGE_CACHE_SIZE;
+		remaining -= PAGE_CACHE_SIZE;
+		page++;
+		fsdata++;
+	}
+
+	if (remaining) {
+		ret = pagecache_write_begin(file, mapping, pos, remaining,
+						AOP_FLAG_UNINTERRUPTIBLE, page, fsdata);
+
+		if (unlikely(ret))
+			goto write_begin_done;
+
+#if defined(CONFIG_COMCERTO_SPLICE_USE_MDMA)
+		ret = comcerto_dma_sg_add_output(sg, *page, 0, remaining, 1);
+		if (unlikely(ret)) {
+			pagecache_write_end(file, mapping, pos, remaining, 0, *page, *fsdata);
+			goto write_begin_done;
+		}
+#endif
+		remaining = 0;
+	}
+
+write_begin_done:
+	// Couldn't allocate all pages or bdescs, so update the total length accordingly
+	if (unlikely(remaining))
+		nrbufs_len = nrbufs_len - remaining;
+
+	//Now do the copies
+#if defined(CONFIG_COMCERTO_SPLICE_USE_MDMA)
+
+	comcerto_dma_get();
+
+	comcerto_dma_sg_setup(sg, nrbufs_len);
+
+	comcerto_dma_start();
+	comcerto_dma_wait();
+	comcerto_dma_put();
+
+	comcerto_dma_sg_cleanup(sg, nrbufs_len);
+#else
+	remaining = nrbufs_len;
+	curbuf = pipe->curbuf;
+	buf = pipe->bufs + curbuf;
+	buf_len = buf->len;
+	buf_offset = buf->offset;
+	src = buf->ops->map(pipe, buf, 1);
+	pos = sd->pos;
+	offset = pos & ~PAGE_CACHE_MASK;
+	page = &mspd_splice_pages[0];
+	dst = kmap_atomic(*page, KM_USER1);
+
+	while (remaining) {
+		len = remaining;
+		if (len + offset > PAGE_CACHE_SIZE)
+			len = PAGE_CACHE_SIZE - offset;
+		if (len > buf_len)
+			len = buf_len;
+
+		memcpy(dst + offset, src + buf_offset, len);
+
+		buf_len -= len;
+		buf_offset += len;
+		remaining -= len;
+		pos += len;
+		offset = pos & ~PAGE_CACHE_MASK;
+
+		if (!offset) {
+			/* FIXME if this was the last page we should still flush/unmap, even if it's not a full page */
+			/* ... actually it looks ok, the unmap is done outside the loop */
+			flush_dcache_page(*page);
+			kunmap_atomic(dst, KM_USER1);
+			if (remaining) {
+				page++;
+				dst = kmap_atomic(*page, KM_USER1);
+			}
+		}
+
+		if (!buf_len) {
+			buf->ops->unmap(pipe, buf, src);
+			if (remaining) {
+				curbuf = (curbuf + 1) & (pipe->buffers - 1);
+				buf = pipe->bufs + curbuf;
+				buf_len = buf->len;
+				buf_offset = buf->offset;
+				src = buf->ops->map(pipe, buf, 1);
+			}
+		}
+	}
+
+	if (offset) {
+		flush_dcache_page(*page);
+		kunmap_atomic(dst, KM_USER1);
+	}
+
+	if (buf_len)
+		buf->ops->unmap(pipe, buf, src);
+#endif
+
+
+	//loop on write_end, update sd fields
+	page = &mspd_splice_pages[0];
+	fsdata = &mspd_splice_fsdata[0];
+	offset = sd->pos & ~PAGE_CACHE_MASK;
+	pos = sd->pos;
+	remaining = nrbufs_len;
+	len = nrbufs_len;
+	done = 0;
+
+	if (likely(len + offset > PAGE_CACHE_SIZE))
+		len = PAGE_CACHE_SIZE - offset;
+
+	ret = pagecache_write_end(file, mapping, pos, len, len,
+			*page, *fsdata);
+
+	/* In case of error or short write we need to report error to the caller */
+	/* If there was already a previous error, just continue doing the pagecache_write_end() cleanup */
+	/* Otherwise keep track of how many bytes we have successfully written and that an error happened */
+	if (unlikely(ret != len)) {
+		printk(KERN_ERR "Failed on write_end, continuing with other buffers\n");
+
+		/* Only report error to caller if nothing has been done */
+		ret2 = ret;
+		nrbufs_len = (ret > 0) ? ret: 0;
+	}
+
+	pos += len;
+	done += len;
+	remaining -= len;
+
+	page++;
+	fsdata++;
+
+	while (remaining > PAGE_CACHE_SIZE) {
+		ret = pagecache_write_end(file, mapping, pos, PAGE_CACHE_SIZE, PAGE_CACHE_SIZE,
+				*page, *fsdata);
+
+		if (unlikely((ret != PAGE_CACHE_SIZE) && !ret2)) {
+			printk(KERN_ERR "Failed on write_end, continuing with other buffers\n");
+
+			nrbufs_len = done;
+
+			if (ret >= 0)
+				nrbufs_len += ret;
+
+			ret2 = nrbufs_len;
+		}
+
+		pos += PAGE_CACHE_SIZE;
+		done += PAGE_CACHE_SIZE;
+		remaining -= PAGE_CACHE_SIZE;
+
+		page++;
+		fsdata++;
+	}
+
+	if (remaining) {
+		ret = pagecache_write_end(file, mapping, pos, remaining, remaining,
+					*page, *fsdata);
+
+		if (unlikely((ret != remaining) && !ret2)) {
+			printk(KERN_ERR "Failed on write_end, continuing with other buffers\n");
+
+			nrbufs_len = done;
+
+			if (ret >= 0)
+				nrbufs_len += ret;
+
+			ret2 = nrbufs_len;
+		}
+	}
+
+	sd->num_spliced += nrbufs_len;
+	sd->len -= nrbufs_len;
+	sd->pos += nrbufs_len;
+	sd->total_len -= nrbufs_len;
+
+	//loop on pipe buffers to release them
+	remaining = nrbufs_len;
+	buf = pipe->bufs + pipe->curbuf;
+
+	while (remaining && (remaining >= buf->len)) {
+		ops = buf->ops;
+
+		remaining -= buf->len;
+		buf->len = 0;
+		buf->ops = NULL;
+		ops->release(pipe, buf);
+		pipe->nrbufs--;
+		pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+		buf = pipe->bufs + pipe->curbuf;
+	}
+
+	// Last buffer, might not be empty
+	if (remaining) {
+		buf->len -= remaining;
+		buf->offset += remaining;
+	}
+
+	if (pipe->inode)
+		sd->need_wakeup = true;
+
+	if (!sd->total_len) {
+		kfree(mspd_splice_pages);
+		return 0;
+	}
+
+	if (ret2) {
+		if (ret2 > 0)
+			ret = 0;
+		else
+			ret = ret2;
+
+		goto err;
+	}
+
+	if (pipe->nrbufs)
+		goto start;
+
+	ret = 1;
+
+err:
+	kfree(mspd_splice_pages);
+
+	return ret;
+}
+EXPORT_SYMBOL(comcerto_splice_from_pipe_feed);
+#endif
+
 /**
  * splice_from_pipe_next - wait for some data to splice from
  * @pipe:	pipe to splice from
@@ -1033,6 +1410,8 @@
 	};
 	ssize_t ret;
 
+	sb_start_write(inode->i_sb);
+
 	pipe_lock(pipe);
 
 	splice_from_pipe_begin(&sd);
@@ -1046,9 +1425,98 @@
 		if (!ret) {
 #if defined(CONFIG_SYNO_ARMADA)
 #else
-			file_update_time(out);
+			ret = file_update_time(out);
+#endif
+			if (!ret)
+				ret = splice_from_pipe_feed(pipe, &sd,
+							    pipe_to_file);
+		}
+		mutex_unlock(&inode->i_mutex);
+	} while (ret > 0);
+	splice_from_pipe_end(pipe, &sd);
+
+	pipe_unlock(pipe);
+
+	if (sd.num_spliced)
+		ret = sd.num_spliced;
+
+	if (ret > 0) {
+		int err;
+
+		err = generic_write_sync(out, *ppos, ret);
+		if (err)
+			ret = err;
+		else
+			*ppos += ret;
+		balance_dirty_pages_ratelimited(mapping);
+	}
+	sb_end_write(inode->i_sb);
+
+	return ret;
+}
+
+EXPORT_SYMBOL(generic_file_splice_write);
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_SPLICE_PROF)
+unsigned int splicew_time_counter[256];
+unsigned int splicew_reqtime_counter[256];
+unsigned int splicew_data_counter[256];
+static struct timeval last_splicew;
+unsigned int init_splicew_prof = 0;
 #endif
-			ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file);
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_IMPROVED_SPLICE)
+ssize_t
+comcerto_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
+			  loff_t *ppos, size_t len, unsigned int flags)
+{
+	struct address_space *mapping = out->f_mapping;
+	struct inode *inode = mapping->host;
+	struct splice_desc sd = {
+		.total_len = len,
+		.flags = flags,
+		.pos = *ppos,
+		.u.file = out,
+	};
+	ssize_t ret;
+#if defined(CONFIG_COMCERTO_SPLICE_PROF)
+	struct timeval now;
+	int diff_time_ms;
+#endif
+
+	pipe_lock(pipe);
+
+#if defined(CONFIG_COMCERTO_SPLICE_PROF)
+	if (enable_splice_prof) {
+		do_gettimeofday(&now);
+		if (init_splicew_prof) {
+			diff_time_ms = ((now.tv_sec - last_splicew.tv_sec) * 1000) + ((now.tv_usec - last_splicew.tv_usec) / 1000);
+			if (diff_time_ms < 1000) {
+				splicew_time_counter[diff_time_ms >> 3]++;
+			}
+			else {
+				splicew_time_counter[255]++;
+			}
+		}
+		last_splicew = now;
+		if (len < (1 <<21))
+			splicew_data_counter[(len >> 13) & 0xFF]++;
+		else
+			splicew_data_counter[255]++;
+	}
+#endif
+
+	splice_from_pipe_begin(&sd);
+	do {
+		ret = splice_from_pipe_next(pipe, &sd);
+		if (ret <= 0)
+			break;
+
+		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		ret = file_remove_suid(out);
+		if (!ret) {
+			file_update_time(out);
+			ret = comcerto_splice_from_pipe_feed(pipe, &sd);
 		}
 		mutex_unlock(&inode->i_mutex);
 	} while (ret > 0);
@@ -1070,13 +1538,36 @@
 			ret = err;
 		else
 			*ppos += ret;
+#if defined(CONFIG_SYNO_COMCERTO)
+		balance_dirty_pages_ratelimited(mapping);
+#else
 		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
+#endif
+
 	}
 
+#if defined(CONFIG_COMCERTO_SPLICE_PROF)
+	if (enable_splice_prof) {
+		do_gettimeofday(&now);
+		
+		diff_time_ms = ((now.tv_sec - last_splicew.tv_sec) * 1000) + ((now.tv_usec - last_splicew.tv_usec) / 1000);
+		if (diff_time_ms < 1000) {//Don't record useless data
+			splicew_reqtime_counter[diff_time_ms >> 3]++;
+		}
+		else
+			splicew_reqtime_counter[255]++;
+
+		if(!init_splicew_prof)
+			init_splicew_prof = 1;
+
+		last_splicew = now;
+	}
+#endif
 	return ret;
 }
 
-EXPORT_SYMBOL(generic_file_splice_write);
+EXPORT_SYMBOL(comcerto_file_splice_write);
+#endif
 
 static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 			  struct splice_desc *sd)
@@ -1719,7 +2210,7 @@
 	}
 	mutex_lock(&inode->i_mutex);
 
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+	sb_start_write(inode->i_sb);
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = mapping->backing_dev_info;
@@ -1734,6 +2225,7 @@
 	if (unlikely(!rcv_pool || !kvec_pool))
 	{
 		printk(KERN_ERR "rcv_pool %p kvec_pool %p uninitialized %d\n", rcv_pool, kvec_pool);
+		sb_end_write(inode->i_sb);
 		return -ENOMEM;
 	}
 
@@ -1743,6 +2235,7 @@
 	if (!rv_cb || !iov)
 	{
 		printk(KERN_ERR "Failed to get pool mem for %d pages (rv_cb %p iov %p)\n", page_cnt_est, rv_cb, iov);
+		sb_end_write(inode->i_sb);
 		return -ENOMEM;
 	}
 
@@ -1813,7 +2306,7 @@
 			printk("%s: write_end fail,ret = %d\n", __func__, ret);
 		count += rv_cb[i].rv_count;
 	}
-	balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
+	balance_dirty_pages_ratelimited(mapping);
 	if (copy_to_user(ppos, &pos, sizeof(loff_t)))
 		err = -EFAULT;
 done:
@@ -1822,6 +2315,7 @@
 	common_mempool_free(kvec_pool, (void*)iov);
 
 	mutex_unlock(&inode->i_mutex);
+	sb_end_write(inode->i_sb);
 	return err ? err : count;
 cleanup:
 	for(i = 0; i < nr_pages; i++) {
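
comcerto_splice_from_pipe_feed() added above is essentially a batched version of the usual page-cache copy loop: pagecache_write_begin(), copy into the page, pagecache_write_end(), advancing one page-sized chunk at a time. A stripped-down sketch of that core loop; example_write_pagecache() is hypothetical, the source is assumed to be a kernel buffer, and the MDMA path is ignored:

static int example_write_pagecache(struct file *file, loff_t pos,
				   const char *src, size_t len)
{
	struct address_space *mapping = file->f_mapping;

	while (len) {
		unsigned int offset = pos & ~PAGE_CACHE_MASK;
		unsigned int chunk = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
		struct page *page;
		void *fsdata;
		char *dst;
		int ret;

		ret = pagecache_write_begin(file, mapping, pos, chunk,
					    AOP_FLAG_UNINTERRUPTIBLE,
					    &page, &fsdata);
		if (ret)
			return ret;

		/* same mapping style as the patch above */
		dst = kmap_atomic(page, KM_USER0);
		memcpy(dst + offset, src, chunk);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER0);

		ret = pagecache_write_end(file, mapping, pos, chunk, chunk,
					  page, fsdata);
		if (ret != chunk)
			return ret < 0 ? ret : -EIO;	/* treat short writes as errors */

		pos += chunk;
		src += chunk;
		len -= chunk;
	}
	return 0;
}
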
diff -ur a/fs/squashfs/super.c b/fs/squashfs/super.c
--- a/fs/squashfs/super.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/squashfs/super.c	2014-02-17 11:57:01.000000000 +0100
@@ -421,6 +421,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(squashfs_inode_cachep);
 }
 
diff -ur a/fs/stack.c b/fs/stack.c
--- a/fs/stack.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/stack.c	2014-02-17 11:57:00.000000000 +0100
@@ -83,10 +83,3 @@
 }
 EXPORT_SYMBOL_GPL(fsstack_copy_attr_all);
 
-#ifdef MY_ABC_HERE
-void fsstack_copy_syno_archive(struct inode *dest, const struct inode *src)
-{
-	dest->i_mode2 = src->i_mode2;
-}
-EXPORT_SYMBOL_GPL(fsstack_copy_syno_archive);
-#endif
diff -ur a/fs/stat.c b/fs/stat.c
--- a/fs/stat.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/stat.c	2014-02-17 11:57:01.000000000 +0100
@@ -18,6 +18,10 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
+#ifdef CONFIG_FS_SYNO_ACL
+#include "synoacl_int.h"
+#endif
+
 #ifdef MY_ABC_HERE
 #include <linux/synolib.h>
 extern int SynoDebugFlag;
@@ -29,12 +33,6 @@
 	stat->dev = inode->i_sb->s_dev;
 	stat->ino = inode->i_ino;
 	stat->mode = inode->i_mode;
-#ifdef MY_ABC_HERE
-	stat->SynoMode = inode->i_mode2;
-#endif
-#ifdef MY_ABC_HERE
-	stat->syno_archive_version = inode->i_archive_version;
-#endif
 	stat->nlink = inode->i_nlink;
 	stat->uid = inode->i_uid;
 	stat->gid = inode->i_gid;
@@ -43,9 +41,6 @@
 	stat->atime = inode->i_atime;
 	stat->mtime = inode->i_mtime;
 	stat->ctime = inode->i_ctime;
-#ifdef MY_ABC_HERE
-	stat->SynoCreateTime = inode->i_CreateTime;
-#endif
 	stat->blksize = (1 << inode->i_blkbits);
 	stat->blocks = inode->i_blocks;
 }
@@ -61,10 +56,26 @@
 	if (retval)
 		return retval;
 
+#ifdef CONFIG_FS_SYNO_ACL
+	if (IS_SYNOACL(dentry)) {
+		if (inode->i_op->getattr) {
+			if (0 != (retval = inode->i_op->getattr(mnt, dentry, stat)))
+				return retval;
+		} else {
+			generic_fillattr(inode, stat);
+		}
+
+		synoacl_op_to_mode(dentry, stat);
+
+		return 0;
+	}
+#endif //CONFIG_FS_SYNO_ACL
+
 	if (inode->i_op->getattr)
 		return inode->i_op->getattr(mnt, dentry, stat);
 
 	generic_fillattr(inode, stat);
+
 	return 0;
 }
 
@@ -123,6 +134,72 @@
 }
 EXPORT_SYMBOL(vfs_lstat);
 
+#ifdef MY_ABC_HERE
+
+int __always_inline syno_vfs_getattr(struct path *path, struct kstat *stat, int stat_flags)
+{
+	int error = 0;
+
+	error = vfs_getattr(path->mnt, path->dentry, stat);
+	if ((!error) && stat_flags) {
+		struct inode *inode = path->dentry->d_inode;
+		int is_support = 1;
+
+		if (inode->i_op->syno_getattr) {
+			error = inode->i_op->syno_getattr(path->dentry, stat, stat_flags);
+			if (-EOPNOTSUPP == error) {
+				is_support = 0;
+				error = 0;
+			}
+		} else 
+			is_support = 0;
+
+		if (!is_support) {
+#ifdef MY_ABC_HERE
+			stat->SynoCreateTime = inode->i_CreateTime;
+#endif
+#ifdef MY_ABC_HERE
+			stat->SynoMode = inode->i_mode2;
+#endif
+#ifdef MY_ABC_HERE
+			stat->syno_archive_version = inode->i_archive_version;
+#endif
+		}
+	}
+	return error;
+}
+
+// copy from vfs_fstat
+int syno_vfs_fstat(unsigned int fd, struct kstat *stat, int stat_flags)
+{
+	int fput_needed;
+	struct file *f = fget_raw_light(fd, &fput_needed);
+	int error = -EBADF;
+
+	if (f) {
+		error = syno_vfs_getattr(&(f->f_path), stat, stat_flags);
+		fput_light(f, fput_needed);
+	}
+	return error;
+}
+EXPORT_SYMBOL(syno_vfs_fstat);
+
+int syno_vfs_stat(const char __user *name, struct kstat *stat, int flags, int stat_flags)
+{
+	struct path path;
+	int error;
+
+	error = user_path_at(AT_FDCWD, name, flags, &path);
+	if (error)
+		goto out;
+
+	error = syno_vfs_getattr(&path, stat, stat_flags);
+	path_put(&path);
+out:
+	return error;
+}
+EXPORT_SYMBOL(syno_vfs_stat);
+#endif
 
 #ifdef __ARCH_WANT_OLD_STAT
 
@@ -283,7 +360,14 @@
 	if (!file) {
 		return ret;
 	}
+
+	ret = mnt_want_write(file->f_vfsmnt);
+	if (ret)
+		goto fput_out;
+
 	ret = __SYNOArchiveOverwrite(file->f_path.dentry, flags);
+	mnt_drop_write(file->f_vfsmnt);
+fput_out:
 	fput_light(file, fput_needed);
 	return ret;
 }
@@ -498,7 +582,7 @@
  * The filename will be convert to real filename and return to user space.
  * In caller, the length of filename must equal or be larger than SYNO_SMB_PSTRING_LEN.
 */
-int __SYNOCaselessStat(char __user * filename, int isLink, struct kstat *stat, int *lastComponent)
+int __SYNOCaselessStat(char __user * filename, int nofollowLink, struct kstat *stat, int *lastComponent, int flags)
 {
 	struct path path;
 	int error;
@@ -516,14 +600,14 @@
 		printk("%s(%d) orig name:[%s] len:[%u]\n", __FUNCTION__, __LINE__, filename, (unsigned int)strlen(filename));
 	}
 #endif
-	if (isLink) {
+	if (nofollowLink) {
 		f = LOOKUP_CASELESS_COMPARE;
 	} else {
 		f = LOOKUP_FOLLOW|LOOKUP_CASELESS_COMPARE;
 	}
 	error = syno_user_path_at(AT_FDCWD, filename, f, &path, &real_filename, &real_filename_len, lastComponent);
 	if (!error) {
-		error = vfs_getattr(path.mnt, path.dentry, stat);
+		error = syno_vfs_getattr(&path, stat, flags);
 		path_put(&path);
 		if (real_filename_len) {
 			error = copy_to_user(filename, real_filename, real_filename_len) ? -EFAULT : error;
@@ -542,7 +626,7 @@
 
 	kfree(real_filename);
 #ifdef MY_ABC_HERE
-	if(!isLink && syno_hibernation_log_sec > 0) {
+	if(!nofollowLink && syno_hibernation_log_sec > 0) {
 		syno_do_hibernation_log(filename);
 	}
 #endif
@@ -557,7 +641,7 @@
 	long error = -1;
 	struct kstat stat;
 
-	error = __SYNOCaselessStat(filename, 0, &stat, &lastComponent);
+	error = __SYNOCaselessStat(filename, 0, &stat, &lastComponent, 0);
 	if (!error) {
 		error = cp_new_stat(&stat, statbuf);
 	}
@@ -571,7 +655,7 @@
 	long error = -1;
 	struct kstat stat;
 
-	error = __SYNOCaselessStat(filename, 1, &stat, &lastComponent);
+	error = __SYNOCaselessStat(filename, 1, &stat, &lastComponent, 0);
 	if (!error) {
 		error = cp_new_stat(&stat, statbuf);
 	}
@@ -747,15 +831,15 @@
 	return error;
 }
 
-static int do_SYNOStat(char __user * filename, int isLink, int f, struct SYNOSTAT __user * pSt, struct SYNOSTAT64 __user * pSt64)
+static int do_SYNOStat(char __user * filename, int nofollowLink, int flags, struct SYNOSTAT __user * pSt, struct SYNOSTAT64 __user * pSt64)
 {
 	long error = -EINVAL;
 	int lastComponent = 0;
 	struct kstat kst;
 
-	if (f & SYNOST_IS_CASELESS) {
+	if (flags & SYNOST_IS_CASELESS) {
 #ifdef MY_ABC_HERE
-		error = __SYNOCaselessStat(filename, isLink, &kst, &lastComponent);
+		error = __SYNOCaselessStat(filename, nofollowLink, &kst, &lastComponent, flags);
 		if (-ENOENT == error) {
 			if (pSt) {
 				if (__put_user(lastComponent, &pSt->ext.lastComponent)){
@@ -771,10 +855,10 @@
 		error = -EOPNOTSUPP;
 #endif
 	} else {
-		if (isLink) {
-			error = vfs_lstat(filename, &kst);
+		if (nofollowLink) {
+			error = syno_vfs_stat(filename, &kst, 0, flags);
 		} else {
-			error = vfs_stat(filename, &kst);
+			error = syno_vfs_stat(filename, &kst, LOOKUP_FOLLOW, flags);
 #ifdef MY_ABC_HERE
 			if(syno_hibernation_log_sec > 0) {
 				syno_do_hibernation_log(filename);
@@ -787,7 +871,7 @@
 		goto Out;
 	}
 
-	error = SYNOStatCopyToUser(&kst, f, pSt, pSt64);
+	error = SYNOStatCopyToUser(&kst, flags, pSt, pSt64);
 Out:
 	return error;
 }
@@ -797,7 +881,7 @@
 	int error;
 	struct kstat kst;
 
-	error = vfs_fstat(fd, &kst);
+	error = syno_vfs_fstat(fd, &kst, flags);
 	if (!error) {
 		error = SYNOStatCopyToUser(&kst, flags, pSt, pSt64);
 	}
@@ -840,7 +924,7 @@
 	long error = -1;
 	struct kstat stat;
 
-	error = __SYNOCaselessStat(filename, 0, &stat, &lastComponent);
+	error = __SYNOCaselessStat(filename, 0, &stat, &lastComponent, 0);
 	if (!error) {
 		error = cp_new_stat64(&stat, statbuf);
 	}
@@ -854,7 +938,7 @@
 	long error = -1;
 	struct kstat stat;
 
-	error = __SYNOCaselessStat(filename, 1, &stat, &lastComponent);
+	error = __SYNOCaselessStat(filename, 1, &stat, &lastComponent, 0);
 	if (!error) {
 		error = cp_new_stat64(&stat, statbuf);
 	}
diff -ur a/fs/super.c b/fs/super.c
--- a/fs/super.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/super.c	2014-02-17 11:57:00.000000000 +0100
@@ -32,16 +32,18 @@
 #include <linux/backing-dev.h>
 #include <linux/rculist_bl.h>
 #include <linux/cleancache.h>
+#include <linux/lockdep.h>
 #include "internal.h"
 
 
 LIST_HEAD(super_blocks);
 DEFINE_SPINLOCK(sb_lock);
 
-#ifdef MY_ABC_HERE
-spinlock_t Namei_buf_lock_1;  /* lock for UTF16NameiStrBuf1[] in fs/namei.c */
-static int lock_1_init = 0;
-#endif
+static char *sb_writers_name[SB_FREEZE_LEVELS] = {
+	"sb_writers",
+	"sb_pagefaults",
+	"sb_internal",
+};
 
 /*
  * One thing we have to be careful of with a per-sb shrinker is that we don't
@@ -106,6 +108,35 @@
 	return total_objects;
 }
 
+static int init_sb_writers(struct super_block *s, struct file_system_type *type)
+{
+	int err;
+	int i;
+
+	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
+		err = percpu_counter_init(&s->s_writers.counter[i], 0);
+		if (err < 0)
+			goto err_out;
+		lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
+				 &type->s_writers_key[i], 0);
+	}
+	init_waitqueue_head(&s->s_writers.wait);
+	init_waitqueue_head(&s->s_writers.wait_unfrozen);
+	return 0;
+err_out:
+	while (--i >= 0)
+		percpu_counter_destroy(&s->s_writers.counter[i]);
+	return err;
+}
+
+static void destroy_sb_writers(struct super_block *s)
+{
+	int i;
+
+	for (i = 0; i < SB_FREEZE_LEVELS; i++)
+		percpu_counter_destroy(&s->s_writers.counter[i]);
+}
+
 /**
  *	alloc_super	-	create new superblock
  *	@type:	filesystem type superblock should belong to
@@ -120,18 +151,19 @@
 
 	if (s) {
 		if (security_sb_alloc(s)) {
+			/*
+			 * We cannot call security_sb_free() without
+			 * security_sb_alloc() succeeding. So bail out manually
+			 */
 			kfree(s);
 			s = NULL;
 			goto out;
 		}
 #ifdef CONFIG_SMP
 		s->s_files = alloc_percpu(struct list_head);
-		if (!s->s_files) {
-			security_sb_free(s);
-			kfree(s);
-			s = NULL;
-			goto out;
-		} else {
+		if (!s->s_files)
+			goto err_out;
+		else {
 			int i;
 
 			for_each_possible_cpu(i)
@@ -140,6 +172,8 @@
 #else
 		INIT_LIST_HEAD(&s->s_files);
 #endif
+		if (init_sb_writers(s, type))
+			goto err_out;
 		s->s_bdi = &default_backing_dev_info;
 		INIT_LIST_HEAD(&s->s_instances);
 		INIT_HLIST_BL_HEAD(&s->s_anon);
@@ -179,7 +213,6 @@
 		mutex_init(&s->s_dquot.dqio_mutex);
 		mutex_init(&s->s_dquot.dqonoff_mutex);
 		init_rwsem(&s->s_dquot.dqptr_sem);
-		init_waitqueue_head(&s->s_wait_unfrozen);
 		s->s_maxbytes = MAX_NON_LFS;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
@@ -187,13 +220,6 @@
 		mutex_init(&s->s_archive_mutex);
 		s->s_archive_version = 0;
 #endif
-#ifdef MY_ABC_HERE
-		if (!lock_1_init) {
-			spin_lock_init(&Namei_buf_lock_1);
-			lock_1_init=1;
-		}
-#endif
-
 		s->cleancache_poolid = -1;
 
 		s->s_shrink.seeks = DEFAULT_SEEKS;
@@ -202,6 +228,16 @@
 	}
 out:
 	return s;
+err_out:
+	security_sb_free(s);
+#ifdef CONFIG_SMP
+	if (s->s_files)
+		free_percpu(s->s_files);
+#endif
+	destroy_sb_writers(s);
+	kfree(s);
+	s = NULL;
+	goto out;
 }
 
 /**
@@ -215,6 +251,7 @@
 #ifdef CONFIG_SMP
 	free_percpu(s->s_files);
 #endif
+	destroy_sb_writers(s);
 	security_sb_free(s);
 	kfree(s->s_subtype);
 	kfree(s->s_options);
@@ -248,41 +285,6 @@
 	spin_unlock(&sb_lock);
 }
 
-#ifdef MY_ABC_HERE
-/** 
- * Modified from deactivate_locked_super() 
- *  
- *  deactivate_read_locked_super	-	drop an active reference
- *  to superblock
- *	@s: superblock to deactivate
- *
- *	This is a read-lock variation of deactivate_locked_super
- */
-void deactivate_read_locked_super(struct super_block *s)
-{
-	struct file_system_type *fs = s->s_type;
-	if (atomic_dec_and_test(&s->s_active)) {
-		cleancache_flush_fs(s);
-		down_write(&s->s_umount);
-		fs->kill_sb(s);
-
-		/* caches are now gone, we can safely kill the shrinker now */
-		unregister_shrinker(&s->s_shrink);
-
-		/*
-		 * We need to call rcu_barrier so all the delayed rcu free
-		 * inodes are flushed before we release the fs module.
-		 */
-		rcu_barrier();
-		put_filesystem(fs);
-		put_super(s);
-	} else {
-		up_read(&s->s_umount);
-	}
-}
-EXPORT_SYMBOL(deactivate_read_locked_super);
-#endif
-
 /**
  *	deactivate_locked_super	-	drop an active reference to superblock
  *	@s: superblock to deactivate
@@ -303,12 +305,6 @@
 
 		/* caches are now gone, we can safely kill the shrinker now */
 		unregister_shrinker(&s->s_shrink);
-
-		/*
-		 * We need to call rcu_barrier so all the delayed rcu free
-		 * inodes are flushed before we release the fs module.
-		 */
-		rcu_barrier();
 		put_filesystem(fs);
 		put_super(s);
 	} else {
@@ -680,6 +676,29 @@
 EXPORT_SYMBOL(get_super);
 
 /**
+ *	get_super_thawed - get thawed superblock of a device
+ *	@bdev: device to get the superblock for
+ *
+ *	Scans the superblock list and finds the superblock of the file system
+ *	mounted on the device. The superblock is returned once it is thawed
+ *	(or immediately if it was not frozen). %NULL is returned if no match
+ *	is found.
+ */
+struct super_block *get_super_thawed(struct block_device *bdev)
+{
+	while (1) {
+		struct super_block *s = get_super(bdev);
+		if (!s || s->s_writers.frozen == SB_UNFROZEN)
+			return s;
+		up_read(&s->s_umount);
+		wait_event(s->s_writers.wait_unfrozen,
+			   s->s_writers.frozen == SB_UNFROZEN);
+		put_super(s);
+	}
+}
+EXPORT_SYMBOL(get_super_thawed);
+
+/**
  * get_active_super - get an active reference to the superblock of a device
  * @bdev: device to get the superblock for
  *
@@ -751,7 +770,7 @@
 	int retval;
 	int remount_ro;
 
-	if (sb->s_frozen != SB_UNFROZEN)
+	if (sb->s_writers.frozen != SB_UNFROZEN)
 		return -EBUSY;
 
 #ifdef CONFIG_BLOCK
@@ -1175,6 +1194,120 @@
 	return ERR_PTR(error);
 }
 
+/*
+ * This is an internal function, please use sb_end_{write,pagefault,intwrite}
+ * instead.
+ */
+void __sb_end_write(struct super_block *sb, int level)
+{
+	percpu_counter_dec(&sb->s_writers.counter[level-1]);
+	/*
+	 * Make sure s_writers are updated before we wake up waiters in
+	 * freeze_super().
+	 */
+	smp_mb();
+	if (waitqueue_active(&sb->s_writers.wait))
+		wake_up(&sb->s_writers.wait);
+	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_);
+}
+EXPORT_SYMBOL(__sb_end_write);
+
+#ifdef CONFIG_LOCKDEP
+/*
+ * We want lockdep to tell us about possible deadlocks with freezing but
+ * it's it bit tricky to properly instrument it. Getting a freeze protection
+ * works as getting a read lock but there are subtle problems. XFS for example
+ * gets freeze protection on internal level twice in some cases, which is OK
+ * only because we already hold a freeze protection also on higher level. Due
+ * to these cases we have to tell lockdep we are doing trylock when we
+ * already hold a freeze protection for a higher freeze level.
+ */
+static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock,
+				unsigned long ip)
+{
+	int i;
+
+	if (!trylock) {
+		for (i = 0; i < level - 1; i++)
+			if (lock_is_held(&sb->s_writers.lock_map[i])) {
+				trylock = true;
+				break;
+			}
+	}
+	rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip);
+}
+#endif
+
+/*
+ * This is an internal function, please use sb_start_{write,pagefault,intwrite}
+ * instead.
+ */
+int __sb_start_write(struct super_block *sb, int level, bool wait)
+{
+retry:
+	if (unlikely(sb->s_writers.frozen >= level)) {
+		if (!wait)
+			return 0;
+		wait_event(sb->s_writers.wait_unfrozen,
+			   sb->s_writers.frozen < level);
+	}
+
+#ifdef CONFIG_LOCKDEP
+	acquire_freeze_lock(sb, level, !wait, _RET_IP_);
+#endif
+	percpu_counter_inc(&sb->s_writers.counter[level-1]);
+	/*
+	 * Make sure counter is updated before we check for frozen.
+	 * freeze_super() first sets frozen and then checks the counter.
+	 */
+	smp_mb();
+	if (unlikely(sb->s_writers.frozen >= level)) {
+		__sb_end_write(sb, level);
+		goto retry;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(__sb_start_write);
+
+/**
+ * sb_wait_write - wait until all writers to given file system finish
+ * @sb: the super for which we wait
+ * @level: type of writers we wait for (normal vs page fault)
+ *
+ * This function waits until there are no writers of given type to given file
+ * system. Caller of this function should make sure there can be no new writers
+ * of type @level before calling this function. Otherwise this function can
+ * livelock.
+ */
+static void sb_wait_write(struct super_block *sb, int level)
+{
+	s64 writers;
+
+	/*
+	 * We just cycle-through lockdep here so that it does not complain
+	 * about returning with lock to userspace
+	 */
+	rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_);
+	rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_);
+
+	do {
+		DEFINE_WAIT(wait);
+
+		/*
+		 * We use a barrier in prepare_to_wait() to separate setting
+		 * of frozen and checking of the counter
+		 */
+		prepare_to_wait(&sb->s_writers.wait, &wait,
+				TASK_UNINTERRUPTIBLE);
+
+		writers = percpu_counter_sum(&sb->s_writers.counter[level-1]);
+		if (writers)
+			schedule();
+
+		finish_wait(&sb->s_writers.wait, &wait);
+	} while (writers);
+}
+
 /**
  * freeze_super - lock the filesystem and force it into a consistent state
  * @sb: the super to lock
@@ -1182,52 +1315,91 @@
  * Syncs the super to make sure the filesystem is consistent and calls the fs's
  * freeze_fs.  Subsequent calls to this without first thawing the fs will return
  * -EBUSY.
+ *
+ * During this function, sb->s_writers.frozen goes through these values:
+ *
+ * SB_UNFROZEN: File system is normal, all writes progress as usual.
+ *
+ * SB_FREEZE_WRITE: The file system is in the process of being frozen.  New
+ * writes should be blocked, though page faults are still allowed. We wait for
+ * all writes to complete and then proceed to the next stage.
+ *
+ * SB_FREEZE_PAGEFAULT: Freezing continues. Now also page faults are blocked
+ * but internal fs threads can still modify the filesystem (although they
+ * should not dirty new pages or inodes), writeback can run etc. After waiting
+ * for all running page faults we sync the filesystem which will clean all
+ * dirty pages and inodes (no new dirty pages or inodes can be created when
+ * sync is running).
+ *
+ * SB_FREEZE_FS: The file system is frozen. Now all internal sources of fs
+ * modification are blocked (e.g. XFS preallocation truncation on inode
+ * reclaim). This is usually implemented by blocking new transactions for
+ * filesystems that have them and need this additional guard. After all
+ * internal writers are finished we call ->freeze_fs() to finish filesystem
+ * freezing. Then we transition to SB_FREEZE_COMPLETE state. This state is
+ * mostly auxiliary for filesystems to verify they do not modify frozen fs.
+ *
+ * sb->s_writers.frozen is protected by sb->s_umount.
  */
 int freeze_super(struct super_block *sb)
 {
 	int ret;
-#ifdef MY_ABC_HERE
-extern int sync_wait_fs_sync(struct super_block *sb);
-#endif
 
 	atomic_inc(&sb->s_active);
 	down_write(&sb->s_umount);
-	if (sb->s_frozen) {
+	if (sb->s_writers.frozen != SB_UNFROZEN) {
 		deactivate_locked_super(sb);
 		return -EBUSY;
 	}
 
 	if (sb->s_flags & MS_RDONLY) {
-		sb->s_frozen = SB_FREEZE_TRANS;
-		smp_wmb();
+		/* Nothing to do really... */
+		sb->s_writers.frozen = SB_FREEZE_COMPLETE;
 		up_write(&sb->s_umount);
 		return 0;
 	}
 
-	sb->s_frozen = SB_FREEZE_WRITE;
+	/* From now on, no new normal writers can start */
+	sb->s_writers.frozen = SB_FREEZE_WRITE;
+	smp_wmb();
+
+	/* Release s_umount to preserve sb_start_write -> s_umount ordering */
+	up_write(&sb->s_umount);
+
+	sb_wait_write(sb, SB_FREEZE_WRITE);
+
+	/* Now we go and block page faults... */
+	down_write(&sb->s_umount);
+	sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
 	smp_wmb();
 
+	sb_wait_write(sb, SB_FREEZE_PAGEFAULT);
+
+	/* All writers are done so after syncing there won't be dirty data */
 	sync_filesystem(sb);
-#ifdef MY_ABC_HERE
-	sync_wait_fs_sync(sb);
-#endif
 
-	sb->s_frozen = SB_FREEZE_TRANS;
+	/* Now wait for internal filesystem counter */
+	sb->s_writers.frozen = SB_FREEZE_FS;
 	smp_wmb();
+	sb_wait_write(sb, SB_FREEZE_FS);
 
-	sync_blockdev(sb->s_bdev);
 	if (sb->s_op->freeze_fs) {
 		ret = sb->s_op->freeze_fs(sb);
 		if (ret) {
 			printk(KERN_ERR
 				"VFS:Filesystem freeze failed\n");
-			sb->s_frozen = SB_UNFROZEN;
+			sb->s_writers.frozen = SB_UNFROZEN;
 			smp_wmb();
-			wake_up(&sb->s_wait_unfrozen);
+			wake_up(&sb->s_writers.wait_unfrozen);
 			deactivate_locked_super(sb);
 			return ret;
 		}
 	}
+	/*
+	 * This is just for debugging purposes so that fs can warn if it
+	 * sees write activity when frozen is set to SB_FREEZE_COMPLETE.
+	 */
+	sb->s_writers.frozen = SB_FREEZE_COMPLETE;
 	up_write(&sb->s_umount);
 	return 0;
 }
@@ -1243,12 +1415,8 @@
 {
 	int error;
 
-#ifdef MY_ABC_HERE
-	down_read(&sb->s_umount);
-#else
 	down_write(&sb->s_umount);
-#endif
-	if (sb->s_frozen == SB_UNFROZEN) {
+	if (sb->s_writers.frozen == SB_UNFROZEN) {
 		up_write(&sb->s_umount);
 		return -EINVAL;
 	}
@@ -1261,21 +1429,16 @@
 		if (error) {
 			printk(KERN_ERR
 				"VFS:Filesystem thaw failed\n");
-			sb->s_frozen = SB_FREEZE_TRANS;
 			up_write(&sb->s_umount);
 			return error;
 		}
 	}
 
 out:
-	sb->s_frozen = SB_UNFROZEN;
+	sb->s_writers.frozen = SB_UNFROZEN;
 	smp_wmb();
-	wake_up(&sb->s_wait_unfrozen);
-#ifdef MY_ABC_HERE
-	deactivate_read_locked_super(sb);
-#else
+	wake_up(&sb->s_writers.wait_unfrozen);
 	deactivate_locked_super(sb);
-#endif
 
 	return 0;
 }
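
fs/super.c now counts writers per freeze level (SB_FREEZE_WRITE, SB_FREEZE_PAGEFAULT, SB_FREEZE_FS) and freeze_super() drains them one level at a time via sb_wait_write(). The sb_start_write()/sb_end_write(), sb_start_pagefault() and sb_start_intwrite() pairs added to ocfs2, open.c and splice.c above are the callers of this machinery. A minimal sketch of a write path using the lowest level; example_do_write() is hypothetical:

static ssize_t example_do_write(struct file *, const char __user *,
				size_t, loff_t *);	/* hypothetical */

static ssize_t example_write(struct file *file, const char __user *buf,
			     size_t len, loff_t *ppos)
{
	struct inode *inode = file->f_mapping->host;
	ssize_t ret;

	sb_start_write(inode->i_sb);	/* sleeps while frozen >= SB_FREEZE_WRITE */
	ret = example_do_write(file, buf, len, ppos);
	sb_end_write(inode->i_sb);	/* may wake a waiting freeze_super() */

	return ret;
}
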
diff -ur a/fs/synoacl_api.c b/fs/synoacl_api.c
--- a/fs/synoacl_api.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/synoacl_api.c	2014-02-17 11:56:57.000000000 +0100
@@ -26,6 +26,46 @@
 struct synoacl_vfs_operations *VFS_MODULE = NULL;
 struct synoacl_syscall_operations *SYSCALL_MODULE = NULL;
 
+int SYNOACLModuleStatusGet(const char *szModName)
+{
+	int st = -1;
+	struct module *mod = NULL;
+
+	mutex_lock(&module_mutex);
+
+	if (NULL == (mod = find_module(szModName))){
+		goto Err;
+	}
+
+	st = mod->state;
+Err:
+	mutex_unlock(&module_mutex);
+
+	return st;
+}
+EXPORT_SYMBOL(SYNOACLModuleStatusGet);
+
+void UseACLModule(const char *szModName, int isGet)
+{
+	struct module *mod = NULL;
+
+	mutex_lock(&module_mutex);
+
+	if (NULL == (mod = find_module(szModName))){
+		printk("synoacl module [%s] is not loaded \n", szModName);
+		goto Err;
+	}
+
+	if (isGet) {
+		try_module_get(mod);
+	} else {
+		module_put(mod);
+	}
+Err:
+	mutex_unlock(&module_mutex);
+}
+EXPORT_SYMBOL(UseACLModule);
+
 /* --------------- Register Function ---------------- */
 int synoacl_vfs_register(struct synoacl_vfs_operations *pvfs, struct synoacl_syscall_operations *psys)
 {
@@ -48,13 +88,6 @@
 EXPORT_SYMBOL(synoacl_vfs_unregister);
 
 /* --------------- VFS API ---------------- */
-void synoacl_mod_release(struct syno_acl *acl)
-{
-	if (IS_VFS_ACL_READY(syno_acl_release)) {
-		DO_VFS(syno_acl_release, acl);
-	}
-}
-
 int synoacl_mod_archive_change_ok(struct dentry *d, unsigned int cmd, int tag, int mask)
 {
 	if (IS_VFS_ACL_READY(archive_change_ok)) {
@@ -62,6 +95,7 @@
 	}
 	return 0; //is settable
 }
+EXPORT_SYMBOL(synoacl_mod_archive_change_ok);
 
 int synoacl_mod_may_delete(struct dentry *d, struct inode *dir)
 {
@@ -72,6 +106,76 @@
 }
 EXPORT_SYMBOL(synoacl_mod_may_delete);
 
+int synoacl_mod_setattr_post(struct dentry *dentry, struct iattr *attr)
+{
+	if (IS_VFS_ACL_READY(syno_acl_setattr_post)) {
+		return DO_VFS(syno_acl_setattr_post, dentry, attr);
+	}
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(synoacl_mod_setattr_post);
+
+int synoacl_mod_inode_change_ok(struct dentry *d, struct iattr *attr)
+{
+	if (IS_VFS_ACL_READY(syno_inode_change_ok)) {
+		return DO_VFS(syno_inode_change_ok, d, attr);
+	}
+	return inode_change_ok(d->d_inode, attr);
+}
+EXPORT_SYMBOL(synoacl_mod_inode_change_ok);
+
+void synoacl_mod_to_mode(struct dentry *d, struct kstat *stat)
+{
+	if (IS_VFS_ACL_READY(syno_acl_to_mode)) {
+		DO_VFS(syno_acl_to_mode, d, stat);
+	}
+}
+EXPORT_SYMBOL(synoacl_mod_to_mode);
+
+int synoacl_mod_access(struct dentry *d, int mask)
+{
+	if (IS_VFS_ACL_READY(syno_acl_access)) {
+		return DO_VFS(syno_acl_access, d, mask);
+	}
+	return inode_permission(d->d_inode, mask);
+}
+EXPORT_SYMBOL(synoacl_mod_access);
+
+int synoacl_mod_exec_permission(struct dentry *d)
+{
+	if (IS_VFS_ACL_READY(syno_acl_exec_permission)) {
+		return DO_VFS(syno_acl_exec_permission, d);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(synoacl_mod_exec_permission);
+
+int synoacl_mod_permission(struct dentry *d, int mask)
+{
+	if (IS_VFS_ACL_READY(syno_acl_permission)) {
+		return DO_VFS(syno_acl_permission, d, mask);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(synoacl_mod_permission);
+
+int synoacl_mod_get_acl_xattr(struct dentry *d, int cmd, void *value, size_t size)
+{
+	if (IS_VFS_ACL_READY(syno_acl_xattr_get)) {
+		return DO_VFS(syno_acl_xattr_get, d, cmd, value, size);
+	}
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(synoacl_mod_get_acl_xattr);
+
+int synoacl_mod_init_acl(struct dentry *dentry, struct inode *inode)
+{
+	if (IS_VFS_ACL_READY(syno_acl_init)) {
+		return DO_VFS(syno_acl_init, dentry, inode);
+	}
+	return -EOPNOTSUPP;
+}
+EXPORT_SYMBOL(synoacl_mod_init_acl);
 /* --------------- System Call API ---------------- */
 asmlinkage long sys_SYNOACLIsSupport(const char *szPath, int fd, int tag)
 {
Only in b/fs: syno_acl.c.
diff -ur a/fs/synoacl_int.h b/fs/synoacl_int.h
--- a/fs/synoacl_int.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/synoacl_int.h	2014-02-17 11:56:57.000000000 +0100
@@ -11,6 +11,11 @@
 #define NEED_INODE_ACL_SUPPORT 0x0004
 #define NEED_FS_ACL_SUPPORT 0x0008
 
+#define SYNOACL_XATTR_CHGNAME(name) \
+	if (!strcmp(name, SYNO_ACL_XATTR_ACCESS)) { \
+		name = SYNO_ACL_XATTR_ACCESS_NOPERM; \
+	}
+
 struct synoacl_syscall_operations {
 	int (*get_perm) (const char *szPath, int __user *pOutPerm);
 	int (*is_acl_support) (const char *szPath, int fd, int tag);
@@ -18,14 +23,157 @@
 };
 
 struct synoacl_vfs_operations {
-	void (*syno_acl_release) (struct syno_acl *acl);
 	int (*archive_change_ok) (struct dentry *d, unsigned int cmd, int tag, int mask);
-	int (*check_perm) (const char *szPath, int mask);
 	int (*syno_acl_may_delete) (struct dentry *, struct inode *, int);
+	int (*syno_acl_setattr_post) (struct dentry *dentry, struct iattr *);
+	int (*syno_inode_change_ok) (struct dentry *d, struct iattr *attr);
+	int (*syno_acl_access) (struct dentry *d, int mask);
+	void (*syno_acl_to_mode) (struct dentry *d, struct kstat *stat);
+	int (*syno_acl_exec_permission) (struct dentry *d);
+	int (*syno_acl_permission)(struct dentry *d, int mask);
+	int (*syno_acl_xattr_get) (struct dentry *d, int cmd, void *value, size_t size);
+	int (*syno_acl_init) (struct dentry *d, struct inode *inode);
 };
 
-void synoacl_mod_release(struct syno_acl *);
 int synoacl_mod_archive_change_ok(struct dentry *, unsigned int , int , int );
 int synoacl_mod_may_delete(struct dentry *, struct inode *);
+int synoacl_mod_setattr_post(struct dentry *, struct iattr *);
+int synoacl_mod_init_acl(struct dentry *, struct inode *);
+int synoacl_mod_inode_change_ok(struct dentry *, struct iattr *);
+int synoacl_mod_access(struct dentry *, int);
+void synoacl_mod_to_mode(struct dentry *, struct kstat *);
+int synoacl_mod_exec_permission(struct dentry *);
+int synoacl_mod_permission(struct dentry *, int);
+int synoacl_mod_get_acl_xattr(struct dentry *, int, void *, size_t);
+
+/**  Inode Operation of SYNOACL **/
+static inline int synoacl_op_perm(struct dentry * dentry, int perm)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (inode->i_op->syno_permission) {
+		return inode->i_op->syno_permission(dentry, perm);
+	}
+	/*printk(KERN_ERR "(%s/%d/%s) file:[%s], cur_uid: [%u], perm: [%d], error: [%d] \n", __FILE__, __LINE__, __FUNCTION__, dentry->d_iname, current_fsuid(), perm, synoacl_mod_permission(dentry, perm));*/
+	return synoacl_mod_permission(dentry, perm);
+}
+
+static inline int synoacl_op_exec_perm(struct dentry * dentry, struct inode * inode)
+{
+	if (inode->i_op->syno_exec_permission) {
+		return inode->i_op->syno_exec_permission(dentry);
+	}
+	/*printk(KERN_ERR "(%s/%d/%s) file:[%s], cur_uid: [%u], perm: [exec], error: [%d] \n", __FILE__, __LINE__, __FUNCTION__, dentry->d_iname, current_fsuid(), synoacl_mod_exec_permission(dentry));*/
+	return synoacl_mod_exec_permission(dentry);
+}
+
+static inline int synoacl_op_access(struct dentry * dentry, int mode)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (inode->i_op->syno_acl_access) {
+		return inode->i_op->syno_acl_access(dentry, mode);
+	}
+	return synoacl_mod_access(dentry, mode);
+}
+
+static inline void synoacl_op_to_mode(struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (inode->i_op->syno_acl_to_mode) {
+		inode->i_op->syno_acl_to_mode(dentry, stat);
+	} else {
+		synoacl_mod_to_mode(dentry, stat);
+	}
+}
+
+static inline int synoacl_op_xattr_get(struct dentry * dentry, int cmd, void *value, size_t size)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (inode->i_op->syno_acl_xattr_get) {
+		return inode->i_op->syno_acl_xattr_get(dentry, cmd, value, size);
+	}
+
+	return synoacl_mod_get_acl_xattr(dentry, cmd, value, size);
+}
+
+static inline int synoacl_op_inode_chg_ok(struct dentry * dentry, struct iattr * attr)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (inode->i_op->syno_inode_change_ok) {
+		return inode->i_op->syno_inode_change_ok(dentry, attr);
+	}
+
+	return synoacl_mod_inode_change_ok(dentry, attr);
+}
+
+static inline int synoacl_op_arbit_chg_ok(struct dentry *d, unsigned int cmd, int tag, int mask)
+{
+	struct inode *inode = d->d_inode;
+
+	if (inode->i_op->syno_arbit_chg_ok) {
+		return inode->i_op->syno_arbit_chg_ok(d, cmd, tag, mask);
+	}
+
+	return synoacl_mod_archive_change_ok(d, cmd, tag, mask);
+}
+
+static inline void synoacl_op_setattr_post(struct dentry * dentry, struct iattr * attr)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (inode->i_op->syno_setattr_post) {
+		inode->i_op->syno_setattr_post(dentry, attr);
+	} else {
+		synoacl_mod_setattr_post(dentry, attr);
+	}
+}
+
+static inline void synoacl_op_init(struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+
+	if (inode->i_op->syno_acl_init) {
+		inode->i_op->syno_acl_init(dentry, inode);
+	} else {
+		synoacl_mod_init_acl(dentry, inode);
+	}
+}
+
+static inline int synoacl_check_xattr_perm(const char *name, struct dentry *dentry, unsigned int perm) 
+{
+	int error = 0;
+
+	if (!name || strcmp(name, SYNO_ACL_XATTR_ACCESS)) { 
+		return 0; // skip xattr except ACL.
+	}
+
+	switch (perm) {
+	case MAY_READ_PERMISSION:
+		if (!IS_SYNOACL(dentry)) {
+			//printk(KERN_ERR "(%s/%d/%s) gfs:[%d] name: [%s], error: acl bit not on (fs:%d) \n", __FILE__, __LINE__, __FUNCTION__, IS_GLUSTER_FS(dentry->d_inode), name, IS_FS_SYNOACL(dentry->d_inode)?1:0); 
+			return -EOPNOTSUPP;
+		}
+		break;
+	case MAY_WRITE_PERMISSION:
+		if (!IS_FS_SYNOACL(dentry->d_inode)) {
+			return -EOPNOTSUPP;
+		}
+		break;
+	default: //invalid parameters, just skip it.
+		return 0;
+	}
+
+	error = synoacl_op_perm(dentry, perm);
+	if (error) {
+		//printk(KERN_ERR "(%s/%d/%s) gfs:[%d] name: [%s], error: perm err )\n", __FILE__, __LINE__, __FUNCTION__, IS_GLUSTER_FS(dentry->d_inode), name); 
+		return error;
+	}
+
+	return 0;
+}
 
 #endif  /* __LINUX_SYNOACL_INT_H */
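
The synoacl_op_* inline helpers above dispatch to the filesystem's own syno_* inode operation when one is provided and otherwise fall back to the loadable ACL module through synoacl_mod_*. Callers such as the fs/open.c and fs/stat.c hunks use them behind an IS_SYNOACL() check; a sketch of that caller-side pattern, with a hypothetical example_check_write():

static int example_check_write(struct dentry *dentry)
{
#ifdef CONFIG_FS_SYNO_ACL
	if (IS_SYNOACL(dentry))
		return synoacl_op_perm(dentry, MAY_WRITE);
#endif
	return inode_permission(dentry->d_inode, MAY_WRITE);
}
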
diff -ur a/fs/sysv/inode.c b/fs/sysv/inode.c
--- a/fs/sysv/inode.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/sysv/inode.c	2014-02-17 11:56:59.000000000 +0100
@@ -376,5 +376,10 @@
 
 void sysv_destroy_icache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(sysv_inode_cachep);
 }
diff -ur a/fs/ubifs/budget.c b/fs/ubifs/budget.c
--- a/fs/ubifs/budget.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/budget.c	2014-02-17 11:56:58.000000000 +0100
@@ -342,9 +342,8 @@
 	lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt -
 	       c->lst.taken_empty_lebs;
 	if (unlikely(rsvd_idx_lebs > lebs)) {
-		dbg_budg("out of indexing space: min_idx_lebs %d (old %d), "
-			 "rsvd_idx_lebs %d", min_idx_lebs, c->bi.min_idx_lebs,
-			 rsvd_idx_lebs);
+		dbg_budg("out of indexing space: min_idx_lebs %d (old %d), rsvd_idx_lebs %d",
+			 min_idx_lebs, c->bi.min_idx_lebs, rsvd_idx_lebs);
 		return -ENOSPC;
 	}
 
diff -ur a/fs/ubifs/commit.c b/fs/ubifs/commit.c
--- a/fs/ubifs/commit.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/commit.c	2014-02-17 11:56:58.000000000 +0100
@@ -293,8 +293,8 @@
 	int err;
 	struct ubifs_info *c = info;
 
-	dbg_msg("background thread \"%s\" started, PID %d",
-		c->bgt_name, current->pid);
+	ubifs_msg("background thread \"%s\" started, PID %d",
+		  c->bgt_name, current->pid);
 	set_freezable();
 
 	while (1) {
@@ -328,7 +328,7 @@
 		cond_resched();
 	}
 
-	dbg_msg("background thread \"%s\" stops", c->bgt_name);
+	ubifs_msg("background thread \"%s\" stops", c->bgt_name);
 	return 0;
 }
 
@@ -496,7 +496,9 @@
 	return ret;
 }
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
+/*
+ * Everything below is related to debugging.
+ */
 
 /**
  * struct idx_node - hold index nodes during index tree traversal.
@@ -512,7 +514,7 @@
 	struct list_head list;
 	int iip;
 	union ubifs_key upper_key;
-	struct ubifs_idx_node idx __attribute__((aligned(8)));
+	struct ubifs_idx_node idx __aligned(8);
 };
 
 /**
@@ -714,14 +716,14 @@
 	return 0;
 
 out_dump:
-	dbg_err("dumping index node (iip=%d)", i->iip);
-	dbg_dump_node(c, idx);
+	ubifs_err("dumping index node (iip=%d)", i->iip);
+	ubifs_dump_node(c, idx);
 	list_del(&i->list);
 	kfree(i);
 	if (!list_empty(&list)) {
 		i = list_entry(list.prev, struct idx_node, list);
-		dbg_err("dumping parent index node");
-		dbg_dump_node(c, &i->idx);
+		ubifs_err("dumping parent index node");
+		ubifs_dump_node(c, &i->idx);
 	}
 out_free:
 	while (!list_empty(&list)) {
@@ -734,5 +736,3 @@
 		err = -EINVAL;
 	return err;
 }
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff -ur a/fs/ubifs/compress.c b/fs/ubifs/compress.c
--- a/fs/ubifs/compress.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/compress.c	2014-02-17 11:56:58.000000000 +0100
@@ -112,8 +112,7 @@
 	if (compr->comp_mutex)
 		mutex_unlock(compr->comp_mutex);
 	if (unlikely(err)) {
-		ubifs_warn("cannot compress %d bytes, compressor %s, "
-			   "error %d, leave data uncompressed",
+		ubifs_warn("cannot compress %d bytes, compressor %s, error %d, leave data uncompressed",
 			   in_len, compr->name, err);
 		 goto no_compr;
 	}
@@ -176,8 +175,8 @@
 	if (compr->decomp_mutex)
 		mutex_unlock(compr->decomp_mutex);
 	if (err)
-		ubifs_err("cannot decompress %d bytes, compressor %s, "
-			  "error %d", in_len, compr->name, err);
+		ubifs_err("cannot decompress %d bytes, compressor %s, error %d",
+			  in_len, compr->name, err);
 
 	return err;
 }
diff -ur a/fs/ubifs/debug.c b/fs/ubifs/debug.c
--- a/fs/ubifs/debug.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/debug.c	2014-02-17 11:56:58.000000000 +0100
@@ -34,12 +34,7 @@
 #include <linux/random.h>
 #include "ubifs.h"
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
-
-DEFINE_SPINLOCK(dbg_lock);
-
-static char dbg_key_buf0[128];
-static char dbg_key_buf1[128];
+static DEFINE_SPINLOCK(dbg_lock);
 
 static const char *get_key_fmt(int fmt)
 {
@@ -103,8 +98,8 @@
 	}
 }
 
-static void sprintf_key(const struct ubifs_info *c, const union ubifs_key *key,
-			char *buffer)
+const char *dbg_snprintf_key(const struct ubifs_info *c,
+			     const union ubifs_key *key, char *buffer, int len)
 {
 	char *p = buffer;
 	int type = key_type(c, key);
@@ -112,45 +107,34 @@
 	if (c->key_fmt == UBIFS_SIMPLE_KEY_FMT) {
 		switch (type) {
 		case UBIFS_INO_KEY:
-			sprintf(p, "(%lu, %s)", (unsigned long)key_inum(c, key),
-			       get_key_type(type));
+			len -= snprintf(p, len, "(%lu, %s)",
+					(unsigned long)key_inum(c, key),
+					get_key_type(type));
 			break;
 		case UBIFS_DENT_KEY:
 		case UBIFS_XENT_KEY:
-			sprintf(p, "(%lu, %s, %#08x)",
-				(unsigned long)key_inum(c, key),
-				get_key_type(type), key_hash(c, key));
+			len -= snprintf(p, len, "(%lu, %s, %#08x)",
+					(unsigned long)key_inum(c, key),
+					get_key_type(type), key_hash(c, key));
 			break;
 		case UBIFS_DATA_KEY:
-			sprintf(p, "(%lu, %s, %u)",
-				(unsigned long)key_inum(c, key),
-				get_key_type(type), key_block(c, key));
+			len -= snprintf(p, len, "(%lu, %s, %u)",
+					(unsigned long)key_inum(c, key),
+					get_key_type(type), key_block(c, key));
 			break;
 		case UBIFS_TRUN_KEY:
-			sprintf(p, "(%lu, %s)",
-				(unsigned long)key_inum(c, key),
-				get_key_type(type));
+			len -= snprintf(p, len, "(%lu, %s)",
+					(unsigned long)key_inum(c, key),
+					get_key_type(type));
 			break;
 		default:
-			sprintf(p, "(bad key type: %#08x, %#08x)",
-				key->u32[0], key->u32[1]);
+			len -= snprintf(p, len, "(bad key type: %#08x, %#08x)",
+					key->u32[0], key->u32[1]);
 		}
 	} else
-		sprintf(p, "bad key format %d", c->key_fmt);
-}
-
-const char *dbg_key_str0(const struct ubifs_info *c, const union ubifs_key *key)
-{
-	/* dbg_lock must be held */
-	sprintf_key(c, key, dbg_key_buf0);
-	return dbg_key_buf0;
-}
-
-const char *dbg_key_str1(const struct ubifs_info *c, const union ubifs_key *key)
-{
-	/* dbg_lock must be held */
-	sprintf_key(c, key, dbg_key_buf1);
-	return dbg_key_buf1;
+		len -= snprintf(p, len, "bad key format %d", c->key_fmt);
+	ubifs_assert(len > 0);
+	return p;
 }
 
 const char *dbg_ntype(int type)
@@ -235,18 +219,18 @@
 
 static void dump_ch(const struct ubifs_ch *ch)
 {
-	printk(KERN_DEBUG "\tmagic          %#x\n", le32_to_cpu(ch->magic));
-	printk(KERN_DEBUG "\tcrc            %#x\n", le32_to_cpu(ch->crc));
-	printk(KERN_DEBUG "\tnode_type      %d (%s)\n", ch->node_type,
+	pr_err("\tmagic          %#x\n", le32_to_cpu(ch->magic));
+	pr_err("\tcrc            %#x\n", le32_to_cpu(ch->crc));
+	pr_err("\tnode_type      %d (%s)\n", ch->node_type,
 	       dbg_ntype(ch->node_type));
-	printk(KERN_DEBUG "\tgroup_type     %d (%s)\n", ch->group_type,
+	pr_err("\tgroup_type     %d (%s)\n", ch->group_type,
 	       dbg_gtype(ch->group_type));
-	printk(KERN_DEBUG "\tsqnum          %llu\n",
+	pr_err("\tsqnum          %llu\n",
 	       (unsigned long long)le64_to_cpu(ch->sqnum));
-	printk(KERN_DEBUG "\tlen            %u\n", le32_to_cpu(ch->len));
+	pr_err("\tlen            %u\n", le32_to_cpu(ch->len));
 }
 
-void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode)
+void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode)
 {
 	const struct ubifs_inode *ui = ubifs_inode(inode);
 	struct qstr nm = { .name = NULL };
@@ -254,43 +238,43 @@
 	struct ubifs_dent_node *dent, *pdent = NULL;
 	int count = 2;
 
-	printk(KERN_DEBUG "Dump in-memory inode:");
-	printk(KERN_DEBUG "\tinode          %lu\n", inode->i_ino);
-	printk(KERN_DEBUG "\tsize           %llu\n",
+	pr_err("Dump in-memory inode:");
+	pr_err("\tinode          %lu\n", inode->i_ino);
+	pr_err("\tsize           %llu\n",
 	       (unsigned long long)i_size_read(inode));
-	printk(KERN_DEBUG "\tnlink          %u\n", inode->i_nlink);
-	printk(KERN_DEBUG "\tuid            %u\n", (unsigned int)inode->i_uid);
-	printk(KERN_DEBUG "\tgid            %u\n", (unsigned int)inode->i_gid);
-	printk(KERN_DEBUG "\tatime          %u.%u\n",
+	pr_err("\tnlink          %u\n", inode->i_nlink);
+	pr_err("\tuid            %u\n", (unsigned int)inode->i_uid);
+	pr_err("\tgid            %u\n", (unsigned int)inode->i_gid);
+	pr_err("\tatime          %u.%u\n",
 	       (unsigned int)inode->i_atime.tv_sec,
 	       (unsigned int)inode->i_atime.tv_nsec);
-	printk(KERN_DEBUG "\tmtime          %u.%u\n",
+	pr_err("\tmtime          %u.%u\n",
 	       (unsigned int)inode->i_mtime.tv_sec,
 	       (unsigned int)inode->i_mtime.tv_nsec);
-	printk(KERN_DEBUG "\tctime          %u.%u\n",
+	pr_err("\tctime          %u.%u\n",
 	       (unsigned int)inode->i_ctime.tv_sec,
 	       (unsigned int)inode->i_ctime.tv_nsec);
-	printk(KERN_DEBUG "\tcreat_sqnum    %llu\n", ui->creat_sqnum);
-	printk(KERN_DEBUG "\txattr_size     %u\n", ui->xattr_size);
-	printk(KERN_DEBUG "\txattr_cnt      %u\n", ui->xattr_cnt);
-	printk(KERN_DEBUG "\txattr_names    %u\n", ui->xattr_names);
-	printk(KERN_DEBUG "\tdirty          %u\n", ui->dirty);
-	printk(KERN_DEBUG "\txattr          %u\n", ui->xattr);
-	printk(KERN_DEBUG "\tbulk_read      %u\n", ui->xattr);
-	printk(KERN_DEBUG "\tsynced_i_size  %llu\n",
+	pr_err("\tcreat_sqnum    %llu\n", ui->creat_sqnum);
+	pr_err("\txattr_size     %u\n", ui->xattr_size);
+	pr_err("\txattr_cnt      %u\n", ui->xattr_cnt);
+	pr_err("\txattr_names    %u\n", ui->xattr_names);
+	pr_err("\tdirty          %u\n", ui->dirty);
+	pr_err("\txattr          %u\n", ui->xattr);
+	pr_err("\tbulk_read      %u\n", ui->xattr);
+	pr_err("\tsynced_i_size  %llu\n",
 	       (unsigned long long)ui->synced_i_size);
-	printk(KERN_DEBUG "\tui_size        %llu\n",
+	pr_err("\tui_size        %llu\n",
 	       (unsigned long long)ui->ui_size);
-	printk(KERN_DEBUG "\tflags          %d\n", ui->flags);
-	printk(KERN_DEBUG "\tcompr_type     %d\n", ui->compr_type);
-	printk(KERN_DEBUG "\tlast_page_read %lu\n", ui->last_page_read);
-	printk(KERN_DEBUG "\tread_in_a_row  %lu\n", ui->read_in_a_row);
-	printk(KERN_DEBUG "\tdata_len       %d\n", ui->data_len);
+	pr_err("\tflags          %d\n", ui->flags);
+	pr_err("\tcompr_type     %d\n", ui->compr_type);
+	pr_err("\tlast_page_read %lu\n", ui->last_page_read);
+	pr_err("\tread_in_a_row  %lu\n", ui->read_in_a_row);
+	pr_err("\tdata_len       %d\n", ui->data_len);
 
 	if (!S_ISDIR(inode->i_mode))
 		return;
 
-	printk(KERN_DEBUG "List of directory entries:\n");
+	pr_err("List of directory entries:\n");
 	ubifs_assert(!mutex_is_locked(&c->tnc_mutex));
 
 	lowest_dent_key(c, &key, inode->i_ino);
@@ -298,11 +282,11 @@
 		dent = ubifs_tnc_next_ent(c, &key, &nm);
 		if (IS_ERR(dent)) {
 			if (PTR_ERR(dent) != -ENOENT)
-				printk(KERN_DEBUG "error %ld\n", PTR_ERR(dent));
+				pr_err("error %ld\n", PTR_ERR(dent));
 			break;
 		}
 
-		printk(KERN_DEBUG "\t%d: %s (%s)\n",
+		pr_err("\t%d: %s (%s)\n",
 		       count++, dent->name, get_dent_type(dent->type));
 
 		nm.name = dent->name;
@@ -314,19 +298,17 @@
 	kfree(pdent);
 }
 
-void dbg_dump_node(const struct ubifs_info *c, const void *node)
+void ubifs_dump_node(const struct ubifs_info *c, const void *node)
 {
 	int i, n;
 	union ubifs_key key;
 	const struct ubifs_ch *ch = node;
-
-	if (dbg_is_tst_rcvry(c))
-		return;
+	char key_buf[DBG_KEY_BUF_LEN];
 
 	/* If the magic is incorrect, just hexdump the first bytes */
 	if (le32_to_cpu(ch->magic) != UBIFS_NODE_MAGIC) {
-		printk(KERN_DEBUG "Not a node, first %zu bytes:", UBIFS_CH_SZ);
-		print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 1,
+		pr_err("Not a node, first %zu bytes:", UBIFS_CH_SZ);
+		print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 32, 1,
 			       (void *)node, UBIFS_CH_SZ, 1);
 		return;
 	}
@@ -339,8 +321,7 @@
 	{
 		const struct ubifs_pad_node *pad = node;
 
-		printk(KERN_DEBUG "\tpad_len        %u\n",
-		       le32_to_cpu(pad->pad_len));
+		pr_err("\tpad_len        %u\n", le32_to_cpu(pad->pad_len));
 		break;
 	}
 	case UBIFS_SB_NODE:
@@ -348,112 +329,77 @@
 		const struct ubifs_sb_node *sup = node;
 		unsigned int sup_flags = le32_to_cpu(sup->flags);
 
-		printk(KERN_DEBUG "\tkey_hash       %d (%s)\n",
+		pr_err("\tkey_hash       %d (%s)\n",
 		       (int)sup->key_hash, get_key_hash(sup->key_hash));
-		printk(KERN_DEBUG "\tkey_fmt        %d (%s)\n",
+		pr_err("\tkey_fmt        %d (%s)\n",
 		       (int)sup->key_fmt, get_key_fmt(sup->key_fmt));
-		printk(KERN_DEBUG "\tflags          %#x\n", sup_flags);
-		printk(KERN_DEBUG "\t  big_lpt      %u\n",
+		pr_err("\tflags          %#x\n", sup_flags);
+		pr_err("\t  big_lpt      %u\n",
 		       !!(sup_flags & UBIFS_FLG_BIGLPT));
-		printk(KERN_DEBUG "\t  space_fixup  %u\n",
+		pr_err("\t  space_fixup  %u\n",
 		       !!(sup_flags & UBIFS_FLG_SPACE_FIXUP));
-		printk(KERN_DEBUG "\tmin_io_size    %u\n",
-		       le32_to_cpu(sup->min_io_size));
-		printk(KERN_DEBUG "\tleb_size       %u\n",
-		       le32_to_cpu(sup->leb_size));
-		printk(KERN_DEBUG "\tleb_cnt        %u\n",
-		       le32_to_cpu(sup->leb_cnt));
-		printk(KERN_DEBUG "\tmax_leb_cnt    %u\n",
-		       le32_to_cpu(sup->max_leb_cnt));
-		printk(KERN_DEBUG "\tmax_bud_bytes  %llu\n",
+		pr_err("\tmin_io_size    %u\n", le32_to_cpu(sup->min_io_size));
+		pr_err("\tleb_size       %u\n", le32_to_cpu(sup->leb_size));
+		pr_err("\tleb_cnt        %u\n", le32_to_cpu(sup->leb_cnt));
+		pr_err("\tmax_leb_cnt    %u\n", le32_to_cpu(sup->max_leb_cnt));
+		pr_err("\tmax_bud_bytes  %llu\n",
 		       (unsigned long long)le64_to_cpu(sup->max_bud_bytes));
-		printk(KERN_DEBUG "\tlog_lebs       %u\n",
-		       le32_to_cpu(sup->log_lebs));
-		printk(KERN_DEBUG "\tlpt_lebs       %u\n",
-		       le32_to_cpu(sup->lpt_lebs));
-		printk(KERN_DEBUG "\torph_lebs      %u\n",
-		       le32_to_cpu(sup->orph_lebs));
-		printk(KERN_DEBUG "\tjhead_cnt      %u\n",
-		       le32_to_cpu(sup->jhead_cnt));
-		printk(KERN_DEBUG "\tfanout         %u\n",
-		       le32_to_cpu(sup->fanout));
-		printk(KERN_DEBUG "\tlsave_cnt      %u\n",
-		       le32_to_cpu(sup->lsave_cnt));
-		printk(KERN_DEBUG "\tdefault_compr  %u\n",
+		pr_err("\tlog_lebs       %u\n", le32_to_cpu(sup->log_lebs));
+		pr_err("\tlpt_lebs       %u\n", le32_to_cpu(sup->lpt_lebs));
+		pr_err("\torph_lebs      %u\n", le32_to_cpu(sup->orph_lebs));
+		pr_err("\tjhead_cnt      %u\n", le32_to_cpu(sup->jhead_cnt));
+		pr_err("\tfanout         %u\n", le32_to_cpu(sup->fanout));
+		pr_err("\tlsave_cnt      %u\n", le32_to_cpu(sup->lsave_cnt));
+		pr_err("\tdefault_compr  %u\n",
 		       (int)le16_to_cpu(sup->default_compr));
-		printk(KERN_DEBUG "\trp_size        %llu\n",
+		pr_err("\trp_size        %llu\n",
 		       (unsigned long long)le64_to_cpu(sup->rp_size));
-		printk(KERN_DEBUG "\trp_uid         %u\n",
-		       le32_to_cpu(sup->rp_uid));
-		printk(KERN_DEBUG "\trp_gid         %u\n",
-		       le32_to_cpu(sup->rp_gid));
-		printk(KERN_DEBUG "\tfmt_version    %u\n",
-		       le32_to_cpu(sup->fmt_version));
-		printk(KERN_DEBUG "\ttime_gran      %u\n",
-		       le32_to_cpu(sup->time_gran));
-		printk(KERN_DEBUG "\tUUID           %pUB\n",
-		       sup->uuid);
+		pr_err("\trp_uid         %u\n", le32_to_cpu(sup->rp_uid));
+		pr_err("\trp_gid         %u\n", le32_to_cpu(sup->rp_gid));
+		pr_err("\tfmt_version    %u\n", le32_to_cpu(sup->fmt_version));
+		pr_err("\ttime_gran      %u\n", le32_to_cpu(sup->time_gran));
+		pr_err("\tUUID           %pUB\n", sup->uuid);
 		break;
 	}
 	case UBIFS_MST_NODE:
 	{
 		const struct ubifs_mst_node *mst = node;
 
-		printk(KERN_DEBUG "\thighest_inum   %llu\n",
+		pr_err("\thighest_inum   %llu\n",
 		       (unsigned long long)le64_to_cpu(mst->highest_inum));
-		printk(KERN_DEBUG "\tcommit number  %llu\n",
+		pr_err("\tcommit number  %llu\n",
 		       (unsigned long long)le64_to_cpu(mst->cmt_no));
-		printk(KERN_DEBUG "\tflags          %#x\n",
-		       le32_to_cpu(mst->flags));
-		printk(KERN_DEBUG "\tlog_lnum       %u\n",
-		       le32_to_cpu(mst->log_lnum));
-		printk(KERN_DEBUG "\troot_lnum      %u\n",
-		       le32_to_cpu(mst->root_lnum));
-		printk(KERN_DEBUG "\troot_offs      %u\n",
-		       le32_to_cpu(mst->root_offs));
-		printk(KERN_DEBUG "\troot_len       %u\n",
-		       le32_to_cpu(mst->root_len));
-		printk(KERN_DEBUG "\tgc_lnum        %u\n",
-		       le32_to_cpu(mst->gc_lnum));
-		printk(KERN_DEBUG "\tihead_lnum     %u\n",
-		       le32_to_cpu(mst->ihead_lnum));
-		printk(KERN_DEBUG "\tihead_offs     %u\n",
-		       le32_to_cpu(mst->ihead_offs));
-		printk(KERN_DEBUG "\tindex_size     %llu\n",
+		pr_err("\tflags          %#x\n", le32_to_cpu(mst->flags));
+		pr_err("\tlog_lnum       %u\n", le32_to_cpu(mst->log_lnum));
+		pr_err("\troot_lnum      %u\n", le32_to_cpu(mst->root_lnum));
+		pr_err("\troot_offs      %u\n", le32_to_cpu(mst->root_offs));
+		pr_err("\troot_len       %u\n", le32_to_cpu(mst->root_len));
+		pr_err("\tgc_lnum        %u\n", le32_to_cpu(mst->gc_lnum));
+		pr_err("\tihead_lnum     %u\n", le32_to_cpu(mst->ihead_lnum));
+		pr_err("\tihead_offs     %u\n", le32_to_cpu(mst->ihead_offs));
+		pr_err("\tindex_size     %llu\n",
 		       (unsigned long long)le64_to_cpu(mst->index_size));
-		printk(KERN_DEBUG "\tlpt_lnum       %u\n",
-		       le32_to_cpu(mst->lpt_lnum));
-		printk(KERN_DEBUG "\tlpt_offs       %u\n",
-		       le32_to_cpu(mst->lpt_offs));
-		printk(KERN_DEBUG "\tnhead_lnum     %u\n",
-		       le32_to_cpu(mst->nhead_lnum));
-		printk(KERN_DEBUG "\tnhead_offs     %u\n",
-		       le32_to_cpu(mst->nhead_offs));
-		printk(KERN_DEBUG "\tltab_lnum      %u\n",
-		       le32_to_cpu(mst->ltab_lnum));
-		printk(KERN_DEBUG "\tltab_offs      %u\n",
-		       le32_to_cpu(mst->ltab_offs));
-		printk(KERN_DEBUG "\tlsave_lnum     %u\n",
-		       le32_to_cpu(mst->lsave_lnum));
-		printk(KERN_DEBUG "\tlsave_offs     %u\n",
-		       le32_to_cpu(mst->lsave_offs));
-		printk(KERN_DEBUG "\tlscan_lnum     %u\n",
-		       le32_to_cpu(mst->lscan_lnum));
-		printk(KERN_DEBUG "\tleb_cnt        %u\n",
-		       le32_to_cpu(mst->leb_cnt));
-		printk(KERN_DEBUG "\tempty_lebs     %u\n",
-		       le32_to_cpu(mst->empty_lebs));
-		printk(KERN_DEBUG "\tidx_lebs       %u\n",
-		       le32_to_cpu(mst->idx_lebs));
-		printk(KERN_DEBUG "\ttotal_free     %llu\n",
+		pr_err("\tlpt_lnum       %u\n", le32_to_cpu(mst->lpt_lnum));
+		pr_err("\tlpt_offs       %u\n", le32_to_cpu(mst->lpt_offs));
+		pr_err("\tnhead_lnum     %u\n", le32_to_cpu(mst->nhead_lnum));
+		pr_err("\tnhead_offs     %u\n", le32_to_cpu(mst->nhead_offs));
+		pr_err("\tltab_lnum      %u\n", le32_to_cpu(mst->ltab_lnum));
+		pr_err("\tltab_offs      %u\n", le32_to_cpu(mst->ltab_offs));
+		pr_err("\tlsave_lnum     %u\n", le32_to_cpu(mst->lsave_lnum));
+		pr_err("\tlsave_offs     %u\n", le32_to_cpu(mst->lsave_offs));
+		pr_err("\tlscan_lnum     %u\n", le32_to_cpu(mst->lscan_lnum));
+		pr_err("\tleb_cnt        %u\n", le32_to_cpu(mst->leb_cnt));
+		pr_err("\tempty_lebs     %u\n", le32_to_cpu(mst->empty_lebs));
+		pr_err("\tidx_lebs       %u\n", le32_to_cpu(mst->idx_lebs));
+		pr_err("\ttotal_free     %llu\n",
 		       (unsigned long long)le64_to_cpu(mst->total_free));
-		printk(KERN_DEBUG "\ttotal_dirty    %llu\n",
+		pr_err("\ttotal_dirty    %llu\n",
 		       (unsigned long long)le64_to_cpu(mst->total_dirty));
-		printk(KERN_DEBUG "\ttotal_used     %llu\n",
+		pr_err("\ttotal_used     %llu\n",
 		       (unsigned long long)le64_to_cpu(mst->total_used));
-		printk(KERN_DEBUG "\ttotal_dead     %llu\n",
+		pr_err("\ttotal_dead     %llu\n",
 		       (unsigned long long)le64_to_cpu(mst->total_dead));
-		printk(KERN_DEBUG "\ttotal_dark     %llu\n",
+		pr_err("\ttotal_dark     %llu\n",
 		       (unsigned long long)le64_to_cpu(mst->total_dark));
 		break;
 	}
@@ -461,12 +407,9 @@
 	{
 		const struct ubifs_ref_node *ref = node;
 
-		printk(KERN_DEBUG "\tlnum           %u\n",
-		       le32_to_cpu(ref->lnum));
-		printk(KERN_DEBUG "\toffs           %u\n",
-		       le32_to_cpu(ref->offs));
-		printk(KERN_DEBUG "\tjhead          %u\n",
-		       le32_to_cpu(ref->jhead));
+		pr_err("\tlnum           %u\n", le32_to_cpu(ref->lnum));
+		pr_err("\toffs           %u\n", le32_to_cpu(ref->offs));
+		pr_err("\tjhead          %u\n", le32_to_cpu(ref->jhead));
 		break;
 	}
 	case UBIFS_INO_NODE:
@@ -474,40 +417,32 @@
 		const struct ubifs_ino_node *ino = node;
 
 		key_read(c, &ino->key, &key);
-		printk(KERN_DEBUG "\tkey            %s\n", DBGKEY(&key));
-		printk(KERN_DEBUG "\tcreat_sqnum    %llu\n",
+		pr_err("\tkey            %s\n",
+		       dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
+		pr_err("\tcreat_sqnum    %llu\n",
 		       (unsigned long long)le64_to_cpu(ino->creat_sqnum));
-		printk(KERN_DEBUG "\tsize           %llu\n",
+		pr_err("\tsize           %llu\n",
 		       (unsigned long long)le64_to_cpu(ino->size));
-		printk(KERN_DEBUG "\tnlink          %u\n",
-		       le32_to_cpu(ino->nlink));
-		printk(KERN_DEBUG "\tatime          %lld.%u\n",
+		pr_err("\tnlink          %u\n", le32_to_cpu(ino->nlink));
+		pr_err("\tatime          %lld.%u\n",
 		       (long long)le64_to_cpu(ino->atime_sec),
 		       le32_to_cpu(ino->atime_nsec));
-		printk(KERN_DEBUG "\tmtime          %lld.%u\n",
+		pr_err("\tmtime          %lld.%u\n",
 		       (long long)le64_to_cpu(ino->mtime_sec),
 		       le32_to_cpu(ino->mtime_nsec));
-		printk(KERN_DEBUG "\tctime          %lld.%u\n",
+		pr_err("\tctime          %lld.%u\n",
 		       (long long)le64_to_cpu(ino->ctime_sec),
 		       le32_to_cpu(ino->ctime_nsec));
-		printk(KERN_DEBUG "\tuid            %u\n",
-		       le32_to_cpu(ino->uid));
-		printk(KERN_DEBUG "\tgid            %u\n",
-		       le32_to_cpu(ino->gid));
-		printk(KERN_DEBUG "\tmode           %u\n",
-		       le32_to_cpu(ino->mode));
-		printk(KERN_DEBUG "\tflags          %#x\n",
-		       le32_to_cpu(ino->flags));
-		printk(KERN_DEBUG "\txattr_cnt      %u\n",
-		       le32_to_cpu(ino->xattr_cnt));
-		printk(KERN_DEBUG "\txattr_size     %u\n",
-		       le32_to_cpu(ino->xattr_size));
-		printk(KERN_DEBUG "\txattr_names    %u\n",
-		       le32_to_cpu(ino->xattr_names));
-		printk(KERN_DEBUG "\tcompr_type     %#x\n",
+		pr_err("\tuid            %u\n", le32_to_cpu(ino->uid));
+		pr_err("\tgid            %u\n", le32_to_cpu(ino->gid));
+		pr_err("\tmode           %u\n", le32_to_cpu(ino->mode));
+		pr_err("\tflags          %#x\n", le32_to_cpu(ino->flags));
+		pr_err("\txattr_cnt      %u\n", le32_to_cpu(ino->xattr_cnt));
+		pr_err("\txattr_size     %u\n", le32_to_cpu(ino->xattr_size));
+		pr_err("\txattr_names    %u\n", le32_to_cpu(ino->xattr_names));
+		pr_err("\tcompr_type     %#x\n",
 		       (int)le16_to_cpu(ino->compr_type));
-		printk(KERN_DEBUG "\tdata len       %u\n",
-		       le32_to_cpu(ino->data_len));
+		pr_err("\tdata len       %u\n", le32_to_cpu(ino->data_len));
 		break;
 	}
 	case UBIFS_DENT_NODE:
@@ -517,21 +452,21 @@
 		int nlen = le16_to_cpu(dent->nlen);
 
 		key_read(c, &dent->key, &key);
-		printk(KERN_DEBUG "\tkey            %s\n", DBGKEY(&key));
-		printk(KERN_DEBUG "\tinum           %llu\n",
+		pr_err("\tkey            %s\n",
+		       dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
+		pr_err("\tinum           %llu\n",
 		       (unsigned long long)le64_to_cpu(dent->inum));
-		printk(KERN_DEBUG "\ttype           %d\n", (int)dent->type);
-		printk(KERN_DEBUG "\tnlen           %d\n", nlen);
-		printk(KERN_DEBUG "\tname           ");
+		pr_err("\ttype           %d\n", (int)dent->type);
+		pr_err("\tnlen           %d\n", nlen);
+		pr_err("\tname           ");
 
 		if (nlen > UBIFS_MAX_NLEN)
-			printk(KERN_DEBUG "(bad name length, not printing, "
-					  "bad or corrupted node)");
+			pr_err("(bad name length, not printing, bad or corrupted node)");
 		else {
 			for (i = 0; i < nlen && dent->name[i]; i++)
-				printk(KERN_CONT "%c", dent->name[i]);
+				pr_cont("%c", dent->name[i]);
 		}
-		printk(KERN_CONT "\n");
+		pr_cont("\n");
 
 		break;
 	}
@@ -541,15 +476,14 @@
 		int dlen = le32_to_cpu(ch->len) - UBIFS_DATA_NODE_SZ;
 
 		key_read(c, &dn->key, &key);
-		printk(KERN_DEBUG "\tkey            %s\n", DBGKEY(&key));
-		printk(KERN_DEBUG "\tsize           %u\n",
-		       le32_to_cpu(dn->size));
-		printk(KERN_DEBUG "\tcompr_typ      %d\n",
+		pr_err("\tkey            %s\n",
+		       dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
+		pr_err("\tsize           %u\n", le32_to_cpu(dn->size));
+		pr_err("\tcompr_typ      %d\n",
 		       (int)le16_to_cpu(dn->compr_type));
-		printk(KERN_DEBUG "\tdata size      %d\n",
-		       dlen);
-		printk(KERN_DEBUG "\tdata:\n");
-		print_hex_dump(KERN_DEBUG, "\t", DUMP_PREFIX_OFFSET, 32, 1,
+		pr_err("\tdata size      %d\n", dlen);
+		pr_err("\tdata:\n");
+		print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1,
 			       (void *)&dn->data, dlen, 0);
 		break;
 	}
@@ -557,11 +491,10 @@
 	{
 		const struct ubifs_trun_node *trun = node;
 
-		printk(KERN_DEBUG "\tinum           %u\n",
-		       le32_to_cpu(trun->inum));
-		printk(KERN_DEBUG "\told_size       %llu\n",
+		pr_err("\tinum           %u\n", le32_to_cpu(trun->inum));
+		pr_err("\told_size       %llu\n",
 		       (unsigned long long)le64_to_cpu(trun->old_size));
-		printk(KERN_DEBUG "\tnew_size       %llu\n",
+		pr_err("\tnew_size       %llu\n",
 		       (unsigned long long)le64_to_cpu(trun->new_size));
 		break;
 	}
@@ -570,19 +503,20 @@
 		const struct ubifs_idx_node *idx = node;
 
 		n = le16_to_cpu(idx->child_cnt);
-		printk(KERN_DEBUG "\tchild_cnt      %d\n", n);
-		printk(KERN_DEBUG "\tlevel          %d\n",
-		       (int)le16_to_cpu(idx->level));
-		printk(KERN_DEBUG "\tBranches:\n");
+		pr_err("\tchild_cnt      %d\n", n);
+		pr_err("\tlevel          %d\n", (int)le16_to_cpu(idx->level));
+		pr_err("\tBranches:\n");
 
 		for (i = 0; i < n && i < c->fanout - 1; i++) {
 			const struct ubifs_branch *br;
 
 			br = ubifs_idx_branch(c, idx, i);
 			key_read(c, &br->key, &key);
-			printk(KERN_DEBUG "\t%d: LEB %d:%d len %d key %s\n",
+			pr_err("\t%d: LEB %d:%d len %d key %s\n",
 			       i, le32_to_cpu(br->lnum), le32_to_cpu(br->offs),
-			       le32_to_cpu(br->len), DBGKEY(&key));
+			       le32_to_cpu(br->len),
+			       dbg_snprintf_key(c, &key, key_buf,
+						DBG_KEY_BUF_LEN));
 		}
 		break;
 	}
@@ -592,57 +526,55 @@
 	{
 		const struct ubifs_orph_node *orph = node;
 
-		printk(KERN_DEBUG "\tcommit number  %llu\n",
+		pr_err("\tcommit number  %llu\n",
 		       (unsigned long long)
 				le64_to_cpu(orph->cmt_no) & LLONG_MAX);
-		printk(KERN_DEBUG "\tlast node flag %llu\n",
+		pr_err("\tlast node flag %llu\n",
 		       (unsigned long long)(le64_to_cpu(orph->cmt_no)) >> 63);
 		n = (le32_to_cpu(ch->len) - UBIFS_ORPH_NODE_SZ) >> 3;
-		printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n);
+		pr_err("\t%d orphan inode numbers:\n", n);
 		for (i = 0; i < n; i++)
-			printk(KERN_DEBUG "\t  ino %llu\n",
+			pr_err("\t  ino %llu\n",
 			       (unsigned long long)le64_to_cpu(orph->inos[i]));
 		break;
 	}
 	default:
-		printk(KERN_DEBUG "node type %d was not recognized\n",
+		pr_err("node type %d was not recognized\n",
 		       (int)ch->node_type);
 	}
 	spin_unlock(&dbg_lock);
 }
 
-void dbg_dump_budget_req(const struct ubifs_budget_req *req)
+void ubifs_dump_budget_req(const struct ubifs_budget_req *req)
 {
 	spin_lock(&dbg_lock);
-	printk(KERN_DEBUG "Budgeting request: new_ino %d, dirtied_ino %d\n",
+	pr_err("Budgeting request: new_ino %d, dirtied_ino %d\n",
 	       req->new_ino, req->dirtied_ino);
-	printk(KERN_DEBUG "\tnew_ino_d   %d, dirtied_ino_d %d\n",
+	pr_err("\tnew_ino_d   %d, dirtied_ino_d %d\n",
 	       req->new_ino_d, req->dirtied_ino_d);
-	printk(KERN_DEBUG "\tnew_page    %d, dirtied_page %d\n",
+	pr_err("\tnew_page    %d, dirtied_page %d\n",
 	       req->new_page, req->dirtied_page);
-	printk(KERN_DEBUG "\tnew_dent    %d, mod_dent     %d\n",
+	pr_err("\tnew_dent    %d, mod_dent     %d\n",
 	       req->new_dent, req->mod_dent);
-	printk(KERN_DEBUG "\tidx_growth  %d\n", req->idx_growth);
-	printk(KERN_DEBUG "\tdata_growth %d dd_growth     %d\n",
+	pr_err("\tidx_growth  %d\n", req->idx_growth);
+	pr_err("\tdata_growth %d dd_growth     %d\n",
 	       req->data_growth, req->dd_growth);
 	spin_unlock(&dbg_lock);
 }
 
-void dbg_dump_lstats(const struct ubifs_lp_stats *lst)
+void ubifs_dump_lstats(const struct ubifs_lp_stats *lst)
 {
 	spin_lock(&dbg_lock);
-	printk(KERN_DEBUG "(pid %d) Lprops statistics: empty_lebs %d, "
-	       "idx_lebs  %d\n", current->pid, lst->empty_lebs, lst->idx_lebs);
-	printk(KERN_DEBUG "\ttaken_empty_lebs %d, total_free %lld, "
-	       "total_dirty %lld\n", lst->taken_empty_lebs, lst->total_free,
-	       lst->total_dirty);
-	printk(KERN_DEBUG "\ttotal_used %lld, total_dark %lld, "
-	       "total_dead %lld\n", lst->total_used, lst->total_dark,
-	       lst->total_dead);
+	pr_err("(pid %d) Lprops statistics: empty_lebs %d, idx_lebs  %d\n",
+	       current->pid, lst->empty_lebs, lst->idx_lebs);
+	pr_err("\ttaken_empty_lebs %d, total_free %lld, total_dirty %lld\n",
+	       lst->taken_empty_lebs, lst->total_free, lst->total_dirty);
+	pr_err("\ttotal_used %lld, total_dark %lld, total_dead %lld\n",
+	       lst->total_used, lst->total_dark, lst->total_dead);
 	spin_unlock(&dbg_lock);
 }
 
-void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
+void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi)
 {
 	int i;
 	struct rb_node *rb;
@@ -652,21 +584,17 @@
 
 	spin_lock(&c->space_lock);
 	spin_lock(&dbg_lock);
-	printk(KERN_DEBUG "(pid %d) Budgeting info: data budget sum %lld, "
-	       "total budget sum %lld\n", current->pid,
-	       bi->data_growth + bi->dd_growth,
+	pr_err("(pid %d) Budgeting info: data budget sum %lld, total budget sum %lld\n",
+	       current->pid, bi->data_growth + bi->dd_growth,
 	       bi->data_growth + bi->dd_growth + bi->idx_growth);
-	printk(KERN_DEBUG "\tbudg_data_growth %lld, budg_dd_growth %lld, "
-	       "budg_idx_growth %lld\n", bi->data_growth, bi->dd_growth,
-	       bi->idx_growth);
-	printk(KERN_DEBUG "\tmin_idx_lebs %d, old_idx_sz %llu, "
-	       "uncommitted_idx %lld\n", bi->min_idx_lebs, bi->old_idx_sz,
-	       bi->uncommitted_idx);
-	printk(KERN_DEBUG "\tpage_budget %d, inode_budget %d, dent_budget %d\n",
+	pr_err("\tbudg_data_growth %lld, budg_dd_growth %lld, budg_idx_growth %lld\n",
+	       bi->data_growth, bi->dd_growth, bi->idx_growth);
+	pr_err("\tmin_idx_lebs %d, old_idx_sz %llu, uncommitted_idx %lld\n",
+	       bi->min_idx_lebs, bi->old_idx_sz, bi->uncommitted_idx);
+	pr_err("\tpage_budget %d, inode_budget %d, dent_budget %d\n",
 	       bi->page_budget, bi->inode_budget, bi->dent_budget);
-	printk(KERN_DEBUG "\tnospace %u, nospace_rp %u\n",
-	       bi->nospace, bi->nospace_rp);
-	printk(KERN_DEBUG "\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
+	pr_err("\tnospace %u, nospace_rp %u\n", bi->nospace, bi->nospace_rp);
+	pr_err("\tdark_wm %d, dead_wm %d, max_idx_node_sz %d\n",
 	       c->dark_wm, c->dead_wm, c->max_idx_node_sz);
 
 	if (bi != &c->bi)
@@ -677,45 +605,44 @@
 		 */
 		goto out_unlock;
 
-	printk(KERN_DEBUG "\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
+	pr_err("\tfreeable_cnt %d, calc_idx_sz %lld, idx_gc_cnt %d\n",
 	       c->freeable_cnt, c->calc_idx_sz, c->idx_gc_cnt);
-	printk(KERN_DEBUG "\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, "
-	       "clean_zn_cnt %ld\n", atomic_long_read(&c->dirty_pg_cnt),
+	pr_err("\tdirty_pg_cnt %ld, dirty_zn_cnt %ld, clean_zn_cnt %ld\n",
+	       atomic_long_read(&c->dirty_pg_cnt),
 	       atomic_long_read(&c->dirty_zn_cnt),
 	       atomic_long_read(&c->clean_zn_cnt));
-	printk(KERN_DEBUG "\tgc_lnum %d, ihead_lnum %d\n",
-	       c->gc_lnum, c->ihead_lnum);
+	pr_err("\tgc_lnum %d, ihead_lnum %d\n", c->gc_lnum, c->ihead_lnum);
 
 	/* If we are in R/O mode, journal heads do not exist */
 	if (c->jheads)
 		for (i = 0; i < c->jhead_cnt; i++)
-			printk(KERN_DEBUG "\tjhead %s\t LEB %d\n",
+			pr_err("\tjhead %s\t LEB %d\n",
 			       dbg_jhead(c->jheads[i].wbuf.jhead),
 			       c->jheads[i].wbuf.lnum);
 	for (rb = rb_first(&c->buds); rb; rb = rb_next(rb)) {
 		bud = rb_entry(rb, struct ubifs_bud, rb);
-		printk(KERN_DEBUG "\tbud LEB %d\n", bud->lnum);
+		pr_err("\tbud LEB %d\n", bud->lnum);
 	}
 	list_for_each_entry(bud, &c->old_buds, list)
-		printk(KERN_DEBUG "\told bud LEB %d\n", bud->lnum);
+		pr_err("\told bud LEB %d\n", bud->lnum);
 	list_for_each_entry(idx_gc, &c->idx_gc, list)
-		printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
+		pr_err("\tGC'ed idx LEB %d unmap %d\n",
 		       idx_gc->lnum, idx_gc->unmap);
-	printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
+	pr_err("\tcommit state %d\n", c->cmt_state);
 
 	/* Print budgeting predictions */
 	available = ubifs_calc_available(c, c->bi.min_idx_lebs);
 	outstanding = c->bi.data_growth + c->bi.dd_growth;
 	free = ubifs_get_free_space_nolock(c);
-	printk(KERN_DEBUG "Budgeting predictions:\n");
-	printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
+	pr_err("Budgeting predictions:\n");
+	pr_err("\tavailable: %lld, outstanding %lld, free %lld\n",
 	       available, outstanding, free);
 out_unlock:
 	spin_unlock(&dbg_lock);
 	spin_unlock(&c->space_lock);
 }
 
-void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
+void ubifs_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp)
 {
 	int i, spc, dark = 0, dead = 0;
 	struct rb_node *rb;
@@ -728,21 +655,19 @@
 		dark = ubifs_calc_dark(c, spc);
 
 	if (lp->flags & LPROPS_INDEX)
-		printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
-		       "free + dirty %-8d flags %#x (", lp->lnum, lp->free,
-		       lp->dirty, c->leb_size - spc, spc, lp->flags);
+		pr_err("LEB %-7d free %-8d dirty %-8d used %-8d free + dirty %-8d flags %#x (",
+		       lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc,
+		       lp->flags);
 	else
-		printk(KERN_DEBUG "LEB %-7d free %-8d dirty %-8d used %-8d "
-		       "free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d "
-		       "flags %#-4x (", lp->lnum, lp->free, lp->dirty,
-		       c->leb_size - spc, spc, dark, dead,
-		       (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags);
+		pr_err("LEB %-7d free %-8d dirty %-8d used %-8d free + dirty %-8d dark %-4d dead %-4d nodes fit %-3d flags %#-4x (",
+		       lp->lnum, lp->free, lp->dirty, c->leb_size - spc, spc,
+		       dark, dead, (int)(spc / UBIFS_MAX_NODE_SZ), lp->flags);
 
 	if (lp->flags & LPROPS_TAKEN) {
 		if (lp->flags & LPROPS_INDEX)
-			printk(KERN_CONT "index, taken");
+			pr_cont("index, taken");
 		else
-			printk(KERN_CONT "taken");
+			pr_cont("taken");
 	} else {
 		const char *s;
 
@@ -779,7 +704,7 @@
 				break;
 			}
 		}
-		printk(KERN_CONT "%s", s);
+		pr_cont("%s", s);
 	}
 
 	for (rb = rb_first((struct rb_root *)&c->buds); rb; rb = rb_next(rb)) {
@@ -794,109 +719,101 @@
 				 */
 				if (c->jheads &&
 				    lp->lnum == c->jheads[i].wbuf.lnum) {
-					printk(KERN_CONT ", jhead %s",
-					       dbg_jhead(i));
+					pr_cont(", jhead %s", dbg_jhead(i));
 					head = 1;
 				}
 			}
 			if (!head)
-				printk(KERN_CONT ", bud of jhead %s",
+				pr_cont(", bud of jhead %s",
 				       dbg_jhead(bud->jhead));
 		}
 	}
 	if (lp->lnum == c->gc_lnum)
-		printk(KERN_CONT ", GC LEB");
-	printk(KERN_CONT ")\n");
+		pr_cont(", GC LEB");
+	pr_cont(")\n");
 }
 
-void dbg_dump_lprops(struct ubifs_info *c)
+void ubifs_dump_lprops(struct ubifs_info *c)
 {
 	int lnum, err;
 	struct ubifs_lprops lp;
 	struct ubifs_lp_stats lst;
 
-	printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n",
-	       current->pid);
+	pr_err("(pid %d) start dumping LEB properties\n", current->pid);
 	ubifs_get_lp_stats(c, &lst);
-	dbg_dump_lstats(&lst);
+	ubifs_dump_lstats(&lst);
 
 	for (lnum = c->main_first; lnum < c->leb_cnt; lnum++) {
 		err = ubifs_read_one_lp(c, lnum, &lp);
 		if (err)
 			ubifs_err("cannot read lprops for LEB %d", lnum);
 
-		dbg_dump_lprop(c, &lp);
+		ubifs_dump_lprop(c, &lp);
 	}
-	printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n",
-	       current->pid);
+	pr_err("(pid %d) finish dumping LEB properties\n", current->pid);
 }
 
-void dbg_dump_lpt_info(struct ubifs_info *c)
+void ubifs_dump_lpt_info(struct ubifs_info *c)
 {
 	int i;
 
 	spin_lock(&dbg_lock);
-	printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid);
-	printk(KERN_DEBUG "\tlpt_sz:        %lld\n", c->lpt_sz);
-	printk(KERN_DEBUG "\tpnode_sz:      %d\n", c->pnode_sz);
-	printk(KERN_DEBUG "\tnnode_sz:      %d\n", c->nnode_sz);
-	printk(KERN_DEBUG "\tltab_sz:       %d\n", c->ltab_sz);
-	printk(KERN_DEBUG "\tlsave_sz:      %d\n", c->lsave_sz);
-	printk(KERN_DEBUG "\tbig_lpt:       %d\n", c->big_lpt);
-	printk(KERN_DEBUG "\tlpt_hght:      %d\n", c->lpt_hght);
-	printk(KERN_DEBUG "\tpnode_cnt:     %d\n", c->pnode_cnt);
-	printk(KERN_DEBUG "\tnnode_cnt:     %d\n", c->nnode_cnt);
-	printk(KERN_DEBUG "\tdirty_pn_cnt:  %d\n", c->dirty_pn_cnt);
-	printk(KERN_DEBUG "\tdirty_nn_cnt:  %d\n", c->dirty_nn_cnt);
-	printk(KERN_DEBUG "\tlsave_cnt:     %d\n", c->lsave_cnt);
-	printk(KERN_DEBUG "\tspace_bits:    %d\n", c->space_bits);
-	printk(KERN_DEBUG "\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits);
-	printk(KERN_DEBUG "\tlpt_offs_bits: %d\n", c->lpt_offs_bits);
-	printk(KERN_DEBUG "\tlpt_spc_bits:  %d\n", c->lpt_spc_bits);
-	printk(KERN_DEBUG "\tpcnt_bits:     %d\n", c->pcnt_bits);
-	printk(KERN_DEBUG "\tlnum_bits:     %d\n", c->lnum_bits);
-	printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
-	printk(KERN_DEBUG "\tLPT head is at %d:%d\n",
+	pr_err("(pid %d) dumping LPT information\n", current->pid);
+	pr_err("\tlpt_sz:        %lld\n", c->lpt_sz);
+	pr_err("\tpnode_sz:      %d\n", c->pnode_sz);
+	pr_err("\tnnode_sz:      %d\n", c->nnode_sz);
+	pr_err("\tltab_sz:       %d\n", c->ltab_sz);
+	pr_err("\tlsave_sz:      %d\n", c->lsave_sz);
+	pr_err("\tbig_lpt:       %d\n", c->big_lpt);
+	pr_err("\tlpt_hght:      %d\n", c->lpt_hght);
+	pr_err("\tpnode_cnt:     %d\n", c->pnode_cnt);
+	pr_err("\tnnode_cnt:     %d\n", c->nnode_cnt);
+	pr_err("\tdirty_pn_cnt:  %d\n", c->dirty_pn_cnt);
+	pr_err("\tdirty_nn_cnt:  %d\n", c->dirty_nn_cnt);
+	pr_err("\tlsave_cnt:     %d\n", c->lsave_cnt);
+	pr_err("\tspace_bits:    %d\n", c->space_bits);
+	pr_err("\tlpt_lnum_bits: %d\n", c->lpt_lnum_bits);
+	pr_err("\tlpt_offs_bits: %d\n", c->lpt_offs_bits);
+	pr_err("\tlpt_spc_bits:  %d\n", c->lpt_spc_bits);
+	pr_err("\tpcnt_bits:     %d\n", c->pcnt_bits);
+	pr_err("\tlnum_bits:     %d\n", c->lnum_bits);
+	pr_err("\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
+	pr_err("\tLPT head is at %d:%d\n",
 	       c->nhead_lnum, c->nhead_offs);
-	printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n",
-	       c->ltab_lnum, c->ltab_offs);
+	pr_err("\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs);
 	if (c->big_lpt)
-		printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n",
+		pr_err("\tLPT lsave is at %d:%d\n",
 		       c->lsave_lnum, c->lsave_offs);
 	for (i = 0; i < c->lpt_lebs; i++)
-		printk(KERN_DEBUG "\tLPT LEB %d free %d dirty %d tgc %d "
-		       "cmt %d\n", i + c->lpt_first, c->ltab[i].free,
-		       c->ltab[i].dirty, c->ltab[i].tgc, c->ltab[i].cmt);
+		pr_err("\tLPT LEB %d free %d dirty %d tgc %d cmt %d\n",
+		       i + c->lpt_first, c->ltab[i].free, c->ltab[i].dirty,
+		       c->ltab[i].tgc, c->ltab[i].cmt);
 	spin_unlock(&dbg_lock);
 }
 
-void dbg_dump_sleb(const struct ubifs_info *c,
-		   const struct ubifs_scan_leb *sleb, int offs)
+void ubifs_dump_sleb(const struct ubifs_info *c,
+		     const struct ubifs_scan_leb *sleb, int offs)
 {
 	struct ubifs_scan_node *snod;
 
-	printk(KERN_DEBUG "(pid %d) start dumping scanned data from LEB %d:%d\n",
+	pr_err("(pid %d) start dumping scanned data from LEB %d:%d\n",
 	       current->pid, sleb->lnum, offs);
 
 	list_for_each_entry(snod, &sleb->nodes, list) {
 		cond_resched();
-		printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", sleb->lnum,
-		       snod->offs, snod->len);
-		dbg_dump_node(c, snod->node);
+		pr_err("Dumping node at LEB %d:%d len %d\n",
+		       sleb->lnum, snod->offs, snod->len);
+		ubifs_dump_node(c, snod->node);
 	}
 }
 
-void dbg_dump_leb(const struct ubifs_info *c, int lnum)
+void ubifs_dump_leb(const struct ubifs_info *c, int lnum)
 {
 	struct ubifs_scan_leb *sleb;
 	struct ubifs_scan_node *snod;
 	void *buf;
 
-	if (dbg_is_tst_rcvry(c))
-		return;
-
-	printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
-	       current->pid, lnum);
+	pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum);
 
 	buf = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
 	if (!buf) {
@@ -910,18 +827,17 @@
 		goto out;
 	}
 
-	printk(KERN_DEBUG "LEB %d has %d nodes ending at %d\n", lnum,
+	pr_err("LEB %d has %d nodes ending at %d\n", lnum,
 	       sleb->nodes_cnt, sleb->endpt);
 
 	list_for_each_entry(snod, &sleb->nodes, list) {
 		cond_resched();
-		printk(KERN_DEBUG "Dumping node at LEB %d:%d len %d\n", lnum,
+		pr_err("Dumping node at LEB %d:%d len %d\n", lnum,
 		       snod->offs, snod->len);
-		dbg_dump_node(c, snod->node);
+		ubifs_dump_node(c, snod->node);
 	}
 
-	printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
-	       current->pid, lnum);
+	pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum);
 	ubifs_scan_destroy(sleb);
 
 out:
@@ -929,11 +845,12 @@
 	return;
 }
 
-void dbg_dump_znode(const struct ubifs_info *c,
-		    const struct ubifs_znode *znode)
+void ubifs_dump_znode(const struct ubifs_info *c,
+		      const struct ubifs_znode *znode)
 {
 	int n;
 	const struct ubifs_zbranch *zbr;
+	char key_buf[DBG_KEY_BUF_LEN];
 
 	spin_lock(&dbg_lock);
 	if (znode->parent)
@@ -941,103 +858,102 @@
 	else
 		zbr = &c->zroot;
 
-	printk(KERN_DEBUG "znode %p, LEB %d:%d len %d parent %p iip %d level %d"
-	       " child_cnt %d flags %lx\n", znode, zbr->lnum, zbr->offs,
-	       zbr->len, znode->parent, znode->iip, znode->level,
-	       znode->child_cnt, znode->flags);
+	pr_err("znode %p, LEB %d:%d len %d parent %p iip %d level %d child_cnt %d flags %lx\n",
+	       znode, zbr->lnum, zbr->offs, zbr->len, znode->parent, znode->iip,
+	       znode->level, znode->child_cnt, znode->flags);
 
 	if (znode->child_cnt <= 0 || znode->child_cnt > c->fanout) {
 		spin_unlock(&dbg_lock);
 		return;
 	}
 
-	printk(KERN_DEBUG "zbranches:\n");
+	pr_err("zbranches:\n");
 	for (n = 0; n < znode->child_cnt; n++) {
 		zbr = &znode->zbranch[n];
 		if (znode->level > 0)
-			printk(KERN_DEBUG "\t%d: znode %p LEB %d:%d len %d key "
-					  "%s\n", n, zbr->znode, zbr->lnum,
-					  zbr->offs, zbr->len,
-					  DBGKEY(&zbr->key));
+			pr_err("\t%d: znode %p LEB %d:%d len %d key %s\n",
+			       n, zbr->znode, zbr->lnum, zbr->offs, zbr->len,
+			       dbg_snprintf_key(c, &zbr->key, key_buf,
+						DBG_KEY_BUF_LEN));
 		else
-			printk(KERN_DEBUG "\t%d: LNC %p LEB %d:%d len %d key "
-					  "%s\n", n, zbr->znode, zbr->lnum,
-					  zbr->offs, zbr->len,
-					  DBGKEY(&zbr->key));
+			pr_err("\t%d: LNC %p LEB %d:%d len %d key %s\n",
+			       n, zbr->znode, zbr->lnum, zbr->offs, zbr->len,
+			       dbg_snprintf_key(c, &zbr->key, key_buf,
+						DBG_KEY_BUF_LEN));
 	}
 	spin_unlock(&dbg_lock);
 }
 
-void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
+void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
 {
 	int i;
 
-	printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n",
+	pr_err("(pid %d) start dumping heap cat %d (%d elements)\n",
 	       current->pid, cat, heap->cnt);
 	for (i = 0; i < heap->cnt; i++) {
 		struct ubifs_lprops *lprops = heap->arr[i];
 
-		printk(KERN_DEBUG "\t%d. LEB %d hpos %d free %d dirty %d "
-		       "flags %d\n", i, lprops->lnum, lprops->hpos,
-		       lprops->free, lprops->dirty, lprops->flags);
+		pr_err("\t%d. LEB %d hpos %d free %d dirty %d flags %d\n",
+		       i, lprops->lnum, lprops->hpos, lprops->free,
+		       lprops->dirty, lprops->flags);
 	}
-	printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid);
+	pr_err("(pid %d) finish dumping heap\n", current->pid);
 }
 
-void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
-		    struct ubifs_nnode *parent, int iip)
+void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+		      struct ubifs_nnode *parent, int iip)
 {
 	int i;
 
-	printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid);
-	printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
+	pr_err("(pid %d) dumping pnode:\n", current->pid);
+	pr_err("\taddress %zx parent %zx cnext %zx\n",
 	       (size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
-	printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
+	pr_err("\tflags %lu iip %d level %d num %d\n",
 	       pnode->flags, iip, pnode->level, pnode->num);
 	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
 		struct ubifs_lprops *lp = &pnode->lprops[i];
 
-		printk(KERN_DEBUG "\t%d: free %d dirty %d flags %d lnum %d\n",
+		pr_err("\t%d: free %d dirty %d flags %d lnum %d\n",
 		       i, lp->free, lp->dirty, lp->flags, lp->lnum);
 	}
 }
 
-void dbg_dump_tnc(struct ubifs_info *c)
+void ubifs_dump_tnc(struct ubifs_info *c)
 {
 	struct ubifs_znode *znode;
 	int level;
 
-	printk(KERN_DEBUG "\n");
-	printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid);
+	pr_err("\n");
+	pr_err("(pid %d) start dumping TNC tree\n", current->pid);
 	znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
 	level = znode->level;
-	printk(KERN_DEBUG "== Level %d ==\n", level);
+	pr_err("== Level %d ==\n", level);
 	while (znode) {
 		if (level != znode->level) {
 			level = znode->level;
-			printk(KERN_DEBUG "== Level %d ==\n", level);
+			pr_err("== Level %d ==\n", level);
 		}
-		dbg_dump_znode(c, znode);
+		ubifs_dump_znode(c, znode);
 		znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
 	}
-	printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid);
+	pr_err("(pid %d) finish dumping TNC tree\n", current->pid);
 }
 
 static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
 		      void *priv)
 {
-	dbg_dump_znode(c, znode);
+	ubifs_dump_znode(c, znode);
 	return 0;
 }
 
 /**
- * dbg_dump_index - dump the on-flash index.
+ * ubifs_dump_index - dump the on-flash index.
  * @c: UBIFS file-system description object
  *
- * This function dumps whole UBIFS indexing B-tree, unlike 'dbg_dump_tnc()'
+ * This function dumps whole UBIFS indexing B-tree, unlike 'ubifs_dump_tnc()'
  * which dumps only in-memory znodes and does not read znodes which from flash.
  */
-void dbg_dump_index(struct ubifs_info *c)
+void ubifs_dump_index(struct ubifs_info *c)
 {
 	dbg_walk_index(c, NULL, dump_znode, NULL);
 }
@@ -1123,15 +1039,15 @@
 
 out:
 	ubifs_msg("saved lprops statistics dump");
-	dbg_dump_lstats(&d->saved_lst);
+	ubifs_dump_lstats(&d->saved_lst);
 	ubifs_msg("saved budgeting info dump");
-	dbg_dump_budg(c, &d->saved_bi);
+	ubifs_dump_budg(c, &d->saved_bi);
 	ubifs_msg("saved idx_gc_cnt %d", d->saved_idx_gc_cnt);
 	ubifs_msg("current lprops statistics dump");
 	ubifs_get_lp_stats(c, &lst);
-	dbg_dump_lstats(&lst);
+	ubifs_dump_lstats(&lst);
 	ubifs_msg("current budgeting info dump");
-	dbg_dump_budg(c, &c->bi);
+	ubifs_dump_budg(c, &c->bi);
 	dump_stack();
 	return -EINVAL;
 }
@@ -1159,11 +1075,11 @@
 	mutex_lock(&ui->ui_mutex);
 	spin_lock(&ui->ui_lock);
 	if (ui->ui_size != ui->synced_i_size && !ui->dirty) {
-		ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode "
-			  "is clean", ui->ui_size, ui->synced_i_size);
+		ubifs_err("ui_size is %lld, synced_i_size is %lld, but inode is clean",
+			  ui->ui_size, ui->synced_i_size);
 		ubifs_err("i_ino %lu, i_mode %#x, i_size %lld", inode->i_ino,
 			  inode->i_mode, i_size_read(inode));
-		dbg_dump_stack();
+		dump_stack();
 		err = -EINVAL;
 	}
 	spin_unlock(&ui->ui_lock);
@@ -1222,18 +1138,17 @@
 	kfree(pdent);
 
 	if (i_size_read(dir) != size) {
-		ubifs_err("directory inode %lu has size %llu, "
-			  "but calculated size is %llu", dir->i_ino,
-			  (unsigned long long)i_size_read(dir),
+		ubifs_err("directory inode %lu has size %llu, but calculated size is %llu",
+			  dir->i_ino, (unsigned long long)i_size_read(dir),
 			  (unsigned long long)size);
-		dbg_dump_inode(c, dir);
+		ubifs_dump_inode(c, dir);
 		dump_stack();
 		return -EINVAL;
 	}
 	if (dir->i_nlink != nlink) {
-		ubifs_err("directory inode %lu has nlink %u, but calculated "
-			  "nlink is %u", dir->i_ino, dir->i_nlink, nlink);
-		dbg_dump_inode(c, dir);
+		ubifs_err("directory inode %lu has nlink %u, but calculated nlink is %u",
+			  dir->i_ino, dir->i_nlink, nlink);
+		ubifs_dump_inode(c, dir);
 		dump_stack();
 		return -EINVAL;
 	}
@@ -1260,6 +1175,7 @@
 	int err, nlen1, nlen2, cmp;
 	struct ubifs_dent_node *dent1, *dent2;
 	union ubifs_key key;
+	char key_buf[DBG_KEY_BUF_LEN];
 
 	ubifs_assert(!keys_cmp(c, &zbr1->key, &zbr2->key));
 	dent1 = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS);
@@ -1289,21 +1205,25 @@
 	err = 1;
 	key_read(c, &dent1->key, &key);
 	if (keys_cmp(c, &zbr1->key, &key)) {
-		dbg_err("1st entry at %d:%d has key %s", zbr1->lnum,
-			zbr1->offs, DBGKEY(&key));
-		dbg_err("but it should have key %s according to tnc",
-			DBGKEY(&zbr1->key));
-		dbg_dump_node(c, dent1);
+		ubifs_err("1st entry at %d:%d has key %s", zbr1->lnum,
+			  zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+						       DBG_KEY_BUF_LEN));
+		ubifs_err("but it should have key %s according to tnc",
+			  dbg_snprintf_key(c, &zbr1->key, key_buf,
+					   DBG_KEY_BUF_LEN));
+		ubifs_dump_node(c, dent1);
 		goto out_free;
 	}
 
 	key_read(c, &dent2->key, &key);
 	if (keys_cmp(c, &zbr2->key, &key)) {
-		dbg_err("2nd entry at %d:%d has key %s", zbr1->lnum,
-			zbr1->offs, DBGKEY(&key));
-		dbg_err("but it should have key %s according to tnc",
-			DBGKEY(&zbr2->key));
-		dbg_dump_node(c, dent2);
+		ubifs_err("2nd entry at %d:%d has key %s", zbr1->lnum,
+			  zbr1->offs, dbg_snprintf_key(c, &key, key_buf,
+						       DBG_KEY_BUF_LEN));
+		ubifs_err("but it should have key %s according to tnc",
+			  dbg_snprintf_key(c, &zbr2->key, key_buf,
+					   DBG_KEY_BUF_LEN));
+		ubifs_dump_node(c, dent2);
 		goto out_free;
 	}
 
@@ -1316,15 +1236,15 @@
 		goto out_free;
 	}
 	if (cmp == 0 && nlen1 == nlen2)
-		dbg_err("2 xent/dent nodes with the same name");
+		ubifs_err("2 xent/dent nodes with the same name");
 	else
-		dbg_err("bad order of colliding key %s",
-			DBGKEY(&key));
+		ubifs_err("bad order of colliding key %s",
+			  dbg_snprintf_key(c, &key, key_buf, DBG_KEY_BUF_LEN));
 
 	ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
-	dbg_dump_node(c, dent1);
+	ubifs_dump_node(c, dent1);
 	ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
-	dbg_dump_node(c, dent2);
+	ubifs_dump_node(c, dent2);
 
 out_free:
 	kfree(dent2);
@@ -1527,10 +1447,10 @@
 out:
 	ubifs_err("failed, error %d", err);
 	ubifs_msg("dump of the znode");
-	dbg_dump_znode(c, znode);
+	ubifs_dump_znode(c, znode);
 	if (zp) {
 		ubifs_msg("dump of the parent znode");
-		dbg_dump_znode(c, zp);
+		ubifs_dump_znode(c, zp);
 	}
 	dump_stack();
 	return -EINVAL;
@@ -1597,9 +1517,9 @@
 				return err;
 			if (err) {
 				ubifs_msg("first znode");
-				dbg_dump_znode(c, prev);
+				ubifs_dump_znode(c, prev);
 				ubifs_msg("second znode");
-				dbg_dump_znode(c, znode);
+				ubifs_dump_znode(c, znode);
 				return -EINVAL;
 			}
 		}
@@ -1686,9 +1606,9 @@
 		if (znode_cb) {
 			err = znode_cb(c, znode, priv);
 			if (err) {
-				ubifs_err("znode checking function returned "
-					  "error %d", err);
-				dbg_dump_znode(c, znode);
+				ubifs_err("znode checking function returned error %d",
+					  err);
+				ubifs_dump_znode(c, znode);
 				goto out_dump;
 			}
 		}
@@ -1697,9 +1617,7 @@
 				zbr = &znode->zbranch[idx];
 				err = leaf_cb(c, zbr, priv);
 				if (err) {
-					ubifs_err("leaf checking function "
-						  "returned error %d, for leaf "
-						  "at LEB %d:%d",
+					ubifs_err("leaf checking function returned error %d, for leaf at LEB %d:%d",
 						  err, zbr->lnum, zbr->offs);
 					goto out_dump;
 				}
@@ -1756,7 +1674,7 @@
 	else
 		zbr = &c->zroot;
 	ubifs_msg("dump of znode at LEB %d:%d", zbr->lnum, zbr->offs);
-	dbg_dump_znode(c, znode);
+	ubifs_dump_znode(c, znode);
 out_unlock:
 	mutex_unlock(&c->tnc_mutex);
 	return err;
@@ -1807,8 +1725,8 @@
 	}
 
 	if (calc != idx_size) {
-		ubifs_err("index size check failed: calculated size is %lld, "
-			  "should be %lld", calc, idx_size);
+		ubifs_err("index size check failed: calculated size is %lld, should be %lld",
+			  calc, idx_size);
 		dump_stack();
 		return -EINVAL;
 	}
@@ -2120,8 +2038,7 @@
 		fscki = read_add_inode(c, priv, inum);
 		if (IS_ERR(fscki)) {
 			err = PTR_ERR(fscki);
-			ubifs_err("error %d while processing data node and "
-				  "trying to find inode node %lu",
+			ubifs_err("error %d while processing data node and trying to find inode node %lu",
 				  err, (unsigned long)inum);
 			goto out_dump;
 		}
@@ -2131,9 +2048,8 @@
 		blk_offs <<= UBIFS_BLOCK_SHIFT;
 		blk_offs += le32_to_cpu(dn->size);
 		if (blk_offs > fscki->size) {
-			ubifs_err("data node at LEB %d:%d is not within inode "
-				  "size %lld", zbr->lnum, zbr->offs,
-				  fscki->size);
+			ubifs_err("data node at LEB %d:%d is not within inode size %lld",
+				  zbr->lnum, zbr->offs, fscki->size);
 			err = -EINVAL;
 			goto out_dump;
 		}
@@ -2154,8 +2070,7 @@
 		fscki = read_add_inode(c, priv, inum);
 		if (IS_ERR(fscki)) {
 			err = PTR_ERR(fscki);
-			ubifs_err("error %d while processing entry node and "
-				  "trying to find inode node %lu",
+			ubifs_err("error %d while processing entry node and trying to find inode node %lu",
 				  err, (unsigned long)inum);
 			goto out_dump;
 		}
@@ -2167,8 +2082,7 @@
 		fscki1 = read_add_inode(c, priv, inum);
 		if (IS_ERR(fscki1)) {
 			err = PTR_ERR(fscki1);
-			ubifs_err("error %d while processing entry node and "
-				  "trying to find parent inode node %lu",
+			ubifs_err("error %d while processing entry node and trying to find parent inode node %lu",
 				  err, (unsigned long)inum);
 			goto out_dump;
 		}
@@ -2192,7 +2106,7 @@
 
 out_dump:
 	ubifs_msg("dump of node at LEB %d:%d", zbr->lnum, zbr->offs);
-	dbg_dump_node(c, node);
+	ubifs_dump_node(c, node);
 out_free:
 	kfree(node);
 	return err;
@@ -2258,61 +2172,52 @@
 			 */
 			if (fscki->inum != UBIFS_ROOT_INO &&
 			    fscki->references != 1) {
-				ubifs_err("directory inode %lu has %d "
-					  "direntries which refer it, but "
-					  "should be 1",
+				ubifs_err("directory inode %lu has %d direntries which refer it, but should be 1",
 					  (unsigned long)fscki->inum,
 					  fscki->references);
 				goto out_dump;
 			}
 			if (fscki->inum == UBIFS_ROOT_INO &&
 			    fscki->references != 0) {
-				ubifs_err("root inode %lu has non-zero (%d) "
-					  "direntries which refer it",
+				ubifs_err("root inode %lu has non-zero (%d) direntries which refer it",
 					  (unsigned long)fscki->inum,
 					  fscki->references);
 				goto out_dump;
 			}
 			if (fscki->calc_sz != fscki->size) {
-				ubifs_err("directory inode %lu size is %lld, "
-					  "but calculated size is %lld",
+				ubifs_err("directory inode %lu size is %lld, but calculated size is %lld",
 					  (unsigned long)fscki->inum,
 					  fscki->size, fscki->calc_sz);
 				goto out_dump;
 			}
 			if (fscki->calc_cnt != fscki->nlink) {
-				ubifs_err("directory inode %lu nlink is %d, "
-					  "but calculated nlink is %d",
+				ubifs_err("directory inode %lu nlink is %d, but calculated nlink is %d",
 					  (unsigned long)fscki->inum,
 					  fscki->nlink, fscki->calc_cnt);
 				goto out_dump;
 			}
 		} else {
 			if (fscki->references != fscki->nlink) {
-				ubifs_err("inode %lu nlink is %d, but "
-					  "calculated nlink is %d",
+				ubifs_err("inode %lu nlink is %d, but calculated nlink is %d",
 					  (unsigned long)fscki->inum,
 					  fscki->nlink, fscki->references);
 				goto out_dump;
 			}
 		}
 		if (fscki->xattr_sz != fscki->calc_xsz) {
-			ubifs_err("inode %lu has xattr size %u, but "
-				  "calculated size is %lld",
+			ubifs_err("inode %lu has xattr size %u, but calculated size is %lld",
 				  (unsigned long)fscki->inum, fscki->xattr_sz,
 				  fscki->calc_xsz);
 			goto out_dump;
 		}
 		if (fscki->xattr_cnt != fscki->calc_xcnt) {
-			ubifs_err("inode %lu has %u xattrs, but "
-				  "calculated count is %lld",
+			ubifs_err("inode %lu has %u xattrs, but calculated count is %lld",
 				  (unsigned long)fscki->inum,
 				  fscki->xattr_cnt, fscki->calc_xcnt);
 			goto out_dump;
 		}
 		if (fscki->xattr_nms != fscki->calc_xnms) {
-			ubifs_err("inode %lu has xattr names' size %u, but "
-				  "calculated names' size is %lld",
+			ubifs_err("inode %lu has xattr names' size %u, but calculated names' size is %lld",
 				  (unsigned long)fscki->inum, fscki->xattr_nms,
 				  fscki->calc_xnms);
 			goto out_dump;
@@ -2350,7 +2255,7 @@
 
 	ubifs_msg("dump of the inode %lu sitting in LEB %d:%d",
 		  (unsigned long)fscki->inum, zbr->lnum, zbr->offs);
-	dbg_dump_node(c, ino);
+	ubifs_dump_node(c, ino);
 	kfree(ino);
 	return -EINVAL;
 }
@@ -2421,12 +2326,12 @@
 
 		if (sa->type != UBIFS_DATA_NODE) {
 			ubifs_err("bad node type %d", sa->type);
-			dbg_dump_node(c, sa->node);
+			ubifs_dump_node(c, sa->node);
 			return -EINVAL;
 		}
 		if (sb->type != UBIFS_DATA_NODE) {
 			ubifs_err("bad node type %d", sb->type);
-			dbg_dump_node(c, sb->node);
+			ubifs_dump_node(c, sb->node);
 			return -EINVAL;
 		}
 
@@ -2457,8 +2362,8 @@
 	return 0;
 
 error_dump:
-	dbg_dump_node(c, sa->node);
-	dbg_dump_node(c, sb->node);
+	ubifs_dump_node(c, sa->node);
+	ubifs_dump_node(c, sb->node);
 	return -EINVAL;
 }
 
@@ -2489,13 +2394,13 @@
 		if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
 		    sa->type != UBIFS_XENT_NODE) {
 			ubifs_err("bad node type %d", sa->type);
-			dbg_dump_node(c, sa->node);
+			ubifs_dump_node(c, sa->node);
 			return -EINVAL;
 		}
 		if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE &&
 		    sa->type != UBIFS_XENT_NODE) {
 			ubifs_err("bad node type %d", sb->type);
-			dbg_dump_node(c, sb->node);
+			ubifs_dump_node(c, sb->node);
 			return -EINVAL;
 		}
 
@@ -2545,16 +2450,16 @@
 
 error_dump:
 	ubifs_msg("dumping first node");
-	dbg_dump_node(c, sa->node);
+	ubifs_dump_node(c, sa->node);
 	ubifs_msg("dumping second node");
-	dbg_dump_node(c, sb->node);
+	ubifs_dump_node(c, sb->node);
 	return -EINVAL;
 	return 0;
 }
 
 static inline int chance(unsigned int n, unsigned int out_of)
 {
-	return !!((random32() % out_of) + 1 <= n);
+	return !!((prandom_u32() % out_of) + 1 <= n);
 
 }
 
@@ -2572,13 +2477,13 @@
 			if (chance(1, 2)) {
 				d->pc_delay = 1;
 				/* Fail within 1 minute */
-				delay = random32() % 60000;
+				delay = prandom_u32() % 60000;
 				d->pc_timeout = jiffies;
 				d->pc_timeout += msecs_to_jiffies(delay);
 				ubifs_warn("failing after %lums", delay);
 			} else {
 				d->pc_delay = 2;
-				delay = random32() % 10000;
+				delay = prandom_u32() % 10000;
 				/* Fail within 10000 operations */
 				d->pc_cnt_max = delay;
 				ubifs_warn("failing after %lu calls", delay);
@@ -2652,31 +2557,29 @@
 	return 1;
 }
 
-static void cut_data(const void *buf, unsigned int len)
+static int corrupt_data(const struct ubifs_info *c, const void *buf,
+			unsigned int len)
 {
-	unsigned int from, to, i, ffs = chance(1, 2);
+	unsigned int from, to, ffs = chance(1, 2);
 	unsigned char *p = (void *)buf;
 
-	from = random32() % (len + 1);
-	if (chance(1, 2))
-		to = random32() % (len - from + 1);
-	else
-		to = len;
+	from = prandom_u32() % (len + 1);
+	/* Corruption may only span one max. write unit */
+	to = min(len, ALIGN(from, c->max_write_size));
 
-	if (from < to)
-		ubifs_warn("filled bytes %u-%u with %s", from, to - 1,
-			   ffs ? "0xFFs" : "random data");
+	ubifs_warn("filled bytes %u-%u with %s", from, to - 1,
+		   ffs ? "0xFFs" : "random data");
 
 	if (ffs)
-		for (i = from; i < to; i++)
-			p[i] = 0xFF;
+		memset(p + from, 0xFF, to - from);
 	else
-		for (i = from; i < to; i++)
-			p[i] = random32() % 0x100;
+		prandom_bytes(p + from, to - from);
+
+	return to;
 }
 
 int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
-		  int offs, int len, int dtype)
+		  int offs, int len)
 {
 	int err, failing;
 
@@ -2685,8 +2588,11 @@
 
 	failing = power_cut_emulated(c, lnum, 1);
-	if (failing)
-		cut_data(buf, len);
-	err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
+	if (failing) {
+		len = corrupt_data(c, buf, len);
+		ubifs_warn("actually write %d bytes to LEB %d:%d (the buffer was corrupted)",
+			   len, lnum, offs);
+	}
+	err = ubi_leb_write(c->ubi, lnum, buf, offs, len);
 	if (err)
 		return err;
 	if (failing)
@@ -2695,7 +2600,7 @@
 }
 
 int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf,
-		   int len, int dtype)
+		   int len)
 {
 	int err;
 
@@ -2703,7 +2608,7 @@
 		return -EROFS;
 	if (power_cut_emulated(c, lnum, 1))
 		return -EROFS;
-	err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
+	err = ubi_leb_change(c->ubi, lnum, buf, len);
 	if (err)
 		return err;
 	if (power_cut_emulated(c, lnum, 1))
@@ -2727,7 +2632,7 @@
 	return 0;
 }
 
-int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype)
+int dbg_leb_map(struct ubifs_info *c, int lnum)
 {
 	int err;
 
@@ -2735,7 +2640,7 @@
 		return -EROFS;
 	if (power_cut_emulated(c, lnum, 0))
 		return -EROFS;
-	err = ubi_leb_map(c->ubi, lnum, dtype);
+	err = ubi_leb_map(c->ubi, lnum);
 	if (err)
 		return err;
 	if (power_cut_emulated(c, lnum, 0))
@@ -2802,6 +2707,8 @@
 		val = d->chk_fs;
 	else if (dent == d->dfs_tst_rcvry)
 		val = d->tst_rcvry;
+	else if (dent == d->dfs_ro_error)
+		val = c->ro_error;
 	else
 		return -EINVAL;
 
@@ -2855,16 +2762,16 @@
 	 * 'ubifs-debug' file-system instead.
 	 */
 	if (file->f_path.dentry == d->dfs_dump_lprops) {
-		dbg_dump_lprops(c);
+		ubifs_dump_lprops(c);
 		return count;
 	}
 	if (file->f_path.dentry == d->dfs_dump_budg) {
-		dbg_dump_budg(c, &c->bi);
+		ubifs_dump_budg(c, &c->bi);
 		return count;
 	}
 	if (file->f_path.dentry == d->dfs_dump_tnc) {
 		mutex_lock(&c->tnc_mutex);
-		dbg_dump_tnc(c);
+		ubifs_dump_tnc(c);
 		mutex_unlock(&c->tnc_mutex);
 		return count;
 	}
@@ -2885,6 +2792,8 @@
 		d->chk_fs = val;
 	else if (dent == d->dfs_tst_rcvry)
 		d->tst_rcvry = val;
+	else if (dent == d->dfs_ro_error)
+		c->ro_error = !!val;
 	else
 		return -EINVAL;
 
@@ -2918,6 +2827,9 @@
 	struct dentry *dent;
 	struct ubifs_debug_info *d = c->dbg;
 
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
+		return 0;
+
 	n = snprintf(d->dfs_dir_name, UBIFS_DFS_DIR_LEN + 1, UBIFS_DFS_DIR_NAME,
 		     c->vi.ubi_num, c->vi.vol_id);
 	if (n == UBIFS_DFS_DIR_LEN) {
@@ -2993,6 +2905,13 @@
 		goto out_remove;
 	d->dfs_tst_rcvry = dent;
 
+	fname = "ro_error";
+	dent = debugfs_create_file(fname, S_IRUSR | S_IWUSR, d->dfs_dir, c,
+				   &dfs_fops);
+	if (IS_ERR_OR_NULL(dent))
+		goto out_remove;
+	d->dfs_ro_error = dent;
+
 	return 0;
 
 out_remove:
@@ -3010,7 +2929,8 @@
  */
 void dbg_debugfs_exit_fs(struct ubifs_info *c)
 {
-	debugfs_remove_recursive(c->dbg->dfs_dir);
+	if (IS_ENABLED(CONFIG_DEBUG_FS))
+		debugfs_remove_recursive(c->dbg->dfs_dir);
 }
 
 struct ubifs_global_debug_info ubifs_dbg;
@@ -3095,6 +3015,9 @@
 	const char *fname;
 	struct dentry *dent;
 
+	if (!IS_ENABLED(CONFIG_DEBUG_FS))
+		return 0;
+
 	fname = "ubifs";
 	dent = debugfs_create_dir(fname, NULL);
 	if (IS_ERR_OR_NULL(dent))
@@ -3159,7 +3082,8 @@
  */
 void dbg_debugfs_exit(void)
 {
-	debugfs_remove_recursive(dfs_rootdir);
+	if (IS_ENABLED(CONFIG_DEBUG_FS))
+		debugfs_remove_recursive(dfs_rootdir);
 }
 
 /**
@@ -3187,5 +3111,3 @@
 {
 	kfree(c->dbg);
 }
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
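For reference, the corrupt_data() helper added in the debug.c hunks above no longer corrupts an arbitrary tail of the buffer the way cut_data() did: the corrupted range now ends at the max. write unit boundary containing the randomly chosen start offset, and the emulated write is truncated there. A minimal userspace sketch of that range computation, assuming an illustrative 2048-byte max. write unit (the real value comes from c->max_write_size), is:

#include <stdio.h>

/* Round x up to the next multiple of a (a must be a power of two), as ALIGN() does. */
#define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int max_write_size = 2048;	/* assumed value for illustration */
	unsigned int len = 5000;		/* length of the buffer being written */
	unsigned int from = 3000;		/* randomly chosen corruption start */
	unsigned int to;

	/* Corruption may only span one max. write unit, as in corrupt_data(). */
	to = ALIGN(from, max_write_size);
	if (to > len)
		to = len;

	printf("fill bytes %u-%u, then write only %u bytes\n", from, to - 1, to);
	return 0;
}

With from = 3000 the corruption stops at byte 4095, mirroring how a real power cut can only garble the write unit that was being programmed.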
diff -ur a/fs/ubifs/debug.h b/fs/ubifs/debug.h
--- a/fs/ubifs/debug.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/debug.h	2014-02-17 11:56:58.000000000 +0100
@@ -29,8 +29,6 @@
 typedef int (*dbg_znode_callback)(struct ubifs_info *c,
 				  struct ubifs_znode *znode, void *priv);
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
-
 /*
  * The UBIFS debugfs directory name pattern and maximum name length (3 for "ubi"
  * + 1 for "_" plus 2x2 for 2 UBI numbers and 1 for the trailing zero byte).
@@ -81,6 +79,10 @@
  * @dfs_chk_lprops: debugfs knob to enable UBIFS LEB properties extra checks
  * @dfs_chk_fs: debugfs knob to enable UBIFS contents extra checks
  * @dfs_tst_rcvry: debugfs knob to enable UBIFS recovery testing
+ * @dfs_ro_error: debugfs knob to switch UBIFS to R/O mode (different to
+ *                re-mounting to R/O mode because it does not flush any buffers
+ *                and UBIFS just starts returning -EROFS on all write
+ *                operations)
  */
 struct ubifs_debug_info {
 	struct ubifs_zbranch old_zroot;
@@ -124,6 +126,7 @@
 	struct dentry *dfs_chk_lprops;
 	struct dentry *dfs_chk_fs;
 	struct dentry *dfs_tst_rcvry;
+	struct dentry *dfs_ro_error;
 };
 
 /**
@@ -147,63 +150,50 @@
 
 #define ubifs_assert(expr) do {                                                \
 	if (unlikely(!(expr))) {                                               \
-		printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
+		pr_crit("UBIFS assert failed in %s at %u (pid %d)\n",          \
 		       __func__, __LINE__, current->pid);                      \
-		dbg_dump_stack();                                              \
+		dump_stack();                                                  \
 	}                                                                      \
 } while (0)
 
 #define ubifs_assert_cmt_locked(c) do {                                        \
 	if (unlikely(down_write_trylock(&(c)->commit_sem))) {                  \
 		up_write(&(c)->commit_sem);                                    \
-		printk(KERN_CRIT "commit lock is not locked!\n");              \
+		pr_crit("commit lock is not locked!\n");                       \
 		ubifs_assert(0);                                               \
 	}                                                                      \
 } while (0)
 
-#define dbg_dump_stack() dump_stack()
-
-#define dbg_err(fmt, ...) do {                                                 \
-	spin_lock(&dbg_lock);                                                  \
-	ubifs_err(fmt, ##__VA_ARGS__);                                         \
-	spin_unlock(&dbg_lock);                                                \
-} while (0)
-
-const char *dbg_key_str0(const struct ubifs_info *c,
-			 const union ubifs_key *key);
-const char *dbg_key_str1(const struct ubifs_info *c,
-			 const union ubifs_key *key);
-
-/*
- * TODO: these macros are now broken because there is no locking around them
- * and we use a global buffer for the key string. This means that in case of
- * concurrent execution we will end up with incorrect and messy key strings.
- */
-#define DBGKEY(key) dbg_key_str0(c, (key))
-#define DBGKEY1(key) dbg_key_str1(c, (key))
-
-extern spinlock_t dbg_lock;
-
 #define ubifs_dbg_msg(type, fmt, ...) \
-	pr_debug("UBIFS DBG " type ": " fmt "\n", ##__VA_ARGS__)
+	pr_debug("UBIFS DBG " type " (pid %d): " fmt "\n", current->pid,       \
+		 ##__VA_ARGS__)
 
-/* Just a debugging messages not related to any specific UBIFS subsystem */
-#define dbg_msg(fmt, ...)                                                     \
-	printk(KERN_DEBUG "UBIFS DBG (pid %d): %s: " fmt "\n", current->pid,  \
-	       __func__, ##__VA_ARGS__)
+#define DBG_KEY_BUF_LEN 48
+#define ubifs_dbg_msg_key(type, key, fmt, ...) do {                            \
+	char __tmp_key_buf[DBG_KEY_BUF_LEN];                                   \
+	pr_debug("UBIFS DBG " type " (pid %d): " fmt "%s\n", current->pid,     \
+		 ##__VA_ARGS__,                                                \
+		 dbg_snprintf_key(c, key, __tmp_key_buf, DBG_KEY_BUF_LEN));    \
+} while (0)
 
 /* General messages */
 #define dbg_gen(fmt, ...)   ubifs_dbg_msg("gen", fmt, ##__VA_ARGS__)
 /* Additional journal messages */
 #define dbg_jnl(fmt, ...)   ubifs_dbg_msg("jnl", fmt, ##__VA_ARGS__)
+#define dbg_jnlk(key, fmt, ...) \
+	ubifs_dbg_msg_key("jnl", key, fmt, ##__VA_ARGS__)
 /* Additional TNC messages */
 #define dbg_tnc(fmt, ...)   ubifs_dbg_msg("tnc", fmt, ##__VA_ARGS__)
+#define dbg_tnck(key, fmt, ...) \
+	ubifs_dbg_msg_key("tnc", key, fmt, ##__VA_ARGS__)
 /* Additional lprops messages */
 #define dbg_lp(fmt, ...)    ubifs_dbg_msg("lp", fmt, ##__VA_ARGS__)
 /* Additional LEB find messages */
 #define dbg_find(fmt, ...)  ubifs_dbg_msg("find", fmt, ##__VA_ARGS__)
 /* Additional mount messages */
 #define dbg_mnt(fmt, ...)   ubifs_dbg_msg("mnt", fmt, ##__VA_ARGS__)
+#define dbg_mntk(key, fmt, ...) \
+	ubifs_dbg_msg_key("mnt", key, fmt, ##__VA_ARGS__)
 /* Additional I/O messages */
 #define dbg_io(fmt, ...)    ubifs_dbg_msg("io", fmt, ##__VA_ARGS__)
 /* Additional commit messages */
@@ -259,27 +249,29 @@
 const char *dbg_jhead(int jhead);
 const char *dbg_get_key_dump(const struct ubifs_info *c,
 			     const union ubifs_key *key);
-void dbg_dump_inode(struct ubifs_info *c, const struct inode *inode);
-void dbg_dump_node(const struct ubifs_info *c, const void *node);
-void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
-		       int offs);
-void dbg_dump_budget_req(const struct ubifs_budget_req *req);
-void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
-void dbg_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi);
-void dbg_dump_lprop(const struct ubifs_info *c, const struct ubifs_lprops *lp);
-void dbg_dump_lprops(struct ubifs_info *c);
-void dbg_dump_lpt_info(struct ubifs_info *c);
-void dbg_dump_leb(const struct ubifs_info *c, int lnum);
-void dbg_dump_sleb(const struct ubifs_info *c,
-		   const struct ubifs_scan_leb *sleb, int offs);
-void dbg_dump_znode(const struct ubifs_info *c,
-		    const struct ubifs_znode *znode);
-void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat);
-void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
-		    struct ubifs_nnode *parent, int iip);
-void dbg_dump_tnc(struct ubifs_info *c);
-void dbg_dump_index(struct ubifs_info *c);
-void dbg_dump_lpt_lebs(const struct ubifs_info *c);
+const char *dbg_snprintf_key(const struct ubifs_info *c,
+			     const union ubifs_key *key, char *buffer, int len);
+void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode);
+void ubifs_dump_node(const struct ubifs_info *c, const void *node);
+void ubifs_dump_budget_req(const struct ubifs_budget_req *req);
+void ubifs_dump_lstats(const struct ubifs_lp_stats *lst);
+void ubifs_dump_budg(struct ubifs_info *c, const struct ubifs_budg_info *bi);
+void ubifs_dump_lprop(const struct ubifs_info *c,
+		      const struct ubifs_lprops *lp);
+void ubifs_dump_lprops(struct ubifs_info *c);
+void ubifs_dump_lpt_info(struct ubifs_info *c);
+void ubifs_dump_leb(const struct ubifs_info *c, int lnum);
+void ubifs_dump_sleb(const struct ubifs_info *c,
+		     const struct ubifs_scan_leb *sleb, int offs);
+void ubifs_dump_znode(const struct ubifs_info *c,
+		      const struct ubifs_znode *znode);
+void ubifs_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap,
+		     int cat);
+void ubifs_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+		      struct ubifs_nnode *parent, int iip);
+void ubifs_dump_tnc(struct ubifs_info *c);
+void ubifs_dump_index(struct ubifs_info *c);
+void ubifs_dump_lpt_lebs(const struct ubifs_info *c);
 
 int dbg_walk_index(struct ubifs_info *c, dbg_leaf_callback leaf_cb,
 		   dbg_znode_callback znode_cb, void *priv);
@@ -309,11 +301,10 @@
 int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head);
 
 int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
-		  int len, int dtype);
-int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len,
-		   int dtype);
+		  int len);
+int dbg_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len);
 int dbg_leb_unmap(struct ubifs_info *c, int lnum);
-int dbg_leb_map(struct ubifs_info *c, int lnum, int dtype);
+int dbg_leb_map(struct ubifs_info *c, int lnum);
 
 /* Debugfs-related stuff */
 int dbg_debugfs_init(void);
@@ -321,155 +312,4 @@
 int dbg_debugfs_init_fs(struct ubifs_info *c);
 void dbg_debugfs_exit_fs(struct ubifs_info *c);
 
-#else /* !CONFIG_UBIFS_FS_DEBUG */
-
-/* Use "if (0)" to make compiler check arguments even if debugging is off */
-#define ubifs_assert(expr)  do {                                               \
-	if (0)                                                                 \
-		printk(KERN_CRIT "UBIFS assert failed in %s at %u (pid %d)\n", \
-		       __func__, __LINE__, current->pid);                      \
-} while (0)
-
-#define dbg_err(fmt, ...)   do {                   \
-	if (0)                                     \
-		ubifs_err(fmt, ##__VA_ARGS__);     \
-} while (0)
-
-#define DBGKEY(key)  ((char *)(key))
-#define DBGKEY1(key) ((char *)(key))
-
-#define ubifs_dbg_msg(fmt, ...) do {                        \
-	if (0)                                              \
-		printk(KERN_DEBUG fmt "\n", ##__VA_ARGS__); \
-} while (0)
-
-#define dbg_dump_stack()
-#define ubifs_assert_cmt_locked(c)
-
-#define dbg_msg(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gen(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_jnl(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_tnc(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_lp(fmt, ...)    ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_find(fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_mnt(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_io(fmt, ...)    ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_cmt(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_budg(fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_log(fmt, ...)   ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_gc(fmt, ...)    ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_scan(fmt, ...)  ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-#define dbg_rcvry(fmt, ...) ubifs_dbg_msg(fmt, ##__VA_ARGS__)
-
-static inline int ubifs_debugging_init(struct ubifs_info *c)      { return 0; }
-static inline void ubifs_debugging_exit(struct ubifs_info *c)     { return; }
-static inline const char *dbg_ntype(int type)                     { return ""; }
-static inline const char *dbg_cstate(int cmt_state)               { return ""; }
-static inline const char *dbg_jhead(int jhead)                    { return ""; }
-static inline const char *
-dbg_get_key_dump(const struct ubifs_info *c,
-		 const union ubifs_key *key)                      { return ""; }
-static inline void dbg_dump_inode(struct ubifs_info *c,
-				  const struct inode *inode)      { return; }
-static inline void dbg_dump_node(const struct ubifs_info *c,
-				 const void *node)                { return; }
-static inline void dbg_dump_lpt_node(const struct ubifs_info *c,
-				     void *node, int lnum,
-				     int offs)                    { return; }
-static inline void
-dbg_dump_budget_req(const struct ubifs_budget_req *req)           { return; }
-static inline void
-dbg_dump_lstats(const struct ubifs_lp_stats *lst)                 { return; }
-static inline void
-dbg_dump_budg(struct ubifs_info *c,
-	      const struct ubifs_budg_info *bi)                   { return; }
-static inline void dbg_dump_lprop(const struct ubifs_info *c,
-				  const struct ubifs_lprops *lp)  { return; }
-static inline void dbg_dump_lprops(struct ubifs_info *c)          { return; }
-static inline void dbg_dump_lpt_info(struct ubifs_info *c)        { return; }
-static inline void dbg_dump_leb(const struct ubifs_info *c,
-				int lnum)                         { return; }
-static inline void
-dbg_dump_sleb(const struct ubifs_info *c,
-	      const struct ubifs_scan_leb *sleb, int offs)        { return; }
-static inline void
-dbg_dump_znode(const struct ubifs_info *c,
-	       const struct ubifs_znode *znode)                   { return; }
-static inline void dbg_dump_heap(struct ubifs_info *c,
-				 struct ubifs_lpt_heap *heap,
-				 int cat)                         { return; }
-static inline void dbg_dump_pnode(struct ubifs_info *c,
-				  struct ubifs_pnode *pnode,
-				  struct ubifs_nnode *parent,
-				  int iip)                        { return; }
-static inline void dbg_dump_tnc(struct ubifs_info *c)             { return; }
-static inline void dbg_dump_index(struct ubifs_info *c)           { return; }
-static inline void dbg_dump_lpt_lebs(const struct ubifs_info *c)  { return; }
-
-static inline int dbg_walk_index(struct ubifs_info *c,
-				 dbg_leaf_callback leaf_cb,
-				 dbg_znode_callback znode_cb,
-				 void *priv)                      { return 0; }
-static inline void dbg_save_space_info(struct ubifs_info *c)      { return; }
-static inline int dbg_check_space_info(struct ubifs_info *c)      { return 0; }
-static inline int dbg_check_lprops(struct ubifs_info *c)          { return 0; }
-static inline int
-dbg_old_index_check_init(struct ubifs_info *c,
-			 struct ubifs_zbranch *zroot)             { return 0; }
-static inline int
-dbg_check_old_index(struct ubifs_info *c,
-		    struct ubifs_zbranch *zroot)                  { return 0; }
-static inline int dbg_check_cats(struct ubifs_info *c)            { return 0; }
-static inline int dbg_check_ltab(struct ubifs_info *c)            { return 0; }
-static inline int dbg_chk_lpt_free_spc(struct ubifs_info *c)      { return 0; }
-static inline int dbg_chk_lpt_sz(struct ubifs_info *c,
-				 int action, int len)             { return 0; }
-static inline int
-dbg_check_synced_i_size(const struct ubifs_info *c,
-			struct inode *inode)                      { return 0; }
-static inline int dbg_check_dir(struct ubifs_info *c,
-				const struct inode *dir)          { return 0; }
-static inline int dbg_check_tnc(struct ubifs_info *c, int extra)  { return 0; }
-static inline int dbg_check_idx_size(struct ubifs_info *c,
-				     long long idx_size)          { return 0; }
-static inline int dbg_check_filesystem(struct ubifs_info *c)      { return 0; }
-static inline void dbg_check_heap(struct ubifs_info *c,
-				  struct ubifs_lpt_heap *heap,
-				  int cat, int add_pos)           { return; }
-static inline int dbg_check_lpt_nodes(struct ubifs_info *c,
-	struct ubifs_cnode *cnode, int row, int col)              { return 0; }
-static inline int dbg_check_inode_size(struct ubifs_info *c,
-				       const struct inode *inode,
-				       loff_t size)               { return 0; }
-static inline int
-dbg_check_data_nodes_order(struct ubifs_info *c,
-			   struct list_head *head)                { return 0; }
-static inline int
-dbg_check_nondata_nodes_order(struct ubifs_info *c,
-			      struct list_head *head)             { return 0; }
-
-static inline int dbg_leb_write(struct ubifs_info *c, int lnum,
-				const void *buf, int offset,
-				int len, int dtype)               { return 0; }
-static inline int dbg_leb_change(struct ubifs_info *c, int lnum,
-				 const void *buf, int len,
-				 int dtype)                       { return 0; }
-static inline int dbg_leb_unmap(struct ubifs_info *c, int lnum)   { return 0; }
-static inline int dbg_leb_map(struct ubifs_info *c, int lnum,
-			      int dtype)                          { return 0; }
-
-static inline int dbg_is_chk_gen(const struct ubifs_info *c)      { return 0; }
-static inline int dbg_is_chk_index(const struct ubifs_info *c)    { return 0; }
-static inline int dbg_is_chk_orph(const struct ubifs_info *c)     { return 0; }
-static inline int dbg_is_chk_lprops(const struct ubifs_info *c)   { return 0; }
-static inline int dbg_is_chk_fs(const struct ubifs_info *c)       { return 0; }
-static inline int dbg_is_tst_rcvry(const struct ubifs_info *c)    { return 0; }
-static inline int dbg_is_power_cut(const struct ubifs_info *c)    { return 0; }
-
-static inline int dbg_debugfs_init(void)                          { return 0; }
-static inline void dbg_debugfs_exit(void)                         { return; }
-static inline int dbg_debugfs_init_fs(struct ubifs_info *c)       { return 0; }
-static inline int dbg_debugfs_exit_fs(struct ubifs_info *c)       { return 0; }
-
-#endif /* !CONFIG_UBIFS_FS_DEBUG */
 #endif /* !__UBIFS_DEBUG_H__ */
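For reference, the DBGKEY()/DBGKEY1() macros removed above formatted keys into a shared global buffer, which the deleted TODO comment notes produced garbled key strings under concurrent execution. The new ubifs_dbg_msg_key() keeps a DBG_KEY_BUF_LEN-byte buffer on the caller's stack and has dbg_snprintf_key() fill it. A small userspace sketch of the same pattern, with made-up key and formatting helpers standing in for the UBIFS ones, is:

#include <stdio.h>

#define DBG_KEY_BUF_LEN 48

struct key { unsigned int inum, block; };	/* stand-in for union ubifs_key */

/* Stand-in for dbg_snprintf_key(): format the key into the caller's buffer. */
static const char *snprintf_key(const struct key *key, char *buf, int len)
{
	snprintf(buf, len, "(%u, %u)", key->inum, key->block);
	return buf;
}

/* Same shape as ubifs_dbg_msg_key(): each expansion gets its own buffer. */
#define dbg_msg_key(key, fmt, ...) do {                                        \
	char __tmp_key_buf[DBG_KEY_BUF_LEN];                                   \
	printf(fmt "%s\n", ##__VA_ARGS__,                                      \
	       snprintf_key(key, __tmp_key_buf, DBG_KEY_BUF_LEN));             \
} while (0)

int main(void)
{
	struct key k = { .inum = 128, .block = 7 };

	dbg_msg_key(&k, "ino %u, blk %u, key ", k.inum, k.block);
	return 0;
}

Because the buffer is declared inside the do/while block, concurrent callers each format into their own stack space instead of racing on one static string.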
diff -ur a/fs/ubifs/dir.c b/fs/ubifs/dir.c
--- a/fs/ubifs/dir.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/dir.c	2014-02-17 11:56:58.000000000 +0100
@@ -170,8 +170,6 @@
 	return inode;
 }
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
-
 static int dbg_check_name(const struct ubifs_info *c,
 			  const struct ubifs_dent_node *dent,
 			  const struct qstr *nm)
@@ -185,12 +183,6 @@
 	return 0;
 }
 
-#else
-
-#define dbg_check_name(c, dent, nm) 0
-
-#endif
-
 static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
 				   struct nameidata *nd)
 {
@@ -357,31 +349,50 @@
 static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir)
 {
 	int err, over = 0;
+	loff_t pos = file->f_pos;
 	struct qstr nm;
 	union ubifs_key key;
 	struct ubifs_dent_node *dent;
 	struct inode *dir = file->f_path.dentry->d_inode;
 	struct ubifs_info *c = dir->i_sb->s_fs_info;
 
-	dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, file->f_pos);
+	dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, pos);
 
-	if (file->f_pos > UBIFS_S_KEY_HASH_MASK || file->f_pos == 2)
+	if (pos > UBIFS_S_KEY_HASH_MASK || pos == 2)
 		/*
 		 * The directory was seek'ed to a senseless position or there
 		 * are no more entries.
 		 */
 		return 0;
 
+	if (file->f_version == 0) {
+		/*
+		 * The file was seek'ed, which means that @file->private_data
+		 * is now invalid. This may also be just the first
+		 * 'ubifs_readdir()' invocation, in which case
+		 * @file->private_data is NULL, and the below code is
+		 * basically a no-op.
+		 */
+		kfree(file->private_data);
+		file->private_data = NULL;
+	}
+
+	/*
+	 * 'generic_file_llseek()' unconditionally sets @file->f_version to
+	 * zero, and we use this for detecting whether the file was seek'ed.
+	 */
+	file->f_version = 1;
+
 	/* File positions 0 and 1 correspond to "." and ".." */
-	if (file->f_pos == 0) {
+	if (pos == 0) {
 		ubifs_assert(!file->private_data);
 		over = filldir(dirent, ".", 1, 0, dir->i_ino, DT_DIR);
 		if (over)
 			return 0;
-		file->f_pos = 1;
+		file->f_pos = pos = 1;
 	}
 
-	if (file->f_pos == 1) {
+	if (pos == 1) {
 		ubifs_assert(!file->private_data);
 		over = filldir(dirent, "..", 2, 1,
 			       parent_ino(file->f_path.dentry), DT_DIR);
@@ -397,7 +408,7 @@
 			goto out;
 		}
 
-		file->f_pos = key_hash_flash(c, &dent->key);
+		file->f_pos = pos = key_hash_flash(c, &dent->key);
 		file->private_data = dent;
 	}
 
@@ -405,17 +416,16 @@
 	if (!dent) {
 		/*
 		 * The directory was seek'ed to and is now readdir'ed.
-		 * Find the entry corresponding to @file->f_pos or the
-		 * closest one.
+		 * Find the entry corresponding to @pos or the closest one.
 		 */
-		dent_key_init_hash(c, &key, dir->i_ino, file->f_pos);
+		dent_key_init_hash(c, &key, dir->i_ino, pos);
 		nm.name = NULL;
 		dent = ubifs_tnc_next_ent(c, &key, &nm);
 		if (IS_ERR(dent)) {
 			err = PTR_ERR(dent);
 			goto out;
 		}
-		file->f_pos = key_hash_flash(c, &dent->key);
+		file->f_pos = pos = key_hash_flash(c, &dent->key);
 		file->private_data = dent;
 	}
 
@@ -427,7 +437,7 @@
 			     ubifs_inode(dir)->creat_sqnum);
 
 		nm.len = le16_to_cpu(dent->nlen);
-		over = filldir(dirent, dent->name, nm.len, file->f_pos,
+		over = filldir(dirent, dent->name, nm.len, pos,
 			       le64_to_cpu(dent->inum),
 			       vfs_dent_type(dent->type));
 		if (over)
@@ -443,9 +453,17 @@
 		}
 
 		kfree(file->private_data);
-		file->f_pos = key_hash_flash(c, &dent->key);
+		file->f_pos = pos = key_hash_flash(c, &dent->key);
 		file->private_data = dent;
 		cond_resched();
+
+		if (file->f_version == 0)
+			/*
+			 * The file was seek'ed meanwhile, let's return and start
+			 * reading direntries from the new position on the next
+			 * invocation.
+			 */
+			return 0;
 	}
 
 out:
@@ -456,15 +474,13 @@
 
 	kfree(file->private_data);
 	file->private_data = NULL;
+	/* 2 is a special value indicating that there are no more direntries */
 	file->f_pos = 2;
 	return 0;
 }
 
-/* If a directory is seeked, we have to free saved readdir() state */
 static loff_t ubifs_dir_llseek(struct file *file, loff_t offset, int origin)
 {
-	kfree(file->private_data);
-	file->private_data = NULL;
 	return generic_file_llseek(file, offset, origin);
 }
 
@@ -566,6 +582,7 @@
 	int sz_change = CALC_DENT_SIZE(dentry->d_name.len);
 	int err, budgeted = 1;
 	struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 };
+	unsigned int saved_nlink = inode->i_nlink;
 
 	/*
 	 * Budget request settings: deletion direntry, deletion inode (+1 for
@@ -613,7 +630,7 @@
 out_cancel:
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
-	inc_nlink(inode);
+	set_nlink(inode, saved_nlink);
 	unlock_2_inodes(dir, inode);
 	if (budgeted)
 		ubifs_release_budget(c, &req);
@@ -704,8 +721,7 @@
 	dir->i_size += sz_change;
 	dir_ui->ui_size = dir->i_size;
 	inc_nlink(dir);
-	inc_nlink(inode);
-	inc_nlink(inode);
+	set_nlink(inode, 2);
 	unlock_2_inodes(dir, inode);
 	if (budgeted)
 		ubifs_release_budget(c, &req);
@@ -977,6 +993,7 @@
 	struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
 			.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
 	struct timespec time;
+	unsigned int uninitialized_var(saved_nlink);
 
 	/*
 	 * Budget request settings: deletion direntry, new direntry, removing
@@ -987,8 +1004,8 @@
 	 * separately.
 	 */
 
-	dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in "
-		"dir ino %lu", old_dentry->d_name.len, old_dentry->d_name.name,
+	dbg_gen("dent '%.*s' ino %lu in dir ino %lu to dent '%.*s' in dir ino %lu",
+		old_dentry->d_name.len, old_dentry->d_name.name,
 		old_inode->i_ino, old_dir->i_ino, new_dentry->d_name.len,
 		new_dentry->d_name.name, new_dir->i_ino);
 	ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
@@ -1059,13 +1076,14 @@
 	if (unlink) {
 		/*
 		 * Directories cannot have hard-links, so if this is a
-		 * directory, decrement its @i_nlink twice because an empty
-		 * directory has @i_nlink 2.
+		 * directory, just clear @i_nlink.
 		 */
+		saved_nlink = new_inode->i_nlink;
 		if (is_dir)
+			clear_nlink(new_inode);
+		else
 			drop_nlink(new_inode);
 		new_inode->i_ctime = time;
-		drop_nlink(new_inode);
 	} else {
 		new_dir->i_size += new_sz;
 		ubifs_inode(new_dir)->ui_size = new_dir->i_size;
@@ -1102,9 +1120,7 @@
 
 out_cancel:
 	if (unlink) {
-		if (is_dir)
-			inc_nlink(new_inode);
-		inc_nlink(new_inode);
+		set_nlink(new_inode, saved_nlink);
 	} else {
 		new_dir->i_size -= new_sz;
 		ubifs_inode(new_dir)->ui_size = new_dir->i_size;
@@ -1187,12 +1203,10 @@
 	.rename      = ubifs_rename,
 	.setattr     = ubifs_setattr,
 	.getattr     = ubifs_getattr,
-#ifdef CONFIG_UBIFS_FS_XATTR
 	.setxattr    = ubifs_setxattr,
 	.getxattr    = ubifs_getxattr,
 	.listxattr   = ubifs_listxattr,
 	.removexattr = ubifs_removexattr,
-#endif
 };
 
 const struct file_operations ubifs_dir_operations = {
diff -ur a/fs/ubifs/file.c b/fs/ubifs/file.c
--- a/fs/ubifs/file.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/file.c	2014-02-17 11:56:58.000000000 +0100
@@ -97,7 +97,7 @@
 dump:
 	ubifs_err("bad data node (block %u, inode %lu)",
 		  block, inode->i_ino);
-	dbg_dump_node(c, dn);
+	ubifs_dump_node(c, dn);
 	return -EINVAL;
 }
 
@@ -1486,8 +1486,8 @@
 	err = ubifs_budget_space(c, &req);
 	if (unlikely(err)) {
 		if (err == -ENOSPC)
-			ubifs_warn("out of space for mmapped file "
-				   "(inode number %lu)", inode->i_ino);
+			ubifs_warn("out of space for mmapped file (inode number %lu)",
+				   inode->i_ino);
 		return VM_FAULT_SIGBUS;
 	}
 
@@ -1562,12 +1562,10 @@
 const struct inode_operations ubifs_file_inode_operations = {
 	.setattr     = ubifs_setattr,
 	.getattr     = ubifs_getattr,
-#ifdef CONFIG_UBIFS_FS_XATTR
 	.setxattr    = ubifs_setxattr,
 	.getxattr    = ubifs_getxattr,
 	.listxattr   = ubifs_listxattr,
 	.removexattr = ubifs_removexattr,
-#endif
 };
 
 const struct inode_operations ubifs_symlink_inode_operations = {
diff -ur a/fs/ubifs/find.c b/fs/ubifs/find.c
--- a/fs/ubifs/find.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/find.c	2014-02-17 11:56:58.000000000 +0100
@@ -947,8 +947,8 @@
 	}
 	dbg_find("LEB %d, dirty %d and free %d flags %#x", lp->lnum, lp->dirty,
 		 lp->free, lp->flags);
-	ubifs_assert(lp->flags | LPROPS_TAKEN);
-	ubifs_assert(lp->flags | LPROPS_INDEX);
+	ubifs_assert(lp->flags & LPROPS_TAKEN);
+	ubifs_assert(lp->flags & LPROPS_INDEX);
 	return lnum;
 }
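For reference, the one-character fix in each of the two assertions above matters: ORing the flags with a non-zero constant is always true, so the old checks could never fire, while the bitwise AND actually tests whether the flag bit is set. A tiny demonstration, with illustrative flag values rather than the kernel's definitions:

#include <stdio.h>

#define LPROPS_TAKEN 0x1	/* illustrative bit values, not the UBIFS definitions */
#define LPROPS_INDEX 0x2

int main(void)
{
	int flags = 0;	/* LEB is neither taken nor used for the index */

	/* Old check: always non-zero, so the assertion can never trigger. */
	printf("flags | LPROPS_TAKEN = %d\n", flags | LPROPS_TAKEN);	/* prints 1 */

	/* New check: really tests the bit, so it catches the bad state. */
	printf("flags & LPROPS_INDEX = %d\n", flags & LPROPS_INDEX);	/* prints 0 */
	return 0;
}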
 
diff -ur a/fs/ubifs/gc.c b/fs/ubifs/gc.c
--- a/fs/ubifs/gc.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/gc.c	2014-02-17 11:56:58.000000000 +0100
@@ -109,7 +109,7 @@
 		return err;
 
 	c->gc_lnum = -1;
-	err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0, UBI_LONGTERM);
+	err = ubifs_wbuf_seek_nolock(wbuf, gc_lnum, 0);
 	return err;
 }
 
@@ -714,9 +714,9 @@
 			break;
 		}
 
-		dbg_gc("found LEB %d: free %d, dirty %d, sum %d "
-		       "(min. space %d)", lp.lnum, lp.free, lp.dirty,
-		       lp.free + lp.dirty, min_space);
+		dbg_gc("found LEB %d: free %d, dirty %d, sum %d (min. space %d)",
+		       lp.lnum, lp.free, lp.dirty, lp.free + lp.dirty,
+		       min_space);
 
 		space_before = c->leb_size - wbuf->offs - wbuf->used;
 		if (wbuf->lnum == -1)
diff -ur a/fs/ubifs/io.c b/fs/ubifs/io.c
--- a/fs/ubifs/io.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/io.c	2014-02-17 11:56:58.000000000 +0100
@@ -109,13 +109,13 @@
 	if (err && (err != -EBADMSG || even_ebadmsg)) {
 		ubifs_err("reading %d bytes from LEB %d:%d failed, error %d",
 			  len, lnum, offs, err);
-		dbg_dump_stack();
+		dump_stack();
 	}
 	return err;
 }
 
 int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
-		    int len, int dtype)
+		    int len)
 {
 	int err;
 
@@ -123,20 +123,19 @@
 	if (c->ro_error)
 		return -EROFS;
 	if (!dbg_is_tst_rcvry(c))
-		err = ubi_leb_write(c->ubi, lnum, buf, offs, len, dtype);
+		err = ubi_leb_write(c->ubi, lnum, buf, offs, len);
 	else
-		err = dbg_leb_write(c, lnum, buf, offs, len, dtype);
+		err = dbg_leb_write(c, lnum, buf, offs, len);
 	if (err) {
 		ubifs_err("writing %d bytes to LEB %d:%d failed, error %d",
 			  len, lnum, offs, err);
 		ubifs_ro_mode(c, err);
-		dbg_dump_stack();
+		dump_stack();
 	}
 	return err;
 }
 
-int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len,
-		     int dtype)
+int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len)
 {
 	int err;
 
@@ -144,14 +143,14 @@
 	if (c->ro_error)
 		return -EROFS;
 	if (!dbg_is_tst_rcvry(c))
-		err = ubi_leb_change(c->ubi, lnum, buf, len, dtype);
+		err = ubi_leb_change(c->ubi, lnum, buf, len);
 	else
-		err = dbg_leb_change(c, lnum, buf, len, dtype);
+		err = dbg_leb_change(c, lnum, buf, len);
 	if (err) {
 		ubifs_err("changing %d bytes in LEB %d failed, error %d",
 			  len, lnum, err);
 		ubifs_ro_mode(c, err);
-		dbg_dump_stack();
+		dump_stack();
 	}
 	return err;
 }
@@ -170,12 +169,12 @@
 	if (err) {
 		ubifs_err("unmap LEB %d failed, error %d", lnum, err);
 		ubifs_ro_mode(c, err);
-		dbg_dump_stack();
+		dump_stack();
 	}
 	return err;
 }
 
-int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype)
+int ubifs_leb_map(struct ubifs_info *c, int lnum)
 {
 	int err;
 
@@ -183,13 +182,13 @@
 	if (c->ro_error)
 		return -EROFS;
 	if (!dbg_is_tst_rcvry(c))
-		err = ubi_leb_map(c->ubi, lnum, dtype);
+		err = ubi_leb_map(c->ubi, lnum);
 	else
-		err = dbg_leb_map(c, lnum, dtype);
+		err = dbg_leb_map(c, lnum);
 	if (err) {
 		ubifs_err("mapping LEB %d failed, error %d", lnum, err);
 		ubifs_ro_mode(c, err);
-		dbg_dump_stack();
+		dump_stack();
 	}
 	return err;
 }
@@ -202,7 +201,7 @@
 	if (err < 0) {
 		ubifs_err("ubi_is_mapped failed for LEB %d, error %d",
 			  lnum, err);
-		dbg_dump_stack();
+		dump_stack();
 	}
 	return err;
 }
@@ -294,8 +293,8 @@
 out:
 	if (!quiet) {
 		ubifs_err("bad node at LEB %d:%d", lnum, offs);
-		dbg_dump_node(c, buf);
-		dbg_dump_stack();
+		ubifs_dump_node(c, buf);
+		dump_stack();
 	}
 	return err;
 }
@@ -523,8 +522,7 @@
 	dirt = sync_len - wbuf->used;
 	if (dirt)
 		ubifs_pad(c, wbuf->buf + wbuf->used, dirt);
-	err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len,
-			      wbuf->dtype);
+	err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, sync_len);
 	if (err)
 		return err;
 
@@ -562,14 +560,12 @@
  * @wbuf: write-buffer
  * @lnum: logical eraseblock number to seek to
  * @offs: logical eraseblock offset to seek to
- * @dtype: data type
  *
  * This function targets the write-buffer to logical eraseblock @lnum:@offs.
  * The write-buffer has to be empty. Returns zero in case of success and a
  * negative error code in case of failure.
  */
-int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
-			   int dtype)
+int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs)
 {
 	const struct ubifs_info *c = wbuf->c;
 
@@ -592,7 +588,6 @@
 	wbuf->avail = wbuf->size;
 	wbuf->used = 0;
 	spin_unlock(&wbuf->lock);
-	wbuf->dtype = dtype;
 
 	return 0;
 }
@@ -719,8 +714,7 @@
 			dbg_io("flush jhead %s wbuf to LEB %d:%d",
 			       dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
 			err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf,
-					      wbuf->offs, wbuf->size,
-					      wbuf->dtype);
+					      wbuf->offs, wbuf->size);
 			if (err)
 				goto out;
 
@@ -756,7 +750,7 @@
 		       dbg_jhead(wbuf->jhead), wbuf->lnum, wbuf->offs);
 		memcpy(wbuf->buf + wbuf->used, buf, wbuf->avail);
 		err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs,
-				      wbuf->size, wbuf->dtype);
+				      wbuf->size);
 		if (err)
 			goto out;
 
@@ -775,7 +769,7 @@
 		dbg_io("write %d bytes to LEB %d:%d",
 		       wbuf->size, wbuf->lnum, wbuf->offs);
 		err = ubifs_leb_write(c, wbuf->lnum, buf, wbuf->offs,
-				      wbuf->size, wbuf->dtype);
+				      wbuf->size);
 		if (err)
 			goto out;
 
@@ -797,7 +791,7 @@
 		dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
 		       wbuf->offs);
 		err = ubifs_leb_write(c, wbuf->lnum, buf + written,
-				      wbuf->offs, n, wbuf->dtype);
+				      wbuf->offs, n);
 		if (err)
 			goto out;
 		wbuf->offs += n;
@@ -841,9 +835,9 @@
 out:
 	ubifs_err("cannot write %d bytes to LEB %d:%d, error %d",
 		  len, wbuf->lnum, wbuf->offs, err);
-	dbg_dump_node(c, buf);
-	dbg_dump_stack();
-	dbg_dump_leb(c, wbuf->lnum);
+	ubifs_dump_node(c, buf);
+	dump_stack();
+	ubifs_dump_leb(c, wbuf->lnum);
 	return err;
 }
 
@@ -854,7 +848,6 @@
  * @len: node length
  * @lnum: logical eraseblock number
  * @offs: offset within the logical eraseblock
- * @dtype: node life-time hint (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN)
  *
  * This function automatically fills node magic number, assigns sequence
  * number, and calculates node CRC checksum. The length of the @buf buffer has
@@ -863,7 +856,7 @@
  * success and a negative error code in case of failure.
  */
 int ubifs_write_node(struct ubifs_info *c, void *buf, int len, int lnum,
-		     int offs, int dtype)
+		     int offs)
 {
 	int err, buf_len = ALIGN(len, c->min_io_size);
 
@@ -879,9 +872,9 @@
 		return -EROFS;
 
 	ubifs_prepare_node(c, buf, len, 1);
-	err = ubifs_leb_write(c, lnum, buf, offs, buf_len, dtype);
+	err = ubifs_leb_write(c, lnum, buf, offs, buf_len);
 	if (err)
-		dbg_dump_node(c, buf);
+		ubifs_dump_node(c, buf);
 
 	return err;
 }
@@ -960,8 +953,8 @@
 
 out:
 	ubifs_err("bad node at LEB %d:%d", lnum, offs);
-	dbg_dump_node(c, buf);
-	dbg_dump_stack();
+	ubifs_dump_node(c, buf);
+	dump_stack();
 	return -EINVAL;
 }
 
@@ -1017,8 +1010,8 @@
 out:
 	ubifs_err("bad node at LEB %d:%d, LEB mapping status %d", lnum, offs,
 		  ubi_is_mapped(c->ubi, lnum));
-	dbg_dump_node(c, buf);
-	dbg_dump_stack();
+	ubifs_dump_node(c, buf);
+	dump_stack();
 	return -EINVAL;
 }
 
@@ -1056,7 +1049,6 @@
 	 */
 	size = c->max_write_size - (c->leb_start % c->max_write_size);
 	wbuf->avail = wbuf->size = size;
-	wbuf->dtype = UBI_UNKNOWN;
 	wbuf->sync_callback = NULL;
 	mutex_init(&wbuf->io_mutex);
 	spin_lock_init(&wbuf->lock);
diff -ur a/fs/ubifs/journal.c b/fs/ubifs/journal.c
--- a/fs/ubifs/journal.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/journal.c	2014-02-17 11:56:58.000000000 +0100
@@ -214,7 +214,7 @@
 	err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
 	if (err)
 		goto out_return;
-	err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs, wbuf->dtype);
+	err = ubifs_wbuf_seek_nolock(wbuf, lnum, offs);
 	if (err)
 		goto out_unlock;
 
@@ -385,9 +385,9 @@
 	if (err == -ENOSPC) {
 		/* These are some budgeting problems, print useful information */
 		down_write(&c->commit_sem);
-		dbg_dump_stack();
-		dbg_dump_budg(c, &c->bi);
-		dbg_dump_lprops(c);
+		dump_stack();
+		ubifs_dump_budg(c, &c->bi);
+		ubifs_dump_lprops(c);
 		cmt_retries = dbg_check_lprops(c);
 		up_write(&c->commit_sem);
 	}
@@ -697,9 +697,8 @@
 	int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1;
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
-	dbg_jnl("ino %lu, blk %u, len %d, key %s",
-		(unsigned long)key_inum(c, key), key_block(c, key), len,
-		DBGKEY(key));
+	dbg_jnlk(key, "ino %lu, blk %u, len %d, key ",
+		(unsigned long)key_inum(c, key), key_block(c, key), len);
 	ubifs_assert(len <= UBIFS_BLOCK_SIZE);
 
 	data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN);
@@ -1177,7 +1176,7 @@
 		dn = (void *)trun + UBIFS_TRUN_NODE_SZ;
 		blk = new_size >> UBIFS_BLOCK_SHIFT;
 		data_key_init(c, &key, inum, blk);
-		dbg_jnl("last block key %s", DBGKEY(&key));
+		dbg_jnlk(&key, "last block key ");
 		err = ubifs_tnc_lookup(c, &key, dn);
 		if (err == -ENOENT)
 			dlen = 0; /* Not found (so it is a hole) */
@@ -1268,7 +1267,6 @@
 	return err;
 }
 
-#ifdef CONFIG_UBIFS_FS_XATTR
 
 /**
  * ubifs_jnl_delete_xattr - delete an extended attribute.
@@ -1463,4 +1461,3 @@
 	return err;
 }
 
-#endif /* CONFIG_UBIFS_FS_XATTR */
diff -ur a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
--- a/fs/ubifs/Kconfig	2013-08-03 09:59:51.000000000 +0200
+++ b/fs/ubifs/Kconfig	2014-01-21 09:37:27.000000000 +0100
@@ -11,12 +11,6 @@
 	help
 	  UBIFS is a file system for flash devices which works on top of UBI.
 
-config UBIFS_FS_XATTR
-	bool "Extended attributes support"
-	depends on UBIFS_FS
-	help
-	  This option enables support of extended attributes.
-
 config UBIFS_FS_ADVANCED_COMPR
 	bool "Advanced compression options"
 	depends on UBIFS_FS
@@ -41,20 +35,3 @@
 	default y
 	help
 	  Zlib compresses better than LZO but it is slower. Say 'Y' if unsure.
-
-# Debugging-related stuff
-config UBIFS_FS_DEBUG
-	bool "Enable debugging support"
-	depends on UBIFS_FS
-	select DEBUG_FS
-	select KALLSYMS
-	help
-	  This option enables UBIFS debugging support. It makes sure various
-	  assertions, self-checks, debugging messages and test modes are compiled
-	  in (this all is compiled out otherwise). Assertions are light-weight
-	  and this option also enables them. Self-checks, debugging messages and
-	  test modes are switched off by default. Thus, it is safe and actually
-	  recommended to have debugging support enabled, and it should not slow
-	  down UBIFS. You can then further enable / disable individual  debugging
-	  features using UBIFS module parameters and the corresponding sysfs
-	  interfaces.
diff -ur a/fs/ubifs/log.c b/fs/ubifs/log.c
--- a/fs/ubifs/log.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/log.c	2014-02-17 11:56:58.000000000 +0100
@@ -29,11 +29,7 @@
 
 #include "ubifs.h"
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
 static int dbg_check_bud_bytes(struct ubifs_info *c);
-#else
-#define dbg_check_bud_bytes(c) 0
-#endif
 
 /**
  * ubifs_search_bud - search bud LEB.
@@ -262,7 +258,7 @@
 		 * an unclean reboot, because the target LEB might have been
 		 * unmapped, but not yet physically erased.
 		 */
-		err = ubifs_leb_map(c, bud->lnum, UBI_SHORTTERM);
+		err = ubifs_leb_map(c, bud->lnum);
 		if (err)
 			goto out_unlock;
 	}
@@ -270,7 +266,7 @@
 	dbg_log("write ref LEB %d:%d",
 		c->lhead_lnum, c->lhead_offs);
 	err = ubifs_write_node(c, ref, UBIFS_REF_NODE_SZ, c->lhead_lnum,
-			       c->lhead_offs, UBI_SHORTTERM);
+			       c->lhead_offs);
 	if (err)
 		goto out_unlock;
 
@@ -319,17 +315,15 @@
 			 * heads (non-closed buds).
 			 */
 			c->cmt_bud_bytes += wbuf->offs - bud->start;
-			dbg_log("preserve %d:%d, jhead %s, bud bytes %d, "
-				"cmt_bud_bytes %lld", bud->lnum, bud->start,
-				dbg_jhead(bud->jhead), wbuf->offs - bud->start,
-				c->cmt_bud_bytes);
+			dbg_log("preserve %d:%d, jhead %s, bud bytes %d, cmt_bud_bytes %lld",
+				bud->lnum, bud->start, dbg_jhead(bud->jhead),
+				wbuf->offs - bud->start, c->cmt_bud_bytes);
 			bud->start = wbuf->offs;
 		} else {
 			c->cmt_bud_bytes += c->leb_size - bud->start;
-			dbg_log("remove %d:%d, jhead %s, bud bytes %d, "
-				"cmt_bud_bytes %lld", bud->lnum, bud->start,
-				dbg_jhead(bud->jhead), c->leb_size - bud->start,
-				c->cmt_bud_bytes);
+			dbg_log("remove %d:%d, jhead %s, bud bytes %d, cmt_bud_bytes %lld",
+				bud->lnum, bud->start, dbg_jhead(bud->jhead),
+				c->leb_size - bud->start, c->cmt_bud_bytes);
 			rb_erase(p1, &c->buds);
 			/*
 			 * If the commit does not finish, the recovery will need
@@ -422,7 +416,7 @@
 
 	len = ALIGN(len, c->min_io_size);
 	dbg_log("writing commit start at LEB %d:0, len %d", c->lhead_lnum, len);
-	err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len, UBI_SHORTTERM);
+	err = ubifs_leb_write(c, c->lhead_lnum, cs, 0, len);
 	if (err)
 		goto out;
 
@@ -623,7 +617,7 @@
 		int sz = ALIGN(*offs, c->min_io_size), err;
 
 		ubifs_pad(c, buf + *offs, sz - *offs);
-		err = ubifs_leb_change(c, *lnum, buf, sz, UBI_SHORTTERM);
+		err = ubifs_leb_change(c, *lnum, buf, sz);
 		if (err)
 			return err;
 		*lnum = ubifs_next_log_lnum(c, *lnum);
@@ -702,7 +696,7 @@
 		int sz = ALIGN(offs, c->min_io_size);
 
 		ubifs_pad(c, buf + offs, sz - offs);
-		err = ubifs_leb_change(c, write_lnum, buf, sz, UBI_SHORTTERM);
+		err = ubifs_leb_change(c, write_lnum, buf, sz);
 		if (err)
 			goto out_free;
 		offs = ALIGN(offs, c->min_io_size);
@@ -734,8 +728,6 @@
 	return err;
 }
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
-
 /**
  * dbg_check_bud_bytes - make sure bud bytes calculation are all right.
  * @c: UBIFS file-system description object
@@ -767,5 +759,3 @@
 
 	return err;
 }
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff -ur a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
--- a/fs/ubifs/lprops.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/lprops.c	2014-02-17 11:56:58.000000000 +0100
@@ -453,7 +453,7 @@
 	int new_cat = ubifs_categorize_lprops(c, lprops);
 
 	if (old_cat == new_cat) {
-		struct ubifs_lpt_heap *heap = &c->lpt_heap[new_cat - 1];
+		struct ubifs_lpt_heap *heap;
 
 		/* lprops on a heap now must be moved up or down */
 		if (new_cat < 1 || new_cat > LPROPS_HEAP_CNT)
@@ -852,7 +852,9 @@
 	return lprops;
 }
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
+/*
+ * Everything below is related to debugging.
+ */
 
 /**
  * dbg_check_cats - check category heaps and lists.
@@ -871,15 +873,15 @@
 
 	list_for_each_entry(lprops, &c->empty_list, list) {
 		if (lprops->free != c->leb_size) {
-			ubifs_err("non-empty LEB %d on empty list "
-				  "(free %d dirty %d flags %d)", lprops->lnum,
-				  lprops->free, lprops->dirty, lprops->flags);
+			ubifs_err("non-empty LEB %d on empty list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
 			return -EINVAL;
 		}
 		if (lprops->flags & LPROPS_TAKEN) {
-			ubifs_err("taken LEB %d on empty list "
-				  "(free %d dirty %d flags %d)", lprops->lnum,
-				  lprops->free, lprops->dirty, lprops->flags);
+			ubifs_err("taken LEB %d on empty list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
 			return -EINVAL;
 		}
 	}
@@ -887,15 +889,15 @@
 	i = 0;
 	list_for_each_entry(lprops, &c->freeable_list, list) {
 		if (lprops->free + lprops->dirty != c->leb_size) {
-			ubifs_err("non-freeable LEB %d on freeable list "
-				  "(free %d dirty %d flags %d)", lprops->lnum,
-				  lprops->free, lprops->dirty, lprops->flags);
+			ubifs_err("non-freeable LEB %d on freeable list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
 			return -EINVAL;
 		}
 		if (lprops->flags & LPROPS_TAKEN) {
-			ubifs_err("taken LEB %d on freeable list "
-				  "(free %d dirty %d flags %d)", lprops->lnum,
-				  lprops->free, lprops->dirty, lprops->flags);
+			ubifs_err("taken LEB %d on freeable list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
 			return -EINVAL;
 		}
 		i += 1;
@@ -917,21 +919,21 @@
 
 	list_for_each_entry(lprops, &c->frdi_idx_list, list) {
 		if (lprops->free + lprops->dirty != c->leb_size) {
-			ubifs_err("non-freeable LEB %d on frdi_idx list "
-				  "(free %d dirty %d flags %d)", lprops->lnum,
-				  lprops->free, lprops->dirty, lprops->flags);
+			ubifs_err("non-freeable LEB %d on frdi_idx list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
 			return -EINVAL;
 		}
 		if (lprops->flags & LPROPS_TAKEN) {
-			ubifs_err("taken LEB %d on frdi_idx list "
-				  "(free %d dirty %d flags %d)", lprops->lnum,
-				  lprops->free, lprops->dirty, lprops->flags);
+			ubifs_err("taken LEB %d on frdi_idx list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
 			return -EINVAL;
 		}
 		if (!(lprops->flags & LPROPS_INDEX)) {
-			ubifs_err("non-index LEB %d on frdi_idx list "
-				  "(free %d dirty %d flags %d)", lprops->lnum,
-				  lprops->free, lprops->dirty, lprops->flags);
+			ubifs_err("non-index LEB %d on frdi_idx list (free %d dirty %d flags %d)",
+				  lprops->lnum, lprops->free, lprops->dirty,
+				  lprops->flags);
 			return -EINVAL;
 		}
 	}
@@ -986,9 +988,9 @@
 			goto out;
 		}
 		if (lprops != lp) {
-			dbg_msg("lprops %zx lp %zx lprops->lnum %d lp->lnum %d",
-				(size_t)lprops, (size_t)lp, lprops->lnum,
-				lp->lnum);
+			ubifs_err("lprops %zx lp %zx lprops->lnum %d lp->lnum %d",
+				  (size_t)lprops, (size_t)lp, lprops->lnum,
+				  lp->lnum);
 			err = 4;
 			goto out;
 		}
@@ -1006,9 +1008,9 @@
 	}
 out:
 	if (err) {
-		dbg_msg("failed cat %d hpos %d err %d", cat, i, err);
-		dbg_dump_stack();
-		dbg_dump_heap(c, heap, cat);
+		ubifs_err("failed cat %d hpos %d err %d", cat, i, err);
+		dump_stack();
+		ubifs_dump_heap(c, heap, cat);
 	}
 }
 
@@ -1115,8 +1117,8 @@
 	if (IS_ERR(sleb)) {
 		ret = PTR_ERR(sleb);
 		if (ret == -EUCLEAN) {
-			dbg_dump_lprops(c);
-			dbg_dump_budg(c, &c->bi);
+			ubifs_dump_lprops(c);
+			ubifs_dump_budg(c, &c->bi);
 		}
 		goto out;
 	}
@@ -1157,8 +1159,8 @@
 
 	if (free > c->leb_size || free < 0 || dirty > c->leb_size ||
 	    dirty < 0) {
-		ubifs_err("bad calculated accounting for LEB %d: "
-			  "free %d, dirty %d", lnum, free, dirty);
+		ubifs_err("bad calculated accounting for LEB %d: free %d, dirty %d",
+			  lnum, free, dirty);
 		goto out_destroy;
 	}
 
@@ -1204,8 +1206,7 @@
 			/* Free but not unmapped LEB, it's fine */
 			is_idx = 0;
 		else {
-			ubifs_err("indexing node without indexing "
-				  "flag");
+			ubifs_err("indexing node without indexing flag");
 			goto out_print;
 		}
 	}
@@ -1240,10 +1241,9 @@
 	return LPT_SCAN_CONTINUE;
 
 out_print:
-	ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, "
-		  "should be free %d, dirty %d",
+	ubifs_err("bad accounting of LEB %d: free %d, dirty %d flags %#x, should be free %d, dirty %d",
 		  lnum, lp->free, lp->dirty, lp->flags, free, dirty);
-	dbg_dump_leb(c, lnum);
+	ubifs_dump_leb(c, lnum);
 out_destroy:
 	ubifs_scan_destroy(sleb);
 	ret = -EINVAL;
@@ -1294,12 +1294,10 @@
 	    lst.total_dirty != c->lst.total_dirty ||
 	    lst.total_used != c->lst.total_used) {
 		ubifs_err("bad overall accounting");
-		ubifs_err("calculated: empty_lebs %d, idx_lebs %d, "
-			  "total_free %lld, total_dirty %lld, total_used %lld",
+		ubifs_err("calculated: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld",
 			  lst.empty_lebs, lst.idx_lebs, lst.total_free,
 			  lst.total_dirty, lst.total_used);
-		ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, "
-			  "total_free %lld, total_dirty %lld, total_used %lld",
+		ubifs_err("read from lprops: empty_lebs %d, idx_lebs %d, total_free %lld, total_dirty %lld, total_used %lld",
 			  c->lst.empty_lebs, c->lst.idx_lebs, c->lst.total_free,
 			  c->lst.total_dirty, c->lst.total_used);
 		err = -EINVAL;
@@ -1321,5 +1319,3 @@
 out:
 	return err;
 }
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff -ur a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
--- a/fs/ubifs/lpt.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/lpt.c	2014-02-17 11:56:58.000000000 +0100
@@ -701,8 +701,7 @@
 			alen = ALIGN(len, c->min_io_size);
 			set_ltab(c, lnum, c->leb_size - alen, alen - len);
 			memset(p, 0xff, alen - len);
-			err = ubifs_leb_change(c, lnum++, buf, alen,
-					       UBI_SHORTTERM);
+			err = ubifs_leb_change(c, lnum++, buf, alen);
 			if (err)
 				goto out;
 			p = buf;
@@ -732,8 +731,7 @@
 				set_ltab(c, lnum, c->leb_size - alen,
 					    alen - len);
 				memset(p, 0xff, alen - len);
-				err = ubifs_leb_change(c, lnum++, buf, alen,
-						       UBI_SHORTTERM);
+				err = ubifs_leb_change(c, lnum++, buf, alen);
 				if (err)
 					goto out;
 				p = buf;
@@ -780,8 +778,7 @@
 			alen = ALIGN(len, c->min_io_size);
 			set_ltab(c, lnum, c->leb_size - alen, alen - len);
 			memset(p, 0xff, alen - len);
-			err = ubifs_leb_change(c, lnum++, buf, alen,
-					       UBI_SHORTTERM);
+			err = ubifs_leb_change(c, lnum++, buf, alen);
 			if (err)
 				goto out;
 			p = buf;
@@ -806,7 +803,7 @@
 		alen = ALIGN(len, c->min_io_size);
 		set_ltab(c, lnum, c->leb_size - alen, alen - len);
 		memset(p, 0xff, alen - len);
-		err = ubifs_leb_change(c, lnum++, buf, alen, UBI_SHORTTERM);
+		err = ubifs_leb_change(c, lnum++, buf, alen);
 		if (err)
 			goto out;
 		p = buf;
@@ -826,7 +823,7 @@
 
 	/* Write remaining buffer */
 	memset(p, 0xff, alen - len);
-	err = ubifs_leb_change(c, lnum, buf, alen, UBI_SHORTTERM);
+	err = ubifs_leb_change(c, lnum, buf, alen);
 	if (err)
 		goto out;
 
@@ -926,7 +923,7 @@
 	if (crc != calc_crc) {
 		ubifs_err("invalid crc in LPT node: crc %hx calc %hx", crc,
 			  calc_crc);
-		dbg_dump_stack();
+		dump_stack();
 		return -EINVAL;
 	}
 	return 0;
@@ -949,7 +946,7 @@
 	if (node_type != type) {
 		ubifs_err("invalid type (%d) in LPT node type %d", node_type,
 			  type);
-		dbg_dump_stack();
+		dump_stack();
 		return -EINVAL;
 	}
 	return 0;
@@ -1247,7 +1244,7 @@
 
 out:
 	ubifs_err("error %d reading nnode at %d:%d", err, lnum, offs);
-	dbg_dump_stack();
+	dump_stack();
 	kfree(nnode);
 	return err;
 }
@@ -1312,9 +1309,9 @@
 
 out:
 	ubifs_err("error %d reading pnode at %d:%d", err, lnum, offs);
-	dbg_dump_pnode(c, pnode, parent, iip);
-	dbg_dump_stack();
-	dbg_msg("calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
+	ubifs_dump_pnode(c, pnode, parent, iip);
+	dump_stack();
+	ubifs_err("calc num: %d", calc_pnode_num_from_parent(c, parent, iip));
 	kfree(pnode);
 	return err;
 }
@@ -1740,16 +1737,23 @@
 	if (rd) {
 		err = lpt_init_rd(c);
 		if (err)
-			return err;
+			goto out_err;
 	}
 
 	if (wr) {
 		err = lpt_init_wr(c);
 		if (err)
-			return err;
+			goto out_err;
 	}
 
 	return 0;
+
+out_err:
+	if (wr)
+		ubifs_lpt_free(c, 1);
+	if (rd)
+		ubifs_lpt_free(c, 0);
+	return err;
 }
 
 /**
@@ -1986,12 +1990,11 @@
 
 				if (path[h].in_tree)
 					continue;
-				nnode = kmalloc(sz, GFP_NOFS);
+				nnode = kmemdup(&path[h].nnode, sz, GFP_NOFS);
 				if (!nnode) {
 					err = -ENOMEM;
 					goto out;
 				}
-				memcpy(nnode, &path[h].nnode, sz);
 				parent = nnode->parent;
 				parent->nbranch[nnode->iip].nnode = nnode;
 				path[h].ptr.nnode = nnode;
@@ -2004,12 +2007,11 @@
 				const size_t sz = sizeof(struct ubifs_pnode);
 				struct ubifs_nnode *parent;
 
-				pnode = kmalloc(sz, GFP_NOFS);
+				pnode = kmemdup(&path[h].pnode, sz, GFP_NOFS);
 				if (!pnode) {
 					err = -ENOMEM;
 					goto out;
 				}
-				memcpy(pnode, &path[h].pnode, sz);
 				parent = pnode->parent;
 				parent->nbranch[pnode->iip].pnode = pnode;
 				path[h].ptr.pnode = pnode;
@@ -2082,8 +2084,6 @@
 	return err;
 }
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
-
 /**
  * dbg_chk_pnode - check a pnode.
  * @c: the UBIFS file-system description object
@@ -2098,8 +2098,8 @@
 	int i;
 
 	if (pnode->num != col) {
-		dbg_err("pnode num %d expected %d parent num %d iip %d",
-			pnode->num, col, pnode->parent->num, pnode->iip);
+		ubifs_err("pnode num %d expected %d parent num %d iip %d",
+			  pnode->num, col, pnode->parent->num, pnode->iip);
 		return -EINVAL;
 	}
 	for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
@@ -2113,14 +2113,14 @@
 		if (lnum >= c->leb_cnt)
 			continue;
 		if (lprops->lnum != lnum) {
-			dbg_err("bad LEB number %d expected %d",
-				lprops->lnum, lnum);
+			ubifs_err("bad LEB number %d expected %d",
+				  lprops->lnum, lnum);
 			return -EINVAL;
 		}
 		if (lprops->flags & LPROPS_TAKEN) {
 			if (cat != LPROPS_UNCAT) {
-				dbg_err("LEB %d taken but not uncat %d",
-					lprops->lnum, cat);
+				ubifs_err("LEB %d taken but not uncat %d",
+					  lprops->lnum, cat);
 				return -EINVAL;
 			}
 			continue;
@@ -2132,8 +2132,8 @@
 			case LPROPS_FRDI_IDX:
 				break;
 			default:
-				dbg_err("LEB %d index but cat %d",
-					lprops->lnum, cat);
+				ubifs_err("LEB %d index but cat %d",
+					  lprops->lnum, cat);
 				return -EINVAL;
 			}
 		} else {
@@ -2145,8 +2145,8 @@
 			case LPROPS_FREEABLE:
 				break;
 			default:
-				dbg_err("LEB %d not index but cat %d",
-					lprops->lnum, cat);
+				ubifs_err("LEB %d not index but cat %d",
+					  lprops->lnum, cat);
 				return -EINVAL;
 			}
 		}
@@ -2186,24 +2186,24 @@
 			break;
 		}
 		if (!found) {
-			dbg_err("LEB %d cat %d not found in cat heap/list",
-				lprops->lnum, cat);
+			ubifs_err("LEB %d cat %d not found in cat heap/list",
+				  lprops->lnum, cat);
 			return -EINVAL;
 		}
 		switch (cat) {
 		case LPROPS_EMPTY:
 			if (lprops->free != c->leb_size) {
-				dbg_err("LEB %d cat %d free %d dirty %d",
-					lprops->lnum, cat, lprops->free,
-					lprops->dirty);
+				ubifs_err("LEB %d cat %d free %d dirty %d",
+					  lprops->lnum, cat, lprops->free,
+					  lprops->dirty);
 				return -EINVAL;
 			}
 		case LPROPS_FREEABLE:
 		case LPROPS_FRDI_IDX:
 			if (lprops->free + lprops->dirty != c->leb_size) {
-				dbg_err("LEB %d cat %d free %d dirty %d",
-					lprops->lnum, cat, lprops->free,
-					lprops->dirty);
+				ubifs_err("LEB %d cat %d free %d dirty %d",
+					  lprops->lnum, cat, lprops->free,
+					  lprops->dirty);
 				return -EINVAL;
 			}
 		}
@@ -2237,9 +2237,9 @@
 			/* cnode is a nnode */
 			num = calc_nnode_num(row, col);
 			if (cnode->num != num) {
-				dbg_err("nnode num %d expected %d "
-					"parent num %d iip %d", cnode->num, num,
-					(nnode ? nnode->num : 0), cnode->iip);
+				ubifs_err("nnode num %d expected %d parent num %d iip %d",
+					  cnode->num, num,
+					  (nnode ? nnode->num : 0), cnode->iip);
 				return -EINVAL;
 			}
 			nn = (struct ubifs_nnode *)cnode;
@@ -2276,5 +2276,3 @@
 	}
 	return 0;
 }
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff -ur a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
--- a/fs/ubifs/lpt_commit.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/lpt_commit.c	2014-02-17 11:56:58.000000000 +0100
@@ -30,11 +30,7 @@
 #include <linux/random.h>
 #include "ubifs.h"
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
 static int dbg_populate_lsave(struct ubifs_info *c);
-#else
-#define dbg_populate_lsave(c) 0
-#endif
 
 /**
  * first_dirty_cnode - find first dirty cnode.
@@ -324,11 +320,10 @@
 	return 0;
 
 no_space:
-	ubifs_err("LPT out of space");
-	dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
-		"done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
-	dbg_dump_lpt_info(c);
-	dbg_dump_lpt_lebs(c);
+	ubifs_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, done_lsave %d",
+		  lnum, offs, len, done_ltab, done_lsave);
+	ubifs_dump_lpt_info(c);
+	ubifs_dump_lpt_lebs(c);
 	dump_stack();
 	return err;
 }
@@ -421,7 +416,7 @@
 				alen = ALIGN(wlen, c->min_io_size);
 				memset(buf + offs, 0xff, alen - wlen);
 				err = ubifs_leb_write(c, lnum, buf + from, from,
-						       alen, UBI_SHORTTERM);
+						       alen);
 				if (err)
 					return err;
 			}
@@ -479,8 +474,7 @@
 			wlen = offs - from;
 			alen = ALIGN(wlen, c->min_io_size);
 			memset(buf + offs, 0xff, alen - wlen);
-			err = ubifs_leb_write(c, lnum, buf + from, from, alen,
-					      UBI_SHORTTERM);
+			err = ubifs_leb_write(c, lnum, buf + from, from, alen);
 			if (err)
 				return err;
 			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
@@ -506,8 +500,7 @@
 			wlen = offs - from;
 			alen = ALIGN(wlen, c->min_io_size);
 			memset(buf + offs, 0xff, alen - wlen);
-			err = ubifs_leb_write(c, lnum, buf + from, from, alen,
-					      UBI_SHORTTERM);
+			err = ubifs_leb_write(c, lnum, buf + from, from, alen);
 			if (err)
 				return err;
 			dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
@@ -531,7 +524,7 @@
 	wlen = offs - from;
 	alen = ALIGN(wlen, c->min_io_size);
 	memset(buf + offs, 0xff, alen - wlen);
-	err = ubifs_leb_write(c, lnum, buf + from, from, alen, UBI_SHORTTERM);
+	err = ubifs_leb_write(c, lnum, buf + from, from, alen);
 	if (err)
 		return err;
 
@@ -552,11 +545,10 @@
 	return 0;
 
 no_space:
-	ubifs_err("LPT out of space mismatch");
-	dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
-		"%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
-	dbg_dump_lpt_info(c);
-	dbg_dump_lpt_lebs(c);
+	ubifs_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab %d, done_lsave %d",
+		  lnum, offs, len, done_ltab, done_lsave);
+	ubifs_dump_lpt_info(c);
+	ubifs_dump_lpt_lebs(c);
 	dump_stack();
 	return err;
 }
@@ -1497,7 +1489,9 @@
 	kfree(c->lpt_nod_buf);
 }
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
+/*
+ * Everything below is related to debugging.
+ */
 
 /**
  * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes.
@@ -1668,21 +1662,19 @@
 				continue;
 			}
 			if (!dbg_is_all_ff(p, len)) {
-				dbg_msg("invalid empty space in LEB %d at %d",
-					lnum, c->leb_size - len);
+				ubifs_err("invalid empty space in LEB %d at %d",
+					  lnum, c->leb_size - len);
 				err = -EINVAL;
 			}
 			i = lnum - c->lpt_first;
 			if (len != c->ltab[i].free) {
-				dbg_msg("invalid free space in LEB %d "
-					"(free %d, expected %d)",
-					lnum, len, c->ltab[i].free);
+				ubifs_err("invalid free space in LEB %d (free %d, expected %d)",
+					  lnum, len, c->ltab[i].free);
 				err = -EINVAL;
 			}
 			if (dirty != c->ltab[i].dirty) {
-				dbg_msg("invalid dirty space in LEB %d "
-					"(dirty %d, expected %d)",
-					lnum, dirty, c->ltab[i].dirty);
+				ubifs_err("invalid dirty space in LEB %d (dirty %d, expected %d)",
+					  lnum, dirty, c->ltab[i].dirty);
 				err = -EINVAL;
 			}
 			goto out;
@@ -1735,7 +1727,7 @@
 	for (lnum = c->lpt_first; lnum <= c->lpt_last; lnum++) {
 		err = dbg_check_ltab_lnum(c, lnum);
 		if (err) {
-			dbg_err("failed at LEB %d", lnum);
+			ubifs_err("failed at LEB %d", lnum);
 			return err;
 		}
 	}
@@ -1767,10 +1759,10 @@
 			free += c->leb_size;
 	}
 	if (free < c->lpt_sz) {
-		dbg_err("LPT space error: free %lld lpt_sz %lld",
-			free, c->lpt_sz);
-		dbg_dump_lpt_info(c);
-		dbg_dump_lpt_lebs(c);
+		ubifs_err("LPT space error: free %lld lpt_sz %lld",
+			  free, c->lpt_sz);
+		ubifs_dump_lpt_info(c);
+		ubifs_dump_lpt_lebs(c);
 		dump_stack();
 		return -EINVAL;
 	}
@@ -1807,13 +1799,13 @@
 		d->chk_lpt_lebs = 0;
 		d->chk_lpt_wastage = 0;
 		if (c->dirty_pn_cnt > c->pnode_cnt) {
-			dbg_err("dirty pnodes %d exceed max %d",
-				c->dirty_pn_cnt, c->pnode_cnt);
+			ubifs_err("dirty pnodes %d exceed max %d",
+				  c->dirty_pn_cnt, c->pnode_cnt);
 			err = -EINVAL;
 		}
 		if (c->dirty_nn_cnt > c->nnode_cnt) {
-			dbg_err("dirty nnodes %d exceed max %d",
-				c->dirty_nn_cnt, c->nnode_cnt);
+			ubifs_err("dirty nnodes %d exceed max %d",
+				  c->dirty_nn_cnt, c->nnode_cnt);
 			err = -EINVAL;
 		}
 		return err;
@@ -1830,23 +1822,23 @@
 		chk_lpt_sz *= d->chk_lpt_lebs;
 		chk_lpt_sz += len - c->nhead_offs;
 		if (d->chk_lpt_sz != chk_lpt_sz) {
-			dbg_err("LPT wrote %lld but space used was %lld",
-				d->chk_lpt_sz, chk_lpt_sz);
+			ubifs_err("LPT wrote %lld but space used was %lld",
+				  d->chk_lpt_sz, chk_lpt_sz);
 			err = -EINVAL;
 		}
 		if (d->chk_lpt_sz > c->lpt_sz) {
-			dbg_err("LPT wrote %lld but lpt_sz is %lld",
-				d->chk_lpt_sz, c->lpt_sz);
+			ubifs_err("LPT wrote %lld but lpt_sz is %lld",
+				  d->chk_lpt_sz, c->lpt_sz);
 			err = -EINVAL;
 		}
 		if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) {
-			dbg_err("LPT layout size %lld but wrote %lld",
-				d->chk_lpt_sz, d->chk_lpt_sz2);
+			ubifs_err("LPT layout size %lld but wrote %lld",
+				  d->chk_lpt_sz, d->chk_lpt_sz2);
 			err = -EINVAL;
 		}
 		if (d->chk_lpt_sz2 && d->new_nhead_offs != len) {
-			dbg_err("LPT new nhead offs: expected %d was %d",
-				d->new_nhead_offs, len);
+			ubifs_err("LPT new nhead offs: expected %d was %d",
+				  d->new_nhead_offs, len);
 			err = -EINVAL;
 		}
 		lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
@@ -1855,13 +1847,13 @@
 		if (c->big_lpt)
 			lpt_sz += c->lsave_sz;
 		if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) {
-			dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
-				d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
+			ubifs_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
+				  d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
 			err = -EINVAL;
 		}
 		if (err) {
-			dbg_dump_lpt_info(c);
-			dbg_dump_lpt_lebs(c);
+			ubifs_dump_lpt_info(c);
+			ubifs_dump_lpt_lebs(c);
 			dump_stack();
 		}
 		d->chk_lpt_sz2 = d->chk_lpt_sz;
@@ -1880,7 +1872,7 @@
 }
 
 /**
- * dbg_dump_lpt_leb - dump an LPT LEB.
+ * ubifs_dump_lpt_leb - dump an LPT LEB.
  * @c: UBIFS file-system description object
  * @lnum: LEB number to dump
  *
@@ -1894,8 +1886,7 @@
 	int err, len = c->leb_size, node_type, node_num, node_len, offs;
 	void *buf, *p;
 
-	printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
-	       current->pid, lnum);
+	pr_err("(pid %d) start dumping LEB %d\n", current->pid, lnum);
 	buf = p = __vmalloc(c->leb_size, GFP_NOFS, PAGE_KERNEL);
 	if (!buf) {
 		ubifs_err("cannot allocate memory to dump LPT");
@@ -1913,14 +1904,14 @@
 
 			pad_len = get_pad_len(c, p, len);
 			if (pad_len) {
-				printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
+				pr_err("LEB %d:%d, pad %d bytes\n",
 				       lnum, offs, pad_len);
 				p += pad_len;
 				len -= pad_len;
 				continue;
 			}
 			if (len)
-				printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n",
+				pr_err("LEB %d:%d, free %d bytes\n",
 				       lnum, offs, len);
 			break;
 		}
@@ -1931,11 +1922,10 @@
 		{
 			node_len = c->pnode_sz;
 			if (c->big_lpt)
-				printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n",
+				pr_err("LEB %d:%d, pnode num %d\n",
 				       lnum, offs, node_num);
 			else
-				printk(KERN_DEBUG "LEB %d:%d, pnode\n",
-				       lnum, offs);
+				pr_err("LEB %d:%d, pnode\n", lnum, offs);
 			break;
 		}
 		case UBIFS_LPT_NNODE:
@@ -1945,29 +1935,28 @@
 
 			node_len = c->nnode_sz;
 			if (c->big_lpt)
-				printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ",
+				pr_err("LEB %d:%d, nnode num %d, ",
 				       lnum, offs, node_num);
 			else
-				printk(KERN_DEBUG "LEB %d:%d, nnode, ",
+				pr_err("LEB %d:%d, nnode, ",
 				       lnum, offs);
 			err = ubifs_unpack_nnode(c, p, &nnode);
 			for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
-				printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
+				pr_cont("%d:%d", nnode.nbranch[i].lnum,
 				       nnode.nbranch[i].offs);
 				if (i != UBIFS_LPT_FANOUT - 1)
-					printk(KERN_CONT ", ");
+					pr_cont(", ");
 			}
-			printk(KERN_CONT "\n");
+			pr_cont("\n");
 			break;
 		}
 		case UBIFS_LPT_LTAB:
 			node_len = c->ltab_sz;
-			printk(KERN_DEBUG "LEB %d:%d, ltab\n",
-			       lnum, offs);
+			pr_err("LEB %d:%d, ltab\n", lnum, offs);
 			break;
 		case UBIFS_LPT_LSAVE:
 			node_len = c->lsave_sz;
-			printk(KERN_DEBUG "LEB %d:%d, lsave len\n", lnum, offs);
+			pr_err("LEB %d:%d, lsave len\n", lnum, offs);
 			break;
 		default:
 			ubifs_err("LPT node type %d not recognized", node_type);
@@ -1978,30 +1967,27 @@
 		len -= node_len;
 	}
 
-	printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
-	       current->pid, lnum);
+	pr_err("(pid %d) finish dumping LEB %d\n", current->pid, lnum);
 out:
 	vfree(buf);
 	return;
 }
 
 /**
- * dbg_dump_lpt_lebs - dump LPT lebs.
+ * ubifs_dump_lpt_lebs - dump LPT lebs.
  * @c: UBIFS file-system description object
  *
  * This function dumps all LPT LEBs. The caller has to make sure the LPT is
  * locked.
  */
-void dbg_dump_lpt_lebs(const struct ubifs_info *c)
+void ubifs_dump_lpt_lebs(const struct ubifs_info *c)
 {
 	int i;
 
-	printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n",
-	       current->pid);
+	pr_err("(pid %d) start dumping all LPT LEBs\n", current->pid);
 	for (i = 0; i < c->lpt_lebs; i++)
 		dump_lpt_leb(c, i + c->lpt_first);
-	printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n",
-	       current->pid);
+	pr_err("(pid %d) finish dumping all LPT LEBs\n", current->pid);
 }
 
 /**
@@ -2021,30 +2007,28 @@
 
 	if (!dbg_is_chk_gen(c))
 		return 0;
-	if (random32() & 3)
+	if (prandom_u32() & 3)
 		return 0;
 
 	for (i = 0; i < c->lsave_cnt; i++)
 		c->lsave[i] = c->main_first;
 
 	list_for_each_entry(lprops, &c->empty_list, list)
-		c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+		c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
 	list_for_each_entry(lprops, &c->freeable_list, list)
-		c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+		c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
 	list_for_each_entry(lprops, &c->frdi_idx_list, list)
-		c->lsave[random32() % c->lsave_cnt] = lprops->lnum;
+		c->lsave[prandom_u32() % c->lsave_cnt] = lprops->lnum;
 
 	heap = &c->lpt_heap[LPROPS_DIRTY_IDX - 1];
 	for (i = 0; i < heap->cnt; i++)
-		c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+		c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
 	heap = &c->lpt_heap[LPROPS_DIRTY - 1];
 	for (i = 0; i < heap->cnt; i++)
-		c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+		c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
 	heap = &c->lpt_heap[LPROPS_FREE - 1];
 	for (i = 0; i < heap->cnt; i++)
-		c->lsave[random32() % c->lsave_cnt] = heap->arr[i]->lnum;
+		c->lsave[prandom_u32() % c->lsave_cnt] = heap->arr[i]->lnum;
 
 	return 1;
 }
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff -ur a/fs/ubifs/Makefile b/fs/ubifs/Makefile
--- a/fs/ubifs/Makefile	2013-08-03 09:59:51.000000000 +0200
+++ b/fs/ubifs/Makefile	2014-01-21 09:37:27.000000000 +0100
@@ -3,7 +3,4 @@
 ubifs-y += shrinker.o journal.o file.o dir.o super.o sb.o io.o
 ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o
 ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o
-ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o
-
-ubifs-$(CONFIG_UBIFS_FS_DEBUG) += debug.o
-ubifs-$(CONFIG_UBIFS_FS_XATTR) += xattr.o
+ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o
diff -ur a/fs/ubifs/master.c b/fs/ubifs/master.c
--- a/fs/ubifs/master.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/master.c	2014-02-17 11:56:58.000000000 +0100
@@ -241,7 +241,7 @@
 
 out:
 	ubifs_err("bad master node at offset %d error %d", c->mst_offs, err);
-	dbg_dump_node(c, c->mst_node);
+	ubifs_dump_node(c, c->mst_node);
 	return -EINVAL;
 }
 
@@ -317,7 +317,7 @@
 		if (c->leb_cnt < old_leb_cnt ||
 		    c->leb_cnt < UBIFS_MIN_LEB_CNT) {
 			ubifs_err("bad leb_cnt on master node");
-			dbg_dump_node(c, c->mst_node);
+			ubifs_dump_node(c, c->mst_node);
 			return -EINVAL;
 		}
 
@@ -379,7 +379,7 @@
 	c->mst_offs = offs;
 	c->mst_node->highest_inum = cpu_to_le64(c->highest_inum);
 
-	err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);
+	err = ubifs_write_node(c, c->mst_node, len, lnum, offs);
 	if (err)
 		return err;
 
@@ -390,7 +390,7 @@
 		if (err)
 			return err;
 	}
-	err = ubifs_write_node(c, c->mst_node, len, lnum, offs, UBI_SHORTTERM);
+	err = ubifs_write_node(c, c->mst_node, len, lnum, offs);
 
 	return err;
 }
diff -ur a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
--- a/fs/ubifs/orphan.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/orphan.c	2014-02-17 11:56:58.000000000 +0100
@@ -52,11 +52,7 @@
  * than the maximum number of orphans allowed.
  */
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
 static int dbg_check_orphans(struct ubifs_info *c);
-#else
-#define dbg_check_orphans(c) 0
-#endif
 
 /**
  * ubifs_add_orphan - add an orphan.
@@ -92,7 +88,7 @@
 		else if (inum > o->inum)
 			p = &(*p)->rb_right;
 		else {
-			dbg_err("orphaned twice");
+			ubifs_err("orphaned twice");
 			spin_unlock(&c->orphan_lock);
 			kfree(orphan);
 			return 0;
@@ -136,7 +132,7 @@
 					(unsigned long)inum);
 				return;
 			}
-			if (o->cnext) {
+			if (o->cmt) {
 				o->del = 1;
 				o->dnext = c->orph_dnext;
 				c->orph_dnext = o;
@@ -159,8 +155,8 @@
 		}
 	}
 	spin_unlock(&c->orphan_lock);
-	dbg_err("missing orphan ino %lu", (unsigned long)inum);
-	dbg_dump_stack();
+	ubifs_err("missing orphan ino %lu", (unsigned long)inum);
+	dump_stack();
 }
 
 /**
@@ -177,11 +173,13 @@
 	last = &c->orph_cnext;
 	list_for_each_entry(orphan, &c->orph_new, new_list) {
 		ubifs_assert(orphan->new);
+		ubifs_assert(!orphan->cmt);
 		orphan->new = 0;
+		orphan->cmt = 1;
 		*last = orphan;
 		last = &orphan->cnext;
 	}
-	*last = orphan->cnext;
+	*last = NULL;
 	c->cmt_orphans = c->new_orphans;
 	c->new_orphans = 0;
 	dbg_cmt("%d orphans to commit", c->cmt_orphans);
@@ -249,8 +247,7 @@
 		ubifs_assert(c->ohead_offs == 0);
 		ubifs_prepare_node(c, c->orph_buf, len, 1);
 		len = ALIGN(len, c->min_io_size);
-		err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len,
-				       UBI_SHORTTERM);
+		err = ubifs_leb_change(c, c->ohead_lnum, c->orph_buf, len);
 	} else {
 		if (c->ohead_offs == 0) {
 			/* Ensure LEB has been unmapped */
@@ -259,7 +256,7 @@
 				return err;
 		}
 		err = ubifs_write_node(c, c->orph_buf, len, c->ohead_lnum,
-				       c->ohead_offs, UBI_SHORTTERM);
+				       c->ohead_offs);
 	}
 	return err;
 }
@@ -305,7 +302,9 @@
 	cnext = c->orph_cnext;
 	for (i = 0; i < cnt; i++) {
 		orphan = cnext;
+		ubifs_assert(orphan->cmt);
 		orph->inos[i] = cpu_to_le64(orphan->inum);
+		orphan->cmt = 0;
 		cnext = orphan->cnext;
 		orphan->cnext = NULL;
 	}
@@ -384,11 +383,12 @@
 		list_for_each_entry(orphan, &c->orph_list, list) {
 			if (orphan->new)
 				continue;
+			orphan->cmt = 1;
 			*last = orphan;
 			last = &orphan->cnext;
 			cnt += 1;
 		}
-		*last = orphan->cnext;
+		*last = NULL;
 		ubifs_assert(cnt == c->tot_orphans - c->new_orphans);
 		c->cmt_orphans = cnt;
 		c->ohead_lnum = c->orph_first;
@@ -570,9 +570,9 @@
 
 	list_for_each_entry(snod, &sleb->nodes, list) {
 		if (snod->type != UBIFS_ORPH_NODE) {
-			ubifs_err("invalid node type %d in orphan area at "
-				  "%d:%d", snod->type, sleb->lnum, snod->offs);
-			dbg_dump_node(c, snod->node);
+			ubifs_err("invalid node type %d in orphan area at %d:%d",
+				  snod->type, sleb->lnum, snod->offs);
+			ubifs_dump_node(c, snod->node);
 			return -EINVAL;
 		}
 
@@ -597,10 +597,9 @@
 			 * number. That makes this orphan node, out of date.
 			 */
 			if (!first) {
-				ubifs_err("out of order commit number %llu in "
-					  "orphan node at %d:%d",
+				ubifs_err("out of order commit number %llu in orphan node at %d:%d",
 					  cmt_no, sleb->lnum, snod->offs);
-				dbg_dump_node(c, snod->node);
+				ubifs_dump_node(c, snod->node);
 				return -EINVAL;
 			}
 			dbg_rcvry("out of date LEB %d", sleb->lnum);
@@ -728,7 +727,9 @@
 	return err;
 }
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
+/*
+ * Everything below is related to debugging.
+ */
 
 struct check_orphan {
 	struct rb_node rb;
@@ -971,5 +972,3 @@
 	kfree(ci.node);
 	return err;
 }
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff -ur a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
--- a/fs/ubifs/recovery.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/recovery.c	2014-02-17 11:56:58.000000000 +0100
@@ -213,10 +213,10 @@
 	mst->flags |= cpu_to_le32(UBIFS_MST_RCVRY);
 
 	ubifs_prepare_node(c, mst, UBIFS_MST_NODE_SZ, 1);
-	err = ubifs_leb_change(c, lnum, mst, sz, UBI_SHORTTERM);
+	err = ubifs_leb_change(c, lnum, mst, sz);
 	if (err)
 		goto out;
-	err = ubifs_leb_change(c, lnum + 1, mst, sz, UBI_SHORTTERM);
+	err = ubifs_leb_change(c, lnum + 1, mst, sz);
 	if (err)
 		goto out;
 out:
@@ -362,12 +362,12 @@
 out_free:
 	ubifs_err("failed to recover master node");
 	if (mst1) {
-		dbg_err("dumping first master node");
-		dbg_dump_node(c, mst1);
+		ubifs_err("dumping first master node");
+		ubifs_dump_node(c, mst1);
 	}
 	if (mst2) {
-		dbg_err("dumping second master node");
-		dbg_dump_node(c, mst2);
+		ubifs_err("dumping second master node");
+		ubifs_dump_node(c, mst2);
 	}
 	vfree(buf2);
 	vfree(buf1);
@@ -555,8 +555,7 @@
 					ubifs_pad(c, buf, pad_len);
 				}
 			}
-			err = ubifs_leb_change(c, lnum, sleb->buf, len,
-					       UBI_UNKNOWN);
+			err = ubifs_leb_change(c, lnum, sleb->buf, len);
 			if (err)
 				return err;
 		}
@@ -610,7 +609,8 @@
 		snod = list_entry(sleb->nodes.prev, struct ubifs_scan_node,
 				  list);
 
-		dbg_rcvry("dropping last node at %d:%d", sleb->lnum, snod->offs);
+		dbg_rcvry("dropping last node at %d:%d",
+			  sleb->lnum, snod->offs);
 		*offs = snod->offs;
 		list_del(&snod->list);
 		kfree(snod);
@@ -679,10 +679,11 @@
 			   ret == SCANNED_GARBAGE     ||
 			   ret == SCANNED_A_BAD_PAD_NODE ||
 			   ret == SCANNED_A_CORRUPT_NODE) {
-			dbg_rcvry("found corruption - %d", ret);
+			dbg_rcvry("found corruption (%d) at %d:%d",
+				  ret, lnum, offs);
 			break;
 		} else {
-			dbg_err("unexpected return value %d", ret);
+			ubifs_err("unexpected return value %d", ret);
 			err = -EINVAL;
 			goto error;
 		}
@@ -702,8 +703,8 @@
 			 * See header comment for this file for more
 			 * explanations about the reasons we have this check.
 			 */
-			ubifs_err("corrupt empty space LEB %d:%d, corruption "
-				  "starts at %d", lnum, offs, corruption);
+			ubifs_err("corrupt empty space LEB %d:%d, corruption starts at %d",
+				  lnum, offs, corruption);
 			/* Make sure we dump interesting non-0xFF data */
 			offs += corruption;
 			buf += corruption;
@@ -788,7 +789,7 @@
 
 corrupted_rescan:
 	/* Re-scan the corrupted data with verbose messages */
-	dbg_err("corruptio %d", ret);
+	ubifs_err("corruption %d", ret);
 	ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
 corrupted:
 	ubifs_scanned_corruption(c, lnum, offs, buf);
@@ -826,17 +827,17 @@
 		goto out_free;
 	ret = ubifs_scan_a_node(c, cs_node, UBIFS_CS_NODE_SZ, lnum, offs, 0);
 	if (ret != SCANNED_A_NODE) {
-		dbg_err("Not a valid node");
+		ubifs_err("Not a valid node");
 		goto out_err;
 	}
 	if (cs_node->ch.node_type != UBIFS_CS_NODE) {
-		dbg_err("Node a CS node, type is %d", cs_node->ch.node_type);
+		ubifs_err("Node a CS node, type is %d", cs_node->ch.node_type);
 		goto out_err;
 	}
 	if (le64_to_cpu(cs_node->cmt_no) != c->cmt_no) {
-		dbg_err("CS node cmt_no %llu != current cmt_no %llu",
-			(unsigned long long)le64_to_cpu(cs_node->cmt_no),
-			c->cmt_no);
+		ubifs_err("CS node cmt_no %llu != current cmt_no %llu",
+			  (unsigned long long)le64_to_cpu(cs_node->cmt_no),
+			  c->cmt_no);
 		goto out_err;
 	}
 	*cs_sqnum = le64_to_cpu(cs_node->ch.sqnum);
@@ -899,8 +900,8 @@
 				}
 			}
 			if (snod->sqnum > cs_sqnum) {
-				ubifs_err("unrecoverable log corruption "
-					  "in LEB %d", lnum);
+				ubifs_err("unrecoverable log corruption in LEB %d",
+					  lnum);
 				ubifs_scan_destroy(sleb);
 				return ERR_PTR(-EUCLEAN);
 			}
@@ -940,7 +941,7 @@
 		err = ubifs_leb_read(c, lnum, sbuf, 0, offs, 1);
 		if (err)
 			return err;
-		return ubifs_leb_change(c, lnum, sbuf, offs, UBI_UNKNOWN);
+		return ubifs_leb_change(c, lnum, sbuf, offs);
 	}
 
 	return 0;
@@ -1070,7 +1071,7 @@
 	}
 
 	/* Write back the LEB atomically */
-	err = ubifs_leb_change(c, lnum, sbuf, len, UBI_UNKNOWN);
+	err = ubifs_leb_change(c, lnum, sbuf, len);
 	if (err)
 		return err;
 
@@ -1137,9 +1138,9 @@
 	 */
 	lnum = ubifs_find_free_leb_for_idx(c);
 	if (lnum < 0) {
-		dbg_err("could not find an empty LEB");
-		dbg_dump_lprops(c);
-		dbg_dump_budg(c, &c->bi);
+		ubifs_err("could not find an empty LEB");
+		ubifs_dump_lprops(c);
+		ubifs_dump_budg(c, &c->bi);
 		return lnum;
 	}
 
@@ -1217,7 +1218,7 @@
 	}
 	mutex_unlock(&wbuf->io_mutex);
 	if (err < 0) {
-		dbg_err("GC failed, error %d", err);
+		ubifs_err("GC failed, error %d", err);
 		if (err == -EAGAIN)
 			err = -EINVAL;
 		return err;
@@ -1471,7 +1472,7 @@
 		len -= 1;
 	len = ALIGN(len + 1, c->min_io_size);
 	/* Atomically write the fixed LEB back again */
-	err = ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN);
+	err = ubifs_leb_change(c, lnum, c->sbuf, len);
 	if (err)
 		goto out;
 	dbg_rcvry("inode %lu at %d:%d size %lld -> %lld",
diff -ur a/fs/ubifs/replay.c b/fs/ubifs/replay.c
--- a/fs/ubifs/replay.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/replay.c	2014-02-17 11:56:58.000000000 +0100
@@ -141,9 +141,9 @@
 		 * during the replay.
 		 */
 		if (dirty != 0)
-			dbg_msg("LEB %d lp: %d free %d dirty "
-				"replay: %d free %d dirty", b->bud->lnum,
-				lp->free, lp->dirty, b->free, b->dirty);
+			dbg_mnt("LEB %d lp: %d free %d dirty replay: %d free %d dirty",
+				b->bud->lnum, lp->free, lp->dirty, b->free,
+				b->dirty);
 	}
 	lp = ubifs_change_lp(c, lp, b->free, dirty + b->dirty,
 			     lp->flags | LPROPS_TAKEN, 0);
@@ -154,8 +154,7 @@
 
 	/* Make sure the journal head points to the latest bud */
 	err = ubifs_wbuf_seek_nolock(&c->jheads[b->bud->jhead].wbuf,
-				     b->bud->lnum, c->leb_size - b->free,
-				     UBI_SHORTTERM);
+				     b->bud->lnum, c->leb_size - b->free);
 
 out:
 	ubifs_release_lprops(c);
@@ -221,8 +220,8 @@
 {
 	int err;
 
-	dbg_mnt("LEB %d:%d len %d deletion %d sqnum %llu %s", r->lnum,
-		r->offs, r->len, r->deletion, r->sqnum, DBGKEY(&r->key));
+	dbg_mntk(&r->key, "LEB %d:%d len %d deletion %d sqnum %llu key ",
+		 r->lnum, r->offs, r->len, r->deletion, r->sqnum);
 
 	/* Set c->replay_sqnum to help deal with dangling branches. */
 	c->replay_sqnum = r->sqnum;
@@ -361,7 +360,7 @@
 {
 	struct replay_entry *r;
 
-	dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+	dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
 
 	if (key_inum(c, key) >= c->highest_inum)
 		c->highest_inum = key_inum(c, key);
@@ -409,7 +408,7 @@
 	struct replay_entry *r;
 	char *nbuf;
 
-	dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
+	dbg_mntk(key, "add LEB %d:%d, key ", lnum, offs);
 	if (key_inum(c, key) >= c->highest_inum)
 		c->highest_inum = key_inum(c, key);
 
@@ -678,7 +677,8 @@
 
 	b->dirty = sleb->endpt - offs - used;
 	b->free = c->leb_size - sleb->endpt;
-	dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, b->dirty, b->free);
+	dbg_mnt("bud LEB %d replied: dirty %d, free %d",
+		lnum, b->dirty, b->free);
 
 out:
 	ubifs_scan_destroy(sleb);
@@ -686,7 +686,7 @@
 
 out_dump:
 	ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs);
-	dbg_dump_node(c, snod->node);
+	ubifs_dump_node(c, snod->node);
 	ubifs_scan_destroy(sleb);
 	return -EINVAL;
 }
@@ -861,16 +861,15 @@
 		 * numbers.
 		 */
 		if (snod->type != UBIFS_CS_NODE) {
-			dbg_err("first log node at LEB %d:%d is not CS node",
-				lnum, offs);
+			ubifs_err("first log node at LEB %d:%d is not CS node",
+				  lnum, offs);
 			goto out_dump;
 		}
 		if (le64_to_cpu(node->cmt_no) != c->cmt_no) {
-			dbg_err("first CS node at LEB %d:%d has wrong "
-				"commit number %llu expected %llu",
-				lnum, offs,
-				(unsigned long long)le64_to_cpu(node->cmt_no),
-				c->cmt_no);
+			ubifs_err("first CS node at LEB %d:%d has wrong commit number %llu expected %llu",
+				  lnum, offs,
+				  (unsigned long long)le64_to_cpu(node->cmt_no),
+				  c->cmt_no);
 			goto out_dump;
 		}
 
@@ -892,7 +891,7 @@
 
 	/* Make sure the first node sits at offset zero of the LEB */
 	if (snod->offs != 0) {
-		dbg_err("first node is not at zero offset");
+		ubifs_err("first node is not at zero offset");
 		goto out_dump;
 	}
 
@@ -905,8 +904,8 @@
 		}
 
 		if (snod->sqnum < c->cs_sqnum) {
-			dbg_err("bad sqnum %llu, commit sqnum %llu",
-				snod->sqnum, c->cs_sqnum);
+			ubifs_err("bad sqnum %llu, commit sqnum %llu",
+				  snod->sqnum, c->cs_sqnum);
 			goto out_dump;
 		}
 
@@ -958,7 +957,7 @@
 out_dump:
 	ubifs_err("log error detected while replaying the log at LEB %d:%d",
 		  lnum, offs + snod->offs);
-	dbg_dump_node(c, snod->node);
+	ubifs_dump_node(c, snod->node);
 	ubifs_scan_destroy(sleb);
 	return -EINVAL;
 }
@@ -1008,7 +1007,7 @@
  */
 int ubifs_replay_journal(struct ubifs_info *c)
 {
-	int err, i, lnum, offs, free;
+	int err, lnum, free;
 
 	BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
 
@@ -1026,25 +1025,16 @@
 	dbg_mnt("start replaying the journal");
 	c->replaying = 1;
 	lnum = c->ltail_lnum = c->lhead_lnum;
-	offs = c->lhead_offs;
 
-	for (i = 0; i < c->log_lebs; i++, lnum++) {
-		if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) {
-			/*
-			 * The log is logically circular, we reached the last
-			 * LEB, switch to the first one.
-			 */
-			lnum = UBIFS_LOG_LNUM;
-			offs = 0;
-		}
-		err = replay_log_leb(c, lnum, offs, c->sbuf);
+	do {
+		err = replay_log_leb(c, lnum, 0, c->sbuf);
 		if (err == 1)
 			/* We hit the end of the log */
 			break;
 		if (err)
 			goto out;
-		offs = 0;
-	}
+		lnum = ubifs_next_log_lnum(c, lnum);
+	} while (lnum != c->ltail_lnum);
 
 	err = replay_buds(c);
 	if (err)
@@ -1068,8 +1058,8 @@
 	c->bi.uncommitted_idx *= c->max_idx_node_sz;
 
 	ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
-	dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
-		"highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
+	dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, highest_inum %lu",
+		c->lhead_lnum, c->lhead_offs, c->max_sqnum,
 		(unsigned long)c->highest_inum);
 out:
 	destroy_replay_list(c);
diff -ur a/fs/ubifs/sb.c b/fs/ubifs/sb.c
--- a/fs/ubifs/sb.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/sb.c	2014-02-17 11:56:58.000000000 +0100
@@ -130,7 +130,6 @@
 	 * orphan node.
 	 */
 	orph_lebs = UBIFS_MIN_ORPH_LEBS;
-#ifdef CONFIG_UBIFS_FS_DEBUG
 	if (c->leb_cnt - min_leb_cnt > 1)
 		/*
 		 * For debugging purposes it is better to have at least 2
@@ -138,7 +137,6 @@
 		 * consolidations and would be stressed more.
 		 */
 		orph_lebs += 1;
-#endif
 
 	main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS - log_lebs;
 	main_lebs -= orph_lebs;
@@ -196,7 +194,7 @@
 	sup->rp_size = cpu_to_le64(tmp64);
 	sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
 
-	err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
+	err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0);
 	kfree(sup);
 	if (err)
 		return err;
@@ -252,14 +250,13 @@
 
 	mst->total_used = cpu_to_le64(UBIFS_INO_NODE_SZ);
 
-	err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0,
-			       UBI_UNKNOWN);
+	err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM, 0);
 	if (err) {
 		kfree(mst);
 		return err;
 	}
-	err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1, 0,
-			       UBI_UNKNOWN);
+	err = ubifs_write_node(c, mst, UBIFS_MST_NODE_SZ, UBIFS_MST_LNUM + 1,
+			       0);
 	kfree(mst);
 	if (err)
 		return err;
@@ -282,8 +279,7 @@
 	key_write_idx(c, &key, &br->key);
 	br->lnum = cpu_to_le32(main_first + DEFAULT_DATA_LEB);
 	br->len  = cpu_to_le32(UBIFS_INO_NODE_SZ);
-	err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0,
-			       UBI_UNKNOWN);
+	err = ubifs_write_node(c, idx, tmp, main_first + DEFAULT_IDX_LEB, 0);
 	kfree(idx);
 	if (err)
 		return err;
@@ -315,8 +311,7 @@
 	ino->flags = cpu_to_le32(UBIFS_COMPR_FL);
 
 	err = ubifs_write_node(c, ino, UBIFS_INO_NODE_SZ,
-			       main_first + DEFAULT_DATA_LEB, 0,
-			       UBI_UNKNOWN);
+			       main_first + DEFAULT_DATA_LEB, 0);
 	kfree(ino);
 	if (err)
 		return err;
@@ -335,8 +330,7 @@
 		return -ENOMEM;
 
 	cs->ch.node_type = UBIFS_CS_NODE;
-	err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM,
-			       0, UBI_UNKNOWN);
+	err = ubifs_write_node(c, cs, UBIFS_CS_NODE_SZ, UBIFS_LOG_LNUM, 0);
 	kfree(cs);
 
 	ubifs_msg("default file-system created");
@@ -397,9 +391,8 @@
 	min_leb_cnt += c->lpt_lebs + c->orph_lebs + c->jhead_cnt + 6;
 
 	if (c->leb_cnt < min_leb_cnt || c->leb_cnt > c->vi.size) {
-		ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, "
-			  "%d minimum required", c->leb_cnt, c->vi.size,
-			  min_leb_cnt);
+		ubifs_err("bad LEB count: %d in superblock, %d on UBI volume, %d minimum required",
+			  c->leb_cnt, c->vi.size, min_leb_cnt);
 		goto failed;
 	}
 
@@ -410,13 +403,22 @@
 	}
 
 	if (c->main_lebs < UBIFS_MIN_MAIN_LEBS) {
-		err = 7;
+		ubifs_err("too few main LEBs count %d, must be at least %d",
+			  c->main_lebs, UBIFS_MIN_MAIN_LEBS);
 		goto failed;
 	}
 
-	if (c->max_bud_bytes < (long long)c->leb_size * UBIFS_MIN_BUD_LEBS ||
-	    c->max_bud_bytes > (long long)c->leb_size * c->main_lebs) {
-		err = 8;
+	max_bytes = (long long)c->leb_size * UBIFS_MIN_BUD_LEBS;
+	if (c->max_bud_bytes < max_bytes) {
+		ubifs_err("too small journal (%lld bytes), must be at least %lld bytes",
+			  c->max_bud_bytes, max_bytes);
+		goto failed;
+	}
+
+	max_bytes = (long long)c->leb_size * c->main_lebs;
+	if (c->max_bud_bytes > max_bytes) {
+		ubifs_err("too large journal size (%lld bytes), only %lld bytes available in the main area",
+			  c->max_bud_bytes, max_bytes);
 		goto failed;
 	}
 
@@ -450,7 +452,6 @@
 		goto failed;
 	}
 
-	max_bytes = c->main_lebs * (long long)c->leb_size;
 	if (c->rp_size < 0 || max_bytes < c->rp_size) {
 		err = 14;
 		goto failed;
@@ -466,7 +467,7 @@
 
 failed:
 	ubifs_err("bad superblock, error %d", err);
-	dbg_dump_node(c, sup);
+	ubifs_dump_node(c, sup);
 	return -EINVAL;
 }
 
@@ -509,7 +510,7 @@
 	int len = ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size);
 
 	ubifs_prepare_node(c, sup, UBIFS_SB_NODE_SZ, 1);
-	return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len, UBI_LONGTERM);
+	return ubifs_leb_change(c, UBIFS_SB_LNUM, sup, len);
 }
 
 /**
@@ -546,10 +547,9 @@
 		ubifs_assert(!c->ro_media || c->ro_mount);
 		if (!c->ro_mount ||
 		    c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
-			ubifs_err("on-flash format version is w%d/r%d, but "
-				  "software only supports up to version "
-				  "w%d/r%d", c->fmt_version,
-				  c->ro_compat_version, UBIFS_FORMAT_VERSION,
+			ubifs_err("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d",
+				  c->fmt_version, c->ro_compat_version,
+				  UBIFS_FORMAT_VERSION,
 				  UBIFS_RO_COMPAT_VERSION);
 			if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
 				ubifs_msg("only R/O mounting is possible");
@@ -682,7 +682,7 @@
 	if (err)
 		return err;
 
-	return ubifs_leb_change(c, lnum, c->sbuf, len, UBI_UNKNOWN);
+	return ubifs_leb_change(c, lnum, c->sbuf, len);
 }
 
 /**
diff -ur a/fs/ubifs/scan.c b/fs/ubifs/scan.c
--- a/fs/ubifs/scan.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/scan.c	2014-02-17 11:56:58.000000000 +0100
@@ -75,7 +75,7 @@
 	magic = le32_to_cpu(ch->magic);
 
 	if (magic == 0xFFFFFFFF) {
-		dbg_scan("hit empty space");
+		dbg_scan("hit empty space at LEB %d:%d", lnum, offs);
 		return SCANNED_EMPTY_SPACE;
 	}
 
@@ -85,7 +85,8 @@
 	if (len < UBIFS_CH_SZ)
 		return SCANNED_GARBAGE;
 
-	dbg_scan("scanning %s", dbg_ntype(ch->node_type));
+	dbg_scan("scanning %s at LEB %d:%d",
+		 dbg_ntype(ch->node_type), lnum, offs);
 
 	if (ubifs_check_node(c, buf, lnum, offs, quiet, 1))
 		return SCANNED_A_CORRUPT_NODE;
@@ -101,7 +102,7 @@
 			if (!quiet) {
 				ubifs_err("bad pad node at LEB %d:%d",
 					  lnum, offs);
-				dbg_dump_node(c, pad);
+				ubifs_dump_node(c, pad);
 			}
 			return SCANNED_A_BAD_PAD_NODE;
 		}
@@ -109,13 +110,13 @@
 		/* Make the node pads to 8-byte boundary */
 		if ((node_len + pad_len) & 7) {
 			if (!quiet)
-				dbg_err("bad padding length %d - %d",
-					offs, offs + node_len + pad_len);
+				ubifs_err("bad padding length %d - %d",
+					  offs, offs + node_len + pad_len);
 			return SCANNED_A_BAD_PAD_NODE;
 		}
 
-		dbg_scan("%d bytes padded, offset now %d",
-			 pad_len, ALIGN(offs + node_len + pad_len, 8));
+		dbg_scan("%d bytes padded at LEB %d:%d, offset now %d", pad_len,
+			 lnum, offs, ALIGN(offs + node_len + pad_len, 8));
 
 		return node_len + pad_len;
 	}
@@ -150,8 +151,8 @@
 
 	err = ubifs_leb_read(c, lnum, sbuf + offs, offs, c->leb_size - offs, 0);
 	if (err && err != -EBADMSG) {
-		ubifs_err("cannot read %d bytes from LEB %d:%d,"
-			  " error %d", c->leb_size - offs, lnum, offs, err);
+		ubifs_err("cannot read %d bytes from LEB %d:%d, error %d",
+			  c->leb_size - offs, lnum, offs, err);
 		kfree(sleb);
 		return ERR_PTR(err);
 	}
@@ -240,12 +241,10 @@
 	int len;
 
 	ubifs_err("corruption at LEB %d:%d", lnum, offs);
-	if (dbg_is_tst_rcvry(c))
-		return;
 	len = c->leb_size - offs;
 	if (len > 8192)
 		len = 8192;
-	dbg_err("first %d bytes from LEB %d:%d", len, lnum, offs);
+	ubifs_err("first %d bytes from LEB %d:%d", len, lnum, offs);
 	print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_OFFSET, 32, 4, buf, len, 1);
 }
 
@@ -300,16 +299,16 @@
 
 		switch (ret) {
 		case SCANNED_GARBAGE:
-			dbg_err("garbage");
+			ubifs_err("garbage");
 			goto corrupted;
 		case SCANNED_A_NODE:
 			break;
 		case SCANNED_A_CORRUPT_NODE:
 		case SCANNED_A_BAD_PAD_NODE:
-			dbg_err("bad node");
+			ubifs_err("bad node");
 			goto corrupted;
 		default:
-			dbg_err("unknown");
+			ubifs_err("unknown");
 			err = -EINVAL;
 			goto error;
 		}
diff -ur a/fs/ubifs/super.c b/fs/ubifs/super.c
--- a/fs/ubifs/super.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/super.c	2014-02-17 11:56:58.000000000 +0100
@@ -89,9 +89,8 @@
 		return 5;
 
 	if (!ubifs_compr_present(ui->compr_type)) {
-		ubifs_warn("inode %lu uses '%s' compression, but it was not "
-			   "compiled in", inode->i_ino,
-			   ubifs_compr_name(ui->compr_type));
+		ubifs_warn("inode %lu uses '%s' compression, but it was not compiled in",
+			   inode->i_ino, ubifs_compr_name(ui->compr_type));
 	}
 
 	err = dbg_check_dir(c, inode);
@@ -246,8 +245,8 @@
 
 out_invalid:
 	ubifs_err("inode %lu validation failed, error %d", inode->i_ino, err);
-	dbg_dump_node(c, ino);
-	dbg_dump_inode(c, inode);
+	ubifs_dump_node(c, ino);
+	ubifs_dump_inode(c, inode);
 	err = -EINVAL;
 out_ino:
 	kfree(ino);
@@ -385,7 +384,16 @@
 {
 	struct ubifs_inode *ui = ubifs_inode(inode);
 
+#if defined(MY_ABC_HERE) || defined(MY_ABC_HERE)
+	if (!mutex_is_locked(&ui->ui_mutex)) {
+		/* because the fuction won't lock when it's called by SYNO_ArchiveModify
+		 * we skip it
+		 **/
+		return;
+	}
+#else
 	ubifs_assert(mutex_is_locked(&ui->ui_mutex));
+#endif
 	if (!ui->dirty) {
 		ui->dirty = 1;
 		dbg_gen("inode %lu",  inode->i_ino);
@@ -668,8 +676,8 @@
 	tmp = UBIFS_CS_NODE_SZ + UBIFS_REF_NODE_SZ * c->jhead_cnt;
 	tmp = ALIGN(tmp, c->min_io_size);
 	if (tmp > c->leb_size) {
-		dbg_err("too small LEB size %d, at least %d needed",
-			c->leb_size, tmp);
+		ubifs_err("too small LEB size %d, at least %d needed",
+			  c->leb_size, tmp);
 		return -EINVAL;
 	}
 
@@ -683,8 +691,8 @@
 	tmp /= c->leb_size;
 	tmp += 1;
 	if (c->log_lebs < tmp) {
-		dbg_err("too small log %d LEBs, required min. %d LEBs",
-			c->log_lebs, tmp);
+		ubifs_err("too small log %d LEBs, required min. %d LEBs",
+			  c->log_lebs, tmp);
 		return -EINVAL;
 	}
 
@@ -813,13 +821,10 @@
 		c->jheads[i].grouped = 1;
 	}
 
-	c->jheads[BASEHD].wbuf.dtype = UBI_SHORTTERM;
 	/*
-	 * Garbage Collector head likely contains long-term data and
-	 * does not need to be synchronized by timer. Also GC head nodes are
-	 * not grouped.
+	 * Garbage Collector head does not need to be synchronized by timer.
+	 * Also GC head nodes are not grouped.
 	 */
-	c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM;
 	c->jheads[GCHD].wbuf.no_timer = 1;
 	c->jheads[GCHD].grouped = 0;
 
@@ -863,7 +868,7 @@
 		orph = list_entry(c->orph_list.next, struct ubifs_orphan, list);
 		list_del(&orph->list);
 		kfree(orph);
-		dbg_err("orphan list not empty at unmount");
+		ubifs_err("orphan list not empty at unmount");
 	}
 
 	vfree(c->orph_buf);
@@ -1064,8 +1069,8 @@
 
 			flag = parse_standard_option(p);
 			if (!flag) {
-				ubifs_err("unrecognized mount option \"%s\" "
-					  "or missing value", p);
+				ubifs_err("unrecognized mount option \"%s\" or missing value",
+					  p);
 				return -EINVAL;
 			}
 			sb->s_flags |= flag;
@@ -1127,8 +1132,8 @@
 		}
 
 		/* Just disable bulk-read */
-		ubifs_warn("Cannot allocate %d bytes of memory for bulk-read, "
-			   "disabling it", c->max_bu_buf_len);
+		ubifs_warn("cannot allocate %d bytes of memory for bulk-read, disabling it",
+			   c->max_bu_buf_len);
 		c->mount_opts.bulk_read = 1;
 		c->bulk_read = 0;
 		return;
@@ -1147,8 +1152,8 @@
 	ubifs_assert(c->dark_wm > 0);
 	if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
 		ubifs_err("insufficient free space to mount in R/W mode");
-		dbg_dump_budg(c, &c->bi);
-		dbg_dump_lprops(c);
+		ubifs_dump_budg(c, &c->bi);
+		ubifs_dump_lprops(c);
 		return -ENOSPC;
 	}
 	return 0;
@@ -1160,14 +1165,11 @@
  *
  * This function mounts UBIFS file system. Returns zero in case of success and
  * a negative error code in case of failure.
- *
- * Note, the function does not de-allocate resources it it fails half way
- * through, and the caller has to do this instead.
  */
 static int mount_ubifs(struct ubifs_info *c)
 {
 	int err;
-	long long x;
+	long long x, y;
 	size_t sz;
 
 	c->ro_mount = !!(c->vfs_sb->s_flags & MS_RDONLY);
@@ -1301,7 +1303,7 @@
 	if (!c->ro_mount && c->space_fixup) {
 		err = ubifs_fixup_free_space(c);
 		if (err)
-			goto out_master;
+			goto out_lpt;
 	}
 
 	if (!c->ro_mount) {
@@ -1417,75 +1419,69 @@
 
 	c->mounting = 0;
 
-	ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"",
-		  c->vi.ubi_num, c->vi.vol_id, c->vi.name);
-	if (c->ro_mount)
-		ubifs_msg("mounted read-only");
+	ubifs_msg("mounted UBI device %d, volume %d, name \"%s\"%s",
+		  c->vi.ubi_num, c->vi.vol_id, c->vi.name,
+		  c->ro_mount ? ", R/O mode" : "");
 	x = (long long)c->main_lebs * c->leb_size;
-	ubifs_msg("file system size:   %lld bytes (%lld KiB, %lld MiB, %d "
-		  "LEBs)", x, x >> 10, x >> 20, c->main_lebs);
-	x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
-	ubifs_msg("journal size:       %lld bytes (%lld KiB, %lld MiB, %d "
-		  "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
-	ubifs_msg("media format:       w%d/r%d (latest is w%d/r%d)",
+	y = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
+	ubifs_msg("LEB size: %d bytes (%d KiB), min./max. I/O unit sizes: %d bytes/%d bytes",
+		  c->leb_size, c->leb_size >> 10, c->min_io_size,
+		  c->max_write_size);
+	ubifs_msg("FS size: %lld bytes (%lld MiB, %d LEBs), journal size %lld bytes (%lld MiB, %d LEBs)",
+		  x, x >> 20, c->main_lebs,
+		  y, y >> 20, c->log_lebs + c->max_bud_cnt);
+	ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
+		  c->report_rp_size, c->report_rp_size >> 10);
+	ubifs_msg("media format: w%d/r%d (latest is w%d/r%d), UUID %pUB%s",
 		  c->fmt_version, c->ro_compat_version,
-		  UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
-	ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
-	ubifs_msg("reserved for root:  %llu bytes (%llu KiB)",
-		c->report_rp_size, c->report_rp_size >> 10);
-
-	dbg_msg("compiled on:         " __DATE__ " at " __TIME__);
-	dbg_msg("min. I/O unit size:  %d bytes", c->min_io_size);
-	dbg_msg("max. write size:     %d bytes", c->max_write_size);
-	dbg_msg("LEB size:            %d bytes (%d KiB)",
-		c->leb_size, c->leb_size >> 10);
-	dbg_msg("data journal heads:  %d",
+		  UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION, c->uuid,
+		  c->big_lpt ? ", big LPT model" : ", small LPT model");
+
+	dbg_gen("default compressor:  %s", ubifs_compr_name(c->default_compr));
+	dbg_gen("data journal heads:  %d",
 		c->jhead_cnt - NONDATA_JHEADS_CNT);
-	dbg_msg("UUID:                %pUB", c->uuid);
-	dbg_msg("big_lpt              %d", c->big_lpt);
-	dbg_msg("log LEBs:            %d (%d - %d)",
+	dbg_gen("log LEBs:            %d (%d - %d)",
 		c->log_lebs, UBIFS_LOG_LNUM, c->log_last);
-	dbg_msg("LPT area LEBs:       %d (%d - %d)",
+	dbg_gen("LPT area LEBs:       %d (%d - %d)",
 		c->lpt_lebs, c->lpt_first, c->lpt_last);
-	dbg_msg("orphan area LEBs:    %d (%d - %d)",
+	dbg_gen("orphan area LEBs:    %d (%d - %d)",
 		c->orph_lebs, c->orph_first, c->orph_last);
-	dbg_msg("main area LEBs:      %d (%d - %d)",
+	dbg_gen("main area LEBs:      %d (%d - %d)",
 		c->main_lebs, c->main_first, c->leb_cnt - 1);
-	dbg_msg("index LEBs:          %d", c->lst.idx_lebs);
-	dbg_msg("total index bytes:   %lld (%lld KiB, %lld MiB)",
+	dbg_gen("index LEBs:          %d", c->lst.idx_lebs);
+	dbg_gen("total index bytes:   %lld (%lld KiB, %lld MiB)",
 		c->bi.old_idx_sz, c->bi.old_idx_sz >> 10,
 		c->bi.old_idx_sz >> 20);
-	dbg_msg("key hash type:       %d", c->key_hash_type);
-	dbg_msg("tree fanout:         %d", c->fanout);
-	dbg_msg("reserved GC LEB:     %d", c->gc_lnum);
-	dbg_msg("first main LEB:      %d", c->main_first);
-	dbg_msg("max. znode size      %d", c->max_znode_sz);
-	dbg_msg("max. index node size %d", c->max_idx_node_sz);
-	dbg_msg("node sizes:          data %zu, inode %zu, dentry %zu",
+	dbg_gen("key hash type:       %d", c->key_hash_type);
+	dbg_gen("tree fanout:         %d", c->fanout);
+	dbg_gen("reserved GC LEB:     %d", c->gc_lnum);
+	dbg_gen("max. znode size      %d", c->max_znode_sz);
+	dbg_gen("max. index node size %d", c->max_idx_node_sz);
+	dbg_gen("node sizes:          data %zu, inode %zu, dentry %zu",
 		UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ);
-	dbg_msg("node sizes:          trun %zu, sb %zu, master %zu",
+	dbg_gen("node sizes:          trun %zu, sb %zu, master %zu",
 		UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
-	dbg_msg("node sizes:          ref %zu, cmt. start %zu, orph %zu",
+	dbg_gen("node sizes:          ref %zu, cmt. start %zu, orph %zu",
 		UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
-	dbg_msg("max. node sizes:     data %zu, inode %zu dentry %zu, idx %d",
+	dbg_gen("max. node sizes:     data %zu, inode %zu dentry %zu, idx %d",
 		UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
 		UBIFS_MAX_DENT_NODE_SZ, ubifs_idx_node_sz(c, c->fanout));
-	dbg_msg("dead watermark:      %d", c->dead_wm);
-	dbg_msg("dark watermark:      %d", c->dark_wm);
-	dbg_msg("LEB overhead:        %d", c->leb_overhead);
+	dbg_gen("dead watermark:      %d", c->dead_wm);
+	dbg_gen("dark watermark:      %d", c->dark_wm);
+	dbg_gen("LEB overhead:        %d", c->leb_overhead);
 	x = (long long)c->main_lebs * c->dark_wm;
-	dbg_msg("max. dark space:     %lld (%lld KiB, %lld MiB)",
+	dbg_gen("max. dark space:     %lld (%lld KiB, %lld MiB)",
 		x, x >> 10, x >> 20);
-	dbg_msg("maximum bud bytes:   %lld (%lld KiB, %lld MiB)",
+	dbg_gen("maximum bud bytes:   %lld (%lld KiB, %lld MiB)",
 		c->max_bud_bytes, c->max_bud_bytes >> 10,
 		c->max_bud_bytes >> 20);
-	dbg_msg("BG commit bud bytes: %lld (%lld KiB, %lld MiB)",
+	dbg_gen("BG commit bud bytes: %lld (%lld KiB, %lld MiB)",
 		c->bg_bud_bytes, c->bg_bud_bytes >> 10,
 		c->bg_bud_bytes >> 20);
-	dbg_msg("current bud bytes    %lld (%lld KiB, %lld MiB)",
+	dbg_gen("current bud bytes    %lld (%lld KiB, %lld MiB)",
 		c->bud_bytes, c->bud_bytes >> 10, c->bud_bytes >> 20);
-	dbg_msg("max. seq. number:    %llu", c->max_sqnum);
-	dbg_msg("commit number:       %llu", c->cmt_no);
+	dbg_gen("max. seq. number:    %llu", c->max_sqnum);
+	dbg_gen("commit number:       %llu", c->cmt_no);
 
 	return 0;
 
@@ -1570,10 +1566,9 @@
 
 	if (c->rw_incompat) {
 		ubifs_err("the file-system is not R/W-compatible");
-		ubifs_msg("on-flash format version is w%d/r%d, but software "
-			  "only supports up to version w%d/r%d", c->fmt_version,
-			  c->ro_compat_version, UBIFS_FORMAT_VERSION,
-			  UBIFS_RO_COMPAT_VERSION);
+		ubifs_msg("on-flash format version is w%d/r%d, but software only supports up to version w%d/r%d",
+			  c->fmt_version, c->ro_compat_version,
+			  UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
 		return -EROFS;
 	}
 
@@ -1582,6 +1577,12 @@
 	c->remounting_rw = 1;
 	c->ro_mount = 0;
 
+	if (c->space_fixup) {
+		err = ubifs_fixup_free_space(c);
+		if (err)
+			return err;
+	}
+
 	err = check_free_space(c);
 	if (err)
 		goto out;
@@ -1698,12 +1699,6 @@
 		err = dbg_check_space_info(c);
 	}
 
-	if (c->space_fixup) {
-		err = ubifs_fixup_free_space(c);
-		if (err)
-			goto out;
-	}
-
 	mutex_unlock(&c->umount_mutex);
 	return err;
 
@@ -1834,8 +1829,8 @@
 				 * next mount, so we just print a message and
 				 * continue to unmount normally.
 				 */
-				ubifs_err("failed to write master node, "
-					  "error %d", err);
+				ubifs_err("failed to write master node, error %d",
+					  err);
 		} else {
 			for (i = 0; i < c->jhead_cnt; i++)
 				/* Make sure write-buffer timers are canceled */
@@ -2128,8 +2123,8 @@
 	 */
 	ubi = open_ubi(name, UBI_READONLY);
 	if (IS_ERR(ubi)) {
-		dbg_err("cannot open \"%s\", error %d",
-			name, (int)PTR_ERR(ubi));
+		ubifs_err("cannot open \"%s\", error %d",
+			  name, (int)PTR_ERR(ubi));
 		return ERR_CAST(ubi);
 	}
 
@@ -2257,8 +2252,7 @@
 	 * UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
 	 */
 	if (PAGE_CACHE_SIZE < UBIFS_BLOCK_SIZE) {
-		ubifs_err("VFS page cache size is %u bytes, but UBIFS requires"
-			  " at least 4096 bytes",
+		ubifs_err("VFS page cache size is %u bytes, but UBIFS requires at least 4096 bytes",
 			  (unsigned int)PAGE_CACHE_SIZE);
 		return -EINVAL;
 	}
@@ -2307,6 +2301,12 @@
 	dbg_debugfs_exit();
 	ubifs_compressors_exit();
 	unregister_shrinker(&ubifs_shrinker_info);
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ubifs_inode_slab);
 	unregister_filesystem(&ubifs_fs_type);
 }
diff -ur a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
--- a/fs/ubifs/tnc.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/tnc.c	2014-02-17 11:56:58.000000000 +0100
@@ -339,17 +339,16 @@
 
 	err = ubifs_validate_entry(c, dent);
 	if (err) {
-		dbg_dump_stack();
-		dbg_dump_node(c, dent);
+		dump_stack();
+		ubifs_dump_node(c, dent);
 		return err;
 	}
 
-	lnc_node = kmalloc(zbr->len, GFP_NOFS);
+	lnc_node = kmemdup(node, zbr->len, GFP_NOFS);
 	if (!lnc_node)
 		/* We don't have to have the cache, so no error */
 		return 0;
 
-	memcpy(lnc_node, node, zbr->len);
 	zbr->leaf = lnc_node;
 	return 0;
 }
@@ -373,8 +372,8 @@
 
 	err = ubifs_validate_entry(c, node);
 	if (err) {
-		dbg_dump_stack();
-		dbg_dump_node(c, node);
+		dump_stack();
+		ubifs_dump_node(c, node);
 		return err;
 	}
 
@@ -506,7 +505,7 @@
 {
 	int ret;
 
-	dbg_tnc("LEB %d:%d, key %s", zbr->lnum, zbr->offs, DBGKEY(key));
+	dbg_tnck(key, "LEB %d:%d, key ", zbr->lnum, zbr->offs);
 
 	ret = try_read_node(c, node, key_type(c, key), zbr->len, zbr->lnum,
 			    zbr->offs);
@@ -520,8 +519,8 @@
 			ret = 0;
 	}
 	if (ret == 0 && c->replaying)
-		dbg_mnt("dangling branch LEB %d:%d len %d, key %s",
-			zbr->lnum, zbr->offs, zbr->len, DBGKEY(key));
+		dbg_mntk(key, "dangling branch LEB %d:%d len %d, key ",
+			zbr->lnum, zbr->offs, zbr->len);
 	return ret;
 }
 
@@ -996,9 +995,9 @@
 	if (adding || !o_znode)
 		return 0;
 
-	dbg_mnt("dangling match LEB %d:%d len %d %s",
+	dbg_mntk(key, "dangling match LEB %d:%d len %d key ",
 		o_znode->zbranch[o_n].lnum, o_znode->zbranch[o_n].offs,
-		o_znode->zbranch[o_n].len, DBGKEY(key));
+		o_znode->zbranch[o_n].len);
 	*zn = o_znode;
 	*n = o_n;
 	return 1;
@@ -1180,7 +1179,7 @@
 	struct ubifs_znode *znode;
 	unsigned long time = get_seconds();
 
-	dbg_tnc("search key %s", DBGKEY(key));
+	dbg_tnck(key, "search key ");
 	ubifs_assert(key_type(c, key) < UBIFS_INVALID_KEY);
 
 	znode = c->zroot.znode;
@@ -1316,7 +1315,7 @@
 	struct ubifs_znode *znode;
 	unsigned long time = get_seconds();
 
-	dbg_tnc("search and dirty key %s", DBGKEY(key));
+	dbg_tnck(key, "search and dirty key ");
 
 	znode = c->zroot.znode;
 	if (unlikely(!znode)) {
@@ -1723,8 +1722,8 @@
 	if (!keys_eq(c, &zbr->key, &key1)) {
 		ubifs_err("bad key in node at LEB %d:%d",
 			  zbr->lnum, zbr->offs);
-		dbg_tnc("looked for key %s found node's key %s",
-			DBGKEY(&zbr->key), DBGKEY1(&key1));
+		dbg_tnck(&zbr->key, "looked for key ");
+		dbg_tnck(&key1, "found node's key ");
 		goto out_err;
 	}
 
@@ -1734,8 +1733,8 @@
 	err = -EINVAL;
 out:
 	ubifs_err("bad node at LEB %d:%d", zbr->lnum, zbr->offs);
-	dbg_dump_node(c, buf);
-	dbg_dump_stack();
+	ubifs_dump_node(c, buf);
+	dump_stack();
 	return err;
 }
 
@@ -1776,8 +1775,8 @@
 	if (err && err != -EBADMSG) {
 		ubifs_err("failed to read from LEB %d:%d, error %d",
 			  lnum, offs, err);
-		dbg_dump_stack();
-		dbg_tnc("key %s", DBGKEY(&bu->key));
+		dump_stack();
+		dbg_tnck(&bu->key, "key ");
 		return err;
 	}
 
@@ -1812,7 +1811,7 @@
 	int found, n, err;
 	struct ubifs_znode *znode;
 
-	dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key));
+	dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name);
 	mutex_lock(&c->tnc_mutex);
 	found = ubifs_lookup_level0(c, key, &znode, &n);
 	if (!found) {
@@ -1986,8 +1985,7 @@
 	zp = znode->parent;
 	if (znode->child_cnt < c->fanout) {
 		ubifs_assert(n != c->fanout);
-		dbg_tnc("inserted at %d level %d, key %s", n, znode->level,
-			DBGKEY(key));
+		dbg_tnck(key, "inserted at %d level %d, key ", n, znode->level);
 
 		insert_zbranch(znode, zbr, n);
 
@@ -2002,7 +2000,7 @@
 	 * Unfortunately, @znode does not have more empty slots and we have to
 	 * split it.
 	 */
-	dbg_tnc("splitting level %d, key %s", znode->level, DBGKEY(key));
+	dbg_tnck(key, "splitting level %d, key ", znode->level);
 
 	if (znode->alt)
 		/*
@@ -2096,7 +2094,7 @@
 	}
 
 	/* Insert new key and branch */
-	dbg_tnc("inserting at %d level %d, key %s", n, zn->level, DBGKEY(key));
+	dbg_tnck(key, "inserting at %d level %d, key ", n, zn->level);
 
 	insert_zbranch(zi, zbr, n);
 
@@ -2172,7 +2170,7 @@
 	struct ubifs_znode *znode;
 
 	mutex_lock(&c->tnc_mutex);
-	dbg_tnc("%d:%d, len %d, key %s", lnum, offs, len, DBGKEY(key));
+	dbg_tnck(key, "%d:%d, len %d, key ", lnum, offs, len);
 	found = lookup_level0_dirty(c, key, &znode, &n);
 	if (!found) {
 		struct ubifs_zbranch zbr;
@@ -2221,8 +2219,8 @@
 	struct ubifs_znode *znode;
 
 	mutex_lock(&c->tnc_mutex);
-	dbg_tnc("old LEB %d:%d, new LEB %d:%d, len %d, key %s", old_lnum,
-		old_offs, lnum, offs, len, DBGKEY(key));
+	dbg_tnck(key, "old LEB %d:%d, new LEB %d:%d, len %d, key ", old_lnum,
+		 old_offs, lnum, offs, len);
 	found = lookup_level0_dirty(c, key, &znode, &n);
 	if (found < 0) {
 		err = found;
@@ -2304,8 +2302,8 @@
 	struct ubifs_znode *znode;
 
 	mutex_lock(&c->tnc_mutex);
-	dbg_tnc("LEB %d:%d, name '%.*s', key %s", lnum, offs, nm->len, nm->name,
-		DBGKEY(key));
+	dbg_tnck(key, "LEB %d:%d, name '%.*s', key ",
+		 lnum, offs, nm->len, nm->name);
 	found = lookup_level0_dirty(c, key, &znode, &n);
 	if (found < 0) {
 		err = found;
@@ -2398,14 +2396,14 @@
 	/* Delete without merge for now */
 	ubifs_assert(znode->level == 0);
 	ubifs_assert(n >= 0 && n < c->fanout);
-	dbg_tnc("deleting %s", DBGKEY(&znode->zbranch[n].key));
+	dbg_tnck(&znode->zbranch[n].key, "deleting key ");
 
 	zbr = &znode->zbranch[n];
 	lnc_free(zbr);
 
 	err = ubifs_add_dirt(c, zbr->lnum, zbr->len);
 	if (err) {
-		dbg_dump_znode(c, znode);
+		ubifs_dump_znode(c, znode);
 		return err;
 	}
 
@@ -2508,7 +2506,7 @@
 	struct ubifs_znode *znode;
 
 	mutex_lock(&c->tnc_mutex);
-	dbg_tnc("key %s", DBGKEY(key));
+	dbg_tnck(key, "key ");
 	found = lookup_level0_dirty(c, key, &znode, &n);
 	if (found < 0) {
 		err = found;
@@ -2539,7 +2537,7 @@
 	struct ubifs_znode *znode;
 
 	mutex_lock(&c->tnc_mutex);
-	dbg_tnc("%.*s, key %s", nm->len, nm->name, DBGKEY(key));
+	dbg_tnck(key, "%.*s, key ", nm->len, nm->name);
 	err = lookup_level0_dirty(c, key, &znode, &n);
 	if (err < 0)
 		goto out_unlock;
@@ -2651,10 +2649,10 @@
 			err = ubifs_add_dirt(c, znode->zbranch[i].lnum,
 					     znode->zbranch[i].len);
 			if (err) {
-				dbg_dump_znode(c, znode);
+				ubifs_dump_znode(c, znode);
 				goto out_unlock;
 			}
-			dbg_tnc("removing %s", DBGKEY(key));
+			dbg_tnck(key, "removing key ");
 		}
 		if (k) {
 			for (i = n + 1 + k; i < znode->child_cnt; i++)
@@ -2774,7 +2772,7 @@
 	struct ubifs_zbranch *zbr;
 	union ubifs_key *dkey;
 
-	dbg_tnc("%s %s", nm->name ? (char *)nm->name : "(lowest)", DBGKEY(key));
+	dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)");
 	ubifs_assert(is_hash_key(c, key));
 
 	mutex_lock(&c->tnc_mutex);
@@ -3277,8 +3275,6 @@
 	return err;
 }
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
-
 /**
  * dbg_check_inode_size - check if inode size is correct.
  * @c: UBIFS file-system description object
@@ -3333,17 +3329,15 @@
 
 out_dump:
 	block = key_block(c, key);
-	ubifs_err("inode %lu has size %lld, but there are data at offset %lld "
-		  "(data key %s)", (unsigned long)inode->i_ino, size,
-		  ((loff_t)block) << UBIFS_BLOCK_SHIFT, DBGKEY(key));
+	ubifs_err("inode %lu has size %lld, but there are data at offset %lld",
+		  (unsigned long)inode->i_ino, size,
+		  ((loff_t)block) << UBIFS_BLOCK_SHIFT);
 	mutex_unlock(&c->tnc_mutex);
-	dbg_dump_inode(c, inode);
-	dbg_dump_stack();
+	ubifs_dump_inode(c, inode);
+	dump_stack();
 	return -EINVAL;
 
 out_unlock:
 	mutex_unlock(&c->tnc_mutex);
 	return err;
 }
-
-#endif /* CONFIG_UBIFS_FS_DEBUG */
diff -ur a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
--- a/fs/ubifs/tnc_commit.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/tnc_commit.c	2014-02-17 11:56:58.000000000 +0100
@@ -54,18 +54,16 @@
 		br->len = cpu_to_le32(zbr->len);
 		if (!zbr->lnum || !zbr->len) {
 			ubifs_err("bad ref in znode");
-			dbg_dump_znode(c, znode);
+			ubifs_dump_znode(c, znode);
 			if (zbr->znode)
-				dbg_dump_znode(c, zbr->znode);
+				ubifs_dump_znode(c, zbr->znode);
 		}
 	}
 	ubifs_prepare_node(c, idx, len, 0);
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
 	znode->lnum = lnum;
 	znode->offs = offs;
 	znode->len = len;
-#endif
 
 	err = insert_old_idx_znode(c, znode);
 
@@ -322,8 +320,7 @@
 				  0, 0, 0);
 	if (err)
 		return err;
-	err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len,
-			       UBI_SHORTTERM);
+	err = ubifs_leb_change(c, lnum, c->ileb_buf, c->ileb_len);
 	if (err)
 		return err;
 	dbg_gc("LEB %d wrote %d index nodes", lnum, tot_written);
@@ -388,8 +385,8 @@
 				 * option which forces in-the-gaps is enabled.
 				 */
 				ubifs_warn("out of space");
-				dbg_dump_budg(c, &c->bi);
-				dbg_dump_lprops(c);
+				ubifs_dump_budg(c, &c->bi);
+				ubifs_dump_lprops(c);
 			}
 			/* Try to commit anyway */
 			err = 0;
@@ -456,11 +453,9 @@
 
 		offs = buf_offs + used;
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
 		znode->lnum = lnum;
 		znode->offs = offs;
 		znode->len = len;
-#endif
 
 		/* Update the parent */
 		zp = znode->parent;
@@ -536,10 +531,8 @@
 		break;
 	}
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
 	c->dbg->new_ihead_lnum = lnum;
 	c->dbg->new_ihead_offs = buf_offs;
-#endif
 
 	return 0;
 }
@@ -690,7 +683,7 @@
 		c->ilebs[c->ileb_cnt++] = lnum;
 		dbg_cmt("LEB %d", lnum);
 	}
-	if (dbg_is_chk_index(c) && !(random32() & 7))
+	if (dbg_is_chk_index(c) && !(prandom_u32() & 7))
 		return -ENOSPC;
 	return 0;
 }
@@ -864,9 +857,9 @@
 			br->len = cpu_to_le32(zbr->len);
 			if (!zbr->lnum || !zbr->len) {
 				ubifs_err("bad ref in znode");
-				dbg_dump_znode(c, znode);
+				ubifs_dump_znode(c, znode);
 				if (zbr->znode)
-					dbg_dump_znode(c, zbr->znode);
+					ubifs_dump_znode(c, zbr->znode);
 			}
 		}
 		len = ubifs_idx_node_sz(c, znode->child_cnt);
@@ -881,13 +874,11 @@
 		}
 		offs = buf_offs + used;
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
 		if (lnum != znode->lnum || offs != znode->offs ||
 		    len != znode->len) {
 			ubifs_err("inconsistent znode posn");
 			return -EINVAL;
 		}
-#endif
 
 		/* Grab some stuff from znode while we still can */
 		cnext = znode->cnext;
@@ -959,8 +950,7 @@
 		}
 
 		/* The buffer is full or there are no more znodes to do */
-		err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen,
-				      UBI_SHORTTERM);
+		err = ubifs_leb_write(c, lnum, c->cbuf, buf_offs, blen);
 		if (err)
 			return err;
 		buf_offs += blen;
@@ -982,13 +972,11 @@
 		break;
 	}
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
 	if (lnum != c->dbg->new_ihead_lnum ||
 	    buf_offs != c->dbg->new_ihead_offs) {
 		ubifs_err("inconsistent ihead");
 		return -EINVAL;
 	}
-#endif
 
 	c->ihead_lnum = lnum;
 	c->ihead_offs = buf_offs;
diff -ur a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
--- a/fs/ubifs/tnc_misc.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/tnc_misc.c	2014-02-17 11:56:58.000000000 +0100
@@ -293,10 +293,10 @@
 		lnum, offs, znode->level, znode->child_cnt);
 
 	if (znode->child_cnt > c->fanout || znode->level > UBIFS_MAX_LEVELS) {
-		dbg_err("current fanout %d, branch count %d",
-			c->fanout, znode->child_cnt);
-		dbg_err("max levels %d, znode level %d",
-			UBIFS_MAX_LEVELS, znode->level);
+		ubifs_err("current fanout %d, branch count %d",
+			  c->fanout, znode->child_cnt);
+		ubifs_err("max levels %d, znode level %d",
+			  UBIFS_MAX_LEVELS, znode->level);
 		err = 1;
 		goto out_dump;
 	}
@@ -316,7 +316,7 @@
 		if (zbr->lnum < c->main_first ||
 		    zbr->lnum >= c->leb_cnt || zbr->offs < 0 ||
 		    zbr->offs + zbr->len > c->leb_size || zbr->offs & 7) {
-			dbg_err("bad branch %d", i);
+			ubifs_err("bad branch %d", i);
 			err = 2;
 			goto out_dump;
 		}
@@ -328,8 +328,8 @@
 		case UBIFS_XENT_KEY:
 			break;
 		default:
-			dbg_msg("bad key type at slot %d: %s", i,
-				DBGKEY(&zbr->key));
+			ubifs_err("bad key type at slot %d: %d",
+				  i, key_type(c, &zbr->key));
 			err = 3;
 			goto out_dump;
 		}
@@ -340,19 +340,19 @@
 		type = key_type(c, &zbr->key);
 		if (c->ranges[type].max_len == 0) {
 			if (zbr->len != c->ranges[type].len) {
-				dbg_err("bad target node (type %d) length (%d)",
-					type, zbr->len);
-				dbg_err("have to be %d", c->ranges[type].len);
+				ubifs_err("bad target node (type %d) length (%d)",
+					  type, zbr->len);
+				ubifs_err("have to be %d", c->ranges[type].len);
 				err = 4;
 				goto out_dump;
 			}
 		} else if (zbr->len < c->ranges[type].min_len ||
 			   zbr->len > c->ranges[type].max_len) {
-			dbg_err("bad target node (type %d) length (%d)",
-				type, zbr->len);
-			dbg_err("have to be in range of %d-%d",
-				c->ranges[type].min_len,
-				c->ranges[type].max_len);
+			ubifs_err("bad target node (type %d) length (%d)",
+				  type, zbr->len);
+			ubifs_err("have to be in range of %d-%d",
+				  c->ranges[type].min_len,
+				  c->ranges[type].max_len);
 			err = 5;
 			goto out_dump;
 		}
@@ -370,13 +370,13 @@
 
 		cmp = keys_cmp(c, key1, key2);
 		if (cmp > 0) {
-			dbg_err("bad key order (keys %d and %d)", i, i + 1);
+			ubifs_err("bad key order (keys %d and %d)", i, i + 1);
 			err = 6;
 			goto out_dump;
 		} else if (cmp == 0 && !is_hash_key(c, key1)) {
 			/* These can only be keys with colliding hash */
-			dbg_err("keys %d and %d are not hashed but equivalent",
-				i, i + 1);
+			ubifs_err("keys %d and %d are not hashed but equivalent",
+				  i, i + 1);
 			err = 7;
 			goto out_dump;
 		}
@@ -387,7 +387,7 @@
 
 out_dump:
 	ubifs_err("bad indexing node at LEB %d:%d, error %d", lnum, offs, err);
-	dbg_dump_node(c, idx);
+	ubifs_dump_node(c, idx);
 	kfree(idx);
 	return -EINVAL;
 }
@@ -475,7 +475,7 @@
 				      zbr->offs);
 
 	if (err) {
-		dbg_tnc("key %s", DBGKEY(key));
+		dbg_tnck(key, "key ");
 		return err;
 	}
 
@@ -484,9 +484,9 @@
 	if (!keys_eq(c, key, &key1)) {
 		ubifs_err("bad key in node at LEB %d:%d",
 			  zbr->lnum, zbr->offs);
-		dbg_tnc("looked for key %s found node's key %s",
-			DBGKEY(key), DBGKEY1(&key1));
-		dbg_dump_node(c, node);
+		dbg_tnck(key, "looked for key ");
+		dbg_tnck(&key1, "but found node's key ");
+		ubifs_dump_node(c, node);
 		return -EINVAL;
 	}
 
diff -ur a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
--- a/fs/ubifs/ubifs.h	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/ubifs.h	2014-02-17 11:56:58.000000000 +0100
@@ -42,16 +42,15 @@
 #define UBIFS_VERSION 1
 
 /* Normal UBIFS messages */
-#define ubifs_msg(fmt, ...) \
-		printk(KERN_NOTICE "UBIFS: " fmt "\n", ##__VA_ARGS__)
+#define ubifs_msg(fmt, ...) pr_notice("UBIFS: " fmt "\n", ##__VA_ARGS__)
 /* UBIFS error messages */
-#define ubifs_err(fmt, ...)                                                  \
-	printk(KERN_ERR "UBIFS error (pid %d): %s: " fmt "\n", current->pid, \
+#define ubifs_err(fmt, ...)                                         \
+	pr_err("UBIFS error (pid %d): %s: " fmt "\n", current->pid, \
 	       __func__, ##__VA_ARGS__)
 /* UBIFS warning messages */
-#define ubifs_warn(fmt, ...)                                         \
-	printk(KERN_WARNING "UBIFS warning (pid %d): %s: " fmt "\n", \
-	       current->pid, __func__, ##__VA_ARGS__)
+#define ubifs_warn(fmt, ...)                                        \
+	pr_warn("UBIFS warning (pid %d): %s: " fmt "\n",            \
+		current->pid, __func__, ##__VA_ARGS__)
 
 /* UBIFS file system VFS magic number */
 #define UBIFS_SUPER_MAGIC 0x24051905
@@ -84,9 +83,6 @@
 #define INUM_WARN_WATERMARK 0xFFF00000
 #define INUM_WATERMARK      0xFFFFFF00
 
-/* Largest key size supported in this implementation */
-#define CUR_MAX_KEY_LEN UBIFS_SK_LEN
-
 /* Maximum number of entries in each LPT (LEB category) heap */
 #define LPT_HEAP_SZ 256
 
@@ -277,10 +273,10 @@
 
 /* The below union makes it easier to deal with keys */
 union ubifs_key {
-	uint8_t u8[CUR_MAX_KEY_LEN];
-	uint32_t u32[CUR_MAX_KEY_LEN/4];
-	uint64_t u64[CUR_MAX_KEY_LEN/8];
-	__le32 j32[CUR_MAX_KEY_LEN/4];
+	uint8_t u8[UBIFS_SK_LEN];
+	uint32_t u32[UBIFS_SK_LEN/4];
+	uint64_t u64[UBIFS_SK_LEN/8];
+	__le32 j32[UBIFS_SK_LEN/4];
 };
 
 /**
@@ -653,8 +649,6 @@
  * @avail: number of bytes available in the write-buffer
  * @used:  number of used bytes in the write-buffer
  * @size: write-buffer size (in [@c->min_io_size, @c->max_write_size] range)
- * @dtype: type of data stored in this LEB (%UBI_LONGTERM, %UBI_SHORTTERM,
- * %UBI_UNKNOWN)
  * @jhead: journal head the mutex belongs to (note, needed only to shut lockdep
  *         up by 'mutex_lock_nested()).
  * @sync_callback: write-buffer synchronization callback
@@ -688,7 +682,6 @@
 	int avail;
 	int used;
 	int size;
-	int dtype;
 	int jhead;
 	int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad);
 	struct mutex io_mutex;
@@ -765,6 +758,9 @@
  * @offs: offset of the corresponding indexing node
  * @len: length  of the corresponding indexing node
  * @zbranch: array of znode branches (@c->fanout elements)
+ *
+ * Note! The @lnum, @offs, and @len fields are not really needed - we have them
+ * only for internal consistency check. They could be removed to save some RAM.
  */
 struct ubifs_znode {
 	struct ubifs_znode *parent;
@@ -775,9 +771,9 @@
 	int child_cnt;
 	int iip;
 	int alt;
-#ifdef CONFIG_UBIFS_FS_DEBUG
-	int lnum, offs, len;
-#endif
+	int lnum;
+	int offs;
+	int len;
 	struct ubifs_zbranch zbranch[];
 };
 
@@ -908,6 +904,7 @@
  * @dnext: next orphan to delete
  * @inum: inode number
  * @new: %1 => added since the last commit, otherwise %0
+ * @cmt: %1 => commit pending, otherwise %0
  * @del: %1 => delete pending, otherwise %0
  */
 struct ubifs_orphan {
@@ -917,7 +914,8 @@
 	struct ubifs_orphan *cnext;
 	struct ubifs_orphan *dnext;
 	ino_t inum;
-	int new;
+	unsigned new:1;
+	unsigned cmt:1;
 	unsigned del:1;
 };
 
@@ -1452,9 +1450,7 @@
 	struct rb_root size_tree;
 	struct ubifs_mount_opts mount_opts;
 
-#ifdef CONFIG_UBIFS_FS_DEBUG
 	struct ubifs_debug_info *dbg;
-#endif
 };
 
 extern struct list_head ubifs_infos;
@@ -1476,22 +1472,20 @@
 int ubifs_leb_read(const struct ubifs_info *c, int lnum, void *buf, int offs,
 		   int len, int even_ebadmsg);
 int ubifs_leb_write(struct ubifs_info *c, int lnum, const void *buf, int offs,
-		    int len, int dtype);
-int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len,
-		     int dtype);
+		    int len);
+int ubifs_leb_change(struct ubifs_info *c, int lnum, const void *buf, int len);
 int ubifs_leb_unmap(struct ubifs_info *c, int lnum);
-int ubifs_leb_map(struct ubifs_info *c, int lnum, int dtype);
+int ubifs_leb_map(struct ubifs_info *c, int lnum);
 int ubifs_is_mapped(const struct ubifs_info *c, int lnum);
 int ubifs_wbuf_write_nolock(struct ubifs_wbuf *wbuf, void *buf, int len);
-int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs,
-			   int dtype);
+int ubifs_wbuf_seek_nolock(struct ubifs_wbuf *wbuf, int lnum, int offs);
 int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf);
 int ubifs_read_node(const struct ubifs_info *c, void *buf, int type, int len,
 		    int lnum, int offs);
 int ubifs_read_node_wbuf(struct ubifs_wbuf *wbuf, void *buf, int type, int len,
 			 int lnum, int offs);
 int ubifs_write_node(struct ubifs_info *c, void *node, int len, int lnum,
-		     int offs, int dtype);
+		     int offs);
 int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
 		     int offs, int quiet, int must_chk_crc);
 void ubifs_prepare_node(struct ubifs_info *c, void *buf, int len, int pad);
diff -ur a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
--- a/fs/ubifs/xattr.c	2013-08-24 11:36:44.000000000 +0200
+++ b/fs/ubifs/xattr.c	2014-02-17 11:56:58.000000000 +0100
@@ -138,12 +138,11 @@
 	ui = ubifs_inode(inode);
 	ui->xattr = 1;
 	ui->flags |= UBIFS_XATTR_FL;
-	ui->data = kmalloc(size, GFP_NOFS);
+	ui->data = kmemdup(value, size, GFP_NOFS);
 	if (!ui->data) {
 		err = -ENOMEM;
 		goto out_free;
 	}
-	memcpy(ui->data, value, size);
 	inode->i_size = ui->ui_size = size;
 	ui->data_len = size;
 
@@ -204,12 +203,11 @@
 		return err;
 
 	kfree(ui->data);
-	ui->data = kmalloc(size, GFP_NOFS);
+	ui->data = kmemdup(value, size, GFP_NOFS);
 	if (!ui->data) {
 		err = -ENOMEM;
 		goto out_free;
 	}
-	memcpy(ui->data, value, size);
 	inode->i_size = ui->ui_size = size;
 	ui->data_len = size;
 
@@ -401,8 +399,8 @@
 	if (buf) {
 		/* If @buf is %NULL we are supposed to return the length */
 		if (ui->data_len > size) {
-			dbg_err("buffer size %zd, xattr len %d",
-				size, ui->data_len);
+			ubifs_err("buffer size %zd, xattr len %d",
+				  size, ui->data_len);
 			err = -ERANGE;
 			goto out_iput;
 		}
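
Two of the ubifs xattr.c hunks above collapse an open-coded allocate-then-copy pair into kmemdup(). Just to restate the equivalence (the buffer, size and error path mirror the hunk itself; nothing new is introduced):

	/* before: allocate, then copy the caller's value */
	ui->data = kmalloc(size, GFP_NOFS);
	if (!ui->data) {
		err = -ENOMEM;
		goto out_free;
	}
	memcpy(ui->data, value, size);

	/* after: kmemdup() allocates and copies in one step */
	ui->data = kmemdup(value, size, GFP_NOFS);
	if (!ui->data) {
		err = -ENOMEM;
		goto out_free;
	}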
diff -ur a/fs/udf/super.c b/fs/udf/super.c
--- a/fs/udf/super.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/udf/super.c	2014-02-17 11:56:55.000000000 +0100
@@ -169,6 +169,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(udf_inode_cachep);
 }
 
diff -ur a/fs/ufs/super.c b/fs/ufs/super.c
--- a/fs/ufs/super.c	2013-08-24 11:36:43.000000000 +0200
+++ b/fs/ufs/super.c	2014-02-17 11:56:55.000000000 +0100
@@ -1454,6 +1454,11 @@
 
 static void destroy_inodecache(void)
 {
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy cache.
+	 */
+	rcu_barrier();
 	kmem_cache_destroy(ufs_inode_cachep);
 }
 
diff -ur a/fs/utimes.c b/fs/utimes.c
--- a/fs/utimes.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/utimes.c	2014-02-17 11:57:00.000000000 +0100
@@ -11,6 +11,10 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
+#ifdef CONFIG_FS_SYNO_ACL
+#include "synoacl_int.h"
+#endif
+
 #ifdef __ARCH_WANT_SYS_UTIME
 
 /*
@@ -54,39 +58,43 @@
 	int error;
 	struct path path;
 	struct inode *inode = NULL;
-	struct iattr newattrs;
+	struct timespec time;
 
 	if (!pCtime) {
 		return -EINVAL;
 	}
+	error = -EFAULT;
+	if (copy_from_user(&time, pCtime, sizeof(struct timespec)))
+		goto out;
 
 	error = user_path_at(AT_FDCWD, filename, LOOKUP_FOLLOW, &path);
 	if (error)
 		goto out;
-	inode = path.dentry->d_inode;
-
-	error = -EROFS;
-	if (IS_RDONLY(inode))
-		goto dput_and_out;
 
-	error = copy_from_user(&newattrs.ia_ctime, pCtime, sizeof(struct timespec));
+	error = mnt_want_write(path.mnt);
 	if (error)
 		goto dput_and_out;
 
-	newattrs.ia_valid = ATTR_CREATE_TIME;
-	mutex_lock(&inode->i_mutex);
-	if (inode->i_op && inode->i_op->setattr)  {
-		error = inode->i_op->setattr(path.dentry, &newattrs);
-	} else {
-		error = inode_change_ok(inode, &newattrs);
-		if (!error) {
-			setattr_copy(inode, &newattrs);
-			mark_inode_dirty(inode);
-			error = 0;
+	inode = path.dentry->d_inode;
+	if (!inode_owner_or_capable(inode)) {
+#ifdef CONFIG_FS_SYNO_ACL
+		if (IS_SYNOACL(path.dentry)) {
+			error = synoacl_op_perm(path.dentry, MAY_WRITE_ATTR | MAY_WRITE_EXT_ATTR);
+			if (error)
+				goto drop_write;
+		} else {
+#endif
+		error = -EPERM;
+		goto drop_write;
+#ifdef CONFIG_FS_SYNO_ACL
 		}
+#endif
 	}
-	mutex_unlock(&inode->i_mutex);
 
+	error = syno_op_set_crtime(path.dentry, &time);
+
+drop_write:
+	mnt_drop_write(path.mnt);
 dput_and_out:
 	path_put(&path);
 out:
@@ -150,8 +158,9 @@
 			goto mnt_drop_write_and_out;
 
 #ifdef CONFIG_FS_SYNO_ACL
-		if (IS_SYNOACL(inode)) {
-			if (inode->i_op->syno_permission(path->dentry, MAY_WRITE_ATTR | MAY_WRITE_EXT_ATTR)) {
+		if (IS_SYNOACL(path->dentry)) {
+			error = synoacl_op_perm(path->dentry, MAY_WRITE_ATTR | MAY_WRITE_EXT_ATTR);
+			if (error) {
 				goto mnt_drop_write_and_out;
 			}
 		} else
diff -ur a/fs/xattr.c b/fs/xattr.c
--- a/fs/xattr.c	2013-08-24 11:36:46.000000000 +0200
+++ b/fs/xattr.c	2014-02-17 11:57:01.000000000 +0100
@@ -23,6 +23,7 @@
 
 
 #ifdef CONFIG_FS_SYNO_ACL
+#include "synoacl_int.h"
 #include <linux/syno_acl_xattr_ds.h>
 #endif
 /*
@@ -101,6 +102,11 @@
 	if (issec)
 		inode->i_flags &= ~S_NOSEC;
 	if (inode->i_op->setxattr) {
+#ifdef CONFIG_FS_SYNO_ACL
+		if (0 > (error = synoacl_check_xattr_perm(name, dentry, MAY_WRITE_PERMISSION))) {
+			return error;
+		}
+#endif
 		error = inode->i_op->setxattr(dentry, name, value, size, flags);
 		if (!error) {
 			fsnotify_xattr(dentry);
@@ -117,7 +123,9 @@
 
 	return error;
 }
-
+#ifdef CONFIG_FS_SYNO_ACL
+EXPORT_SYMBOL(__vfs_setxattr_noperm);
+#endif
 
 int
 vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
@@ -247,7 +255,7 @@
 		int cmd = SYNO_ACL_INHERITED;
 
 		if (!strcmp(name, SYNO_ACL_XATTR_INHERIT)) {
-			if (!IS_SYNOACL(inode)) {
+			if (!IS_SYNOACL(dentry)) {
 				return -EOPNOTSUPP;
 			}
 			cmd = SYNO_ACL_INHERITED;
@@ -256,14 +264,12 @@
 			cmd = SYNO_ACL_PSEUDO_INHERIT_ONLY;
 		}
 
-		if (!inode->i_op->syno_permission || !inode->i_op->syno_acl_get) {
-			return -EOPNOTSUPP;
-		}
-		error = inode->i_op->syno_permission(dentry, MAY_READ_PERMISSION);
+		error = synoacl_op_perm(dentry, MAY_READ_PERMISSION);
 		if (error) {
 			return error;
 		}
-		return inode->i_op->syno_acl_get(dentry, cmd, value, size);
+
+		return synoacl_op_xattr_get(dentry, cmd, value, size);
 	}
 #endif //CONFIG_FS_SYNO_ACL
 	if (!strncmp(name, XATTR_SECURITY_PREFIX,
@@ -279,10 +285,18 @@
 		return ret;
 	}
 nolsm:
-	if (inode->i_op->getxattr)
+	if (inode->i_op->getxattr) {
+
+#ifdef CONFIG_FS_SYNO_ACL
+		if (0 > (error = synoacl_check_xattr_perm(name, dentry, MAY_READ_PERMISSION))) {
+			return error;
+		}
+#endif
 		error = inode->i_op->getxattr(dentry, name, value, size);
-	else
+	}
+	else {
 		error = -EOPNOTSUPP;
+	}
 
 	return error;
 }
@@ -325,6 +339,11 @@
 	if (error)
 		return error;
 
+#ifdef CONFIG_FS_SYNO_ACL
+	if (0 > (error = synoacl_check_xattr_perm(name, dentry, MAY_WRITE_PERMISSION))) {
+		return error;
+	}
+#endif
 	mutex_lock(&inode->i_mutex);
 	error = inode->i_op->removexattr(dentry, name);
 	mutex_unlock(&inode->i_mutex);
@@ -711,23 +730,6 @@
 {
 	const struct xattr_handler *handler;
 
-#ifdef CONFIG_FS_SYNO_ACL
-	if (name && !strcmp(name, SYNO_ACL_XATTR_ACCESS)) {
-		int error = 0;
-		struct inode *inode = dentry->d_inode;
-
-		if (!IS_SYNOACL(inode)) {
-			return -EOPNOTSUPP;
-		}
-		if (!inode->i_op->syno_permission) {
-			return -EOPNOTSUPP;
-		}
-		error = inode->i_op->syno_permission(dentry, MAY_READ_PERMISSION);
-		if (error) {
-			return error;
-		}
-	}
-#endif
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (!handler)
 		return -EOPNOTSUPP;
@@ -776,20 +778,6 @@
 	if (size == 0)
 		value = "";  /* empty EA, do not remove */
 
-#ifdef CONFIG_FS_SYNO_ACL
-	if (strcmp_prefix(name, SYNO_ACL_XATTR_ACCESS)) {
-		int error = -1;
-		struct inode *inode = dentry->d_inode;
-
-		if (!IS_FS_SYNOACL(inode) || !inode->i_op->syno_permission) {
-			return -EOPNOTSUPP;
-		}
-		error = inode->i_op->syno_permission(dentry, MAY_WRITE_PERMISSION);
-		if (error) {
-			return error;
-		}
-	}
-#endif
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (!handler)
 		return -EOPNOTSUPP;
@@ -805,20 +793,6 @@
 {
 	const struct xattr_handler *handler;
 
-#ifdef CONFIG_FS_SYNO_ACL
-	if (strcmp_prefix(name, SYNO_ACL_XATTR_ACCESS)) {
-		int error = -1;
-		struct inode *inode = dentry->d_inode;
-
-		if (!IS_FS_SYNOACL(inode) || !inode->i_op->syno_permission) {
-			return -EOPNOTSUPP;
-		}
-		error = inode->i_op->syno_permission(dentry, MAY_WRITE_PERMISSION);
-		if (error) {
-			return error;
-		}
-	}
-#endif
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
 	if (!handler)
 		return -EOPNOTSUPP;
@@ -830,29 +804,3 @@
 EXPORT_SYMBOL(generic_listxattr);
 EXPORT_SYMBOL(generic_setxattr);
 EXPORT_SYMBOL(generic_removexattr);
-
-#ifdef MY_ABC_HERE
-/*
- * Find the handler for the prefix and dispatch its set() operation.
- */
-int
-syno_generic_setxattr(struct inode *target, const char *name, const void *value, size_t size, int flags)
-{
-	const struct xattr_handler *handler;
-
-	if (size == 0)
-		value = "";  /* empty EA, do not remove */
-
-	handler = xattr_resolve_name(target->i_sb->s_xattr, &name);
-	if (!handler)
-		return -EOPNOTSUPP;
-
-	if (!handler->set_compact_syno) {
-		return -EOPNOTSUPP;
-	}
-	return handler->set_compact_syno(target, name, value, size, 0, handler->flags);
-}
-
-EXPORT_SYMBOL(syno_generic_setxattr);
-
-#endif
diff -ur a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
--- a/fs/xfs/xfs_buf.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/xfs/xfs_buf.c	2014-02-17 11:56:59.000000000 +0100
@@ -1713,7 +1713,7 @@
 
 		if (unlikely(freezing(current))) {
 			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
-			refrigerator();
+			try_to_freeze();
 		} else {
 			clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 		}
diff -ur a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
--- a/fs/xfs/xfs_file.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/xfs/xfs_file.c	2014-02-17 11:56:59.000000000 +0100
@@ -769,8 +769,11 @@
 		return error;
 	}
 
-	if (likely(!(file->f_mode & FMODE_NOCMTIME)))
-		file_update_time(file);
+	if (likely(!(file->f_mode & FMODE_NOCMTIME))) {
+		error = file_update_time(file);
+		if (error)
+			return error;
+	}
 
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
diff -ur a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
--- a/fs/xfs/xfs_super.c	2013-08-24 11:36:45.000000000 +0200
+++ b/fs/xfs/xfs_super.c	2014-02-17 11:56:59.000000000 +0100
@@ -1599,6 +1599,11 @@
 STATIC void
 xfs_destroy_zones(void)
 {
+	/*
+	 * Make sure all delayed rcu free are flushed before we
+	 * destroy caches.
+	 */
+	rcu_barrier();
 	kmem_zone_destroy(xfs_ili_zone);
 	kmem_zone_destroy(xfs_inode_zone);
 	kmem_zone_destroy(xfs_efi_zone);
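
The udf, ufs and xfs hunks above all add the same guard before tearing down an inode cache: these filesystems hand their inodes back through call_rcu(), so kmem_cache_destroy() must not run until every pending RCU callback has returned its object to the cache. A minimal sketch of the pattern (kernel context; names such as example_inode_cachep and example_i_callback are illustrative, not taken from the patch, and the callback is simplified to free struct inode itself):

	static struct kmem_cache *example_inode_cachep;

	/* inodes go back to the cache from an RCU callback */
	static void example_i_callback(struct rcu_head *head)
	{
		struct inode *inode = container_of(head, struct inode, i_rcu);

		kmem_cache_free(example_inode_cachep, inode);
	}

	static void example_destroy_inode(struct inode *inode)
	{
		call_rcu(&inode->i_rcu, example_i_callback);
	}

	static void destroy_inodecache(void)
	{
		/*
		 * Wait for all queued example_i_callback() invocations to
		 * finish; only then is it safe to destroy the cache they
		 * free into.
		 */
		rcu_barrier();
		kmem_cache_destroy(example_inode_cachep);
	}

Without the rcu_barrier(), a callback could still be queued when kmem_cache_destroy() runs and would free into a cache that no longer exists.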
diff -ur a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
--- a/include/asm-generic/vmlinux.lds.h	2013-08-24 11:36:07.000000000 +0200
+++ b/include/asm-generic/vmlinux.lds.h	2014-02-17 11:56:00.000000000 +0100
@@ -52,6 +52,33 @@
 #define LOAD_OFFSET 0
 #endif
 
+#if defined(CONFIG_SYNO_COMCERTO)
+#ifndef SYMTAB_KEEP_STR
+#define SYMTAB_KEEP_STR *(__ksymtab_strings+*)
+#define SYMTAB_DISCARD_STR
+#else
+#define SYMTAB_DISCARD_STR *(__ksymtab_strings+*)
+#endif
+
+#ifndef SYMTAB_KEEP
+#define SYMTAB_KEEP *(SORT(___ksymtab+*))
+#define SYMTAB_DISCARD
+#else
+#define SYMTAB_DISCARD *(SORT(___ksymtab+*))
+#endif
+
+#ifndef SYMTAB_KEEP_GPL
+#define SYMTAB_KEEP_GPL *(SORT(___ksymtab_gpl+*))
+#define SYMTAB_DISCARD_GPL
+#else
+#define SYMTAB_DISCARD_GPL *(SORT(___ksymtab_gpl+*))
+#endif
+#else /* !defined(CONFIG_SYNO_COMCERTO) */
+#define SYMTAB_KEEP_STR *(__ksymtab_strings+*)
+#define SYMTAB_KEEP *(SORT(___ksymtab+*))
+#define SYMTAB_KEEP_GPL *(SORT(___ksymtab_gpl+*))
+#endif
+
 #ifndef SYMBOL_PREFIX
 #define VMLINUX_SYMBOL(sym) sym
 #else
@@ -275,14 +302,14 @@
 	/* Kernel symbol table: Normal symbols */			\
 	__ksymtab         : AT(ADDR(__ksymtab) - LOAD_OFFSET) {		\
 		VMLINUX_SYMBOL(__start___ksymtab) = .;			\
-		*(SORT(___ksymtab+*))					\
+		SYMTAB_KEEP						\
 		VMLINUX_SYMBOL(__stop___ksymtab) = .;			\
 	}								\
 									\
 	/* Kernel symbol table: GPL-only symbols */			\
 	__ksymtab_gpl     : AT(ADDR(__ksymtab_gpl) - LOAD_OFFSET) {	\
 		VMLINUX_SYMBOL(__start___ksymtab_gpl) = .;		\
-		*(SORT(___ksymtab_gpl+*))				\
+		SYMTAB_KEEP_GPL						\
 		VMLINUX_SYMBOL(__stop___ksymtab_gpl) = .;		\
 	}								\
 									\
@@ -344,7 +371,7 @@
 									\
 	/* Kernel symbol table: strings */				\
         __ksymtab_strings : AT(ADDR(__ksymtab_strings) - LOAD_OFFSET) {	\
-		*(__ksymtab_strings)					\
+		SYMTAB_KEEP_STR						\
 	}								\
 									\
 	/* __*init sections */						\
@@ -615,30 +642,23 @@
 		*(.init.setup)						\
 		VMLINUX_SYMBOL(__setup_end) = .;
 
-#define INITCALLS							\
-	*(.initcallearly.init)						\
-	VMLINUX_SYMBOL(__early_initcall_end) = .;			\
-  	*(.initcall0.init)						\
-  	*(.initcall0s.init)						\
-  	*(.initcall1.init)						\
-  	*(.initcall1s.init)						\
-  	*(.initcall2.init)						\
-  	*(.initcall2s.init)						\
-  	*(.initcall3.init)						\
-  	*(.initcall3s.init)						\
-  	*(.initcall4.init)						\
-  	*(.initcall4s.init)						\
-  	*(.initcall5.init)						\
-  	*(.initcall5s.init)						\
-	*(.initcallrootfs.init)						\
-  	*(.initcall6.init)						\
-  	*(.initcall6s.init)						\
-  	*(.initcall7.init)						\
-  	*(.initcall7s.init)
+#define INIT_CALLS_LEVEL(level)						\
+		VMLINUX_SYMBOL(__initcall##level##_start) = .;		\
+		*(.initcall##level##.init)				\
+		*(.initcall##level##s.init)				\
 
 #define INIT_CALLS							\
 		VMLINUX_SYMBOL(__initcall_start) = .;			\
-		INITCALLS						\
+		*(.initcallearly.init)					\
+		INIT_CALLS_LEVEL(0)					\
+		INIT_CALLS_LEVEL(1)					\
+		INIT_CALLS_LEVEL(2)					\
+		INIT_CALLS_LEVEL(3)					\
+		INIT_CALLS_LEVEL(4)					\
+		INIT_CALLS_LEVEL(5)					\
+		INIT_CALLS_LEVEL(rootfs)				\
+		INIT_CALLS_LEVEL(6)					\
+		INIT_CALLS_LEVEL(7)					\
 		VMLINUX_SYMBOL(__initcall_end) = .;
 
 #define CON_INITCALL							\
@@ -671,14 +691,28 @@
  * section definitions so that such archs put those in earlier section
  * definitions.
  */
+#if defined(CONFIG_SYNO_COMCERTO)
 #define DISCARDS							\
 	/DISCARD/ : {							\
 	EXIT_TEXT							\
 	EXIT_DATA							\
 	EXIT_CALL							\
+	SYMTAB_DISCARD							\
+	SYMTAB_DISCARD_GPL						\
+	SYMTAB_DISCARD_STR						\
 	*(.discard)							\
 	*(.discard.*)							\
 	}
+#else
+#define DISCARDS							\
+	/DISCARD/ : {							\
+	EXIT_TEXT							\
+	EXIT_DATA							\
+	EXIT_CALL							\
+	*(.discard)							\
+	*(.discard.*)							\
+	}
+#endif
 
 /**
  * PERCPU_INPUT - the percpu input sections
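
The INIT_CALLS_LEVEL() helper introduced above replaces the flat INITCALLS list and additionally emits a per-level start symbol. Purely as a one-level expansion of the macro defined in the hunk (leaving VMLINUX_SYMBOL() as is), INIT_CALLS_LEVEL(5) becomes:

	VMLINUX_SYMBOL(__initcall5_start) = .;
	*(.initcall5.init)
	*(.initcall5s.init)

so every initcall level gets its own start marker in addition to the overall __initcall_start/__initcall_end pair, while the placement of the .initcallN.init and .initcallNs.init input sections stays the same as before.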
diff -ur a/include/linux/ata.h b/include/linux/ata.h
--- a/include/linux/ata.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/ata.h	2014-02-17 11:56:04.000000000 +0100
@@ -450,10 +450,6 @@
 	ATA_TFLAG_LBA		= (1 << 4), /* enable LBA */
 	ATA_TFLAG_FUA		= (1 << 5), /* enable FUA */
 	ATA_TFLAG_POLLING	= (1 << 6), /* set nIEN to 1 and use polling */
-#ifdef MY_ABC_HERE
-	/* send cmd directly not through ata work queue to prevent timeout issue */
-	ATA_TFLAG_DIRECT    = (1 << 7), 
-#endif
 
 	/* protocol flags */
 	ATA_PROT_FLAG_PIO	= (1 << 0), /* is PIO */
diff -ur a/include/linux/backing-dev.h b/include/linux/backing-dev.h
--- a/include/linux/backing-dev.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/backing-dev.h	2014-02-17 11:56:05.000000000 +0100
@@ -108,6 +108,10 @@
 	struct dentry *debug_dir;
 	struct dentry *debug_stats;
 #endif
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+	unsigned int cpu0_bind;
+#endif
 };
 
 int bdi_init(struct backing_dev_info *bdi);
diff -ur a/include/linux/blk_types.h b/include/linux/blk_types.h
--- a/include/linux/blk_types.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/blk_types.h	2014-02-17 11:56:04.000000000 +0100
@@ -98,6 +98,13 @@
 #ifdef MY_ABC_HERE
 #define BIO_AUTO_REMAP 12	/* record if auto-remap occurred */
 #endif
+#ifdef MY_ABC_HERE
+/*
+ * Currently, our RAID1 device won't return an error from make_request() when the RAID1 is crashed,
+ * so we add this flag to tell the md layer that it should return an error for flashcache devices.
+ */
+#define BIO_MD_RETURN_ERROR 13
+#endif
 #define bio_flagged(bio, flag)	((bio)->bi_flags & (1 << (flag)))
 
 /*
Only in b/include/linux: btrfs.h.
Only in b/include/linux: c2k-devfreq.h.
diff -ur a/include/linux/compat.h b/include/linux/compat.h
--- a/include/linux/compat.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/compat.h	2014-02-17 11:56:07.000000000 +0100
@@ -543,16 +543,6 @@
 asmlinkage long compat_sys_socketcall(int call, u32 __user *args);
 asmlinkage long compat_sys_sysctl(struct compat_sysctl_args __user *args);
 
-#ifdef MY_ABC_HERE
-typedef struct compat_tag_mmap_arg_struct {
-	u32 addr;
-	u32 len;
-	u32 prot;
-	u32 flags;
-	u32 fd;
-	u32 pgoff;
-} compat_SYNO_MMAP_ARG;
-#endif
 extern ssize_t compat_rw_copy_check_uvector(int type,
 		const struct compat_iovec __user *uvector,
 		unsigned long nr_segs,
Only in b/include/linux: crashlog.h.
Only in b/include/linux/decompress: unlzo_mm.h.
diff -ur a/include/linux/dm-io.h b/include/linux/dm-io.h
--- a/include/linux/dm-io.h	2013-08-24 11:36:09.000000000 +0200
+++ b/include/linux/dm-io.h	2014-02-17 11:56:03.000000000 +0100
@@ -13,6 +13,7 @@
 #ifdef __KERNEL__
 
 #include <linux/types.h>
+#include <linux/syno.h>
 
 struct dm_io_region {
 	struct block_device *bdev;
@@ -77,6 +78,10 @@
  * Each bit in the optional 'sync_error_bits' bitset indicates whether an
  * error occurred doing io to the corresponding region.
  */
+#ifdef MY_ABC_HERE
+int syno_dm_io(struct dm_io_request *io_req, unsigned num_regions,
+	  struct dm_io_region *region, unsigned long *sync_error_bits);
+#endif
 int dm_io(struct dm_io_request *io_req, unsigned num_regions,
 	  struct dm_io_region *region, unsigned long *sync_error_bits);
 
diff -ur a/include/linux/export.h b/include/linux/export.h
--- a/include/linux/export.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/export.h	2014-02-17 11:56:06.000000000 +0100
@@ -45,7 +45,28 @@
 #define __CRC_SYMBOL(sym, sec)
 #endif
 
+#if defined(CONFIG_SYNO_COMCERTO)
+#ifdef MODULE
+#define __EXPORT_SUFFIX(sym)
+#else
+#define __EXPORT_SUFFIX(sym) "+" #sym
+#endif
+#endif
+
 /* For every exported symbol, place a struct in the __ksymtab section */
+#if defined(CONFIG_SYNO_COMCERTO)
+#define __EXPORT_SYMBOL(sym, sec)				\
+	extern typeof(sym) sym;					\
+	__CRC_SYMBOL(sym, sec)					\
+	static const char __kstrtab_##sym[]			\
+	__attribute__((section("__ksymtab_strings"		\
+	  __EXPORT_SUFFIX(sym)), aligned(1)))			\
+	= MODULE_SYMBOL_PREFIX #sym;				\
+	static const struct kernel_symbol __ksymtab_##sym	\
+	__used							\
+	__attribute__((section("___ksymtab" sec "+" #sym), unused))	\
+	= { (unsigned long)&sym, __kstrtab_##sym }
+#else
 #define __EXPORT_SYMBOL(sym, sec)				\
 	extern typeof(sym) sym;					\
 	__CRC_SYMBOL(sym, sec)					\
@@ -56,6 +77,7 @@
 	__used							\
 	__attribute__((section("___ksymtab" sec "+" #sym), unused))	\
 	= { (unsigned long)&sym, __kstrtab_##sym }
+#endif
 
 #define EXPORT_SYMBOL(sym)					\
 	__EXPORT_SYMBOL(sym, "")
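
The __EXPORT_SYMBOL() variant added for CONFIG_SYNO_COMCERTO differs from the stock one by appending __EXPORT_SUFFIX(sym) to the __ksymtab_strings section name (empty for modules, "+<sym>" for built-in code); the ___ksymtab entries were already placed in per-symbol sections, as the unchanged else branch shows. Each exported symbol's name string therefore lands in its own section, which is what lets the SYMTAB_KEEP_STR/SYMTAB_DISCARD_STR patterns added to vmlinux.lds.h above keep or discard export strings one symbol at a time. A rough expansion sketch, assuming a built-in (non-MODULE) object, no CONFIG_MODVERSIONS and an empty MODULE_SYMBOL_PREFIX; "foo" is an illustrative symbol name:

	/* EXPORT_SYMBOL(foo) then becomes roughly: */
	extern typeof(foo) foo;
	static const char __kstrtab_foo[]
		__attribute__((section("__ksymtab_strings" "+foo"), aligned(1)))
		= "foo";
	static const struct kernel_symbol __ksymtab_foo
		__used
		__attribute__((section("___ksymtab" "+foo"), unused))
		= { (unsigned long)&foo, __kstrtab_foo };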
diff -ur a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
--- a/include/linux/ext3_fs.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/ext3_fs.h	2014-02-17 11:56:05.000000000 +0100
@@ -537,7 +537,7 @@
 #if defined(MY_ABC_HERE) || defined (MY_ABC_HERE)
 	__u32	s_reserved[159];	/* Padding to the end of the block */
 	__le32	s_archive_version;	/* Last archived version */
-	__le32	s_syno_reserved;
+	__le32	s_archive_version_obsoleted;
 	__le32  s_syno_hash_magic;	/* Enable Htree if the magic is given */
 #else
 	__u32   s_reserved[162];        /* Padding to the end of the block */
@@ -940,6 +940,13 @@
 extern void ext3_set_aops(struct inode *inode);
 extern int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		       u64 start, u64 len);
+#ifdef MY_ABC_HERE
+extern int syno_ext3_getattr(struct dentry *d, struct kstat *stat, int flags);
+#endif
+#ifdef MY_ABC_HERE
+extern int syno_ext3_get_archive_ver(struct dentry *, u32 *);
+extern int syno_ext3_set_archive_ver(struct dentry *, u32);
+#endif
 
 /* ioctl.c */
 extern long ext3_ioctl(struct file *, unsigned int, unsigned long);
diff -ur a/include/linux/freezer.h b/include/linux/freezer.h
--- a/include/linux/freezer.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/freezer.h	2014-02-17 11:56:05.000000000 +0100
@@ -47,19 +47,18 @@
 /* Takes and releases task alloc lock using task_lock() */
 extern int thaw_process(struct task_struct *p);
 
-extern void refrigerator(void);
+extern bool __refrigerator(void);
 extern int freeze_processes(void);
 extern int freeze_kernel_threads(void);
 extern void thaw_processes(void);
 extern void thaw_kernel_threads(void);
 
-static inline int try_to_freeze(void)
+static inline bool try_to_freeze(void)
 {
-	if (freezing(current)) {
-		refrigerator();
-		return 1;
-	} else
-		return 0;
+	might_sleep();
+	if (likely(!freezing(current)))
+		return false;
+	return __refrigerator();
 }
 
 extern bool freeze_task(struct task_struct *p, bool sig_only);
@@ -213,13 +212,13 @@
 static inline void clear_freeze_flag(struct task_struct *p) {}
 static inline int thaw_process(struct task_struct *p) { return 1; }
 
-static inline void refrigerator(void) {}
+static inline bool __refrigerator(void) { return false; }
 static inline int freeze_processes(void) { return -ENOSYS; }
 static inline int freeze_kernel_threads(void) { return -ENOSYS; }
 static inline void thaw_processes(void) {}
 static inline void thaw_kernel_threads(void) {}
 
-static inline int try_to_freeze(void) { return 0; }
+static inline bool try_to_freeze(void) { return false; }
 
 static inline void freezer_do_not_count(void) {}
 static inline void freezer_count(void) {}
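
With this change try_to_freeze() performs the freezing(current) check itself and parks the caller in __refrigerator() when needed; the old refrigerator() entry point is gone, which is why the xfs_buf.c hunk above now calls try_to_freeze() instead. A minimal sketch of how a freezable kernel thread would typically use the new helper (the thread body and its sleep are illustrative; assumes <linux/kthread.h> and <linux/freezer.h>):

	static int example_thread(void *data)
	{
		set_freezable();

		while (!kthread_should_stop()) {
			/*
			 * If the freezer is active this parks the thread in
			 * __refrigerator() and returns true once it has been
			 * thawed; otherwise it returns false immediately.
			 */
			if (try_to_freeze())
				continue;

			/* ... do the real work, then sleep ... */
			schedule_timeout_interruptible(HZ);
		}
		return 0;
	}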
diff -ur a/include/linux/fs.h b/include/linux/fs.h
--- a/include/linux/fs.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/fs.h	2014-02-17 11:56:07.000000000 +0100
@@ -15,9 +15,6 @@
 #include <linux/net.h>
 #endif /* MY_ABC_HERE */
 
-#ifdef CONFIG_FS_SYNO_ACL
-#include <linux/syno_acl.h>
-#endif
 /*
  * It's silly to have NR_OPEN bigger than NR_FILE, but you can change
  * the file limit at runtime and only root can increase the per-process
@@ -249,6 +246,12 @@
 #define MS_ACTIVE	(1<<30)
 #define MS_NOUSER	(1<<31)
 
+#ifdef MY_ABC_HERE
+#define SYNO_MS_META_XATTR	(1<<0) /* store arbit/crtime and other extra syno metadata in xattr */
+
+#define __IS_SYNO_FLG(inode,flg) ((inode)->i_sb->s_syno_opt & (flg))
+#define IS_SYNO_META_XATTR(inode) __IS_SYNO_FLG(inode, SYNO_MS_META_XATTR)
+#endif
 /*
  * Superblock flags that can be altered by MS_REMOUNT
  */
@@ -276,6 +279,10 @@
 #define S_AUTOMOUNT	2048	/* Automount/referral quasi-directory */
 #define S_NOSEC		4096	/* no suid or xattr security attributes */
 
+#ifdef MY_ABC_HERE
+#define S_ARCHIVE_VERSION_CACHED 0x80000000
+#endif
+
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
  * flags just means all the inodes inherit those flags by default. It might be
@@ -305,22 +312,6 @@
 #define IS_IMMUTABLE(inode)	((inode)->i_flags & S_IMMUTABLE)
 #define IS_POSIXACL(inode)	__IS_FLG(inode, MS_POSIXACL)
 
-#ifdef CONFIG_FS_SYNO_ACL
-#define IS_SMB_READONLY(inode)	((inode)->i_mode2 & S2_SMB_READONLY)
-#define IS_SYNOACL_SUPERUSER() (0 == current_fsuid())
-
-#define IS_INODE_SYNOACL(inode)	((inode)->i_mode2 & S2_SYNO_ACL_SUPPORT)
-#define IS_FS_SYNOACL(inode)	__IS_FLG(inode, MS_SYNOACL)
-#define IS_SYNOACL(inode)	(IS_INODE_SYNOACL(inode) && IS_FS_SYNOACL(inode))
-
-#define IS_SYNOACL_INHERIT(inode)	((inode)->i_mode2 & S2_SYNO_ACL_INHERIT)
-#define IS_SYNOACL_OWNER_IS_GROUP(inode)	((inode)->i_mode2 & S2_SYNO_ACL_IS_OWNER_GROUP)
-#define IS_SYNOACL_EXIST(inode)	((inode)->i_mode2 & S2_SYNO_ACL_EXIST)
-#define HAS_SYNOACL(inode) ((IS_SYNOACL_EXIST(inode) || IS_SYNOACL_INHERIT(inode)))
-#define is_synoacl_owner(inode)	IS_SYNOACL_OWNER_IS_GROUP(inode)?in_group_p(inode->i_gid):(inode->i_uid == current_fsuid())
-#define is_synoacl_owner_or_capable(inode) (is_synoacl_owner(inode) || capable(CAP_FOWNER))
-
-#endif /* CONFIG_FS_SYNO_ACL */
 #define IS_DEADDIR(inode)	((inode)->i_flags & S_DEAD)
 #define IS_NOCMTIME(inode)	((inode)->i_flags & S_NOCMTIME)
 #define IS_SWAPFILE(inode)	((inode)->i_flags & S_SWAPFILE)
@@ -328,6 +319,9 @@
 #define IS_IMA(inode)		((inode)->i_flags & S_IMA)
 #define IS_AUTOMOUNT(inode)	((inode)->i_flags & S_AUTOMOUNT)
 #define IS_NOSEC(inode)		((inode)->i_flags & S_NOSEC)
+#ifdef MY_ABC_HERE
+#define IS_ARCHIVE_VERSION_CACHED(inode) ((inode)->i_flags & S_ARCHIVE_VERSION_CACHED)
+#endif
 
 /* the read-only stuff doesn't really belong here, but any other place is
    probably as bad and I don't want to create yet another include file. */
@@ -457,6 +451,7 @@
 #include <linux/rculist_bl.h>
 #include <linux/atomic.h>
 #include <linux/shrinker.h>
+#include <linux/lockdep.h>
 
 #include <asm/byteorder.h>
 
@@ -514,9 +509,6 @@
 #define ATTR_KILL_PRIV	(1 << 14)
 #define ATTR_OPEN	(1 << 15) /* Truncating from open(O_TRUNC) */
 #define ATTR_TIMES_SET	(1 << 16)
-#ifdef MY_ABC_HERE
-#define ATTR_CREATE_TIME (1 << 17)
-#endif
 
 /*
  * This is the Inode Attributes structure, used for notify_change().  It
@@ -809,6 +801,9 @@
 #endif
 
 struct posix_acl;
+#ifdef CONFIG_FS_SYNO_ACL
+struct syno_acl;
+#endif
 #define ACL_NOT_CACHED ((void *)(-1))
 
 #define IOP_FASTPERM	0x0001
@@ -1479,6 +1474,8 @@
 extern pid_t f_getown(struct file *filp);
 extern int send_sigurg(struct fown_struct *fown);
 
+struct mm_struct;
+
 /*
  *	Umount options
  */
@@ -1492,6 +1489,31 @@
 extern struct list_head super_blocks;
 extern spinlock_t sb_lock;
 
+/* Possible states of 'frozen' field */
+enum {
+	SB_UNFROZEN = 0,		/* FS is unfrozen */
+	SB_FREEZE_WRITE	= 1,		/* Writes, dir ops, ioctls frozen */
+	SB_FREEZE_PAGEFAULT = 2,	/* Page faults stopped as well */
+	SB_FREEZE_FS = 3,		/* For internal FS use (e.g. to stop
+					 * internal threads if needed) */
+	SB_FREEZE_COMPLETE = 4,		/* ->freeze_fs finished successfully */
+};
+
+#define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1)
+
+struct sb_writers {
+	/* Counters for counting writers at each level */
+	struct percpu_counter	counter[SB_FREEZE_LEVELS];
+	wait_queue_head_t	wait;		/* queue for waiting for
+						   writers / faults to finish */
+	int			frozen;		/* Is sb frozen? */
+	wait_queue_head_t	wait_unfrozen;	/* queue for waiting for
+						   sb to be thawed */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	struct lockdep_map	lock_map[SB_FREEZE_LEVELS];
+#endif
+};
+
 struct super_block {
 	struct list_head	s_list;		/* Keep this first */
 	dev_t			s_dev;		/* search index; _not_ kdev_t */
@@ -1538,8 +1560,7 @@
 	struct list_head	s_instances;
 	struct quota_info	s_dquot;	/* Diskquota specific options */
 
-	int			s_frozen;
-	wait_queue_head_t	s_wait_unfrozen;
+	struct sb_writers	s_writers;
 
 	char s_id[32];				/* Informational name */
 	u8 s_uuid[16];				/* UUID */
@@ -1585,8 +1606,19 @@
 	int cleancache_poolid;
 
 	struct shrinker s_shrink;	/* per-sb shrinker handle */
+
+	/* Number of inodes with nlink == 0 but still referenced */
+	atomic_long_t s_remove_count;
+#ifdef MY_ABC_HERE
+	u64 	s_syno_opt;
+#endif
 };
 
+#ifdef SYNO_GLUSTER_FS
+#define SZ_FS_GLUSTER	"glusterfs"
+#define IS_GLUSTER_FS(inode) (inode->i_sb->s_subtype && !strcmp(SZ_FS_GLUSTER, inode->i_sb->s_subtype))
+#endif
+
 /* superblock cache pruning functions */
 extern void prune_icache_sb(struct super_block *sb, int nr_to_scan);
 extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan);
@@ -1596,14 +1628,117 @@
 /*
  * Snapshotting support.
  */
-enum {
-	SB_UNFROZEN = 0,
-	SB_FREEZE_WRITE	= 1,
-	SB_FREEZE_TRANS = 2,
-};
 
-#define vfs_check_frozen(sb, level) \
-	wait_event((sb)->s_wait_unfrozen, ((sb)->s_frozen < (level)))
+void __sb_end_write(struct super_block *sb, int level);
+int __sb_start_write(struct super_block *sb, int level, bool wait);
+
+/**
+ * sb_end_write - drop write access to a superblock
+ * @sb: the super we wrote to
+ *
+ * Decrement number of writers to the filesystem. Wake up possible waiters
+ * wanting to freeze the filesystem.
+ */
+static inline void sb_end_write(struct super_block *sb)
+{
+	__sb_end_write(sb, SB_FREEZE_WRITE);
+}
+
+/**
+ * sb_end_pagefault - drop write access to a superblock from a page fault
+ * @sb: the super we wrote to
+ *
+ * Decrement number of processes handling write page fault to the filesystem.
+ * Wake up possible waiters wanting to freeze the filesystem.
+ */
+static inline void sb_end_pagefault(struct super_block *sb)
+{
+	__sb_end_write(sb, SB_FREEZE_PAGEFAULT);
+}
+
+/**
+ * sb_end_intwrite - drop write access to a superblock for internal fs purposes
+ * @sb: the super we wrote to
+ *
+ * Decrement fs-internal number of writers to the filesystem.  Wake up possible
+ * waiters wanting to freeze the filesystem.
+ */
+static inline void sb_end_intwrite(struct super_block *sb)
+{
+	__sb_end_write(sb, SB_FREEZE_FS);
+}
+
+/**
+ * sb_start_write - get write access to a superblock
+ * @sb: the super we write to
+ *
+ * When a process wants to write data or metadata to a file system (i.e. dirty
+ * a page or an inode), it should embed the operation in a sb_start_write() -
+ * sb_end_write() pair to get exclusion against file system freezing. This
+ * function increments number of writers preventing freezing. If the file
+ * system is already frozen, the function waits until the file system is
+ * thawed.
+ *
+ * Since freeze protection behaves as a lock, users have to preserve
+ * ordering of freeze protection and other filesystem locks. Generally,
+ * freeze protection should be the outermost lock. In particular, we have:
+ *
+ * sb_start_write
+ *   -> i_mutex			(write path, truncate, directory ops, ...)
+ *   -> s_umount		(freeze_super, thaw_super)
+ */
+static inline void sb_start_write(struct super_block *sb)
+{
+	__sb_start_write(sb, SB_FREEZE_WRITE, true);
+}
+
+static inline int sb_start_write_trylock(struct super_block *sb)
+{
+	return __sb_start_write(sb, SB_FREEZE_WRITE, false);
+}
+
+/**
+ * sb_start_pagefault - get write access to a superblock from a page fault
+ * @sb: the super we write to
+ *
+ * When a process starts handling write page fault, it should embed the
+ * operation into sb_start_pagefault() - sb_end_pagefault() pair to get
+ * exclusion against file system freezing. This is needed since the page fault
+ * is going to dirty a page. This function increments number of running page
+ * faults preventing freezing. If the file system is already frozen, the
+ * function waits until the file system is thawed.
+ *
+ * Since page fault freeze protection behaves as a lock, users have to preserve
+ * ordering of freeze protection and other filesystem locks. It is advised to
+ * put sb_start_pagefault() close to mmap_sem in lock ordering. Page fault
+ * handling code implies lock dependency:
+ *
+ * mmap_sem
+ *   -> sb_start_pagefault
+ */
+static inline void sb_start_pagefault(struct super_block *sb)
+{
+	__sb_start_write(sb, SB_FREEZE_PAGEFAULT, true);
+}
+
+/*
+ * sb_start_intwrite - get write access to a superblock for internal fs purposes
+ * @sb: the super we write to
+ *
+ * This is the third level of protection against filesystem freezing. It is
+ * free for use by a filesystem. The only requirement is that it must rank
+ * below sb_start_pagefault.
+ *
+ * For example filesystem can call sb_start_intwrite() when starting a
+ * transaction which somewhat eases handling of freezing for internal sources
+ * of filesystem changes (internal fs threads, discarding preallocation on file
+ * close, etc.).
+ */
+static inline void sb_start_intwrite(struct super_block *sb)
+{
+	__sb_start_write(sb, SB_FREEZE_FS, true);
+}
+
 
 /*
  * until VFS tracks user namespaces for inodes, just make all files
@@ -1716,9 +1851,16 @@
 	int (*setlease)(struct file *, long, struct file_lock **);
 	long (*fallocate)(struct file *file, int mode, loff_t offset,
 			  loff_t len);
+#ifdef MY_ABC_HERE
+	ssize_t (*syno_recvfile)(struct file *file, struct socket *sock,
+	                                              loff_t *ppos, size_t count, size_t * rbytes, size_t * wbytes);
+#endif
 };
 
 struct inode_operations {
+#ifdef MY_ABC_HERE
+	int (*syno_getattr)(struct dentry *, struct kstat *, int flags);
+#endif
 	struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *);
 	void * (*follow_link) (struct dentry *, struct nameidata *);
 	int (*permission) (struct inode *, int);
@@ -1738,15 +1880,28 @@
 			struct inode *, struct dentry *);
 	void (*truncate) (struct inode *);
 #ifdef CONFIG_FS_SYNO_ACL
-	int (*syno_acl_get)(struct dentry *, int cmd, void *value, size_t size);
+	struct syno_acl * (*syno_acl_get)(struct inode *);
+	int (*syno_acl_set)(struct inode *, struct syno_acl *);
+	int (*syno_acl_xattr_get)(struct dentry *, int, void *, size_t);
 	int (*syno_permission)(struct dentry *, int);
 	int (*syno_exec_permission)(struct dentry *);
-	int (*syno_access)(struct dentry *, int);
-	int (*syno_permission_get)(struct dentry *, unsigned int *, unsigned int *);
+	int (*syno_acl_access)(struct dentry *, int);
 	int (*syno_inode_change_ok)(struct dentry *, struct iattr *);
+	int (*syno_arbit_chg_ok)(struct dentry *, unsigned int cmd, int tag, int mask);
+	int (*syno_setattr_post)(struct dentry *, struct iattr *);
+	int (*syno_acl_init)(struct dentry *, struct inode *);
+	void (*syno_acl_to_mode)(struct dentry *, struct kstat *);
 #endif /* CONFIG_FS_SYNO_ACL */
 #ifdef MY_ABC_HERE
-	int (*set_archive)(struct dentry *, int);
+	int (*syno_get_archive_bit)(struct dentry *, unsigned int *);
+	int (*syno_set_archive_bit)(struct dentry *, unsigned int);
+#endif
+#ifdef MY_ABC_HERE
+	int (*syno_set_crtime)(struct dentry *, struct timespec *);
+#endif
+#ifdef MY_ABC_HERE
+	int (*syno_get_archive_ver)(struct dentry *, u32 *);
+	int (*syno_set_archive_ver)(struct dentry *, u32);
 #endif
 	int (*setattr) (struct dentry *, struct iattr *);
 	int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *);
@@ -1754,12 +1909,10 @@
 	ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
 	ssize_t (*listxattr) (struct dentry *, char *, size_t);
 	int (*removexattr) (struct dentry *, const char *);
-#ifdef MY_ABC_HERE
-	int (*synosetxattr) (struct inode *, const char *,const void *,size_t,int);
-#endif
 	void (*truncate_range)(struct inode *, loff_t, loff_t);
 	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
 		      u64 len);
+	int (*update_time)(struct inode *, struct timespec *, int);
 } ____cacheline_aligned;
 
 struct seq_file;
@@ -1778,6 +1931,14 @@
 		unsigned long, loff_t *);
 
 struct super_operations {
+#ifdef MY_ABC_HERE
+	int (*syno_get_sb_archive_ver)(struct super_block *sb, u32 *version);
+	int (*syno_set_sb_archive_ver)(struct super_block *sb, u32 version);
+#ifdef MY_ABC_HERE
+	int (*syno_get_sb_archive_ver1)(struct super_block *sb, u32 *version);
+	int (*syno_set_sb_archive_ver1)(struct super_block *sb, u32 version);
+#endif
+#endif
    	struct inode *(*alloc_inode)(struct super_block *sb);
 	void (*destroy_inode)(struct inode *);
 
@@ -1886,31 +2047,10 @@
 	__mark_inode_dirty(inode, I_DIRTY_SYNC);
 }
 
-/**
- * set_nlink - directly set an inode's link count
- * @inode: inode
- * @nlink: new nlink (should be non-zero)
- *
- * This is a low-level filesystem helper to replace any
- * direct filesystem manipulation of i_nlink.
- */
-static inline void set_nlink(struct inode *inode, unsigned int nlink)
-{
-	inode->__i_nlink = nlink;
-}
-
-/**
- * inc_nlink - directly increment an inode's link count
- * @inode: inode
- *
- * This is a low-level filesystem helper to replace any
- * direct filesystem manipulation of i_nlink.  Currently,
- * it is only here for parity with dec_nlink().
- */
-static inline void inc_nlink(struct inode *inode)
-{
-	inode->__i_nlink++;
-}
+extern void inc_nlink(struct inode *inode);
+extern void drop_nlink(struct inode *inode);
+extern void clear_nlink(struct inode *inode);
+extern void set_nlink(struct inode *inode, unsigned int nlink);
 
 static inline void inode_inc_link_count(struct inode *inode)
 {
@@ -1918,35 +2058,6 @@
 	mark_inode_dirty(inode);
 }
 
-/**
- * drop_nlink - directly drop an inode's link count
- * @inode: inode
- *
- * This is a low-level filesystem helper to replace any
- * direct filesystem manipulation of i_nlink.  In cases
- * where we are attempting to track writes to the
- * filesystem, a decrement to zero means an imminent
- * write when the file is truncated and actually unlinked
- * on the filesystem.
- */
-static inline void drop_nlink(struct inode *inode)
-{
-	inode->__i_nlink--;
-}
-
-/**
- * clear_nlink - directly zero an inode's link count
- * @inode: inode
- *
- * This is a low-level filesystem helper to replace any
- * direct filesystem manipulation of i_nlink.  See
- * drop_nlink() for why we care about i_nlink hitting zero.
- */
-static inline void clear_nlink(struct inode *inode)
-{
-	inode->__i_nlink = 0;
-}
-
 static inline void inode_dec_link_count(struct inode *inode)
 {
 	drop_nlink(inode);
@@ -1968,6 +2079,13 @@
        spin_unlock(&inode->i_lock);
 }
 
+enum file_time_flags {
+	S_ATIME = 1,
+	S_MTIME = 2,
+	S_CTIME = 4,
+	S_VERSION = 8,
+};
+
 extern void touch_atime(struct vfsmount *mnt, struct dentry *dentry);
 static inline void file_accessed(struct file *file)
 {
@@ -1991,6 +2109,7 @@
 	struct lock_class_key s_lock_key;
 	struct lock_class_key s_umount_key;
 	struct lock_class_key s_vfs_rename_key;
+	struct lock_class_key s_writers_key[SB_FREEZE_LEVELS];
 
 	struct lock_class_key i_lock_key;
 	struct lock_class_key i_mutex_key;
@@ -2017,7 +2136,7 @@
 void kill_anon_super(struct super_block *sb);
 void kill_litter_super(struct super_block *sb);
 void deactivate_super(struct super_block *sb);
-#ifdef MY_ABC_HERE
+#ifdef SYNO_READ_LOCK_IN_THAW_BDEV
 void deactivate_read_locked_super(struct super_block *s);
 #endif
 void deactivate_locked_super(struct super_block *sb);
@@ -2529,7 +2648,7 @@
 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
 #ifdef MY_ABC_HERE
-#define MAX_PAGES_PER_RECVFILE 16
+#define MAX_PAGES_PER_RECVFILE 32
 extern int do_recvfile(struct file *file, struct socket *sock, loff_t *ppos, size_t count, size_t *rbytes , size_t *wbytes);
 #endif
 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
@@ -2558,6 +2677,10 @@
 		struct pipe_inode_info *, size_t, unsigned int);
 extern ssize_t generic_file_splice_write(struct pipe_inode_info *,
 		struct file *, loff_t *, size_t, unsigned int);
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_IMPROVED_SPLICE)
+extern ssize_t comcerto_file_splice_write(struct pipe_inode_info *,
+		struct file *, loff_t *, size_t, unsigned int);
+#endif
 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
 		struct file *out, loff_t *, size_t len, unsigned int flags);
 extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
@@ -2669,6 +2792,7 @@
 extern void put_filesystem(struct file_system_type *fs);
 extern struct file_system_type *get_fs_type(const char *name);
 extern struct super_block *get_super(struct block_device *);
+extern struct super_block *get_super_thawed(struct block_device *);
 extern struct super_block *get_active_super(struct block_device *bdev);
 extern struct super_block *user_get_super(dev_t);
 extern void drop_super(struct super_block *sb);
@@ -2728,7 +2852,7 @@
 extern int inode_newsize_ok(const struct inode *, loff_t offset);
 extern void setattr_copy(struct inode *inode, const struct iattr *attr);
 
-extern void file_update_time(struct file *file);
+extern int file_update_time(struct file *file);
 
 extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt);
 extern void save_mount_options(struct super_block *sb, char *options);
@@ -2849,5 +2973,128 @@
 int SYNOUnicodeUTF8toUpper(u_int8_t *to,const u_int8_t *from, int maxlen, int clenfrom, u_int16_t *upcasetable);
 #endif /*MY_ABC_HERE */
 
+#ifdef MY_ABC_HERE
+static inline int syno_op_set_crtime(struct dentry *dentry, struct timespec *time)
+{
+	int error = 0;
+	struct inode *inode = dentry->d_inode;
+
+	mutex_lock(&inode->i_mutex);
+
+	if (inode->i_op->syno_set_crtime) {
+		error = inode->i_op->syno_set_crtime(dentry, time);
+		if (-EOPNOTSUPP == error) {
+			error = 0;
+			inode->i_CreateTime = timespec_trunc(*time, inode->i_sb->s_time_gran);
+			mark_inode_dirty(inode);
+		}
+	} else {
+		inode->i_CreateTime = timespec_trunc(*time, inode->i_sb->s_time_gran);
+		mark_inode_dirty(inode);
+	}
+
+	mutex_unlock(&inode->i_mutex);
+	return error;
+}
+#endif
+
+#ifdef MY_ABC_HERE
+static inline int syno_op_get_archive_bit(struct dentry *dentry, unsigned int *pArbit)
+{
+	int err = 0;
+	struct inode *inode = dentry->d_inode;
+
+	if (inode->i_op->syno_get_archive_bit) {
+		err = inode->i_op->syno_get_archive_bit(dentry, pArbit);
+		if (-EOPNOTSUPP == err) {
+			err = 0;
+			*pArbit = inode->i_mode2;
+		} else if (-ENODATA == err) {
+			err = 0;
+			*pArbit = 0;
+		}
+	} else {
+		*pArbit = inode->i_mode2;
+	}
+
+	return err;
+}
+
+static inline int syno_op_set_archive_bit_nolock(struct dentry *dentry, unsigned int arbit)
+{
+	int err = 0;
+	struct inode *inode = dentry->d_inode;
+
+	if (inode->i_op->syno_set_archive_bit) {
+		err = inode->i_op->syno_set_archive_bit(dentry, arbit);
+		if (-EOPNOTSUPP == err) {
+			err = 0;
+			inode->i_mode2 = arbit;
+			mark_inode_dirty_sync(inode);
+		}
+	} else {
+		inode->i_mode2 = arbit;
+		mark_inode_dirty_sync(inode);
+	}
+
+	return err;
+}
+
+static inline int syno_op_set_archive_bit(struct dentry *dentry, unsigned int arbit)
+{
+	int err = 0;
+	struct inode *inode = dentry->d_inode;
+
+	mutex_lock(&inode->i_syno_mutex);
+	err = syno_op_set_archive_bit_nolock(dentry, arbit);
+	mutex_unlock(&inode->i_syno_mutex);
+	return err;
+}
+
+#endif //MY_ABC_HERE
+
+#if defined(CONFIG_FS_SYNO_ACL) && defined(MY_ABC_HERE)
+#define IS_SYNOACL_SUPERUSER() (0 == current_fsuid())
+
+static inline int is_syno_arbit_enable(struct inode *inode, struct dentry * dentry, unsigned int arbit)
+{
+	if (inode->i_op->syno_get_archive_bit) {
+		unsigned int tmp = 0;
+		int err = inode->i_op->syno_get_archive_bit(dentry, &tmp);
+
+		if (!err && (arbit & tmp)) {
+			return 1;
+		}
+		if (-EOPNOTSUPP != err){ //err or arbit not enabled
+			return 0;
+		}
+	}
+
+	if (inode->i_mode2 & arbit) {
+		return 1;
+	}
+	return 0;
+}
+
+#define IS_INODE_SYNOACL(inode, dentry)	   is_syno_arbit_enable(inode, dentry, S2_SYNO_ACL_SUPPORT)
+#define IS_SMB_READONLY(dentry) 	   is_syno_arbit_enable(dentry->d_inode, dentry, S2_SMB_READONLY)
+#define IS_SYNOACL_INHERIT(dentry)  is_syno_arbit_enable(dentry->d_inode, dentry, S2_SYNO_ACL_INHERIT)
+#define IS_SYNOACL_EXIST(dentry)	   is_syno_arbit_enable(dentry->d_inode, dentry, S2_SYNO_ACL_EXIST)
+#define HAS_SYNOACL(dentry)  	   is_syno_arbit_enable(dentry->d_inode, dentry, (S2_SYNO_ACL_EXIST | S2_SYNO_ACL_INHERIT))
+#define IS_SYNOACL_OWNER_IS_GROUP(dentry) \
+	is_syno_arbit_enable(dentry->d_inode, dentry, S2_SYNO_ACL_IS_OWNER_GROUP)
+
+#define IS_FS_SYNOACL(inode)	__IS_FLG(inode, MS_SYNOACL)
+#define IS_SYNOACL(dentry)	(IS_INODE_SYNOACL(dentry->d_inode, dentry) && IS_FS_SYNOACL(dentry->d_inode))
+#define IS_SYNOACL_INODE(inode, dentry)	(IS_INODE_SYNOACL(inode, dentry) && IS_FS_SYNOACL(inode))
+
+#define is_synoacl_owner(dentry)	IS_SYNOACL_OWNER_IS_GROUP(dentry)?in_group_p(dentry->d_inode->i_gid):(dentry->d_inode->i_uid == current_fsuid())
+#define is_synoacl_owner_or_capable(dentry) (is_synoacl_owner(dentry) || capable(CAP_FOWNER))
+#endif /* CONFIG_FS_SYNO_ACL */
+
+#ifdef MY_ABC_HERE
+#define SYNO_EXT4_MOUNT_PATH_LEN 128
+#endif
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_FS_H */
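
The sb_writers machinery added above replaces the single s_frozen/s_wait_unfrozen pair with three freeze-protection levels, and the kernel-doc in the hunk spells out the usage rule: code that is about to dirty data or metadata brackets the operation with sb_start_write()/sb_end_write(), taking the freeze protection outside i_mutex. A minimal sketch under those assumptions (the function and the work in the middle are illustrative only):

	static ssize_t example_write(struct file *file, const char __user *buf,
				     size_t len, loff_t *ppos)
	{
		struct inode *inode = file->f_mapping->host;
		ssize_t ret;

		sb_start_write(inode->i_sb);	/* waits while the fs is frozen */
		mutex_lock(&inode->i_mutex);

		/* ... dirty pages and/or the inode here ... */
		ret = len;

		mutex_unlock(&inode->i_mutex);
		sb_end_write(inode->i_sb);	/* wakes a waiting freezer, if any */

		return ret;
	}

The same pairing exists one level down for page faults (sb_start_pagefault()/sb_end_pagefault()) and for filesystem-internal transactions (sb_start_intwrite()/sb_end_intwrite()).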
diff -ur a/include/linux/fsnotify.h b/include/linux/fsnotify.h
--- a/include/linux/fsnotify.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/fsnotify.h	2014-02-17 11:56:05.000000000 +0100
@@ -28,9 +28,15 @@
 #if defined(MY_ABC_HERE) || defined(MY_ABC_HERE)
 static inline void SYNO_ArchiveModify(struct inode *TargetInode, int blSetSMBArchive)
 {
+	struct dentry *dentry;
 #ifdef MY_ABC_HERE
-	int old_version;
-	int new_version;
+	u32 new_archive_bit;
+	u32 old_archive_bit;
+#endif
+#ifdef MY_ABC_HERE
+	u32 old_version;
+	u32 new_version;
+	int err;
 #endif
 	if (NULL == TargetInode) {
 		return;
@@ -39,30 +45,48 @@
 		S_ISFIFO(TargetInode->i_mode) || S_ISSOCK(TargetInode->i_mode)) {
 		return;
 	}
+	dentry = d_find_alias(TargetInode);
+	if (!dentry)
+		return;
+
 #ifdef MY_ABC_HERE
 	mutex_lock(&TargetInode->i_syno_mutex);
+	if (syno_op_get_archive_bit(dentry, &old_archive_bit)) {
+		goto next;
+	}
+
 	if (blSetSMBArchive) {
-		TargetInode->i_mode2 |= (S2_SMB_ARCHIVE|ALL_IARCHIVE);
+		new_archive_bit = old_archive_bit | (S2_SMB_ARCHIVE|ALL_IARCHIVE);
 	} else {
-		TargetInode->i_mode2 |= ALL_IARCHIVE;
+		new_archive_bit = old_archive_bit | ALL_IARCHIVE;
 	}
+	if (new_archive_bit == old_archive_bit) {
+		goto next;
+	}
+	syno_op_set_archive_bit_nolock(dentry, new_archive_bit);
+next:
 	mutex_unlock(&TargetInode->i_syno_mutex);
 #endif
 #ifdef MY_ABC_HERE
-	old_version = TargetInode->i_archive_version;
-	new_version = TargetInode->i_sb->s_archive_version + 1;
-	if (old_version != new_version) {
-		TargetInode->i_archive_version = new_version;
-		if (TargetInode->i_op->synosetxattr) {
-			struct syno_xattr_archive_version value;
-			value.v_magic = cpu_to_le16(0x2552);
-			value.v_struct_version = cpu_to_le16(1);
-			value.v_archive_version = cpu_to_le32(new_version);
-			TargetInode->i_op->synosetxattr(TargetInode, XATTR_SYNO_PREFIX XATTR_SYNO_ARCHIVE_VERSION, &value, sizeof(value), 0);
-		}
-	}
+	if (!TargetInode->i_op->syno_get_archive_ver)
+		goto out;
+
+	err = TargetInode->i_op->syno_get_archive_ver(dentry, &old_version);
+	if (err)
+		goto out;
+
+	err = TargetInode->i_sb->s_op->syno_get_sb_archive_ver(TargetInode->i_sb, &new_version);
+	if (err)
+		goto out;
+
+	new_version += 1;
+	if (new_version != old_version)
+		TargetInode->i_op->syno_set_archive_ver(dentry, new_version);
+out:
 #endif
-	mark_inode_dirty_sync(TargetInode);
+	if (dentry) {
+		dput(dentry);
+	}
 }
 #endif
 
diff -ur a/include/linux/fuse.h b/include/linux/fuse.h
--- a/include/linux/fuse.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/fuse.h	2014-02-17 11:56:04.000000000 +0100
@@ -1,634 +1,7 @@
-/*
-    FUSE: Filesystem in Userspace
-    Copyright (C) 2001-2008  Miklos Szeredi <miklos@szeredi.hu>
-
-    This program can be distributed under the terms of the GNU GPL.
-    See the file COPYING.
-*/
-
-/*
- * This file defines the kernel interface of FUSE
- *
- * Protocol changelog:
- *
- * 7.9:
- *  - new fuse_getattr_in input argument of GETATTR
- *  - add lk_flags in fuse_lk_in
- *  - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in
- *  - add blksize field to fuse_attr
- *  - add file flags field to fuse_read_in and fuse_write_in
- *
- * 7.10
- *  - add nonseekable open flag
- *
- * 7.11
- *  - add IOCTL message
- *  - add unsolicited notification support
- *  - add POLL message and NOTIFY_POLL notification
- *
- * 7.12
- *  - add umask flag to input argument of open, mknod and mkdir
- *  - add notification messages for invalidation of inodes and
- *    directory entries
- *
- * 7.13
- *  - make max number of background requests and congestion threshold
- *    tunables
- *
- * 7.14
- *  - add splice support to fuse device
- *
- * 7.15
- *  - add store notify
- *  - add retrieve notify
- *
- * 7.16
- *  - add BATCH_FORGET request
- *  - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct
- *    fuse_ioctl_iovec' instead of ambiguous 'struct iovec'
- *  - add FUSE_IOCTL_32BIT flag
- *
- * 7.17
- *  - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK
- */
 
 #ifndef _LINUX_FUSE_H
 #define _LINUX_FUSE_H
 
-#include <linux/types.h>
-
-/*
- * Version negotiation:
- *
- * Both the kernel and userspace send the version they support in the
- * INIT request and reply respectively.
- *
- * If the major versions match then both shall use the smallest
- * of the two minor versions for communication.
- *
- * If the kernel supports a larger major version, then userspace shall
- * reply with the major version it supports, ignore the rest of the
- * INIT message and expect a new INIT message from the kernel with a
- * matching major version.
- *
- * If the library supports a larger major version, then it shall fall
- * back to the major protocol version sent by the kernel for
- * communication and reply with that major version (and an arbitrary
- * supported minor version).
- */
-
-/** Version number of this interface */
-#define FUSE_KERNEL_VERSION 7
-
-/** Minor version number of this interface */
-#define FUSE_KERNEL_MINOR_VERSION 17
-
-/** The node ID of the root inode */
-#define FUSE_ROOT_ID 1
-
-/* Make sure all structures are padded to 64bit boundary, so 32bit
-   userspace works under 64bit kernels */
-
-struct fuse_attr {
-	__u64	ino;
-	__u64	size;
-	__u64	blocks;
-	__u64	atime;
-	__u64	mtime;
-	__u64	ctime;
-	__u32	atimensec;
-	__u32	mtimensec;
-	__u32	ctimensec;
-	__u32	mode;
-	__u32	nlink;
-	__u32	uid;
-	__u32	gid;
-	__u32	rdev;
-	__u32	blksize;
-	__u32	padding;
-};
-
-struct fuse_kstatfs {
-	__u64	blocks;
-	__u64	bfree;
-	__u64	bavail;
-	__u64	files;
-	__u64	ffree;
-	__u32	bsize;
-	__u32	namelen;
-	__u32	frsize;
-	__u32	padding;
-	__u32	spare[6];
-};
-
-struct fuse_file_lock {
-	__u64	start;
-	__u64	end;
-	__u32	type;
-	__u32	pid; /* tgid */
-};
-
-/**
- * Bitmasks for fuse_setattr_in.valid
- */
-#define FATTR_MODE	(1 << 0)
-#define FATTR_UID	(1 << 1)
-#define FATTR_GID	(1 << 2)
-#define FATTR_SIZE	(1 << 3)
-#define FATTR_ATIME	(1 << 4)
-#define FATTR_MTIME	(1 << 5)
-#define FATTR_FH	(1 << 6)
-#define FATTR_ATIME_NOW	(1 << 7)
-#define FATTR_MTIME_NOW	(1 << 8)
-#define FATTR_LOCKOWNER	(1 << 9)
-
-/**
- * Flags returned by the OPEN request
- *
- * FOPEN_DIRECT_IO: bypass page cache for this open file
- * FOPEN_KEEP_CACHE: don't invalidate the data cache on open
- * FOPEN_NONSEEKABLE: the file is not seekable
- */
-#define FOPEN_DIRECT_IO		(1 << 0)
-#define FOPEN_KEEP_CACHE	(1 << 1)
-#define FOPEN_NONSEEKABLE	(1 << 2)
-
-/**
- * INIT request/reply flags
- *
- * FUSE_POSIX_LOCKS: remote locking for POSIX file locks
- * FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".."
- * FUSE_DONT_MASK: don't apply umask to file mode on create operations
- * FUSE_FLOCK_LOCKS: remote locking for BSD style file locks
- */
-#define FUSE_ASYNC_READ		(1 << 0)
-#define FUSE_POSIX_LOCKS	(1 << 1)
-#define FUSE_FILE_OPS		(1 << 2)
-#define FUSE_ATOMIC_O_TRUNC	(1 << 3)
-#define FUSE_EXPORT_SUPPORT	(1 << 4)
-#define FUSE_BIG_WRITES		(1 << 5)
-#define FUSE_DONT_MASK		(1 << 6)
-#define FUSE_FLOCK_LOCKS	(1 << 10)
-
-/**
- * CUSE INIT request/reply flags
- *
- * CUSE_UNRESTRICTED_IOCTL:  use unrestricted ioctl
- */
-#define CUSE_UNRESTRICTED_IOCTL	(1 << 0)
-
-/**
- * Release flags
- */
-#define FUSE_RELEASE_FLUSH	(1 << 0)
-#define FUSE_RELEASE_FLOCK_UNLOCK	(1 << 1)
-
-/**
- * Getattr flags
- */
-#define FUSE_GETATTR_FH		(1 << 0)
-
-/**
- * Lock flags
- */
-#define FUSE_LK_FLOCK		(1 << 0)
-
-/**
- * WRITE flags
- *
- * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed
- * FUSE_WRITE_LOCKOWNER: lock_owner field is valid
- */
-#define FUSE_WRITE_CACHE	(1 << 0)
-#define FUSE_WRITE_LOCKOWNER	(1 << 1)
-
-/**
- * Read flags
- */
-#define FUSE_READ_LOCKOWNER	(1 << 1)
-
-/**
- * Ioctl flags
- *
- * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine
- * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed
- * FUSE_IOCTL_RETRY: retry with new iovecs
- * FUSE_IOCTL_32BIT: 32bit ioctl
- *
- * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs
- */
-#define FUSE_IOCTL_COMPAT	(1 << 0)
-#define FUSE_IOCTL_UNRESTRICTED	(1 << 1)
-#define FUSE_IOCTL_RETRY	(1 << 2)
-#define FUSE_IOCTL_32BIT	(1 << 3)
-
-#define FUSE_IOCTL_MAX_IOV	256
-
-/**
- * Poll flags
- *
- * FUSE_POLL_SCHEDULE_NOTIFY: request poll notify
- */
-#define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0)
-
-enum fuse_opcode {
-	FUSE_LOOKUP	   = 1,
-	FUSE_FORGET	   = 2,  /* no reply */
-	FUSE_GETATTR	   = 3,
-	FUSE_SETATTR	   = 4,
-	FUSE_READLINK	   = 5,
-	FUSE_SYMLINK	   = 6,
-	FUSE_MKNOD	   = 8,
-	FUSE_MKDIR	   = 9,
-	FUSE_UNLINK	   = 10,
-	FUSE_RMDIR	   = 11,
-	FUSE_RENAME	   = 12,
-	FUSE_LINK	   = 13,
-	FUSE_OPEN	   = 14,
-	FUSE_READ	   = 15,
-	FUSE_WRITE	   = 16,
-	FUSE_STATFS	   = 17,
-	FUSE_RELEASE       = 18,
-	FUSE_FSYNC         = 20,
-	FUSE_SETXATTR      = 21,
-	FUSE_GETXATTR      = 22,
-	FUSE_LISTXATTR     = 23,
-	FUSE_REMOVEXATTR   = 24,
-	FUSE_FLUSH         = 25,
-	FUSE_INIT          = 26,
-	FUSE_OPENDIR       = 27,
-	FUSE_READDIR       = 28,
-	FUSE_RELEASEDIR    = 29,
-	FUSE_FSYNCDIR      = 30,
-	FUSE_GETLK         = 31,
-	FUSE_SETLK         = 32,
-	FUSE_SETLKW        = 33,
-	FUSE_ACCESS        = 34,
-	FUSE_CREATE        = 35,
-	FUSE_INTERRUPT     = 36,
-	FUSE_BMAP          = 37,
-	FUSE_DESTROY       = 38,
-	FUSE_IOCTL         = 39,
-	FUSE_POLL          = 40,
-	FUSE_NOTIFY_REPLY  = 41,
-	FUSE_BATCH_FORGET  = 42,
-
-	/* CUSE specific operations */
-	CUSE_INIT          = 4096,
-};
-
-enum fuse_notify_code {
-	FUSE_NOTIFY_POLL   = 1,
-	FUSE_NOTIFY_INVAL_INODE = 2,
-	FUSE_NOTIFY_INVAL_ENTRY = 3,
-	FUSE_NOTIFY_STORE = 4,
-	FUSE_NOTIFY_RETRIEVE = 5,
-	FUSE_NOTIFY_CODE_MAX,
-};
-
-/* The read buffer is required to be at least 8k, but may be much larger */
-#define FUSE_MIN_READ_BUFFER 8192
-
-#define FUSE_COMPAT_ENTRY_OUT_SIZE 120
-
-struct fuse_entry_out {
-	__u64	nodeid;		/* Inode ID */
-	__u64	generation;	/* Inode generation: nodeid:gen must
-				   be unique for the fs's lifetime */
-	__u64	entry_valid;	/* Cache timeout for the name */
-	__u64	attr_valid;	/* Cache timeout for the attributes */
-	__u32	entry_valid_nsec;
-	__u32	attr_valid_nsec;
-	struct fuse_attr attr;
-};
-
-struct fuse_forget_in {
-	__u64	nlookup;
-};
-
-struct fuse_forget_one {
-	__u64	nodeid;
-	__u64	nlookup;
-};
-
-struct fuse_batch_forget_in {
-	__u32	count;
-	__u32	dummy;
-};
-
-struct fuse_getattr_in {
-	__u32	getattr_flags;
-	__u32	dummy;
-	__u64	fh;
-};
-
-#define FUSE_COMPAT_ATTR_OUT_SIZE 96
-
-struct fuse_attr_out {
-	__u64	attr_valid;	/* Cache timeout for the attributes */
-	__u32	attr_valid_nsec;
-	__u32	dummy;
-	struct fuse_attr attr;
-};
-
-#define FUSE_COMPAT_MKNOD_IN_SIZE 8
-
-struct fuse_mknod_in {
-	__u32	mode;
-	__u32	rdev;
-	__u32	umask;
-	__u32	padding;
-};
-
-struct fuse_mkdir_in {
-	__u32	mode;
-	__u32	umask;
-};
-
-struct fuse_rename_in {
-	__u64	newdir;
-};
-
-struct fuse_link_in {
-	__u64	oldnodeid;
-};
-
-struct fuse_setattr_in {
-	__u32	valid;
-	__u32	padding;
-	__u64	fh;
-	__u64	size;
-	__u64	lock_owner;
-	__u64	atime;
-	__u64	mtime;
-	__u64	unused2;
-	__u32	atimensec;
-	__u32	mtimensec;
-	__u32	unused3;
-	__u32	mode;
-	__u32	unused4;
-	__u32	uid;
-	__u32	gid;
-	__u32	unused5;
-};
-
-struct fuse_open_in {
-	__u32	flags;
-	__u32	unused;
-};
-
-struct fuse_create_in {
-	__u32	flags;
-	__u32	mode;
-	__u32	umask;
-	__u32	padding;
-};
-
-struct fuse_open_out {
-	__u64	fh;
-	__u32	open_flags;
-	__u32	padding;
-};
-
-struct fuse_release_in {
-	__u64	fh;
-	__u32	flags;
-	__u32	release_flags;
-	__u64	lock_owner;
-};
-
-struct fuse_flush_in {
-	__u64	fh;
-	__u32	unused;
-	__u32	padding;
-	__u64	lock_owner;
-};
-
-struct fuse_read_in {
-	__u64	fh;
-	__u64	offset;
-	__u32	size;
-	__u32	read_flags;
-	__u64	lock_owner;
-	__u32	flags;
-	__u32	padding;
-};
-
-#define FUSE_COMPAT_WRITE_IN_SIZE 24
-
-struct fuse_write_in {
-	__u64	fh;
-	__u64	offset;
-	__u32	size;
-	__u32	write_flags;
-	__u64	lock_owner;
-	__u32	flags;
-	__u32	padding;
-};
-
-struct fuse_write_out {
-	__u32	size;
-	__u32	padding;
-};
-
-#define FUSE_COMPAT_STATFS_SIZE 48
-
-struct fuse_statfs_out {
-	struct fuse_kstatfs st;
-};
-
-struct fuse_fsync_in {
-	__u64	fh;
-	__u32	fsync_flags;
-	__u32	padding;
-};
-
-struct fuse_setxattr_in {
-	__u32	size;
-	__u32	flags;
-};
-
-struct fuse_getxattr_in {
-	__u32	size;
-	__u32	padding;
-};
-
-struct fuse_getxattr_out {
-	__u32	size;
-	__u32	padding;
-};
-
-struct fuse_lk_in {
-	__u64	fh;
-	__u64	owner;
-	struct fuse_file_lock lk;
-	__u32	lk_flags;
-	__u32	padding;
-};
-
-struct fuse_lk_out {
-	struct fuse_file_lock lk;
-};
-
-struct fuse_access_in {
-	__u32	mask;
-	__u32	padding;
-};
-
-struct fuse_init_in {
-	__u32	major;
-	__u32	minor;
-	__u32	max_readahead;
-	__u32	flags;
-};
-
-struct fuse_init_out {
-	__u32	major;
-	__u32	minor;
-	__u32	max_readahead;
-	__u32	flags;
-	__u16   max_background;
-	__u16   congestion_threshold;
-	__u32	max_write;
-};
-
-#define CUSE_INIT_INFO_MAX 4096
-
-struct cuse_init_in {
-	__u32	major;
-	__u32	minor;
-	__u32	unused;
-	__u32	flags;
-};
-
-struct cuse_init_out {
-	__u32	major;
-	__u32	minor;
-	__u32	unused;
-	__u32	flags;
-	__u32	max_read;
-	__u32	max_write;
-	__u32	dev_major;		/* chardev major */
-	__u32	dev_minor;		/* chardev minor */
-	__u32	spare[10];
-};
-
-struct fuse_interrupt_in {
-	__u64	unique;
-};
-
-struct fuse_bmap_in {
-	__u64	block;
-	__u32	blocksize;
-	__u32	padding;
-};
-
-struct fuse_bmap_out {
-	__u64	block;
-};
-
-struct fuse_ioctl_in {
-	__u64	fh;
-	__u32	flags;
-	__u32	cmd;
-	__u64	arg;
-	__u32	in_size;
-	__u32	out_size;
-};
-
-struct fuse_ioctl_iovec {
-	__u64	base;
-	__u64	len;
-};
-
-struct fuse_ioctl_out {
-	__s32	result;
-	__u32	flags;
-	__u32	in_iovs;
-	__u32	out_iovs;
-};
-
-struct fuse_poll_in {
-	__u64	fh;
-	__u64	kh;
-	__u32	flags;
-	__u32   padding;
-};
-
-struct fuse_poll_out {
-	__u32	revents;
-	__u32	padding;
-};
-
-struct fuse_notify_poll_wakeup_out {
-	__u64	kh;
-};
-
-struct fuse_in_header {
-	__u32	len;
-	__u32	opcode;
-	__u64	unique;
-	__u64	nodeid;
-	__u32	uid;
-	__u32	gid;
-	__u32	pid;
-	__u32	padding;
-};
-
-struct fuse_out_header {
-	__u32	len;
-	__s32	error;
-	__u64	unique;
-};
-
-struct fuse_dirent {
-	__u64	ino;
-	__u64	off;
-	__u32	namelen;
-	__u32	type;
-	char name[0];
-};
-
-#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
-#define FUSE_DIRENT_ALIGN(x) (((x) + sizeof(__u64) - 1) & ~(sizeof(__u64) - 1))
-#define FUSE_DIRENT_SIZE(d) \
-	FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
-
-struct fuse_notify_inval_inode_out {
-	__u64	ino;
-	__s64	off;
-	__s64	len;
-};
-
-struct fuse_notify_inval_entry_out {
-	__u64	parent;
-	__u32	namelen;
-	__u32	padding;
-};
-
-struct fuse_notify_store_out {
-	__u64	nodeid;
-	__u64	offset;
-	__u32	size;
-	__u32	padding;
-};
-
-struct fuse_notify_retrieve_out {
-	__u64	notify_unique;
-	__u64	nodeid;
-	__u64	offset;
-	__u32	size;
-	__u32	padding;
-};
-
-/* Matches the size of fuse_write_in */
-struct fuse_notify_retrieve_in {
-	__u64	dummy1;
-	__u64	offset;
-	__u32	size;
-	__u32	dummy2;
-	__u64	dummy3;
-	__u64	dummy4;
-};
+#include <uapi/linux/fuse.h>
 
 #endif /* _LINUX_FUSE_H */
diff -ur a/include/linux/gfp.h b/include/linux/gfp.h
--- a/include/linux/gfp.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/gfp.h	2014-02-17 11:56:06.000000000 +0100
@@ -141,7 +141,12 @@
 /* Flag - indicates that the buffer will be suitable for DMA.  Ignored on some
    platforms, used as appropriate on others */
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_ZONE_DMA_NCNB)
+#define GFP_DMA		__GFP_DMA32
+#define GFP_DMA_NCNB	__GFP_DMA
+#else
 #define GFP_DMA		__GFP_DMA
+#endif
 
 /* 4GB DMA on some platforms */
 #define GFP_DMA32	__GFP_DMA32
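With CONFIG_COMCERTO_ZONE_DMA_NCNB enabled, the hunk above remaps GFP_DMA to the 32-bit DMA zone and introduces GFP_DMA_NCNB for the non-cacheable/non-bufferable zone; callers keep the ordinary kmalloc() interface and only swap the zone modifier. A minimal sketch, not part of the patch, with an illustrative helper name:

/*
 * Hedged sketch: allocate a small buffer from the zone selected by
 * GFP_DMA_NCNB above. Assumes <linux/slab.h>; my_alloc_ncnb_buf() is an
 * illustrative name only.
 */
static void *my_alloc_ncnb_buf(size_t len)
{
	/* GFP_DMA_NCNB is __GFP_DMA here, so the slab DMA caches are used. */
	return kmalloc(len, GFP_KERNEL | GFP_DMA_NCNB);
}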
diff -ur a/include/linux/if_arp.h b/include/linux/if_arp.h
--- a/include/linux/if_arp.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/if_arp.h	2014-02-17 11:56:07.000000000 +0100
@@ -92,6 +92,10 @@
 #define ARPHRD_PHONET_PIPE 821		/* PhoNet pipe header		*/
 #define ARPHRD_CAIF	822		/* CAIF media type		*/
 
+#if defined(CONFIG_SYNO_COMCERTO)
+#define ARPHRD_IPV6_IPV6_TUNNEL   ARPHRD_ETHER
+#endif
+
 #define ARPHRD_VOID	  0xFFFF	/* Void type, nothing is known */
 #define ARPHRD_NONE	  0xFFFE	/* zero header length */
 
diff -ur a/include/linux/if_ether.h b/include/linux/if_ether.h
--- a/include/linux/if_ether.h	2013-08-24 11:36:09.000000000 +0200
+++ b/include/linux/if_ether.h	2014-02-17 11:56:03.000000000 +0100
@@ -34,6 +34,9 @@
 #define ETH_DATA_LEN	1500		/* Max. octets in payload	 */
 #define ETH_FRAME_LEN	1514		/* Max. octets in frame sans FCS */
 #define ETH_FCS_LEN	4		/* Octets in the FCS		 */
+#if defined(CONFIG_SYNO_COMCERTO)
+#define ETH_IPHLEN      2               /* EtherIP header length         */
+#endif
 
 /*
  *	These are the defined Ethernet Protocol ID's.
diff -ur a/include/linux/if_packet.h b/include/linux/if_packet.h
--- a/include/linux/if_packet.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/if_packet.h	2014-02-17 11:56:07.000000000 +0100
@@ -29,6 +29,10 @@
 /* These ones are invisible by user level */
 #define PACKET_LOOPBACK		5		/* MC/BRD frame looped back */
 #define PACKET_FASTROUTE	6		/* Fastrouted frame	*/
+#if defined(CONFIG_SYNO_COMCERTO)
+#define PACKET_MASK_ANY		0xffffffff	/* mask for packet type bits */
+#endif
+
 
 /* Packet socket options */
 
@@ -50,6 +54,9 @@
 #define PACKET_TX_TIMESTAMP		16
 #define PACKET_TIMESTAMP		17
 #define PACKET_FANOUT			18
+#if defined(CONFIG_SYNO_COMCERTO)
+#define PACKET_RECV_TYPE		19
+#endif
 
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
diff -ur a/include/linux/if_ppp.h b/include/linux/if_ppp.h
--- a/include/linux/if_ppp.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/if_ppp.h	2014-02-17 11:56:06.000000000 +0100
@@ -161,6 +161,9 @@
 #define PPPIOCATTCHAN	_IOW('t', 56, int)	/* attach to ppp channel */
 #define PPPIOCGCHAN	_IOR('t', 55, int)	/* get ppp channel number */
 #define PPPIOCGL2TPSTATS _IOR('t', 54, struct pppol2tp_ioc_stats)
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+#define PPPIOCSFPPIDLE	_IOW('t', 53, struct ppp_idle)	/* Set the FPP stats */
+#endif
 
 #define SIOCGPPPSTATS   (SIOCDEVPRIVATE + 0)
 #define SIOCGPPPVER     (SIOCDEVPRIVATE + 1)	/* NEVER change this!! */
diff -ur a/include/linux/in.h b/include/linux/in.h
--- a/include/linux/in.h	2013-08-24 11:36:09.000000000 +0200
+++ b/include/linux/in.h	2014-02-17 11:56:04.000000000 +0100
@@ -41,6 +41,9 @@
   IPPROTO_ESP = 50,            /* Encapsulation Security Payload protocol */
   IPPROTO_AH = 51,             /* Authentication Header protocol       */
   IPPROTO_BEETPH = 94,	       /* IP option pseudo header for BEET */
+#if defined(CONFIG_SYNO_COMCERTO)
+  IPPROTO_ETHERIP = 97,        /* IP option for EtherIP tunnel (rfc 3378) */
+#endif
   IPPROTO_PIM    = 103,		/* Protocol Independent Multicast	*/
 
   IPPROTO_COMP   = 108,                /* Compression Header protocol */
diff -ur a/include/linux/kobject.h b/include/linux/kobject.h
--- a/include/linux/kobject.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/kobject.h	2014-02-17 11:56:07.000000000 +0100
@@ -31,6 +31,10 @@
 #define UEVENT_NUM_ENVP			32	/* number of env pointers */
 #define UEVENT_BUFFER_SIZE		2048	/* buffer for the variables */
 
+#if defined(CONFIG_SYNO_COMCERTO)
+struct sk_buff;
+#endif
+
 /* path to the userspace helper executed on an event */
 extern char uevent_helper[];
 
@@ -215,6 +219,12 @@
 
 int kobject_action_type(const char *buf, size_t count,
 			enum kobject_action *type);
+
+#if defined(CONFIG_SYNO_COMCERTO)
+int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group,
+		     gfp_t allocation);
+#endif
+
 #else
 static inline int kobject_uevent(struct kobject *kobj,
 				 enum kobject_action action)
@@ -231,6 +241,18 @@
 static inline int kobject_action_type(const char *buf, size_t count,
 				      enum kobject_action *type)
 { return -EINVAL; }
+
+#if defined(CONFIG_SYNO_COMCERTO)
+void kfree_skb(struct sk_buff *);
+
+static inline int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group,
+				   gfp_t allocation)
+{
+	kfree_skb(skb);
+	return 0;
+}
+#endif
+
 #endif
 
 #endif /* _KOBJECT_H_ */
diff -ur a/include/linux/kvm.h b/include/linux/kvm.h
--- a/include/linux/kvm.h	2013-08-24 11:36:12.000000000 +0200
+++ b/include/linux/kvm.h	2014-02-17 11:56:08.000000000 +0100
@@ -558,6 +558,11 @@
 #define KVM_CAP_PPC_PAPR 68
 #define KVM_CAP_S390_GMAP 71
 #define KVM_CAP_TSC_DEADLINE_TIMER 72
+#define KVM_CAP_S390_UCONTROL 73
+#define KVM_CAP_SYNC_REGS 74
+#define KVM_CAP_PCI_2_3 75
+#define KVM_CAP_KVMCLOCK_CTRL 76
+#define KVM_CAP_SIGNAL_MSI 77
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -637,6 +642,60 @@
 	__u32 pad[9];
 };
 
+#define KVM_MMU_FSL_BOOKE_NOHV		0
+#define KVM_MMU_FSL_BOOKE_HV		1
+
+struct kvm_config_tlb {
+	__u64 params;
+	__u64 array;
+	__u32 mmu_type;
+	__u32 array_len;
+};
+
+struct kvm_dirty_tlb {
+	__u64 bitmap;
+	__u32 num_dirty;
+};
+
+/* Available with KVM_CAP_ONE_REG */
+
+#define KVM_REG_ARCH_MASK	0xff00000000000000ULL
+#define KVM_REG_GENERIC		0x0000000000000000ULL
+
+/*
+ * Architecture specific registers are to be defined in arch headers and
+ * ORed with the arch identifier.
+ */
+#define KVM_REG_PPC		0x1000000000000000ULL
+#define KVM_REG_X86		0x2000000000000000ULL
+#define KVM_REG_IA64		0x3000000000000000ULL
+#define KVM_REG_ARM		0x4000000000000000ULL
+#define KVM_REG_S390		0x5000000000000000ULL
+
+#define KVM_REG_SIZE_SHIFT	52
+#define KVM_REG_SIZE_MASK	0x00f0000000000000ULL
+#define KVM_REG_SIZE_U8		0x0000000000000000ULL
+#define KVM_REG_SIZE_U16	0x0010000000000000ULL
+#define KVM_REG_SIZE_U32	0x0020000000000000ULL
+#define KVM_REG_SIZE_U64	0x0030000000000000ULL
+#define KVM_REG_SIZE_U128	0x0040000000000000ULL
+#define KVM_REG_SIZE_U256	0x0050000000000000ULL
+#define KVM_REG_SIZE_U512	0x0060000000000000ULL
+#define KVM_REG_SIZE_U1024	0x0070000000000000ULL
+
+struct kvm_one_reg {
+	__u64 id;
+	__u64 addr;
+};
+
+struct kvm_msi {
+	__u32 address_lo;
+	__u32 address_hi;
+	__u32 data;
+	__u32 flags;
+	__u8  pad[16];
+};
+
 /*
  * ioctls for VM fds
  */
@@ -697,6 +756,11 @@
 /* Available with KVM_CAP_TSC_CONTROL */
 #define KVM_SET_TSC_KHZ           _IO(KVMIO,  0xa2)
 #define KVM_GET_TSC_KHZ           _IO(KVMIO,  0xa3)
+/* Available with KVM_CAP_PCI_2_3 */
+#define KVM_ASSIGN_SET_INTX_MASK  _IOW(KVMIO,  0xa4, \
+				       struct kvm_assigned_pci_dev)
+/* Available with KVM_CAP_SIGNAL_MSI */
+#define KVM_SIGNAL_MSI            _IOW(KVMIO,  0xa5, struct kvm_msi)
 
 /*
  * ioctls for vcpu fds
diff -ur a/include/linux/kvm_host.h b/include/linux/kvm_host.h
--- a/include/linux/kvm_host.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/kvm_host.h	2014-02-17 11:56:06.000000000 +0100
@@ -699,6 +699,8 @@
 			unsigned flags);
 void kvm_free_irq_routing(struct kvm *kvm);
 
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
+
 #else
 
 static inline void kvm_free_irq_routing(struct kvm *kvm) {}
diff -ur a/include/linux/libata.h b/include/linux/libata.h
--- a/include/linux/libata.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/libata.h	2014-02-17 11:56:07.000000000 +0100
@@ -103,13 +103,6 @@
 	}							\
 })
 
-#ifdef MY_ABC_HERE
-#define SYNO_ERROR_ALWAYS 999
-#define SYNO_ERROR_TILL_TO_FORCE 998
-#define SYNO_ERROR_TILL_TO_DEEP 997
-#define SYNO_ERROR_MAX 950
-#endif
-
 /* NEW: debug levels */
 #define HAVE_LIBATA_MSG 1
 
@@ -190,12 +183,6 @@
 	ATA_DFLAG_DETACH	= (1 << 24),
 	ATA_DFLAG_DETACHED	= (1 << 25),
 
-#ifdef MY_ABC_HERE
-	ATA_SYNO_DFLAG_PMP_DETACH	= (1 << 0), /* forece device detach */
-	ATA_SYNO_DFLAG_DISABLE	= (1 << 1), /* forece device detach */
-	ATA_SYNO_DFLAG_DETACH	= (1 << 2), /* forece device detach */
-#endif
-
 	ATA_DEV_UNKNOWN		= 0,	/* unknown device */
 	ATA_DEV_ATA		= 1,	/* ATA device */
 	ATA_DEV_ATA_UNSUP	= 2,	/* ATA device (unsupported) */
@@ -243,16 +230,6 @@
 					      * led */
 	ATA_FLAG_NO_DIPM	= (1 << 23), /* host not happy with DIPM */
 
-#ifdef MY_ABC_HERE
-	/* if after reset, still have the following fail, we must try force detect */
-	ATA_SYNO_FLAG_SRST_FAIL	= (1 << 0), /* still have SRST fail */
-	ATA_SYNO_FLAG_COMRESET_FAIL	= (1 << 1), /* still COMRESET fail */
-	ATA_SYNO_FLAG_REVALID_FAIL	= (1 << 2), /* still revalid fail */
-	ATA_SYNO_FLAG_GSCR_FAIL	= (1 << 3), /* still read gscr fail */
-	ATA_SYNO_FLAG_FORCE_INTR	= (1 << 4), /* force fake plugged interrupt */
-	ATA_SYNO_FLAG_FORCE_RETRY	= (1 << 5), /* force eh retries */
-#endif
-
 	/* bits 24:31 of ap->flags are reserved for LLD specific flags */
 
 
@@ -529,6 +506,9 @@
 #ifdef SYNO_SATA_PM_DEVICE_GPIO
 	SYNO_STATUS_GPIO_CTRL		= 1 << 2,
 #endif
+#ifdef MY_ABC_HERE
+	SYNO_STATUS_IS_MV9235		= 1 << 3,
+#endif
 };
 
 enum ata_xfer_mask {
@@ -713,9 +693,6 @@
 	unsigned int		devno;		/* 0 or 1 */
 	unsigned int		horkage;	/* List of broken features */
 	unsigned long		flags;		/* ATA_DFLAG_xxx */
-#ifdef MY_ABC_HERE
-	unsigned long		ulSflags;		/* ATA_SYNO_DFLAG_xxx */
-#endif
 	struct scsi_device	*sdev;		/* attached SCSI device */
 	void			*private_data;
 #ifdef CONFIG_ATA_ACPI
@@ -730,9 +707,8 @@
 	int			  iCheckPwr;
 
 	/* bit definitions */
-	#define CHKPOWER_CHECKING 0
-	#define CHKPOWER_FIRST_CMD 1
-	#define CHKPOWER_FIRST_WAIT 2
+	#define CHKPOWER_FIRST_CMD 0x0
+	#define CHKPOWER_FIRST_WAIT 0x1
 #endif
 	struct device		tdev;
 	/* n_sector is CLEAR_BEGIN, read comment above CLEAR_BEGIN */
@@ -829,9 +805,6 @@
 	u32			sactive;	/* active NCQ commands */
 
 	unsigned int		flags;		/* ATA_LFLAG_xxx */
-#ifdef MY_ABC_HERE
-	unsigned int		uiSflags;		/* ATA_SYNO_FLAG_xxx, the same as ata_port */
-#endif
 	u32			saved_scontrol;	/* SControl on probe */
 	unsigned int		hw_sata_spd_limit;
 	unsigned int		sata_spd_limit;
@@ -846,7 +819,7 @@
 	struct ata_device	device[ATA_MAX_DEVICES];
 
 #if defined(MY_ABC_HERE) || defined(MY_ABC_HERE) || \
-	defined(SYNO_SATA_PM_DEVICE_GPIO)
+	defined(SYNO_SATA_PM_DEVICE_GPIO) || defined(MY_ABC_HERE)
 	unsigned int	uiStsFlags; /* SYNO_STATUS_xxx */
 #endif
 };
@@ -868,15 +841,6 @@
 	unsigned long		flags;	/* ATA_FLAG_xxx */
 	/* Flags that change dynamically, protected by ap->lock */
 	unsigned int		pflags; /* ATA_PFLAG_xxx */
-#ifdef MY_ABC_HERE
-	/* SYNO flags */
-	unsigned int		uiSflags; /* ATA_SYNO_FLAG_xxx */
-	int iFakeError;		/* fake errors */
-	int iDetectStat;	/* detect plugged/un-plugged status at eh complete
-						   to prevent port freeze issue */
-	struct work_struct	SendPwrResetEventTask;
-	struct work_struct	SendPortDisEventTask;
-#endif
 	unsigned int		print_id; /* user visible unique port ID */
 	unsigned int		port_no; /* 0 based port no. inside the host */
 
@@ -1063,9 +1027,6 @@
 	void (*phy_reset)(struct ata_port *ap);
 	void (*eng_timeout)(struct ata_port *ap);
 
-#ifdef MY_ABC_HERE
-	void (*syno_force_intr)(struct ata_port *ap);
-#endif
 	/*
 	 * ->inherits must be the last field and all the preceding
 	 * fields must be pointers.
@@ -1103,11 +1064,6 @@
 extern struct device_attribute dev_attr_syno_manutil_power_disable;
 extern struct device_attribute dev_attr_syno_pm_gpio;
 extern struct device_attribute dev_attr_syno_pm_info;
-#ifdef MY_ABC_HERE
-extern struct device_attribute dev_attr_syno_port_thaw;
-extern struct device_attribute dev_attr_syno_fake_error_ctrl;
-extern struct device_attribute dev_attr_syno_pwr_reset_count;
-#endif
 #endif
 #ifdef MY_ABC_HERE
 extern struct device_attribute dev_attr_syno_wcache;
@@ -1119,9 +1075,6 @@
 extern struct device_attribute dev_attr_syno_diskname_trans;
 #endif
 #ifdef MY_ABC_HERE
-extern unsigned int uiCheckPortLinksFlags(struct ata_port *pAp);
-#endif
-#ifdef MY_ABC_HERE
 extern struct device_attribute dev_attr_syno_sata_disk_led_ctrl;
 #endif
 
@@ -1404,7 +1357,7 @@
 
 #ifdef MY_ABC_HERE
 #define IS_SYNO_SPINUP_CMD(qc) (NULL == qc->scsicmd && !ata_tag_internal(qc->tag) && \
-			(ATA_CMD_CHK_POWER == qc->tf.command || ATA_CMD_FPDMA_READ == qc->tf.command || ATA_CMD_READ == qc->tf.command || \
+			(ATA_CMD_FPDMA_READ == qc->tf.command || ATA_CMD_READ == qc->tf.command || \
 			 ATA_CMD_READ_EXT == qc->tf.command || ATA_CMD_PIO_READ == qc->tf.command || ATA_CMD_PIO_READ_EXT == qc->tf.command || \
 			 ATA_CMD_READ_MULTI == qc->tf.command || ATA_CMD_READ_MULTI_EXT == qc->tf.command))
 #endif
Only in b/include/linux/mfd: lpc_ich.h.
diff -ur a/include/linux/miscdevice.h b/include/linux/miscdevice.h
--- a/include/linux/miscdevice.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/miscdevice.h	2014-02-17 11:56:04.000000000 +0100
@@ -51,6 +51,7 @@
 #define AUTOFS_MINOR		235
 #define MAPPER_CTRL_MINOR	236
 #define LOOP_CTRL_MINOR		237
+#define VHOST_NET_MINOR		238
 #define MISC_DYNAMIC_MINOR	255
 
 struct device;
diff -ur a/include/linux/mm.h b/include/linux/mm.h
--- a/include/linux/mm.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/mm.h	2014-02-17 11:56:04.000000000 +0100
@@ -872,6 +872,9 @@
 
 int shmem_lock(struct file *file, int lock, struct user_struct *user);
 struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags);
+#if defined(CONFIG_SYNO_COMCERTO)
+void shmem_set_file(struct vm_area_struct *vma, struct file *file);
+#endif
 int shmem_zero_setup(struct vm_area_struct *);
 
 extern int can_do_mlock(void);
@@ -1432,6 +1435,7 @@
 
 /* generic vm_area_ops exported for stackable file systems */
 extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
+extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 
 /* mm/page-writeback.c */
 int write_one_page(struct page *page, int wait);
diff -ur a/include/linux/moduleparam.h b/include/linux/moduleparam.h
--- a/include/linux/moduleparam.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/moduleparam.h	2014-02-17 11:56:05.000000000 +0100
@@ -47,14 +47,11 @@
 	void (*free)(void *arg);
 };
 
-/* Flag bits for kernel_param.flags */
-#define KPARAM_ISBOOL		2
-
 struct kernel_param {
 	const char *name;
 	const struct kernel_param_ops *ops;
 	u16 perm;
-	u16 flags;
+	s16 level;
 	union {
 		void *arg;
 		const struct kparam_string *str;
@@ -131,8 +128,40 @@
  * The ops can have NULL set or get functions.
  */
 #define module_param_cb(name, ops, arg, perm)				      \
-	__module_param_call(MODULE_PARAM_PREFIX,			      \
-			    name, ops, arg, __same_type((arg), bool *), perm)
+	__module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, 0)
+
+/**
+ * <level>_param_cb - general callback for a module/cmdline parameter
+ *                    to be evaluated before certain initcall level
+ * @name: a valid C identifier which is the parameter name.
+ * @ops: the set & get operations for this parameter.
+ * @perm: visibility in sysfs.
+ *
+ * The ops can have NULL set or get functions.
+ */
+#define __level_param_cb(name, ops, arg, perm, level)			\
+	__module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, level)
+
+#define core_param_cb(name, ops, arg, perm)		\
+	__level_param_cb(name, ops, arg, perm, 1)
+
+#define postcore_param_cb(name, ops, arg, perm)		\
+	__level_param_cb(name, ops, arg, perm, 2)
+
+#define arch_param_cb(name, ops, arg, perm)		\
+	__level_param_cb(name, ops, arg, perm, 3)
+
+#define subsys_param_cb(name, ops, arg, perm)		\
+	__level_param_cb(name, ops, arg, perm, 4)
+
+#define fs_param_cb(name, ops, arg, perm)		\
+	__level_param_cb(name, ops, arg, perm, 5)
+
+#define device_param_cb(name, ops, arg, perm)		\
+	__level_param_cb(name, ops, arg, perm, 6)
+
+#define late_param_cb(name, ops, arg, perm)		\
+	__level_param_cb(name, ops, arg, perm, 7)
 
 /* On alpha, ia64 and ppc64 relocations to global data cannot go into
    read-only sections (which is part of respective UNIX ABI on these
@@ -146,7 +175,7 @@
 
 /* This is the fundamental function for registering boot/module
    parameters. */
-#define __module_param_call(prefix, name, ops, arg, isbool, perm)	\
+#define __module_param_call(prefix, name, ops, arg, perm, level)	\
 	/* Default value instead of permissions? */			\
 	static int __param_perm_check_##name __attribute__((unused)) =	\
 	BUILD_BUG_ON_ZERO((perm) < 0 || (perm) > 0777 || ((perm) & 2))	\
@@ -155,8 +184,7 @@
 	static struct kernel_param __moduleparam_const __param_##name	\
 	__used								\
     __attribute__ ((unused,__section__ ("__param"),aligned(sizeof(void *)))) \
-	= { __param_str_##name, ops, perm, isbool ? KPARAM_ISBOOL : 0,	\
-	    { arg } }
+	= { __param_str_##name, ops, perm, level, { arg } }
 
 /* Obsolete - use module_param_cb() */
 #define module_param_call(name, set, get, arg, perm)			\
@@ -164,8 +192,7 @@
 		 { (void *)set, (void *)get };				\
 	__module_param_call(MODULE_PARAM_PREFIX,			\
 			    name, &__param_ops_##name, arg,		\
-			    __same_type(arg, bool *),			\
-			    (perm) + sizeof(__check_old_set_param(set))*0)
+			    (perm) + sizeof(__check_old_set_param(set))*0, 0)
 
 /* We don't get oldget: it's often a new-style param_get_uint, etc. */
 static inline int
@@ -245,8 +272,7 @@
  */
 #define core_param(name, var, type, perm)				\
 	param_check_##type(name, &(var));				\
-	__module_param_call("", name, &param_ops_##type,		\
-			    &var, __same_type(var, bool), perm)
+	__module_param_call("", name, &param_ops_##type, &var, perm, 0)
 #endif /* !MODULE */
 
 /**
@@ -264,7 +290,7 @@
 		= { len, string };					\
 	__module_param_call(MODULE_PARAM_PREFIX, name,			\
 			    &param_ops_string,				\
-			    .str = &__param_string_##name, 0, perm);	\
+			    .str = &__param_string_##name, perm, 0);	\
 	__MODULE_PARM_TYPE(name, "string")
 
 /**
@@ -292,6 +318,8 @@
 		      char *args,
 		      const struct kernel_param *params,
 		      unsigned num,
+		      s16 level_min,
+		      s16 level_max,
 		      int (*unknown)(char *param, char *val));
 
 /* Called by module remove. */
@@ -402,7 +430,7 @@
 	__module_param_call(MODULE_PARAM_PREFIX, name,			\
 			    &param_array_ops,				\
 			    .arr = &__param_arr_##name,			\
-			    __same_type(array[0], bool), perm);		\
+			    perm, 0);					\
 	__MODULE_PARM_TYPE(name, "array of " #type)
 
 extern struct kernel_param_ops param_array_ops;
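The reworked __module_param_call() above drops the isbool argument and carries an initcall level instead; the <level>_param_cb() wrappers merely pass that level through so the parameter's ops run before the corresponding initcall stage. A minimal sketch of registering a parameter this way, not part of the patch, with illustrative names (my_threshold, my_ops); param_set_int/param_get_int are the stock integer ops from <linux/moduleparam.h>:

/*
 * Hedged sketch: a module parameter whose setter is parsed before
 * device-level initcalls via device_param_cb(). Names are illustrative only.
 */
static int my_threshold = 10;

static struct kernel_param_ops my_ops = {
	.set = param_set_int,
	.get = param_get_int,
};

/* Exposed as /sys/module/<module>/parameters/threshold with mode 0644. */
device_param_cb(threshold, &my_ops, &my_threshold, 0644);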
diff -ur a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h
--- a/include/linux/mtd/mtd.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/mtd/mtd.h	2014-02-17 11:56:05.000000000 +0100
@@ -58,6 +58,11 @@
 	u_long priv;
 	u_char state;
 	struct erase_info *next;
+#if defined(CONFIG_SYNO_COMCERTO)
+	u8 *erase_buf;
+	u32 erase_buf_ofs;
+	bool partial_start;
+#endif
 };
 
 struct mtd_erase_region_info {
@@ -114,6 +119,9 @@
 
 struct module;	/* only needed for owner field in mtd_info */
 
+#if defined(CONFIG_SYNO_COMCERTO)
+struct mtd_info;
+#endif
 struct mtd_info {
 	u_char type;
 	uint32_t flags;
@@ -266,6 +274,11 @@
 	struct device dev;
 	int usecount;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	int (*refresh_device)(struct mtd_info *mtd);
+	struct mtd_info *split;
+#endif
+
 	/* If the driver is something smart, like UBI, it may need to maintain
 	 * its own reference counting. The below functions are only for driver.
 	 * The driver may register its callbacks. These callbacks are not
@@ -321,6 +334,9 @@
 			      int defnr_parts);
 #define mtd_device_register(master, parts, nr_parts)	\
 	mtd_device_parse_register(master, NULL, NULL, parts, nr_parts)
+#if defined(CONFIG_SYNO_COMCERTO)
+extern int mtd_device_refresh(struct mtd_info *master);
+#endif
 extern int mtd_device_unregister(struct mtd_info *master);
 extern struct mtd_info *get_mtd_device(struct mtd_info *mtd, int num);
 extern int __get_mtd_device(struct mtd_info *mtd);
diff -ur a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h
--- a/include/linux/mtd/partitions.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/mtd/partitions.h	2014-02-17 11:56:05.000000000 +0100
@@ -35,6 +35,11 @@
  * Note: writeable partitions require their size and offset be
  * erasesize aligned (e.g. use MTDPART_OFS_NEXTBLK).
  */
+#if defined(CONFIG_SYNO_COMCERTO)
+struct mtd_info;
+
+struct mtd_partition;
+#endif
 
 struct mtd_partition {
 	char *name;			/* identifier string */
@@ -42,6 +47,9 @@
 	uint64_t offset;		/* offset within the master MTD space */
 	uint32_t mask_flags;		/* master MTD flags to mask out for this partition */
 	struct nand_ecclayout *ecclayout;	/* out of band layout for this partition (NAND only) */
+#if defined(CONFIG_SYNO_COMCERTO)
+	int (*refresh_partition)(struct mtd_info *);
+#endif
 };
 
 #define MTDPART_OFS_RETAIN	(-3)
@@ -50,7 +58,9 @@
 #define MTDPART_SIZ_FULL	(0)
 
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 struct mtd_info;
+#endif
 struct device_node;
 
 /**
@@ -79,9 +89,10 @@
 extern int register_mtd_parser(struct mtd_part_parser *parser);
 extern int deregister_mtd_parser(struct mtd_part_parser *parser);
 
-int mtd_is_partition(struct mtd_info *mtd);
+int mtd_is_partition(const struct mtd_info *mtd);
 int mtd_add_partition(struct mtd_info *master, char *name,
 		      long long offset, long long length);
 int mtd_del_partition(struct mtd_info *master, int partno);
+uint64_t mtd_get_device_size(const struct mtd_info *mtd);
 
 #endif
diff -ur a/include/linux/mtd/physmap.h b/include/linux/mtd/physmap.h
--- a/include/linux/mtd/physmap.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/mtd/physmap.h	2014-02-17 11:56:05.000000000 +0100
@@ -17,6 +17,9 @@
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
+#if defined(CONFIG_SYNO_COMCERTO)
+#include <linux/platform_device.h>
+#endif
 
 struct map_info;
 struct platform_device;
diff -ur a/include/linux/mtd/ubi.h b/include/linux/mtd/ubi.h
--- a/include/linux/mtd/ubi.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/mtd/ubi.h	2014-02-17 11:56:05.000000000 +0100
@@ -25,6 +25,9 @@
 #include <linux/types.h>
 #include <mtd/ubi-user.h>
 
+/* All volumes/LEBs */
+#define UBI_ALL -1
+
 /*
  * enum ubi_open_mode - UBI volume open mode constants.
  *
@@ -208,14 +211,15 @@
 int ubi_leb_read(struct ubi_volume_desc *desc, int lnum, char *buf, int offset,
 		 int len, int check);
 int ubi_leb_write(struct ubi_volume_desc *desc, int lnum, const void *buf,
-		  int offset, int len, int dtype);
+		  int offset, int len);
 int ubi_leb_change(struct ubi_volume_desc *desc, int lnum, const void *buf,
-		   int len, int dtype);
+		   int len);
 int ubi_leb_erase(struct ubi_volume_desc *desc, int lnum);
 int ubi_leb_unmap(struct ubi_volume_desc *desc, int lnum);
-int ubi_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype);
+int ubi_leb_map(struct ubi_volume_desc *desc, int lnum);
 int ubi_is_mapped(struct ubi_volume_desc *desc, int lnum);
 int ubi_sync(int ubi_num);
+int ubi_flush(int ubi_num, int vol_id, int lnum);
 
 /*
  * This function is the same as the 'ubi_leb_read()' function, but it does not
@@ -226,25 +230,4 @@
 {
 	return ubi_leb_read(desc, lnum, buf, offset, len, 0);
 }
-
-/*
- * This function is the same as the 'ubi_leb_write()' functions, but it does
- * not have the data type argument.
- */
-static inline int ubi_write(struct ubi_volume_desc *desc, int lnum,
-			    const void *buf, int offset, int len)
-{
-	return ubi_leb_write(desc, lnum, buf, offset, len, UBI_UNKNOWN);
-}
-
-/*
- * This function is the same as the 'ubi_leb_change()' functions, but it does
- * not have the data type argument.
- */
-static inline int ubi_change(struct ubi_volume_desc *desc, int lnum,
-				    const void *buf, int len)
-{
-	return ubi_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
-}
-
 #endif /* !__LINUX_UBI_H__ */
diff -ur a/include/linux/netdevice.h b/include/linux/netdevice.h
--- a/include/linux/netdevice.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/netdevice.h	2014-02-17 11:56:05.000000000 +0100
@@ -142,7 +142,7 @@
  */
 
 #if defined(CONFIG_WLAN) || defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE)
-# if defined(CONFIG_MAC80211_MESH)
+# if defined(CONFIG_SYNO_COMCERTO) || defined(CONFIG_MAC80211_MESH)
 #  define LL_MAX_HEADER 128
 # else
 #  define LL_MAX_HEADER 96
diff -ur a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h
--- a/include/linux/netfilter/nf_conntrack_common.h	2013-08-24 11:36:09.000000000 +0200
+++ b/include/linux/netfilter/nf_conntrack_common.h	2014-02-17 11:56:03.000000000 +0100
@@ -83,6 +83,16 @@
 	/* Conntrack is a fake untracked entry */
 	IPS_UNTRACKED_BIT = 12,
 	IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT),
+
+#if defined(CONFIG_SYNO_COMCERTO)
+	/* Connection  cannot expire */
+	IPS_PERMANENT_BIT = 13,
+	IPS_PERMANENT = (1 << IPS_PERMANENT_BIT),
+
+	/* Connection is assured by DPI application */
+	IPS_DPI_ALLOWED_BIT = 14,
+	IPS_DPI_ALLOWED = (1 << IPS_DPI_ALLOWED_BIT),
+#endif
 };
 
 /* Connection tracking event types */
diff -ur a/include/linux/netfilter/nf_conntrack_sip.h b/include/linux/netfilter/nf_conntrack_sip.h
--- a/include/linux/netfilter/nf_conntrack_sip.h	2013-08-24 11:36:09.000000000 +0200
+++ b/include/linux/netfilter/nf_conntrack_sip.h	2014-02-17 11:56:03.000000000 +0100
@@ -2,12 +2,19 @@
 #define __NF_CONNTRACK_SIP_H__
 #ifdef __KERNEL__
 
+#if defined(CONFIG_SYNO_COMCERTO)
+#include <linux/types.h>
+#endif
+
 #define SIP_PORT	5060
 #define SIP_TIMEOUT	3600
 
 struct nf_ct_sip_master {
 	unsigned int	register_cseq;
 	unsigned int	invite_cseq;
+#if defined(CONFIG_SYNO_COMCERTO)
+	__be16		forced_dport;
+#endif
 };
 
 enum sip_expectation_classes {
diff -ur a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h
--- a/include/linux/netfilter/nfnetlink_conntrack.h	2013-08-24 11:36:09.000000000 +0200
+++ b/include/linux/netfilter/nfnetlink_conntrack.h	2014-02-17 11:56:03.000000000 +0100
@@ -43,6 +43,10 @@
 	CTA_ZONE,
 	CTA_SECCTX,
 	CTA_TIMESTAMP,
+#if defined(CONFIG_SYNO_COMCERTO)
+	CTA_COMCERTO_FP_ORIG,
+	CTA_COMCERTO_FP_REPLY,
+#endif
 	__CTA_MAX
 };
 #define CTA_MAX (__CTA_MAX - 1)
@@ -190,4 +194,15 @@
 };
 #define CTA_SECCTX_MAX (__CTA_SECCTX_MAX - 1)
 
+#if defined(CONFIG_SYNO_COMCERTO)
+enum ctattr_comcerto_fp {
+	CTA_COMCERTO_FP_UNSPEC,
+	CTA_COMCERTO_FP_MARK,
+	CTA_COMCERTO_FP_IFINDEX,
+	CTA_COMCERTO_FP_IIF,
+	__CTA_COMCERTO_FP_MAX
+};
+#define CTA_COMCERTO_FP_MAX (__CTA_COMCERTO_FP_MAX - 1)
+#endif
+
 #endif /* _IPCONNTRACK_NETLINK_H */
diff -ur a/include/linux/netfilter/xt_layer7.h b/include/linux/netfilter/xt_layer7.h
--- a/include/linux/netfilter/xt_layer7.h	2013-08-24 11:36:09.000000000 +0200
+++ b/include/linux/netfilter/xt_layer7.h	2014-02-17 11:56:03.000000000 +0100
@@ -8,6 +8,9 @@
     char protocol[MAX_PROTOCOL_LEN];
     char pattern[MAX_PATTERN_LEN];
     u_int8_t invert;
+#if defined(CONFIG_SYNO_COMCERTO)
+    u_int8_t pkt;
+#endif
 };
 
 #endif /* _XT_LAYER7_H */
diff -ur a/include/linux/netfilter_ipv4/ip_tables.h b/include/linux/netfilter_ipv4/ip_tables.h
--- a/include/linux/netfilter_ipv4/ip_tables.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/netfilter_ipv4/ip_tables.h	2014-02-17 11:56:04.000000000 +0100
@@ -93,6 +93,9 @@
 #define IPT_F_FRAG		0x01	/* Set if rule is a fragment rule */
 #define IPT_F_GOTO		0x02	/* Set if jump is a goto */
 #define IPT_F_MASK		0x03	/* All possible flag bits mask. */
+#if defined(CONFIG_SYNO_COMCERTO)
+#define IPT_F_NO_DEF_MATCH	0x80	/* Internal: no default match rules present */
+#endif
 
 /* Values for "inv" field in struct ipt_ip. */
 #define IPT_INV_VIA_IN		0x01	/* Invert the sense of IN IFACE. */
diff -ur a/include/linux/netlink.h b/include/linux/netlink.h
--- a/include/linux/netlink.h	2013-08-24 11:36:09.000000000 +0200
+++ b/include/linux/netlink.h	2014-02-17 11:56:03.000000000 +0100
@@ -26,8 +26,27 @@
 #define NETLINK_ECRYPTFS	19
 #define NETLINK_RDMA		20
 #define NETLINK_CRYPTO		21	/* Crypto layer */
+#if defined(CONFIG_SYNO_COMCERTO)
+#define NETLINK_FF              30
+#define NETLINK_VOP             31
+#define NETLINK_KEY             32
+#define NETLINK_L2FLOW          33
 
+#define NETLINK_VOIP		34
+#define NETLINK_DTAM		35
+#define NETLINK_MCH		36
+#define NETLINK_CONFIG		37
+#define NETLINK_GENPLAY		38
+#define NETLINK_COMADEBUG	39
+#define NETLINK_EEPROM		40
+#define NETLINK_SS7		41
+#define NETLINK_DSR		42
+#define NETLINK_MMI		43
+
+#define MAX_LINKS 44
+#else
 #define MAX_LINKS 32		
+#endif
 
 struct sockaddr_nl {
 	__kernel_sa_family_t	nl_family;	/* AF_NETLINK	*/
diff -ur a/include/linux/pci_ids.h b/include/linux/pci_ids.h
--- a/include/linux/pci_ids.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/pci_ids.h	2014-02-17 11:56:05.000000000 +0100
@@ -2503,9 +2503,10 @@
 #define PCI_DEVICE_ID_INTEL_82845_HB	0x1a30
 #define PCI_DEVICE_ID_INTEL_IOAT	0x1a38
 #define PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN	0x1c41
-#ifdef MY_ABC_HERE
+#if defined(MY_ABC_HERE) || defined(MY_DEF_HERE)
 #define PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_C206 0x1c56
 #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LPC_C226 0x8c56
+#define PCI_DEVICE_ID_INTEL_AVOTON_LPC 0x1f38
 #endif
 #define PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX	0x1c5f
 #define PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0	0x1d40
diff -ur a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h
--- a/include/linux/pfkeyv2.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/pfkeyv2.h	2014-02-17 11:56:06.000000000 +0100
@@ -268,6 +268,9 @@
 #define SADB_SAFLAGS_NOPMTUDISC	0x20000000
 #define SADB_SAFLAGS_DECAP_DSCP	0x40000000
 #define SADB_SAFLAGS_NOECN	0x80000000
+#if defined(CONFIG_SYNO_COMCERTO)
+#define SADB_SAFLAGS_ESN	0x01000000
+#endif
 
 /* Security Association states */
 #define SADB_SASTATE_LARVAL	0
diff -ur a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h
--- a/include/linux/pkt_sched.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/pkt_sched.h	2014-02-17 11:56:06.000000000 +0100
@@ -173,8 +173,41 @@
  *
  *	The only reason for this is efficiency, it is possible
  *	to change these parameters in compile time.
+#if defined(CONFIG_SYNO_COMCERTO)
+ *
+ *	If you need to play with these values, use esfq instead.
+#endif
  */
 
+#if defined(CONFIG_SYNO_COMCERTO)
+/* ESFQ section */
+
+enum
+{
+        /* traditional */
+	TCA_SFQ_HASH_CLASSIC,
+	TCA_SFQ_HASH_DST,
+	TCA_SFQ_HASH_SRC,
+	TCA_SFQ_HASH_FWMARK,
+	/* conntrack */
+	TCA_SFQ_HASH_CTORIGDST,
+	TCA_SFQ_HASH_CTORIGSRC,
+	TCA_SFQ_HASH_CTREPLDST,
+	TCA_SFQ_HASH_CTREPLSRC,
+	TCA_SFQ_HASH_CTNATCHG,
+};
+
+struct tc_esfq_qopt
+{
+	unsigned	quantum;	/* Bytes per round allocated to flow */
+	int		perturb_period;	/* Period of hash perturbation */
+	__u32		limit;		/* Maximal packets in queue */
+	unsigned	divisor;	/* Hash divisor  */
+	unsigned	flows;		/* Maximal number of flows  */
+	unsigned	hash_kind;	/* Hash function to use for flow identification */
+};
+#endif
+
 /* RED section */
 
 enum {
diff -ur a/include/linux/poll.h b/include/linux/poll.h
--- a/include/linux/poll.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/poll.h	2014-02-17 11:56:06.000000000 +0100
@@ -32,21 +32,46 @@
  */
 typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);
 
+/*
+ * Do not touch the structure directly, use the access functions
+ * poll_does_not_wait() and poll_requested_events() instead.
+ */
 typedef struct poll_table_struct {
-	poll_queue_proc qproc;
-	unsigned long key;
+	poll_queue_proc _qproc;
+	unsigned long _key;
 } poll_table;
 
 static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
 {
-	if (p && wait_address)
-		p->qproc(filp, wait_address, p);
+	if (p && p->_qproc && wait_address)
+		p->_qproc(filp, wait_address, p);
+}
+
+/*
+ * Return true if it is guaranteed that poll will not wait. This is the case
+ * if the poll() of another file descriptor in the set got an event, so there
+ * is no need for waiting.
+ */
+static inline bool poll_does_not_wait(const poll_table *p)
+{
+	return p == NULL || p->_qproc == NULL;
+}
+
+/*
+ * Return the set of events that the application wants to poll for.
+ * This is useful for drivers that need to know whether a DMA transfer has
+ * to be started implicitly on poll(). You typically only want to do that
+ * if the application is actually polling for POLLIN and/or POLLOUT.
+ */
+static inline unsigned long poll_requested_events(const poll_table *p)
+{
+	return p ? p->_key : ~0UL;
 }
 
 static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
 {
-	pt->qproc = qproc;
-	pt->key   = ~0UL; /* all events enabled */
+	pt->_qproc = qproc;
+	pt->_key   = ~0UL; /* all events enabled */
 }
 
 struct poll_table_entry {
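Because the poll_table fields are now private, a driver should use poll_requested_events() and poll_does_not_wait() rather than reading _key or _qproc directly. A minimal sketch of a driver ->poll() method built on these accessors, not part of the patch; struct my_dev and the my_dev_*() helpers are illustrative names (assumes <linux/poll.h> and <linux/fs.h>):

/*
 * Hedged sketch: only start an expensive receive path when the caller
 * actually polled for input, then report readiness as usual.
 */
static unsigned int my_poll(struct file *file, poll_table *wait)
{
	struct my_dev *dev = file->private_data;
	unsigned int mask = 0;

	/* Kick off RX work only if POLLIN/POLLRDNORM was requested. */
	if (poll_requested_events(wait) & (POLLIN | POLLRDNORM))
		my_dev_start_rx(dev);

	poll_wait(file, &dev->waitq, wait);

	if (my_dev_readable(dev))
		mask |= POLLIN | POLLRDNORM;
	if (my_dev_writable(dev))
		mask |= POLLOUT | POLLWRNORM;
	return mask;
}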
diff -ur a/include/linux/printk.h b/include/linux/printk.h
--- a/include/linux/printk.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/printk.h	2014-02-17 11:56:05.000000000 +0100
@@ -6,6 +6,32 @@
 extern const char linux_banner[];
 extern const char linux_proc_banner[];
 
+#ifdef MY_ABC_HERE
+static inline int printk_get_level(const char *buffer)
+{
+	if (buffer[0] == '\001' && buffer[1]) {
+		switch (buffer[1]) {
+		case '0' ... '7':
+		case 'd':	/* KERN_DEFAULT */
+			return buffer[1];
+		}
+	}
+	return 0;
+}
+
+static inline const char *printk_skip_level(const char *buffer)
+{
+	if (printk_get_level(buffer)) {
+		switch (buffer[1]) {
+		case '0' ... '7':
+		case 'd':	/* KERN_DEFAULT */
+			return buffer + 2;
+		}
+	}
+	return buffer;
+}
+#endif
+
 #define KERN_EMERG	"<0>"	/* system is unusable			*/
 #define KERN_ALERT	"<1>"	/* action must be taken immediately	*/
 #define KERN_CRIT	"<2>"	/* critical conditions			*/
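Note that printk_get_level()/printk_skip_level() above key off the '\001' soft-prefix byte rather than the "<n>" strings defined here. A minimal sketch of a consumer that splits such a pre-formatted message, not part of the patch; my_forward_line() is an illustrative name (assumes <linux/printk.h>):

/*
 * Hedged sketch: separate the level byte from the message text using the
 * helpers introduced above.
 */
static void my_forward_line(const char *msg)
{
	int level = printk_get_level(msg);	/* '0'..'7', 'd', or 0 if none */
	const char *text = printk_skip_level(msg);

	if (level)
		pr_info("level %c: %s", level, text);
	else
		pr_info("no level: %s", text);
}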
diff -ur a/include/linux/random.h b/include/linux/random.h
--- a/include/linux/random.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/random.h	2014-02-17 11:56:04.000000000 +0100
@@ -34,6 +34,30 @@
 /* Clear the entropy pool and associated counters.  (Superuser only.) */
 #define RNDCLEARPOOL	_IO( 'R', 0x06 )
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_FIPS_RNG)
+
+/* Size of seed value - equal to AES blocksize */
+#define AES_BLOCK_SIZE_BYTES	16
+#define SEED_SIZE_BYTES			AES_BLOCK_SIZE_BYTES
+/* Size of AES key */
+#define KEY_SIZE_BYTES		16
+
+/* ioctl() structure used by FIPS 140-2 Tests */
+struct rand_fips_test {
+	unsigned char key[KEY_SIZE_BYTES];			/* Input */
+	unsigned char datetime[SEED_SIZE_BYTES];	/* Input */
+	unsigned char seed[SEED_SIZE_BYTES];		/* Input */
+	unsigned char result[SEED_SIZE_BYTES];		/* Output */
+};
+
+/* FIPS 140-2 RNG Variable Seed Test. (Superuser only.) */
+#define RNDFIPSVST	_IOWR('R', 0x10, struct rand_fips_test)
+
+/* FIPS 140-2 RNG Monte Carlo Test. (Superuser only.) */
+#define RNDFIPSMCT	_IOWR('R', 0x11, struct rand_fips_test)
+
+#endif /* #if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_FIPS_RNG) */
+
 struct rand_pool_info {
 	int	entropy_count;
 	int	buf_size;
@@ -61,6 +85,12 @@
 #define HAS_RANDOM_INPUT_WAIT 1
 #endif
 
+#if defined(CONFIG_SYNO_COMCERTO)
+extern void random_input_words(__u32 *buf, size_t wordcount, int ent_count);
+extern int random_input_wait(void);
+#define HAS_RANDOM_INPUT_WAIT 1
+#endif
+
 extern void get_random_bytes(void *buf, int nbytes);
 extern void get_random_bytes_arch(void *buf, int nbytes);
 void generate_random_uuid(unsigned char uuid_out[16]);
@@ -72,10 +102,19 @@
 unsigned int get_random_int(void);
 unsigned long randomize_range(unsigned long start, unsigned long end, unsigned long len);
 
-u32 random32(void);
-void srandom32(u32 seed);
+u32 prandom_u32(void);
+void prandom_bytes(void *buf, int nbytes);
+void prandom_seed(u32 seed);
+
+/*
+ * These macros are preserved for backward compatibility and should be
+ * removed as soon as a transition is finished.
+ */
+#define random32() prandom_u32()
+#define srandom32(seed) prandom_seed(seed)
 
-u32 prandom32(struct rnd_state *);
+u32 prandom_u32_state(struct rnd_state *);
+void prandom_bytes_state(struct rnd_state *state, void *buf, int nbytes);
 
 /*
  * Handle minimum values for seeds
@@ -86,11 +125,11 @@
 }
 
 /**
- * prandom32_seed - set seed for prandom32().
+ * prandom_seed_state - set seed for prandom_u32_state().
  * @state: pointer to state structure to receive the seed.
  * @seed: arbitrary 64-bit value to use as a seed.
  */
-static inline void prandom32_seed(struct rnd_state *state, u64 seed)
+static inline void prandom_seed_state(struct rnd_state *state, u64 seed)
 {
 	u32 i = (seed >> 32) ^ (seed << 10) ^ seed;
 
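The prandom_* renames above keep the existing behaviour: a per-state stream is seeded once and then drawn from deterministically. A minimal sketch using the renamed helpers, not part of the patch; the function name and seed constant are illustrative only (assumes <linux/random.h> and <linux/types.h>):

/*
 * Hedged sketch: fill a buffer with a reproducible pseudo-random test
 * pattern from a private rnd_state.
 */
static void my_fill_test_pattern(u32 *buf, int n)
{
	struct rnd_state state;
	int i;

	prandom_seed_state(&state, 0x853c49e6748fea9bULL);
	for (i = 0; i < n; i++)
		buf[i] = prandom_u32_state(&state);
}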
Only in b/include/linux/rtc: rtc-c2k.h.
diff -ur a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
--- a/include/linux/rtnetlink.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/rtnetlink.h	2014-02-17 11:56:04.000000000 +0100
@@ -624,6 +624,9 @@
 			u32 group, struct nlmsghdr *nlh, gfp_t flags);
 extern void rtnl_set_sk_err(struct net *net, u32 group, int error);
 extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics);
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_COMCERTO)
+extern int rtnetlink_put_metrics_2(struct sk_buff *skb, u32 *metrics, struct dst_entry *dst);
+#endif
 extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst,
 			      u32 id, u32 ts, u32 tsage, long expires,
 			      u32 error);
diff -ur a/include/linux/serial_core.h b/include/linux/serial_core.h
--- a/include/linux/serial_core.h	2013-08-24 11:36:09.000000000 +0200
+++ b/include/linux/serial_core.h	2014-02-17 11:56:03.000000000 +0100
@@ -321,8 +321,8 @@
 #define UPIO_AU			(4)			/* Au1x00 type IO */
 #define UPIO_TSI		(5)			/* Tsi108/109 type IO */
 
-#if defined(CONFIG_SYNO_ARMADA_ARCH)
-#if defined (CONFIG_ARCH_ARMADA370) || defined(CONFIG_ARCH_ARMADA_XP)
+#if defined(CONFIG_SYNO_ARMADA_ARCH) || defined(CONFIG_SYNO_C2K_SERIAL_FIX)
+#if defined (CONFIG_ARCH_ARMADA370) || defined(CONFIG_ARCH_ARMADA_XP) || defined(CONFIG_SYNO_C2K_SERIAL_FIX)
 #define UPIO_DWAPB		(6)
 #define UPIO_DWAPB32		(7)
 #endif
diff -ur a/include/linux/skbuff.h b/include/linux/skbuff.h
--- a/include/linux/skbuff.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/skbuff.h	2014-02-17 11:56:05.000000000 +0100
@@ -420,6 +420,9 @@
 	kmemcheck_bitfield_end(flags1);
 	__be16			protocol;
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	__u32			ipsec_offload;
+#endif
 	void			(*destructor)(struct sk_buff *skb);
 #if defined(CONFIG_SYNO_ARMADA)
 #ifdef CONFIG_NET_SKB_RECYCLE
@@ -562,6 +565,17 @@
 	return __alloc_skb(size, priority, 1, NUMA_NO_NODE);
 }
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_COMCERTO)
+extern struct sk_buff *__alloc_skb_header(unsigned int size, void *data, gfp_t gfp_mask,
+			    int fclone, int node);
+static inline struct sk_buff *alloc_skb_header(unsigned int size,
+					void *data,
+					gfp_t priority)
+{
+	return __alloc_skb_header(size, data, priority, 0, NUMA_NO_NODE);
+}
+#endif
+
 extern void skb_recycle(struct sk_buff *skb);
 extern bool skb_recycle_check(struct sk_buff *skb, int skb_size);
 
diff -ur a/include/linux/slab_def.h b/include/linux/slab_def.h
--- a/include/linux/slab_def.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/slab_def.h	2014-02-17 11:56:05.000000000 +0100
@@ -148,7 +148,11 @@
 		return NULL;
 found:
 #ifdef CONFIG_ZONE_DMA
+#if defined(CONFIG_SYNO_COMCERTO)
+		if (flags & __GFP_DMA)
+#else
 		if (flags & GFP_DMA)
+#endif
 			cachep = malloc_sizes[i].cs_dmacachep;
 		else
 #endif
Only in b/include/linux/spi: comcerto_spi.h.
Only in b/include/linux: spi2.
diff -ur a/include/linux/stat.h b/include/linux/stat.h
--- a/include/linux/stat.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/stat.h	2014-02-17 11:56:05.000000000 +0100
@@ -140,15 +140,4 @@
 #endif
 #endif
 
-#ifdef  MY_ABC_HERE
-typedef struct _tag_mmap_arg_struct {
-	unsigned long addr;
-	unsigned long len;
-	unsigned long prot;
-	unsigned long flags;
-	unsigned long fd;
-	unsigned long pgoff;
-} SYNO_MMAP_ARG;
-#endif
-
 #endif
diff -ur a/include/linux/syno_acl.h b/include/linux/syno_acl.h
--- a/include/linux/syno_acl.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/syno_acl.h	2014-02-17 11:56:07.000000000 +0100
@@ -5,6 +5,7 @@
 #ifndef __LINUX_SYNO_ACL_H
 #define __LINUX_SYNO_ACL_H
 
+#include <linux/slab.h>
 #include <linux/syno_acl_xattr_ds.h>
 
 /* e_tag entry in struct syno_acl_entry */
@@ -20,18 +21,87 @@
 #define SYNO_ACL_DENY		(0x02)
 
 struct syno_acl_entry {
-    unsigned short  e_tag; 
-    unsigned int    e_id;
-    unsigned int	e_perm;
-    unsigned short  e_inherit;
-    unsigned short  e_allow;
-	unsigned int  	e_level;
+	unsigned short          e_tag;
+	unsigned int            e_id;
+	unsigned int            e_perm;
+	unsigned short          e_inherit;
+	unsigned short          e_allow;
+	unsigned int            e_level;
 };
 
 struct syno_acl { 
-    atomic_t                a_refcount;
-    unsigned int            a_count;
-    struct syno_acl_entry   a_entries[0];
+	atomic_t                a_refcount;
+	unsigned int            a_count;
+	struct syno_acl_entry   a_entries[0];
 };
 
+#define FOREACH_SYNOACL_ENTRY(pa, acl, pe) \
+	for (pa = (acl)->a_entries, pe = pa + (acl)->a_count; pa < pe; pa++)
+
+
+/*
+ * Duplicate an ACL handle.
+ */
+static inline struct syno_acl *
+syno_acl_dup(struct syno_acl *acl)
+{
+	if (acl)
+		atomic_inc(&acl->a_refcount);
+	return acl;
+}
+
+/*
+ * Free an ACL handle.
+ */
+static inline void
+syno_acl_release(struct syno_acl *acl)
+{
+	if (acl && atomic_dec_and_test(&acl->a_refcount))
+		kfree(acl);
+}
+
+extern struct syno_acl *syno_acl_alloc(int count, gfp_t flags);
+extern int syno_acl_valid(const struct syno_acl *);
+extern struct syno_acl *syno_acl_realloc(struct syno_acl *acl, unsigned int counts, gfp_t flags);
+extern struct syno_acl *syno_acl_clone(const struct syno_acl *acl, gfp_t flags);
+
+extern int syno_acl_to_xattr(const struct syno_acl *acl, void *buffer, size_t size);
+extern struct syno_acl *syno_acl_from_xattr(const void *value, size_t size);
+
+static inline struct syno_acl *get_cached_syno_acl(struct inode *inode)
+{
+	struct syno_acl **p, *acl;
+
+	p = &inode->i_syno_acl;
+	acl = ACCESS_ONCE(*p);
+	if (acl) {
+		spin_lock(&inode->i_lock);
+		acl = *p;
+		if (acl != ACL_NOT_CACHED)
+			acl = syno_acl_dup(acl);
+		spin_unlock(&inode->i_lock);
+	}
+	return acl;
+}
+
+static inline void set_cached_syno_acl(struct inode *inode, struct syno_acl *acl)
+{
+	struct syno_acl *old = NULL;
+
+	spin_lock(&inode->i_lock);
+	old = inode->i_syno_acl;
+	inode->i_syno_acl = acl ? syno_acl_dup(acl) : ACL_NOT_CACHED;
+	spin_unlock(&inode->i_lock);
+
+	if (old != ACL_NOT_CACHED)
+		syno_acl_release(old);
+}
+
+
+extern int SYNOACLModuleStatusGet(const char *szModName);
+extern void UseACLModule(const char *szModName, int isGet);
+
+#define SYNOACLModuleGet(mod_name) do { UseACLModule(mod_name, 1); } while (0)
+#define SYNOACLModulePut(mod_name) do { UseACLModule(mod_name, 0); } while (0)
+
 #endif  /* __LINUX_SYNO_ACL_H */
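get_cached_syno_acl()/set_cached_syno_acl() above mirror the generic POSIX ACL caching pattern: ACL_NOT_CACHED marks a cold cache and NULL a cached "no ACL". A minimal sketch of how a filesystem getter might combine them, not part of the patch; myfs_load_syno_acl() is a hypothetical on-disk reader (assumes <linux/fs.h> and <linux/err.h>):

/*
 * Hedged sketch: return a referenced syno_acl, consulting the inode cache
 * first and populating it on a miss.
 */
static struct syno_acl *myfs_get_syno_acl(struct inode *inode)
{
	struct syno_acl *acl = get_cached_syno_acl(inode);

	if (acl != ACL_NOT_CACHED)
		return acl;	/* cache hit; may legitimately be NULL */

	acl = myfs_load_syno_acl(inode);
	if (!IS_ERR(acl))
		set_cached_syno_acl(inode, acl);
	return acl;
}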
diff -ur a/include/linux/syno_acl_xattr_ds.h b/include/linux/syno_acl_xattr_ds.h
--- a/include/linux/syno_acl_xattr_ds.h	2013-08-24 11:36:09.000000000 +0200
+++ b/include/linux/syno_acl_xattr_ds.h	2014-02-17 11:56:03.000000000 +0100
@@ -15,6 +15,7 @@
 
 /* Extended attribute names */
 #define SYNO_ACL_XATTR_ACCESS	"system.syno_acl_self"
+#define SYNO_ACL_XATTR_ACCESS_NOPERM	"system.syno_acl_noperm_self"
 #define SYNO_ACL_XATTR_INHERIT	"system.syno_acl_inherit"
 #define SYNO_ACL_XATTR_PSEUDO_INHERIT_ONLY	"system.syno_acl_pseudo_inherit_only"
 
diff -ur a/include/linux/synobios.h b/include/linux/synobios.h
--- a/include/linux/synobios.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/synobios.h	2014-02-17 11:56:04.000000000 +0100
@@ -3,6 +3,10 @@
 #define __SYNOBIOS_OEM_H_
 
 #include <linux/syno.h>
+#ifdef MY_ABC_HERE
+#include <linux/string.h>
+extern char gszSynoHWVersion[];
+#endif
 
 #ifndef BCD_TO_BIN
 #define BCD_TO_BIN(val) ((val)=((val)&15) + ((val)>>4)*10)
@@ -186,6 +190,14 @@
 #define SYNO_EVENT_ERROR_FS 0x2900
 #endif
 
+#ifdef SYNO_SATA_ERROR_REPORT
+#define SYNO_EVENT_SATA_ERROR_REPORT 0x2a00
+#endif
+
+#ifdef MY_ABC_HERE
+#define SYNO_EVENT_WAKE_FROM_DEEP_SLEEP 0x2b00
+#endif
+
 #define SYNO_EVENT_BACK_TEMP_CRITICAL   0x4004
 #define SYNO_EVENT_BACK_TEMP_HIGH       0x4003
 #define SYNO_EVENT_BACK_TEMP_HEAT       0x4002
@@ -353,7 +365,6 @@
 	DISK_LED_ORANGE_SOLID,
 	DISK_LED_ORANGE_BLINK,
 	DISK_LED_GREEN_BLINK,
-	DISK_LED_BLUE,
 } SYNO_DISK_LED;
 
 typedef struct _tag_DiskLedStatus {
@@ -429,7 +440,9 @@
 #define FAN_SPEED_SHIFT_DUTY_GET(speed)		  (((speed) - FAN_SPEED_PWM_FORMAT_SHIFT) & 0xFF) /* duty cycle is set in the first 8bit, so get it in first 8bit */
 
 /* if curspeed is stop and nxtspeed is lower than "high", return true. else return false*/
-#define IS_FAN_NEED_TO_SPIN_FASTER_FIRST(NxtSpeed, CurSpeed) (((NxtSpeed) < FAN_SPEED_PWM_FORMAT_SHIFT) ? ( FAN_SPEED_HIGH > (NxtSpeed) && FAN_SPEED_STOP == (CurSpeed) ) : (65 > FAN_SPEED_SHIFT_DUTY_GET(NxtSpeed) && (0 == FAN_SPEED_SHIFT_DUTY_GET(CurSpeed) ||  FAN_SPEED_STOP == CurSpeed)))
+#define IS_FAN_NEED_TO_SPIN_FASTER_FIRST(NxtSpeed, CurSpeed) (((NxtSpeed) < FAN_SPEED_PWM_FORMAT_SHIFT) ? ( (FAN_SPEED_HIGH > (NxtSpeed) && FAN_SPEED_STOP == (CurSpeed) ) || (FAN_SPEED_TEST_5 > (NxtSpeed) && FAN_SPEED_TEST_0 == (CurSpeed) ) ) : (65 > FAN_SPEED_SHIFT_DUTY_GET(NxtSpeed) && (0 == FAN_SPEED_SHIFT_DUTY_GET(CurSpeed) ||  IS_FAN_SET_TO_STOP(CurSpeed))))
+
+#define IS_FAN_SET_TO_STOP(CurSpeed) ((FAN_SPEED_STOP == CurSpeed) || (FAN_SPEED_TEST_0 == CurSpeed))
 
 // Synology Disk Station Brand
 enum {
@@ -560,13 +573,6 @@
 	RTC_MV,
 }RTC_TYPE;
 
-//Support 
-typedef enum {
-	S_LED_UNKNOWN,
-	S_LED_NORMAL,
-	S_LED_BREATH,
-}STATUS_LED_T;
-
 typedef enum {
 	FAN_RPM_RPT_UNKNOWN,
 	FAN_RPM_RPT_YES,
@@ -580,8 +586,9 @@
 }LCM_T;
 
 typedef enum {
-	WIFI_WPS_NO = 0x00,
+	WIFI_WPS_214air = 0x0, /* GPIO PIN 0  */
 	WIFI_WPS_213air = 0x28, /* GPIO PIN 40  */
+	WIFI_WPS_NO = 0xFE,
 	WIFI_WPS_UNKNOWN = 0xFF
 }WIFI_WPS_T;
 
@@ -605,11 +612,16 @@
 
 typedef enum {
 	HIBER_LED_UNKNOWN,
-	HIBER_LED_NORMAL,
-	HIBER_LED_ALLOUT,
-	HIBER_LED_EXCEPTPWR,
+	HIBER_LED_STATUS_OFF,    /* status led off, others don't care */
+	HIBER_LED_STATUS_BREATH, /* status led breathing */
+	HIBER_LED_POWER_ONLY,    /* power led on, others off */
 }HIBERNATE_LED_T;
 
+/* For backward compatibility */
+#define HIBER_LED_NORMAL HIBER_LED_STATUS_OFF
+#define HIBER_LED_ALLOUT HIBER_LED_STATUS_BREATH
+#define HIBER_LED_EXCEPTPWR HIBER_LED_POWER_ONLY
+
 typedef enum {
 	RTC_NOT_NEED_TO_CORRECT = 0x00,
 	RTC_SEIKO_CORR_DEFAULT  = 0x03, /* -5.62 sec/day */
@@ -618,11 +630,6 @@
 }RTC_CORRECTION_T;
 
 typedef enum {
-	FAN_FAIL_ADJUST_NO,
-	FAN_FAIL_ADJUST_FULL,
-}FAN_FAIL_ADJUST_T;
-
-typedef enum {
 	MICROP_ID_710p = 0x31, /* '1' */
 	MICROP_ID_411p = 0x33, /* '3' 411+II is the same*/
 	MICROP_ID_1010p = 0x32, /* '2' */
@@ -654,9 +661,11 @@
 	MICROP_ID_RS814p = 0x55, /* 'U' */
 	MICROP_ID_RS814rpp = 0x56, /* 'V' */
 	MICROP_ID_RS3614xsp = 0x57, /* 'W' RS3614xsp */
-	MICROP_ID_RS3614xs = 0x57, /* 'W' RS3614xs */
-	MICROP_ID_RS3614rpxs = 0x57, /* 'W' RS3614rpxs */
-	MICROP_ID_1814p = 0x58, /* 'X' */
+	MICROP_ID_RS3614xs = 0x5B, /* 'W' RS3614xs */
+	MICROP_ID_RS3614rpxs = 0x5C, /* 'W' RS3614rpxs */
+	MICROP_ID_9615xsp = 0x4d, /* 'M' Temporarily using the same microp ID as 10613 */
+	MICROP_ID_9615vmxsp = 0x4d, /* 'M' Temporarily using the same microp ID as 10613 */
+	MICROP_ID_DS2414xs = 0x57, /* 'W' DS2414xs */
 	MICROP_ID_UNKNOW = 0xFF,
 } SYNO_MICROP_ID;
 
@@ -673,6 +682,27 @@
 	UNKNOW_DISK_DENO = 0x00,
 } GROUP_WAKE_CONFIG_T;
 
+typedef enum {
+	CPU_UNKNOWN,
+	CPU_E3_1230v2,
+	CPU_I3_2100,
+	CPU_I3_4130,
+	CPU_D410,
+	CPU_D425,
+	CPU_D510,
+	CPU_D525,
+	CPU_D2700,
+	CPU_CE5335,
+	CPU_88F6281,
+	CPU_88F6282,
+	CPU_88F6702,
+	CPU_88F6707,
+	CPU_MV78230,
+	CPU_8533e,
+	CPU_P1022,
+	CPU_C2000,
+} CPU_ARCH_INFO_T;
+
 /**
  * This structure is used to store types of each module
  * in different DS models, including module fan type,
@@ -692,7 +722,6 @@
 	POWER_IN_SEQ   pis_type          :4;
 	RTC_TYPE       rtc_type          :4;
 	CPUTMP_T       cputmp_type       :2;
-	STATUS_LED_T   status_led_type   :2;
 	FAN_RPM_RPT_T  fan_rpm_rpt_type  :2;
 	LCM_T          lcm_type		 :2;
 	WIFI_WPS_T		wifi_wps_type	 :8;
@@ -700,9 +729,9 @@
 	CARDREADER_T   has_cardreader    :2;
 	HIBERNATE_LED_T  hibernate_led   :2;
 	RTC_CORRECTION_T rtc_corr_value  :8;
-	FAN_FAIL_ADJUST_T fan_fail_adjust:2;
 	SYNO_MICROP_ID microp_id         :8;
 	GROUP_WAKE_CONFIG_T group_wake_config :8;
+	CPU_ARCH_INFO_T cpu_arch_info    :8;
 } __attribute__((packed)) module_t;
 
 #define HW_DS107e      "DS107e"
@@ -779,6 +808,7 @@
 #define HW_RS3614xs    "RS3614xs"      //"RS3614xs"
 #define HW_RS3614rpxs    "RS3614rpxs"      //"RS3614rpxs"
 #define HW_RS3614xsp    "RS3614xs+"      //"RS3614xs+"
+#define HW_DS2414xs    "DS2414xs"      //"DS2414xs"
 #define HW_DS111j      "DS111j"        //"DS111j"
 #define HW_DS212       "DS212"         //"DS212v10"
 #define HW_DS413       "DS413"         //"DS413"
@@ -793,6 +823,7 @@
 #define HW_RS2414p     "RS2414+"       //"RS2414+"
 #define HW_RS2414rpp   "RS2414rp+"     //"RS2414rp+"
 #define HW_DS2413p     "DS2413+"       //"DS2413+"
+#define HW_DS2414p     "DS2414+"       //"DS2414+"
 #define HW_RS212       "RS212"         //"RS212"
 #define HW_DS212jv10   "DS212j"        //"DS212j"
 #define HW_DS212jv20   "DS212jv20"     //"DS212j"
@@ -807,6 +838,7 @@
 #define HW_DS112slim   "DS112slim"     //"DS112slim"
 #define HW_DS413jv10   "DS413jv10"     //"DS413jv10"
 #define HW_DS414jv10   "DS414jv10"     //"DS414jv10"
+#define HW_DS214airv10   "DS214airv10"     //"DS214airv10"
 #define HW_DS213pv10   "DS213pv10"     //"DS213pv10"
 #define HW_DS213airv10 "DS213airv10"   //"DS213airv10"
 #define HW_DS213v10    "DS213v10"      //"DS213v10"
@@ -824,11 +856,13 @@
 #define HW_DS414v10    "DS414v10"
 #define HW_RS814v10    "RS814v10"
 #define HW_DS114p      "DS114+"      //"DS114+"
+#define HW_RS9615xsp  "RS9615xs+"    //"RS9615xs+"
+#define HW_RS9615vmxsp  "RS9615vmxs+"    //"RS9615vmxs+"
 #define HW_DS714v10    "DS714v10"
 #define HW_RS814p      "RS814+"        //"RS814+"
 #define HW_RS814rpp    "RS814rp+"      //"RS814rp+"
-#define HW_DS1814p     "DS1814+"       //"DS1814+"
 #define HW_DS214play      "DS214play"
+#define HW_DS414slim   "DS414slim"    //DS414slim
 #define HW_UNKNOWN     "DSUnknown"
 									    
 typedef struct _tag_HwCapability {
@@ -902,6 +936,7 @@
 	MODEL_RS3614xs,
 	MODEL_RS3614rpxs,
 	MODEL_RS3614xsp,
+	MODEL_DS2414xs,
 	MODEL_RS411,
 	MODEL_DS111j,
 	MODEL_RS2211p,
@@ -930,6 +965,7 @@
 	MODEL_DS112slim,
 	MODEL_DS413j,	//90
 	MODEL_DS414j,
+	MODEL_DS214air,
 	MODEL_DS213p,
 	MODEL_DS213air,
 	MODEL_DS213,
@@ -948,12 +984,15 @@
 	MODEL_DS414,
 	MODEL_RS814,
 	MODEL_DS114p,
+	MODEL_RS9615xsp,  //110
+	MODEL_RS9615vmxsp,
 	MODEL_DS714,
-	MODEL_DS214se,
 	MODEL_RS814p,
 	MODEL_RS814rpp,
-	MODEL_DS1814p,
+	MODEL_DS214se,
 	MODEL_DS214play,
+	MODEL_DS2414p,
+	MODEL_DS414slim,
 	MODEL_INVALID
 } PRODUCT_MODEL;
 
@@ -1328,7 +1367,8 @@
 #define SYNO_SYS_BOOT                   0x4001
 #define SYNO_SYS_RUN                    0x4002
 #define SYNO_SYS_SHUTDOWN               0x4003
-#define SYNO_SYS_NODISK                 0x4004
+#define SYNO_SYS_NO_SYSTEM              0x4004
+#define SYNO_SYS_NODISK                 SYNO_SYS_NO_SYSTEM  // NODISK is alias of NO_SYSTEM
 #define SYNO_SYS_WAIT_RESET             0x4005
 #define SYNO_SYS_FACTORY_DEFAULT        0x4006
 #define SYNO_LED_USB_COPY_NONE          0x5100
@@ -1453,6 +1493,19 @@
 
 
 PRODUCT_MODEL synobios_getmodel(void);
-
+#ifdef MY_ABC_HERE
+static inline int syno_is_hw_version(const char *hw_version)
+{
+	if (NULL == hw_version) {
+		return 0;
+	}
+
+	if (0 == strncmp(gszSynoHWVersion, hw_version, strlen(hw_version))) {
+		return 1;
+	} else {
+		return 0;
+	}
+}
+#endif
 #endif
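
A hedged usage sketch for syno_is_hw_version() above (illustrative only), assuming gszSynoHWVersion has already been filled in by platform setup code; note that the helper does a prefix match against the running hardware version string.

	if (syno_is_hw_version(HW_DS414v10)) {
		/* apply a DS414-specific quirk here */
	}
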
 
diff -ur a/include/linux/syno.h b/include/linux/syno.h
--- a/include/linux/syno.h	2013-08-24 11:57:08.000000000 +0200
+++ b/include/linux/syno.h	2014-02-17 12:27:54.000000000 +0100
@@ -37,6 +37,12 @@
 #endif
 
 #if 1
+#define SYNO_SATA_DOM_VENDOR	"SATADOM "
+#define SYNO_SATA_DOM_MODEL	"WD5002ABYS-01B1B0"
+#define SYNO_DUALHEAD_SYSTEM_DEVICE_PATH  "/dev/synoboot4"
+#endif
+
+#if 1
 #ifdef MY_ABC_HERE
 #define SYNO_MAX_SWITCHABLE_NET_DEVICE 8
 #define SYNO_NET_DEVICE_ENCODING_LENGTH 6
@@ -62,7 +68,7 @@
 #define SYNO_MD_CHUNK_SIZE 65536
 #define SYNO_FIX_MD_RESIZE_BUSY_LOOP 5
 #if	defined(MY_ABC_HERE) || defined(SYNO_BADSECTOR_TEST)
-#if 1
+#if 1 && (1)
 #define SYNO_MAX_INTERNAL_DISK 19
 #else
 #define SYNO_MAX_INTERNAL_DISK	15
@@ -111,25 +117,29 @@
 
 #endif
 
-#define F_CLEAR_ARCHIVE     513
-#define F_SETSMB_ARCHIVE    514
-#define F_SETSMB_HIDDEN     515
-#define F_SETSMB_SYSTEM     516
-#define F_CLRSMB_ARCHIVE    517
-#define F_CLRSMB_HIDDEN     518
-#define F_CLRSMB_SYSTEM     519
-#define F_CLEAR_S3_ARCHIVE  520
-#ifdef MY_ABC_HERE
-#define F_CLRSMB_READONLY   		521
-#define F_SETSMB_READONLY   		522
-#define F_CLRACL_INHERIT    		523
-#define F_SETACL_INHERIT    		524
-#define F_CLRACL_HAS_ACL   			525
-#define F_SETACL_HAS_ACL   			526
-#define F_CLRACL_SUPPORT   			527
-#define F_SETACL_SUPPORT   			528
-#define F_CLRACL_OWNER_IS_GROUP   	529
-#define F_SETACL_OWNER_IS_GROUP   	530
+#define SYNO_FCNTL_BASE             513
+#define F_CLEAR_ARCHIVE             (SYNO_FCNTL_BASE + 0)
+#define F_SETSMB_ARCHIVE            (SYNO_FCNTL_BASE + 1)
+#define F_SETSMB_HIDDEN             (SYNO_FCNTL_BASE + 2)
+#define F_SETSMB_SYSTEM             (SYNO_FCNTL_BASE + 3)
+#define F_CLRSMB_ARCHIVE            (SYNO_FCNTL_BASE + 4)
+#define F_CLRSMB_HIDDEN             (SYNO_FCNTL_BASE + 5)
+#define F_CLRSMB_SYSTEM             (SYNO_FCNTL_BASE + 6)
+#define F_CLEAR_S3_ARCHIVE          (SYNO_FCNTL_BASE + 7)
+#ifdef MY_ABC_HERE
+#define F_CLRSMB_READONLY           (SYNO_FCNTL_BASE + 8)
+#define F_SETSMB_READONLY           (SYNO_FCNTL_BASE + 9)
+#define F_CLRACL_INHERIT            (SYNO_FCNTL_BASE + 10)
+#define F_SETACL_INHERIT            (SYNO_FCNTL_BASE + 11)
+#define F_CLRACL_HAS_ACL            (SYNO_FCNTL_BASE + 12)
+#define F_SETACL_HAS_ACL            (SYNO_FCNTL_BASE + 13)
+#define F_CLRACL_SUPPORT            (SYNO_FCNTL_BASE + 14)
+#define F_SETACL_SUPPORT            (SYNO_FCNTL_BASE + 15)
+#define F_CLRACL_OWNER_IS_GROUP     (SYNO_FCNTL_BASE + 16)
+#define F_SETACL_OWNER_IS_GROUP     (SYNO_FCNTL_BASE + 17)
+#define SYNO_FCNTL_LAST             F_SETACL_OWNER_IS_GROUP
+#else
+#define SYNO_FCNTL_LAST             F_CLEAR_S3_ARCHIVE
 #endif
 
 #else
@@ -141,7 +151,6 @@
 #define SYNO_SMB_PSTRING_LEN 1024
 #endif
 
-#define SYNO_EXT4_SYNC_DALLOC_RETRY  100
 #ifdef MY_ABC_HERE
 #define MAX_CHANNEL_RETRY       2
 #define CHANNEL_RETRY_INTERVAL  (3*HZ)
@@ -184,5 +193,9 @@
 
 #endif
 
+#if 1 && defined(MY_ABC_HERE)
+#define SYNO_GLUSTER_FS
+#endif
+
 #endif
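
The rebased fcntl command macros above keep the old hard-coded values (513..530). A small illustrative check (hypothetical helper, BUILD_BUG_ON() from <linux/bug.h>) makes that arithmetic explicit:

	static inline void syno_fcntl_values_sketch(void)
	{
		/* SYNO_FCNTL_BASE + 0 is the old hard-coded 513 ... */
		BUILD_BUG_ON(F_CLEAR_ARCHIVE != 513);
		/* ... and SYNO_FCNTL_BASE + 17 is the old hard-coded 530. */
		BUILD_BUG_ON(F_SETACL_OWNER_IS_GROUP != 513 + 17);
	}
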
 
diff -ur a/include/linux/synosata.h b/include/linux/synosata.h
--- a/include/linux/synosata.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/synosata.h	2014-02-17 11:56:07.000000000 +0100
@@ -436,6 +436,36 @@
 	/* add other port multiplier here */
 }
 
+/**
+ * Init eunit deepsleep indicator
+ *
+ * @param vendor  [IN] PMP vendor
+ * @param devid   [IN] device id
+ * @param pPM_pkg [IN] Store the result. Should not be NULL.
+ * @param blCLR   [IN] clean or not
+ *
+ * return 0: not support deepsleep indicator
+ *        1: support deepsleep indicator
+ */
+static inline int
+syno_pm_deepsleep_indicator_pkg_init(unsigned short vendor, unsigned short devid, SYNO_PM_PKG *pPKG, unsigned char blCLR)
+{
+	/* do not check parameters, caller should do it */
+	int iRet = 0;
+
+	memset(pPKG, 0, sizeof(*pPKG));
+	if (syno_pm_is_9705(vendor, devid)) {
+		if (blCLR) {
+			pPKG->var = GPIO_9705_PKG_INIT(1,0);
+		} else {
+			pPKG->var = GPIO_9705_PKG_INIT(1,0x80);
+		}
+		iRet = 1;
+	}
+	/* add other port multiplier here */
+	return iRet;
+}
+
 static inline void 
 syno_pm_enable_powerbtn_pkg_init(unsigned short vendor, unsigned short devid, SYNO_PM_PKG *pPKG)
 {
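
A minimal usage sketch for syno_pm_deepsleep_indicator_pkg_init() above, inside a hypothetical caller that already knows the PMP vendor/device id (the GPIO write path that consumes pkg.var is assumed and not shown):

	SYNO_PM_PKG pkg;

	/* blCLR = 0: ask for the "set indicator" package rather than "clear". */
	if (!syno_pm_deepsleep_indicator_pkg_init(vendor, devid, &pkg, 0))
		return;	/* this port multiplier has no deepsleep indicator */

	/* pkg.var now holds the 9705 GPIO package; hand it to the PMP write path. */
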
diff -ur a/include/linux/syscalls.h b/include/linux/syscalls.h
--- a/include/linux/syscalls.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/syscalls.h	2014-02-17 11:56:08.000000000 +0100
@@ -826,13 +826,6 @@
 				  const char  __user *pathname);
 asmlinkage long sys_syncfs(int fd);
 
-#ifdef CONFIG_IA32_EMULATION
-#ifdef MY_ABC_HERE
-asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot,
-						 unsigned long flags, unsigned long fd, unsigned long off);
-#endif
-#endif
-
 int kernel_execve(const char *filename, const char *const argv[], const char *const envp[]);
 
 #ifdef CONFIG_IA32_EMULATION
diff -ur a/include/linux/usb/ch11.h b/include/linux/usb/ch11.h
--- a/include/linux/usb/ch11.h	2013-08-24 11:36:09.000000000 +0200
+++ b/include/linux/usb/ch11.h	2014-02-17 11:56:03.000000000 +0100
@@ -111,7 +111,7 @@
 #define USB_PORT_STAT_TEST_MODE         0x8000
 #endif
 
-#if defined(MY_DEF_HERE) || defined(MY_ABC_HERE)
+#if defined(MY_DEF_HERE) || defined(MY_ABC_HERE) || defined(MY_ABC_HERE)
 enum XHCI_SPECIAL_RESET_MODE{
 	XHCI_SPECIAL_RESET_PAUSE = 0, // enable and pause
 	XHCI_SPECIAL_RESET_RUN, // enable and run
@@ -216,6 +216,17 @@
 #define USB_DT_HUB_NONVAR_SIZE		7
 #define USB_DT_SS_HUB_SIZE              12
 
+/*
+ * Hub Device descriptor
+ * USB Hub class device protocols
+ */
+
+#define USB_HUB_PR_FS		0 /* Full speed hub */
+#define USB_HUB_PR_HS_NO_TT	0 /* Hi-speed hub without TT */
+#define USB_HUB_PR_HS_SINGLE_TT	1 /* Hi-speed hub with single TT */
+#define USB_HUB_PR_HS_MULTI_TT	2 /* Hi-speed hub with multiple TT */
+#define USB_HUB_PR_SS		3 /* Super speed hub */
+
 struct usb_hub_descriptor {
 	__u8  bDescLength;
 	__u8  bDescriptorType;
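
An illustrative helper (hypothetical, not in the patch) showing how the new USB_HUB_PR_* constants are meant to be compared against a hub's bDeviceProtocol; note that USB_HUB_PR_FS and USB_HUB_PR_HS_NO_TT share the value 0, so they cannot be separate case labels:

	static const char *hub_protocol_name(__u8 bDeviceProtocol)
	{
		switch (bDeviceProtocol) {
		case USB_HUB_PR_FS:		/* also USB_HUB_PR_HS_NO_TT */
			return "full speed, or hi-speed without TT";
		case USB_HUB_PR_HS_SINGLE_TT:
			return "hi-speed, single TT";
		case USB_HUB_PR_HS_MULTI_TT:
			return "hi-speed, multiple TTs";
		case USB_HUB_PR_SS:
			return "super speed";
		default:
			return "unknown";
		}
	}
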
diff -ur a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h
--- a/include/linux/usb/hcd.h	2013-08-24 11:36:09.000000000 +0200
+++ b/include/linux/usb/hcd.h	2014-02-17 11:56:03.000000000 +0100
@@ -349,6 +349,9 @@
 		 */
 	int	(*update_device)(struct usb_hcd *, struct usb_device *);
 	int	(*set_usb2_hw_lpm)(struct usb_hcd *, struct usb_device *, int);
+	int (*update_uas_device)(struct usb_hcd *, struct usb_device *, int);
+	void	(*stop_endpoint)(struct usb_hcd *, struct usb_device *,
+				struct usb_host_endpoint *);
 };
 
 extern int usb_hcd_link_urb_to_ep(struct usb_hcd *hcd, struct urb *urb);
diff -ur a/include/linux/usb.h b/include/linux/usb.h
--- a/include/linux/usb.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/usb.h	2014-02-17 11:56:04.000000000 +0100
@@ -531,6 +531,9 @@
 extern int usb_reset_device(struct usb_device *dev);
 extern void usb_queue_reset_device(struct usb_interface *dev);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+extern struct usb_device *usb_find_device_by_name(const char *name);
+#endif
 
 /* USB autosuspend and autoresume */
 #ifdef CONFIG_USB_SUSPEND
diff -ur a/include/linux/virtio.h b/include/linux/virtio.h
--- a/include/linux/virtio.h	2013-08-24 11:36:10.000000000 +0200
+++ b/include/linux/virtio.h	2014-02-17 11:56:04.000000000 +0100
@@ -25,71 +25,19 @@
 	void *priv;
 };
 
-/**
- * operations for virtqueue
- * virtqueue_add_buf: expose buffer to other end
- *	vq: the struct virtqueue we're talking about.
- *	sg: the description of the buffer(s).
- *	out_num: the number of sg readable by other side
- *	in_num: the number of sg which are writable (after readable ones)
- *	data: the token identifying the buffer.
- *	gfp: how to do memory allocations (if necessary).
- *      Returns remaining capacity of queue (sg segments) or a negative error.
- * virtqueue_kick: update after add_buf
- *	vq: the struct virtqueue
- *	After one or more add_buf calls, invoke this to kick the other side.
- * virtqueue_get_buf: get the next used buffer
- *	vq: the struct virtqueue we're talking about.
- *	len: the length written into the buffer
- *	Returns NULL or the "data" token handed to add_buf.
- * virtqueue_disable_cb: disable callbacks
- *	vq: the struct virtqueue we're talking about.
- *	Note that this is not necessarily synchronous, hence unreliable and only
- *	useful as an optimization.
- * virtqueue_enable_cb: restart callbacks after disable_cb.
- *	vq: the struct virtqueue we're talking about.
- *	This re-enables callbacks; it returns "false" if there are pending
- *	buffers in the queue, to detect a possible race between the driver
- *	checking for more work, and enabling callbacks.
- * virtqueue_enable_cb_delayed: restart callbacks after disable_cb.
- *	vq: the struct virtqueue we're talking about.
- *	This re-enables callbacks but hints to the other side to delay
- *	interrupts until most of the available buffers have been processed;
- *	it returns "false" if there are many pending buffers in the queue,
- *	to detect a possible race between the driver checking for more work,
- *	and enabling callbacks.
- * virtqueue_detach_unused_buf: detach first unused buffer
- * 	vq: the struct virtqueue we're talking about.
- * 	Returns NULL or the "data" token handed to add_buf
- * virtqueue_get_vring_size: return the size of the virtqueue's vring
- *	vq: the struct virtqueue containing the vring of interest.
- *	Returns the size of the vring.
- *
- * Locking rules are straightforward: the driver is responsible for
- * locking.  No two operations may be invoked simultaneously, with the exception
- * of virtqueue_disable_cb.
- *
- * All operations can be called in any context.
- */
-
-int virtqueue_add_buf_gfp(struct virtqueue *vq,
-			  struct scatterlist sg[],
-			  unsigned int out_num,
-			  unsigned int in_num,
-			  void *data,
-			  gfp_t gfp);
-
-static inline int virtqueue_add_buf(struct virtqueue *vq,
-				    struct scatterlist sg[],
-				    unsigned int out_num,
-				    unsigned int in_num,
-				    void *data)
-{
-	return virtqueue_add_buf_gfp(vq, sg, out_num, in_num, data, GFP_ATOMIC);
-}
+int virtqueue_add_buf(struct virtqueue *vq,
+		      struct scatterlist sg[],
+		      unsigned int out_num,
+		      unsigned int in_num,
+		      void *data,
+		      gfp_t gfp);
 
 void virtqueue_kick(struct virtqueue *vq);
 
+bool virtqueue_kick_prepare(struct virtqueue *vq);
+
+void virtqueue_notify(struct virtqueue *vq);
+
 void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len);
 
 void virtqueue_disable_cb(struct virtqueue *vq);
@@ -144,8 +92,14 @@
 	const unsigned int *feature_table;
 	unsigned int feature_table_size;
 	int (*probe)(struct virtio_device *dev);
+	void (*scan)(struct virtio_device *dev);
 	void (*remove)(struct virtio_device *dev);
 	void (*config_changed)(struct virtio_device *dev);
+#ifdef CONFIG_PM
+	int (*freeze)(struct virtio_device *dev);
+	int (*thaw)(struct virtio_device *dev);
+	int (*restore)(struct virtio_device *dev);
+#endif
 };
 
 int register_virtio_driver(struct virtio_driver *drv);
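
The virtqueue_kick_prepare()/virtqueue_notify() pair added above splits the old virtqueue_kick() so a driver can do the cheap bookkeeping under its own lock and the possibly expensive host notification outside of it. A hedged sketch, where vblk_lock, flags, sg, out, in and buf all belong to the hypothetical caller:

	bool notify;

	spin_lock_irqsave(&vblk_lock, flags);	/* driver's own lock */
	err = virtqueue_add_buf(vq, sg, out, in, buf, GFP_ATOMIC);
	notify = virtqueue_kick_prepare(vq);
	spin_unlock_irqrestore(&vblk_lock, flags);

	if (notify)
		virtqueue_notify(vq);	/* may trap to the host; done unlocked */
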
diff -ur a/include/linux/virtio_ids.h b/include/linux/virtio_ids.h
--- a/include/linux/virtio_ids.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/virtio_ids.h	2014-02-17 11:56:06.000000000 +0100
@@ -34,6 +34,7 @@
 #define VIRTIO_ID_CONSOLE	3 /* virtio console */
 #define VIRTIO_ID_RNG		4 /* virtio ring */
 #define VIRTIO_ID_BALLOON	5 /* virtio balloon */
+#define VIRTIO_ID_SCSI		8 /* virtio scsi */
 #define VIRTIO_ID_9P		9 /* 9p virtio console */
 
 #endif /* _LINUX_VIRTIO_IDS_H */
diff -ur a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
--- a/include/linux/virtio_ring.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/virtio_ring.h	2014-02-17 11:56:06.000000000 +0100
@@ -168,6 +168,7 @@
 struct virtqueue *vring_new_virtqueue(unsigned int num,
 				      unsigned int vring_align,
 				      struct virtio_device *vdev,
+				      bool weak_barriers,
 				      void *pages,
 				      void (*notify)(struct virtqueue *vq),
 				      void (*callback)(struct virtqueue *vq),
Only in b/include/linux: virtio_scsi.h.
diff -ur a/include/linux/writeback.h b/include/linux/writeback.h
--- a/include/linux/writeback.h	2013-08-24 11:36:09.000000000 +0200
+++ b/include/linux/writeback.h	2014-02-17 11:56:03.000000000 +0100
@@ -7,6 +7,8 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 
+DECLARE_PER_CPU(int, dirty_throttle_leaks);
+
 /*
  * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
  *
@@ -170,14 +172,7 @@
 			    unsigned long start_time);
 
 void page_writeback_init(void);
-void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
-					unsigned long nr_pages_dirtied);
-
-static inline void
-balance_dirty_pages_ratelimited(struct address_space *mapping)
-{
-	balance_dirty_pages_ratelimited_nr(mapping, 1);
-}
+void balance_dirty_pages_ratelimited(struct address_space *mapping);
 
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
 				void *data);
@@ -195,6 +190,8 @@
 void tag_pages_for_writeback(struct address_space *mapping,
 			     pgoff_t start, pgoff_t end);
 
+void account_page_redirty(struct page *page);
+
 /* pdflush.c */
 extern int nr_pdflush_threads;	/* Global so it can be exported to sysctl
 				   read-only. */
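
With the _nr() variant removed above, callers simply invoke balance_dirty_pages_ratelimited() once per dirtied page. A typical (illustrative) write-path fragment:

	set_page_dirty(page);
	unlock_page(page);
	balance_dirty_pages_ratelimited(mapping);
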
diff -ur a/include/linux/xattr.h b/include/linux/xattr.h
--- a/include/linux/xattr.h	2013-08-24 11:36:11.000000000 +0200
+++ b/include/linux/xattr.h	2014-02-17 11:56:07.000000000 +0100
@@ -44,6 +44,13 @@
 #ifdef MY_ABC_HERE
 #define XATTR_SYNO_ARCHIVE_VERSION "archive_version"
 #endif
+#ifdef MY_ABC_HERE
+#define XATTR_SYNO_CREATE_TIME "create_time"
+#endif
+#ifdef MY_ABC_HERE
+#define XATTR_SYNO_ARCHIVE_BIT "archive_bit"
+#define XATTR_SYNO_ARCHIVE_BIT_NOPERM "archive_bit_noperm" /* for GlusterFS */
+#endif
 
 #define XATTR_SMACK_SUFFIX "SMACK64"
 #define XATTR_SMACK_IPIN "SMACK64IPIN"
@@ -82,10 +89,6 @@
 		   size_t size, int handler_flags);
 	int (*set)(struct dentry *dentry, const char *name, const void *buffer,
 		   size_t size, int flags, int handler_flags);
-#ifdef MY_ABC_HERE
-	int (*set_compact_syno)(struct inode *inode, const char *name, const void *buffer,
-		   size_t size, int flags, int handler_flags);
-#endif
 };
 
 #ifdef MY_ABC_HERE
@@ -95,6 +98,7 @@
 	__le32	v_archive_version;
 };
 #endif
+
 struct xattr {
 	char *name;
 	void *value;
diff -ur a/include/mtd/mtd-abi.h b/include/mtd/mtd-abi.h
--- a/include/mtd/mtd-abi.h	2013-08-24 11:36:08.000000000 +0200
+++ b/include/mtd/mtd-abi.h	2014-02-17 11:56:02.000000000 +0100
@@ -225,6 +225,9 @@
  * modes (see "struct mtd_write_req")
  */
 #define MEMWRITE		_IOWR('M', 24, struct mtd_write_req)
+#if defined(CONFIG_SYNO_COMCERTO)
+#define MTDREFRESH		_IO('M', 50)
+#endif
 
 
 /*
diff -ur a/include/mtd/ubi-user.h b/include/mtd/ubi-user.h
--- a/include/mtd/ubi-user.h	2013-08-24 11:36:08.000000000 +0200
+++ b/include/mtd/ubi-user.h	2014-02-17 11:56:02.000000000 +0100
@@ -173,7 +173,10 @@
 
 #define UBI_VOL_IOC_MAGIC 'O'
 
-/* Start UBI volume update */
+/* Start UBI volume update
+ * Note: This actually takes a pointer (__s64*), but we can't change
+ *       that without breaking the ABI on 32bit systems
+ */
 #define UBI_IOCVOLUP _IOW(UBI_VOL_IOC_MAGIC, 0, __s64)
 /* LEB erasure command, used for debugging, disabled by default */
 #define UBI_IOCEBER _IOW(UBI_VOL_IOC_MAGIC, 1, __s32)
@@ -196,23 +199,6 @@
 #define UBI_MAX_RNVOL 32
 
 /*
- * UBI data type hint constants.
- *
- * UBI_LONGTERM: long-term data
- * UBI_SHORTTERM: short-term data
- * UBI_UNKNOWN: data persistence is unknown
- *
- * These constants are used when data is written to UBI volumes in order to
- * help the UBI wear-leveling unit to find more appropriate physical
- * eraseblocks.
- */
-enum {
-	UBI_LONGTERM  = 1,
-	UBI_SHORTTERM = 2,
-	UBI_UNKNOWN   = 3,
-};
-
-/*
  * UBI volume type constants.
  *
  * @UBI_DYNAMIC_VOLUME: dynamic volume
@@ -239,6 +225,7 @@
  * @ubi_num: UBI device number to create
  * @mtd_num: MTD device number to attach
  * @vid_hdr_offset: VID header offset (use defaults if %0)
+ * @max_beb_per1024: maximum expected number of bad PEB per 1024 PEBs
  * @padding: reserved for future, not used, has to be zeroed
  *
  * This data structure is used to specify MTD device UBI has to attach and the
@@ -262,12 +249,25 @@
  * be 2KiB-64 bytes = 1984. Note, that this position is not even 512-bytes
  * aligned, which is OK, as UBI is clever enough to realize this is 4th
  * sub-page of the first page and add needed padding.
+ *
+ * The @max_beb_per1024 is the maximum amount of bad PEBs UBI expects on the
+ * UBI device per 1024 eraseblocks.  This value is often given in another form
+ * in the NAND datasheet (min NVB i.e. minimal number of valid blocks). The
+ * maximum expected bad eraseblocks per 1024 is then:
+ *    1024 * (1 - MinNVB / MaxNVB)
+ * Which gives 20 for most NAND devices.  This limit is used in order to derive
+ * the number of eraseblocks UBI reserves for handling new bad blocks. If the device
+ * has more bad eraseblocks than this limit, UBI does not reserve any physical
+ * eraseblocks for new bad eraseblocks, but attempts to use available
+ * eraseblocks (if any). The accepted range is 0-768. If 0 is given, the
+ * default kernel value of %CONFIG_MTD_UBI_BEB_LIMIT will be used.
  */
 struct ubi_attach_req {
 	__s32 ubi_num;
 	__s32 mtd_num;
 	__s32 vid_hdr_offset;
-	__s8 padding[12];
+	__s16 max_beb_per1024;
+	__s8 padding[10];
 };
 
 /**
@@ -375,25 +375,34 @@
  *                             requests.
  * @lnum: logical eraseblock number to change
  * @bytes: how many bytes will be written to the logical eraseblock
- * @dtype: data type (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN)
+ * @dtype: pass "3" for better compatibility with old kernels
  * @padding: reserved for future, not used, has to be zeroed
+ *
+ * The @dtype field used to inform UBI about what kind of data will be written
+ * to the LEB: long term (value 1), short term (value 2), unknown (value 3).
+ * UBI tried to pick a PEB with lower erase counter for short term data and a
+ * PEB with higher erase counter for long term data. But this was not really
+ * used because users usually do not know this and could easily mislead UBI. We
+ * removed this feature in May 2012. UBI currently just ignores the @dtype
+ * field. But for better compatibility with older kernels it is recommended to
+ * set @dtype to 3 (unknown).
  */
 struct ubi_leb_change_req {
 	__s32 lnum;
 	__s32 bytes;
-	__s8  dtype;
+	__s8  dtype; /* obsolete, do not use! */
 	__s8  padding[7];
 } __packed;
 
 /**
  * struct ubi_map_req - a data structure used in map LEB requests.
+ * @dtype: pass "3" for better compatibility with old kernels
  * @lnum: logical eraseblock number to unmap
- * @dtype: data type (%UBI_LONGTERM, %UBI_SHORTTERM, %UBI_UNKNOWN)
  * @padding: reserved for future, not used, has to be zeroed
  */
 struct ubi_map_req {
 	__s32 lnum;
-	__s8  dtype;
+	__s8  dtype; /* obsolete, do not use! */
 	__s8  padding[3];
 } __packed;
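
A worked example for the max_beb_per1024 formula documented above: a (hypothetical) NAND datasheet guaranteeing at least 2008 valid blocks out of 2048 gives

	1024 * (1 - 2008/2048) = 1024 * 40/2048 = 20

bad eraseblocks per 1024, matching the "20 for most NAND devices" default mentioned in the comment.
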
 
diff -ur a/include/net/addrconf.h b/include/net/addrconf.h
--- a/include/net/addrconf.h	2013-08-24 11:36:07.000000000 +0200
+++ b/include/net/addrconf.h	2014-02-17 11:56:00.000000000 +0100
@@ -91,6 +91,14 @@
 extern void			addrconf_leave_solict(struct inet6_dev *idev,
 					const struct in6_addr *addr);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+extern int			(*ipv6_dev_get_saddr_hook)(struct net *net,
+						struct net_device *dev,
+						const struct in6_addr *daddr,
+						unsigned int srcprefs,
+						struct in6_addr *saddr);
+#endif
+
 static inline unsigned long addrconf_timeout_fixup(u32 timeout,
 						    unsigned unit)
 {
diff -ur a/include/net/flow.h b/include/net/flow.h
--- a/include/net/flow.h	2013-08-24 11:36:07.000000000 +0200
+++ b/include/net/flow.h	2014-02-17 11:56:00.000000000 +0100
@@ -212,9 +212,17 @@
 		struct net *net, const struct flowi *key, u16 family,
 		u8 dir, struct flow_cache_object *oldobj, void *ctx);
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+extern struct flow_cache_object *flow_cache_lookup(
+		struct net *net, const struct flowi *key, u16 family,
+		u8 dir, u8 *new_flow, flow_resolve_t resolver, void *ctx);
+extern void flow_cache_remove(
+				const struct flowi *fl, unsigned short family, unsigned short dir);
+#else
 extern struct flow_cache_object *flow_cache_lookup(
 		struct net *net, const struct flowi *key, u16 family,
 		u8 dir, flow_resolve_t resolver, void *ctx);
+#endif
 
 extern void flow_cache_flush(void);
 extern void flow_cache_flush_deferred(void);
diff -ur a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h
--- a/include/net/ip6_tunnel.h	2013-08-24 11:36:07.000000000 +0200
+++ b/include/net/ip6_tunnel.h	2014-02-17 11:56:01.000000000 +0100
@@ -19,6 +19,9 @@
 	struct flowi fl;	/* flowi template for xmit */
 	struct dst_entry *dst_cache;    /* cached dst */
 	u32 dst_cookie;
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_INET6_IPSEC_OFFLOAD)
+	u32 genid;
+#endif
 };
 
 /* Tunnel encapsulation limit destination sub-option */
diff -ur a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
--- a/include/net/netfilter/nf_conntrack_core.h	2013-08-24 11:36:07.000000000 +0200
+++ b/include/net/netfilter/nf_conntrack_core.h	2014-02-17 11:56:00.000000000 +0100
@@ -76,4 +76,8 @@
 
 extern spinlock_t nf_conntrack_lock ;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+#define COMCERTO_PERMANENT_TIMEOUT	1000
+#endif
+
 #endif /* _NF_CONNTRACK_CORE_H */
diff -ur a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
--- a/include/net/netfilter/nf_conntrack.h	2013-08-24 11:36:07.000000000 +0200
+++ b/include/net/netfilter/nf_conntrack.h	2014-02-17 11:56:00.000000000 +0100
@@ -100,6 +100,14 @@
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+struct comcerto_fp_info {
+	int ifindex;
+	int iif;
+	u32 mark;
+};
+#endif
+
 struct nf_conn {
 	/* Usage count in here is 1 for hash table/destruct timer, 1 per skb,
            plus 1 for any connection(s) we are `master' for */
@@ -145,12 +153,33 @@
 	} layer7;
 #endif
 #endif
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+	struct comcerto_fp_info fp_info[IP_CT_DIR_MAX];
+#endif
+
 	/* Extensions */
 	struct nf_ct_ext *ext;
 #ifdef CONFIG_NET_NS
 	struct net *ct_net;
 #endif
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_NETFILTER_XT_MATCH_LAYER7) || \
+    defined(CONFIG_NETFILTER_XT_MATCH_LAYER7_MODULE))
+	struct {
+		/*
+		 * e.g. "http". NULL before decision. "unknown" after decision
+		 * if no match.
+		 */
+		char *app_proto;
+		/*
+		 * application layer data so far. NULL after match decision.
+		 */
+		char *app_data;
+		unsigned int app_data_len;
+	} layer7;
+#endif
+
 	/* Storage reserved for other modules, must be the last member */
 	union nf_conntrack_proto proto;
 };
@@ -332,6 +361,11 @@
 
 struct kernel_param;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+extern int nf_conntrack_set_dpi_allow_report(struct sk_buff *skb);
+extern int nf_conntrack_set_dpi_allow_and_mark(struct sk_buff *skb, int mark);
+#endif
+
 extern int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp);
 extern unsigned int nf_conntrack_htable_size;
 extern unsigned int nf_conntrack_max;
diff -ur a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h
--- a/include/net/netns/xfrm.h	2013-08-24 11:36:07.000000000 +0200
+++ b/include/net/netns/xfrm.h	2014-02-17 11:56:01.000000000 +0100
@@ -27,6 +27,9 @@
 	struct hlist_head	*state_bydst;
 	struct hlist_head	*state_bysrc;
 	struct hlist_head	*state_byspi;
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	struct hlist_head	*state_byh;
+#endif
 	unsigned int		state_hmask;
 	unsigned int		state_num;
 	struct work_struct	state_hash_work;
diff -ur a/include/net/sock.h b/include/net/sock.h
--- a/include/net/sock.h	2013-08-24 11:36:08.000000000 +0200
+++ b/include/net/sock.h	2014-02-17 11:56:01.000000000 +0100
@@ -1568,7 +1568,7 @@
 static inline void sock_poll_wait(struct file *filp,
 		wait_queue_head_t *wait_address, poll_table *p)
 {
-	if (p && wait_address) {
+	if (!poll_does_not_wait(p) && wait_address) {
 		poll_wait(filp, wait_address, p);
 		/*
 		 * We need to be sure we are in sync with the
diff -ur a/include/net/xfrm.h b/include/net/xfrm.h
--- a/include/net/xfrm.h	2013-08-24 11:36:07.000000000 +0200
+++ b/include/net/xfrm.h	2014-02-17 11:56:01.000000000 +0100
@@ -138,6 +138,10 @@
 	struct hlist_node	bysrc;
 	struct hlist_node	byspi;
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	struct hlist_node 	byh;
+	u16			handle;
+#endif
 	atomic_t		refcnt;
 	spinlock_t		lock;
 
@@ -229,6 +233,11 @@
 	/* Private data of this transformer, format is opaque,
 	 * interpreted by xfrm_type methods. */
 	void			*data;
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	 /* Intended direction of this state, used for offloading */
+	int	dir;
+	int	offloaded;	
+#endif
 };
 
 static inline struct net *xs_net(struct xfrm_state *x)
@@ -248,6 +257,14 @@
 	XFRM_STATE_DEAD
 };
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+enum {
+	 XFRM_STATE_DIR_UNKNOWN,
+	 XFRM_STATE_DIR_IN,
+	 XFRM_STATE_DIR_OUT,
+};
+#endif
+
 /* callback structure passed from either netlink or pfkey */
 struct km_event {
 	union {
@@ -302,6 +319,9 @@
 
 extern int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo);
 extern int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo);
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+extern struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
+#endif
 extern void km_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c);
 extern void km_state_notify(struct xfrm_state *x, const struct km_event *c);
 
@@ -961,6 +981,35 @@
 	struct xfrm_state	*xvec[XFRM_MAX_DEPTH];
 };
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+struct xfrm_input_shared {
+	struct sk_buff 		*skb;
+	int 			xfrm_nr, first, xfrm_encap;
+	struct xfrm_state 	*xfrm_vec[XFRM_MAX_DEPTH];
+	__u16 			encap_type;
+	int 			decaps;
+	u32			seq, spi;
+	unsigned int   nhoff;
+	int 			nexthdr;
+	int 			(*callback)(struct xfrm_input_shared *sh);
+	atomic_t		refcnt;
+};
+
+
+static inline void xfrm_shared_get(struct xfrm_input_shared *sh)
+{
+	atomic_inc(&sh->refcnt);
+}
+
+static inline void xfrm_shared_put(struct xfrm_input_shared *sh)
+{
+	if (atomic_dec_and_test(&sh->refcnt)) {
+		kfree(sh);
+	}
+}
+#endif
+
 static inline int secpath_exists(struct sk_buff *skb)
 {
 #ifdef CONFIG_XFRM
diff -ur a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h
--- a/include/scsi/scsi_device.h	2013-08-24 11:36:08.000000000 +0200
+++ b/include/scsi/scsi_device.h	2014-02-17 11:56:01.000000000 +0100
@@ -97,9 +97,6 @@
 #ifdef MY_ABC_HERE
 	unsigned char auto_remap;
 #endif
-#ifdef MY_ABC_HERE
-	int iResetPwrCount;  /* the count of disk power reset */
-#endif
 
 	unsigned int manufacturer;	/* Manufacturer of device, for using 
 					 * vendor-specific cmd's */
@@ -178,7 +175,7 @@
 	unsigned long   idle;   /* scsi idle time in jiffers */
 	unsigned char	spindown;
 	unsigned char   nospindown;
-#endif
+#endif /* MY_ABC_HERE */
 
 	struct device		sdev_gendev,
 				sdev_dev;
diff -ur a/include/scsi/scsi_ioctl.h b/include/scsi/scsi_ioctl.h
--- a/include/scsi/scsi_ioctl.h	2013-08-24 11:36:08.000000000 +0200
+++ b/include/scsi/scsi_ioctl.h	2014-02-17 11:56:01.000000000 +0100
@@ -13,7 +13,7 @@
  * you must check scsi.h. too */
 #define SD_IOCTL_IDLE 4746
 #define SD_IOCTL_SUPPORT_SLEEP 4747
-#endif
+#endif /* MY_ABC_HERE */
 
 /* The door lock/unlock constants are compatible with Sun constants for
    the cdrom */
diff -ur a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
--- a/include/trace/events/btrfs.h	2013-08-24 11:36:08.000000000 +0200
+++ b/include/trace/events/btrfs.h	2014-02-17 11:56:03.000000000 +0100
@@ -6,6 +6,7 @@
 
 #include <linux/writeback.h>
 #include <linux/tracepoint.h>
+#include <trace/events/gfpflags.h>
 
 struct btrfs_root;
 struct btrfs_fs_info;
@@ -16,6 +17,8 @@
 struct btrfs_delayed_tree_ref;
 struct btrfs_delayed_data_ref;
 struct btrfs_delayed_ref_head;
+struct btrfs_block_group_cache;
+struct btrfs_free_cluster;
 struct map_lookup;
 struct extent_buffer;
 
@@ -38,12 +41,24 @@
 		{ BTRFS_CSUM_TREE_OBJECTID, 	"CSUM_TREE"	},	\
 		{ BTRFS_TREE_LOG_OBJECTID,	"TREE_LOG"	},	\
 		{ BTRFS_TREE_RELOC_OBJECTID,	"TREE_RELOC"	},	\
+		{ BTRFS_UUID_TREE_OBJECTID,	"UUID_RELOC"	},	\
 		{ BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" })
 
 #define show_root_type(obj)						\
 	obj, ((obj >= BTRFS_DATA_RELOC_TREE_OBJECTID) ||		\
 	      (obj <= BTRFS_CSUM_TREE_OBJECTID )) ? __show_root_type(obj) : "-"
 
+#define BTRFS_GROUP_FLAGS	\
+	{ BTRFS_BLOCK_GROUP_DATA,	"DATA"}, \
+	{ BTRFS_BLOCK_GROUP_SYSTEM,	"SYSTEM"}, \
+	{ BTRFS_BLOCK_GROUP_METADATA,	"METADATA"}, \
+	{ BTRFS_BLOCK_GROUP_RAID0,	"RAID0"}, \
+	{ BTRFS_BLOCK_GROUP_RAID1,	"RAID1"}, \
+	{ BTRFS_BLOCK_GROUP_DUP,	"DUP"}, \
+	{ BTRFS_BLOCK_GROUP_RAID10,	"RAID10"}
+
+#define BTRFS_UUID_SIZE 16
+
 TRACE_EVENT(btrfs_transaction_commit,
 
 	TP_PROTO(struct btrfs_root *root),
@@ -415,7 +430,7 @@
 		{ BTRFS_UPDATE_DELAYED_HEAD, "UPDATE_DELAYED_HEAD" })
 			
 
-TRACE_EVENT(btrfs_delayed_tree_ref,
+DECLARE_EVENT_CLASS(btrfs_delayed_tree_ref,
 
 	TP_PROTO(struct btrfs_delayed_ref_node *ref,
 		 struct btrfs_delayed_tree_ref *full_ref,
@@ -454,7 +469,25 @@
 		  __entry->level, show_ref_type(__entry->type))
 );
 
-TRACE_EVENT(btrfs_delayed_data_ref,
+DEFINE_EVENT(btrfs_delayed_tree_ref,  add_delayed_tree_ref,
+
+	TP_PROTO(struct btrfs_delayed_ref_node *ref,
+		 struct btrfs_delayed_tree_ref *full_ref,
+		 int action),
+
+	TP_ARGS(ref, full_ref, action)
+);
+
+DEFINE_EVENT(btrfs_delayed_tree_ref,  run_delayed_tree_ref,
+
+	TP_PROTO(struct btrfs_delayed_ref_node *ref,
+		 struct btrfs_delayed_tree_ref *full_ref,
+		 int action),
+
+	TP_ARGS(ref, full_ref, action)
+);
+
+DECLARE_EVENT_CLASS(btrfs_delayed_data_ref,
 
 	TP_PROTO(struct btrfs_delayed_ref_node *ref,
 		 struct btrfs_delayed_data_ref *full_ref,
@@ -497,7 +530,25 @@
 		  show_ref_type(__entry->type))
 );
 
-TRACE_EVENT(btrfs_delayed_ref_head,
+DEFINE_EVENT(btrfs_delayed_data_ref,  add_delayed_data_ref,
+
+	TP_PROTO(struct btrfs_delayed_ref_node *ref,
+		 struct btrfs_delayed_data_ref *full_ref,
+		 int action),
+
+	TP_ARGS(ref, full_ref, action)
+);
+
+DEFINE_EVENT(btrfs_delayed_data_ref,  run_delayed_data_ref,
+
+	TP_PROTO(struct btrfs_delayed_ref_node *ref,
+		 struct btrfs_delayed_data_ref *full_ref,
+		 int action),
+
+	TP_ARGS(ref, full_ref, action)
+);
+
+DECLARE_EVENT_CLASS(btrfs_delayed_ref_head,
 
 	TP_PROTO(struct btrfs_delayed_ref_node *ref,
 		 struct btrfs_delayed_ref_head *head_ref,
@@ -526,6 +577,24 @@
 		  __entry->is_data)
 );
 
+DEFINE_EVENT(btrfs_delayed_ref_head,  add_delayed_ref_head,
+
+	TP_PROTO(struct btrfs_delayed_ref_node *ref,
+		 struct btrfs_delayed_ref_head *head_ref,
+		 int action),
+
+	TP_ARGS(ref, head_ref, action)
+);
+
+DEFINE_EVENT(btrfs_delayed_ref_head,  run_delayed_ref_head,
+
+	TP_PROTO(struct btrfs_delayed_ref_node *ref,
+		 struct btrfs_delayed_ref_head *head_ref,
+		 int action),
+
+	TP_ARGS(ref, head_ref, action)
+);
+
 #define show_chunk_type(type)					\
 	__print_flags(type, "|",				\
 		{ BTRFS_BLOCK_GROUP_DATA, 	"DATA"	},	\
@@ -621,6 +690,34 @@
 		  __entry->cow_level)
 );
 
+TRACE_EVENT(btrfs_space_reservation,
+
+	TP_PROTO(struct btrfs_fs_info *fs_info, char *type, u64 val,
+		 u64 bytes, int reserve),
+
+	TP_ARGS(fs_info, type, val, bytes, reserve),
+
+	TP_STRUCT__entry(
+		__array(	u8,	fsid,	BTRFS_UUID_SIZE	)
+		__string(	type,	type			)
+		__field(	u64,	val			)
+		__field(	u64,	bytes			)
+		__field(	int,	reserve			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE);
+		__assign_str(type, type);
+		__entry->val		= val;
+		__entry->bytes		= bytes;
+		__entry->reserve	= reserve;
+	),
+
+	TP_printk("%pU: %s: %Lu %s %Lu", __entry->fsid, __get_str(type),
+		  __entry->val, __entry->reserve ? "reserve" : "release",
+		  __entry->bytes)
+);
+
 DECLARE_EVENT_CLASS(btrfs__reserved_extent,
 
 	TP_PROTO(struct btrfs_root *root, u64 start, u64 len),
@@ -659,6 +756,211 @@
 	TP_ARGS(root, start, len)
 );
 
+TRACE_EVENT(find_free_extent,
+
+	TP_PROTO(struct btrfs_root *root, u64 num_bytes, u64 empty_size,
+		 u64 data),
+
+	TP_ARGS(root, num_bytes, empty_size, data),
+
+	TP_STRUCT__entry(
+		__field(	u64,	root_objectid		)
+		__field(	u64,	num_bytes		)
+		__field(	u64,	empty_size		)
+		__field(	u64,	data			)
+	),
+
+	TP_fast_assign(
+		__entry->root_objectid	= root->root_key.objectid;
+		__entry->num_bytes	= num_bytes;
+		__entry->empty_size	= empty_size;
+		__entry->data		= data;
+	),
+
+	TP_printk("root = %Lu(%s), len = %Lu, empty_size = %Lu, "
+		  "flags = %Lu(%s)", show_root_type(__entry->root_objectid),
+		  __entry->num_bytes, __entry->empty_size, __entry->data,
+		  __print_flags((unsigned long)__entry->data, "|",
+				 BTRFS_GROUP_FLAGS))
+);
+
+DECLARE_EVENT_CLASS(btrfs__reserve_extent,
+
+	TP_PROTO(struct btrfs_root *root,
+		 struct btrfs_block_group_cache *block_group, u64 start,
+		 u64 len),
+
+	TP_ARGS(root, block_group, start, len),
+
+	TP_STRUCT__entry(
+		__field(	u64,	root_objectid		)
+		__field(	u64,	bg_objectid		)
+		__field(	u64,	flags			)
+		__field(	u64,	start			)
+		__field(	u64,	len			)
+	),
+
+	TP_fast_assign(
+		__entry->root_objectid	= root->root_key.objectid;
+		__entry->bg_objectid	= block_group->key.objectid;
+		__entry->flags		= block_group->flags;
+		__entry->start		= start;
+		__entry->len		= len;
+	),
+
+	TP_printk("root = %Lu(%s), block_group = %Lu, flags = %Lu(%s), "
+		  "start = %Lu, len = %Lu",
+		  show_root_type(__entry->root_objectid), __entry->bg_objectid,
+		  __entry->flags, __print_flags((unsigned long)__entry->flags,
+						"|", BTRFS_GROUP_FLAGS),
+		  __entry->start, __entry->len)
+);
+
+DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent,
+
+	TP_PROTO(struct btrfs_root *root,
+		 struct btrfs_block_group_cache *block_group, u64 start,
+		 u64 len),
+
+	TP_ARGS(root, block_group, start, len)
+);
+
+DEFINE_EVENT(btrfs__reserve_extent, btrfs_reserve_extent_cluster,
+
+	TP_PROTO(struct btrfs_root *root,
+		 struct btrfs_block_group_cache *block_group, u64 start,
+		 u64 len),
+
+	TP_ARGS(root, block_group, start, len)
+);
+
+TRACE_EVENT(btrfs_find_cluster,
+
+	TP_PROTO(struct btrfs_block_group_cache *block_group, u64 start,
+		 u64 bytes, u64 empty_size, u64 min_bytes),
+
+	TP_ARGS(block_group, start, bytes, empty_size, min_bytes),
+
+	TP_STRUCT__entry(
+		__field(	u64,	bg_objectid		)
+		__field(	u64,	flags			)
+		__field(	u64,	start			)
+		__field(	u64,	bytes			)
+		__field(	u64,	empty_size		)
+		__field(	u64,	min_bytes		)
+	),
+
+	TP_fast_assign(
+		__entry->bg_objectid	= block_group->key.objectid;
+		__entry->flags		= block_group->flags;
+		__entry->start		= start;
+		__entry->bytes		= bytes;
+		__entry->empty_size	= empty_size;
+		__entry->min_bytes	= min_bytes;
+	),
+
+	TP_printk("block_group = %Lu, flags = %Lu(%s), start = %Lu, len = %Lu,"
+		  " empty_size = %Lu, min_bytes = %Lu", __entry->bg_objectid,
+		  __entry->flags,
+		  __print_flags((unsigned long)__entry->flags, "|",
+				BTRFS_GROUP_FLAGS), __entry->start,
+		  __entry->bytes, __entry->empty_size,  __entry->min_bytes)
+);
+
+TRACE_EVENT(btrfs_failed_cluster_setup,
+
+	TP_PROTO(struct btrfs_block_group_cache *block_group),
+
+	TP_ARGS(block_group),
+
+	TP_STRUCT__entry(
+		__field(	u64,	bg_objectid		)
+	),
+
+	TP_fast_assign(
+		__entry->bg_objectid	= block_group->key.objectid;
+	),
+
+	TP_printk("block_group = %Lu", __entry->bg_objectid)
+);
+
+TRACE_EVENT(btrfs_setup_cluster,
+
+	TP_PROTO(struct btrfs_block_group_cache *block_group,
+		 struct btrfs_free_cluster *cluster, u64 size, int bitmap),
+
+	TP_ARGS(block_group, cluster, size, bitmap),
+
+	TP_STRUCT__entry(
+		__field(	u64,	bg_objectid		)
+		__field(	u64,	flags			)
+		__field(	u64,	start			)
+		__field(	u64,	max_size		)
+		__field(	u64,	size			)
+		__field(	int,	bitmap			)
+	),
+
+	TP_fast_assign(
+		__entry->bg_objectid	= block_group->key.objectid;
+		__entry->flags		= block_group->flags;
+		__entry->start		= cluster->window_start;
+		__entry->max_size	= cluster->max_size;
+		__entry->size		= size;
+		__entry->bitmap		= bitmap;
+	),
+
+	TP_printk("block_group = %Lu, flags = %Lu(%s), window_start = %Lu, "
+		  "size = %Lu, max_size = %Lu, bitmap = %d",
+		  __entry->bg_objectid,
+		  __entry->flags,
+		  __print_flags((unsigned long)__entry->flags, "|",
+				BTRFS_GROUP_FLAGS), __entry->start,
+		  __entry->size, __entry->max_size, __entry->bitmap)
+);
+
+struct extent_state;
+TRACE_EVENT(alloc_extent_state,
+
+	TP_PROTO(struct extent_state *state, gfp_t mask, unsigned long IP),
+
+	TP_ARGS(state, mask, IP),
+
+	TP_STRUCT__entry(
+		__field(struct extent_state *, state)
+		__field(gfp_t, mask)
+		__field(unsigned long, ip)
+	),
+
+	TP_fast_assign(
+		__entry->state	= state,
+		__entry->mask	= mask,
+		__entry->ip	= IP
+	),
+
+	TP_printk("state=%p; mask = %s; caller = %pF", __entry->state,
+		  show_gfp_flags(__entry->mask), (void *)__entry->ip)
+);
+
+TRACE_EVENT(free_extent_state,
+
+	TP_PROTO(struct extent_state *state, unsigned long IP),
+
+	TP_ARGS(state, IP),
+
+	TP_STRUCT__entry(
+		__field(struct extent_state *, state)
+		__field(unsigned long, ip)
+	),
+
+	TP_fast_assign(
+		__entry->state	= state,
+		__entry->ip = IP
+	),
+
+	TP_printk(" state=%p; caller = %pF", __entry->state,
+		  (void *)__entry->ip)
+);
+
 #endif /* _TRACE_BTRFS_H */
 
 /* This part must be outside protection */
Only in b/include: uapi.
diff -ur a/init/initramfs.c b/init/initramfs.c
--- a/init/initramfs.c	2013-08-24 11:37:17.000000000 +0200
+++ b/init/initramfs.c	2014-02-17 11:57:57.000000000 +0100
@@ -457,8 +457,16 @@
 					 compress_name);
 				message = msg_buf;
 			}
+#ifdef MY_ABC_HERE
+		} else {
+			/* Workaround: keep backward compatibility with boot
+			 * arguments that give an inaccurate ramdisk size,
+			 * e.g. initrd=0x2000040,4M */
+			break;
+		}
+#else
 		} else
 			error("junk in compressed archive");
+#endif
 		if (state != Reset)
 			error("junk in compressed archive");
 		this_header = saved_offset + my_inptr;
diff -ur a/init/Kconfig b/init/Kconfig
--- a/init/Kconfig	2013-08-03 09:59:52.000000000 +0200
+++ b/init/Kconfig	2014-01-21 09:37:30.000000000 +0100
@@ -891,6 +891,10 @@
 
 	  If unsure, say N.
 
+config CRASHLOG
+	bool "Crash logging"
+	depends on !NO_BOOTMEM && !HAVE_MEMBLOCK && SYNO_COMCERTO
+
 config BLK_DEV_INITRD
 	bool "Initial RAM filesystem and RAM disk (initramfs/initrd) support"
 	depends on BROKEN || !FRV
diff -ur a/init/main.c b/init/main.c
--- a/init/main.c	2013-08-24 11:37:17.000000000 +0200
+++ b/init/main.c	2014-02-17 11:57:57.000000000 +0100
@@ -407,7 +407,7 @@
 
 void __init parse_early_options(char *cmdline)
 {
-	parse_args("early options", cmdline, NULL, 0, do_early_param);
+	parse_args("early options", cmdline, NULL, 0, 0, 0, do_early_param);
 }
 
 /* Arch code calls this early on, or if not, just before other parsing. */
@@ -511,7 +511,7 @@
 	parse_early_param();
 	parse_args("Booting kernel", static_command_line, __start___param,
 		   __stop___param - __start___param,
-		   &unknown_bootoption);
+		   0, 0, &unknown_bootoption);
 
 	jump_label_init();
 
@@ -705,16 +705,69 @@
 }
 
 
-extern initcall_t __initcall_start[], __initcall_end[], __early_initcall_end[];
+extern initcall_t __initcall_start[];
+extern initcall_t __initcall0_start[];
+extern initcall_t __initcall1_start[];
+extern initcall_t __initcall2_start[];
+extern initcall_t __initcall3_start[];
+extern initcall_t __initcall4_start[];
+extern initcall_t __initcall5_start[];
+extern initcall_t __initcall6_start[];
+extern initcall_t __initcall7_start[];
+extern initcall_t __initcall_end[];
+
+static initcall_t *initcall_levels[] __initdata = {
+	__initcall0_start,
+	__initcall1_start,
+	__initcall2_start,
+	__initcall3_start,
+	__initcall4_start,
+	__initcall5_start,
+	__initcall6_start,
+	__initcall7_start,
+	__initcall_end,
+};
+
+static char *initcall_level_names[] __initdata = {
+	"early parameters",
+	"core parameters",
+	"postcore parameters",
+	"arch parameters",
+	"subsys parameters",
+	"fs parameters",
+	"device parameters",
+	"late parameters",
+};
 
-static void __init do_initcalls(void)
+static int __init ignore_unknown_bootoption(char *param, char *val)
+{
+	return 0;
+}
+
+static void __init do_initcall_level(int level)
 {
+	extern const struct kernel_param __start___param[], __stop___param[];
 	initcall_t *fn;
 
-	for (fn = __early_initcall_end; fn < __initcall_end; fn++)
+	strcpy(static_command_line, saved_command_line);
+	parse_args(initcall_level_names[level],
+		   static_command_line, __start___param,
+		   __stop___param - __start___param,
+		   level, level,
+		   ignore_unknown_bootoption);
+
+	for (fn = initcall_levels[level]; fn < initcall_levels[level+1]; fn++)
 		do_one_initcall(*fn);
 }
 
+static void __init do_initcalls(void)
+{
+	int level;
+
+	for (level = 0; level < ARRAY_SIZE(initcall_levels) - 1; level++)
+		do_initcall_level(level);
+}
+
 /*
  * Ok, the machine is now initialized. None of the devices
  * have been touched yet, but the CPU subsystem is up and
@@ -738,7 +791,7 @@
 {
 	initcall_t *fn;
 
-	for (fn = __initcall_start; fn < __early_initcall_end; fn++)
+	for (fn = __initcall_start; fn < __initcall0_start; fn++)
 		do_one_initcall(*fn);
 }
 
@@ -822,7 +875,11 @@
 
 	/* Open the /dev/console on the rootfs, this should never fail */
 	if (sys_open((const char __user *) "/dev/console", O_RDWR, 0) < 0)
+#if defined(CONFIG_SYNO_COMCERTO)
+		printk(KERN_WARNING "Please be patient, while OpenWrt loads ...\n");
+#else
 		printk(KERN_WARNING "Warning: unable to open an initial console.\n");
+#endif
 
 	(void) sys_dup(0);
 	(void) sys_dup(0);
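
For reference, the initcall levels introduced in do_initcall_level() above map onto the existing *_initcall() families (level 4 is subsys_initcall(), level 6 is device_initcall()/module_init(), and so on), so the level an initcall is registered at now also selects which module parameters are parsed just before it runs. An illustrative example, assuming <linux/init.h>:

	static int __init example_init(void)
	{
		return 0;
	}
	subsys_initcall(example_init);	/* runs in level 4, "subsys parameters" */
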
Only in b/kernel: crashlog.c.
diff -ur a/kernel/exit.c b/kernel/exit.c
--- a/kernel/exit.c	2013-08-24 11:37:17.000000000 +0200
+++ b/kernel/exit.c	2014-02-17 11:57:57.000000000 +0100
@@ -51,6 +51,7 @@
 #include <trace/events/sched.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/oom.h>
+#include <linux/writeback.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -499,6 +500,9 @@
 
 	return files;
 }
+#if defined(CONFIG_SYNO_COMCERTO)
+EXPORT_SYMBOL_GPL(get_files_struct);
+#endif
 
 void put_files_struct(struct files_struct *files)
 {
@@ -520,6 +524,9 @@
 		rcu_read_unlock();
 	}
 }
+#if defined(CONFIG_SYNO_COMCERTO)
+EXPORT_SYMBOL_GPL(put_files_struct);
+#endif
 
 void reset_files_struct(struct files_struct *files)
 {
@@ -1018,6 +1025,8 @@
 	validate_creds_for_do_exit(tsk);
 
 	preempt_disable();
+	if (tsk->nr_dirtied)
+		__this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
 	exit_rcu();
 
 	/*
diff -ur a/kernel/freezer.c b/kernel/freezer.c
--- a/kernel/freezer.c	2013-08-24 11:37:17.000000000 +0200
+++ b/kernel/freezer.c	2014-02-17 11:57:56.000000000 +0100
@@ -23,10 +23,11 @@
 }
 
 /* Refrigerator is place where frozen processes are stored :-). */
-void refrigerator(void)
+bool __refrigerator(void)
 {
 	/* Hmm, should we be allowed to suspend when there are realtime
 	   processes around? */
+	bool was_frozen = false;
 	long save;
 
 	task_lock(current);
@@ -35,7 +36,7 @@
 		task_unlock(current);
 	} else {
 		task_unlock(current);
-		return;
+		return was_frozen;
 	}
 	save = current->state;
 	pr_debug("%s entered refrigerator\n", current->comm);
@@ -51,6 +52,7 @@
 		set_current_state(TASK_UNINTERRUPTIBLE);
 		if (!frozen(current))
 			break;
+		was_frozen = true;
 		schedule();
 	}
 
@@ -58,9 +60,17 @@
 	current->flags &= ~PF_FREEZING;
 
 	pr_debug("%s left refrigerator\n", current->comm);
-	__set_current_state(save);
+
+	/*
+	 * Restore saved task state before returning.  The mb'd version
+	 * needs to be used; otherwise, it might silently break
+	 * synchronization which depends on ordered task state change.
+	 */
+	set_current_state(save);
+
+	return was_frozen;
 }
-EXPORT_SYMBOL(refrigerator);
+EXPORT_SYMBOL(__refrigerator);
 
 static void fake_signal_wake_up(struct task_struct *p)
 {
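
A hedged sketch of how a caller can use the new return value of __refrigerator() (freezing() is the existing test from <linux/freezer.h>; the pr_debug() is purely illustrative):

	if (freezing(current)) {
		bool was_frozen = __refrigerator();

		if (was_frozen)
			pr_debug("%s: resumed after being frozen\n", current->comm);
	}
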
diff -ur a/kernel/ksysfs.c b/kernel/ksysfs.c
--- a/kernel/ksysfs.c	2013-08-24 11:37:17.000000000 +0200
+++ b/kernel/ksysfs.c	2014-02-17 11:57:56.000000000 +0100
@@ -141,6 +141,446 @@
 }
 KERNEL_ATTR_RO(fscaps);
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_MDMA_PROF)
+extern unsigned int mdma_time_counter[256]; // 16 -> 4000 us
+extern unsigned int mdma_reqtime_counter[256]; // 16 -> 4000 us
+extern unsigned int mdma_data_counter[256];
+extern unsigned int init_mdma_prof;
+extern unsigned int enable_mdma_prof;
+
+static ssize_t comcerto_mdma_prof_enable_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int n;
+	buf[0] = '\0';
+	n = 0;
+	if (enable_mdma_prof)
+		n += sprintf(buf, "MDMA profiling is enabled\n");
+	else
+		n += sprintf(buf, "MDMA profiling is disabled\n");
+
+	return (n + 1);
+}
+
+static ssize_t comcerto_mdma_prof_enable_store(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf, size_t count)
+{
+	unsigned int enable;
+
+	if (kstrtouint(buf, 0, &enable))
+		return -EINVAL;
+
+	if (enable > 0)
+		enable_mdma_prof = 1;
+	else
+		enable_mdma_prof = 0;
+
+	return count;
+}
+KERNEL_ATTR_RW(comcerto_mdma_prof_enable);
+
+static ssize_t comcerto_mdma_reqtiming_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	int n;
+
+	buf[0] = '\0';
+	n = 0;
+	n += sprintf(buf, "Histogram of mdma request time\n");
+
+	for (i = 0; i < 255; i++)
+	{
+		if (mdma_reqtime_counter[i]) {
+			n += sprintf(buf + n, "%d in [%d-%d] us\n", mdma_reqtime_counter[i], i << 4, (i + 1) << 4);
+			mdma_reqtime_counter[i] = 0;
+		}
+	}
+	if (mdma_reqtime_counter[255]) {
+		n += sprintf(buf + n, "%d >= %d us\n", mdma_reqtime_counter[255], 255 << 4);
+		mdma_reqtime_counter[255] = 0;
+	}
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_mdma_reqtiming);
+
+static ssize_t comcerto_mdma_timing_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	int n;
+
+	init_mdma_prof = 0;
+	buf[0] = '\0';
+	n = 0;
+	n += sprintf(buf, "Histogram of inter mdma request time\n");
+
+	for (i = 0; i < 255; i++)
+	{
+		if (mdma_time_counter[i]) {
+			n += sprintf(buf + n, "%d in [%d-%d] us\n", mdma_time_counter[i], i << 4, (i + 1) << 4);
+			mdma_time_counter[i] = 0;
+		}
+	}
+	if (mdma_time_counter[255]) {
+		n += sprintf(buf + n, "%d >= %d us\n", mdma_time_counter[255], 255 << 4);
+		mdma_time_counter[255] = 0;
+	}
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_mdma_timing);
+
+static ssize_t comcerto_mdma_data_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	int n;
+
+	buf[0] = '\0';
+	n = 0;
+	n += sprintf(buf, "Histogram of mdma data length (up to 1M)\n");
+	for (i = 0; i < 256; i++)
+	{
+		if (mdma_data_counter[i]) {
+			n += sprintf(buf + n, "%d in [%d-%d] KB\n", mdma_data_counter[i], i << (13 - 10), (i + 1) << (13 - 10));
+			mdma_data_counter[i] = 0;
+		}
+	}
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_mdma_data);
+#endif
+
+#if defined(CONFIG_COMCERTO_SPLICE_PROF)
+extern unsigned int splicew_time_counter[256]; // 4 ms -> 1S
+extern unsigned int splicew_reqtime_counter[256]; // 4 ms -> 1S
+extern unsigned int splicew_data_counter[256]; 
+extern unsigned int init_splicew_prof; 
+extern unsigned int enable_splice_prof;
+static ssize_t comcerto_splice_prof_enable_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int n;
+	buf[0] = '\0';
+	n = 0;
+	if (enable_splice_prof)
+		n += sprintf(buf, "Splice profiling is enabled\n");
+	else
+		n += sprintf(buf, "Splice profiling is disabled\n");
+
+	return (n + 1);
+}
+static ssize_t comcerto_splice_prof_enable_store(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf, size_t count)
+{
+	unsigned int enable;
+
+	if (kstrtouint(buf, 0, &enable))
+		return -EINVAL;
+
+	if (enable > 0)
+		enable_splice_prof = 1;
+	else
+		enable_splice_prof = 0;
+
+	return count;
+}
+KERNEL_ATTR_RW(comcerto_splice_prof_enable);
+static ssize_t comcerto_splicew_reqtiming_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	int n;
+
+	buf[0] = '\0';
+	n = 0;
+	n += sprintf(buf, "Histogram of splice write time (up to 1 sec, otherwise data is discarded)\n");
+
+	for (i = 0; i < 255; i++)
+	{
+		if (splicew_reqtime_counter[i]) {
+			n += sprintf(buf + n, "%d in [%d-%d] ms\n", splicew_reqtime_counter[i], (i * 8), (i * 8) + 8);
+			splicew_reqtime_counter[i] = 0;
+		}
+	}
+	if (splicew_reqtime_counter[255]) {
+		n += sprintf(buf + n, "%d > 1 second\n", splicew_reqtime_counter[255]);
+		splicew_reqtime_counter[255] = 0;
+	}
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_splicew_reqtiming);
+static ssize_t comcerto_splicew_timing_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	int n;
+
+	init_splicew_prof = 0;
+	buf[0] = '\0';
+	n = 0;
+	n += sprintf(buf, "Histogram of inter splice write time (up to 1 sec, otherwise data is discarded)\n");
+
+	for (i = 0; i < 255; i++)
+	{
+		if (splicew_time_counter[i]) {
+			n += sprintf(buf + n, "%d in [%d-%d] ms\n", splicew_time_counter[i], (i * 8), (i * 8) + 8);
+			splicew_time_counter[i] = 0;
+		}
+	}
+	if (splicew_time_counter[255]) {
+ 		n += sprintf(buf + n, "%d > 1 second\n", splicew_time_counter[255]);
+		splicew_time_counter[255] = 0;
+	}
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_splicew_timing);
+static ssize_t comcerto_splicew_data_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	int n;
+
+	buf[0] = '\0';
+	n = 0;
+	n += sprintf(buf, "Histogram of splice write data length (up to 1M)\n");
+	for (i = 0; i < 256; i++)
+	{
+		if (splicew_data_counter[i]) {
+			n += sprintf(buf + n, "%d in [%d-%d] KB\n", splicew_data_counter[i], (i * 8), (i * 8) + 8);
+			splicew_data_counter[i] = 0;
+		}
+	}
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_splicew_data);
+
+
+extern unsigned int splicer_time_counter[256]; // 4 ms -> 1S
+extern unsigned int splicer_reqtime_counter[256]; // 4 ms -> 1S
+extern unsigned int splicer_data_counter[256]; 
+extern unsigned int splicer_tcp_rsock_counter[64];
+extern unsigned int init_splicer_prof; 
+static ssize_t comcerto_splicer_reqtiming_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	int n;
+
+	buf[0] = '\0';
+	n = 0;
+	n += sprintf(buf, "Histogram of splice read time\n");
+
+	for (i = 0; i < 255; i++)
+	{
+		if (splicer_reqtime_counter[i]) {
+			n += sprintf(buf + n, "%d in [%d-%d] ms\n", splicer_reqtime_counter[i], (i * 8), (i * 8) + 8);
+			splicer_reqtime_counter[i] = 0;
+		}
+	}
+	if (splicer_reqtime_counter[255]) {
+		n += sprintf(buf + n, "%d > 1 second\n", splicer_reqtime_counter[255]);
+		splicer_reqtime_counter[255] = 0;
+	}
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_splicer_reqtiming);
+static ssize_t comcerto_splicer_timing_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	int n;
+
+	init_splicer_prof = 0;
+	buf[0] = '\0';
+	n = 0;
+	n += sprintf(buf, "Histogram of inter splice read time\n");
+
+	for (i = 0; i < 255; i++)
+	{
+		if (splicer_time_counter[i]) {
+			n += sprintf(buf + n, "%d in [%d-%d] ms\n", splicer_time_counter[i], (i * 8), (i * 8) + 8);
+			splicer_time_counter[i] = 0;
+		}
+	}
+	if (splicer_time_counter[255]) {
+		n += sprintf(buf + n, "%d > 1 second\n", splicer_time_counter[255]);
+		splicer_time_counter[255] = 0;
+	}
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_splicer_timing);
+static ssize_t comcerto_splicer_data_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	int n;
+
+	buf[0] = '\0';
+	n = 0;
+	n += sprintf(buf, "Histogram of splice read data length (up to 1M)\n");
+	for (i = 0; i < 256; i++)
+	{
+		if (splicer_data_counter[i]) {
+			n += sprintf(buf + n, "%d in [%d-%d] KB\n", splicer_data_counter[i], (i * 8), (i * 8) + 8);
+			splicer_data_counter[i] = 0;
+		}
+	}
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_splicer_data);
+static ssize_t comcerto_splicer_tcp_rsock_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	int n;
+
+	buf[0] = '\0';
+	n = 0;
+	n += sprintf(buf, "Histogram of TCP receive queue size when splice read is performed\n");
+	for (i = 0; i < 63; i++)
+	{
+		if (splicer_tcp_rsock_counter[i]) {
+			n += sprintf(buf + n, "%d in [%d-%d] KB\n", splicer_tcp_rsock_counter[i], (i * 64), (i * 64) + 64);
+			splicer_tcp_rsock_counter[i] = 0;
+		}
+	}
+	if (splicer_tcp_rsock_counter[i]) {
+		n += sprintf(buf + n, "%d >= %d KB\n", splicer_tcp_rsock_counter[i], (i * 64));
+		splicer_tcp_rsock_counter[i] = 0;
+	}
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_splicer_tcp_rsock);
+#endif
+
+#if defined(CONFIG_COMCERTO_AHCI_PROF)
+extern unsigned int ahci_time_counter[256]; // 4 ms -> 1S
+extern unsigned int ahci_data_counter[256]; 
+extern unsigned int init_ahci_prof;
+extern unsigned int enable_ahci_prof;
+static ssize_t comcerto_ahci_prof_enable_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int n;
+	buf[0] = '\0';
+	n = 0;
+	if (enable_ahci_prof)
+		n += sprintf(buf, "AHCI profiling is enabled\n");
+	else
+		n += sprintf(buf, "AHCI profiling is disabled\n");
+
+	return (n + 1);
+}
+static ssize_t comcerto_ahci_prof_enable_store(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf, size_t count)
+{
+	unsigned int enable;
+
+	if (kstrtouint(buf, 0, &enable))
+		return -EINVAL;
+
+	if (enable > 0)
+		enable_ahci_prof = 1;
+	else
+		enable_ahci_prof = 0;
+
+	return count;
+}
+KERNEL_ATTR_RW(comcerto_ahci_prof_enable);
+static ssize_t comcerto_ahci_timing_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	int n;
+
+	buf[0] = '\0';
+	n = 0;
+	n += sprintf(buf, "Histogram of inter ahci write time (up to 1 sec otherwise date is discarded)\n");
+	init_ahci_prof = 0;
+	for (i = 0; i < 255; i++)
+	{
+		if (ahci_time_counter[i]) {
+			n += sprintf(buf + n, "%d in [%d-%d] ms\n", ahci_time_counter[i], (i * 8), (i * 8) + 8);
+			ahci_time_counter[i] = 0;
+		}
+	}
+	if (ahci_time_counter[255]) {
+		n += sprintf(buf + n, "%d > 1 second\n", ahci_time_counter[255]);
+		ahci_time_counter[255] = 0;
+	}
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_ahci_timing);
+static ssize_t comcerto_ahci_data_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	int n;
+
+	buf[0] = '\0';
+	n = 0;
+	n += sprintf(buf, "Histogram of ahci write data length (up to 1M)\n");
+
+	for (i = 0; i < 256; i++)
+	{
+		if (ahci_data_counter[i]) {
+			n += sprintf(buf + n, "%d in [%d-%d] KB\n", ahci_data_counter[i], (i * 8), (i * 8) + 8);
+			ahci_data_counter[i] = 0;
+		}
+	}
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_ahci_data);
+
+
+extern unsigned int ahci_qc_comp_counter[33];
+static ssize_t comcerto_ahci_qc_comp_timing_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int i;
+	int n;
+
+	buf[0] = '\0';
+	n = 0;
+	sprintf(buf, "Histogram of AHCI qc_complete time (in ms):\n");
+	n = strlen(buf);
+	for (i = 0; i < 32; i++)
+	{
+		if (ahci_qc_comp_counter[i]) {
+			sprintf(buf + n, "%d, in [%d-%d]ms\n", ahci_qc_comp_counter[i], (i * 16), (i * 16) + 16);
+			n = strlen(buf);
+			ahci_qc_comp_counter[i] = 0;
+		}
+	}
+	if (ahci_qc_comp_counter[i]) {
+		sprintf(buf + n, "%d, in [> 512]ms\n", ahci_qc_comp_counter[i]);
+		n = strlen(buf);
+		ahci_qc_comp_counter[i] = 0;
+	}
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_ahci_qc_comp_timing);
+
+
+extern unsigned int ahci_qc_no_free_slot;
+static ssize_t comcerto_ahci_qc_no_free_slot_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int n;
+
+	buf[0] = '\0';
+	n = 0;
+	sprintf(buf, "AHCI qc_no_free_slot count: %d\n", ahci_qc_no_free_slot);
+	ahci_qc_no_free_slot = 0;
+
+	n = strlen(buf);
+
+	return (n + 1);
+}
+KERNEL_ATTR_RO(comcerto_ahci_qc_no_free_slot);
+#endif
+
 /*
  * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
  */
@@ -182,6 +622,29 @@
 	&kexec_crash_size_attr.attr,
 	&vmcoreinfo_attr.attr,
 #endif
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_MDMA_PROF)
+	&comcerto_mdma_prof_enable_attr.attr,
+	&comcerto_mdma_timing_attr.attr,
+	&comcerto_mdma_reqtiming_attr.attr,
+	&comcerto_mdma_data_attr.attr,
+#endif
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_SPLICE_PROF)
+	&comcerto_splice_prof_enable_attr.attr,
+	&comcerto_splicew_timing_attr.attr,
+	&comcerto_splicew_reqtiming_attr.attr,
+	&comcerto_splicew_data_attr.attr,
+	&comcerto_splicer_timing_attr.attr,
+	&comcerto_splicer_reqtiming_attr.attr,
+	&comcerto_splicer_data_attr.attr,
+	&comcerto_splicer_tcp_rsock_attr.attr,
+#endif
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_AHCI_PROF)
+	&comcerto_ahci_prof_enable_attr.attr,
+	&comcerto_ahci_timing_attr.attr,
+	&comcerto_ahci_data_attr.attr,
+	&comcerto_ahci_qc_comp_timing_attr.attr,
+	&comcerto_ahci_qc_no_free_slot_attr.attr,
+#endif
 	NULL
 };
 
diff -ur a/kernel/Makefile b/kernel/Makefile
--- a/kernel/Makefile	2013-08-03 09:59:52.000000000 +0200
+++ b/kernel/Makefile	2014-01-21 09:37:30.000000000 +0100
@@ -112,6 +112,9 @@
 obj-$(CONFIG_PADATA) += padata.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+obj-$(CONFIG_CRASHLOG) += crashlog.o
+endif
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff -ur a/kernel/module.c b/kernel/module.c
--- a/kernel/module.c	2013-08-24 11:37:17.000000000 +0200
+++ b/kernel/module.c	2014-02-17 11:57:56.000000000 +0100
@@ -107,6 +107,9 @@
 #ifdef CONFIG_KGDB_KDB
 struct list_head *kdb_modules = &modules; /* kdb needs the list of modules */
 #endif /* CONFIG_KGDB_KDB */
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_CRASHLOG)
+struct list_head *crashlog_modules = &modules;
+#endif
 
 
 /* Block module loading/unloading? */
@@ -2898,7 +2901,8 @@
 	mutex_unlock(&module_mutex);
 
 	/* Module is ready to execute: parsing args may do that. */
-	err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
+	err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
+			 -32768, 32767, NULL);
 	if (err < 0)
 		goto unlink;
 
diff -ur a/kernel/params.c b/kernel/params.c
--- a/kernel/params.c	2013-08-24 11:37:17.000000000 +0200
+++ b/kernel/params.c	2014-02-17 11:57:56.000000000 +0100
@@ -94,6 +94,8 @@
 		     char *val,
 		     const struct kernel_param *params,
 		     unsigned num_params,
+		     s16 min_level,
+		     s16 max_level,
 		     int (*handle_unknown)(char *param, char *val))
 {
 	unsigned int i;
@@ -102,6 +104,9 @@
 	/* Find parameter */
 	for (i = 0; i < num_params; i++) {
 		if (parameq(param, params[i].name)) {
+			if (params[i].level < min_level
+			    || params[i].level > max_level)
+				return 0;
 			/* No one handled NULL, so do it here. */
 			if (!val && params[i].ops->set != param_set_bool)
 				return -EINVAL;
@@ -180,6 +185,8 @@
 	       char *args,
 	       const struct kernel_param *params,
 	       unsigned num,
+	       s16 min_level,
+	       s16 max_level,
 	       int (*unknown)(char *param, char *val))
 {
 	char *param, *val;
@@ -195,7 +202,8 @@
 
 		args = next_arg(args, &param, &val);
 		irq_was_disabled = irqs_disabled();
-		ret = parse_one(param, val, params, num, unknown);
+		ret = parse_one(param, val, params, num,
+				min_level, max_level, unknown);
 		if (irq_was_disabled && !irqs_disabled()) {
 			printk(KERN_WARNING "parse_args(): option '%s' enabled "
 					"irq's!\n", param);
@@ -303,35 +311,18 @@
 /* Actually could be a bool or an int, for historical reasons. */
 int param_set_bool(const char *val, const struct kernel_param *kp)
 {
-	bool v;
-	int ret;
-
 	/* No equals means "set"... */
 	if (!val) val = "1";
 
 	/* One of =[yYnN01] */
-	ret = strtobool(val, &v);
-	if (ret)
-		return ret;
-
-	if (kp->flags & KPARAM_ISBOOL)
-		*(bool *)kp->arg = v;
-	else
-		*(int *)kp->arg = v;
-	return 0;
+	return strtobool(val, kp->arg);
 }
 EXPORT_SYMBOL(param_set_bool);
 
 int param_get_bool(char *buffer, const struct kernel_param *kp)
 {
-	bool val;
-	if (kp->flags & KPARAM_ISBOOL)
-		val = *(bool *)kp->arg;
-	else
-		val = *(int *)kp->arg;
-
 	/* Y and N chosen as being relatively non-coder friendly */
-	return sprintf(buffer, "%c", val ? 'Y' : 'N');
+	return sprintf(buffer, "%c", *(bool *)kp->arg ? 'Y' : 'N');
 }
 EXPORT_SYMBOL(param_get_bool);
 
@@ -349,7 +340,6 @@
 	struct kernel_param dummy;
 
 	dummy.arg = &boolval;
-	dummy.flags = KPARAM_ISBOOL;
 	ret = param_set_bool(val, &dummy);
 	if (ret == 0)
 		*(bool *)kp->arg = !boolval;
@@ -369,13 +359,36 @@
 };
 EXPORT_SYMBOL(param_ops_invbool);
 
+int param_set_bint(const char *val, const struct kernel_param *kp)
+{
+	struct kernel_param boolkp;
+	bool v;
+	int ret;
+
+	/* Match bool exactly, by re-using it. */
+	boolkp = *kp;
+	boolkp.arg = &v;
+
+	ret = param_set_bool(val, &boolkp);
+	if (ret == 0)
+		*(int *)kp->arg = v;
+	return ret;
+}
+EXPORT_SYMBOL(param_set_bint);
+
+struct kernel_param_ops param_ops_bint = {
+	.set = param_set_bint,
+	.get = param_get_int,
+};
+EXPORT_SYMBOL(param_ops_bint);
+
 /* We break the rule and mangle the string. */
 static int param_array(const char *name,
 		       const char *val,
 		       unsigned int min, unsigned int max,
 		       void *elem, int elemsize,
 		       int (*set)(const char *, const struct kernel_param *kp),
-		       u16 flags,
+		       s16 level,
 		       unsigned int *num)
 {
 	int ret;
@@ -385,7 +398,7 @@
 	/* Get the name right for errors. */
 	kp.name = name;
 	kp.arg = elem;
-	kp.flags = flags;
+	kp.level = level;
 
 	*num = 0;
 	/* We expect a comma-separated list of values. */
@@ -426,7 +439,7 @@
 	unsigned int temp_num;
 
 	return param_array(kp->name, val, 1, arr->max, arr->elem,
-			   arr->elemsize, arr->ops->set, kp->flags,
+			   arr->elemsize, arr->ops->set, kp->level,
 			   arr->num ?: &temp_num);
 }
 
diff -ur a/kernel/power/Kconfig b/kernel/power/Kconfig
--- a/kernel/power/Kconfig	2013-08-03 09:59:52.000000000 +0200
+++ b/kernel/power/Kconfig	2014-01-21 09:37:30.000000000 +0100
@@ -148,6 +148,15 @@
 	You probably want to have your system's RTC driver statically
 	linked, ensuring that it's available when this test runs.
 
+config PM_SYSFS_MANUAL
+	bool "Driver model /sys/devices/.../power/state files"
+	depends on PM && SYNO_COMCERTO 
+	default n
+	---help---
+	The driver model started out with a sysfs file intended to provide
+	a userspace hook for device power management. Via this hook, non-CPU
+	devices can be put into a power-off state.
+
 config CAN_PM_TRACE
 	def_bool y
 	depends on PM_DEBUG && PM_SLEEP
diff -ur a/kernel/sched.c b/kernel/sched.c
--- a/kernel/sched.c	2013-08-24 11:37:17.000000000 +0200
+++ b/kernel/sched.c	2014-02-17 11:57:56.000000000 +0100
@@ -5299,6 +5299,9 @@
 	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
 		capable(CAP_SYS_NICE));
 }
+#if defined(CONFIG_SYNO_COMCERTO)
+EXPORT_SYMBOL_GPL(can_nice);
+#endif
 
 #ifdef __ARCH_WANT_SYS_NICE
 
diff -ur a/kernel/softirq.c b/kernel/softirq.c
--- a/kernel/softirq.c	2013-08-24 11:37:17.000000000 +0200
+++ b/kernel/softirq.c	2014-02-17 11:57:56.000000000 +0100
@@ -202,7 +202,11 @@
  * we want to handle softirqs as soon as possible, but they
  * should not be able to lock up the box.
  */
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_COMCERTO)
+#define MAX_SOFTIRQ_RESTART 2
+#else
 #define MAX_SOFTIRQ_RESTART 10
+#endif
 
 asmlinkage void __do_softirq(void)
 {
diff -ur a/kernel/sysctl.c b/kernel/sysctl.c
--- a/kernel/sysctl.c	2013-08-24 11:37:17.000000000 +0200
+++ b/kernel/sysctl.c	2014-02-17 11:57:56.000000000 +0100
@@ -235,11 +235,6 @@
 
 #endif
 
-#ifdef CONFIG_SYNO_ARMADA
-long gSynoUSBStation= 0;
-EXPORT_SYMBOL(gSynoUSBStation);
-#endif
-
 #ifdef CONFIG_SYNO_DISPLAY_CPUINFO
 unsigned int gSynoCPUInfoCore = 0;
 EXPORT_SYMBOL(gSynoCPUInfoCore);
@@ -257,6 +252,11 @@
 EXPORT_SYMBOL(gSynoFactoryUSB3Disable);
 #endif
 
+#ifdef CONFIG_SYNO_DUAL_HEAD
+int gSynoDualHead = 0;
+EXPORT_SYMBOL(gSynoDualHead);
+#endif
+
 #ifdef MY_ABC_HERE
 int gSynoNoEhci = 0;
 EXPORT_SYMBOL(gSynoNoEhci);
@@ -1347,15 +1347,6 @@
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_SYNO_ARMADA
-	{
-		.procname		= "syno_usbstation",
-		.data			= &gSynoUSBStation,
-		.maxlen 			= sizeof (int),
-		.mode			= 0444,
-		.proc_handler		= &proc_dointvec,
-	},
-#endif
 #ifdef CONFIG_SYNO_DISPLAY_CPUINFO
         {
             .procname       = "syno_CPU_info_core",
diff -ur a/lib/decompress.c b/lib/decompress.c
--- a/lib/decompress.c	2013-08-24 11:36:48.000000000 +0200
+++ b/lib/decompress.c	2014-02-17 11:57:03.000000000 +0100
@@ -40,6 +40,9 @@
 	{ {037, 0236}, "gzip", gunzip },
 	{ {0x42, 0x5a}, "bzip2", bunzip2 },
 	{ {0x5d, 0x00}, "lzma", unlzma },
+#if defined(CONFIG_SYNO_COMCERTO)
+	{ {0x6d, 0x00}, "lzma-openwrt", unlzma },
+#endif
 	{ {0xfd, 0x37}, "xz", unxz },
 	{ {0x89, 0x4c}, "lzo", unlzo },
 	{ {0, 0}, NULL, NULL }
diff -ur a/lib/decompress_unlzo.c b/lib/decompress_unlzo.c
--- a/lib/decompress_unlzo.c	2013-08-24 11:36:48.000000000 +0200
+++ b/lib/decompress_unlzo.c	2014-02-17 11:57:03.000000000 +0100
@@ -38,6 +38,9 @@
 
 #include <linux/types.h>
 #include <linux/lzo.h>
+#if defined(CONFIG_SYNO_COMCERTO)
+#include <linux/decompress/unlzo_mm.h>
+#endif
 #include <linux/decompress/mm.h>
 
 #include <linux/compiler.h>
diff -ur a/lib/Kconfig b/lib/Kconfig
--- a/lib/Kconfig	2013-08-03 09:59:52.000000000 +0200
+++ b/lib/Kconfig	2014-01-21 09:37:30.000000000 +0100
@@ -197,16 +197,16 @@
 # Textsearch support is select'ed if needed
 #
 config TEXTSEARCH
-	boolean
+	boolean	"Textsearch support"
 
 config TEXTSEARCH_KMP
-	tristate
+	tristate "Textsearch KMP"
 
 config TEXTSEARCH_BM
-	tristate
+	tristate "Textsearch BM"
 
 config TEXTSEARCH_FSM
-	tristate
+	tristate "Textsearch FSM"
 
 config BTREE
 	boolean
diff -ur a/lib/kobject_uevent.c b/lib/kobject_uevent.c
--- a/lib/kobject_uevent.c	2013-08-24 11:36:48.000000000 +0200
+++ b/lib/kobject_uevent.c	2014-02-17 11:57:03.000000000 +0100
@@ -50,6 +50,20 @@
 	[KOBJ_OFFLINE] =	"offline",
 };
 
+#if defined(CONFIG_SYNO_COMCERTO)
+u64 uevent_next_seqnum(void)
+{
+	u64 seq;
+
+	mutex_lock(&uevent_sock_mutex);
+	seq = ++uevent_seqnum;
+	mutex_unlock(&uevent_sock_mutex);
+
+	return seq;
+}
+EXPORT_SYMBOL_GPL(uevent_next_seqnum);
+#endif
+
 /**
  * kobject_action_type - translate action string to numeric type
  *
@@ -374,6 +388,45 @@
 }
 EXPORT_SYMBOL_GPL(add_uevent_var);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+#if defined(CONFIG_NET)
+int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group,
+		     gfp_t allocation)
+{
+	struct uevent_sock *ue_sk;
+	int err = 0;
+
+	/* send netlink message */
+	mutex_lock(&uevent_sock_mutex);
+	list_for_each_entry(ue_sk, &uevent_sock_list, list) {
+		struct sock *uevent_sock = ue_sk->sk;
+		struct sk_buff *skb2;
+
+		skb2 = skb_clone(skb, allocation);
+		if (!skb2)
+			break;
+
+		err = netlink_broadcast(uevent_sock, skb2, pid, group,
+					allocation);
+		if (err)
+			break;
+	}
+	mutex_unlock(&uevent_sock_mutex);
+
+	kfree_skb(skb);
+	return err;
+}
+#else
+int broadcast_uevent(struct sk_buff *skb, __u32 pid, __u32 group,
+		     gfp_t allocation)
+{
+	kfree_skb(skb);
+	return 0;
+}
+#endif
+EXPORT_SYMBOL_GPL(broadcast_uevent);
+#endif
+
 #if defined(CONFIG_NET)
 static int uevent_net_init(struct net *net)
 {
diff -ur a/lib/random32.c b/lib/random32.c
--- a/lib/random32.c	2013-08-24 11:36:48.000000000 +0200
+++ b/lib/random32.c	2014-02-17 11:57:03.000000000 +0100
@@ -42,13 +42,13 @@
 static DEFINE_PER_CPU(struct rnd_state, net_rand_state);
 
 /**
- *	prandom32 - seeded pseudo-random number generator.
+ *	prandom_u32_state - seeded pseudo-random number generator.
  *	@state: pointer to state structure holding seeded state.
  *
  *	This is used for pseudo-randomness with no outside seeding.
- *	For more random results, use random32().
+ *	For more random results, use prandom_u32().
  */
-u32 prandom32(struct rnd_state *state)
+u32 prandom_u32_state(struct rnd_state *state)
 {
 #define TAUSWORTHE(s,a,b,c,d) ((s&c)<<d) ^ (((s <<a) ^ s)>>b)
 
@@ -58,32 +58,81 @@
 
 	return (state->s1 ^ state->s2 ^ state->s3);
 }
-EXPORT_SYMBOL(prandom32);
+EXPORT_SYMBOL(prandom_u32_state);
 
 /**
- *	random32 - pseudo random number generator
+ *	prandom_u32 - pseudo random number generator
  *
  *	A 32 bit pseudo-random number is generated using a fast
  *	algorithm suitable for simulation. This algorithm is NOT
  *	considered safe for cryptographic use.
  */
-u32 random32(void)
+u32 prandom_u32(void)
 {
 	unsigned long r;
 	struct rnd_state *state = &get_cpu_var(net_rand_state);
-	r = prandom32(state);
+	r = prandom_u32_state(state);
 	put_cpu_var(state);
 	return r;
 }
-EXPORT_SYMBOL(random32);
+EXPORT_SYMBOL(prandom_u32);
+
+/*
+ *	prandom_bytes_state - get the requested number of pseudo-random bytes
+ *
+ *	@state: pointer to state structure holding seeded state.
+ *	@buf: where to copy the pseudo-random bytes to
+ *	@bytes: the requested number of bytes
+ *
+ *	This is used for pseudo-randomness with no outside seeding.
+ *	For more random results, use prandom_bytes().
+ */
+void prandom_bytes_state(struct rnd_state *state, void *buf, int bytes)
+{
+	unsigned char *p = buf;
+	int i;
+
+	for (i = 0; i < round_down(bytes, sizeof(u32)); i += sizeof(u32)) {
+		u32 random = prandom_u32_state(state);
+		int j;
+
+		for (j = 0; j < sizeof(u32); j++) {
+			p[i + j] = random;
+			random >>= BITS_PER_BYTE;
+		}
+	}
+	if (i < bytes) {
+		u32 random = prandom_u32_state(state);
+
+		for (; i < bytes; i++) {
+			p[i] = random;
+			random >>= BITS_PER_BYTE;
+		}
+	}
+}
+EXPORT_SYMBOL(prandom_bytes_state);
+
+/**
+ *	prandom_bytes - get the requested number of pseudo-random bytes
+ *	@buf: where to copy the pseudo-random bytes to
+ *	@bytes: the requested number of bytes
+ */
+void prandom_bytes(void *buf, int bytes)
+{
+	struct rnd_state *state = &get_cpu_var(net_rand_state);
+
+	prandom_bytes_state(state, buf, bytes);
+	put_cpu_var(state);
+}
+EXPORT_SYMBOL(prandom_bytes);
 
 /**
- *	srandom32 - add entropy to pseudo random number generator
+ *	prandom_seed - add entropy to pseudo random number generator
  *	@seed: seed value
  *
- *	Add some additional seeding to the random32() pool.
+ *	Add some additional seeding to the prandom pool.
  */
-void srandom32(u32 entropy)
+void prandom_seed(u32 entropy)
 {
 	int i;
 	/*
@@ -95,13 +144,13 @@
 		state->s1 = __seed(state->s1 ^ entropy, 1);
 	}
 }
-EXPORT_SYMBOL(srandom32);
+EXPORT_SYMBOL(prandom_seed);
 
 /*
  *	Generate some initially weak seeding values to allow
- *	to start the random32() engine.
+ *	to start the prandom_u32() engine.
  */
-static int __init random32_init(void)
+static int __init prandom_init(void)
 {
 	int i;
 
@@ -114,22 +163,22 @@
 		state->s3 = __seed(LCG(state->s2), 15);
 
 		/* "warm it up" */
-		prandom32(state);
-		prandom32(state);
-		prandom32(state);
-		prandom32(state);
-		prandom32(state);
-		prandom32(state);
+		prandom_u32_state(state);
+		prandom_u32_state(state);
+		prandom_u32_state(state);
+		prandom_u32_state(state);
+		prandom_u32_state(state);
+		prandom_u32_state(state);
 	}
 	return 0;
 }
-core_initcall(random32_init);
+core_initcall(prandom_init);
 
 /*
  *	Generate better values after random number generator
  *	is fully initialized.
  */
-static int __init random32_reseed(void)
+static int __init prandom_reseed(void)
 {
 	int i;
 
@@ -143,8 +192,8 @@
 		state->s3 = __seed(seeds[2], 15);
 
 		/* mix it in */
-		prandom32(state);
+		prandom_u32_state(state);
 	}
 	return 0;
 }
-late_initcall(random32_reseed);
+late_initcall(prandom_reseed);
diff -ur a/MAINTAINERS b/MAINTAINERS
--- a/MAINTAINERS	2013-08-03 09:59:49.000000000 +0200
+++ b/MAINTAINERS	2014-01-21 09:36:44.000000000 +0100
@@ -6765,6 +6765,12 @@
 F:	include/linux/mtd/ubi.h
 F:	include/mtd/ubi-user.h
 
+UNSORTED BLOCK IMAGES (UBI) Fastmap
+M:	Richard Weinberger <richard@nod.at>
+L:	linux-mtd@lists.infradead.org
+S:	Maintained
+F:	drivers/mtd/ubi/fastmap.c
+
 USB ACM DRIVER
 M:	Oliver Neukum <oliver@neukum.name>
 L:	linux-usb@vger.kernel.org
diff -ur a/Makefile b/Makefile
--- a/Makefile	2013-08-03 09:59:49.000000000 +0200
+++ b/Makefile	2014-01-21 09:36:44.000000000 +0100
@@ -564,10 +564,22 @@
 all: vmlinux
 
 ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
+ifdef CONFIG_SYNO_COMCERTO
+KBUILD_CFLAGS	+= -Os -fno-caller-saves
+else
 KBUILD_CFLAGS	+= -Os
+endif
+else
+ifdef CONFIG_SYNO_COMCERTO
+ifdef CONFIG_COMCERTO_CC_OPTIMIZE_O3
+KBUILD_CFLAGS	+= -O3 -fno-reorder-blocks -fno-tree-ch -fno-caller-saves
+else
+KBUILD_CFLAGS	+= -O2 -fno-reorder-blocks -fno-tree-ch -fno-caller-saves
+endif
 else
 KBUILD_CFLAGS	+= -O2
 endif
+endif
 
 include $(srctree)/arch/$(SRCARCH)/Makefile
 
@@ -625,6 +637,11 @@
 NOSTDINC_FLAGS += -nostdinc -isystem $(shell $(CC) -print-file-name=include)
 CHECKFLAGS     += $(NOSTDINC_FLAGS)
 
+ifdef CONFIG_SYNO_COMCERTO
+# improve gcc optimization
+CFLAGS += $(call cc-option,-funit-at-a-time,)
+endif
+
 # warn about C99 declaration after statement
 KBUILD_CFLAGS += $(call cc-option,-Wdeclaration-after-statement,)
 
diff -ur a/mm/backing-dev.c b/mm/backing-dev.c
--- a/mm/backing-dev.c	2013-08-24 11:37:16.000000000 +0200
+++ b/mm/backing-dev.c	2014-02-17 11:57:56.000000000 +0100
@@ -225,12 +225,40 @@
 }
 BDI_SHOW(max_ratio, bdi->max_ratio)
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+static ssize_t cpu0_bind_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+	unsigned int flag;
+
+	flag = simple_strtoul(buf, NULL, 10);
+	if (flag)
+		bdi->cpu0_bind = 1;
+	else
+		bdi->cpu0_bind = 0;
+
+	return count;
+}
+
+static ssize_t cpu0_bind_show(struct device *dev,
+			   struct device_attribute *attr, char *page)
+{
+	struct backing_dev_info *bdi = dev_get_drvdata(dev);
+
+	return snprintf(page, PAGE_SIZE-1, "%d\n", bdi->cpu0_bind);
+}
+#endif /* CONFIG_SYNO_COMCERTO && CONFIG_ARCH_M86XXX */
+
 #define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store)
 
 static struct device_attribute bdi_dev_attrs[] = {
 	__ATTR_RW(read_ahead_kb),
 	__ATTR_RW(min_ratio),
 	__ATTR_RW(max_ratio),
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+	__ATTR_RW(cpu0_bind),
+#endif
 	__ATTR_NULL,
 };
 
@@ -478,6 +506,10 @@
 				writeback_inodes_wb(&bdi->wb, 1024,
 						    WB_REASON_FORKER_THREAD);
 			} else {
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_M86XXX)
+				if (bdi->cpu0_bind)
+					kthread_bind(task, 0);
+#endif
 				/*
 				 * The spinlock makes sure we do not lose
 				 * wake-ups when racing with 'bdi_queue_work()'.
diff -ur a/mm/bootmem.c b/mm/bootmem.c
--- a/mm/bootmem.c	2013-08-24 11:37:16.000000000 +0200
+++ b/mm/bootmem.c	2014-02-17 11:57:56.000000000 +0100
@@ -15,6 +15,9 @@
 #include <linux/export.h>
 #include <linux/kmemleak.h>
 #include <linux/range.h>
+#if defined(CONFIG_SYNO_COMCERTO)
+#include <linux/crashlog.h>
+#endif
 #include <linux/memblock.h>
 
 #include <asm/bug.h>
@@ -178,6 +181,9 @@
 	if (!bdata->node_bootmem_map)
 		return 0;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	crashlog_init_mem(bdata);
+#endif
 	start = bdata->node_min_pfn;
 	end = bdata->node_low_pfn;
 
diff -ur a/mm/filemap.c b/mm/filemap.c
--- a/mm/filemap.c	2013-08-24 11:37:16.000000000 +0200
+++ b/mm/filemap.c	2014-02-17 11:57:56.000000000 +0100
@@ -1767,8 +1767,35 @@
 }
 EXPORT_SYMBOL(filemap_fault);
 
+int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct page *page = vmf->page;
+	struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
+	int ret = VM_FAULT_LOCKED;
+
+	sb_start_pagefault(inode->i_sb);
+	file_update_time(vma->vm_file);
+	lock_page(page);
+	if (page->mapping != inode->i_mapping) {
+		unlock_page(page);
+		ret = VM_FAULT_NOPAGE;
+		goto out;
+	}
+	/*
+	 * We mark the page dirty already here so that when freeze is in
+	 * progress, we are guaranteed that writeback during freezing will
+	 * see the dirty page and writeprotect it again.
+	 */
+	set_page_dirty(page);
+out:
+	sb_end_pagefault(inode->i_sb);
+	return ret;
+}
+EXPORT_SYMBOL(filemap_page_mkwrite);
+
 const struct vm_operations_struct generic_file_vm_ops = {
 	.fault		= filemap_fault,
+	.page_mkwrite	= filemap_page_mkwrite,
 };
 
 /* This is used for a general mmap of a disk file */
@@ -2300,7 +2327,7 @@
 	*wbytes = 0;
 	pos = *ppos;
 
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+	sb_start_write(inode->i_sb);
 
 	/*
 	 * We can write back this queue in page reclaim
@@ -2312,6 +2339,14 @@
 		goto done1;
 	}
 
+	if (file->f_op->syno_recvfile) {
+		file_remove_suid(file);
+		file_update_time(file);
+		err = file->f_op->syno_recvfile(file, sock, ppos, count, rbytes, wbytes);
+		sb_end_write(inode->i_sb);
+		current->backing_dev_info = NULL;
+		return err;
+	}
 	/* Check address_ops functions */
 	if (!mapping->a_ops->write_begin || !mapping->a_ops->write_end) {
 		printk("write_begin() or write_end() is not implemented\n");
@@ -2436,10 +2471,11 @@
 	}
 
 	if (!err) {
-		balance_dirty_pages_ratelimited_nr(mapping, cPagesAllocated);
+		balance_dirty_pages_ratelimited(mapping);
 	}
 
 done1:
+	sb_end_write(inode->i_sb);
 	current->backing_dev_info = NULL;
 	if (err) {
 		return err;
@@ -2447,7 +2483,6 @@
 		return bytes_received;
 	}
 }
-
 #endif
 
 ssize_t
@@ -2705,8 +2740,6 @@
 	count = ocount;
 	pos = *ppos;
 
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = mapping->backing_dev_info;
 	written = 0;
@@ -2722,7 +2755,9 @@
 	if (err)
 		goto out;
 
-	file_update_time(file);
+	err = file_update_time(file);
+	if (err)
+		goto out;
 
 	/* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
 	if (unlikely(file->f_flags & O_DIRECT)) {
@@ -2803,6 +2838,7 @@
 
 	BUG_ON(iocb->ki_pos != pos);
 
+	sb_start_write(inode->i_sb);
 	mutex_lock(&inode->i_mutex);
 	blk_start_plug(&plug);
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
@@ -2816,6 +2852,7 @@
 			ret = err;
 	}
 	blk_finish_plug(&plug);
+	sb_end_write(inode->i_sb);
 	return ret;
 }
 EXPORT_SYMBOL(generic_file_aio_write);
diff -ur a/mm/filemap_xip.c b/mm/filemap_xip.c
--- a/mm/filemap_xip.c	2013-08-24 11:37:16.000000000 +0200
+++ b/mm/filemap_xip.c	2014-02-17 11:57:56.000000000 +0100
@@ -304,6 +304,7 @@
 
 static const struct vm_operations_struct xip_file_vm_ops = {
 	.fault	= xip_file_fault,
+	.page_mkwrite	= filemap_page_mkwrite,
 };
 
 int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
@@ -401,6 +402,8 @@
 	loff_t pos;
 	ssize_t ret;
 
+	sb_start_write(inode->i_sb);
+
 	mutex_lock(&inode->i_mutex);
 
 	if (!access_ok(VERIFY_READ, buf, len)) {
@@ -411,8 +414,6 @@
 	pos = *ppos;
 	count = len;
 
-	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = mapping->backing_dev_info;
 
@@ -426,7 +427,9 @@
 	if (ret)
 		goto out_backing;
 
-	file_update_time(filp);
+	ret = file_update_time(filp);
+	if (ret)
+		goto out_backing;
 
 	ret = __xip_file_write (filp, buf, count, pos, ppos);
 
@@ -434,6 +437,7 @@
 	current->backing_dev_info = NULL;
  out_up:
 	mutex_unlock(&inode->i_mutex);
+	sb_end_write(inode->i_sb);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(xip_file_write);
diff -ur a/mm/memory.c b/mm/memory.c
--- a/mm/memory.c	2013-08-24 11:37:16.000000000 +0200
+++ b/mm/memory.c	2014-02-17 11:57:56.000000000 +0100
@@ -1401,6 +1401,9 @@
 	tlb_finish_mmu(&tlb, address, end);
 	return end;
 }
+#if defined(CONFIG_SYNO_COMCERTO)
+EXPORT_SYMBOL_GPL(zap_page_range);
+#endif
 
 /**
  * zap_vma_ptes - remove ptes mapping the vma
@@ -3076,6 +3079,9 @@
 	}
 	return 0;
 }
+#if defined(CONFIG_SYNO_COMCERTO)
+EXPORT_SYMBOL_GPL(vmtruncate_range);
+#endif
 
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
diff -ur a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c	2013-08-24 11:37:16.000000000 +0200
+++ b/mm/mmap.c	2014-02-17 11:57:56.000000000 +0100
@@ -1897,8 +1897,13 @@
 	update_hiwater_rss(mm);
 	unmap_vmas(&tlb, vma, start, end, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
+#if defined(CONFIG_SYNO_COMCERTO)
+	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+				 next ? next->vm_start : mm->task_size);
+#else
 	free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
 				 next ? next->vm_start : 0);
+#endif
 	tlb_finish_mmu(&tlb, start, end);
 }
 
@@ -2273,7 +2278,11 @@
 	end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
 	vm_unacct_memory(nr_accounted);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, mm->task_size);
+#else
 	free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, 0);
+#endif
 	tlb_finish_mmu(&tlb, 0, end);
 
 	/*
diff -ur a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c	2013-08-24 11:37:16.000000000 +0200
+++ b/mm/page_alloc.c	2014-02-17 11:57:56.000000000 +0100
@@ -1668,6 +1668,11 @@
 			!cpuset_zone_allowed_softwall(zone, gfp_mask))
 				continue;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_ZONE_DMA_NCNB)
+		if (!(gfp_mask & __GFP_DMA) && (zone_idx(zone) == ZONE_DMA))
+			continue;
+#endif
+
 		BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
 		if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
 			unsigned long mark;
diff -ur a/mm/page-writeback.c b/mm/page-writeback.c
--- a/mm/page-writeback.c	2013-08-24 11:37:16.000000000 +0200
+++ b/mm/page-writeback.c	2014-02-17 11:57:56.000000000 +0100
@@ -946,7 +946,7 @@
 }
 
 /*
- * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr()
+ * After a task dirtied this many pages, balance_dirty_pages_ratelimited()
  * will look to see if it needs to start dirty throttling.
  *
  * If dirty_poll_interval is too low, big NUMA machines will call the expensive
@@ -1214,10 +1214,25 @@
 
 static DEFINE_PER_CPU(int, bdp_ratelimits);
 
+/*
+ * Normal tasks are throttled by
+ *	loop {
+ *		dirty tsk->nr_dirtied_pause pages;
+ *		take a snap in balance_dirty_pages();
+ *	}
+ * However there is a worst case. If every task exit immediately when dirtied
+ * (tsk->nr_dirtied_pause - 1) pages, balance_dirty_pages() will never be
+ * called to throttle the page dirties. The solution is to save the not yet
+ * throttled page dirties in dirty_throttle_leaks on task exit and charge them
+ * randomly into the running tasks. This works well for the above worst case,
+ * as the new task will pick up and accumulate the old task's leaked dirty
+ * count and eventually get throttled.
+ */
+DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
+
 /**
- * balance_dirty_pages_ratelimited_nr - balance dirty memory state
+ * balance_dirty_pages_ratelimited - balance dirty memory state
  * @mapping: address_space which was dirtied
- * @nr_pages_dirtied: number of pages which the caller has just dirtied
  *
  * Processes which are dirtying memory should call in here once for each page
  * which was newly dirtied.  The function will periodically check the system's
@@ -1228,8 +1243,7 @@
  * limit we decrease the ratelimiting by a lot, to prevent individual processes
  * from overshooting the limit by (ratelimit_pages) each.
  */
-void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
-					unsigned long nr_pages_dirtied)
+void balance_dirty_pages_ratelimited(struct address_space *mapping)
 {
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	int ratelimit;
@@ -1242,8 +1256,6 @@
 	if (bdi->dirty_exceeded)
 		ratelimit = min(ratelimit, 32 >> (PAGE_SHIFT - 10));
 
-	current->nr_dirtied += nr_pages_dirtied;
-
 	preempt_disable();
 	/*
 	 * This prevents one CPU to accumulate too many dirtied pages without
@@ -1254,19 +1266,28 @@
 	p =  &__get_cpu_var(bdp_ratelimits);
 	if (unlikely(current->nr_dirtied >= ratelimit))
 		*p = 0;
-	else {
-		*p += nr_pages_dirtied;
-		if (unlikely(*p >= ratelimit_pages)) {
-			*p = 0;
-			ratelimit = 0;
-		}
+	else if (unlikely(*p >= ratelimit_pages)) {
+		*p = 0;
+		ratelimit = 0;
+	}
+	/*
+	 * Pick up the dirtied pages by the exited tasks. This avoids lots of
+	 * short-lived tasks (eg. gcc invocations in a kernel build) escaping
+	 * the dirty throttling and livelock other long-run dirtiers.
+	 */
+	p = &__get_cpu_var(dirty_throttle_leaks);
+	if (*p > 0 && current->nr_dirtied < ratelimit) {
+		unsigned long nr_pages_dirtied;
+		nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied);
+		*p -= nr_pages_dirtied;
+		current->nr_dirtied += nr_pages_dirtied;
 	}
 	preempt_enable();
 
 	if (unlikely(current->nr_dirtied >= ratelimit))
 		balance_dirty_pages(mapping, current->nr_dirtied);
 }
-EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
+EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
 
 void throttle_vm_writeout(gfp_t gfp_mask)
 {
@@ -1741,6 +1762,8 @@
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
 		task_io_account_write(PAGE_CACHE_SIZE);
+		current->nr_dirtied++;
+		this_cpu_inc(bdp_ratelimits);
 	}
 }
 EXPORT_SYMBOL(account_page_dirtied);
@@ -1801,6 +1824,24 @@
 EXPORT_SYMBOL(__set_page_dirty_nobuffers);
 
 /*
+ * Call this whenever redirtying a page, to de-account the dirty counters
+ * (NR_DIRTIED, BDI_DIRTIED, tsk->nr_dirtied), so that they match the written
+ * counters (NR_WRITTEN, BDI_WRITTEN) in long term. The mismatches will lead to
+ * systematic errors in balanced_dirty_ratelimit and the dirty pages position
+ * control.
+ */
+void account_page_redirty(struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	if (mapping && mapping_cap_account_dirty(mapping)) {
+		current->nr_dirtied--;
+		dec_zone_page_state(page, NR_DIRTIED);
+		dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+	}
+}
+EXPORT_SYMBOL(account_page_redirty);
+
+/*
  * When a writepage implementation decides that it doesn't want to write this
  * page for some reason, it should redirty the locked page via
  * redirty_page_for_writepage() and it should then unlock the page and return 0
@@ -1808,6 +1849,7 @@
 int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
 {
 	wbc->pages_skipped++;
+	account_page_redirty(page);
 	return __set_page_dirty_nobuffers(page);
 }
 EXPORT_SYMBOL(redirty_page_for_writepage);
diff -ur a/mm/shmem.c b/mm/shmem.c
--- a/mm/shmem.c	2013-08-24 11:37:16.000000000 +0200
+++ b/mm/shmem.c	2014-02-17 11:57:56.000000000 +0100
@@ -2505,6 +2505,18 @@
 
 /* common code */
 
+#if defined(CONFIG_SYNO_COMCERTO)
+void shmem_set_file(struct vm_area_struct *vma, struct file *file)
+{
+	if (vma->vm_file)
+		fput(vma->vm_file);
+	vma->vm_file = file;
+	vma->vm_ops = &shmem_vm_ops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+}
+EXPORT_SYMBOL_GPL(shmem_set_file);
+#endif
+
 /**
  * shmem_file_setup - get an unlinked file living in tmpfs
  * @name: name for dentry (to be seen in /proc/<pid>/maps
@@ -2582,11 +2594,16 @@
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	shmem_set_file(vma, file);
+#else
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	vma->vm_file = file;
 	vma->vm_ops = &shmem_vm_ops;
 	vma->vm_flags |= VM_CAN_NONLINEAR;
+#endif
+
 	return 0;
 }
 
diff -ur a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c	2013-08-24 11:37:16.000000000 +0200
+++ b/mm/slab.c	2014-02-17 11:57:56.000000000 +0100
@@ -749,7 +749,11 @@
 	 * for large kmalloc calls required.
 	 */
 #ifdef CONFIG_ZONE_DMA
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (unlikely(gfpflags & __GFP_DMA))
+#else
 	if (unlikely(gfpflags & GFP_DMA))
+#endif
 		return csizep->cs_dmacachep;
 #endif
 	return csizep->cs_cachep;
@@ -2445,7 +2449,12 @@
 	cachep->flags = flags;
 	cachep->gfpflags = 0;
 	if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
+#if defined(CONFIG_SYNO_COMCERTO)
+		cachep->gfpflags |= __GFP_DMA;
+#else
 		cachep->gfpflags |= GFP_DMA;
+#endif
+
 	cachep->buffer_size = size;
 	cachep->reciprocal_buffer_size = reciprocal_value(size);
 
@@ -2791,10 +2800,19 @@
 static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
 {
 	if (CONFIG_ZONE_DMA_FLAG) {
+#if defined(CONFIG_SYNO_COMCERTO)
+		if (flags & __GFP_DMA)
+			BUG_ON(!(cachep->gfpflags & __GFP_DMA));
+#else
 		if (flags & GFP_DMA)
 			BUG_ON(!(cachep->gfpflags & GFP_DMA));
+#endif
 		else
+#if defined(CONFIG_SYNO_COMCERTO)
+			BUG_ON(cachep->gfpflags & __GFP_DMA);
+#else
 			BUG_ON(cachep->gfpflags & GFP_DMA);
+#endif
 	}
 }
 
diff -ur a/mm/vmalloc.c b/mm/vmalloc.c
--- a/mm/vmalloc.c	2013-08-24 11:37:16.000000000 +0200
+++ b/mm/vmalloc.c	2014-02-17 11:57:56.000000000 +0100
@@ -1233,6 +1233,9 @@
 	vunmap_page_range(addr, end);
 	flush_tlb_kernel_range(addr, end);
 }
+#if defined(CONFIG_SYNO_COMCERTO)
+EXPORT_SYMBOL_GPL(unmap_kernel_range);
+#endif
 
 int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
 {
@@ -1370,6 +1373,9 @@
 	return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
 				-1, GFP_KERNEL, __builtin_return_address(0));
 }
+#if defined(CONFIG_SYNO_COMCERTO)
+EXPORT_SYMBOL_GPL(get_vm_area);
+#endif
 
 struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
 				void *caller)
diff -ur a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
--- a/net/9p/trans_virtio.c	2013-08-24 11:36:43.000000000 +0200
+++ b/net/9p/trans_virtio.c	2014-02-17 11:56:55.000000000 +0100
@@ -271,7 +271,8 @@
 	in = pack_sg_list(chan->sg, out,
 			  VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity);
 
-	err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc);
+	err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc,
+				GFP_ATOMIC);
 	if (err < 0) {
 		if (err == -ENOSPC) {
 			chan->ring_bufs_avail = 0;
@@ -414,7 +415,8 @@
 		in += pack_sg_list_p(chan->sg, out + in, VIRTQUEUE_NUM,
 				     in_pages, in_nr_pages, uidata, inlen);
 
-	err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc);
+	err = virtqueue_add_buf(chan->vq, chan->sg, out, in, req->tc,
+				GFP_ATOMIC);
 	if (err < 0) {
 		if (err == -ENOSPC) {
 			chan->ring_bufs_avail = 0;
diff -ur a/net/bridge/br_forward.c b/net/bridge/br_forward.c
--- a/net/bridge/br_forward.c	2013-08-24 11:36:43.000000000 +0200
+++ b/net/bridge/br_forward.c	2014-02-17 11:56:55.000000000 +0100
@@ -43,7 +43,11 @@
 {
 	/* ip_fragment doesn't copy the MAC header */
 	if (nf_bridge_maybe_copy_header(skb) ||
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	    (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb) && (!skb->ipsec_offload))) {
+#else
 	    (packet_length(skb) > skb->dev->mtu && !skb_is_gso(skb))) {
+#endif
 		kfree_skb(skb);
 	} else {
 		skb_push(skb, ETH_HLEN);
@@ -110,7 +114,11 @@
 /* called with rcu_read_lock */
 void br_forward(const struct net_bridge_port *to, struct sk_buff *skb, struct sk_buff *skb0)
 {
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (should_deliver(to, skb) && !(to->flags & BR_ISOLATE_MODE)) {
+#else
 	if (should_deliver(to, skb)) {
+#endif
 		if (skb0)
 			deliver_clone(to, skb, __br_forward);
 		else
@@ -162,10 +170,18 @@
 }
 
 /* called under bridge lock */
+#if defined(CONFIG_SYNO_COMCERTO)
+static void br_flood(struct net_bridge *br, struct sk_buff *skb,
+		     struct sk_buff *skb0,
+		     void (*__packet_hook)(const struct net_bridge_port *p,
+					   struct sk_buff *skb),
+		     bool forward)
+#else
 static void br_flood(struct net_bridge *br, struct sk_buff *skb,
 		     struct sk_buff *skb0,
 		     void (*__packet_hook)(const struct net_bridge_port *p,
 					   struct sk_buff *skb))
+#endif
 {
 	struct net_bridge_port *p;
 	struct net_bridge_port *prev;
@@ -173,6 +189,11 @@
 	prev = NULL;
 
 	list_for_each_entry_rcu(p, &br->port_list, list) {
+#if defined(CONFIG_SYNO_COMCERTO)
+		if (forward && (p->flags & BR_ISOLATE_MODE))
+			continue;
+#endif
+
 		prev = maybe_deliver(prev, p, skb, __packet_hook);
 		if (IS_ERR(prev))
 			goto out;
@@ -196,14 +217,22 @@
 /* called with rcu_read_lock */
 void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb)
 {
+#if defined(CONFIG_SYNO_COMCERTO)
+	br_flood(br, skb, NULL, __br_deliver, false);
+#else
 	br_flood(br, skb, NULL, __br_deliver);
+#endif
 }
 
 /* called under bridge lock */
 void br_flood_forward(struct net_bridge *br, struct sk_buff *skb,
 		      struct sk_buff *skb2)
 {
+#if defined(CONFIG_SYNO_COMCERTO)
+	br_flood(br, skb, skb2, __br_forward, true);
+#else
 	br_flood(br, skb, skb2, __br_forward);
+#endif
 }
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
diff -ur a/net/bridge/br_input.c b/net/bridge/br_input.c
--- a/net/bridge/br_input.c	2013-08-24 11:36:43.000000000 +0200
+++ b/net/bridge/br_input.c	2014-02-17 11:56:55.000000000 +0100
@@ -65,7 +65,11 @@
 	    br_multicast_rcv(br, p, skb))
 		goto drop;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	if ((p->state == BR_STATE_LEARNING) && skb->protocol != htons(ETH_P_PAE))
+#else
 	if (p->state == BR_STATE_LEARNING)
+#endif
 		goto drop;
 
 	BR_INPUT_SKB_CB(skb)->brdev = br->dev;
@@ -78,6 +82,13 @@
 
 	dst = NULL;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (skb->protocol == htons(ETH_P_PAE)) {
+		skb2 = skb;
+		/* Do not forward 802.1x/EAP frames */
+		skb = NULL;
+	} else
+#endif
 	if (is_broadcast_ether_addr(dest))
 		skb2 = skb;
 	else if (is_multicast_ether_addr(dest)) {
@@ -94,7 +105,12 @@
 			skb2 = skb;
 
 		br->dev->stats.multicast++;
+#if defined(CONFIG_SYNO_COMCERTO)
+	} else if ((p->flags & BR_ISOLATE_MODE) ||
+		   ((dst = __br_fdb_get(br, dest)) && dst->is_local)) {
+#else
 	} else if ((dst = __br_fdb_get(br, dest)) && dst->is_local) {
+#endif
 		skb2 = skb;
 		/* Do not forward the packet since it's local. */
 		skb = NULL;
diff -ur a/net/bridge/br_private.h b/net/bridge/br_private.h
--- a/net/bridge/br_private.h	2013-08-24 11:36:43.000000000 +0200
+++ b/net/bridge/br_private.h	2014-02-17 11:56:55.000000000 +0100
@@ -141,6 +141,9 @@
 
 	unsigned long 			flags;
 #define BR_HAIRPIN_MODE		0x00000001
+#if defined(CONFIG_SYNO_COMCERTO)
+#define BR_ISOLATE_MODE		0x00000002
+#endif
 
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	u32				multicast_startup_queries_sent;
diff -ur a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
--- a/net/bridge/br_sysfs_if.c	2013-08-24 11:36:43.000000000 +0200
+++ b/net/bridge/br_sysfs_if.c	2014-02-17 11:56:55.000000000 +0100
@@ -149,6 +149,24 @@
 static BRPORT_ATTR(hairpin_mode, S_IRUGO | S_IWUSR,
 		   show_hairpin_mode, store_hairpin_mode);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+static ssize_t show_isolate_mode(struct net_bridge_port *p, char *buf)
+{
+	int isolate_mode = (p->flags & BR_ISOLATE_MODE) ? 1 : 0;
+	return sprintf(buf, "%d\n", isolate_mode);
+}
+static ssize_t store_isolate_mode(struct net_bridge_port *p, unsigned long v)
+{
+	if (v)
+		p->flags |= BR_ISOLATE_MODE;
+	else
+		p->flags &= ~BR_ISOLATE_MODE;
+	return 0;
+}
+static BRPORT_ATTR(isolate_mode, S_IRUGO | S_IWUSR,
+		   show_isolate_mode, store_isolate_mode);
+#endif
+
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf)
 {
@@ -181,6 +199,9 @@
 	&brport_attr_hold_timer,
 	&brport_attr_flush,
 	&brport_attr_hairpin_mode,
+#if defined(CONFIG_SYNO_COMCERTO)
+	&brport_attr_isolate_mode,
+#endif
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	&brport_attr_multicast_router,
 #endif
diff -ur a/net/bridge/Kconfig b/net/bridge/Kconfig
--- a/net/bridge/Kconfig	2013-08-03 09:59:52.000000000 +0200
+++ b/net/bridge/Kconfig	2014-01-21 09:37:31.000000000 +0100
@@ -6,7 +6,6 @@
 	tristate "802.1d Ethernet Bridging"
 	select LLC
 	select STP
-	depends on IPV6 || IPV6=n
 	---help---
 	  If you say Y here, then your Linux box will be able to act as an
 	  Ethernet bridge, which means that the different Ethernet segments it
diff -ur a/net/core/dev.c b/net/core/dev.c
--- a/net/core/dev.c	2013-08-24 11:36:42.000000000 +0200
+++ b/net/core/dev.c	2014-02-17 11:56:54.000000000 +0100
@@ -201,25 +201,27 @@
 	if (!szMac || !szDev)
 		goto ERR;
 
-	if (!memcmp(szDev, "eth0", 4)) {
+	// Following __dev_get_by_name(), use strncmp with IFNAMSIZ
+	// instead of memcmp to avoid #48870.
+	if (!strncmp(szDev, "eth0", IFNAMSIZ)) {
 		if (!strcmp(grgbLanMac[0], "")) {
 			err = SYNO_VENDOR_MAC_EMPTY;
 			goto ERR;
 		}
 		convert_str_to_mac(grgbLanMac[0], szMac);
-	} else if ( !memcmp(szDev, "eth1", 4) ) {
+	} else if (!strncmp(szDev, "eth1", IFNAMSIZ)) {
 		if (!strcmp(grgbLanMac[1], "")) {
 			err = SYNO_VENDOR_MAC_EMPTY;
 			goto ERR;
 		}
 		convert_str_to_mac(grgbLanMac[1], szMac);
-	} else if ( !memcmp(szDev, "eth2", 4) ) {
+	} else if (!strncmp(szDev, "eth2", IFNAMSIZ)) {
 		if (!strcmp(grgbLanMac[2], "")) {
 			err = SYNO_VENDOR_MAC_EMPTY;
 			goto ERR;
 		}
 		convert_str_to_mac(grgbLanMac[2], szMac);
-	} else if ( !memcmp(szDev, "eth3", 4) ) {
+	} else if (!strncmp(szDev, "eth3", IFNAMSIZ)) {
 		if (!strcmp(grgbLanMac[3], "")) {
 			err = SYNO_VENDOR_MAC_EMPTY;
 			goto ERR;
diff -ur a/net/core/flow.c b/net/core/flow.c
--- a/net/core/flow.c	2013-08-24 11:36:42.000000000 +0200
+++ b/net/core/flow.c	2014-02-17 11:56:54.000000000 +0100
@@ -22,6 +22,9 @@
 #include <linux/cpumask.h>
 #include <linux/mutex.h>
 #include <net/flow.h>
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+#include <net/xfrm.h>
+#endif
 #include <linux/atomic.h>
 #include <linux/security.h>
 
@@ -204,9 +207,15 @@
 	return 0;
 }
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+struct flow_cache_object *
+flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
+			u8 *new_flow, flow_resolve_t resolver, void *ctx)
+#else
 struct flow_cache_object *
 flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir,
 		  flow_resolve_t resolver, void *ctx)
+#endif
 {
 	struct flow_cache *fc = &flow_cache_global;
 	struct flow_cache_percpu *fcp;
@@ -216,6 +225,11 @@
 	size_t keysize;
 	unsigned int hash;
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	if (new_flow)
+		*new_flow = 0;
+#endif
+
 	local_bh_disable();
 	fcp = this_cpu_ptr(fc->percpu);
 
@@ -281,8 +295,17 @@
 	flo = resolver(net, key, family, dir, flo, ctx);
 	if (fle) {
 		fle->genid = atomic_read(&flow_cache_genid);
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+		if (!IS_ERR(flo)) {
+#else
 		if (!IS_ERR(flo))
+#endif
 			fle->object = flo;
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+			if (new_flow)
+				*new_flow = 1;
+		}
+#endif
 		else
 			fle->genid--;
 	} else {
@@ -358,6 +381,39 @@
 	put_online_cpus();
 }
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+void flow_cache_remove(const struct flowi *key, 
+			unsigned short family, unsigned short dir)
+{
+	struct flow_cache *fc = &flow_cache_global;
+	struct flow_cache_percpu *fcp;
+	struct flow_cache_entry *fle;
+	struct hlist_node *entry;
+	size_t keysize;
+	unsigned int hash;
+
+	local_bh_disable();
+	fcp = this_cpu_ptr(fc->percpu);
+	
+	keysize = flow_key_size(family);
+	if (!keysize)
+		goto nocache;
+
+	hash = flow_hash_code(fc, fcp, key, keysize);
+	
+	hlist_for_each_entry(fle, entry, &fcp->hash_table[hash], u.hlist) {
+		if((fle->family == family) && (fle->dir == dir) && (flow_key_compare(&fle->key, key, keysize) == 0)) {
+			hlist_del(&fle->u.hlist);
+			flow_entry_kill(fle);
+			break;
+		}
+	}
+		
+nocache:	
+	local_bh_enable();
+}
+#endif
+
 static void flow_cache_flush_task(struct work_struct *work)
 {
 	flow_cache_flush();
diff -ur a/net/core/rtnetlink.c b/net/core/rtnetlink.c
--- a/net/core/rtnetlink.c	2013-08-24 11:36:42.000000000 +0200
+++ b/net/core/rtnetlink.c	2014-02-17 11:56:54.000000000 +0100
@@ -621,7 +621,40 @@
 	return -EMSGSIZE;
 }
 EXPORT_SYMBOL(rtnetlink_put_metrics);
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_COMCERTO)
+int rtnetlink_put_metrics_2(struct sk_buff *skb, u32 *metrics, struct dst_entry *dst)
+{
+	struct nlattr *mx;
+	int i, valid = 0;
+
+	mx = nla_nest_start(skb, RTA_METRICS);
+	if (mx == NULL)
+		return -ENOBUFS;
 
+	for (i = 0; i < RTAX_MAX; i++) {
+		if (metrics[i]) {
+			valid++;
+			NLA_PUT_U32(skb, i+1, metrics[i]);
+		}
+		else if ((i + 1) == RTAX_MTU) {
+			valid++;
+			NLA_PUT_U32(skb, i+1, dst_mtu(dst));
+		}
+	}
+
+	if (!valid) {
+		nla_nest_cancel(skb, mx);
+		return 0;
+	}
+
+	return nla_nest_end(skb, mx);
+
+nla_put_failure:
+	nla_nest_cancel(skb, mx);
+	return -EMSGSIZE;
+}
+EXPORT_SYMBOL(rtnetlink_put_metrics_2);
+#endif
 int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id,
 		       u32 ts, u32 tsage, long expires, u32 error)
 {
@@ -1975,6 +2008,10 @@
 	if (err < 0)
 		rtnl_set_sk_err(net, RTNLGRP_LINK, err);
 }
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+EXPORT_SYMBOL(rtmsg_ifinfo);
+#endif
+
 
 /* Protected by RTNL sempahore.  */
 static struct rtattr **rta_buf;
diff -ur a/net/core/skbuff.c b/net/core/skbuff.c
--- a/net/core/skbuff.c	2013-08-24 11:36:42.000000000 +0200
+++ b/net/core/skbuff.c	2014-02-17 11:56:54.000000000 +0100
@@ -244,6 +244,88 @@
 }
 EXPORT_SYMBOL(__alloc_skb);
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_COMCERTO)
+/**
+ *	__alloc_skb_header	-	allocate a network buffer
+ *	@size: size to allocate
+ *	@gfp_mask: allocation mask
+ *	@fclone: allocate from fclone cache instead of head cache
+ *		and allocate a cloned (child) skb
+ *
+ *	Allocate a new &sk_buff. The returned buffer has no headroom and a
+ *	tail room of size bytes. The object has a reference count of one.
+ *	The return is the buffer. On a failure the return is %NULL.
+ *
+ *	Buffers may only be allocated from interrupts using a @gfp_mask of
+ *	%GFP_ATOMIC.
+ */
+struct sk_buff *__alloc_skb_header(unsigned int size, void *data, gfp_t gfp_mask,
+			    int fclone, int node)
+{
+	struct kmem_cache *cache;
+	struct skb_shared_info *shinfo;
+	struct sk_buff *skb;
+
+	cache = fclone ? skbuff_fclone_cache : skbuff_head_cache;
+
+	if (size <= SKB_DATA_ALIGN(sizeof(struct skb_shared_info))) {
+		skb = NULL;
+		goto out;
+	}
+
+	/* Get the HEAD */
+	skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node);
+	if (!skb)
+		goto out;
+	prefetchw(skb);
+
+	/* kmalloc might give us more room than requested.
+	 * Put skb_shared_info exactly at the end of allocated zone,
+	 * to allow max possible filling before reallocation.
+	 */
+	size = SKB_WITH_OVERHEAD(ksize(data));
+	prefetchw(data + size);
+
+	/*
+	 * Only clear those fields we need to clear, not those that we will
+	 * actually initialise below. Hence, don't put any more fields after
+	 * the tail pointer in struct sk_buff!
+	 */
+	memset(skb, 0, offsetof(struct sk_buff, tail));
+	/* Account for allocated memory : skb + skb->head */
+	skb->truesize = SKB_TRUESIZE(size);
+	atomic_set(&skb->users, 1);
+	skb->head = data;
+	skb->data = data;
+	skb_reset_tail_pointer(skb);
+	skb->end = skb->tail + size;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+	skb->mac_header = ~0U;
+#endif
+
+	/* make sure we initialize shinfo sequentially */
+	shinfo = skb_shinfo(skb);
+	memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+	atomic_set(&shinfo->dataref, 1);
+	kmemcheck_annotate_variable(shinfo->destructor_arg);
+
+	if (fclone) {
+		struct sk_buff *child = skb + 1;
+		atomic_t *fclone_ref = (atomic_t *) (child + 1);
+
+		kmemcheck_annotate_bitfield(child, flags1);
+		kmemcheck_annotate_bitfield(child, flags2);
+		skb->fclone = SKB_FCLONE_ORIG;
+		atomic_set(fclone_ref, 1);
+
+		child->fclone = SKB_FCLONE_UNAVAILABLE;
+	}
+out:
+	return skb;
+}
+EXPORT_SYMBOL(__alloc_skb_header);
+#endif
+
 /**
  *	__netdev_alloc_skb - allocate an skbuff for rx on a specific device
  *	@dev: network device to receive on
@@ -588,6 +670,9 @@
 #ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);
 #endif
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+        new->ipsec_offload      = old->ipsec_offload;
+#endif
 	memcpy(new->cb, old->cb, sizeof(old->cb));
 	new->csum		= old->csum;
 	new->local_df		= old->local_df;
diff -ur a/net/ipv4/Kconfig b/net/ipv4/Kconfig
--- a/net/ipv4/Kconfig	2013-08-03 09:59:52.000000000 +0200
+++ b/net/ipv4/Kconfig	2014-01-21 09:37:31.000000000 +0100
@@ -339,6 +339,12 @@
 
 	  If unsure, say Y.
 
+config INET_IPSEC_OFFLOAD
+	bool "IPsec Fast Path Processing offload"
+	depends on (INET_ESP || INET_AH) && COMCERTO_FP && SYNO_COMCERTO
+	---help---
+	  Support for IPsec Fast Path offload.
+
 config INET_IPCOMP
 	tristate "IP: IPComp transformation"
 	select INET_XFRM_TUNNEL
diff -ur a/net/ipv4/Makefile b/net/ipv4/Makefile
--- a/net/ipv4/Makefile	2013-08-03 09:59:52.000000000 +0200
+++ b/net/ipv4/Makefile	2014-01-21 09:37:31.000000000 +0100
@@ -22,7 +22,11 @@
 obj-$(CONFIG_NET_IPGRE) += ip_gre.o
 obj-$(CONFIG_SYN_COOKIES) += syncookies.o
 obj-$(CONFIG_INET_AH) += ah4.o
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+obj-$(CONFIG_INET_IPSEC_OFFLOAD) += esp4.o
+else
 obj-$(CONFIG_INET_ESP) += esp4.o
+endif
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
 obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
diff -ur a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
--- a/net/ipv4/netfilter/ip_tables.c	2013-08-24 11:36:41.000000000 +0200
+++ b/net/ipv4/netfilter/ip_tables.c	2014-02-17 11:56:52.000000000 +0100
@@ -81,9 +81,20 @@
 
 #define FWINV(bool, invflg) ((bool) ^ !!(ipinfo->invflags & (invflg)))
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (ipinfo->flags & IPT_F_NO_DEF_MATCH)
+		return true;
+
+	if (FWINV(ipinfo->smsk.s_addr &&
+		  (ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
+		  IPT_INV_SRCIP) ||
+	    FWINV(ipinfo->dmsk.s_addr &&
+		  (ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
+#else
 	if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr,
 		  IPT_INV_SRCIP) ||
 	    FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr,
+#endif
 		  IPT_INV_DSTIP)) {
 		dprintf("Source or dest mismatch.\n");
 
@@ -134,6 +145,31 @@
 	return true;
 }
 
+#if defined(CONFIG_SYNO_COMCERTO)
+static void
+ip_checkdefault(struct ipt_ip *ip)
+{
+	static const char iface_mask[IFNAMSIZ] = {};
+
+	if (ip->invflags || ip->flags & IPT_F_FRAG)
+		return;
+
+	if (memcmp(ip->iniface_mask, iface_mask, IFNAMSIZ) != 0)
+		return;
+
+	if (memcmp(ip->outiface_mask, iface_mask, IFNAMSIZ) != 0)
+		return;
+
+	if (ip->smsk.s_addr || ip->dmsk.s_addr)
+		return;
+
+	if (ip->proto)
+		return;
+
+	ip->flags |= IPT_F_NO_DEF_MATCH;
+}
+#endif
+
 static bool
 ip_checkentry(const struct ipt_ip *ip)
 {
@@ -284,6 +320,35 @@
 	return (void *)entry + entry->next_offset;
 }
 
+#if defined(CONFIG_SYNO_COMCERTO)
+static bool
+ipt_handle_default_rule(struct ipt_entry *e, unsigned int *verdict)
+{
+	struct xt_entry_target *t;
+	struct xt_standard_target *st;
+
+	if (e->target_offset != sizeof(struct ipt_entry))
+		return false;
+
+	if (!(e->ip.flags & IPT_F_NO_DEF_MATCH))
+		return false;
+
+	t = ipt_get_target(e);
+	if (t->u.kernel.target->target)
+		return false;
+
+	st = (struct xt_standard_target *) t;
+	if (st->verdict == XT_RETURN)
+		return false;
+
+	if (st->verdict >= 0)
+		return false;
+
+	*verdict = (unsigned)(-st->verdict) - 1;
+	return true;
+}
+#endif
+
 /* Returns one of the generic firewall policies, like NF_ACCEPT. */
 unsigned int
 ipt_do_table(struct sk_buff *skb,
@@ -308,6 +373,27 @@
 	ip = ip_hdr(skb);
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
+
+#if defined(CONFIG_SYNO_COMCERTO)
+	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+	local_bh_disable();
+	addend = xt_write_recseq_begin();
+	private = table->private;
+	cpu        = smp_processor_id();
+	table_base = private->entries[cpu];
+	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
+	stackptr   = per_cpu_ptr(private->stackptr, cpu);
+	origptr    = *stackptr;
+
+	e = get_entry(table_base, private->hook_entry[hook]);
+	if (ipt_handle_default_rule(e, &verdict)) {
+		ADD_COUNTER(e->counters, skb->len, 1);
+		xt_write_recseq_end(addend);
+		local_bh_enable();
+		return verdict;
+	}
+#endif
+
 	/* We handle fragments by dealing with the first fragment as
 	 * if it was a normal packet.  All other fragments are treated
 	 * normally, except that they will NEVER match rules that ask
@@ -322,6 +408,7 @@
 	acpar.family  = NFPROTO_IPV4;
 	acpar.hooknum = hook;
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 	local_bh_disable();
 	addend = xt_write_recseq_begin();
@@ -334,6 +421,7 @@
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 
+#endif
 	pr_debug("Entering %s(hook %u); sp at %u (UF %p)\n",
 		 table->name, hook, origptr,
 		 get_entry(table_base, private->underflow[hook]));
@@ -561,7 +649,11 @@
 }
 
 static int
+#if defined(CONFIG_SYNO_COMCERTO)
+check_entry(struct ipt_entry *e, const char *name)
+#else
 check_entry(const struct ipt_entry *e, const char *name)
+#endif
 {
 	const struct xt_entry_target *t;
 
@@ -570,6 +662,10 @@
 		return -EINVAL;
 	}
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	ip_checkdefault(&e->ip);
+#endif
+
 	if (e->target_offset + sizeof(struct xt_entry_target) >
 	    e->next_offset)
 		return -EINVAL;
@@ -932,6 +1028,9 @@
 	const struct xt_table_info *private = table->private;
 	int ret = 0;
 	const void *loc_cpu_entry;
+#if defined(CONFIG_SYNO_COMCERTO)
+	u8 flags;
+#endif
 
 	counters = alloc_counters(table);
 	if (IS_ERR(counters))
@@ -963,6 +1062,16 @@
 			goto free_counters;
 		}
 
+#if defined(CONFIG_SYNO_COMCERTO)
+		flags = e->ip.flags & IPT_F_MASK;
+		if (copy_to_user(userptr + off
+				 + offsetof(struct ipt_entry, ip.flags),
+				 &flags, sizeof(flags)) != 0) {
+			ret = -EFAULT;
+			goto free_counters;
+		}
+#endif
+
 		for (i = sizeof(struct ipt_entry);
 		     i < e->target_offset;
 		     i += m->u.match_size) {
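
The ipt_handle_default_rule() fast path added above only fires when the hook entry is a match-all rule (IPT_F_NO_DEF_MATCH, set by ip_checkdefault()) whose target is a built-in standard verdict. Standard targets store NF_ACCEPT/NF_DROP as negative numbers, so the patch recovers the netfilter verdict with "-(st->verdict) - 1". A tiny standalone illustration of that decoding (NF_DROP/NF_ACCEPT are the usual netfilter values; the rest is a stand-in):

#include <stdio.h>

/* Netfilter verdicts as in <linux/netfilter.h>. */
#define NF_DROP   0
#define NF_ACCEPT 1

/* A standard xt target encodes verdict V as -V - 1, so a stored value of
 * -1 means NF_DROP and -2 means NF_ACCEPT; XT_RETURN and positive values
 * (jumps) are rejected by ipt_handle_default_rule() before this point. */
static unsigned int decode_standard_verdict(int stored)
{
	return (unsigned int)(-stored) - 1;
}

int main(void)
{
	printf("stored -1 -> %u (NF_DROP)\n", decode_standard_verdict(-1));
	printf("stored -2 -> %u (NF_ACCEPT)\n", decode_standard_verdict(-2));
	return 0;
}
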
diff -ur a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c
--- a/net/ipv4/netfilter/nf_nat_sip.c	2013-08-24 11:36:41.000000000 +0200
+++ b/net/ipv4/netfilter/nf_nat_sip.c	2014-02-17 11:56:52.000000000 +0100
@@ -73,6 +73,9 @@
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+#if defined(CONFIG_SYNO_COMCERTO)
+	struct nf_conn_help *help = nfct_help(ct);
+#endif
 	char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
 	unsigned int buflen;
 	__be32 newaddr;
@@ -85,7 +88,12 @@
 	} else if (ct->tuplehash[dir].tuple.dst.u3.ip == addr->ip &&
 		   ct->tuplehash[dir].tuple.dst.u.udp.port == port) {
 		newaddr = ct->tuplehash[!dir].tuple.src.u3.ip;
+#if defined(CONFIG_SYNO_COMCERTO)
+		newport = help->help.ct_sip_info.forced_dport ? :
+			  ct->tuplehash[!dir].tuple.src.u.udp.port;
+#else
 		newport = ct->tuplehash[!dir].tuple.src.u.udp.port;
+#endif
 	} else
 		return 1;
 
@@ -121,6 +129,9 @@
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+#if defined(CONFIG_SYNO_COMCERTO)
+	struct nf_conn_help *help = nfct_help(ct);
+#endif
 	unsigned int coff, matchoff, matchlen;
 	enum sip_header_types hdr;
 	union nf_inet_addr addr;
@@ -230,6 +241,22 @@
 	    !map_sip_addr(skb, dataoff, dptr, datalen, SIP_HDR_TO))
 		return NF_DROP;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	/* Mangle destination port for Cisco phones, then fix up checksums */
+	if (dir == IP_CT_DIR_REPLY && help->help.ct_sip_info.forced_dport) {
+		struct udphdr *uh;
+
+		if (!skb_make_writable(skb, skb->len))
+			return NF_DROP;
+
+		uh = (struct udphdr *)(skb->data + ip_hdrlen(skb));
+		uh->dest = help->help.ct_sip_info.forced_dport;
+
+		if (!nf_nat_mangle_udp_packet(skb, ct, ctinfo, 0, 0, NULL, 0))
+			return NF_DROP;
+	}
+#endif
+
 	return NF_ACCEPT;
 }
 
@@ -281,8 +308,14 @@
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
 	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+#if defined(CONFIG_SYNO_COMCERTO)
+	struct nf_conn_help *help = nfct_help(ct);
+#endif
 	__be32 newip;
 	u_int16_t port;
+#if defined(CONFIG_SYNO_COMCERTO)
+	__be16 srcport;
+#endif
 	char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")];
 	unsigned buflen;
 
@@ -295,8 +328,14 @@
 	/* If the signalling port matches the connection's source port in the
 	 * original direction, try to use the destination port in the opposite
 	 * direction. */
+#if defined(CONFIG_SYNO_COMCERTO)
+	srcport = help->help.ct_sip_info.forced_dport ? :
+		  ct->tuplehash[dir].tuple.src.u.udp.port;
+	if (exp->tuple.dst.u.udp.port == srcport)
+#else
 	if (exp->tuple.dst.u.udp.port ==
 	    ct->tuplehash[dir].tuple.src.u.udp.port)
+#endif
 		port = ntohs(ct->tuplehash[!dir].tuple.dst.u.udp.port);
 	else
 		port = ntohs(exp->tuple.dst.u.udp.port);
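
The SIP NAT changes above pick the reply port with the GCC "a ?: b" extension: use the destination port that the conntrack SIP helper recorded in forced_dport (for phones that send from a high port but listen on 5060) and fall back to the port from the reverse-direction tuple otherwise. A small detached illustration of that selection, with plain integers instead of __be16:

#include <stdio.h>

/* "forced ?: tuple" is the GNU conditional with the middle operand
 * omitted: it yields forced when non-zero, otherwise tuple. */
static unsigned short pick_port(unsigned short forced, unsigned short tuple)
{
	return forced ?: tuple;
}

int main(void)
{
	printf("no override recorded: %u\n", pick_port(0, 5060));
	printf("Cisco-style phone:    %u\n", pick_port(5060, 49152));
	return 0;
}
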
diff -ur a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
--- a/net/ipv4/netfilter.c	2013-08-24 11:36:42.000000000 +0200
+++ b/net/ipv4/netfilter.c	2014-02-17 11:56:53.000000000 +0100
@@ -81,6 +81,16 @@
 
 	if (IPCB(skb)->flags & IPSKB_XFRM_TRANSFORMED)
 		return 0;
+
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	/* Mindspeed workaround: required to support 4o6 (IPv4-over-IPv6) IPsec offload */
+	if(skb->ipsec_offload)
+	{
+		if (IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED)
+			return 0;
+	}
+#endif
+
 	if (xfrm_decode_session(skb, &fl, AF_INET) < 0)
 		return -1;
 
diff -ur a/net/ipv4/route.c b/net/ipv4/route.c
--- a/net/ipv4/route.c	2013-08-24 11:36:42.000000000 +0200
+++ b/net/ipv4/route.c	2014-02-17 11:56:52.000000000 +0100
@@ -3098,7 +3098,11 @@
 	if (rt->rt_dst != rt->rt_gateway)
 		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
 
+#ifdef CONFIG_ARCH_COMCERTO
+	if (rtnetlink_put_metrics_2(skb, dst_metrics_ptr(&rt->dst), &rt->dst) < 0)
+#else
 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+#endif
 		goto nla_put_failure;
 
 	if (rt->rt_mark)
@@ -3584,8 +3588,14 @@
 	if (ip_rt_proc_init())
 		printk(KERN_ERR "Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
+#ifdef MY_DEF_HERE
+	mdelay(500);
+#endif
 	xfrm_init();
 	xfrm4_init(ip_rt_max_size);
+#ifdef MY_DEF_HERE
+	mdelay(500);
+#endif
 #endif
 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
 
diff -ur a/net/ipv4/tcp.c b/net/ipv4/tcp.c
--- a/net/ipv4/tcp.c	2013-08-24 11:36:42.000000000 +0200
+++ b/net/ipv4/tcp.c	2014-02-17 11:56:53.000000000 +0100
@@ -596,7 +596,15 @@
 
 	return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv);
 }
-
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_SPLICE_PROF)
+unsigned int splicer_time_counter[256];
+unsigned int splicer_reqtime_counter[256];
+unsigned int splicer_data_counter[256];
+unsigned int splicer_tcp_rsock_counter[64];
+static struct timeval last_splicer;
+unsigned int init_splicer_prof = 0;
+extern unsigned int enable_splice_prof;
+#endif
 /**
  *  tcp_splice_read - splice data from TCP socket to a pipe
  * @sock:	socket to splice from
@@ -622,7 +630,28 @@
 	long timeo;
 	ssize_t spliced;
 	int ret;
-
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_SPLICE_PROF)
+	struct timeval now;
+	int diff_time_ms;
+
+	if (enable_splice_prof) {
+		do_gettimeofday(&now);
+		if (init_splicer_prof) {
+			diff_time_ms = ((now.tv_sec - last_splicer.tv_sec) * 1000) + ((now.tv_usec - last_splicer.tv_usec) / 1000);
+			if (diff_time_ms < 1000) {
+				splicer_time_counter[diff_time_ms >> 3]++;
+			}
+			else {
+				splicer_time_counter[255]++;
+			}
+		}
+		if (len < (1 << 21))
+			splicer_data_counter[(len >> 13) & 0xFF]++;
+		else
+			splicer_data_counter[255]++;
+		last_splicer = now;
+	}
+#endif
 	sock_rps_record_flow(sk);
 	/*
 	 * We can't seek on a socket input
@@ -633,7 +662,18 @@
 	ret = spliced = 0;
 
 	lock_sock(sk);
-
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_SPLICE_PROF)
+	/* Need locked socket*/
+	if (enable_splice_prof) {
+		const struct tcp_sock *tp = tcp_sk(sk);
+		int rsock_qsize = tp->rcv_nxt - tp->copied_seq;
+
+		if (rsock_qsize < (4 * 1024 * 1024))
+			splicer_tcp_rsock_counter[(rsock_qsize >> 16) & 0x3F]++;
+		else
+			splicer_tcp_rsock_counter[63]++;
+	}
+#endif
 	timeo = sock_rcvtimeo(sk, sock->file->f_flags & O_NONBLOCK);
 	while (tss.len) {
 		ret = __tcp_splice_read(sk, &tss);
@@ -686,6 +726,23 @@
 
 	release_sock(sk);
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_SPLICE_PROF)
+	if (enable_splice_prof) {
+		do_gettimeofday(&now);
+
+		diff_time_ms = ((now.tv_sec - last_splicer.tv_sec) * 1000) + ((now.tv_usec - last_splicer.tv_usec) / 1000);
+		if (diff_time_ms < 1000) {//Don't record useless data
+			splicer_reqtime_counter[diff_time_ms >> 3]++;
+		}
+		else
+			splicer_reqtime_counter[255]++;
+
+		if(!init_splicer_prof)
+			init_splicer_prof = 1;
+
+		last_splicer = now;
+	}
+#endif
 	if (spliced)
 		return spliced;
 
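The CONFIG_COMCERTO_SPLICE_PROF block above keeps three histograms while profiling is enabled: the gap between tcp_splice_read() calls and the per-call duration land in 8 ms buckets (diff_time_ms >> 3, slot 255 for anything over a second), requested lengths land in 8 KiB buckets (len >> 13, slot 255 above 2 MiB), and the TCP receive-queue depth (rcv_nxt - copied_seq) lands in 64 KiB buckets. A detached sketch of the same bucketing, free of the kernel types:

#include <stdio.h>

/* Same bucketing as the splicer_*_counter arrays above: times in 8 ms
 * buckets, lengths in 8 KiB buckets, with the last slot as overflow. */
static unsigned int time_bucket(int diff_ms)
{
	return diff_ms < 1000 ? (unsigned int)diff_ms >> 3 : 255;
}

static unsigned int len_bucket(long len)
{
	return len < (1L << 21) ? (unsigned int)(len >> 13) & 0xFF : 255;
}

int main(void)
{
	printf("37 ms gap      -> bucket %u\n", time_bucket(37));	/* 4 */
	printf("64 KiB request -> bucket %u\n", len_bucket(64 * 1024));	/* 8 */
	return 0;
}
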
diff -ur a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
--- a/net/ipv6/addrconf.c	2013-08-24 11:36:43.000000000 +0200
+++ b/net/ipv6/addrconf.c	2014-02-17 11:56:55.000000000 +0100
@@ -80,7 +80,6 @@
 
 #if defined(MY_ABC_HERE) && defined(MY_DEF_HERE)
 #include <linux/synobios.h>
-extern char gszSynoHWVersion[];
 #endif
 
 #ifdef CONFIG_IPV6_PRIVACY
@@ -1122,9 +1121,15 @@
 	return ret;
 }
 
+#if defined(CONFIG_SYNO_COMCERTO)
+static int __ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev,
+		       const struct in6_addr *daddr, unsigned int prefs,
+		       struct in6_addr *saddr)
+#else
 int ipv6_dev_get_saddr(struct net *net, struct net_device *dst_dev,
 		       const struct in6_addr *daddr, unsigned int prefs,
 		       struct in6_addr *saddr)
+#endif
 {
 	struct ipv6_saddr_score scores[2],
 				*score = &scores[0], *hiscore = &scores[1];
@@ -1247,7 +1252,9 @@
 	in6_ifa_put(hiscore->ifa);
 	return 0;
 }
+#if !defined(CONFIG_SYNO_COMCERTO)
 EXPORT_SYMBOL(ipv6_dev_get_saddr);
+#endif
 
 int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
 		    unsigned char banned_flags)
@@ -1835,9 +1842,9 @@
 #if defined(MY_ABC_HERE) && defined(MY_DEF_HERE)
 void SYNO_IPV6_ready_timer_workaround(__u32 *valid_lft, __u32 *prefered_lft)
 {
-	if(!strncmp(gszSynoHWVersion, HW_DS110p, strlen(HW_DS110p)) ||
-		!strncmp(gszSynoHWVersion, HW_DS210p, strlen(HW_DS210p)) ||
-		!strncmp(gszSynoHWVersion, HW_DS410, strlen(HW_DS410))) {
+	if(syno_is_hw_version(HW_DS110p) ||
+	   syno_is_hw_version(HW_DS210p) ||
+	   syno_is_hw_version(HW_DS410)) {
 		*valid_lft -= (*valid_lft)/400;
 		*prefered_lft -= (*prefered_lft)/400;
 	}
@@ -4852,6 +4859,11 @@
 
 	ipv6_addr_label_rtnl_register();
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	BUG_ON(ipv6_dev_get_saddr_hook != NULL);
+	rcu_assign_pointer(ipv6_dev_get_saddr_hook, __ipv6_dev_get_saddr);
+#endif
+
 	return 0;
 errout:
 	rtnl_af_unregister(&inet6_ops);
@@ -4870,6 +4882,11 @@
 	struct net_device *dev;
 	int i;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	rcu_assign_pointer(ipv6_dev_get_saddr_hook, NULL);
+	synchronize_rcu();
+#endif
+
 	unregister_netdevice_notifier(&ipv6_dev_notf);
 	unregister_pernet_subsys(&addrconf_ops);
 	ipv6_addr_label_cleanup();
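
The addrconf changes above stop exporting ipv6_dev_get_saddr() directly: the implementation becomes static and is published through an ipv6_dev_get_saddr_hook pointer with rcu_assign_pointer() at init time, then cleared and followed by synchronize_rcu() on cleanup, presumably so the stub added in inet6_stubs.c can forward to it. A minimal model of that register/deref pattern using plain function pointers; the kernel version relies on the RCU accessors for safe publication, which this userspace sketch does not attempt to reproduce:

#include <stdio.h>
#include <stddef.h>

/* Hypothetical hook type standing in for ipv6_dev_get_saddr_hook. */
typedef int (*saddr_hook_t)(const char *dst, char *src_out);

static saddr_hook_t saddr_hook;		/* NULL until a provider registers */

static int provider_get_saddr(const char *dst, char *src_out)
{
	(void)dst;
	src_out[0] = '\0';		/* real code runs source-address selection */
	return 0;
}

static int call_get_saddr(const char *dst, char *src_out)
{
	saddr_hook_t fn = saddr_hook;	/* rcu_dereference() in the kernel */

	return fn ? fn(dst, src_out) : -1;
}

int main(void)
{
	char buf[64];

	printf("before register: %d\n", call_get_saddr("2001:db8::1", buf));
	saddr_hook = provider_get_saddr;	/* addrconf_init() side */
	printf("after register:  %d\n", call_get_saddr("2001:db8::1", buf));
	saddr_hook = NULL;			/* addrconf_cleanup() side */
	return 0;
}
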
Only in b/net/ipv6: ethipip6.c.
Only in b/net/ipv6: inet6_stubs.c.
diff -ur a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
--- a/net/ipv6/ip6_output.c	2013-08-24 11:36:43.000000000 +0200
+++ b/net/ipv6/ip6_output.c	2014-02-17 11:56:55.000000000 +0100
@@ -152,8 +152,15 @@
 
 static int ip6_finish_output(struct sk_buff *skb)
 {
+
+#if defined(CONFIG_INET6_IPSEC_OFFLOAD)
+	if ((skb->ipsec_offload == 0) &&
+		((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
+		dst_allfrag(skb_dst(skb))))
+#else
 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
-	    dst_allfrag(skb_dst(skb)))
+		dst_allfrag(skb_dst(skb)))
+#endif
 		return ip6_fragment(skb, ip6_finish_output2);
 	else
 		return ip6_finish_output2(skb);
diff -ur a/net/ipv6/Kconfig b/net/ipv6/Kconfig
--- a/net/ipv6/Kconfig	2013-08-03 09:59:52.000000000 +0200
+++ b/net/ipv6/Kconfig	2014-01-21 09:37:31.000000000 +0100
@@ -94,6 +94,12 @@
 
 	  If unsure, say Y.
 
+config INET6_IPSEC_OFFLOAD
+	bool "IPsec IPv6 Fast Path Processing offload"
+	depends on (INET6_ESP && COMCERTO_FP && SYNO_COMCERTO)
+	---help---
+	  Support for IPsec IPv6 Fast Path offload.
+
 config INET6_IPCOMP
 	tristate "IPv6: IPComp transformation"
 	select INET6_XFRM_TUNNEL
@@ -222,6 +228,15 @@
 
 	  If unsure, say N.
 
+config IPV6_ETHERIP
+	tristate "EtherIP over IPv6: EtherIP-in-IPv6 tunnel"
+	depends on IPV6 && SYNO_COMCERTO
+	select INET6_TUNNEL
+	---help---
+	  Support for EtherIP-in-IPv6 tunnels described in RFC 3378.
+
+	  If unsure, say N.
+
 config IPV6_MROUTE
 	bool "IPv6: multicast routing (EXPERIMENTAL)"
 	depends on IPV6 && EXPERIMENTAL
diff -ur a/net/ipv6/Makefile b/net/ipv6/Makefile
--- a/net/ipv6/Makefile	2013-08-03 09:59:52.000000000 +0200
+++ b/net/ipv6/Makefile	2014-01-21 09:37:31.000000000 +0100
@@ -23,7 +23,11 @@
 ipv6-objs += $(ipv6-y)
 
 obj-$(CONFIG_INET6_AH) += ah6.o
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+obj-$(CONFIG_INET6_IPSEC_OFFLOAD) += esp6.o
+else
 obj-$(CONFIG_INET6_ESP) += esp6.o
+endif
 obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
 obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
 obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
@@ -36,7 +40,13 @@
 
 obj-$(CONFIG_IPV6_SIT) += sit.o
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+obj-$(CONFIG_IPV6_ETHERIP) += ethipip6.o
+endif
 
 obj-y += addrconf_core.o exthdrs_core.o
 
 obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_hashtables.o
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+obj-$(subst m,y,$(CONFIG_IPV6)) += inet6_stubs.o
+endif
diff -ur a/net/ipv6/route.c b/net/ipv6/route.c
--- a/net/ipv6/route.c	2013-08-24 11:36:43.000000000 +0200
+++ b/net/ipv6/route.c	2014-02-17 11:56:55.000000000 +0100
@@ -2458,8 +2458,11 @@
 		ipv6_addr_copy(&saddr_buf, &rt->rt6i_prefsrc.addr);
 		NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
 	}
-
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_ARCH_COMCERTO)
+	if (rtnetlink_put_metrics_2(skb, dst_metrics_ptr(&rt->dst), &rt->dst) < 0)
+#else
 	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
+#endif
 		goto nla_put_failure;
 
 	rcu_read_lock();
@@ -2521,6 +2524,9 @@
 	struct rtmsg *rtm;
 	struct flowi6 fl6;
 	int err, iif = 0;
+#if defined(CONFIG_SYNO_COMCERTO)
+	int flags = 0;
+#endif
 
 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
 	if (err < 0)
@@ -2549,6 +2555,11 @@
 	if (tb[RTA_OIF])
 		fl6.flowi6_oif = nla_get_u32(tb[RTA_OIF]);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (tb[RTA_MARK])
+		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
+#endif
+
 	if (iif) {
 		struct net_device *dev;
 		dev = __dev_get_by_index(net, iif);
@@ -2556,6 +2567,16 @@
 			err = -ENODEV;
 			goto errout;
 		}
+
+#if defined(CONFIG_SYNO_COMCERTO)
+		fl6.flowi6_iif = iif;
+
+		if (!ipv6_addr_any(&fl6.saddr))
+			flags |= RT6_LOOKUP_F_HAS_SADDR;
+
+		if (rt6_need_strict(&fl6.daddr) && dev->type != ARPHRD_PIMREG)
+			flags |= RT6_LOOKUP_F_IFACE;
+#endif
 	}
 
 	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
@@ -2570,6 +2591,11 @@
 	skb_reset_mac_header(skb);
 	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (iif)
+		rt = (struct rt6_info*) fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_input);
+	else
+#endif
 	rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl6);
 	skb_dst_set(skb, &rt->dst);
 
diff -ur a/net/key/af_key.c b/net/key/af_key.c
--- a/net/key/af_key.c	2013-08-24 11:36:42.000000000 +0200
+++ b/net/key/af_key.c	2014-02-17 11:56:54.000000000 +0100
@@ -30,8 +30,188 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <net/xfrm.h>
+#if defined(CONFIG_SYNO_COMCERTO)
+#include <net/netlink.h>
+#endif
 
 #include <net/sock.h>
+#if defined(CONFIG_SYNO_COMCERTO)
+#include <net/ip6_route.h>
+
+#if defined(CONFIG_INET_IPSEC_OFFLOAD)|| defined(CONFIG_INET6_IPSEC_OFFLOAD)
+#define NLKEY_SUPPORT 1
+#else 
+#undef NLKEY_SUPPORT
+#endif 
+
+#ifdef NLKEY_SUPPORT
+#include <net/dsfield.h>
+#include <net/inet_ecn.h>
+#include <net/ipv6.h>
+
+
+extern struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos,
+						xfrm_address_t *prev_saddr,
+						xfrm_address_t *prev_daddr,
+						int family);
+extern int xfrm_get_tos(struct flowi *fl, int family);
+extern int ipsec_nlkey_flow(u16 xfrm_nr, u16 *xfrm_handle, 
+		const struct flowi *fl, u16 family, u16 dir);
+
+
+#define	NLKEY_SA_CREATE		0x0A01
+#define NLKEY_SA_DELETE		0x0A02
+#define NLKEY_SA_FLUSH 		0x0A03
+#define NLKEY_SA_SET_KEYS	0x0A04
+#define NLKEY_SA_SET_TUNNEL	0x0A05
+#define NLKEY_SA_SET_NATT	0x0A06
+#define	NLKEY_SA_SET_STATE	0x0A07
+#define	NLKEY_SA_SET_LIFETIME	0x0A08
+#define	NLKEY_SA_NOTIFY		0x0A09
+#define NLKEY_SA_INFO_UPDATE	0x0A0C
+#define	NLKEY_FLOW_ADD		0x0A11
+#define NLKEY_FLOW_REMOVE	0x0A12
+#define NLKEY_FLOW_NOTIFY	0x0A13
+#define NLKEY_NULL_MSG		0x0000
+
+#define NLKEY_HDR_LEN		4
+#define NLKEY_MSG_LEN 		256
+
+#define NLKEY_MAX_NUM_KEYS	2
+#define NLKEY_MAX_KEY_LEN	(256 / 8)
+
+struct nlkey_msg {
+	/* message data */
+	unsigned short fcode;
+	unsigned short length;
+	unsigned short payload[(NLKEY_MSG_LEN /sizeof(unsigned short))];
+};
+/* sizeof(nlkey_msg) = 4 + 256 */
+
+struct nlkey_sa_id {
+	unsigned int spi;
+	unsigned char sa_type;
+	unsigned char proto_family;
+	unsigned char replay_window;
+#define NLKEY_SAFLAGS_ESN	0x1
+	unsigned char flags;
+	unsigned int dst_ip[4];
+	unsigned int src_ip[4];
+	unsigned short mtu;
+	unsigned short dev_mtu;
+
+};
+/* sizeof(nlkey_sa_id) = 44 */
+
+struct nlkey_sa_create {
+	unsigned short sagd;
+	unsigned short rsvd;
+	struct nlkey_sa_id said;
+};
+/* sizeof(nlkey_sa_create) = 48 */
+
+struct nlkey_sa_delete {
+	unsigned short sagd;
+	unsigned short rsvd;
+};
+/* sizeof(nlkey_sa_delete) = 4 */
+
+struct nlkey_sa_set_tunnel {
+	unsigned short sagd;
+	unsigned char rsvd;
+	unsigned char proto_family;
+	union {
+		struct iphdr 	 ipv4h;
+		struct ipv6hdr ipv6h;
+	} h;
+};
+/* sizeof(nlkey_sa_set_tunnel) = 36 */
+
+struct nlkey_sa_set_natt {
+	unsigned short sagd;
+	unsigned short sport;
+	unsigned short dport;
+	unsigned short rsvd;
+};
+/* sizeof(nlkey_sa_set_natt) = 8 */
+
+struct nlkey_sa_set_state {
+	unsigned short sagd;
+	unsigned short rsvd;
+	unsigned short state;
+	unsigned short rsvd2;
+};
+/* sizeof(nlkey_sa_set_state) = 8 */
+
+struct nlkey_key_desc {
+	unsigned short key_bits;
+	unsigned char key_alg;
+	unsigned char  key_type;
+	unsigned char key[NLKEY_MAX_KEY_LEN]; 
+};
+/* sizeof(nlkey_key_desc) =  36 */
+
+struct nlkey_sa_set_keys {
+	unsigned short sagd;
+	unsigned short rsvd;	
+	unsigned short num_keys;
+	unsigned short rsvd2;
+	struct nlkey_key_desc keys[NLKEY_MAX_NUM_KEYS];
+};
+/* sizeof(nlkey_sa_set_keys) =  80 */
+
+struct nlkey_lifetime_desc {
+	unsigned int allocations;
+	unsigned int bytes[2];
+};
+/* sizeof(nlkey_lifetime_desc) = 12 */
+
+struct nlkey_sa_set_lifetime {
+	unsigned short sagd;
+	unsigned short rsvd;
+	struct nlkey_lifetime_desc hard_time;
+	struct nlkey_lifetime_desc soft_time;
+	struct nlkey_lifetime_desc current_time;
+};
+/* sizeof(nlkey_sa_set_lifetime) =  40 */
+
+/* SA notifications */
+#define IPSEC_SOFT_EXPIRE 0
+#define IPSEC_HARD_EXPIRE 1
+
+struct nlkey_sa_notify {
+	unsigned short sagd;
+	unsigned short rsvd;
+	unsigned int  action;
+};
+/* sizeof(nlkey_sa_notify) = 8 */
+
+/* SA Info update */
+
+struct nlkey_sa_info {
+        unsigned short sagd;
+        unsigned short rsvd;
+        unsigned long long bytes;
+        unsigned long long packets;
+};
+/* sizeof(nlkey_sa_info) =  */
+
+
+static int ipsec_nlkey_send(struct net *net, struct xfrm_state *x, const struct km_event *c);
+static void ipsec_nlkey_rcv(struct sk_buff *skb);
+static void ipsec_nlkey_init(void);
+static unsigned short ipsec_sacode_to_nlkeycode(unsigned short sa_code);
+static struct sk_buff * ipsec_xfrm2nlkey (struct net *net, struct xfrm_state *x, 
+					const struct km_event *c, unsigned short *msg_id);
+static int ipsec_nlkey_set_said(struct net *net, struct xfrm_state *x, const struct km_event *c, struct nlkey_sa_id *said);
+
+/* netlink NETLINK_KEY socket */
+struct sock *nlkey_socket = NULL;
+
+#endif
+/************************************************************************************/
+#endif
+
 
 #define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x))
 #define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x))
@@ -837,6 +1017,10 @@
 		sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP;
 	if (x->props.flags & XFRM_STATE_NOPMTUDISC)
 		sa->sadb_sa_flags |= SADB_SAFLAGS_NOPMTUDISC;
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (x->props.flags & XFRM_STATE_ESN)
+		sa->sadb_sa_flags |= SADB_SAFLAGS_ESN;
+#endif
 
 	/* hard time */
 	if (hsc & 2) {
@@ -1104,6 +1288,10 @@
 		x->props.flags |= XFRM_STATE_DECAP_DSCP;
 	if (sa->sadb_sa_flags & SADB_SAFLAGS_NOPMTUDISC)
 		x->props.flags |= XFRM_STATE_NOPMTUDISC;
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (sa->sadb_sa_flags & SADB_SAFLAGS_ESN)
+		x->props.flags |= XFRM_STATE_ESN;
+#endif
 
 	lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD - 1];
 	if (lifetime != NULL) {
@@ -1430,7 +1618,11 @@
 }
 
 /* ADD/UPD/DEL */
+#if defined(CONFIG_SYNO_COMCERTO) && defined(NLKEY_SUPPORT)
+static int key_notify_sa(struct net *net, struct xfrm_state *x, const struct km_event *c)
+#else
 static int key_notify_sa(struct xfrm_state *x, const struct km_event *c)
+#endif
 {
 	struct sk_buff *skb;
 	struct sadb_msg *hdr;
@@ -1451,6 +1643,10 @@
 
 	pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x));
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(NLKEY_SUPPORT)
+	/* now send the message to user space through the NETLINK_KEY socket as well */
+	ipsec_nlkey_send(net, x, c);
+#endif
 	return 0;
 }
 
@@ -1689,7 +1885,11 @@
 	return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk));
 }
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(NLKEY_SUPPORT)
+static int key_notify_sa_flush(struct net *net, const struct km_event *c)
+#else
 static int key_notify_sa_flush(const struct km_event *c)
+#endif
 {
 	struct sk_buff *skb;
 	struct sadb_msg *hdr;
@@ -1708,6 +1908,11 @@
 
 	pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net);
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(NLKEY_SUPPORT)
+	/* now send the message to user space through the NETLINK_KEY socket as well */
+	ipsec_nlkey_send(net, NULL, c);
+#endif
+
 	return 0;
 }
 
@@ -2932,7 +3137,11 @@
 	return 0;
 }
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(NLKEY_SUPPORT)
+static int key_notify_sa_expire(struct net *net, struct xfrm_state *x, const struct km_event *c)
+#else
 static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c)
+#endif
 {
 	struct sk_buff *out_skb;
 	struct sadb_msg *out_hdr;
@@ -2959,6 +3168,11 @@
 	out_hdr->sadb_msg_pid = 0;
 
 	pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, xs_net(x));
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(NLKEY_SUPPORT)
+	/* now send the message to user space through the NETLINK_KEY socket as well */
+	ipsec_nlkey_send(net, x, c);
+#endif
 	return 0;
 }
 
@@ -2972,13 +3186,25 @@
 
 	switch (c->event) {
 	case XFRM_MSG_EXPIRE:
+#if defined(CONFIG_SYNO_COMCERTO) && defined(NLKEY_SUPPORT)
+		return key_notify_sa_expire(net, x, c);
+#else
 		return key_notify_sa_expire(x, c);
+#endif
 	case XFRM_MSG_DELSA:
 	case XFRM_MSG_NEWSA:
 	case XFRM_MSG_UPDSA:
+#if defined(CONFIG_SYNO_COMCERTO) && defined(NLKEY_SUPPORT)
+		return key_notify_sa(net, x, c);
+#else
 		return key_notify_sa(x, c);
+#endif
 	case XFRM_MSG_FLUSHSA:
+#if defined(CONFIG_SYNO_COMCERTO) && defined(NLKEY_SUPPORT)
+		return key_notify_sa_flush(net, c);
+#else
 		return key_notify_sa_flush(c);
+#endif
 	case XFRM_MSG_NEWAE: /* not yet supported */
 		break;
 	default:
@@ -3748,6 +3974,539 @@
 	.migrate	= pfkey_send_migrate,
 };
 
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(NLKEY_SUPPORT)
+extern struct xfrm_state *xfrm_state_lookup_byhandle(struct net *net, u16 handle);
+
+static unsigned short ipsec_sacode_to_nlkeycode(unsigned short sa_code)
+{
+	unsigned nlkey_code;
+
+	switch (sa_code) 
+	{
+		case XFRM_MSG_DELSA:
+			nlkey_code = NLKEY_SA_DELETE;
+			break;
+		case XFRM_MSG_NEWSA:
+		case XFRM_MSG_UPDSA:
+			nlkey_code = NLKEY_SA_CREATE;
+			break;
+		case XFRM_MSG_FLUSHSA:
+			nlkey_code = NLKEY_SA_FLUSH;
+			break;
+		case XFRM_MSG_EXPIRE:
+			nlkey_code = NLKEY_SA_SET_STATE;
+			break;
+		default:
+			nlkey_code = NLKEY_NULL_MSG;
+			break;
+	}
+
+	return nlkey_code;
+}
+
+static void ipsec_nlkey_rcv(struct sk_buff *skb)
+{
+	struct nlmsghdr *nlh = NULL;
+	struct nlkey_msg *msg = NULL;
+	struct flowi flow;
+	unsigned short *p;
+	unsigned short family, dir;
+	struct xfrm_state *x;
+	struct nlkey_sa_notify sa_notify_msg;
+	struct nlkey_sa_info sa_info_msg;
+
+	/* extract message from skb */
+	nlh = (struct nlmsghdr *)skb->data;
+
+	msg = (struct nlkey_msg *)NLMSG_DATA(nlh);
+
+	//printk(KERN_INFO "ipsec_nlkey_rcv fcode: 0x%x length: %d bytes\n",msg->fcode,msg->length);
+
+	/* process command received from user space */
+	switch(msg->fcode)
+	{
+		case NLKEY_FLOW_REMOVE:
+			//printk(KERN_INFO "ipsec_nlkey_rcv NLKEY_FLOW_REMOVE\n");
+			p = msg->payload;
+			memcpy(&flow, p, sizeof(struct flowi)); p += sizeof(struct flowi)/2;
+			family = *p; p++;
+			dir = *p; p++;
+			flow_cache_remove(&flow, family, dir);
+			break;
+
+		case NLKEY_SA_NOTIFY:
+			//printk(KERN_INFO "ipsec_nlkey_rcv NLKEY_SA_NOTIFY\n");
+			memcpy(&sa_notify_msg, msg->payload, sizeof(struct nlkey_sa_notify));
+			x = xfrm_state_lookup_byhandle(&init_net, sa_notify_msg.sagd);
+			if (x) {
+				spin_lock(&x->lock);
+
+				if (sa_notify_msg.action) { 
+					// hard expired
+					x->km.state = XFRM_STATE_EXPIRED;
+					tasklet_hrtimer_start(&x->mtimer, ktime_set(0,0), HRTIMER_MODE_REL);
+				}
+				else if (!x->km.dying) {
+					 x->km.dying = 1;
+					 km_state_expired(x, 0, 0);
+				}
+
+				spin_unlock(&x->lock);
+				xfrm_state_put(x);
+			}
+			break;
+
+		case NLKEY_SA_INFO_UPDATE:
+			memcpy(&sa_info_msg, msg->payload, sizeof(struct nlkey_sa_info));
+
+			x = xfrm_state_lookup_byhandle(&init_net,sa_info_msg.sagd);
+			if (x) {
+				spin_lock(&x->lock);
+
+				x->curlft.bytes = sa_info_msg.bytes;
+				x->curlft.packets = sa_info_msg.packets;
+
+				spin_unlock(&x->lock);
+				xfrm_state_put(x);
+			}
+			break;
+		default:
+			//printk(KERN_INFO "ipsec_nlkey_rcv fcode 0x%x not supported\n", msg->fcode);
+			break;
+	}
+
+}
+
+extern struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos,
+						  xfrm_address_t *saddr,
+						  xfrm_address_t *daddr,
+						  int family);
+static int ipsec_nlkey_set_said(struct net *net, struct xfrm_state *x, 
+				const struct km_event *c, struct nlkey_sa_id *said)
+{
+
+	struct flowi fl;
+	int tos;
+	xfrm_address_t saddr, daddr;
+	struct dst_entry *dst;
+	int rc = 0;
+
+	memset(&fl, 0, sizeof(struct flowi));
+
+	/* SPI */
+	said->spi = x->id.spi;
+	/* SA Type (AH or ESP) */
+	said->sa_type = x->id.proto;
+	/* Protocol Family (IPv4 or IPv6) */
+	said->proto_family = x->props.family;
+	/* Replay window */
+	said->replay_window = x->props.replay_window;
+	/* Destination IP Address */
+	if(x->props.family == AF_INET6) {
+		memcpy(&said->dst_ip, x->id.daddr.a6, sizeof(struct in6_addr));
+		ipv6_addr_copy(&fl.u.ip6.daddr, (struct in6_addr *)x->id.daddr.a6);
+		memcpy(&said->src_ip, x->props.saddr.a6, sizeof(struct in6_addr));
+	}
+	else {
+		said->dst_ip[0] = x->id.daddr.a4;
+		fl.u.ip4.daddr = x->id.daddr.a4;
+		said->src_ip[0] = x->props.saddr.a4;
+	}
+	said->mtu = 0;
+
+	if(x->props.flags & XFRM_STATE_ESN)
+		said->flags = NLKEY_SAFLAGS_ESN;
+	xfrm_flowi_addr_get(&fl, &saddr, &daddr, x->props.family);
+
+	tos = xfrm_get_tos(&fl, x->props.family);
+	if (tos < 0) {
+		printk(KERN_ERR "%s:%d: FIXME\n",__FUNCTION__,__LINE__);	
+		rc = -1;
+		goto error;
+	}
+	
+	dst = __xfrm_dst_lookup(net, tos, NULL, &daddr, x->props.family);
+	if (IS_ERR(dst)) {
+		printk(KERN_ERR "%s:%d: FIXME\n",__FUNCTION__,__LINE__);
+		rc = -1;
+		goto error;
+	}
+	said->dev_mtu = dst_mtu(dst);
+	said->mtu = xfrm_state_mtu(x,dst_mtu(dst));	
+
+	dst_release(dst);
+error:
+	return rc;
+}
+
+static struct sk_buff * ipsec_xfrm2nlkey (struct net *net, struct xfrm_state *x, 
+					const struct km_event *c, unsigned short *msg_id)
+{
+	struct nlkey_sa_id sa_id_msg;
+	struct nlkey_sa_create sa_create_msg;
+	struct nlkey_sa_delete sa_delete_msg;
+	struct nlkey_sa_set_keys sa_set_keys_msg;
+	struct nlkey_sa_set_tunnel sa_set_tunnel_msg;
+	struct nlkey_sa_set_natt sa_set_natt_msg;
+	struct nlkey_sa_set_state sa_set_state_msg;
+	struct nlkey_sa_set_lifetime sa_set_lifetime_msg;
+	struct nlkey_msg msg;
+	struct sk_buff *skb = NULL;
+	struct nlmsghdr *nlh = NULL;
+	gfp_t allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
+	unsigned char tunnel, keys, natt, state, lifetime;
+
+	/* supported SA information */
+	keys = 1; state = 1; tunnel = 1; lifetime = 1; natt = 1; 
+
+	/* next message to build */
+	memset(&msg, 0, sizeof(struct nlkey_msg));
+	msg.fcode = *msg_id;
+	
+	//printk(KERN_INFO "\n\nipsec_xfrm2nlkey: processing event 0x%x\n", msg.fcode);
+
+	switch (msg.fcode)
+	{
+		case NLKEY_SA_CREATE:
+			//printk(KERN_INFO "ipsec_xfrm2nlkey: NLKEY_SA_CREATE\n");
+			if(x) {
+				/* some checks before building the message */
+				if((x->id.proto != IPPROTO_ESP) && (x->id.proto != IPPROTO_AH)) {
+					printk(KERN_ERR "ipsec_xfrm2nlkey: protocol %d not supported\n", x->id.proto);
+					*msg_id = NLKEY_NULL_MSG;
+					goto exit;
+				}	
+				memset(&sa_create_msg, 0, sizeof(struct nlkey_sa_create));	
+
+				/* SA global handler */
+				sa_create_msg.sagd = x->handle;
+
+				/* SA identifier */
+				if(ipsec_nlkey_set_said(net, x, c, &sa_create_msg.said) < 0)
+				{
+					printk(KERN_ERR "%s: set sa ID failed\n", __func__);
+					*msg_id = NLKEY_NULL_MSG; /* next message */
+					goto exit;
+				}
+				memcpy(msg.payload, &sa_create_msg, sizeof(struct nlkey_sa_create));
+				msg.length = sizeof(struct nlkey_sa_create);
+				*msg_id = NLKEY_SA_SET_KEYS; /* next message */
+			} else {
+				*msg_id = NLKEY_NULL_MSG; /* next message */
+				goto exit;
+			}
+			
+			break;
+
+		case NLKEY_SA_SET_KEYS:
+			//printk(KERN_INFO "ipsec_xfrm2nlkey: NLKEY_SA_SET_KEYS\n");
+			if(keys) {
+				memset(&sa_set_keys_msg, 0, sizeof(struct nlkey_sa_set_keys));
+
+				/* SA global handler */
+				sa_set_keys_msg.sagd = x->handle; 
+				
+				/* auth key */
+				if(x->aalg) {
+					if (x->aalg->alg_key_len) {
+						sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key_bits = x->aalg->alg_key_len;
+						sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key_alg = x->props.aalgo;
+						sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key_type = 0;
+						memcpy(sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key, x->aalg->alg_key,(sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key_bits / 8));
+						//printk(KERN_INFO "ipsec_xfrm2nlkey: AUTH - algo %d key %d bits\n", sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key_alg, sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key_bits);
+						sa_set_keys_msg.num_keys++;
+					}
+				}
+				/* encrypt key */
+				if(x->ealg) {
+					if (x->ealg->alg_key_len) {
+
+						sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key_bits = x->ealg->alg_key_len;
+						sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key_alg = x->props.ealgo;
+						sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key_type = 1;
+						memcpy(sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key, x->ealg->alg_key,(sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key_bits / 8));
+						//printk(KERN_INFO "ipsec_xfrm2nlkey: ENCRYPT - algo %d key %d bits\n", sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key_alg, sa_set_keys_msg.keys[sa_set_keys_msg.num_keys].key_bits);
+						sa_set_keys_msg.num_keys++;
+					}
+				}
+				memcpy(msg.payload, &sa_set_keys_msg, sizeof(struct nlkey_sa_set_keys));
+				msg.length = sizeof(struct nlkey_sa_set_keys);
+				*msg_id = NLKEY_SA_SET_TUNNEL; /* next message */
+			} else {
+				*msg_id = NLKEY_SA_SET_TUNNEL; /* next message */
+				goto exit;
+			}
+			break;
+
+		case NLKEY_SA_SET_TUNNEL:
+			//printk(KERN_INFO "ipsec_xfrm2nlkey: NLKEY_SA_SET_TUNNEL\n");
+			if(tunnel && (x->props.mode == XFRM_MODE_TUNNEL)) {
+				memset(&sa_set_tunnel_msg, 0, sizeof(struct nlkey_sa_set_tunnel));
+
+				/* SA global handler */
+				sa_set_tunnel_msg.sagd = x->handle; 
+
+				/* Tunnel */
+				sa_set_tunnel_msg.proto_family = x->props.family;
+				if(x->props.family == AF_INET6) {
+					struct ipv6hdr *top_iph = &sa_set_tunnel_msg.h.ipv6h;
+					int dsfield;
+					top_iph->version = 6;
+					top_iph->priority = 0;
+					top_iph->flow_lbl[0] = 0;
+					top_iph->flow_lbl[1] = 0;
+					top_iph->flow_lbl[2] = 0;
+					top_iph->nexthdr = IPPROTO_IPIP;	
+					dsfield = ipv6_get_dsfield(top_iph);
+					dsfield = INET_ECN_encapsulate(dsfield, dsfield);
+					if (x->props.flags & XFRM_STATE_NOECN)
+						dsfield &= ~INET_ECN_MASK;
+					ipv6_change_dsfield(top_iph, 0, dsfield);
+					top_iph->hop_limit = 64;
+					memcpy(&top_iph->daddr, x->id.daddr.a6, sizeof(struct in6_addr));
+					memcpy(&top_iph->saddr, x->props.saddr.a6, sizeof(struct in6_addr));
+					//printk(KERN_INFO "ipsec_xfrm2nlkey: IPv6 tunnel\n");
+					//printk(KERN_INFO "dst: %x %x %x %x\n", x->id.daddr.a6[0], x->id.daddr.a6[1], x->id.daddr.a6[2], x->id.daddr.a6[3]);
+					//printk(KERN_INFO "src: %x %x %x %x\n", x->props.saddr.a6[0], x->props.saddr.a6[1], x->props.saddr.a6[2], x->props.saddr.a6[3]);
+				}
+				else {
+					struct iphdr *top_iph = &sa_set_tunnel_msg.h.ipv4h;
+					top_iph->ihl = 5;
+					top_iph->version = 4;
+					top_iph->tos = 0;
+					top_iph->frag_off = 0; 
+					top_iph->ttl = 64;
+					top_iph->saddr = x->props.saddr.a4;
+					top_iph->daddr = x->id.daddr.a4;
+					//printk(KERN_INFO "ipsec_xfrm2nlkey: IPv4 tunnel dst:%x - src:%x \n", x->id.daddr.a4, x->props.saddr.a4);
+				}
+				memcpy(msg.payload, &sa_set_tunnel_msg, sizeof(struct nlkey_sa_set_tunnel));
+				msg.length = sizeof(struct nlkey_sa_set_tunnel);
+				*msg_id = NLKEY_SA_SET_NATT; /* next message */
+			} else {
+				*msg_id = NLKEY_SA_SET_NATT; /* next message */
+				goto exit;	
+			} 
+			break;
+
+		case NLKEY_SA_SET_NATT:
+			//printk(KERN_INFO "ipsec_xfrm2nlkey: NLKEY_SA_SET_NATT\n");
+			if((natt) && (x->encap)){
+				memset(&sa_set_natt_msg, 0, sizeof(struct nlkey_sa_set_natt));
+
+				/* SA global handler */
+				sa_set_natt_msg.sagd = x->handle; 
+				sa_set_natt_msg.sport = x->encap->encap_sport;
+				sa_set_natt_msg.dport = x->encap->encap_dport;
+				//printk(KERN_INFO "src port: %d  dst port: %d \n", ntohs(sa_set_natt_msg.sport), ntohs( sa_set_natt_msg.dport));
+				memcpy(msg.payload, &sa_set_natt_msg, sizeof(struct nlkey_sa_set_natt));
+				msg.length = sizeof(struct nlkey_sa_set_natt);
+				*msg_id = NLKEY_SA_SET_LIFETIME; /* next message */
+			} else {
+				*msg_id = NLKEY_SA_SET_LIFETIME; /* next message */
+				goto exit;	
+			}
+			break;
+
+		case NLKEY_SA_SET_LIFETIME:
+			//printk(KERN_INFO "ipsec_xfrm2nlkey: NLKEY_SA_SET_LIFETIME\n");
+			if(lifetime) {
+				memset(&sa_set_lifetime_msg, 0, sizeof(struct nlkey_sa_set_lifetime));
+
+				/* SA global handler */
+				sa_set_lifetime_msg.sagd = x->handle;
+
+				/* hard time */
+				sa_set_lifetime_msg.hard_time.allocations =  _X2KEY(x->lft.hard_packet_limit);
+				if(_X2KEY(x->lft.hard_byte_limit))
+					memcpy(sa_set_lifetime_msg.hard_time.bytes, &x->lft.hard_byte_limit, sizeof(uint64_t));
+
+				/* soft time */
+				sa_set_lifetime_msg.soft_time.allocations =  _X2KEY(x->lft.soft_packet_limit);
+				if(_X2KEY(x->lft.soft_byte_limit))
+					memcpy(sa_set_lifetime_msg.soft_time.bytes, &x->lft.soft_byte_limit, sizeof(uint64_t));
+
+				/* current time */
+				sa_set_lifetime_msg.current_time.allocations = x->curlft.packets;
+				memcpy(sa_set_lifetime_msg.current_time.bytes, &x->curlft.bytes, sizeof(uint64_t));
+
+				memcpy(msg.payload, &sa_set_lifetime_msg, sizeof(struct nlkey_sa_set_lifetime));
+				msg.length = sizeof(struct nlkey_sa_set_lifetime);
+				*msg_id = NLKEY_SA_SET_STATE; /* next message */
+			} else {
+				*msg_id = NLKEY_SA_SET_STATE; /* next message */
+				goto exit;	
+			}
+			break;
+
+		case NLKEY_SA_SET_STATE:
+			//printk(KERN_INFO "ipsec_xfrm2nlkey: NLKEY_SET_STATE\n");
+			if(state) {
+				memset(&sa_set_state_msg, 0, sizeof(struct nlkey_sa_set_state));
+				memset(&sa_id_msg, 0, sizeof(struct nlkey_sa_id));
+
+				/* SA global handler */
+				sa_set_state_msg.sagd = x->handle; 
+				/* State */
+				sa_set_state_msg.state = x->km.state;
+				// TODO: set the offloaded state once ack received !
+				x->offloaded = 1;
+				atomic_inc(&flow_cache_genid);
+
+				memcpy(msg.payload, &sa_set_state_msg, sizeof(struct nlkey_sa_set_state));
+				msg.length = sizeof(struct nlkey_sa_set_state);
+				*msg_id = NLKEY_NULL_MSG; /* next message */
+			} else {
+				*msg_id = NLKEY_NULL_MSG; /* next message */
+				goto exit;
+			}
+			break;
+		
+		case NLKEY_SA_DELETE:
+			//printk(KERN_INFO "ipsec_xfrm2nlkey: NLKEY_SA_DELETE\n");
+			memset(&sa_delete_msg, 0, sizeof(struct nlkey_sa_delete));
+			
+			/* SA global handler */
+			sa_delete_msg.sagd = x->handle;
+			memcpy(msg.payload, &sa_delete_msg, sizeof(struct nlkey_sa_delete));
+			msg.length = sizeof(struct nlkey_sa_delete);
+			atomic_inc(&flow_cache_genid);
+
+
+			*msg_id = NLKEY_NULL_MSG; /* next message */
+			break;
+
+		case NLKEY_SA_FLUSH:
+			//printk(KERN_INFO "ipsec_xfrm2nlkey: NLKEY_SA_FLUSH\n");
+			/* No data required for flush SA command */
+			atomic_inc(&flow_cache_genid);
+
+			*msg_id = NLKEY_NULL_MSG; /* next message */
+			break;
+
+		default:
+			printk(KERN_ERR "ipsec_xfrm2nlkey: event 0x%x not supported\n", c->event);
+			*msg_id = NLKEY_NULL_MSG; /* next message */
+			break;
+	}
+
+	/* prepare netlink message for kernel to user space direction */
+	if(msg.length > NLKEY_MSG_LEN)
+	{
+		printk(KERN_ERR "ipsec_xfrm2nlkey: maximum message size reached (%d bytes)\n", msg.length);
+		goto exit;
+	}
+
+	skb = alloc_skb(NLMSG_SPACE(NLKEY_MSG_LEN + NLKEY_HDR_LEN), allocation);
+	if (skb == NULL)
+		goto exit;
+		
+	nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_SPACE(NLKEY_HDR_LEN + msg.length));
+	memcpy(NLMSG_DATA(nlh), (unsigned char *)&msg, (NLKEY_HDR_LEN + msg.length));
+	
+	/* whole length of the message i.e. header + payload */
+	nlh->nlmsg_len = NLMSG_SPACE(NLKEY_HDR_LEN + msg.length);
+
+	/* from kernel */
+	nlh->nlmsg_pid = 0;
+	nlh->nlmsg_flags = 0;
+	nlh->nlmsg_type = 0;
+	NETLINK_CB(skb).pid = 0;
+	NETLINK_CB(skb).dst_group = 1;
+exit:
+	return skb;
+}
+
+static int ipsec_nlkey_send(struct net *net, struct xfrm_state *x, const struct km_event *c)
+{
+	struct sk_buff *skb;
+	unsigned short msg_type;
+	int rc = 0;
+
+	/* We may generate more than one message when adding new SA (sa_create + sa_set_state + sa_set_tunnel...) */
+	msg_type = ipsec_sacode_to_nlkeycode((unsigned short)c->event);
+
+	while(msg_type != NLKEY_NULL_MSG)
+	{
+		/* build nlkey message */
+		skb = ipsec_xfrm2nlkey(net, x, c, &msg_type);
+
+		if(skb != NULL)
+			if((rc = netlink_broadcast(nlkey_socket, skb, 0, 1, GFP_ATOMIC)) < 0)
+				return rc;
+	}
+
+	return rc;
+}
+
+
+int ipsec_nlkey_flow(u16 xfrm_nr, u16 *xfrm_handle, const struct flowi *fl, u16 family, u16 dir)
+{
+	struct sk_buff *skb;
+	struct nlkey_msg msg;
+	struct nlmsghdr *nlh = NULL;
+	unsigned short *p;
+	gfp_t allocation = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
+
+	//printk(KERN_INFO "ipsec_nlkey_flow \n");
+
+	/* next message to build */
+	memset(&msg, 0, sizeof(struct nlkey_msg));
+	msg.fcode = NLKEY_FLOW_ADD;
+
+	// Number of SA for this flow
+	p = msg.payload;
+	*p++ = xfrm_nr;
+	msg.length += sizeof(unsigned short);
+	// SA handles list
+	memcpy(p, xfrm_handle, xfrm_nr*sizeof(unsigned short));
+	msg.length += xfrm_nr*sizeof(unsigned short);
+	p+=xfrm_nr;
+	// flow family
+	*p++ = family;
+	msg.length += sizeof(unsigned short);
+	// flow direction
+	*p++ = dir;
+	msg.length += sizeof(unsigned short);
+	// flow descriptor
+	memcpy(p, fl, sizeof(struct flowi));
+	msg.length +=sizeof(struct flowi);
+	p+=sizeof(struct flowi) / sizeof(u16);
+
+	skb = alloc_skb(NLMSG_SPACE(NLKEY_MSG_LEN + NLKEY_HDR_LEN), allocation);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	/* prepare netlink message for kernel to user space direction */
+	nlh = (struct nlmsghdr *)skb_put(skb, NLMSG_SPACE(NLKEY_HDR_LEN + msg.length));
+	memcpy(NLMSG_DATA(nlh), (unsigned char *)&msg, (NLKEY_HDR_LEN + msg.length));
+
+	/* whole length of the message i.e. header + payload */
+	nlh->nlmsg_len = NLMSG_SPACE(NLKEY_HDR_LEN + msg.length);
+
+	/* from kernel */
+	nlh->nlmsg_pid = 0; 
+	nlh->nlmsg_flags = 0;
+	nlh->nlmsg_type = 0;
+	NETLINK_CB(skb).pid = 0;
+	NETLINK_CB(skb).dst_group = 1;
+
+	return(netlink_broadcast(nlkey_socket, skb, 0, 1, allocation));
+}
+EXPORT_SYMBOL(ipsec_nlkey_flow);
+
+
+static void ipsec_nlkey_init(void)
+{
+	printk(KERN_INFO "Initializing NETLINK_KEY socket\n");
+
+	nlkey_socket = netlink_kernel_create(&init_net, NETLINK_KEY, 1,
+				     ipsec_nlkey_rcv, NULL, THIS_MODULE);
+}
+#endif
+
+
 static int __net_init pfkey_net_init(struct net *net)
 {
 	struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id);
@@ -3782,6 +4541,11 @@
 	sock_unregister(PF_KEY);
 	unregister_pernet_subsys(&pfkey_net_ops);
 	proto_unregister(&key_proto);
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(NLKEY_SUPPORT)
+	/* release NETLINK_KEY socket */
+	sock_release(nlkey_socket->sk_socket);
+#endif
 }
 
 static int __init ipsec_pfkey_init(void)
@@ -3800,6 +4564,12 @@
 	err = xfrm_register_km(&pfkeyv2_mgr);
 	if (err != 0)
 		goto out_sock_unregister;
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(NLKEY_SUPPORT)
+	/* create NETLINK_KEY socket for IPSec offload on Comcerto */
+	ipsec_nlkey_init();
+#endif
+
 out:
 	return err;
 
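The af_key.c additions above speak a small private protocol over a NETLINK_KEY socket: every message is a 4-byte header (fcode, length) followed by at most 256 payload bytes, and a single XFRM event fans out into a sequence of messages (SA_CREATE, then SET_KEYS, SET_TUNNEL, SET_NATT, SET_LIFETIME and SET_STATE) driven by the msg_id state machine in ipsec_xfrm2nlkey(). A hedged userspace sketch of decoding one such message after it has been pulled out of the netlink payload; the struct layout is copied from the patch, the socket plumbing is omitted:

#include <stdio.h>
#include <string.h>

#define NLKEY_MSG_LEN	256
#define NLKEY_SA_CREATE	0x0A01

/* Mirrors struct nlkey_msg from the patch. */
struct nlkey_msg {
	unsigned short fcode;
	unsigned short length;
	unsigned short payload[NLKEY_MSG_LEN / sizeof(unsigned short)];
};

/* A real listener would switch on fcode and copy the payload into the
 * matching nlkey_sa_* structure, mirroring what ipsec_nlkey_rcv() does
 * for the user-space-to-kernel direction. */
static void dump_nlkey(const void *data, size_t len)
{
	struct nlkey_msg msg;

	if (len < 4)
		return;
	memcpy(&msg, data, len < sizeof(msg) ? len : sizeof(msg));
	printf("fcode=0x%04x payload=%u bytes\n",
	       (unsigned)msg.fcode, (unsigned)msg.length);
}

int main(void)
{
	struct nlkey_msg example = { .fcode = NLKEY_SA_CREATE, .length = 48 };

	dump_nlkey(&example, sizeof(example));
	return 0;
}
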
Only in b/net/netfilter: comcerto_fp_netfilter.c.
diff -ur a/net/netfilter/Makefile b/net/netfilter/Makefile
--- a/net/netfilter/Makefile	2013-08-03 09:59:52.000000000 +0200
+++ b/net/netfilter/Makefile	2014-01-21 09:37:31.000000000 +0100
@@ -6,6 +6,10 @@
 
 obj-$(CONFIG_NETFILTER) = netfilter.o
 
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+obj-$(CONFIG_COMCERTO_FP) += comcerto_fp_netfilter.o
+endif
+
 obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o
 obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o
 obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o
diff -ur a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
--- a/net/netfilter/nf_conntrack_core.c	2013-08-24 11:36:41.000000000 +0200
+++ b/net/netfilter/nf_conntrack_core.c	2014-02-17 11:56:52.000000000 +0100
@@ -228,6 +228,14 @@
 #endif
 #endif
 
+	#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_NETFILTER_XT_MATCH_LAYER7) || defined(CONFIG_NETFILTER_XT_MATCH_LAYER7_MODULE))
+	if(ct->layer7.app_proto)
+		kfree(ct->layer7.app_proto);
+	if(ct->layer7.app_data)
+		kfree(ct->layer7.app_data);
+	#endif
+
+
 	/* We overload first tuple to link into unconfirmed list. */
 	if (!nf_ct_is_confirmed(ct)) {
 		BUG_ON(hlist_nulls_unhashed(&ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode));
@@ -305,6 +313,9 @@
 {
 	struct nf_conn *ct = (void *)ul_conntrack;
 	struct nf_conn_tstamp *tstamp;
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+	struct nf_conntrack_l4proto *l4proto;
+#endif
 
 #if defined(CONFIG_SYNO_ARMADA)
 #if defined(CONFIG_MV_ETH_NFP_HOOKS)
@@ -395,6 +406,27 @@
 	if (tstamp && tstamp->stop == 0)
 		tstamp->stop = ktime_to_ns(ktime_get_real());
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+
+	if (test_bit(IPS_DYING_BIT, &ct->status) ||
+	   (!test_bit(IPS_PERMANENT_BIT, &ct->status)) ||
+	   ((l4proto->l4proto == IPPROTO_TCP) && (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED))) {
+		if (!test_bit(IPS_DYING_BIT, &ct->status) &&
+		    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
+			/* destroy event was not delivered */
+			nf_ct_delete_from_lists(ct);
+			nf_ct_insert_dying_list(ct);
+			return;
+		}
+		set_bit(IPS_DYING_BIT, &ct->status);
+		nf_ct_delete_from_lists(ct);
+		nf_ct_put(ct);
+	} else {
+		ct->timeout.expires = jiffies + COMCERTO_PERMANENT_TIMEOUT * HZ;
+		add_timer(&ct->timeout);
+	}
+#else
 	if (!test_bit(IPS_DYING_BIT, &ct->status) &&
 	    unlikely(nf_conntrack_event(IPCT_DESTROY, ct) < 0)) {
 		/* destroy event was not delivered */
@@ -405,6 +437,7 @@
 	set_bit(IPS_DYING_BIT, &ct->status);
 	nf_ct_delete_from_lists(ct);
 	nf_ct_put(ct);
+#endif
 }
 
 /*
@@ -708,7 +741,13 @@
 	if (!ct)
 		return dropped;
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+	clear_bit(IPS_PERMANENT_BIT, &ct->status);
+	/* Avoid race with timer expiration */
+	if (del_timer_sync(&ct->timeout)) {
+#else
 	if (del_timer(&ct->timeout)) {
+#endif
 		death_by_timeout((unsigned long)ct);
 		dropped = 1;
 		NF_CT_STAT_INC_ATOMIC(net, early_drop);
@@ -1202,7 +1241,13 @@
 		}
 	}
 
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+	clear_bit(IPS_PERMANENT_BIT, &ct->status);
+	/* Avoid race with timer expiration */
+	if (del_timer_sync(&ct->timeout)) {
+#else
 	if (del_timer(&ct->timeout)) {
+#endif
 		ct->timeout.function((unsigned long)ct);
 		return true;
 	}
@@ -1323,7 +1368,14 @@
 
 	while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
 		/* Time to push up daises... */
+
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+		clear_bit(IPS_PERMANENT_BIT, &ct->status);
+		/* Avoid race with timer expiration */
+		if (del_timer_sync(&ct->timeout))
+#else
 		if (del_timer(&ct->timeout))
+#endif
 			death_by_timeout((unsigned long)ct);
 		/* ... else the timer will get him soon. */
 
@@ -1485,6 +1537,44 @@
 }
 EXPORT_SYMBOL_GPL(nf_ct_alloc_hashtable);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+int nf_conntrack_set_dpi_allow_report(struct sk_buff *skb)
+{
+	int err = 0;
+	struct nf_conn *ct = (struct nf_conn *)skb->nfct;
+
+	nf_conntrack_get(skb->nfct);
+
+	set_bit(IPS_DPI_ALLOWED_BIT, &ct->status);
+
+	nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+
+	nf_conntrack_put(skb->nfct);
+
+	return err;
+}
+EXPORT_SYMBOL(nf_conntrack_set_dpi_allow_report);
+
+int nf_conntrack_set_dpi_allow_and_mark(struct sk_buff *skb, int mark)
+{
+	int err = 0;
+	struct nf_conn *ct = (struct nf_conn *)skb->nfct;
+
+	nf_conntrack_get(skb->nfct);
+
+	set_bit(IPS_DPI_ALLOWED_BIT, &ct->status);
+
+	ct->mark = mark;
+
+	nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
+
+	nf_conntrack_put(skb->nfct);
+
+	return err;
+}
+EXPORT_SYMBOL(nf_conntrack_set_dpi_allow_and_mark);
+#endif
+
 int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp)
 {
 	int i, bucket;
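
The conntrack changes above keep fast-path ("permanent") entries alive: the patched death_by_timeout() re-arms the timer for COMCERTO_PERMANENT_TIMEOUT seconds instead of destroying an entry that still has IPS_PERMANENT_BIT set and, for TCP, is still ESTABLISHED, while every manual teardown path clears the bit first and replaces del_timer() with del_timer_sync() so it cannot race with that re-arm. A small model of the keep-or-destroy decision; the flag and state names follow the patch, the structure is a stand-in:

#include <stdio.h>
#include <stdbool.h>

/* Stand-in for the bits of struct nf_conn consulted above. */
struct fake_ct {
	bool dying;		/* IPS_DYING_BIT */
	bool permanent;		/* IPS_PERMANENT_BIT */
	bool is_tcp;
	bool tcp_established;	/* proto.tcp.state == TCP_CONNTRACK_ESTABLISHED */
};

/* Mirrors the condition in the patched death_by_timeout(): destroy when
 * dying, not permanent, or a TCP entry that left ESTABLISHED; otherwise
 * re-arm the timeout and keep the entry. */
static bool keep_entry(const struct fake_ct *ct)
{
	if (ct->dying || !ct->permanent)
		return false;
	if (ct->is_tcp && !ct->tcp_established)
		return false;
	return true;
}

int main(void)
{
	struct fake_ct ct = { .permanent = true, .is_tcp = true,
			      .tcp_established = true };

	printf("established permanent entry kept: %d\n", keep_entry(&ct));
	ct.tcp_established = false;
	printf("kept after leaving ESTABLISHED:   %d\n", keep_entry(&ct));
	return 0;
}
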
diff -ur a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
--- a/net/netfilter/nf_conntrack_netlink.c	2013-08-24 11:36:41.000000000 +0200
+++ b/net/netfilter/nf_conntrack_netlink.c	2014-02-17 11:56:52.000000000 +0100
@@ -301,6 +301,43 @@
 #define ctnetlink_dump_secctx(a, b) (0)
 #endif
 
+#if defined(CONFIG_SYNO_COMCERTO)
+#if defined(CONFIG_COMCERTO_FP)
+static int
+ctnetlink_dump_comcerto_fp(struct sk_buff *skb, const struct nf_conn *ct)
+{
+	struct nlattr *nest_count;
+
+	nest_count = nla_nest_start(skb, CTA_COMCERTO_FP_ORIG | NLA_F_NESTED);
+	if (!nest_count)
+		goto nla_put_failure;
+
+	NLA_PUT_U32(skb, CTA_COMCERTO_FP_MARK, ct->fp_info[IP_CT_DIR_ORIGINAL].mark);
+	NLA_PUT_U32(skb, CTA_COMCERTO_FP_IFINDEX, ct->fp_info[IP_CT_DIR_ORIGINAL].ifindex);
+	NLA_PUT_U32(skb, CTA_COMCERTO_FP_IIF, ct->fp_info[IP_CT_DIR_ORIGINAL].iif);
+
+	nla_nest_end(skb, nest_count);
+
+	nest_count = nla_nest_start(skb, CTA_COMCERTO_FP_REPLY | NLA_F_NESTED);
+	if (!nest_count)
+		goto nla_put_failure;
+
+	NLA_PUT_U32(skb, CTA_COMCERTO_FP_MARK, ct->fp_info[IP_CT_DIR_REPLY].mark);
+	NLA_PUT_U32(skb, CTA_COMCERTO_FP_IFINDEX, ct->fp_info[IP_CT_DIR_REPLY].ifindex);
+	NLA_PUT_U32(skb, CTA_COMCERTO_FP_IIF, ct->fp_info[IP_CT_DIR_REPLY].iif);
+
+	nla_nest_end(skb, nest_count);
+
+	return 0;
+
+nla_put_failure:
+	return -1;
+}
+#else
+#define ctnetlink_dump_comcerto_fp(a, b) (0)
+#endif
+#endif
+
 #define master_tuple(ct) &(ct->master->tuplehash[IP_CT_DIR_ORIGINAL].tuple)
 
 static inline int
@@ -437,6 +474,9 @@
 	    ctnetlink_dump_helpinfo(skb, ct) < 0 ||
 	    ctnetlink_dump_mark(skb, ct) < 0 ||
 	    ctnetlink_dump_secctx(skb, ct) < 0 ||
+#if defined(CONFIG_SYNO_COMCERTO)
+	    ctnetlink_dump_comcerto_fp(skb, ct) < 0 ||
+#endif
 	    ctnetlink_dump_id(skb, ct) < 0 ||
 	    ctnetlink_dump_use(skb, ct) < 0 ||
 	    ctnetlink_dump_master(skb, ct) < 0 ||
@@ -528,6 +568,12 @@
 	       + nla_total_size(0) /* CTA_HELP */
 	       + nla_total_size(NF_CT_HELPER_NAME_LEN) /* CTA_HELP_NAME */
 	       + ctnetlink_secctx_size(ct)
+#if defined(CONFIG_SYNO_COMCERTO) && defined(CONFIG_COMCERTO_FP)
+	       + 2 * nla_total_size(0) /* CTA_COMCERTO_FP_ORIG|REPL */
+	       + 2 * nla_total_size(sizeof(uint32_t)) /* CTA_COMCERTO_FP_MARK */
+	       + 2 * nla_total_size(sizeof(uint32_t)) /* CTA_COMCERTO_FP_IFINDEX */
+	       + 2 * nla_total_size(sizeof(uint32_t)) /* CTA_COMCERTO_FP_IIF */
+#endif
 #ifdef CONFIG_NF_NAT_NEEDED
 	       + 2 * nla_total_size(0) /* CTA_NAT_SEQ_ADJ_ORIG|REPL */
 	       + 6 * nla_total_size(sizeof(u_int32_t)) /* CTA_NAT_SEQ_OFFSET */
@@ -605,6 +651,11 @@
 	if (nf_ct_zone(ct))
 		NLA_PUT_BE16(skb, CTA_ZONE, htons(nf_ct_zone(ct)));
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (ctnetlink_dump_comcerto_fp(skb, ct) < 0)
+		goto nla_put_failure;
+#endif
+
 	if (ctnetlink_dump_id(skb, ct) < 0)
 		goto nla_put_failure;
 
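ctnetlink_dump_comcerto_fp() above emits one nested attribute per direction (original and reply), each holding three u32 values: the fast-path mark, ifindex and iif, and ctnetlink_nlmsg_size() is grown to account for them. A sketch of the resulting attribute layout built by hand in userspace; struct nlattr and NLA_ALIGN follow the netlink UAPI, while the CTA_COMCERTO_FP_* numbering lives in Comcerto-specific headers, so plain numbers stand in for it here:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

struct nlattr { uint16_t nla_len; uint16_t nla_type; };
#define NLA_ALIGN(len)	(((len) + 3) & ~3)
#define NLA_F_NESTED	(1 << 15)

/* Append one u32 attribute at offset off and return the aligned new offset. */
static size_t put_u32(char *buf, size_t off, uint16_t type, uint32_t val)
{
	struct nlattr a = { .nla_len = sizeof(struct nlattr) + sizeof(uint32_t),
			    .nla_type = type };

	memcpy(buf + off, &a, sizeof(a));
	memcpy(buf + off + sizeof(a), &val, sizeof(val));
	return off + NLA_ALIGN(a.nla_len);
}

int main(void)
{
	char buf[128];
	size_t off = sizeof(struct nlattr);	/* room for the nest header */
	struct nlattr nest = { .nla_type = 1 /* stand-in attr number */ | NLA_F_NESTED };

	off = put_u32(buf, off, 1, 0x2a);	/* fp_info[dir].mark */
	off = put_u32(buf, off, 2, 3);		/* fp_info[dir].ifindex */
	off = put_u32(buf, off, 3, 2);		/* fp_info[dir].iif */
	nest.nla_len = off;			/* nest length covers header + payload */
	memcpy(buf, &nest, sizeof(nest));
	printf("nested attribute: %u bytes\n", (unsigned)nest.nla_len);
	return 0;
}
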
diff -ur a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
--- a/net/netfilter/nf_conntrack_proto_tcp.c	2013-08-24 11:36:41.000000000 +0200
+++ b/net/netfilter/nf_conntrack_proto_tcp.c	2014-02-17 11:56:52.000000000 +0100
@@ -29,6 +29,11 @@
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
 
+#if defined(CONFIG_SYNO_COMCERTO)
+/* Do not check the TCP window for incoming packets  */
+static int nf_ct_tcp_no_window_check __read_mostly = 1;
+#endif
+
 /* "Be conservative in what you do,
     be liberal in what you accept from others."
     If it's non-zero, we mark only out of window RST segments as INVALID. */
@@ -521,6 +526,11 @@
 	s16 receiver_offset;
 	bool res;
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (nf_ct_tcp_no_window_check)
+		return true;
+#endif
+
 	/*
 	 * Get the required data from the packet.
 	 */
@@ -1328,6 +1338,15 @@
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#if defined(CONFIG_SYNO_COMCERTO)
+	{
+		.procname       = "nf_conntrack_tcp_no_window_check",
+		.data           = &nf_ct_tcp_no_window_check,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+#endif
 	{
 		.procname       = "nf_conntrack_tcp_be_liberal",
 		.data           = &nf_ct_tcp_be_liberal,
diff -ur a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
--- a/net/netfilter/nf_conntrack_sip.c	2013-08-24 11:36:41.000000000 +0200
+++ b/net/netfilter/nf_conntrack_sip.c	2014-02-17 11:56:52.000000000 +0100
@@ -1363,8 +1363,31 @@
 {
 	enum ip_conntrack_info ctinfo;
 	struct nf_conn *ct = nf_ct_get(skb, &ctinfo);
+#if defined(CONFIG_SYNO_COMCERTO)
+	struct nf_conn_help *help = nfct_help(ct);
+	enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+#endif
 	unsigned int matchoff, matchlen;
 	unsigned int cseq, i;
+#if defined(CONFIG_SYNO_COMCERTO)
+	union nf_inet_addr addr;
+	__be16 port;
+#endif
+
+#if defined(CONFIG_SYNO_COMCERTO)
+	/* Many Cisco IP phones use a high source port for SIP requests, but
+	 * listen for the response on port 5060.  If we are the local
+	 * router for one of these phones, save the port number from the
+	 * Via: header so that nf_nat_sip can redirect the responses to
+	 * the correct port.
+	 */
+	if (ct_sip_parse_header_uri(ct, *dptr, NULL, *datalen,
+				    SIP_HDR_VIA_UDP, NULL, &matchoff,
+				    &matchlen, &addr, &port) > 0 &&
+	    port != ct->tuplehash[dir].tuple.src.u.udp.port &&
+	    nf_inet_addr_cmp(&addr, &ct->tuplehash[dir].tuple.src.u3))
+		help->help.ct_sip_info.forced_dport = port;
+#endif
 
 	for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) {
 		const struct sip_handler *handler;
diff -ur a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
--- a/net/netfilter/nf_conntrack_standalone.c	2013-08-24 11:36:41.000000000 +0200
+++ b/net/netfilter/nf_conntrack_standalone.c	2014-02-17 11:56:52.000000000 +0100
@@ -260,6 +260,12 @@
 #endif /* CONFIG_MV_ETH_NFP_HOOKS */
 #endif
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_NETFILTER_XT_MATCH_LAYER7) || defined(CONFIG_NETFILTER_XT_MATCH_LAYER7_MODULE))
+	if(ct->layer7.app_proto &&
+           seq_printf(s, "l7proto=%s ", ct->layer7.app_proto))
+		return -ENOSPC;
+#endif
+
 	if (seq_printf(s, "use=%u\n", atomic_read(&ct->ct_general.use)))
 		goto release;
 
diff -ur a/net/netfilter/xt_layer7.c b/net/netfilter/xt_layer7.c
--- a/net/netfilter/xt_layer7.c	2013-08-24 11:36:41.000000000 +0200
+++ b/net/netfilter/xt_layer7.c	2014-02-17 11:56:52.000000000 +0100
@@ -314,6 +314,40 @@
 }
 
 /* add the new app data to the conntrack.  Return number of bytes added. */
+#if defined(CONFIG_SYNO_COMCERTO)
+static int add_datastr(char *target, int offset, char *app_data, int len)
+{
+	int length = 0, i;
+	if (!target) return 0;
+
+	/* Strip nulls. Make everything lower case (our regex lib doesn't
+	do case insensitivity).  Add it to the end of the current data. */
+ 	for(i = 0; i < maxdatalen-offset-1 && i < len; i++) {
+		if(app_data[i] != '\0') {
+			/* the kernel version of tolower mungs 'upper ascii' */
+			target[length+offset] =
+				isascii(app_data[i])? 
+					tolower(app_data[i]) : app_data[i];
+			length++;
+		}
+	}
+	target[length+offset] = '\0';
+
+	return length;
+}
+
+/* add the new app data to the conntrack.  Return number of bytes added. */
+static int add_data(struct nf_conn * master_conntrack,
+                    char * app_data, int appdatalen)
+{
+	int length;
+
+	length = add_datastr(master_conntrack->layer7.app_data, master_conntrack->layer7.app_data_len, app_data, appdatalen);
+	master_conntrack->layer7.app_data_len += length;
+
+	return length;
+}
+#else
 static int add_data(struct nf_conn * master_conntrack,
                     char * app_data, int appdatalen)
 {
@@ -344,6 +378,7 @@
 
 	return length;
 }
+#endif
 
 /* taken from drivers/video/modedb.c */
 static int my_atoi(const char *s)
@@ -413,7 +448,9 @@
 }
 
 static bool
-#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 35)
+match(const struct sk_buff *skbin, struct xt_action_param *par)
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
 match(const struct sk_buff *skbin, const struct xt_match_param *par)
 #else
 match(const struct sk_buff *skbin,
@@ -439,6 +476,9 @@
 	enum ip_conntrack_info master_ctinfo, ctinfo;
 	struct nf_conn *master_conntrack, *conntrack;
 	unsigned char * app_data;
+#if defined(CONFIG_SYNO_COMCERTO)
+	unsigned char *tmp_data;
+#endif
 	unsigned int pattern_result, appdatalen;
 	regexp * comppattern;
 
@@ -466,8 +506,13 @@
 		master_conntrack = master_ct(master_conntrack);
 
 	/* if we've classified it or seen too many packets */
+#if defined(CONFIG_SYNO_COMCERTO)
+	if(!info->pkt && (total_acct_packets(master_conntrack) > num_packets ||
+	   master_conntrack->layer7.app_proto)) {
+#else
 	if(total_acct_packets(master_conntrack) > num_packets ||
 	   master_conntrack->layer7.app_proto) {
+#endif
 
 		pattern_result = match_no_append(conntrack, master_conntrack,
 						 ctinfo, master_ctinfo, info);
@@ -500,6 +545,27 @@
 	/* the return value gets checked later, when we're ready to use it */
 	comppattern = compile_and_cache(info->pattern, info->protocol);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (info->pkt) {
+		tmp_data = kmalloc(maxdatalen, GFP_ATOMIC);
+		if(!tmp_data){
+			if (net_ratelimit())
+				printk(KERN_ERR "layer7: out of memory in match, bailing.\n");
+			return info->invert;
+		}
+
+		tmp_data[0] = '\0';
+		add_datastr(tmp_data, 0, app_data, appdatalen);
+		pattern_result = ((comppattern && regexec(comppattern, tmp_data)) ? 1 : 0);
+
+		kfree(tmp_data);
+		tmp_data = NULL;
+		spin_unlock_bh(&l7_lock);
+
+		return (pattern_result ^ info->invert);
+	}
+#endif
+
 	/* On the first packet of a connection, allocate space for app data */
 	if(total_acct_packets(master_conntrack) == 1 && !skb->cb[0] &&
 	   !master_conntrack->layer7.app_data){
@@ -576,14 +642,29 @@
 }
 
 // load nf_conntrack_ipv4
+#if defined(CONFIG_SYNO_COMCERTO)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 35)
+static int
+#else
+static bool
+#endif
+#endif
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 28)
+#if defined(CONFIG_SYNO_COMCERTO)
+check(const struct xt_mtchk_param *par)
+#else
 static bool check(const struct xt_mtchk_param *par)
+#endif
 {
         if (nf_ct_l3proto_try_module_get(par->match->family) < 0) {
                 printk(KERN_WARNING "can't load conntrack support for "
                                     "proto=%d\n", par->match->family);
 #else
+#if defined(CONFIG_SYNO_COMCERTO)
+check(const char *tablename, const void *inf,
+#else
 static bool check(const char *tablename, const void *inf,
+#endif
 		 const struct xt_match *match, void *matchinfo,
 		 unsigned int hook_mask)
 {
@@ -591,9 +672,15 @@
                 printk(KERN_WARNING "can't load conntrack support for "
                                     "proto=%d\n", match->family);
 #endif
+#if defined(CONFIG_SYNO_COMCERTO) && LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 35)
+		return -EINVAL;
+	}
+	return 0;
+#else
                 return 0;
         }
 	return 1;
+#endif
 }
 
 
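The CONFIG_SYNO_COMCERTO variant of xt_layer7 above adds a per-packet matching mode: when info->pkt is set, only the payload of the current packet is copied into a temporary buffer with add_datastr() and matched against the compiled pattern, instead of being appended to the per-connection buffer that normally accumulates the first num_packets packets. From user space this is driven by the l7-filter iptables extension, e.g. a rule of the form "iptables -A FORWARD -m layer7 --l7proto bittorrent -j DROP"; the option that selects the per-packet mode comes from the matching userspace patch and is not shown here. The reworked check() merely follows the xt_mtchk_param convention of kernels 2.6.35 and later, returning 0 on success and -EINVAL on failure instead of true/false.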
diff -ur a/net/packet/af_packet.c b/net/packet/af_packet.c
--- a/net/packet/af_packet.c	2013-08-24 11:36:43.000000000 +0200
+++ b/net/packet/af_packet.c	2014-02-17 11:56:55.000000000 +0100
@@ -296,6 +296,9 @@
 	unsigned int		tp_loss:1;
 	unsigned int		tp_tstamp;
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
+#if defined(CONFIG_SYNO_COMCERTO)
+	unsigned int		pkt_type;
+#endif
 };
 
 #define PACKET_FANOUT_MAX	256
@@ -1392,6 +1395,9 @@
 {
 	struct sock *sk;
 	struct sockaddr_pkt *spkt;
+#if defined(CONFIG_SYNO_COMCERTO)
+	struct packet_sock *po;
+#endif
 
 	/*
 	 *	When we registered the protocol we saved the socket in the data
@@ -1399,6 +1405,9 @@
 	 */
 
 	sk = pt->af_packet_priv;
+#if defined(CONFIG_SYNO_COMCERTO)
+	po = pkt_sk(sk);
+#endif
 
 	/*
 	 *	Yank back the headers [hope the device set this
@@ -1411,7 +1420,11 @@
 	 *	so that this procedure is noop.
 	 */
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (!(po->pkt_type & (1 << skb->pkt_type)))
+#else
 	if (skb->pkt_type == PACKET_LOOPBACK)
+#endif
 		goto out;
 
 	if (!net_eq(dev_net(dev), sock_net(sk)))
@@ -1604,12 +1617,18 @@
 	int skb_len = skb->len;
 	unsigned int snaplen, res;
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 	if (skb->pkt_type == PACKET_LOOPBACK)
 		goto drop;
-
+#endif
 	sk = pt->af_packet_priv;
 	po = pkt_sk(sk);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (!(po->pkt_type & (1 << skb->pkt_type)))
+		goto drop;
+#endif
+
 	if (!net_eq(dev_net(dev), sock_net(sk)))
 		goto drop;
 
@@ -1728,12 +1747,19 @@
 	struct timespec ts;
 	struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
 
+#if !defined(CONFIG_SYNO_COMCERTO)
 	if (skb->pkt_type == PACKET_LOOPBACK)
 		goto drop;
+#endif
 
 	sk = pt->af_packet_priv;
 	po = pkt_sk(sk);
 
+#if defined(CONFIG_SYNO_COMCERTO)
+	if (!(po->pkt_type & (1 << skb->pkt_type)))
+		goto drop;
+#endif
+
 	if (!net_eq(dev_net(dev), sock_net(sk)))
 		goto drop;
 
@@ -2600,6 +2626,9 @@
 	spin_lock_init(&po->bind_lock);
 	mutex_init(&po->pg_vec_lock);
 	po->prot_hook.func = packet_rcv;
+#if defined(CONFIG_SYNO_COMCERTO)
+	po->pkt_type = PACKET_MASK_ANY & ~(1 << PACKET_LOOPBACK);
+#endif
 
 	if (sock->type == SOCK_PACKET)
 		po->prot_hook.func = packet_rcv_spkt;
@@ -3197,6 +3226,18 @@
 
 		return fanout_add(sk, val & 0xffff, val >> 16);
 	}
+#if defined(CONFIG_SYNO_COMCERTO)
+        case PACKET_RECV_TYPE:
+        {
+                unsigned int val;
+                if (optlen != sizeof(val))
+                        return -EINVAL;
+                if (copy_from_user(&val, optval, sizeof(val)))
+                        return -EFAULT;
+                po->pkt_type = val & ~(1 << PACKET_LOOPBACK);
+                return 0;
+        }
+#endif
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -3267,6 +3308,15 @@
 
 		data = &val;
 		break;
+#if defined(CONFIG_SYNO_COMCERTO)
+	case PACKET_RECV_TYPE:
+		if (len > sizeof(unsigned int))
+			len = sizeof(unsigned int);
+		val = po->pkt_type;
+
+		data = &val;
+		break;
+#endif
 	case PACKET_VERSION:
 		if (len > sizeof(int))
 			len = sizeof(int);
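The af_packet changes let a packet socket select which skb->pkt_type classes it receives: po->pkt_type is a bitmask indexed by pkt_type, initialised to everything except PACKET_LOOPBACK, and the new PACKET_RECV_TYPE socket option reads and writes that mask. A minimal user-space sketch, assuming the patched <linux/if_packet.h> defines PACKET_RECV_TYPE (it is not part of the mainline ABI):

	#include <stdio.h>
	#include <sys/socket.h>
	#include <arpa/inet.h>
	#include <linux/if_packet.h>
	#include <linux/if_ether.h>

	int main(void)
	{
		int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
		/* Deliver only frames addressed to this host plus broadcasts. */
		unsigned int mask = (1 << PACKET_HOST) | (1 << PACKET_BROADCAST);

		if (fd < 0 || setsockopt(fd, SOL_PACKET, PACKET_RECV_TYPE,
					 &mask, sizeof(mask)) < 0) {
			perror("PACKET_RECV_TYPE");
			return 1;
		}
		/* recvfrom(fd, ...) now sees only the selected pkt_type classes. */
		return 0;
	}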
Only in b/net/sched: act_connmark.c.
diff -ur a/net/sched/Kconfig b/net/sched/Kconfig
--- a/net/sched/Kconfig	2013-08-03 09:59:52.000000000 +0200
+++ b/net/sched/Kconfig	2014-01-21 09:37:31.000000000 +0100
@@ -148,6 +148,39 @@
 	  To compile this code as a module, choose M here: the
 	  module will be called sch_sfq.
 
+config NET_SCH_ESFQ
+	tristate "Enhanced Stochastic Fairness Queueing (ESFQ)"
+	depends on SYNO_COMCERTO
+	---help---
+	  Say Y here if you want to use the Enhanced Stochastic Fairness
+	  Queueing (ESFQ) packet scheduling algorithm for some of your network
+	  devices or as a leaf discipline for a classful qdisc such as HTB or
+	  CBQ (see the top of <file:net/sched/sch_esfq.c> for details and
+	  references to the SFQ algorithm).
+
+	  This is an enhanced SFQ version which allows you to control some
+	  hardcoded values in the SFQ scheduler.
+
+	  ESFQ also adds control of the hash function used to identify packet
+	  flows. The original SFQ discipline hashes by connection; ESFQ adds
+	  several other hashing methods, such as by src IP or by dst IP, which
+	  can be more fair to users in some networking situations.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called sch_esfq.
+
+config NET_SCH_ESFQ_NFCT
+	bool "Connection Tracking Hash Types"
+	depends on NET_SCH_ESFQ && NF_CONNTRACK
+	depends on SYNO_COMCERTO
+	---help---
+	  Say Y here to enable support for hashing based on netfilter connection
+	  tracking information. This is useful for a router that is also using
+	  NAT to connect privately-addressed hosts to the Internet. If you want
+	  to provide fair distribution of upstream bandwidth, ESFQ must use
+	  connection tracking information, since all outgoing packets will share
+	  the same source address.
+
 config NET_SCH_TEQL
 	tristate "True Link Equalizer (TEQL)"
 	---help---
@@ -571,6 +604,20 @@
 	  To compile this code as a module, choose M here: the
 	  module will be called act_csum.
 
+config NET_ACT_CONNMARK
+	tristate "Connection Tracking Marking"
+	depends on NET_CLS_ACT
+	depends on NF_CONNTRACK
+	depends on NF_CONNTRACK_MARK
+	depends on SYNO_COMCERTO
+	---help---
+	  Say Y here to restore the connmark from a scheduler action.
+
+	  If unsure, say N.
+
+	  To compile this code as a module, choose M here: the
+	  module will be called act_connmark.
+
 config NET_CLS_IND
 	bool "Incoming device classification"
 	depends on NET_CLS_U32 || NET_CLS_FW
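ESFQ is set up from user space like SFQ, with the extra knobs the help text describes exposed through a patched tc; a typical rule looks something like "tc qdisc add dev eth0 root esfq perturb 10 hash dst" (illustrative; the exact parameter and hash-type names, including the conntrack-based hashes enabled by NET_SCH_ESFQ_NFCT, depend on the iproute2 patch that accompanies sch_esfq.c).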
diff -ur a/net/sched/Makefile b/net/sched/Makefile
--- a/net/sched/Makefile	2013-08-03 09:59:52.000000000 +0200
+++ b/net/sched/Makefile	2014-01-21 09:37:31.000000000 +0100
@@ -16,6 +16,9 @@
 obj-$(CONFIG_NET_ACT_SIMP)	+= act_simple.o
 obj-$(CONFIG_NET_ACT_SKBEDIT)	+= act_skbedit.o
 obj-$(CONFIG_NET_ACT_CSUM)	+= act_csum.o
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+obj-$(CONFIG_NET_ACT_CONNMARK)	+= act_connmark.o
+endif
 obj-$(CONFIG_NET_SCH_FIFO)	+= sch_fifo.o
 obj-$(CONFIG_NET_SCH_CBQ)	+= sch_cbq.o
 obj-$(CONFIG_NET_SCH_HTB)	+= sch_htb.o
@@ -26,6 +29,9 @@
 obj-$(CONFIG_NET_SCH_DSMARK)	+= sch_dsmark.o
 obj-$(CONFIG_NET_SCH_SFB)	+= sch_sfb.o
 obj-$(CONFIG_NET_SCH_SFQ)	+= sch_sfq.o
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+obj-$(CONFIG_NET_SCH_ESFQ)	+= sch_esfq.o
+endif
 obj-$(CONFIG_NET_SCH_TBF)	+= sch_tbf.o
 obj-$(CONFIG_NET_SCH_TEQL)	+= sch_teql.o
 obj-$(CONFIG_NET_SCH_PRIO)	+= sch_prio.o
Only in b/net/sched: sch_esfq.c.
diff -ur a/net/unix/af_unix.c b/net/unix/af_unix.c
--- a/net/unix/af_unix.c	2013-08-24 11:36:42.000000000 +0200
+++ b/net/unix/af_unix.c	2014-02-17 11:56:54.000000000 +0100
@@ -2159,7 +2159,7 @@
 	}
 
 	/* No write status requested, avoid expensive OUT tests. */
-	if (wait && !(wait->key & (POLLWRBAND | POLLWRNORM | POLLOUT)))
+	if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
 		return mask;
 
 	writable = unix_writable(sk);
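The change above replaces the open-coded test of wait->key with poll_requested_events(), which also subsumes the NULL check: with no poll table the helper reports all events as requested, so the writability tests still run, exactly as before. For reference, a sketch of the mainline helper (from include/linux/poll.h of later 3.x kernels; this patch set is assumed to backport it elsewhere):

	static inline unsigned long poll_requested_events(const poll_table *p)
	{
		/* A NULL poll table means the caller wants every event checked. */
		return p ? p->_key : ~0UL;
	}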
diff -ur a/net/wireless/Kconfig b/net/wireless/Kconfig
--- a/net/wireless/Kconfig	2013-08-03 09:59:52.000000000 +0200
+++ b/net/wireless/Kconfig	2014-01-21 09:37:31.000000000 +0100
@@ -12,10 +12,10 @@
 	depends on WEXT_CORE
 
 config WEXT_SPY
-	bool
+	bool "WEXT_SPY"
 
 config WEXT_PRIV
-	bool
+	bool "WEXT_PRIV"
 	default y
 
 config CFG80211
@@ -144,13 +144,13 @@
 	  you want this built into your kernel.
 
 config LIB80211_CRYPT_WEP
-	tristate
+	tristate "LIB80211_CRYPT_WEP"
 
 config LIB80211_CRYPT_CCMP
-	tristate
+	tristate "LIB80211_CRYPT_CCMP"
 
 config LIB80211_CRYPT_TKIP
-	tristate
+	tristate "LIB80211_CRYPT_TKIP"
 
 config LIB80211_DEBUG
 	bool "lib80211 debugging messages"
diff -ur a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
--- a/net/xfrm/xfrm_output.c	2013-08-24 11:36:43.000000000 +0200
+++ b/net/xfrm/xfrm_output.c	2014-02-17 11:56:55.000000000 +0100
@@ -43,11 +43,32 @@
 	struct dst_entry *dst = skb_dst(skb);
 	struct xfrm_state *x = dst->xfrm;
 	struct net *net = xs_net(x);
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	struct xfrm_state *xfrm_vec[XFRM_MAX_DEPTH];
+	int xfrm_nr = 0;
+	int i;
+#endif
 
 	if (err <= 0)
 		goto resume;
 
 	do {
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+		if (x->offloaded)  {
+
+			if (xfrm_nr == XFRM_MAX_DEPTH) {
+				err = -ENOBUFS;
+				goto out_exit;
+			}
+
+			if (!x->curlft.use_time) 
+				x->curlft.use_time = get_seconds();
+
+			xfrm_vec[xfrm_nr++] = x;
+			skb->ipsec_offload = 1;
+			goto next_dst;
+		}
+#endif
 		err = xfrm_state_check_space(x, skb);
 		if (err) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
@@ -90,6 +111,9 @@
 			goto error_nolock;
 		}
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+next_dst:
+#endif
 		dst = skb_dst_pop(skb);
 		if (!dst) {
 			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
@@ -101,6 +125,26 @@
 	} while (x && !(x->outer_mode->flags & XFRM_MODE_FLAG_TUNNEL));
 
 	err = 0;
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
+		struct sec_path *sp;
+
+		sp = secpath_dup(skb->sp);
+		if (!sp)
+			goto error_nolock;
+		if (skb->sp)
+			secpath_put(skb->sp);
+		skb->sp = sp;
+	}
+	if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH)
+		goto error_nolock;
+
+	memcpy(skb->sp->xvec + skb->sp->len, xfrm_vec,
+	       xfrm_nr * sizeof(xfrm_vec[0]));
+	skb->sp->len += xfrm_nr;
+	for (i = 0; i < skb->sp->len; i++)
+		xfrm_state_hold(skb->sp->xvec[i]);
+#endif
 
 out_exit:
 	return err;
diff -ur a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
--- a/net/xfrm/xfrm_policy.c	2013-08-24 11:36:43.000000000 +0200
+++ b/net/xfrm/xfrm_policy.c	2014-02-17 11:56:55.000000000 +0100
@@ -46,7 +46,15 @@
 
 static struct kmem_cache *xfrm_dst_cache __read_mostly;
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+extern int ipsec_nlkey_flow(u16 xfrm_nr, u16 *xfrm_handle,
+                const struct flowi *fl, u16 family, u16 dir);
+#endif
+
+#if !defined(CONFIG_SYNO_COMCERTO) || (!defined(CONFIG_INET_IPSEC_OFFLOAD) && !defined(CONFIG_INET6_IPSEC_OFFLOAD))
 static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family);
+#endif
+
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo);
 static void xfrm_init_pmtu(struct dst_entry *dst);
 static int stale_bundle(struct dst_entry *dst);
@@ -94,7 +102,10 @@
 	return 0;
 }
 
-static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos,
+#if !defined(CONFIG_SYNO_COMCERTO) || (!defined(CONFIG_INET_IPSEC_OFFLOAD) && !defined(CONFIG_INET6_IPSEC_OFFLOAD))
+static inline 
+#endif
+struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos,
 						  const xfrm_address_t *saddr,
 						  const xfrm_address_t *daddr,
 						  int family)
@@ -112,8 +123,14 @@
 
 	return dst;
 }
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+EXPORT_SYMBOL(__xfrm_dst_lookup);
+#endif
 
-static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos,
+#if !defined(CONFIG_SYNO_COMCERTO) || (!defined(CONFIG_INET_IPSEC_OFFLOAD) && !defined(CONFIG_INET6_IPSEC_OFFLOAD))
+static inline 
+#endif
+struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos,
 						xfrm_address_t *prev_saddr,
 						xfrm_address_t *prev_daddr,
 						int family)
@@ -143,6 +160,9 @@
 
 	return dst;
 }
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+EXPORT_SYMBOL(xfrm_dst_lookup);
+#endif
 
 static inline unsigned long make_jiffies(long secs)
 {
@@ -1264,7 +1284,10 @@
  * still valid.
  */
 
-static inline int xfrm_get_tos(const struct flowi *fl, int family)
+#if !defined(CONFIG_SYNO_COMCERTO) || (!defined(CONFIG_INET_IPSEC_OFFLOAD) && !defined(CONFIG_INET6_IPSEC_OFFLOAD))
+static inline 
+#endif
+int xfrm_get_tos(const struct flowi *fl, int family)
 {
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 	int tos;
@@ -1278,6 +1301,9 @@
 
 	return tos;
 }
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+EXPORT_SYMBOL(xfrm_get_tos);
+#endif
 
 static struct flow_cache_object *xfrm_bundle_flo_get(struct flow_cache_object *flo)
 {
@@ -1461,7 +1487,12 @@
 		xdst->route = dst;
 		dst_copy_metrics(dst1, dst);
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+		if ((xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) &&
+			(!xfrm[i]->offloaded)) {
+#else
 		if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) {
+#endif
 			family = xfrm[i]->props.family;
 			dst = xfrm_dst_lookup(xfrm[i], tos, &saddr, &daddr,
 					      family);
@@ -1786,6 +1817,9 @@
 	u16 family = dst_orig->ops->family;
 	u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT);
 	int i, err, num_pols, num_xfrms = 0, drop_pols = 0;
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	u8 new_flow = 0;
+#endif
 
 restart:
 	dst = NULL;
@@ -1836,8 +1870,13 @@
 		    !net->xfrm.policy_count[XFRM_POLICY_OUT])
 			goto nopol;
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+		flo = flow_cache_lookup(net, fl, family, dir, &new_flow,
+					xfrm_bundle_lookup, dst_orig);
+#else
 		flo = flow_cache_lookup(net, fl, family, dir,
 					xfrm_bundle_lookup, dst_orig);
+#endif
 		if (flo == NULL)
 			goto nopol;
 		if (IS_ERR(flo)) {
@@ -1917,6 +1956,30 @@
 		dst_release(dst);
 		dst = dst_orig;
 	}
+
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	if (new_flow) {
+		struct dst_entry *dst1 = dst;
+		struct xfrm_state *x; 
+		u16	xfrm_handle[XFRM_POLICY_TYPE_MAX];
+
+		num_xfrms = 0;
+		memset(xfrm_handle, 0, XFRM_POLICY_TYPE_MAX*sizeof(u16));
+		while((x = dst1->xfrm) != NULL) {
+			if (!x->offloaded)
+				goto ok;
+			xfrm_handle[num_xfrms++] = x->handle;
+			dst1 = dst1->child;
+			if (dst1 == NULL) {
+				err = -EHOSTUNREACH;
+				goto error;
+			}
+		}
+		// send flow notification to cmm with sa_handle
+		ipsec_nlkey_flow(num_xfrms, xfrm_handle, fl, family, (unsigned short)dir);
+	}
+#endif
+
 ok:
 	xfrm_pols_put(pols, drop_pols);
 	if (dst && dst->xfrm &&
@@ -2042,6 +2105,9 @@
 	int xfrm_nr;
 	int pi;
 	int reverse;
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	u8 new_flow = 0;
+#endif
 	struct flowi fl;
 	u8 fl_dir;
 	int xerr_idx = -1;
@@ -2082,8 +2148,13 @@
 	if (!pol) {
 		struct flow_cache_object *flo;
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+		flo = flow_cache_lookup(net, &fl, family, fl_dir, &new_flow,
+					xfrm_policy_lookup, NULL);
+#else
 		flo = flow_cache_lookup(net, &fl, family, fl_dir,
 					xfrm_policy_lookup, NULL);
+#endif
 		if (IS_ERR_OR_NULL(flo))
 			pol = ERR_CAST(flo);
 		else
@@ -2177,6 +2248,28 @@
 			goto reject;
 		}
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+		if (new_flow) {
+			struct xfrm_state *x;
+			u16	xfrm_handle[XFRM_POLICY_TYPE_MAX];
+
+			xfrm_nr = 0;
+			memset(xfrm_handle, 0, XFRM_POLICY_TYPE_MAX*sizeof(u16));
+			for (i=skb->sp->len-1; i>=0; i--) 
+			{
+				x = skb->sp->xvec[i];
+				
+				if (!x->offloaded)
+					goto std_path;
+				
+				xfrm_handle[xfrm_nr++] = x->handle;
+			}
+			// send flow notification to cmm with sa_handle
+			ipsec_nlkey_flow(xfrm_nr, xfrm_handle, (const struct flowi *)&fl, family, fl_dir);
+		}
+
+std_path:
+#endif
 		xfrm_pols_put(pols, npols);
 		return 1;
 	}
@@ -2506,7 +2599,10 @@
 	read_unlock_bh(&xfrm_policy_afinfo_lock);
 }
 
-static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
+#if !defined(CONFIG_SYNO_COMCERTO) || (!defined(CONFIG_INET_IPSEC_OFFLOAD) && !defined(CONFIG_INET6_IPSEC_OFFLOAD))
+static 
+#endif
+struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family)
 {
 	struct xfrm_policy_afinfo *afinfo;
 	if (unlikely(family >= NPROTO))
@@ -2517,6 +2613,9 @@
 		read_unlock(&xfrm_policy_afinfo_lock);
 	return afinfo;
 }
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+EXPORT_SYMBOL(xfrm_policy_get_afinfo);
+#endif
 
 static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
 {
diff -ur a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
--- a/net/xfrm/xfrm_state.c	2013-08-24 11:36:43.000000000 +0200
+++ b/net/xfrm/xfrm_state.c	2014-02-17 11:56:55.000000000 +0100
@@ -39,6 +39,10 @@
 
 static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+static unsigned short xfrm_state_handle;
+#endif
+
 static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned int family);
 static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
 
@@ -66,11 +70,20 @@
 	return __xfrm_spi_hash(daddr, spi, proto, family, net->xfrm.state_hmask);
 }
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
 static void xfrm_hash_transfer(struct hlist_head *list,
 			       struct hlist_head *ndsttable,
 			       struct hlist_head *nsrctable,
 			       struct hlist_head *nspitable,
+			       struct hlist_head *nhtable,
 			       unsigned int nhashmask)
+#else
+static void xfrm_hash_transfer(struct hlist_head *list,
+			       struct hlist_head *ndsttable,
+			       struct hlist_head *nsrctable,
+			       struct hlist_head *nspitable,
+			       unsigned int nhashmask)
+#endif
 {
 	struct hlist_node *entry, *tmp;
 	struct xfrm_state *x;
@@ -94,6 +107,11 @@
 					    nhashmask);
 			hlist_add_head(&x->byspi, nspitable+h);
 		}
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+		if (x->handle) {
+			hlist_add_head(&x->byh, nhtable+(x->handle & nhashmask));
+		}
+#endif
 	}
 }
 
@@ -108,6 +126,9 @@
 {
 	struct net *net = container_of(work, struct net, xfrm.state_hash_work);
 	struct hlist_head *ndst, *nsrc, *nspi, *odst, *osrc, *ospi;
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	struct hlist_head *nh, *oh;
+#endif
 	unsigned long nsize, osize;
 	unsigned int nhashmask, ohashmask;
 	int i;
@@ -129,22 +150,41 @@
 		xfrm_hash_free(nsrc, nsize);
 		goto out_unlock;
 	}
-
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	nh = xfrm_hash_alloc(nsize);
+	if (!nh) {
+		xfrm_hash_free(ndst, nsize);
+		xfrm_hash_free(nsrc, nsize);
+		xfrm_hash_free(nspi, nsize);
+		goto out_unlock;
+	}
+#endif
 	spin_lock_bh(&xfrm_state_lock);
 
 	nhashmask = (nsize / sizeof(struct hlist_head)) - 1U;
 	for (i = net->xfrm.state_hmask; i >= 0; i--)
-		xfrm_hash_transfer(net->xfrm.state_bydst+i, ndst, nsrc, nspi,
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+		xfrm_hash_transfer(net->xfrm.state_bydst+i, ndst, nsrc, nspi, nh,
 				   nhashmask);
+#else
+		xfrm_hash_transfer(net->xfrm.state_bydst+i, ndst, nsrc, nspi,
+				   nhashmask);	
+#endif
 
 	odst = net->xfrm.state_bydst;
 	osrc = net->xfrm.state_bysrc;
 	ospi = net->xfrm.state_byspi;
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	oh   = net->xfrm.state_byh;
+#endif
 	ohashmask = net->xfrm.state_hmask;
 
 	net->xfrm.state_bydst = ndst;
 	net->xfrm.state_bysrc = nsrc;
 	net->xfrm.state_byspi = nspi;
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	net->xfrm.state_byh   = nh;
+#endif
 	net->xfrm.state_hmask = nhashmask;
 
 	spin_unlock_bh(&xfrm_state_lock);
@@ -153,6 +193,9 @@
 	xfrm_hash_free(odst, osize);
 	xfrm_hash_free(osrc, osize);
 	xfrm_hash_free(ospi, osize);
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	xfrm_hash_free(oh,   osize);
+#endif
 
 out_unlock:
 	mutex_unlock(&hash_resize_mutex);
@@ -494,6 +537,9 @@
 		INIT_HLIST_NODE(&x->bydst);
 		INIT_HLIST_NODE(&x->bysrc);
 		INIT_HLIST_NODE(&x->byspi);
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+		INIT_HLIST_NODE(&x->byh);
+#endif
 		tasklet_hrtimer_init(&x->mtimer, xfrm_timer_handler, CLOCK_REALTIME, HRTIMER_MODE_ABS);
 		setup_timer(&x->rtimer, xfrm_replay_timer_handler,
 				(unsigned long)x);
@@ -504,6 +550,12 @@
 		x->lft.hard_packet_limit = XFRM_INF;
 		x->replay_maxage = 0;
 		x->replay_maxdiff = 0;
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+		x->handle = xfrm_state_handle++;
+		if (x->handle == 0)
+			x->handle = xfrm_state_handle++;
+		hlist_add_head(&x->byh, net->xfrm.state_byh+(x->handle & net->xfrm.state_hmask));
+#endif
 		x->inner_mode = NULL;
 		x->inner_mode_iaf = NULL;
 		spin_lock_init(&x->lock);
@@ -538,6 +590,10 @@
 		hlist_del(&x->bysrc);
 		if (x->id.spi)
 			hlist_del(&x->byspi);
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+		if (x->handle)
+			hlist_del(&x->byh);
+#endif
 		net->xfrm.state_num--;
 		spin_unlock(&xfrm_state_lock);
 
@@ -1408,6 +1464,37 @@
 }
 EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+struct xfrm_state *__xfrm_state_lookup_byhandle(struct net *net, u16 handle)
+{
+	unsigned int h = (handle & net->xfrm.state_hmask);
+	struct xfrm_state *x;
+	struct hlist_node *entry;
+
+	hlist_for_each_entry(x, entry, net->xfrm.state_byh+h, byh) {
+		if (x->handle != handle)
+			continue;
+	
+		xfrm_state_hold(x);
+		return x;
+	}
+
+	return NULL;
+}
+
+struct xfrm_state *
+xfrm_state_lookup_byhandle(struct net *net, u16 handle)
+{
+	struct xfrm_state *x;
+
+	spin_lock_bh(&xfrm_state_lock);
+	x = __xfrm_state_lookup_byhandle(net, handle);
+	spin_unlock_bh(&xfrm_state_lock);
+	return x;
+}
+EXPORT_SYMBOL(xfrm_state_lookup_byhandle);
+#endif
+
 struct xfrm_state *
 xfrm_find_acq(struct net *net, struct xfrm_mark *mark, u8 mode, u32 reqid, u8 proto,
 	      const xfrm_address_t *daddr, const xfrm_address_t *saddr,
@@ -2026,6 +2113,12 @@
 	net->xfrm.state_byspi = xfrm_hash_alloc(sz);
 	if (!net->xfrm.state_byspi)
 		goto out_byspi;
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+	net->xfrm.state_byh = xfrm_hash_alloc(sz);
+	if (!net->xfrm.state_byh)
+		goto out_byh;
+	get_random_bytes(&xfrm_state_handle, sizeof(xfrm_state_handle));
+#endif
 	net->xfrm.state_hmask = ((sz / sizeof(struct hlist_head)) - 1);
 
 	net->xfrm.state_num = 0;
@@ -2035,6 +2128,10 @@
 	init_waitqueue_head(&net->xfrm.km_waitq);
 	return 0;
 
+#if defined(CONFIG_SYNO_COMCERTO) && (defined(CONFIG_INET_IPSEC_OFFLOAD) || defined(CONFIG_INET6_IPSEC_OFFLOAD))
+out_byh:
+	xfrm_hash_free(net->xfrm.state_byspi, sz);
+#endif
 out_byspi:
 	xfrm_hash_free(net->xfrm.state_bysrc, sz);
 out_bysrc:
diff -ur a/scripts/gen_initramfs_list.sh b/scripts/gen_initramfs_list.sh
--- a/scripts/gen_initramfs_list.sh	2013-08-03 09:59:52.000000000 +0200
+++ b/scripts/gen_initramfs_list.sh	2014-01-21 09:37:31.000000000 +0100
@@ -226,7 +226,7 @@
 output="/dev/stdout"
 output_file=""
 is_cpio_compressed=
-compr="gzip -n -9 -f"
+compr="gzip -n -9 -f -"
 
 arg="$1"
 case "$arg" in
@@ -240,9 +240,9 @@
 		output_file="$1"
 		cpio_list="$(mktemp ${TMPDIR:-/tmp}/cpiolist.XXXXXX)"
 		output=${cpio_list}
-		echo "$output_file" | grep -q "\.gz$" && compr="gzip -n -9 -f"
-		echo "$output_file" | grep -q "\.bz2$" && compr="bzip2 -9 -f"
-		echo "$output_file" | grep -q "\.lzma$" && compr="lzma -9 -f"
+		echo "$output_file" | grep -q "\.gz$" && compr="gzip -n -9 -f -"
+		echo "$output_file" | grep -q "\.bz2$" && compr="bzip2 -9 -f -"
+		echo "$output_file" | grep -q "\.lzma$" && compr="lzma e -d20 -lc1 -lp2 -pb2 -eos -si -so"
 		echo "$output_file" | grep -q "\.xz$" && \
 				compr="xz --check=crc32 --lzma2=dict=1MiB"
 		echo "$output_file" | grep -q "\.lzo$" && compr="lzop -9 -f"
@@ -303,7 +303,7 @@
 	if [ "${is_cpio_compressed}" = "compressed" ]; then
 		cat ${cpio_tfile} > ${output_file}
 	else
-		(cat ${cpio_tfile} | ${compr}  - > ${output_file}) \
+		(cat ${cpio_tfile} | ${compr} > ${output_file}) \
 		|| (rm -f ${output_file} ; false)
 	fi
 	[ -z ${cpio_file} ] && rm ${cpio_tfile}
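The replacement compressor command switches from the lzma-utils filter syntax ("lzma -9") to the p7zip "lzma alone" encoder: "e" selects encoding, -si/-so stream stdin to stdout, -d20 uses a 2^20-byte (1 MiB) dictionary, -lc1 -lp2 -pb2 set the literal-context, literal-position and position bits, and -eos appends an end-of-stream marker so the decompressor does not need the uncompressed size up front. The same invocation is used for the kernel image in the scripts/Makefile.lib hunk further down, so the build assumes the p7zip-style lzma binary is the one found on PATH.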
diff -ur a/scripts/kallsyms.c b/scripts/kallsyms.c
--- a/scripts/kallsyms.c	2013-08-24 11:36:41.000000000 +0200
+++ b/scripts/kallsyms.c	2014-02-17 11:56:51.000000000 +0100
@@ -22,6 +22,35 @@
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
+#if defined(CONFIG_SYNO_COMCERTO) && defined(__APPLE__)
+/* Darwin has no memmem implementation, this one is ripped of the uClibc-0.9.28 source */
+void *memmem (const void *haystack, size_t haystack_len,
+                          const void *needle,  size_t needle_len)
+{
+  const char *begin;
+  const char *const last_possible
+    = (const char *) haystack + haystack_len - needle_len;
+
+  if (needle_len == 0)
+    /* The first occurrence of the empty string is deemed to occur at
+       the beginning of the string.  */
+    return (void *) haystack;
+
+  /* Sanity check, otherwise the loop might search through the whole
+     memory.  */
+  if (__builtin_expect (haystack_len < needle_len, 0))
+    return NULL;
+
+  for (begin = (const char *) haystack; begin <= last_possible; ++begin)
+    if (begin[0] == ((const char *) needle)[0] &&
+        !memcmp ((const void *) &begin[1],
+                 (const void *) ((const char *) needle + 1),
+                 needle_len - 1))
+      return (void *) begin;
+
+  return NULL;
+}
+#endif
 
 #ifndef ARRAY_SIZE
 #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0]))
diff -ur a/scripts/kconfig/Makefile b/scripts/kconfig/Makefile
--- a/scripts/kconfig/Makefile	2013-08-03 09:59:52.000000000 +0200
+++ b/scripts/kconfig/Makefile	2014-01-21 09:37:31.000000000 +0100
@@ -123,6 +123,11 @@
 # we really need to do so. (Do not call gcc as part of make mrproper)
 HOST_EXTRACFLAGS += $(shell $(CONFIG_SHELL) $(check-lxdialog) -ccflags) \
                     -DLOCALE
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+ifeq ($(shell uname -s),Darwin)
+HOST_LOADLIBES  += -lncurses
+endif
+endif
 
 # ===========================================================================
 # Shared Makefile for the various kconfig executables:
diff -ur a/scripts/Makefile.lib b/scripts/Makefile.lib
--- a/scripts/Makefile.lib	2013-08-03 09:59:52.000000000 +0200
+++ b/scripts/Makefile.lib	2014-01-21 09:37:31.000000000 +0100
@@ -295,9 +295,15 @@
 # ---------------------------------------------------------------------------
 
 quiet_cmd_lzma = LZMA    $@
+ifeq ($(CONFIG_SYNO_COMCERTO),y)
+cmd_lzma = (cat $(filter-out FORCE,$^) | \
+	lzma e -d20 -lc1 -lp2 -pb2 -eos -si -so && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+	(rm -f $@ ; false)
+else
 cmd_lzma = (cat $(filter-out FORCE,$^) | \
 	lzma -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
 	(rm -f $@ ; false)
+endif
 
 quiet_cmd_lzo = LZO     $@
 cmd_lzo = (cat $(filter-out FORCE,$^) | \
diff -ur a/scripts/mod/mk_elfconfig.c b/scripts/mod/mk_elfconfig.c
--- a/scripts/mod/mk_elfconfig.c	2013-08-24 11:36:41.000000000 +0200
+++ b/scripts/mod/mk_elfconfig.c	2014-02-17 11:56:51.000000000 +0100
@@ -1,7 +1,11 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#if !defined(CONFIG_SYNO_COMCERTO) || !defined(__APPLE__)
 #include <elf.h>
+#else
+#include "../../../../../tools/sstrip/include/elf.h"
+#endif
 
 int
 main(int argc, char **argv)
diff -ur a/scripts/mod/modpost.h b/scripts/mod/modpost.h
--- a/scripts/mod/modpost.h	2013-08-24 11:36:41.000000000 +0200
+++ b/scripts/mod/modpost.h	2014-02-17 11:56:51.000000000 +0100
@@ -7,7 +7,11 @@
 #include <sys/mman.h>
 #include <fcntl.h>
 #include <unistd.h>
+#if !defined(CONFIG_SYNO_COMCERTO) || (!(defined(__APPLE__) || defined(__CYGWIN__)))
 #include <elf.h>
+#else
+#include "../../../../../tools/sstrip/include/elf.h"
+#endif
 
 #include "elfconfig.h"
 
diff -ur a/scripts/setlocalversion b/scripts/setlocalversion
--- a/scripts/setlocalversion	2013-08-03 09:59:52.000000000 +0200
+++ b/scripts/setlocalversion	2014-01-21 09:37:31.000000000 +0100
@@ -168,7 +168,7 @@
 	# annotated or signed tagged state (as git describe only
 	# looks at signed or annotated tags - git tag -a/-s) and
 	# LOCALVERSION= is not specified
-	if test "${LOCALVERSION+set}" != "set"; then
+	if test "${CONFIG_LOCALVERSION+set}" != "set"; then
 		scm=$(scm_version --short)
 		res="$res${scm:++}"
 	fi
diff -ur a/sound/sound_core.c b/sound/sound_core.c
--- a/sound/sound_core.c	2013-08-24 11:36:48.000000000 +0200
+++ b/sound/sound_core.c	2014-02-17 11:57:03.000000000 +0100
@@ -594,6 +594,7 @@
 	if (preclaim_oss && !new_fops) {
 		spin_unlock(&sound_loader_lock);
 
+#ifndef MY_ABC_HERE
 		/*
 		 *  Please, don't change this order or code.
 		 *  For ALSA slot means soundcard and OSS emulation code
@@ -613,6 +614,7 @@
 		 */
 		if (request_module("char-major-%d-%d", SOUND_MAJOR, unit) > 0)
 			request_module("char-major-%d", SOUND_MAJOR);
+#endif
 
 		spin_lock(&sound_loader_lock);
 		s = __look_for_unit(chain, unit);
diff -ur a/synoconfigs/88f6180 b/synoconfigs/88f6180
--- a/synoconfigs/88f6180	2013-08-03 09:59:52.000000000 +0200
+++ b/synoconfigs/88f6180	2014-01-21 09:37:34.000000000 +0100
@@ -100,7 +100,7 @@
 CONFIG_TIMERFD=y
 CONFIG_EVENTFD=y
 CONFIG_SHMEM=y
-# CONFIG_AIO is not set
+CONFIG_AIO=y
 CONFIG_EMBEDDED=y
 CONFIG_HAVE_PERF_EVENTS=y
 CONFIG_PERF_USE_VMALLOC=y
diff -ur a/synoconfigs/88f6281 b/synoconfigs/88f6281
--- a/synoconfigs/88f6281	2013-08-03 09:59:52.000000000 +0200
+++ b/synoconfigs/88f6281	2014-01-21 09:37:34.000000000 +0100
@@ -100,7 +100,7 @@
 # CONFIG_TIMERFD is not set
 CONFIG_EVENTFD=y
 CONFIG_SHMEM=y
-# CONFIG_AIO is not set
+CONFIG_AIO=y
 CONFIG_EMBEDDED=y
 CONFIG_HAVE_PERF_EVENTS=y
 CONFIG_PERF_USE_VMALLOC=y
diff -ur a/synoconfigs/armada370 b/synoconfigs/armada370
--- a/synoconfigs/armada370	2013-08-16 08:07:18.000000000 +0200
+++ b/synoconfigs/armada370	2014-02-17 11:45:16.000000000 +0100
@@ -105,7 +105,7 @@
 CONFIG_TIMERFD=y
 CONFIG_EVENTFD=y
 CONFIG_SHMEM=y
-# CONFIG_AIO is not set
+CONFIG_AIO=y
 CONFIG_EMBEDDED=y
 CONFIG_HAVE_PERF_EVENTS=y
 CONFIG_PERF_USE_VMALLOC=y
@@ -139,6 +139,7 @@
 CONFIG_SYNO_DISPLAY_CPUINFO=y
 CONFIG_SYNO_FIX_MV_CESA_RACE=y
 CONFIG_SYNO_FIX_OCF_CRYPTODEV_RACE=y
+CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE=y
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -200,6 +201,7 @@
 #
 CONFIG_MMU=y
 # CONFIG_ARCH_INTEGRATOR is not set
+# CONFIG_ARCH_COMCERTO is not set
 # CONFIG_ARCH_REALVIEW is not set
 # CONFIG_ARCH_VERSATILE is not set
 # CONFIG_ARCH_VEXPRESS is not set
@@ -308,11 +310,10 @@
 # CONFIG_MV_INCLUDE_PCI is not set
 CONFIG_MV_INCLUDE_USB=y
 CONFIG_MV_INCLUDE_XOR=y
-# CONFIG_MV_INCLUDE_NFC is not set
+CONFIG_MV_INCLUDE_NFC=y
 # CONFIG_MV_INCLUDE_LEGACY_NAND is not set
 CONFIG_MV_INCLUDE_INTEG_SATA=y
-# CONFIG_MV_INCLUDE_NOR is not set
-# CONFIG_MV_INCLUDE_SDIO is not set
+CONFIG_MV_INCLUDE_SDIO=y
 
 #
 # Armada AMP options
@@ -324,6 +325,11 @@
 CONFIG_MV_FLASH_CTRL=y
 CONFIG_MV_INCLUDE_SFLASH_MTD=y
 CONFIG_MV_SPI_BOOT=y
+CONFIG_MTD_NAND_NFC=y
+CONFIG_MTD_NAND_NFC_GANG_SUPPORT=y
+CONFIG_MTD_NAND_NFC_MLC_SUPPORT=y
+CONFIG_MTD_NAND_NFC_INIT_RESET=y
+# CONFIG_MTD_NAND_NFC_NEGLECT_RNB is not set
 CONFIG_MV_USB_HOST=y
 # CONFIG_MV_USB_DEVICE is not set
 # CONFIG_FEROCEON_PROC is not set
@@ -766,7 +772,7 @@
 # CONFIG_NETFILTER_XT_MATCH_IPRANGE is not set
 # CONFIG_NETFILTER_XT_MATCH_LENGTH is not set
 CONFIG_NETFILTER_XT_MATCH_LIMIT=m
-# CONFIG_NETFILTER_XT_MATCH_MAC is not set
+CONFIG_NETFILTER_XT_MATCH_MAC=m
 # CONFIG_NETFILTER_XT_MATCH_MARK is not set
 CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
 # CONFIG_NETFILTER_XT_MATCH_OWNER is not set
@@ -925,10 +931,14 @@
 CONFIG_WIRELESS_EXT=y
 CONFIG_WEXT_CORE=y
 CONFIG_WEXT_PROC=y
+# CONFIG_WEXT_SPY is not set
 CONFIG_WEXT_PRIV=y
 # CONFIG_CFG80211 is not set
 CONFIG_WIRELESS_EXT_SYSFS=y
 # CONFIG_LIB80211 is not set
+# CONFIG_LIB80211_CRYPT_WEP is not set
+# CONFIG_LIB80211_CRYPT_CCMP is not set
+# CONFIG_LIB80211_CRYPT_TKIP is not set
 
 #
 # CFG80211 needs to be enabled for MAC80211
@@ -1050,14 +1060,33 @@
 # CONFIG_MTD_DOC2001 is not set
 # CONFIG_MTD_DOC2001PLUS is not set
 # CONFIG_MTD_DOCG3 is not set
-# CONFIG_MTD_NAND is not set
+CONFIG_MTD_NAND_ECC=y
+# CONFIG_MTD_NAND_ECC_SMC is not set
+CONFIG_MTD_NAND=y
+CONFIG_MTD_NAND_VERIFY_WRITE=y
+# CONFIG_MTD_NAND_ECC_BCH is not set
+# CONFIG_MTD_SM_COMMON is not set
+# CONFIG_MTD_NAND_MUSEUM_IDS is not set
+# CONFIG_MTD_NAND_DENALI is not set
+# CONFIG_MTD_NAND_GPIO is not set
+CONFIG_MTD_NAND_IDS=y
+# CONFIG_MTD_NAND_RICOH is not set
+# CONFIG_MTD_NAND_DISKONCHIP is not set
+# CONFIG_MTD_NAND_CAFE is not set
+# CONFIG_MTD_NAND_NANDSIM is not set
+# CONFIG_MTD_NAND_PLATFORM is not set
+# CONFIG_MTD_ALAUDA is not set
 # CONFIG_MTD_ONENAND is not set
 
 #
 # LPDDR flash memory drivers
 #
 # CONFIG_MTD_LPDDR is not set
-# CONFIG_MTD_UBI is not set
+CONFIG_MTD_UBI=y
+CONFIG_MTD_UBI_WL_THRESHOLD=4096
+CONFIG_MTD_UBI_BEB_LIMIT=20
+# CONFIG_MTD_UBI_FASTMAP is not set
+# CONFIG_MTD_UBI_GLUEBI is not set
 # CONFIG_PARPORT is not set
 CONFIG_BLK_DEV=y
 # CONFIG_BLK_CPQ_DA is not set
@@ -1576,6 +1605,10 @@
 # Other I2C/SMBus bus drivers
 #
 # CONFIG_I2C_STUB is not set
+
+#
+# Miscellaneous I2C Chip support
+#
 # CONFIG_I2C_DEBUG_CORE is not set
 # CONFIG_I2C_DEBUG_ALGO is not set
 # CONFIG_I2C_DEBUG_BUS is not set
@@ -1842,6 +1875,7 @@
 # CONFIG_EZX_PCAP is not set
 # CONFIG_MFD_TIMBERDALE is not set
 # CONFIG_LPC_SCH is not set
+# CONFIG_LPC_ICH is not set
 # CONFIG_MFD_RDC321X is not set
 # CONFIG_MFD_JANZ_CMODIO is not set
 # CONFIG_MFD_VX855 is not set
@@ -1979,6 +2013,7 @@
 # CONFIG_USB_WHCI_HCD is not set
 # CONFIG_USB_HWA_HCD is not set
 # CONFIG_SYNO_XHCI_RING_EXPANSION is not set
+CONFIG_USB_MARVELL_ERRATA_FE_9049667=y
 
 #
 # USB Device Class drivers
@@ -2108,7 +2143,32 @@
 # CONFIG_USB_ULPI is not set
 # CONFIG_NOP_USB_XCEIV is not set
 # CONFIG_UWB is not set
-# CONFIG_MMC is not set
+CONFIG_MMC=y
+# CONFIG_MMC_DEBUG is not set
+# CONFIG_MMC_UNSAFE_RESUME is not set
+# CONFIG_MMC_CLKGATE is not set
+
+#
+# MMC/SD/SDIO Card Drivers
+#
+CONFIG_MMC_BLOCK=y
+CONFIG_MMC_BLOCK_MINORS=8
+CONFIG_MMC_BLOCK_BOUNCE=y
+# CONFIG_SDIO_UART is not set
+# CONFIG_MMC_TEST is not set
+
+#
+# MMC/SD/SDIO Host Controller Drivers
+#
+# CONFIG_MMC_SDHCI is not set
+# CONFIG_MMC_TIFM_SD is not set
+CONFIG_MMC_MVSDIO=y
+# CONFIG_MMC_SPI is not set
+# CONFIG_MMC_CB710 is not set
+# CONFIG_MMC_VIA_SDMMC is not set
+# CONFIG_MMC_DW is not set
+# CONFIG_MMC_VUB300 is not set
+# CONFIG_MMC_USHC is not set
 # CONFIG_MEMSTICK is not set
 CONFIG_NEW_LEDS=y
 CONFIG_LEDS_CLASS=y
@@ -2169,8 +2229,7 @@
 # CONFIG_VME_BUS is not set
 # CONFIG_DX_SEP is not set
 # CONFIG_IIO is not set
-# CONFIG_XVMALLOC is not set
-# CONFIG_ZRAM is not set
+# CONFIG_ZSMALLOC is not set
 # CONFIG_CRYSTALHD is not set
 # CONFIG_USB_ENESTORAGE is not set
 # CONFIG_BCM_WIMAX is not set
@@ -2221,7 +2280,12 @@
 # CONFIG_GFS2_FS is not set
 # CONFIG_OCFS2_FS is not set
 CONFIG_BTRFS_FS=m
+CONFIG_BTRFS_FS_SYNO_ACL=y
 # CONFIG_BTRFS_FS_POSIX_ACL is not set
+# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set
+# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set
+# CONFIG_BTRFS_DEBUG is not set
+# CONFIG_BTRFS_ASSERT is not set
 # CONFIG_NILFS2_FS is not set
 CONFIG_EXPORTFS=m
 CONFIG_FILE_LOCKING=y
@@ -2293,6 +2357,10 @@
 # CONFIG_BFS_FS is not set
 # CONFIG_EFS_FS is not set
 # CONFIG_JFFS2_FS is not set
+CONFIG_UBIFS_FS=y
+CONFIG_UBIFS_FS_ADVANCED_COMPR=y
+CONFIG_UBIFS_FS_LZO=y
+CONFIG_UBIFS_FS_ZLIB=y
 # CONFIG_LOGFS is not set
 # CONFIG_CRAMFS is not set
 # CONFIG_SQUASHFS is not set
@@ -2500,26 +2568,26 @@
 #
 # Crypto core or helper
 #
-CONFIG_CRYPTO_ALGAPI=m
-CONFIG_CRYPTO_ALGAPI2=m
-CONFIG_CRYPTO_AEAD=m
-CONFIG_CRYPTO_AEAD2=m
-CONFIG_CRYPTO_BLKCIPHER=m
-CONFIG_CRYPTO_BLKCIPHER2=m
-CONFIG_CRYPTO_HASH=m
-CONFIG_CRYPTO_HASH2=m
-CONFIG_CRYPTO_RNG=m
-CONFIG_CRYPTO_RNG2=m
-CONFIG_CRYPTO_PCOMP2=m
-CONFIG_CRYPTO_MANAGER=m
-CONFIG_CRYPTO_MANAGER2=m
+CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_ALGAPI2=y
+CONFIG_CRYPTO_AEAD=y
+CONFIG_CRYPTO_AEAD2=y
+CONFIG_CRYPTO_BLKCIPHER=y
+CONFIG_CRYPTO_BLKCIPHER2=y
+CONFIG_CRYPTO_HASH=y
+CONFIG_CRYPTO_HASH2=y
+CONFIG_CRYPTO_RNG=y
+CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP2=y
+CONFIG_CRYPTO_MANAGER=y
+CONFIG_CRYPTO_MANAGER2=y
 # CONFIG_CRYPTO_USER is not set
 CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y
 # CONFIG_CRYPTO_GF128MUL is not set
 # CONFIG_CRYPTO_NULL is not set
-CONFIG_CRYPTO_WORKQUEUE=m
+CONFIG_CRYPTO_WORKQUEUE=y
 # CONFIG_CRYPTO_CRYPTD is not set
-CONFIG_CRYPTO_AUTHENC=m
+CONFIG_CRYPTO_AUTHENC=y
 # CONFIG_CRYPTO_TEST is not set
 
 #
@@ -2532,10 +2600,10 @@
 #
 # Block modes
 #
-CONFIG_CRYPTO_CBC=m
+CONFIG_CRYPTO_CBC=y
 # CONFIG_CRYPTO_CTR is not set
-CONFIG_CRYPTO_CTS=m
-CONFIG_CRYPTO_ECB=m
+CONFIG_CRYPTO_CTS=y
+CONFIG_CRYPTO_ECB=y
 # CONFIG_CRYPTO_LRW is not set
 # CONFIG_CRYPTO_PCBC is not set
 # CONFIG_CRYPTO_XTS is not set
@@ -2543,39 +2611,39 @@
 #
 # Hash modes
 #
-CONFIG_CRYPTO_HMAC=m
+CONFIG_CRYPTO_HMAC=y
 # CONFIG_CRYPTO_XCBC is not set
 # CONFIG_CRYPTO_VMAC is not set
 
 #
 # Digest
 #
-CONFIG_CRYPTO_CRC32C=m
+CONFIG_CRYPTO_CRC32C=y
 # CONFIG_CRYPTO_GHASH is not set
-CONFIG_CRYPTO_MD4=m
-CONFIG_CRYPTO_MD5=m
+CONFIG_CRYPTO_MD4=y
+CONFIG_CRYPTO_MD5=y
 # CONFIG_CRYPTO_MICHAEL_MIC is not set
 # CONFIG_CRYPTO_RMD128 is not set
 # CONFIG_CRYPTO_RMD160 is not set
 # CONFIG_CRYPTO_RMD256 is not set
 # CONFIG_CRYPTO_RMD320 is not set
-CONFIG_CRYPTO_SHA1=m
-CONFIG_CRYPTO_SHA256=m
-CONFIG_CRYPTO_SHA512=m
+CONFIG_CRYPTO_SHA1=y
+CONFIG_CRYPTO_SHA256=y
+CONFIG_CRYPTO_SHA512=y
 # CONFIG_CRYPTO_TGR192 is not set
 # CONFIG_CRYPTO_WP512 is not set
 
 #
 # Ciphers
 #
-CONFIG_CRYPTO_AES=m
+CONFIG_CRYPTO_AES=y
 # CONFIG_CRYPTO_ANUBIS is not set
-CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_ARC4=y
 # CONFIG_CRYPTO_BLOWFISH is not set
 # CONFIG_CRYPTO_CAMELLIA is not set
 # CONFIG_CRYPTO_CAST5 is not set
 # CONFIG_CRYPTO_CAST6 is not set
-CONFIG_CRYPTO_DES=m
+CONFIG_CRYPTO_DES=y
 # CONFIG_CRYPTO_FCRYPT is not set
 # CONFIG_CRYPTO_KHAZAD is not set
 # CONFIG_CRYPTO_SALSA20 is not set
@@ -2587,14 +2655,14 @@
 #
 # Compression
 #
-CONFIG_CRYPTO_DEFLATE=m
+CONFIG_CRYPTO_DEFLATE=y
 # CONFIG_CRYPTO_ZLIB is not set
-# CONFIG_CRYPTO_LZO is not set
+CONFIG_CRYPTO_LZO=y
 
 #
 # Random Number Generation
 #
-CONFIG_CRYPTO_ANSI_CPRNG=m
+CONFIG_CRYPTO_ANSI_CPRNG=y
 # CONFIG_CRYPTO_USER_API_HASH is not set
 # CONFIG_CRYPTO_USER_API_SKCIPHER is not set
 # CONFIG_CRYPTO_HW is not set
@@ -2619,6 +2687,10 @@
 # CONFIG_OCF_UBSEC_SSB is not set
 # CONFIG_OCF_OCFNULL is not set
 # CONFIG_OCF_BENCH is not set
+
+#
+# OCF Configuration
+#
 # CONFIG_BINARY_PRINTF is not set
 
 #
@@ -2626,18 +2698,18 @@
 #
 CONFIG_RAID6_PQ=y
 CONFIG_BITREVERSE=y
-CONFIG_CRC_CCITT=m
+CONFIG_CRC_CCITT=y
 CONFIG_CRC16=y
 CONFIG_CRC_T10DIF=y
-CONFIG_CRC_ITU_T=m
+CONFIG_CRC_ITU_T=y
 CONFIG_CRC32=y
 # CONFIG_CRC7 is not set
-CONFIG_LIBCRC32C=m
+CONFIG_LIBCRC32C=y
 # CONFIG_CRC8 is not set
 CONFIG_ZLIB_INFLATE=y
-CONFIG_ZLIB_DEFLATE=m
-CONFIG_LZO_COMPRESS=m
-CONFIG_LZO_DECOMPRESS=m
+CONFIG_ZLIB_DEFLATE=y
+CONFIG_LZO_COMPRESS=y
+CONFIG_LZO_DECOMPRESS=y
 CONFIG_XZ_DEC=y
 CONFIG_XZ_DEC_X86=y
 CONFIG_XZ_DEC_POWERPC=y
@@ -2650,6 +2722,10 @@
 CONFIG_DECOMPRESS_GZIP=y
 CONFIG_DECOMPRESS_LZMA=y
 CONFIG_DECOMPRESS_XZ=y
+# CONFIG_TEXTSEARCH is not set
+# CONFIG_TEXTSEARCH_KMP is not set
+# CONFIG_TEXTSEARCH_BM is not set
+# CONFIG_TEXTSEARCH_FSM is not set
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
diff -ur a/synoconfigs/armadaxp b/synoconfigs/armadaxp
--- a/synoconfigs/armadaxp	2013-08-24 11:29:13.000000000 +0200
+++ b/synoconfigs/armadaxp	2014-02-17 11:45:15.000000000 +0100
@@ -109,7 +109,7 @@
 CONFIG_TIMERFD=y
 CONFIG_EVENTFD=y
 CONFIG_SHMEM=y
-# CONFIG_AIO is not set
+CONFIG_AIO=y
 CONFIG_EMBEDDED=y
 CONFIG_HAVE_PERF_EVENTS=y
 CONFIG_PERF_USE_VMALLOC=y
@@ -144,6 +144,8 @@
 #
 CONFIG_SYNO_DISPLAY_CPUINFO=y
 CONFIG_SYNO_FIX_MV_CESA_RACE=y
+CONFIG_SYNO_FIX_OCF_CRYPTODEV_RACE=y
+CONFIG_SYNO_FIX_OCF_CRYPTOSOFT_RACE=y
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -206,6 +208,7 @@
 #
 CONFIG_MMU=y
 # CONFIG_ARCH_INTEGRATOR is not set
+# CONFIG_ARCH_COMCERTO is not set
 # CONFIG_ARCH_REALVIEW is not set
 # CONFIG_ARCH_VERSATILE is not set
 # CONFIG_ARCH_VEXPRESS is not set
@@ -376,7 +379,31 @@
 #
 # BM configuration
 #
-# CONFIG_MV_ETH_BM is not set
+CONFIG_MV_ETH_BM=y
+CONFIG_MV_ETH_BM_CPU=y
+CONFIG_MV_ETH_BM_0_PKT_SIZE=0
+CONFIG_MV_ETH_BM_1_PKT_SIZE=0
+CONFIG_MV_ETH_BM_2_PKT_SIZE=256
+CONFIG_MV_ETH_BM_3_PKT_SIZE=256
+CONFIG_MV_ETH_BM_PORT_0=y
+CONFIG_MV_ETH_BM_PORT_0_LONG_POOL=0
+CONFIG_MV_ETH_BM_PORT_0_SHORT_POOL=2
+CONFIG_MV_ETH_BM_PORT_0_LONG_BUF_NUM=2048
+CONFIG_MV_ETH_BM_PORT_0_SHORT_BUF_NUM=3072
+CONFIG_MV_ETH_BM_PORT_1=y
+CONFIG_MV_ETH_BM_PORT_1_LONG_POOL=1
+CONFIG_MV_ETH_BM_PORT_1_SHORT_POOL=3
+CONFIG_MV_ETH_BM_PORT_1_LONG_BUF_NUM=2048
+CONFIG_MV_ETH_BM_PORT_1_SHORT_BUF_NUM=3072
+CONFIG_MV_ETH_BM_PORT_2=y
+CONFIG_MV_ETH_BM_PORT_2_LONG_POOL=2
+CONFIG_MV_ETH_BM_PORT_2_SHORT_POOL=3
+CONFIG_MV_ETH_BM_PORT_2_LONG_BUF_NUM=2048
+CONFIG_MV_ETH_BM_PORT_2_SHORT_BUF_NUM=3072
+CONFIG_MV_ETH_BM_PORT_3=y
+CONFIG_MV_ETH_BM_PORT_3_LONG_POOL=3
+CONFIG_MV_ETH_BM_PORT_3_SHORT_POOL=3
+CONFIG_MV_ETH_BM_PORT_3_LONG_BUF_NUM=2048
 CONFIG_MV_ETH_LEGACY_PARSER=y
 # CONFIG_MV_ETH_PNC is not set
 # CONFIG_MV_ETH_PMT is not set
@@ -831,7 +858,7 @@
 # CONFIG_NETFILTER_XT_MATCH_IPRANGE is not set
 # CONFIG_NETFILTER_XT_MATCH_LENGTH is not set
 CONFIG_NETFILTER_XT_MATCH_LIMIT=m
-# CONFIG_NETFILTER_XT_MATCH_MAC is not set
+CONFIG_NETFILTER_XT_MATCH_MAC=m
 # CONFIG_NETFILTER_XT_MATCH_MARK is not set
 CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
 # CONFIG_NETFILTER_XT_MATCH_OWNER is not set
@@ -993,10 +1020,14 @@
 CONFIG_WIRELESS_EXT=y
 CONFIG_WEXT_CORE=y
 CONFIG_WEXT_PROC=y
+# CONFIG_WEXT_SPY is not set
 CONFIG_WEXT_PRIV=y
 # CONFIG_CFG80211 is not set
 CONFIG_WIRELESS_EXT_SYSFS=y
 # CONFIG_LIB80211 is not set
+# CONFIG_LIB80211_CRYPT_WEP is not set
+# CONFIG_LIB80211_CRYPT_CCMP is not set
+# CONFIG_LIB80211_CRYPT_TKIP is not set
 
 #
 # CFG80211 needs to be enabled for MAC80211
@@ -1648,6 +1679,10 @@
 # Other I2C/SMBus bus drivers
 #
 # CONFIG_I2C_STUB is not set
+
+#
+# Miscellaneous I2C Chip support
+#
 # CONFIG_I2C_DEBUG_CORE is not set
 # CONFIG_I2C_DEBUG_ALGO is not set
 # CONFIG_I2C_DEBUG_BUS is not set
@@ -1894,6 +1929,7 @@
 # CONFIG_EZX_PCAP is not set
 # CONFIG_MFD_TIMBERDALE is not set
 # CONFIG_LPC_SCH is not set
+# CONFIG_LPC_ICH is not set
 # CONFIG_MFD_RDC321X is not set
 # CONFIG_MFD_JANZ_CMODIO is not set
 # CONFIG_MFD_VX855 is not set
@@ -2032,6 +2068,7 @@
 # CONFIG_USB_WHCI_HCD is not set
 # CONFIG_USB_HWA_HCD is not set
 # CONFIG_SYNO_XHCI_RING_EXPANSION is not set
+CONFIG_USB_MARVELL_ERRATA_FE_9049667=y
 
 #
 # USB Device Class drivers
@@ -2240,8 +2277,7 @@
 # CONFIG_VME_BUS is not set
 # CONFIG_DX_SEP is not set
 # CONFIG_IIO is not set
-# CONFIG_XVMALLOC is not set
-# CONFIG_ZRAM is not set
+# CONFIG_ZSMALLOC is not set
 # CONFIG_CRYSTALHD is not set
 # CONFIG_USB_ENESTORAGE is not set
 # CONFIG_BCM_WIMAX is not set
@@ -2293,7 +2329,12 @@
 # CONFIG_GFS2_FS is not set
 # CONFIG_OCFS2_FS is not set
 CONFIG_BTRFS_FS=m
+CONFIG_BTRFS_FS_SYNO_ACL=y
 # CONFIG_BTRFS_FS_POSIX_ACL is not set
+# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set
+# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set
+# CONFIG_BTRFS_DEBUG is not set
+# CONFIG_BTRFS_ASSERT is not set
 # CONFIG_NILFS2_FS is not set
 CONFIG_EXPORTFS=m
 CONFIG_FILE_LOCKING=y
@@ -2574,27 +2615,27 @@
 #
 # Crypto core or helper
 #
-CONFIG_CRYPTO_ALGAPI=m
-CONFIG_CRYPTO_ALGAPI2=m
-CONFIG_CRYPTO_AEAD=m
-CONFIG_CRYPTO_AEAD2=m
-CONFIG_CRYPTO_BLKCIPHER=m
-CONFIG_CRYPTO_BLKCIPHER2=m
-CONFIG_CRYPTO_HASH=m
-CONFIG_CRYPTO_HASH2=m
-CONFIG_CRYPTO_RNG=m
-CONFIG_CRYPTO_RNG2=m
-CONFIG_CRYPTO_PCOMP2=m
-CONFIG_CRYPTO_MANAGER=m
-CONFIG_CRYPTO_MANAGER2=m
+CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_ALGAPI2=y
+CONFIG_CRYPTO_AEAD=y
+CONFIG_CRYPTO_AEAD2=y
+CONFIG_CRYPTO_BLKCIPHER=y
+CONFIG_CRYPTO_BLKCIPHER2=y
+CONFIG_CRYPTO_HASH=y
+CONFIG_CRYPTO_HASH2=y
+CONFIG_CRYPTO_RNG=y
+CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP2=y
+CONFIG_CRYPTO_MANAGER=y
+CONFIG_CRYPTO_MANAGER2=y
 # CONFIG_CRYPTO_USER is not set
 CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y
 # CONFIG_CRYPTO_GF128MUL is not set
 # CONFIG_CRYPTO_NULL is not set
 # CONFIG_CRYPTO_PCRYPT is not set
-CONFIG_CRYPTO_WORKQUEUE=m
+CONFIG_CRYPTO_WORKQUEUE=y
 # CONFIG_CRYPTO_CRYPTD is not set
-CONFIG_CRYPTO_AUTHENC=m
+CONFIG_CRYPTO_AUTHENC=y
 # CONFIG_CRYPTO_TEST is not set
 
 #
@@ -2607,10 +2648,10 @@
 #
 # Block modes
 #
-CONFIG_CRYPTO_CBC=m
+CONFIG_CRYPTO_CBC=y
 # CONFIG_CRYPTO_CTR is not set
-CONFIG_CRYPTO_CTS=m
-CONFIG_CRYPTO_ECB=m
+CONFIG_CRYPTO_CTS=y
+CONFIG_CRYPTO_ECB=y
 # CONFIG_CRYPTO_LRW is not set
 # CONFIG_CRYPTO_PCBC is not set
 # CONFIG_CRYPTO_XTS is not set
@@ -2618,39 +2659,39 @@
 #
 # Hash modes
 #
-CONFIG_CRYPTO_HMAC=m
+CONFIG_CRYPTO_HMAC=y
 # CONFIG_CRYPTO_XCBC is not set
 # CONFIG_CRYPTO_VMAC is not set
 
 #
 # Digest
 #
-CONFIG_CRYPTO_CRC32C=m
+CONFIG_CRYPTO_CRC32C=y
 # CONFIG_CRYPTO_GHASH is not set
-CONFIG_CRYPTO_MD4=m
-CONFIG_CRYPTO_MD5=m
+CONFIG_CRYPTO_MD4=y
+CONFIG_CRYPTO_MD5=y
 # CONFIG_CRYPTO_MICHAEL_MIC is not set
 # CONFIG_CRYPTO_RMD128 is not set
 # CONFIG_CRYPTO_RMD160 is not set
 # CONFIG_CRYPTO_RMD256 is not set
 # CONFIG_CRYPTO_RMD320 is not set
-CONFIG_CRYPTO_SHA1=m
-CONFIG_CRYPTO_SHA256=m
-CONFIG_CRYPTO_SHA512=m
+CONFIG_CRYPTO_SHA1=y
+CONFIG_CRYPTO_SHA256=y
+CONFIG_CRYPTO_SHA512=y
 # CONFIG_CRYPTO_TGR192 is not set
 # CONFIG_CRYPTO_WP512 is not set
 
 #
 # Ciphers
 #
-CONFIG_CRYPTO_AES=m
+CONFIG_CRYPTO_AES=y
 # CONFIG_CRYPTO_ANUBIS is not set
-CONFIG_CRYPTO_ARC4=m
+CONFIG_CRYPTO_ARC4=y
 # CONFIG_CRYPTO_BLOWFISH is not set
 # CONFIG_CRYPTO_CAMELLIA is not set
 # CONFIG_CRYPTO_CAST5 is not set
 # CONFIG_CRYPTO_CAST6 is not set
-CONFIG_CRYPTO_DES=m
+CONFIG_CRYPTO_DES=y
 # CONFIG_CRYPTO_FCRYPT is not set
 # CONFIG_CRYPTO_KHAZAD is not set
 # CONFIG_CRYPTO_SALSA20 is not set
@@ -2662,14 +2703,14 @@
 #
 # Compression
 #
-CONFIG_CRYPTO_DEFLATE=m
+CONFIG_CRYPTO_DEFLATE=y
 # CONFIG_CRYPTO_ZLIB is not set
 # CONFIG_CRYPTO_LZO is not set
 
 #
 # Random Number Generation
 #
-CONFIG_CRYPTO_ANSI_CPRNG=m
+CONFIG_CRYPTO_ANSI_CPRNG=y
 # CONFIG_CRYPTO_USER_API_HASH is not set
 # CONFIG_CRYPTO_USER_API_SKCIPHER is not set
 # CONFIG_CRYPTO_HW is not set
@@ -2694,6 +2735,10 @@
 # CONFIG_OCF_UBSEC_SSB is not set
 # CONFIG_OCF_OCFNULL is not set
 # CONFIG_OCF_BENCH is not set
+
+#
+# OCF Configuration
+#
 # CONFIG_BINARY_PRINTF is not set
 
 #
@@ -2701,16 +2746,16 @@
 #
 CONFIG_RAID6_PQ=y
 CONFIG_BITREVERSE=y
-CONFIG_CRC_CCITT=m
+CONFIG_CRC_CCITT=y
 CONFIG_CRC16=y
 CONFIG_CRC_T10DIF=y
-CONFIG_CRC_ITU_T=m
+CONFIG_CRC_ITU_T=y
 CONFIG_CRC32=y
 # CONFIG_CRC7 is not set
-CONFIG_LIBCRC32C=m
+CONFIG_LIBCRC32C=y
 # CONFIG_CRC8 is not set
 CONFIG_ZLIB_INFLATE=y
-CONFIG_ZLIB_DEFLATE=m
+CONFIG_ZLIB_DEFLATE=y
 CONFIG_LZO_COMPRESS=m
 CONFIG_LZO_DECOMPRESS=m
 CONFIG_XZ_DEC=y
@@ -2725,6 +2770,10 @@
 CONFIG_DECOMPRESS_GZIP=y
 CONFIG_DECOMPRESS_LZMA=y
 CONFIG_DECOMPRESS_XZ=y
+# CONFIG_TEXTSEARCH is not set
+# CONFIG_TEXTSEARCH_KMP is not set
+# CONFIG_TEXTSEARCH_BM is not set
+# CONFIG_TEXTSEARCH_FSM is not set
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
Only in b/synoconfigs: avoton.
diff -ur a/synoconfigs/bromolow b/synoconfigs/bromolow
--- a/synoconfigs/bromolow	2013-08-16 08:07:18.000000000 +0200
+++ b/synoconfigs/bromolow	2014-01-21 09:37:34.000000000 +0100
@@ -10,7 +10,8 @@
 CONFIG_SYNO_X64=y
 CONFIG_SYNO_BROMOLOW=y
 # CONFIG_SYNO_CEDARVIEW is not set
-# CONFIG_SYNO_AMD_RICHLAND is not set
+# CONFIG_SYNO_AVOTON is not set
+# CONFIG_SYNO_KVMX64 is not set
 CONFIG_OUTPUT_FORMAT="elf64-x86-64"
 CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
 CONFIG_GENERIC_CMOS_UPDATE=y
@@ -80,7 +81,8 @@
 CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
-# CONFIG_POSIX_MQUEUE is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_FHANDLE is not set
 # CONFIG_TASKSTATS is not set
@@ -145,7 +147,7 @@
 CONFIG_TIMERFD=y
 CONFIG_EVENTFD=y
 CONFIG_SHMEM=y
-# CONFIG_AIO is not set
+CONFIG_AIO=y
 CONFIG_EMBEDDED=y
 CONFIG_HAVE_PERF_EVENTS=y
 
@@ -185,8 +187,14 @@
 #
 # GCOV-based kernel profiling
 #
+
+#
+# Synology special config
+#
 CONFIG_SYNO_ADT7490_FEATURES=y
 CONFIG_SYNO_DISPLAY_CPUINFO=y
+CONFIG_SYNO_EFI=y
+CONFIG_SYNO_DUAL_HEAD=y
 # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -351,7 +359,7 @@
 # CONFIG_MTRR_SANITIZER is not set
 # CONFIG_X86_PAT is not set
 CONFIG_ARCH_RANDOM=y
-# CONFIG_EFI is not set
+CONFIG_EFI=y
 # CONFIG_SECCOMP is not set
 # CONFIG_CC_STACKPROTECTOR is not set
 # CONFIG_HZ_100 is not set
@@ -640,7 +648,7 @@
 # CONFIG_NETFILTER_XT_MATCH_IPRANGE is not set
 # CONFIG_NETFILTER_XT_MATCH_LENGTH is not set
 CONFIG_NETFILTER_XT_MATCH_LIMIT=m
-# CONFIG_NETFILTER_XT_MATCH_MAC is not set
+CONFIG_NETFILTER_XT_MATCH_MAC=m
 # CONFIG_NETFILTER_XT_MATCH_MARK is not set
 CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
 # CONFIG_NETFILTER_XT_MATCH_OWNER is not set
@@ -804,10 +812,14 @@
 CONFIG_WIRELESS_EXT=y
 CONFIG_WEXT_CORE=y
 CONFIG_WEXT_PROC=y
+# CONFIG_WEXT_SPY is not set
 CONFIG_WEXT_PRIV=y
 # CONFIG_CFG80211 is not set
 CONFIG_WIRELESS_EXT_SYSFS=y
 # CONFIG_LIB80211 is not set
+# CONFIG_LIB80211_CRYPT_WEP is not set
+# CONFIG_LIB80211_CRYPT_CCMP is not set
+# CONFIG_LIB80211_CRYPT_TKIP is not set
 
 #
 # CFG80211 needs to be enabled for MAC80211
@@ -1170,7 +1182,6 @@
 # CAIF transport drivers
 #
 CONFIG_ETHERNET=y
-CONFIG_MDIO=m
 CONFIG_NET_VENDOR_3COM=y
 # CONFIG_VORTEX is not set
 # CONFIG_TYPHOON is not set
@@ -1221,8 +1232,7 @@
 # CONFIG_IGB is not set
 # CONFIG_IGBVF is not set
 # CONFIG_IXGB is not set
-CONFIG_IXGBE=m
-CONFIG_IXGBE_DCA=y
+# CONFIG_IXGBE is not set
 # CONFIG_IXGBEVF is not set
 CONFIG_NET_VENDOR_I825XX=y
 # CONFIG_ZNET is not set
@@ -1513,6 +1523,10 @@
 # Other I2C/SMBus bus drivers
 #
 # CONFIG_I2C_STUB is not set
+
+#
+# Miscellaneous I2C Chip support
+#
 # CONFIG_I2C_DEBUG_CORE is not set
 # CONFIG_I2C_DEBUG_ALGO is not set
 # CONFIG_I2C_DEBUG_BUS is not set
@@ -1727,6 +1741,7 @@
 # CONFIG_EZX_PCAP is not set
 # CONFIG_MFD_CS5535 is not set
 # CONFIG_LPC_SCH is not set
+# CONFIG_LPC_ICH is not set
 # CONFIG_MFD_RDC321X is not set
 # CONFIG_MFD_JANZ_CMODIO is not set
 # CONFIG_MFD_VX855 is not set
@@ -1854,7 +1869,26 @@
 CONFIG_SOUND=m
 CONFIG_SOUND_OSS_CORE=y
 CONFIG_SOUND_OSS_CORE_PRECLAIM=y
-# CONFIG_SND is not set
+CONFIG_SND=m
+# CONFIG_SND_SEQUENCER is not set
+# CONFIG_SND_MIXER_OSS is not set
+# CONFIG_SND_PCM_OSS is not set
+# CONFIG_SND_DYNAMIC_MINORS is not set
+CONFIG_SND_SUPPORT_OLD_API=y
+# CONFIG_SND_VERBOSE_PROCFS is not set
+# CONFIG_SND_VERBOSE_PRINTK is not set
+# CONFIG_SND_DEBUG is not set
+CONFIG_SND_DMA_SGBUF=y
+# CONFIG_SND_RAWMIDI_SEQ is not set
+# CONFIG_SND_OPL3_LIB_SEQ is not set
+# CONFIG_SND_OPL4_LIB_SEQ is not set
+# CONFIG_SND_SBAWE_SEQ is not set
+# CONFIG_SND_EMU10K1_SEQ is not set
+# CONFIG_SND_DRIVERS is not set
+# CONFIG_SND_PCI is not set
+# CONFIG_SND_SPI is not set
+# CONFIG_SND_USB is not set
+# CONFIG_SND_SOC is not set
 CONFIG_SOUND_PRIME=m
 # CONFIG_SOUND_OSS is not set
 CONFIG_HID_SUPPORT=y
@@ -1883,6 +1917,7 @@
 # CONFIG_HID_BELKIN is not set
 # CONFIG_HID_CHERRY is not set
 # CONFIG_HID_CHICONY is not set
+# CONFIG_HID_PRODIKEYS is not set
 # CONFIG_HID_CYPRESS is not set
 # CONFIG_HID_DRAGONRISE is not set
 # CONFIG_HID_EMS_FF is not set
@@ -2244,6 +2279,7 @@
 # CONFIG_TRANZPORT is not set
 # CONFIG_POHMELFS is not set
 # CONFIG_IDE_PHISON is not set
+# CONFIG_LINE6_USB is not set
 # CONFIG_DRM_NOUVEAU is not set
 
 #
@@ -2258,8 +2294,9 @@
 # CONFIG_VME_BUS is not set
 # CONFIG_DX_SEP is not set
 # CONFIG_IIO is not set
-# CONFIG_XVMALLOC is not set
-# CONFIG_ZRAM is not set
+CONFIG_ZSMALLOC=y
+CONFIG_ZRAM=m
+# CONFIG_ZRAM_DEBUG is not set
 # CONFIG_FB_SM7XX is not set
 # CONFIG_CRYSTALHD is not set
 # CONFIG_FB_XGI is not set
@@ -2313,6 +2350,7 @@
 #
 # CONFIG_EDD is not set
 CONFIG_FIRMWARE_MEMMAP=y
+CONFIG_EFI_VARS=y
 # CONFIG_DELL_RBU is not set
 # CONFIG_DCDBAS is not set
 CONFIG_DMIID=y
@@ -2351,7 +2389,12 @@
 # CONFIG_GFS2_FS is not set
 # CONFIG_OCFS2_FS is not set
 CONFIG_BTRFS_FS=m
+CONFIG_BTRFS_FS_SYNO_ACL=y
 # CONFIG_BTRFS_FS_POSIX_ACL is not set
+# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set
+# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set
+# CONFIG_BTRFS_DEBUG is not set
+# CONFIG_BTRFS_ASSERT is not set
 # CONFIG_NILFS2_FS is not set
 CONFIG_EXPORTFS=m
 CONFIG_FILE_LOCKING=y
@@ -2774,6 +2817,10 @@
 # OCF Configuration
 #
 # CONFIG_OCF_OCF is not set
+
+#
+# OCF Configuration
+#
 CONFIG_HAVE_KVM=y
 # CONFIG_VIRTUALIZATION is not set
 # CONFIG_BINARY_PRINTF is not set
@@ -2800,6 +2847,10 @@
 # CONFIG_XZ_DEC_BCJ is not set
 CONFIG_DECOMPRESS_GZIP=y
 CONFIG_DECOMPRESS_LZMA=y
+# CONFIG_TEXTSEARCH is not set
+# CONFIG_TEXTSEARCH_KMP is not set
+# CONFIG_TEXTSEARCH_BM is not set
+# CONFIG_TEXTSEARCH_FSM is not set
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
Only in b/synoconfigs: bromolowhost.
diff -ur a/synoconfigs/cedarview b/synoconfigs/cedarview
--- a/synoconfigs/cedarview	2013-08-16 08:07:18.000000000 +0200
+++ b/synoconfigs/cedarview	2014-01-21 09:37:34.000000000 +0100
@@ -10,7 +10,8 @@
 CONFIG_SYNO_X64=y
 # CONFIG_SYNO_BROMOLOW is not set
 CONFIG_SYNO_CEDARVIEW=y
-# CONFIG_SYNO_AMD_RICHLAND is not set
+# CONFIG_SYNO_AVOTON is not set
+# CONFIG_SYNO_KVMX64 is not set
 CONFIG_OUTPUT_FORMAT="elf64-x86-64"
 CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
 CONFIG_GENERIC_CMOS_UPDATE=y
@@ -80,7 +81,8 @@
 CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
-# CONFIG_POSIX_MQUEUE is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_FHANDLE is not set
 # CONFIG_TASKSTATS is not set
@@ -145,7 +147,7 @@
 CONFIG_TIMERFD=y
 CONFIG_EVENTFD=y
 CONFIG_SHMEM=y
-# CONFIG_AIO is not set
+CONFIG_AIO=y
 CONFIG_EMBEDDED=y
 CONFIG_HAVE_PERF_EVENTS=y
 
@@ -185,6 +187,10 @@
 #
 # GCOV-based kernel profiling
 #
+
+#
+# Synology special config
+#
 CONFIG_SYNO_INCREASE_SIL3132_OUT_SWING=y
 CONFIG_SYNO_DISPLAY_CPUINFO=y
 # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
@@ -635,7 +641,7 @@
 # CONFIG_NETFILTER_XT_MATCH_IPRANGE is not set
 # CONFIG_NETFILTER_XT_MATCH_LENGTH is not set
 CONFIG_NETFILTER_XT_MATCH_LIMIT=m
-# CONFIG_NETFILTER_XT_MATCH_MAC is not set
+CONFIG_NETFILTER_XT_MATCH_MAC=m
 # CONFIG_NETFILTER_XT_MATCH_MARK is not set
 CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
 # CONFIG_NETFILTER_XT_MATCH_OWNER is not set
@@ -799,10 +805,14 @@
 CONFIG_WIRELESS_EXT=y
 CONFIG_WEXT_CORE=y
 CONFIG_WEXT_PROC=y
+# CONFIG_WEXT_SPY is not set
 CONFIG_WEXT_PRIV=y
 # CONFIG_CFG80211 is not set
 CONFIG_WIRELESS_EXT_SYSFS=y
 # CONFIG_LIB80211 is not set
+# CONFIG_LIB80211_CRYPT_WEP is not set
+# CONFIG_LIB80211_CRYPT_CCMP is not set
+# CONFIG_LIB80211_CRYPT_TKIP is not set
 
 #
 # CFG80211 needs to be enabled for MAC80211
@@ -1471,6 +1481,10 @@
 # Other I2C/SMBus bus drivers
 #
 # CONFIG_I2C_STUB is not set
+
+#
+# Miscellaneous I2C Chip support
+#
 # CONFIG_I2C_DEBUG_CORE is not set
 # CONFIG_I2C_DEBUG_ALGO is not set
 # CONFIG_I2C_DEBUG_BUS is not set
@@ -1685,6 +1699,7 @@
 # CONFIG_EZX_PCAP is not set
 # CONFIG_MFD_CS5535 is not set
 CONFIG_LPC_SCH=y
+# CONFIG_LPC_ICH is not set
 # CONFIG_MFD_RDC321X is not set
 # CONFIG_MFD_JANZ_CMODIO is not set
 # CONFIG_MFD_VX855 is not set
@@ -1813,7 +1828,26 @@
 CONFIG_SOUND=m
 CONFIG_SOUND_OSS_CORE=y
 CONFIG_SOUND_OSS_CORE_PRECLAIM=y
-# CONFIG_SND is not set
+CONFIG_SND=m
+# CONFIG_SND_SEQUENCER is not set
+# CONFIG_SND_MIXER_OSS is not set
+# CONFIG_SND_PCM_OSS is not set
+# CONFIG_SND_DYNAMIC_MINORS is not set
+CONFIG_SND_SUPPORT_OLD_API=y
+# CONFIG_SND_VERBOSE_PROCFS is not set
+# CONFIG_SND_VERBOSE_PRINTK is not set
+# CONFIG_SND_DEBUG is not set
+CONFIG_SND_DMA_SGBUF=y
+# CONFIG_SND_RAWMIDI_SEQ is not set
+# CONFIG_SND_OPL3_LIB_SEQ is not set
+# CONFIG_SND_OPL4_LIB_SEQ is not set
+# CONFIG_SND_SBAWE_SEQ is not set
+# CONFIG_SND_EMU10K1_SEQ is not set
+# CONFIG_SND_DRIVERS is not set
+# CONFIG_SND_PCI is not set
+# CONFIG_SND_SPI is not set
+# CONFIG_SND_USB is not set
+# CONFIG_SND_SOC is not set
 CONFIG_SOUND_PRIME=m
 # CONFIG_SOUND_OSS is not set
 CONFIG_HID_SUPPORT=y
@@ -1842,6 +1876,7 @@
 # CONFIG_HID_BELKIN is not set
 # CONFIG_HID_CHERRY is not set
 # CONFIG_HID_CHICONY is not set
+# CONFIG_HID_PRODIKEYS is not set
 # CONFIG_HID_CYPRESS is not set
 # CONFIG_HID_DRAGONRISE is not set
 # CONFIG_HID_EMS_FF is not set
@@ -2184,6 +2219,7 @@
 # CONFIG_TRANZPORT is not set
 # CONFIG_POHMELFS is not set
 # CONFIG_IDE_PHISON is not set
+# CONFIG_LINE6_USB is not set
 # CONFIG_DRM_NOUVEAU is not set
 
 #
@@ -2198,8 +2234,9 @@
 # CONFIG_VME_BUS is not set
 # CONFIG_DX_SEP is not set
 # CONFIG_IIO is not set
-# CONFIG_XVMALLOC is not set
-# CONFIG_ZRAM is not set
+CONFIG_ZSMALLOC=y
+CONFIG_ZRAM=m
+# CONFIG_ZRAM_DEBUG is not set
 # CONFIG_FB_SM7XX is not set
 # CONFIG_CRYSTALHD is not set
 # CONFIG_FB_XGI is not set
@@ -2291,7 +2328,12 @@
 # CONFIG_GFS2_FS is not set
 # CONFIG_OCFS2_FS is not set
 CONFIG_BTRFS_FS=m
+CONFIG_BTRFS_FS_SYNO_ACL=y
 # CONFIG_BTRFS_FS_POSIX_ACL is not set
+# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set
+# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set
+# CONFIG_BTRFS_DEBUG is not set
+# CONFIG_BTRFS_ASSERT is not set
 # CONFIG_NILFS2_FS is not set
 CONFIG_EXPORTFS=m
 CONFIG_FILE_LOCKING=y
@@ -2712,6 +2754,10 @@
 # OCF Configuration
 #
 # CONFIG_OCF_OCF is not set
+
+#
+# OCF Configuration
+#
 CONFIG_HAVE_KVM=y
 # CONFIG_VIRTUALIZATION is not set
 # CONFIG_BINARY_PRINTF is not set
@@ -2738,6 +2784,10 @@
 # CONFIG_XZ_DEC_BCJ is not set
 CONFIG_DECOMPRESS_GZIP=y
 CONFIG_DECOMPRESS_LZMA=y
+# CONFIG_TEXTSEARCH is not set
+# CONFIG_TEXTSEARCH_KMP is not set
+# CONFIG_TEXTSEARCH_BM is not set
+# CONFIG_TEXTSEARCH_FSM is not set
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
Only in b/synoconfigs: comcerto2k.
diff -ur a/synoconfigs/evansport b/synoconfigs/evansport
--- a/synoconfigs/evansport	2013-08-03 09:59:52.000000000 +0200
+++ b/synoconfigs/evansport	2014-01-21 09:37:34.000000000 +0100
@@ -77,7 +77,8 @@
 CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
-# CONFIG_POSIX_MQUEUE is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_FHANDLE is not set
 # CONFIG_TASKSTATS is not set
@@ -190,6 +191,10 @@
 #
 # GCOV-based kernel profiling
 #
+
+#
+# Synology special config
+#
 CONFIG_SYNO_DISPLAY_CPUINFO=y
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
@@ -691,7 +696,7 @@
 # CONFIG_NETFILTER_XT_MATCH_IPRANGE is not set
 # CONFIG_NETFILTER_XT_MATCH_LENGTH is not set
 CONFIG_NETFILTER_XT_MATCH_LIMIT=m
-# CONFIG_NETFILTER_XT_MATCH_MAC is not set
+CONFIG_NETFILTER_XT_MATCH_MAC=m
 # CONFIG_NETFILTER_XT_MATCH_MARK is not set
 CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
 # CONFIG_NETFILTER_XT_MATCH_OWNER is not set
@@ -853,10 +858,14 @@
 CONFIG_WIRELESS_EXT=y
 CONFIG_WEXT_CORE=y
 CONFIG_WEXT_PROC=y
+# CONFIG_WEXT_SPY is not set
 CONFIG_WEXT_PRIV=y
 # CONFIG_CFG80211 is not set
 CONFIG_WIRELESS_EXT_SYSFS=y
 # CONFIG_LIB80211 is not set
+# CONFIG_LIB80211_CRYPT_WEP is not set
+# CONFIG_LIB80211_CRYPT_CCMP is not set
+# CONFIG_LIB80211_CRYPT_TKIP is not set
 
 #
 # CFG80211 needs to be enabled for MAC80211
@@ -1590,6 +1599,10 @@
 #
 # CONFIG_I2C_STUB is not set
 # CONFIG_SCx200_ACB is not set
+
+#
+# Miscellaneous I2C Chip support
+#
 # CONFIG_I2C_DEBUG_CORE is not set
 # CONFIG_I2C_DEBUG_ALGO is not set
 # CONFIG_I2C_DEBUG_BUS is not set
@@ -1866,6 +1879,7 @@
 # CONFIG_MFD_CS5535 is not set
 # CONFIG_MFD_TIMBERDALE is not set
 CONFIG_LPC_SCH=y
+# CONFIG_LPC_ICH is not set
 # CONFIG_MFD_RDC321X is not set
 # CONFIG_MFD_JANZ_CMODIO is not set
 # CONFIG_MFD_VX855 is not set
@@ -2394,8 +2408,9 @@
 # CONFIG_VME_BUS is not set
 # CONFIG_DX_SEP is not set
 # CONFIG_IIO is not set
-# CONFIG_XVMALLOC is not set
-# CONFIG_ZRAM is not set
+CONFIG_ZSMALLOC=y
+CONFIG_ZRAM=m
+# CONFIG_ZRAM_DEBUG is not set
 # CONFIG_FB_SM7XX is not set
 # CONFIG_CRYSTALHD is not set
 # CONFIG_FB_XGI is not set
@@ -2487,7 +2502,12 @@
 # CONFIG_GFS2_FS is not set
 # CONFIG_OCFS2_FS is not set
 CONFIG_BTRFS_FS=m
+CONFIG_BTRFS_FS_SYNO_ACL=y
 # CONFIG_BTRFS_FS_POSIX_ACL is not set
+# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set
+# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set
+# CONFIG_BTRFS_DEBUG is not set
+# CONFIG_BTRFS_ASSERT is not set
 # CONFIG_NILFS2_FS is not set
 CONFIG_EXPORTFS=m
 CONFIG_FILE_LOCKING=y
@@ -2911,6 +2931,10 @@
 # OCF Configuration
 #
 # CONFIG_OCF_OCF is not set
+
+#
+# OCF Configuration
+#
 CONFIG_HAVE_KVM=y
 # CONFIG_VIRTUALIZATION is not set
 # CONFIG_BINARY_PRINTF is not set
@@ -2944,6 +2968,10 @@
 # CONFIG_XZ_DEC_TEST is not set
 CONFIG_DECOMPRESS_GZIP=y
 CONFIG_DECOMPRESS_LZMA=y
+# CONFIG_TEXTSEARCH is not set
+# CONFIG_TEXTSEARCH_KMP is not set
+# CONFIG_TEXTSEARCH_BM is not set
+# CONFIG_TEXTSEARCH_FSM is not set
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
diff -ur a/synoconfigs/Kconfig b/synoconfigs/Kconfig
--- a/synoconfigs/Kconfig	2013-08-24 11:29:13.000000000 +0200
+++ b/synoconfigs/Kconfig	2014-02-17 11:45:15.000000000 +0100
@@ -24,6 +24,13 @@
 	help
 	  Say Y to Support display CPU Infomation
 
+config SYNO_COMCERTO2K_CPU_AFFINITY
+	bool "Set default CPU affinity to core 0"
+	depends on SYNO_COMCERTO
+	default y
+	help
+		Set the default CPU affinity to core 0 for the md and flush threads
+
 config SYNO_FIX_MV_CESA_RACE
 	bool "Fix Marvell's crypto driver race"
 	depends on MV_CESA
@@ -31,4 +38,103 @@
 	help
 		Say Y here to fix Marvell crypto driver race condition
 
+config SYNO_FIX_OCF_CRYPTODEV_RACE
+	bool "Fix OCF's race in cryptodev interface"
+	depends on OCF_OCF
+	default y
+	help
+		Fix OCF race in cryptodev interface
+
+config SYNO_FIX_OCF_CRYPTOSOFT_RACE
+	bool "Fix OCF's race in cryptosoft interface"
+	depends on OCF_OCF
+	default y
+	help
+		Fix OCF race in cryptosoft interface
+
+config SYNO_C2K_UART
+	bool "Support Synology comcerto2k UART0 features"
+	depends on SYNO_COMCERTO
+	default n
+	help
+		If you need Synology comcerto2k UART0 feature support, choose yes.
+		This will modify the UART GPIO.
+
+config SYNO_C2K_NET
+	bool "Apply Synology comcerto2k network modifications"
+	depends on SYNO_COMCERTO
+	default n
+	help
+		If you need the Synology comcerto2k network modifications, choose yes.
+		This will modify the network interface names and PFE data.
+
+config SYNO_C2K_SERIAL_FIX
+	bool "C2K serial workaround"
+	depends on SYNO_COMCERTO
+	help
+	  Say Y to add the C2K serial workaround
+
+config SYNO_C2K_SPI_PARTITION
+	bool "Apply Synology SPI partition on C2K"
+	depends on SYNO_COMCERTO
+	help
+	  Say Y to add the Synology SPI partition
+
+config SYNO_C2K_GPIO_READ_SHIFT
+	bool "Shift the read GPIO value before returning"
+	depends on SYNO_COMCERTO
+	help
+	  Say Y to add GPIO read shift
+
+config SYNO_C2K_REBOOT_POWEROFF_BY_MICROP
+	bool "Use MicroP to implement the reboot and poweroff"
+	depends on SYNO_COMCERTO
+	help
+	  Say Y to add MicroP reboot and poweroff
+
+config SYNO_C2K_WOL_ENABLE
+	bool "Enable WOL function"
+	depends on SYNO_COMCERTO
+	help
+	  Say Y to enable the WOL function
+
+config SYNO_C2K_XOR_RWLOCK
+	bool "Ensure XOR driver reads/writes data correctly"
+	depends on SYNO_COMCERTO && COMCERTO_XOR
+	help
+	  Say Y to ensure the XOR driver reads/writes data correctly
+
+config SYNO_C2K_FIX_DWC_OTG_DEADLOCK
+	bool "Fix dwc_otg deadlock issue"
+	depends on SYNO_COMCERTO && DWC_OTG
+	help
+	  Say Y to fix the dwc_otg deadlock issue
+
+config SYNO_C2K_PCIE_SWITCH_FIX
+	bool "Longer nop delay to avoid PCIe switch blocking"
+	depends on SYNO_COMCERTO
+	help
+	  Say Y to enable a longer nop delay
+
+config SYNO_C2K_USB_VBUS_CONTROL
+	bool "Manually enable/disable the USB VBUS power via GPIO"
+	depends on SYNO_COMCERTO
+	help
+	  Say Y to enable control of the USB VBUS
+
+config SYNO_EFI
+	bool "Enable EFI runtime service"
+	depends on SYNO_BROMOLOW
+	default y
+	help
+	  Enable EFI runtime services for UEFI platforms
+
 endmenu
+
+config SYNO_DUAL_HEAD
+	bool "Synology Dual Head Host"
+	depends on SYNO_BROMOLOW
+	default n
+	help
+	  Dual head hosts use a SATA DOM as the synoboot device. In this model the SATA
+	  controller and SAS expander exist simultaneously.
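For illustration only, a minimal sketch (not part of this patch; the function name and messages are hypothetical) of how bool options like the ones above are typically consumed: each becomes a CONFIG_* preprocessor symbol that gates board-specific code.

#include <linux/init.h>
#include <linux/kernel.h>

/* Hedged sketch, not from this patch: hypothetical init-time hook showing
 * how the Synology Kconfig bools above gate code via their CONFIG_* symbols. */
static void __init syno_c2k_report_quirks(void)
{
#ifdef CONFIG_SYNO_C2K_WOL_ENABLE
	pr_info("syno: WOL enable quirk active\n");
#endif
#ifdef CONFIG_SYNO_C2K_GPIO_READ_SHIFT
	pr_info("syno: GPIO read-shift quirk active\n");
#endif
}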
Only in b/synoconfigs: kvmx64.
diff -ur a/synoconfigs/ppc8533 b/synoconfigs/ppc8533
--- a/synoconfigs/ppc8533	2013-08-03 09:59:52.000000000 +0200
+++ b/synoconfigs/ppc8533	2014-01-21 09:37:34.000000000 +0100
@@ -138,7 +138,7 @@
 # CONFIG_TIMERFD is not set
 CONFIG_EVENTFD=y
 CONFIG_SHMEM=y
-# CONFIG_AIO is not set
+CONFIG_AIO=y
 CONFIG_EMBEDDED=y
 CONFIG_HAVE_PERF_EVENTS=y
 
diff -ur a/synoconfigs/ppc854x b/synoconfigs/ppc854x
--- a/synoconfigs/ppc854x	2013-08-03 09:59:52.000000000 +0200
+++ b/synoconfigs/ppc854x	2014-01-21 09:37:34.000000000 +0100
@@ -138,7 +138,7 @@
 # CONFIG_TIMERFD is not set
 CONFIG_EVENTFD=y
 CONFIG_SHMEM=y
-# CONFIG_AIO is not set
+CONFIG_AIO=y
 CONFIG_EMBEDDED=y
 CONFIG_HAVE_PERF_EVENTS=y
 
Only in a/synoconfigs: richland.
diff -ur a/synoconfigs/x86 b/synoconfigs/x86
--- a/synoconfigs/x86	2013-08-03 09:59:52.000000000 +0200
+++ b/synoconfigs/x86	2014-01-21 09:37:34.000000000 +0100
@@ -74,7 +74,8 @@
 CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
-# CONFIG_POSIX_MQUEUE is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_TASKSTATS is not set
 # CONFIG_AUDIT is not set
@@ -122,7 +123,7 @@
 CONFIG_TIMERFD=y
 CONFIG_EVENTFD=y
 CONFIG_SHMEM=y
-# CONFIG_AIO is not set
+CONFIG_AIO=y
 CONFIG_HAVE_PERF_EVENTS=y
 
 #
diff -ur a/synoconfigs/x86_64 b/synoconfigs/x86_64
--- a/synoconfigs/x86_64	2013-08-03 09:59:52.000000000 +0200
+++ b/synoconfigs/x86_64	2014-01-21 09:37:34.000000000 +0100
@@ -10,7 +10,8 @@
 CONFIG_SYNO_X64=y
 # CONFIG_SYNO_BROMOLOW is not set
 # CONFIG_SYNO_CEDARVIEW is not set
-# CONFIG_SYNO_AMD_RICHLAND is not set
+# CONFIG_SYNO_AVOTON is not set
+# CONFIG_SYNO_KVMX64 is not set
 CONFIG_OUTPUT_FORMAT="elf64-x86-64"
 CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
 CONFIG_GENERIC_CMOS_UPDATE=y
@@ -80,7 +81,8 @@
 CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
-# CONFIG_POSIX_MQUEUE is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_FHANDLE is not set
 # CONFIG_TASKSTATS is not set
@@ -145,7 +147,7 @@
 CONFIG_TIMERFD=y
 CONFIG_EVENTFD=y
 CONFIG_SHMEM=y
-# CONFIG_AIO is not set
+CONFIG_AIO=y
 CONFIG_EMBEDDED=y
 CONFIG_HAVE_PERF_EVENTS=y
 
@@ -185,6 +187,10 @@
 #
 # GCOV-based kernel profiling
 #
+
+#
+# Synology special config
+#
 CONFIG_SYNO_DISPLAY_CPUINFO=y
 # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
 CONFIG_SLABINFO=y
@@ -633,7 +639,7 @@
 # CONFIG_NETFILTER_XT_MATCH_IPRANGE is not set
 # CONFIG_NETFILTER_XT_MATCH_LENGTH is not set
 CONFIG_NETFILTER_XT_MATCH_LIMIT=m
-# CONFIG_NETFILTER_XT_MATCH_MAC is not set
+CONFIG_NETFILTER_XT_MATCH_MAC=m
 # CONFIG_NETFILTER_XT_MATCH_MARK is not set
 CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m
 # CONFIG_NETFILTER_XT_MATCH_OWNER is not set
@@ -797,10 +803,14 @@
 CONFIG_WIRELESS_EXT=y
 CONFIG_WEXT_CORE=y
 CONFIG_WEXT_PROC=y
+# CONFIG_WEXT_SPY is not set
 CONFIG_WEXT_PRIV=y
 # CONFIG_CFG80211 is not set
 CONFIG_WIRELESS_EXT_SYSFS=y
 # CONFIG_LIB80211 is not set
+# CONFIG_LIB80211_CRYPT_WEP is not set
+# CONFIG_LIB80211_CRYPT_CCMP is not set
+# CONFIG_LIB80211_CRYPT_TKIP is not set
 
 #
 # CFG80211 needs to be enabled for MAC80211
@@ -1464,6 +1474,10 @@
 # Other I2C/SMBus bus drivers
 #
 # CONFIG_I2C_STUB is not set
+
+#
+# Miscellaneous I2C Chip support
+#
 # CONFIG_I2C_DEBUG_CORE is not set
 # CONFIG_I2C_DEBUG_ALGO is not set
 # CONFIG_I2C_DEBUG_BUS is not set
@@ -1678,6 +1692,7 @@
 # CONFIG_EZX_PCAP is not set
 # CONFIG_MFD_CS5535 is not set
 # CONFIG_LPC_SCH is not set
+# CONFIG_LPC_ICH is not set
 # CONFIG_MFD_RDC321X is not set
 # CONFIG_MFD_JANZ_CMODIO is not set
 # CONFIG_MFD_VX855 is not set
@@ -1805,7 +1820,26 @@
 CONFIG_SOUND=m
 CONFIG_SOUND_OSS_CORE=y
 CONFIG_SOUND_OSS_CORE_PRECLAIM=y
-# CONFIG_SND is not set
+CONFIG_SND=m
+# CONFIG_SND_SEQUENCER is not set
+# CONFIG_SND_MIXER_OSS is not set
+# CONFIG_SND_PCM_OSS is not set
+# CONFIG_SND_DYNAMIC_MINORS is not set
+CONFIG_SND_SUPPORT_OLD_API=y
+# CONFIG_SND_VERBOSE_PROCFS is not set
+# CONFIG_SND_VERBOSE_PRINTK is not set
+# CONFIG_SND_DEBUG is not set
+CONFIG_SND_DMA_SGBUF=y
+# CONFIG_SND_RAWMIDI_SEQ is not set
+# CONFIG_SND_OPL3_LIB_SEQ is not set
+# CONFIG_SND_OPL4_LIB_SEQ is not set
+# CONFIG_SND_SBAWE_SEQ is not set
+# CONFIG_SND_EMU10K1_SEQ is not set
+# CONFIG_SND_DRIVERS is not set
+# CONFIG_SND_PCI is not set
+# CONFIG_SND_SPI is not set
+# CONFIG_SND_USB is not set
+# CONFIG_SND_SOC is not set
 CONFIG_SOUND_PRIME=m
 # CONFIG_SOUND_OSS is not set
 CONFIG_HID_SUPPORT=y
@@ -1834,6 +1868,7 @@
 # CONFIG_HID_BELKIN is not set
 # CONFIG_HID_CHERRY is not set
 # CONFIG_HID_CHICONY is not set
+# CONFIG_HID_PRODIKEYS is not set
 # CONFIG_HID_CYPRESS is not set
 # CONFIG_HID_DRAGONRISE is not set
 # CONFIG_HID_EMS_FF is not set
@@ -2176,6 +2211,7 @@
 # CONFIG_TRANZPORT is not set
 # CONFIG_POHMELFS is not set
 # CONFIG_IDE_PHISON is not set
+# CONFIG_LINE6_USB is not set
 # CONFIG_DRM_NOUVEAU is not set
 
 #
@@ -2190,8 +2226,9 @@
 # CONFIG_VME_BUS is not set
 # CONFIG_DX_SEP is not set
 # CONFIG_IIO is not set
-# CONFIG_XVMALLOC is not set
-# CONFIG_ZRAM is not set
+CONFIG_ZSMALLOC=y
+CONFIG_ZRAM=m
+# CONFIG_ZRAM_DEBUG is not set
 # CONFIG_FB_SM7XX is not set
 # CONFIG_CRYSTALHD is not set
 # CONFIG_FB_XGI is not set
@@ -2282,7 +2319,12 @@
 # CONFIG_GFS2_FS is not set
 # CONFIG_OCFS2_FS is not set
 CONFIG_BTRFS_FS=m
+CONFIG_BTRFS_FS_SYNO_ACL=y
 # CONFIG_BTRFS_FS_POSIX_ACL is not set
+# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set
+# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set
+# CONFIG_BTRFS_DEBUG is not set
+# CONFIG_BTRFS_ASSERT is not set
 # CONFIG_NILFS2_FS is not set
 CONFIG_EXPORTFS=m
 CONFIG_FILE_LOCKING=y
@@ -2703,6 +2745,10 @@
 # OCF Configuration
 #
 # CONFIG_OCF_OCF is not set
+
+#
+# OCF Configuration
+#
 CONFIG_HAVE_KVM=y
 # CONFIG_VIRTUALIZATION is not set
 # CONFIG_BINARY_PRINTF is not set
@@ -2729,6 +2775,10 @@
 # CONFIG_XZ_DEC_BCJ is not set
 CONFIG_DECOMPRESS_GZIP=y
 CONFIG_DECOMPRESS_LZMA=y
+# CONFIG_TEXTSEARCH is not set
+# CONFIG_TEXTSEARCH_KMP is not set
+# CONFIG_TEXTSEARCH_BM is not set
+# CONFIG_TEXTSEARCH_FSM is not set
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
diff -ur a/tools/virtio/linux/virtio.h b/tools/virtio/linux/virtio.h
--- a/tools/virtio/linux/virtio.h	2013-08-24 11:36:46.000000000 +0200
+++ b/tools/virtio/linux/virtio.h	2014-02-17 11:57:01.000000000 +0100
@@ -186,21 +186,12 @@
 #endif
 
 /* Interfaces exported by virtio_ring. */
-int virtqueue_add_buf_gfp(struct virtqueue *vq,
-			  struct scatterlist sg[],
-			  unsigned int out_num,
-			  unsigned int in_num,
-			  void *data,
-			  gfp_t gfp);
-
-static inline int virtqueue_add_buf(struct virtqueue *vq,
-				    struct scatterlist sg[],
-				    unsigned int out_num,
-				    unsigned int in_num,
-				    void *data)
-{
-	return virtqueue_add_buf_gfp(vq, sg, out_num, in_num, data, GFP_ATOMIC);
-}
+int virtqueue_add_buf(struct virtqueue *vq,
+		      struct scatterlist sg[],
+		      unsigned int out_num,
+		      unsigned int in_num,
+		      void *data,
+		      gfp_t gfp);
 
 void virtqueue_kick(struct virtqueue *vq);
 
@@ -214,6 +205,7 @@
 struct virtqueue *vring_new_virtqueue(unsigned int num,
 				      unsigned int vring_align,
 				      struct virtio_device *vdev,
+				      bool weak_barriers,
 				      void *pages,
 				      void (*notify)(struct virtqueue *vq),
 				      void (*callback)(struct virtqueue *vq),
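For context, a minimal caller-side sketch (not part of this patch; queue_one and its buffer handling are hypothetical): with virtqueue_add_buf_gfp folded into virtqueue_add_buf, callers now pass the allocation flags explicitly instead of relying on the old wrapper's implicit GFP_ATOMIC.

#include <linux/virtio.h>
#include <linux/scatterlist.h>
#include <linux/gfp.h>

/* Hedged sketch, not from this patch: hypothetical caller updated for the
 * new virtqueue_add_buf() signature. */
static int queue_one(struct virtqueue *vq, void *buf, unsigned int len)
{
	struct scatterlist sg;

	sg_init_one(&sg, buf, len);
	/* one "out" entry, no "in" entries; GFP_ATOMIC keeps the old default */
	return virtqueue_add_buf(vq, &sg, 1, 0, buf, GFP_ATOMIC);
}

The same adjustment is what the virtio_test.c hunk below makes to the existing test caller.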
diff -ur a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c
--- a/tools/virtio/virtio_test.c	2013-08-24 11:36:46.000000000 +0200
+++ b/tools/virtio/virtio_test.c	2014-02-17 11:57:01.000000000 +0100
@@ -92,7 +92,8 @@
 	assert(r >= 0);
 	memset(info->ring, 0, vring_size(num, 4096));
 	vring_init(&info->vring, num, info->ring, 4096);
-	info->vq = vring_new_virtqueue(info->vring.num, 4096, &dev->vdev, info->ring,
+	info->vq = vring_new_virtqueue(info->vring.num, 4096, &dev->vdev,
+				       true, info->ring,
 				       vq_notify, vq_callback, "test");
 	assert(info->vq);
 	info->vq->priv = info;
@@ -160,7 +161,8 @@
 			if (started < bufs) {
 				sg_init_one(&sl, dev->buf, dev->buf_size);
 				r = virtqueue_add_buf(vq->vq, &sl, 1, 0,
-						      dev->buf + started);
+						      dev->buf + started,
+						      GFP_ATOMIC);
 				if (likely(r >= 0)) {
 					++started;
 					virtqueue_kick(vq->vq);
diff -ur a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
--- a/virt/kvm/irq_comm.c	2013-08-24 11:36:50.000000000 +0200
+++ b/virt/kvm/irq_comm.c	2014-02-17 11:57:10.000000000 +0100
@@ -138,6 +138,20 @@
 	return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
 }
 
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+	struct kvm_kernel_irq_routing_entry route;
+
+	if (!irqchip_in_kernel(kvm) || msi->flags != 0)
+		return -EINVAL;
+
+	route.msi.address_lo = msi->address_lo;
+	route.msi.address_hi = msi->address_hi;
+	route.msi.data = msi->data;
+
+	return kvm_set_msi(&route, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1);
+}
+
 /*
  * Return value:
  *  < 0   Interrupt was ignored (masked or not delivered for other reasons)
diff -ur a/virt/kvm/Kconfig b/virt/kvm/Kconfig
--- a/virt/kvm/Kconfig	2013-08-03 09:59:52.000000000 +0200
+++ b/virt/kvm/Kconfig	2014-01-21 09:37:35.000000000 +0100
@@ -18,3 +18,9 @@
 
 config KVM_ASYNC_PF
        bool
+
+config HAVE_KVM_MSI
+       bool
+
+config HAVE_KVM_CPU_RELAX_INTERCEPT
+       bool
diff -ur a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
--- a/virt/kvm/kvm_main.c	2013-08-24 11:36:50.000000000 +0200
+++ b/virt/kvm/kvm_main.c	2014-02-17 11:57:10.000000000 +0100
@@ -2064,6 +2064,17 @@
 		mutex_unlock(&kvm->lock);
 		break;
 #endif
+#ifdef CONFIG_HAVE_KVM_MSI
+	case KVM_SIGNAL_MSI: {
+		struct kvm_msi msi;
+
+		r = -EFAULT;
+		if (copy_from_user(&msi, argp, sizeof msi))
+			goto out;
+		r = kvm_send_userspace_msi(kvm, &msi);
+		break;
+	}
+#endif
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 		if (r == -ENOTTY)
@@ -2192,6 +2203,9 @@
 	case KVM_CAP_SET_BOOT_CPU_ID:
 #endif
 	case KVM_CAP_INTERNAL_ERROR_DATA:
+#ifdef CONFIG_HAVE_KVM_MSI
+	case KVM_CAP_SIGNAL_MSI:
+#endif
 		return 1;
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 	case KVM_CAP_IRQ_ROUTING:
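For reference, a minimal userspace sketch (assumptions: kvm_fd is an open /dev/kvm descriptor, vm_fd a VM descriptor from KVM_CREATE_VM, and the address/data values are placeholders) of how the KVM_SIGNAL_MSI ioctl added above could be exercised once KVM_CAP_SIGNAL_MSI is reported:

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hedged sketch, not part of this patch: inject an MSI into a guest from
 * userspace through the new vm ioctl.  Address/data below are placeholders;
 * the handler rejects non-zero flags, so flags stays 0. */
static int signal_example_msi(int kvm_fd, int vm_fd)
{
	struct kvm_msi msi;

	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SIGNAL_MSI) <= 0)
		return -1;	/* capability not reported by this kernel */

	memset(&msi, 0, sizeof(msi));
	msi.address_lo = 0xfee00000;	/* placeholder MSI address */
	msi.address_hi = 0;
	msi.data = 0x0030;		/* placeholder vector/delivery data */
	msi.flags = 0;

	return ioctl(vm_fd, KVM_SIGNAL_MSI, &msi);
}

Note the ioctl is issued on the VM file descriptor, matching the kvm_vm_ioctl() case added in the hunk above.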
