/*
 *  arch/arm/mach-ox820/platsmp.c
 *
 *  Copyright (C) 2002 ARM Ltd.
 *  All Rights Reserved
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
#include <linux/init.h>
#include <linux/device.h>
#include <linux/jiffies.h>
#include <linux/smp.h>
#include <linux/io.h>
#include <linux/dma-mapping.h>
#include <asm/cacheflush.h>
#include <asm/localtimer.h>
#include <asm/smp_scu.h>
#include <mach/rps-irq.h>
#include <mach/hardware.h>
#include <asm/tlbflush.h>
#include <asm/cputype.h>

/*
 * Base of the SCU registers, obtained by passing OX820_ARM11MP_SCU_BASE
 * through __io_address(). Used for the core count and for scu_enable().
 */
static void __iomem *scu_base = __io_address(OX820_ARM11MP_SCU_BASE);

/* Ask the SCU at scu_base how many cores it reports. */
static inline unsigned int get_core_count(void)
{
	unsigned int ncores = scu_get_core_count(scu_base);

	return ncores;
}

/*
 * Held by boot_secondary() for the whole time it is starting a secondary
 * CPU; platform_secondary_init() takes and releases it to synchronise with
 * the boot thread.
 */
static DEFINE_SPINLOCK(boot_lock);

#include <mach/ipi.h>

/*
 * Per-cpu request slot for the FIQ-based cache/TLB maintenance below.
 * The slot is indexed by the number of the *initiating* CPU: the initiator
 * fills in its own slot and do_coherency() on the other CPU reads the
 * initiator's slot and sets nents back to 0 when it has finished.
 */
DEFINE_PER_CPU(struct fiq_coherency_communication_s, fiq_coherency_communication);

/*
 * Platform hook for early initialisation of a secondary CPU.
 * NOTE(review): presumably called on the secondary CPU itself by the ARM SMP
 * core code - the caller is not visible in this file.
 *
 * @cpu: number of the CPU being brought up (not used by this function).
 */
void __cpuinit platform_secondary_init(unsigned int cpu)
{
	trace_hardirqs_off();

	/*
	 * If any interrupts are already enabled for the primary
	 * core (e.g. timer irq), then they will not have been enabled
	 * for us: do so
	 */
	gic_cpu_init(0, gic_cpu_base_addr);

	/*
	 * Synchronise with the boot thread: boot_secondary() holds boot_lock
	 * while it starts this CPU, so taking the lock here blocks until
	 * boot_secondary() has released it. Nothing is protected beyond that,
	 * hence the immediate unlock.
	 */
	spin_lock(&boot_lock);
	spin_unlock(&boot_lock);
}

/*
 * Start secondary CPU @cpu.
 *
 * Publishes the physical address of secondary_startup and the target CPU
 * number in the holding-pen locations, kicks the CPU with a cross call and
 * then busy-waits HZ/10 jiffies, all with boot_lock held.
 *
 * @cpu:  number of the CPU to start.
 * @idle: not used by this function.
 *
 * Always returns 0; there is no check that the secondary actually came up.
 */
int __cpuinit boot_secondary(unsigned int cpu, struct task_struct *idle)
{
	extern void secondary_startup(void);
	unsigned long deadline;

	/*
	 * Taking boot_lock sets up the synchronisation state between this
	 * boot processor and the secondary one.
	 */
	spin_lock(&boot_lock);

	/*
	 * realview and omap2 appear not to need this flush, but on this
	 * platform the secondary CPU does not start without it.
	 */
	flush_cache_all();

	/*
	 * Turn on the secondary's GIC CPU interface so that it can receive
	 * the interrupt that wakes it from WFI.
	 */
	writel(1, __io_address(OX820_GIC_CPUN_BASE_ADDR(cpu) + GIC_CPU_CTRL));

	/* Publish the start address first, then the CPU that should use it. */
	writel(virt_to_phys(secondary_startup), HOLDINGPEN_LOCATION);
	wmb();
	writel(cpu, HOLDINGPEN_CPU);
	wmb();

	/* Wake the CPU from WFI */
	smp_cross_call(get_cpu_mask(cpu));

	/* Give the secondary CPU time to get going */
	deadline = jiffies + HZ/10;
	while (time_before(jiffies, deadline))
		continue;

	spin_unlock(&boot_lock);

	return 0;
}

/*
 * Initialise the CPU possible map early - this describes the CPUs
 * which may be present or become present in the system.
 *
 * The core count comes from the SCU. It is clamped to NR_CPUS here so that
 * set_cpu_possible() is never called with a CPU number the kernel was not
 * configured for; the clamp is silent because smp_prepare_cpus() reports the
 * same condition.
 */
void __init smp_init_cpus(void)
{
	unsigned int i, ncores = get_core_count();

	if (ncores > NR_CPUS)
		ncores = NR_CPUS;

	for (i = 0; i < ncores; i++)
		set_cpu_possible(i, true);
}

/*
 * Prepare the boot CPU for SMP and mark the CPUs that may be booted as
 * present.
 *
 * @max_cpus: upper limit on the number of CPUs to bring up; it is reduced to
 *            the (sanitised) core count reported by the SCU if larger.
 *
 * When more than one CPU will be used, the boot CPU's per-cpu timer is set
 * up and the SCU is enabled.
 */
void __init smp_prepare_cpus(unsigned int max_cpus)
{
	unsigned int ncores = get_core_count();
	unsigned int cpu = smp_processor_id();
	unsigned int i;

	/* sanity check */
	if (ncores == 0) {
		printk(KERN_ERR
		       "OX820: strange CM count of 0? Default to 1\n");
		ncores = 1;
	}

	if (ncores > NR_CPUS) {
		/* ncores is unsigned, so it is printed with %u */
		printk(KERN_WARNING
		       "OX820: no. of cores (%u) greater than configured "
		       "maximum of %d - clipping\n",
		       ncores, NR_CPUS);
		ncores = NR_CPUS;
	}

	smp_store_cpu_info(cpu);

	/*
	 * are we trying to boot more cores than exist?
	 */
	if (max_cpus > ncores)
		max_cpus = ncores;

	/*
	 * Initialise the present map, which describes the set of CPUs
	 * actually populated at the present time.
	 */
	for (i = 0; i < max_cpus; i++)
		set_cpu_present(i, true);

	if (max_cpus > 1) {
		/*
		 * Enable the local timer or broadcast device for the
		 * boot CPU, but only if we have more than one CPU.
		 */
		percpu_timer_setup();

		/* Initialise the SCU */
		scu_enable(scu_base);
	}
}

/*
 * dma_map_sg - map a scatterlist for DMA and perform the matching cache
 * maintenance on this CPU and, via a FIQ generated by an RPS core, on the
 * other CPU.
 *
 * @dev:   device the mapping is for.
 * @sg:    first entry of the scatterlist.
 * @nents: number of scatterlist entries.
 * @dir:   DMA direction; must be DMA_BIDIRECTIONAL, DMA_TO_DEVICE or
 *         DMA_FROM_DEVICE. Anything else is a BUG().
 *
 * Returns @nents on success, or 0 if a computed DMA address fails
 * dma_mapping_error().
 *
 * BUG()s if this CPU's request slot is still in use, or if another CPU is
 * online and @nents exceeds IPI_SGDMA_ELEMENTS.
 */
int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
	   enum dma_data_direction dir)
{
	struct scatterlist *sg_element;
	int i;
	unsigned long flags;
	cpumask_t callmap;
	unsigned int cpu;
	void (*chosen_op)(const void *, const void *);
	void (*cache_operation[3])(const void *, const void *) = {
		[DMA_BIDIRECTIONAL] = dmac_flush_range,
		[DMA_TO_DEVICE] = dmac_clean_range,
		[DMA_FROM_DEVICE] = dmac_inv_range
	};

	/*
	 * The operation table has exactly three entries and is indexed by
	 * the direction both here and in the FIQ handler on the other CPU.
	 * Reject any other value (e.g. DMA_NONE) before it can be used as an
	 * index or handed to the other CPU.
	 */
	if (dir != DMA_BIDIRECTIONAL && dir != DMA_TO_DEVICE &&
	    dir != DMA_FROM_DEVICE)
		BUG();

	/* calculate dma_address */
	for_each_sg(sg, sg_element, nents, i) {
		sg_element->dma_address = page_to_dma(dev, sg_page(sg_element)) +
		    sg_element->offset;
		if (dma_mapping_error(dev, sg_element->dma_address))
			return 0;
	}

	/* prevent re-entrance on this processor */
	local_irq_save(flags);

	/* work out who's around */
	cpu = get_cpu();
	callmap = cpu_online_map;
	cpu_clear(cpu, callmap);

	/*
	 * If we're up here when the other CPU is still processing a previous
	 * cache coherency operation, it's either memory corruption or some
	 * other nasty
	 */
	if (per_cpu(fiq_coherency_communication, cpu).nents != 0)
		BUG();

	/* only do this if there are other CPUs */
	if (!cpus_empty(callmap)) {
		/*
		 * can only do so much, if this keeps failing make
		 * IPI_SGDMA_ELEMENTS bigger
		 */
		if (nents > IPI_SGDMA_ELEMENTS)
			BUG();

		/*
		 * The FIQ handler can interrupt almost anything, so it is very
		 * limited in what it may look at. sg_virt() uses page-tables,
		 * locks and all sorts of useful things that can be broken by
		 * the FIQ handler running them at the wrong time. So we
		 * resolve the addresses here and pass an array of start/end
		 * addresses that can almost be passed straight into the
		 * hardware.
		 */
		for_each_sg(sg, sg_element, nents, i) {
			struct smp_dma_cache_range_s temp;

			temp.start = sg_virt(sg_element);
			temp.end = temp.start + sg_dma_len(sg_element);
			per_cpu(fiq_coherency_communication, cpu).message.cache_coherency.range[i] = temp;
		}

		per_cpu(fiq_coherency_communication, cpu).type = CACHE_COHERENCY;
		per_cpu(fiq_coherency_communication, cpu).message.cache_coherency.type = dir;
		per_cpu(fiq_coherency_communication, cpu).nents = nents;
		/* the request must be complete before the FIQ is raised */
		smp_wmb();

		/* inform the other processor that it has work to do with a FIQ */
		if (cpu == 0) {
			OX820_RPS_trigger_fiq(1);
		} else {
			OX820_RPS_trigger_fiq(0);
		}
	}

	/* do our own cache work whilst we wait */
	chosen_op = cache_operation[dir];
	for_each_sg(sg, sg_element, nents, i) {
		chosen_op(sg_virt(sg_element), sg_virt(sg_element) + sg_dma_len(sg_element));
	}

	/*
	 * rendezvous the two cpus here: the FIQ handler sets nents back to 0
	 * when it has finished (nents is already 0 if no FIQ was sent)
	 */
	while (per_cpu(fiq_coherency_communication, cpu).nents != 0) {
		barrier();
	}

	put_cpu();
	local_irq_restore(flags);
	return nents;
}

EXPORT_SYMBOL(dma_map_sg);

/**
 * High level FIQ handler
 *
 * Services the request that the other CPU posted in its own
 * fiq_coherency_communication slot: either a list of memory ranges for a
 * cache operation (prepared by dma_map_sg()) or a TLB function and argument
 * (prepared by tlb_on_other_cpu()). The original comment also names
 * __smp_dma_cache_op() as a source of requests; that function is not in this
 * file.
 *
 * This code is run in FIQ mode using the small-ish FIQ stack. It shouldn't call
 * any code that uses exclusive access instructions, such as locks. printk
 * uses locks.
 *
 * It is likely that two instances of this function may be running at once so
 * persistent storage must be per-cpu (no static variables)
 *
 * The "other CPU" is computed as 1 when running on CPU 0 and 0 otherwise, so
 * the handler only supports a two-CPU system.
 *
 * NOTE(review): the TLB branch does not consult nents, so if this handler
 * ever ran with no request pending it would call whatever tlb_op/tlb_arg the
 * slot currently holds - confirm that a FIQ can only arrive after a request
 * has been posted.
 */
asmlinkage void do_coherency(void)
{
    /* cpu = our cpu */
    unsigned int cpu;
	unsigned int other_cpu;
    unsigned int i;
    struct fiq_coherency_communication_s* comms;
    /* cache operation to apply, indexed by the DMA direction in the request */
    void (*cache_operation[3])(const void *, const void *) = {
        [DMA_BIDIRECTIONAL] = dmac_flush_range,
        [DMA_TO_DEVICE] = dmac_clean_range,
        [DMA_FROM_DEVICE] = dmac_inv_range
    };    

    /* get the CPU number, and from it the number of the requesting CPU */
    cpu = hard_smp_processor_id();
    other_cpu = (cpu == 0) ? 1 : 0;
    
    /* mask out the fiq */
    OX820_RPS_clear_fiq(cpu);

    /* get the ipi work structure: requests live in the initiator's slot */
    comms = &(per_cpu(fiq_coherency_communication,other_cpu));
      
    if (comms->type == CACHE_COHERENCY) {
        void (*chosen_op)(const void *, const void *);
        /* iterate through the array of ranges, doing cache operations */
        chosen_op = cache_operation[comms->message.cache_coherency.type];
        for(i = 0; i < comms->nents; ++i) {
            chosen_op(comms->message.cache_coherency.range[i].start, comms->message.cache_coherency.range[i].end);
        }
    } else {
        // Must be a TLB operation: call the posted function on its argument
        void (*chosen_op)(void *);
        chosen_op = comms->message.tlb.tlb_op;
        chosen_op(comms->message.tlb.tlb_arg);   
    }
    
    /*
     * mark as done by setting the number of entries to 0; the initiating
     * CPU spins on this field until it reads 0
     */
    comms->nents = 0;
    wmb();
}

/**
 * cpumask_test_cpu - test for a cpu in a cpumask
 * @cpu: cpu number (< nr_cpu_ids)
 * @cpumask: the cpumask pointer
 *
 * Expands to test_bit() on the bits of @cpumask, with @cpu passed through
 * cpumask_check() first. Being a macro, it does no static inline type
 * checking.
 *
 * NOTE(review): the "Subtlety (1)" that this comment originally referred to
 * is not in this file; the macro looks like a local copy of the one in
 * <linux/cpumask.h> - confirm whether it is still needed here.
 */
#define cpumask_test_cpu(cpu, cpumask) \
	test_bit(cpumask_check(cpu), cpumask_bits((cpumask)))

#include <linux/thread_info.h>
/* CPU number taken from the cpu field of the current thread_info. */
#define raw_smp_processor_id() (current_thread_info()->cpu)

/*
 * Call a tlb function on the other processors via a FIQ
 */
static void tlb_on_other_cpu(void (*func) (void *info), void *info, int cpu)
{

    cpumask_t callmap;

    /* work out who's around */
    callmap = cpu_online_map;
    cpu_clear(cpu, callmap);

    /* If we're up here when the other CPU is still processing a previous cache
     * coherency or TLB operation, it's either memory corruption or some other nasty */
    if(per_cpu(fiq_coherency_communication, cpu).nents != 0)
        BUG();
        
    /* only do this if there are other CPUs */
    if (!cpus_empty(callmap)) {

        per_cpu(fiq_coherency_communication,cpu).type = TLB;
        per_cpu(fiq_coherency_communication,cpu).message.tlb.tlb_arg = info;
        per_cpu(fiq_coherency_communication,cpu).message.tlb.tlb_op =  func;
        per_cpu(fiq_coherency_communication,cpu).nents = 1;
        smp_wmb();
    
        /* inform the other processor that it has work to do with a FIQ */
        if (cpu == 0) {
            OX820_RPS_trigger_fiq(1);
        } else {
            OX820_RPS_trigger_fiq(0);
        }
    }
}

/*
 * Call a tlb function on some processors
 */
static void tlb_on_each_cpu_mask(void (*func) (void *info), void *info, int wait, const struct cpumask *mask)
{

    unsigned long flags;
	int next_cpu, this_cpu = smp_processor_id();
    int cpu;
    
    /* prevent re-entrance on this processor */
    local_irq_save(flags);

    cpu = get_cpu();

	/* So, what the first CPU they want? */
	next_cpu = cpumask_first_and(mask, cpu_online_mask);

    /* if it's this cpu, pick the next one */
    if (next_cpu == this_cpu) {
        next_cpu =  cpumask_next_and(next_cpu, mask, cpu_online_mask);
    }

    if (next_cpu != this_cpu) {
        // Instigate a FIQ-based IPI
        tlb_on_other_cpu(func, info, cpu);
        
    }
    
    /* if the local cpu matches the mask, run a local call */
    if (cpumask_test_cpu(smp_processor_id(), mask)) {
		func(info);
    }

    /* rendezvous the two cpus here */
    while (per_cpu(fiq_coherency_communication,cpu).nents != 0) {
        barrier();
    }
    
    put_cpu();
    local_irq_restore(flags);
}

/*
 * Call a tlb function on all processors
 */
static void tlb_on_each_cpu(void (*func) (void *info), void *info, int wait)
{

    unsigned long flags;
    int cpu;

    /* prevent re-entrance on this processor */
    local_irq_save(flags);

    cpu = get_cpu();

    // Instigate a FIQ-based IPI
    tlb_on_other_cpu(func, info, cpu);
    // Run local one too.
    func(info);

    /* rendezvous the two cpus here */
    while (per_cpu(fiq_coherency_communication,cpu).nents != 0) {
        barrier();
    }
    
    put_cpu();
    local_irq_restore(flags);

}

/*
 * Returns non-zero when TLB operations have to be broadcast in software.
 * Decided from bits [15:12] of the extended CPUID register MMFR3 (all SMP
 * configurations have the extended CPUID registers): a field value below 2
 * means broadcast is needed.
 * NOTE(review): presumably this is the maintenance-broadcast field - confirm
 * against the ARM Architecture Reference Manual.
 */
static inline int tlb_ops_need_broadcast(void)
{
	unsigned int mmfr3 = read_cpuid_ext(CPUID_EXT_MMFR3);
	unsigned int field = (mmfr3 >> 12) & 0xf;

	return field < 2;
}

/* Callback for tlb_on_each_cpu(): flush the whole local TLB. */
static inline void ipi_flush_tlb_all(void *ignored)
{
	(void)ignored;	/* no argument needed */
	local_flush_tlb_all();
}

/* Callback: flush the local TLB for the mm_struct passed as @arg. */
static inline void ipi_flush_tlb_mm(void *arg)
{
	struct mm_struct *target_mm = arg;

	local_flush_tlb_mm(target_mm);
}

/*
 * Callback: flush one user page from the local TLB. @arg is a struct
 * tlb_args carrying the vma (ta_vma) and the address (ta_start).
 */
static inline void ipi_flush_tlb_page(void *arg)
{
	struct tlb_args *args = arg;

	local_flush_tlb_page(args->ta_vma, args->ta_start);
}

/*
 * Callback: flush one kernel page from the local TLB. @arg is a struct
 * tlb_args carrying the address in ta_start.
 */
static inline void ipi_flush_tlb_kernel_page(void *arg)
{
	struct tlb_args *args = arg;

	local_flush_tlb_kernel_page(args->ta_start);
}

/*
 * Callback: flush a user address range from the local TLB. @arg is a struct
 * tlb_args carrying the vma (ta_vma) and the range (ta_start, ta_end).
 */
static inline void ipi_flush_tlb_range(void *arg)
{
	struct tlb_args *args = arg;

	local_flush_tlb_range(args->ta_vma, args->ta_start, args->ta_end);
}

/*
 * Callback: flush a kernel address range from the local TLB. @arg is a
 * struct tlb_args carrying the range (ta_start, ta_end).
 */
static inline void ipi_flush_tlb_kernel_range(void *arg)
{
	struct tlb_args *args = arg;

	local_flush_tlb_kernel_range(args->ta_start, args->ta_end);
}

/*
 * Flush the entire TLB: locally only when the hardware broadcasts TLB
 * operations, otherwise on every processor via tlb_on_each_cpu().
 */
void flush_tlb_all(void)
{
	if (!tlb_ops_need_broadcast()) {
		local_flush_tlb_all();
		return;
	}

	tlb_on_each_cpu(ipi_flush_tlb_all, NULL, 1);
}
EXPORT_SYMBOL(flush_tlb_all);

/*
 * Flush the TLB entries of @mm: locally only when the hardware broadcasts
 * TLB operations, otherwise on the CPUs in mm->cpu_vm_mask.
 */
void flush_tlb_mm(struct mm_struct *mm)
{
	if (!tlb_ops_need_broadcast()) {
		local_flush_tlb_mm(mm);
		return;
	}

	tlb_on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, &mm->cpu_vm_mask);
}
EXPORT_SYMBOL(flush_tlb_mm);

/*
 * Flush the TLB entry for user address @uaddr in @vma: locally only when the
 * hardware broadcasts TLB operations, otherwise on the CPUs in the owning
 * mm's cpu_vm_mask.
 */
void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr)
{
	struct tlb_args ta;

	if (!tlb_ops_need_broadcast()) {
		local_flush_tlb_page(vma, uaddr);
		return;
	}

	/* ta lives on this stack; the callee waits for the other CPU */
	ta.ta_vma = vma;
	ta.ta_start = uaddr;
	tlb_on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, &vma->vm_mm->cpu_vm_mask);
}
EXPORT_SYMBOL(flush_tlb_page);

/*
 * Flush the TLB entry for kernel address @kaddr: locally only when the
 * hardware broadcasts TLB operations, otherwise on every processor.
 */
void flush_tlb_kernel_page(unsigned long kaddr)
{
	struct tlb_args ta;

	if (!tlb_ops_need_broadcast()) {
		local_flush_tlb_kernel_page(kaddr);
		return;
	}

	ta.ta_start = kaddr;
	tlb_on_each_cpu(ipi_flush_tlb_kernel_page, &ta, 1);
}
EXPORT_SYMBOL(flush_tlb_kernel_page);

/*
 * Flush the TLB entries for the user range @start..@end in @vma: locally
 * only when the hardware broadcasts TLB operations, otherwise on the CPUs in
 * the owning mm's cpu_vm_mask.
 */
void flush_tlb_range(struct vm_area_struct *vma,
                     unsigned long start, unsigned long end)
{
	struct tlb_args ta;

	if (!tlb_ops_need_broadcast()) {
		local_flush_tlb_range(vma, start, end);
		return;
	}

	ta.ta_vma = vma;
	ta.ta_start = start;
	ta.ta_end = end;
	tlb_on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, &vma->vm_mm->cpu_vm_mask);
}
EXPORT_SYMBOL(flush_tlb_range);

/*
 * Flush the TLB entries for the kernel range @start..@end: locally only when
 * the hardware broadcasts TLB operations, otherwise on every processor.
 */
void flush_tlb_kernel_range(unsigned long start, unsigned long end)
{
	struct tlb_args ta;

	if (!tlb_ops_need_broadcast()) {
		local_flush_tlb_kernel_range(start, end);
		return;
	}

	ta.ta_start = start;
	ta.ta_end = end;
	tlb_on_each_cpu(ipi_flush_tlb_kernel_range, &ta, 1);
}
EXPORT_SYMBOL(flush_tlb_kernel_range);
