forked from rpms/kernel
		
	
		
			
				
	
	
		
			249 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			Diff
		
	
	
	
	
	
			
		
		
	
	
			249 lines
		
	
	
		
			9.0 KiB
		
	
	
	
		
			Diff
		
	
	
	
	
	
| Date: 	Mon, 30 Jan 2012 22:37:28 +0100
 | |
| Message-ID: <CAPRPZsAt+e3cy1YTriikpb2SNN=jOusvnPF0ByFeun+uaBa5Og@mail.gmail.com>
 | |
| Subject: [PATCH] Unhandled IRQs on AMD E-450: temporarily switch to
 | |
|  low-performance polling IRQ mode
 | |
| From: Jeroen Van den Keybus <jeroen.vandenkeybus@gmail.com>
 | |
| To: linux-kernel@vger.kernel.org
 | |
| Cc: Clemens Ladisch <clemens@ladisch.de>, "Huang, Shane" <Shane.Huang@amd.com>,
 | |
|         Borislav Petkov <bp@amd64.org>, "Nguyen, Dong" <Dong.Nguyen@amd.com>,
 | |
|         jesse.brandeburg@gmail.com
 | |
| Content-Type: text/plain; charset=ISO-8859-1
 | |
| Sender: linux-kernel-owner@vger.kernel.org
 | |
| Precedence: bulk
 | |
| List-ID: <linux-kernel.vger.kernel.org>
 | |
| X-Mailing-List: 	linux-kernel@vger.kernel.org
 | |
| X-RedHat-Spam-Score: -4.898  (DKIM_ADSP_CUSTOM_MED,DKIM_SIGNED,FREEMAIL_FROM,RCVD_IN_DNSWL_HI,T_DKIM_INVALID,T_RP_MATCHES_RCVD)
 | |
| X-Scanned-By: MIMEDefang 2.67 on 10.5.11.12
 | |
| X-Scanned-By: MIMEDefang 2.68 on 10.5.110.19
 | |
| Status: RO
 | |
| Content-Length: 7029
 | |
| Lines: 189
 | |
| 
 | |
| It seems that some motherboard designs using the ASM1083 PCI/PCIe
 | |
| bridge (PCI device ID 1b21:1080, Rev. 01) suffer from stuck IRQ lines
 | |
| on the PCI bus (causing the kernel to emit 'IRQxx: nobody cared' and
 | |
| disable the IRQ). The following patch is an attempt to mitigate the
 | |
| serious impact of permanently disabling an IRQ in that case and
 | |
| actually make PCI devices better usable on this platform.
 | |
| 
 | |
| It seems that the bridge fails to issue an IRQ deassertion message on
 | |
| the PCIe bus, when the relevant driver causes the interrupting PCI
 | |
| device to deassert its IRQ line. To solve this issue, it was tried to
 | |
| re-issue an IRQ on a PCI device being able to do so (e1000 in this
 | |
| case), but we suspect that the attempt to re-assert/deassert may have
 | |
| occurred too soon after the initial IRQ for the ASM1083. Anyway, it
 | |
| didn't work but if, after some delay, a new IRQ occurred, the related
 | |
| IRQ deassertion message eventually did clear the IOAPIC IRQ. It would
 | |
| be useful to re-enable the IRQ here.
 | |
| 
 | |
| Therefore the patch below to poll_spurious_irqs() in spurious.c is
 | |
| proposed. It does the following:
 | |
| 
 | |
| 1. lets the kernel decide that an IRQ is unhandled after only 10
 | |
| positives (instead of 100,000);
 | |
| 2. briefly (a few seconds or so, currently 1 s) switches to polling
 | |
| IRQ at a higher rate than usual (100..1,000Hz instead of 10Hz,
 | |
| currently 100Hz), but not too high to avoid excessive CPU load. Any
 | |
| device drivers 'see' their interrupts handled with a higher latency
 | |
| than usual, but they will still operate properly;
 | |
| 3. afterwards, simply reenable the IRQ.
 | |
| 
 | |
| If proper operation of the PCIe legacy IRQ line emulation is restored
 | |
| after 3, the system operates again at normal performance. If the IRQ
 | |
| is still stuck after this procedure, the sequence repeats.
 | |
| 
 | |
| If a genuinely stuck IRQ is used with this solution, the system would
 | |
| simply sustain short bursts of 10 unhandled IRQs per second, and use
 | |
| polling mode indefinitely at a moderate 100Hz rate. It seemed a good
 | |
| alternative to the default irqpoll behaviour to me, which is why I
 | |
| left it in poll_spurious_irqs() (instead of creating a new kernel
 | |
| option). Additionally, if any device happens to share an IRQ with a
 | |
| faulty one, that device is no longer banned forever.
 | |
| 
 | |
| Debugging output is still present and may be removed. Bad IRQ
 | |
| reporting is also commented out now.
 | |
| 
 | |
| I have now tried it for about 2 months and I can conclude the following:
 | |
| 
 | |
| 1. The patch works and, judging from my Firewire card interrupt on
 | |
| IRQ16, which repeats every 64 secs, I can confirm that the IRQ usually
 | |
| gets reset when a new IRQ arrives (polling mode runs for 64 seconds
 | |
| every time).
 | |
| 2. When testing a SiL-3114 SATA PCI card behind the ASM1083, I could
 | |
| keep this running at fairly high speeds (50..70MB/s) for an hour or
 | |
| so, but eventually the SiL driver crashed. In such conditions the PCI
 | |
| system had to deal with a few hundred IRQs per second (polling mode
 | |
| kicking in every 5..10 seconds).
 | |
| 
 | |
| I would like to thank Clemens Ladisch for his invaluable help in
 | |
| finding a solution (and providing a patch to avoid my SATA going down
 | |
| every time during debugging).
 | |
| 
 | |
| 
 | |
| Signed-off-by: Jeroen Van den Keybus <jeroen.vandenkeybus@gmail.com>
 | |
| 
 | |
| Make it less chatty.  Only kick it in if we detect an ASM1083 PCI bridge.
 | |
| 
 | |
| Josh Boyer <jwboyer@redhat.com>
 | |
| ======
 | |
| 
 | |
| --- linux-2.6.orig/kernel/irq/spurious.c
 | |
| +++ linux-2.6/kernel/irq/spurious.c
 | |
| @@ -18,6 +18,8 @@
 | |
|  
 | |
|  static int irqfixup __read_mostly;
 | |
|  
 | |
| +int irq_poll_and_retry = 0;
 | |
| +
 | |
|  #define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
 | |
|  static void poll_spurious_irqs(unsigned long dummy);
 | |
|  static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
 | |
| @@ -141,12 +143,13 @@ out:
 | |
|  static void poll_spurious_irqs(unsigned long dummy)
 | |
|  {
 | |
|  	struct irq_desc *desc;
 | |
| -	int i;
 | |
| +	int i, poll_again;
 | |
|  
 | |
|  	if (atomic_inc_return(&irq_poll_active) != 1)
 | |
|  		goto out;
 | |
|  	irq_poll_cpu = smp_processor_id();
 | |
|  
 | |
| +	poll_again = 0; /* Will stay false as long as no polling candidate is found */
 | |
|  	for_each_irq_desc(i, desc) {
 | |
|  		unsigned int state;
 | |
|  
 | |
| @@ -159,14 +162,33 @@ static void poll_spurious_irqs(unsigned
 | |
|  		if (!(state & IRQS_SPURIOUS_DISABLED))
 | |
|  			continue;
 | |
|  
 | |
| -		local_irq_disable();
 | |
| -		try_one_irq(i, desc, true);
 | |
| -		local_irq_enable();
 | |
| +		/* We end up here with a disabled spurious interrupt.
 | |
| +		   desc->irqs_unhandled now tracks the number of times
 | |
| +		   the interrupt has been polled */
 | |
| +		if (irq_poll_and_retry) {
 | |
| +			if (desc->irqs_unhandled < 100) { /* 100 polls, one per POLL_SPURIOUS_IRQ_INTERVAL (HZ/10): about 10 seconds */
 | |
| +				local_irq_disable();
 | |
| +				try_one_irq(i, desc, true);
 | |
| +				local_irq_enable();
 | |
| +				desc->irqs_unhandled++;
 | |
| +				poll_again = 1;
 | |
| +			} else {
 | |
| +				irq_enable(desc); /* Reenable the interrupt line */
 | |
| +				desc->depth--;
 | |
| +				desc->istate &= (~IRQS_SPURIOUS_DISABLED);
 | |
| +				desc->irqs_unhandled = 0;
 | |
| +			}
 | |
| +		} else {
 | |
| +			local_irq_disable();
 | |
| +			try_one_irq(i, desc, true);
 | |
| +			local_irq_enable();
 | |
| +		}
 | |
|  	}
 | |
| +	if (poll_again)
 | |
| +		mod_timer(&poll_spurious_irq_timer,
 | |
| +			  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
 | |
|  out:
 | |
|  	atomic_dec(&irq_poll_active);
 | |
| -	mod_timer(&poll_spurious_irq_timer,
 | |
| -		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
 | |
|  }
 | |
|  
 | |
|  static inline int bad_action_ret(irqreturn_t action_ret)
 | |
| @@ -177,11 +199,19 @@ static inline int bad_action_ret(irqretu
 | |
|  }
 | |
|  
 | |
|  /*
 | |
| - * If 99,900 of the previous 100,000 interrupts have not been handled
 | |
| + * If 9 of the previous 10 interrupts have not been handled
 | |
|   * then assume that the IRQ is stuck in some manner. Drop a diagnostic
 | |
|   * and try to turn the IRQ off.
 | |
|   *
 | |
| - * (The other 100-of-100,000 interrupts may have been a correctly
 | |
| + * Although this may cause early deactivation of a sporadically
 | |
| + * malfunctioning IRQ line, the poll system will:
 | |
| + * a) Poll it for 100 cycles at POLL_SPURIOUS_IRQ_INTERVAL (HZ/10, i.e. 10 Hz)
 | |
| + * b) Reenable it afterwards
 | |
| + *
 | |
| + * In the worst case, with current settings, this will cause short bursts
 | |
| + * of 10 interrupts roughly every 10 seconds.
 | |
| + *
 | |
| + * (The other single interrupt may have been a correctly
 | |
|   *  functioning device sharing an IRQ with the failing one)
 | |
|   */
 | |
|  static void
 | |
| @@ -269,6 +299,8 @@ try_misrouted_irq(unsigned int irq, stru
 | |
|  void note_interrupt(unsigned int irq, struct irq_desc *desc,
 | |
|  		    irqreturn_t action_ret)
 | |
|  {
 | |
| +	int unhandled_thresh = 99900;	/* unhandled count, out of a 100,000-interrupt window, that marks the IRQ as stuck */
 | |
| +
 | |
|  	if (desc->istate & IRQS_POLL_INPROGRESS)
 | |
|  		return;
 | |
|  
 | |
| @@ -302,19 +334,31 @@ void note_interrupt(unsigned int irq, st
 | |
|  	}
 | |
|  
 | |
|  	desc->irq_count++;
 | |
| -	if (likely(desc->irq_count < 100000))
 | |
| -		return;
 | |
| +	if (!irq_poll_and_retry) {	/* normal mode: evaluate once per 100,000 interrupts */
 | |
| +		if (likely(desc->irq_count < 100000))
 | |
| +			return;
 | |
| +	} else if (likely(desc->irq_count < 10)) {	/* poll-and-retry mode: evaluate once per 10 interrupts */
 | |
| +		return;
 | |
| +	}
 | |
|  
 | |
|  	desc->irq_count = 0;
 | |
| -	if (unlikely(desc->irqs_unhandled > 99900)) {
 | |
| +	if (irq_poll_and_retry)
 | |
| +		unhandled_thresh = 9;
 | |
| +
 | |
| +	if (unlikely(desc->irqs_unhandled >= unhandled_thresh)) {
 | |
|  		/*
 | |
| -		 * The interrupt is stuck
 | |
| +		 * The interrupt might be stuck
 | |
|  		 */
 | |
| -		__report_bad_irq(irq, desc, action_ret);
 | |
| +		if (!irq_poll_and_retry) {
 | |
| +			__report_bad_irq(irq, desc, action_ret);
 | |
| +			printk(KERN_EMERG "Disabling IRQ %d\n", irq);
 | |
| +		} else {
 | |
| +			printk(KERN_INFO "IRQ %d might be stuck.  Polling\n",
 | |
| +				irq);
 | |
| +		}
 | |
|  		/*
 | |
|  		 * Now kill the IRQ
 | |
|  		 */
 | |
| -		printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
 | |
|  		desc->istate |= IRQS_SPURIOUS_DISABLED;
 | |
|  		desc->depth++;
 | |
|  		irq_disable(desc);
 | |
| --- linux-2.6.orig/drivers/pci/quirks.c
 | |
| +++ linux-2.6/drivers/pci/quirks.c
 | |
| @@ -1677,6 +1677,22 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_IN
 | |
|  DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	0x260a, quirk_intel_pcie_pm);
 | |
|  DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL,	0x260b, quirk_intel_pcie_pm);
 | |
|  
 | |
| +/* ASM108x transparent PCI bridges apparently have broken IRQ deassert
 | |
| + * handling.  This causes interrupts to get "stuck" and eventually disabled.
 | |
| + * However, the interrupts are often shared and disabling them is fairly bad.
 | |
| + * It's been somewhat successful to switch to polling mode and retry after
 | |
| + * a bit, so let's do that.
 | |
| + */
 | |
| +extern int irq_poll_and_retry;	/* defined in kernel/irq/spurious.c */
 | |
| +static void quirk_asm108x_poll_interrupts(struct pci_dev *dev)
 | |
| +{
 | |
| +	dev_info(&dev->dev, "Buggy bridge found [%04x:%04x]\n",
 | |
| +		dev->vendor, dev->device);
 | |
| +	dev_info(&dev->dev, "Stuck interrupts will be polled and retried\n");
 | |
| +	irq_poll_and_retry = 1;	/* global flag: the spurious-IRQ code checks it for every IRQ, not only those behind this bridge */
 | |
| +}
 | |
| +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ASMEDIA,	0x1080,	quirk_asm108x_poll_interrupts);
 | |
| +
 | |
|  #ifdef CONFIG_X86_IO_APIC
 | |
|  /*
 | |
|   * Boot interrupts on some chipsets cannot be turned off. For these chipsets,
 |