25313f2bcf
Should look at making the frequency of checking dynamic, but this will help the immediate problem.
257 lines
11 KiB
Diff
257 lines
11 KiB
Diff
From davej Mon Jan 30 16:40:11 2012
|
|
Return-Path: linux-kernel-owner@vger.kernel.org
|
|
X-Spam-Checker-Version: SpamAssassin 3.3.2 (2011-06-06) on
|
|
gelk.kernelslacker.org
|
|
X-Spam-Level:
|
|
X-Spam-Status: No, score=-4.9 required=5.0 tests=DKIM_ADSP_CUSTOM_MED,
|
|
DKIM_SIGNED,FREEMAIL_FROM,RCVD_IN_DNSWL_HI,T_DKIM_INVALID,T_RP_MATCHES_RCVD,
|
|
T_TO_NO_BRKTS_FREEMAIL,UNPARSEABLE_RELAY autolearn=unavailable version=3.3.2
|
|
Received: from mail.corp.redhat.com [10.5.5.52]
|
|
by gelk.kernelslacker.org with IMAP (fetchmail-6.3.20)
|
|
for <davej@localhost> (single-drop); Mon, 30 Jan 2012 16:40:11 -0500 (EST)
|
|
Received: from zmta01.collab.prod.int.phx2.redhat.com (LHLO
|
|
zmta01.collab.prod.int.phx2.redhat.com) (10.5.5.31) by
|
|
zmail11.collab.prod.int.phx2.redhat.com with LMTP; Mon, 30 Jan 2012
|
|
16:37:45 -0500 (EST)
|
|
Received: from localhost (localhost.localdomain [127.0.0.1])
|
|
by zmta01.collab.prod.int.phx2.redhat.com (Postfix) with ESMTP id 4BAD0114054;
|
|
Mon, 30 Jan 2012 16:37:45 -0500 (EST)
|
|
X-Quarantine-ID: <1529X45BXJfc>
|
|
Authentication-Results: zmta01.collab.prod.int.phx2.redhat.com (amavisd-new);
|
|
dkim=softfail (fail, body has been altered) header.i=@gmail.com
|
|
Received: from zmta01.collab.prod.int.phx2.redhat.com ([127.0.0.1])
|
|
by localhost (zmta01.collab.prod.int.phx2.redhat.com [127.0.0.1]) (amavisd-new, port 10024)
|
|
with ESMTP id 1529X45BXJfc; Mon, 30 Jan 2012 16:37:45 -0500 (EST)
|
|
Received: from int-mx02.intmail.prod.int.phx2.redhat.com (int-mx02.intmail.prod.int.phx2.redhat.com [10.5.11.12])
|
|
by zmta01.collab.prod.int.phx2.redhat.com (Postfix) with ESMTP id 717AC11404F;
|
|
Mon, 30 Jan 2012 16:37:44 -0500 (EST)
|
|
Received: from mx1.redhat.com (ext-mx14.extmail.prod.ext.phx2.redhat.com [10.5.110.19])
|
|
by int-mx02.intmail.prod.int.phx2.redhat.com (8.13.8/8.13.8) with ESMTP id q0ULbhea005095;
|
|
Mon, 30 Jan 2012 16:37:43 -0500
|
|
Received: from vger.kernel.org (vger.kernel.org [209.132.180.67])
|
|
by mx1.redhat.com (8.14.4/8.14.4) with ESMTP id q0ULQIU9002068;
|
|
Mon, 30 Jan 2012 16:37:42 -0500
|
|
Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand
|
|
id S1753551Ab2A3Vha (ORCPT <rfc822;lcapitulino@redhat.com>
|
|
+ 52 others); Mon, 30 Jan 2012 16:37:30 -0500
|
|
Received: from mail-pz0-f46.google.com ([209.85.210.46]:44901 "EHLO
|
|
mail-pz0-f46.google.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org
|
|
with ESMTP id S1751932Ab2A3Vh2 (ORCPT
|
|
<rfc822;linux-kernel@vger.kernel.org>);
|
|
Mon, 30 Jan 2012 16:37:28 -0500
|
|
Received: by dadi2 with SMTP id i2so3911730dad.19
|
|
for <linux-kernel@vger.kernel.org>; Mon, 30 Jan 2012 13:37:28 -0800 (PST)
|
|
Received-SPF: pass (google.com: domain of jeroen.vandenkeybus@gmail.com designates 10.68.218.68 as permitted sender) client-ip=10.68.218.68;
|
|
Authentication-Results: mr.google.com; spf=pass (google.com: domain of jeroen.vandenkeybus@gmail.com designates 10.68.218.68 as permitted sender) smtp.mail=jeroen.vandenkeybus@gmail.com; dkim=pass header.i=jeroen.vandenkeybus@gmail.com
|
|
Received: from mr.google.com ([10.68.218.68])
|
|
by 10.68.218.68 with SMTP id pe4mr25063612pbc.97.1327959448228 (num_hops = 1);
|
|
Mon, 30 Jan 2012 13:37:28 -0800 (PST)
|
|
DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed;
|
|
d=gmail.com; s=gamma;
|
|
h=mime-version:date:message-id:subject:from:to:cc:content-type;
|
|
bh=acUEKJRLazlNr7PWqoJIHm/MPkfhI5SUq1Z0ntfqXSE=;
|
|
b=J1hmytKDfluL7NI73mVH+flbQ2+36FPRRx+DFhrPs8hiOebxJHysZZH+etW1ppCJG0
|
|
ORowrKZYuyXb1CVYkSAYSnZ60r0edu8VycE4wsVItKQV8f0ZFyFZi5HteL1KiBRHqTYI
|
|
soeRaI/zW4cJv3AbTTc1Aj/4/HXKyuPtj0Ayc=
|
|
MIME-Version: 1.0
|
|
Received: by 10.68.218.68 with SMTP id pe4mr20868027pbc.97.1327959448085; Mon,
|
|
30 Jan 2012 13:37:28 -0800 (PST)
|
|
Received: by 10.143.38.11 with HTTP; Mon, 30 Jan 2012 13:37:28 -0800 (PST)
|
|
Date: Mon, 30 Jan 2012 22:37:28 +0100
|
|
Message-ID: <CAPRPZsAt+e3cy1YTriikpb2SNN=jOusvnPF0ByFeun+uaBa5Og@mail.gmail.com>
|
|
Subject: [PATCH] Unhandled IRQs on AMD E-450: temporarily switch to
|
|
low-performance polling IRQ mode
|
|
From: Jeroen Van den Keybus <jeroen.vandenkeybus@gmail.com>
|
|
To: linux-kernel@vger.kernel.org
|
|
Cc: Clemens Ladisch <clemens@ladisch.de>, "Huang, Shane" <Shane.Huang@amd.com>,
|
|
Borislav Petkov <bp@amd64.org>, "Nguyen, Dong" <Dong.Nguyen@amd.com>,
|
|
jesse.brandeburg@gmail.com
|
|
Content-Type: text/plain; charset=ISO-8859-1
|
|
Sender: linux-kernel-owner@vger.kernel.org
|
|
Precedence: bulk
|
|
List-ID: <linux-kernel.vger.kernel.org>
|
|
X-Mailing-List: linux-kernel@vger.kernel.org
|
|
X-RedHat-Spam-Score: -4.898 (DKIM_ADSP_CUSTOM_MED,DKIM_SIGNED,FREEMAIL_FROM,RCVD_IN_DNSWL_HI,T_DKIM_INVALID,T_RP_MATCHES_RCVD)
|
|
X-Scanned-By: MIMEDefang 2.67 on 10.5.11.12
|
|
X-Scanned-By: MIMEDefang 2.68 on 10.5.110.19
|
|
Status: RO
|
|
Content-Length: 7029
|
|
Lines: 189
|
|
|
|
It seems that some motherboard designs using the ASM1083 PCI/PCIe
|
|
bridge (PCI device ID 1b21:1080, Rev. 01) suffer from stuck IRQ lines
|
|
on the PCI bus (causing the kernel to emit 'IRQxx: nobody cared' and
|
|
disable the IRQ). The following patch is an attempt to mitigate the
|
|
serious impact of permanently disabling an IRQ in that case and
|
|
actually make PCI devices more usable on this platform.
|
|
|
|
It seems that the bridge fails to issue an IRQ deassertion message on
|
|
the PCIe bus, when the relevant driver causes the interrupting PCI
|
|
device to deassert its IRQ line. To solve this issue, we tried to
|
|
re-issue an IRQ on a PCI device capable of doing so (e1000 in this
|
|
case), but we suspect that the attempt to re-assert/deassert may have
|
|
occurred too soon after the initial IRQ for the ASM1083. Anyway, it
|
|
didn't work but if, after some delay, a new IRQ occurred, the related
|
|
IRQ deassertion message eventually did clear the IOAPIC IRQ. It would
|
|
be useful to re-enable the IRQ here.
|
|
|
|
Therefore the patch below to poll_spurious_irqs() in spurious.c is
|
|
proposed. It does the following:
|
|
|
|
1. lets the kernel decide that an IRQ is unhandled after only 10
|
|
positives (instead of 100,000);
|
|
2. briefly (a few seconds or so, currently 1 s) switches to polling
|
|
IRQ at a higher rate than usual (100..1,000Hz instead of 10Hz,
|
|
currently 100Hz), but not too high to avoid excessive CPU load. Any
|
|
device drivers 'see' their interrupts handled with a higher latency
|
|
than usual, but they will still operate properly;
|
|
3. afterwards, simply reenable the IRQ.
|
|
|
|
If proper operation of the PCIe legacy IRQ line emulation is restored
|
|
after 3, the system operates again at normal performance. If the IRQ
|
|
is still stuck after this procedure, the sequence repeats.
|
|
|
|
If a genuinely stuck IRQ is used with this solution, the system would
|
|
simply sustain short bursts of 10 unhandled IRQs per second, and use
|
|
polling mode indefinitely at a moderate 100Hz rate. It seemed a good
|
|
alternative to the default irqpoll behaviour to me, which is why I
|
|
left it in poll_spurious_irqs() (instead of creating a new kernel
|
|
option). Additionally, if any device happens to share an IRQ with a
|
|
faulty one, that device is no longer banned forever.
|
|
|
|
Debugging output is still present and may be removed. Bad IRQ
|
|
reporting is also commented out now.
|
|
|
|
I have now tried it for about 2 months and I can conclude the following:
|
|
|
|
1. The patch works and, judging from my Firewire card interrupt on
|
|
IRQ16, which repeats every 64 secs, I can confirm that the IRQ usually
|
|
gets reset when a new IRQ arrives (polling mode runs for 64 seconds
|
|
every time).
|
|
2. When testing a SiL-3114 SATA PCI card behind the ASM1083, I could
|
|
keep this running at fairly high speeds (50..70MB/s) for an hour or
|
|
so, but eventually the SiL driver crashed. In such conditions the PCI
|
|
system had to deal with a few hundred IRQs per second / polling mode
|
|
kicking in every 5..10 seconds.
|
|
|
|
I would like to thank Clemens Ladisch for his invaluable help in
|
|
finding a solution (and providing a patch to avoid my SATA going down
|
|
every time during debugging).
|
|
|
|
|
|
Signed-off-by: Jeroen Van den Keybus <jeroen.vandenkeybus@gmail.com>
|
|
|
|
Make it less chatty. Josh Boyer <jwboyer@redhat.com>
|
|
======
|
|
|
|
--- linux-2.6.orig/kernel/irq/spurious.c
|
|
+++ linux-2.6/kernel/irq/spurious.c
|
|
@@ -18,7 +18,7 @@
|
|
|
|
static int irqfixup __read_mostly;
|
|
|
|
-#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
|
|
+#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/100)
|
|
static void poll_spurious_irqs(unsigned long dummy);
|
|
static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
|
|
static int irq_poll_cpu;
|
|
@@ -141,14 +141,15 @@ out:
|
|
static void poll_spurious_irqs(unsigned long dummy)
|
|
{
|
|
struct irq_desc *desc;
|
|
- int i;
|
|
+ int i, poll_again;
|
|
|
|
if (atomic_inc_return(&irq_poll_active) != 1)
|
|
goto out;
|
|
irq_poll_cpu = smp_processor_id();
|
|
|
|
+ poll_again = 0; /* Will stay false as long as no polling candidate is found */
|
|
for_each_irq_desc(i, desc) {
|
|
- unsigned int state;
|
|
+ unsigned int state, irq;
|
|
|
|
if (!i)
|
|
continue;
|
|
@@ -159,14 +160,29 @@ static void poll_spurious_irqs(unsigned
|
|
if (!(state & IRQS_SPURIOUS_DISABLED))
|
|
continue;
|
|
|
|
- local_irq_disable();
|
|
- try_one_irq(i, desc, true);
|
|
- local_irq_enable();
|
|
+ /* We end up here with a disabled spurious interrupt.
|
|
+ desc->irqs_unhandled now tracks the number of times
|
|
+ the interrupt has been polled */
|
|
+
|
|
+ irq = desc->irq_data.irq;
|
|
+ if (desc->irqs_unhandled < 100) { /* 1 second delay with poll frequency 100 Hz */
|
|
+ local_irq_disable();
|
|
+ try_one_irq(i, desc, true);
|
|
+ local_irq_enable();
|
|
+ desc->irqs_unhandled++;
|
|
+ poll_again = 1;
|
|
+ } else {
|
|
+ irq_enable(desc); /* Reenable the interrupt line */
|
|
+ desc->depth--;
|
|
+ desc->istate &= (~IRQS_SPURIOUS_DISABLED);
|
|
+ desc->irqs_unhandled = 0;
|
|
+ }
|
|
}
|
|
+ if (poll_again)
|
|
+ mod_timer(&poll_spurious_irq_timer,
|
|
+ jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
|
|
out:
|
|
atomic_dec(&irq_poll_active);
|
|
- mod_timer(&poll_spurious_irq_timer,
|
|
- jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
|
|
}
|
|
|
|
static inline int bad_action_ret(irqreturn_t action_ret)
|
|
@@ -177,11 +193,19 @@ static inline int bad_action_ret(irqretu
|
|
}
|
|
|
|
/*
|
|
- * If 99,900 of the previous 100,000 interrupts have not been handled
|
|
+ * If 9 of the previous 10 interrupts have not been handled
|
|
* then assume that the IRQ is stuck in some manner. Drop a diagnostic
|
|
* and try to turn the IRQ off.
|
|
*
|
|
- * (The other 100-of-100,000 interrupts may have been a correctly
|
|
+ * Although this may cause early deactivation of a sporadically
|
|
+ * malfunctioning IRQ line, the poll system will:
|
|
+ * a) Poll it for 100 cycles at a 100 Hz rate
|
|
+ * b) Reenable it afterwards
|
|
+ *
|
|
+ * In worst case, with current settings, this will cause short bursts
|
|
+ * of 10 interrupts every second.
|
|
+ *
|
|
+ * (The other single interrupt may have been a correctly
|
|
* functioning device sharing an IRQ with the failing one)
|
|
*/
|
|
static void
|
|
@@ -302,19 +326,19 @@ void note_interrupt(unsigned int irq, st
|
|
}
|
|
|
|
desc->irq_count++;
|
|
- if (likely(desc->irq_count < 100000))
|
|
+ if (likely(desc->irq_count < 10))
|
|
return;
|
|
|
|
desc->irq_count = 0;
|
|
- if (unlikely(desc->irqs_unhandled > 99900)) {
|
|
+ if (unlikely(desc->irqs_unhandled >= 9)) {
|
|
/*
|
|
* The interrupt is stuck
|
|
*/
|
|
- __report_bad_irq(irq, desc, action_ret);
|
|
+ /* __report_bad_irq(irq, desc, action_ret); */
|
|
/*
|
|
* Now kill the IRQ
|
|
*/
|
|
- printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
|
|
+ printk(KERN_EMERG "Disabling IRQ %d\n", irq);
|
|
desc->istate |= IRQS_SPURIOUS_DISABLED;
|
|
desc->depth++;
|
|
irq_disable(desc);
|