diff --git a/Makefile.rhelver b/Makefile.rhelver
index d1eef56490..41b5607b08 100644
--- a/Makefile.rhelver
+++ b/Makefile.rhelver
@@ -12,7 +12,7 @@ RHEL_MINOR = 10
 #
 # Use this spot to avoid future merge conflicts.
 # Do not trim this comment.
-RHEL_RELEASE = 553.66.1
+RHEL_RELEASE = 553.69.1
 
 #
 # ZSTREAM
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 7f6620cf22..c2c7bebe30 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -1439,22 +1439,12 @@ __acquires(bitmap->lock)
 		&(bitmap->bp[page].map[pageoff]);
 }
 
-int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind)
+int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
+			 unsigned long sectors)
 {
 	if (!bitmap)
 		return 0;
 
-	if (behind) {
-		int bw;
-		atomic_inc(&bitmap->behind_writes);
-		bw = atomic_read(&bitmap->behind_writes);
-		if (bw > bitmap->behind_writes_used)
-			bitmap->behind_writes_used = bw;
-
-		pr_debug("inc write-behind count %d/%lu\n",
-			 bw, bitmap->mddev->bitmap_info.max_write_behind);
-	}
-
 	while (sectors) {
 		sector_t blocks;
 		bitmap_counter_t *bmc;
@@ -1504,17 +1494,10 @@ int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long s
 EXPORT_SYMBOL(md_bitmap_startwrite);
 
 void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
-			unsigned long sectors, int success, int behind)
+			unsigned long sectors)
 {
 	if (!bitmap)
 		return;
-	if (behind) {
-		if (atomic_dec_and_test(&bitmap->behind_writes))
-			wake_up(&bitmap->behind_wait);
-		pr_debug("dec write-behind count %d/%lu\n",
-			 atomic_read(&bitmap->behind_writes),
-			 bitmap->mddev->bitmap_info.max_write_behind);
-	}
 
 	while (sectors) {
 		sector_t blocks;
@@ -1528,15 +1511,16 @@ void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
 			return;
 		}
 
-		if (success && !bitmap->mddev->degraded &&
-		    bitmap->events_cleared < bitmap->mddev->events) {
-			bitmap->events_cleared = bitmap->mddev->events;
-			bitmap->need_sync = 1;
-			sysfs_notify_dirent_safe(bitmap->sysfs_can_clear);
-		}
-
-		if (!success && !NEEDED(*bmc))
+		if (!bitmap->mddev->degraded) {
+			if (bitmap->events_cleared < bitmap->mddev->events) {
+				bitmap->events_cleared = bitmap->mddev->events;
+				bitmap->need_sync = 1;
+				sysfs_notify_dirent_safe(
+						bitmap->sysfs_can_clear);
+			}
+		} else if (!NEEDED(*bmc)) {
 			*bmc |= NEEDED_MASK;
+		}
 
 		if (COUNTER(*bmc) == COUNTER_MAX)
 			wake_up(&bitmap->overflow_wait);
@@ -1816,6 +1800,39 @@ void md_bitmap_free(struct bitmap *bitmap)
 }
 EXPORT_SYMBOL(md_bitmap_free);
 
+void md_bitmap_start_behind_write(struct mddev *mddev)
+{
+	struct bitmap *bitmap = mddev->bitmap;
+	int bw;
+
+	if (!bitmap)
+		return;
+
+	atomic_inc(&bitmap->behind_writes);
+	bw = atomic_read(&bitmap->behind_writes);
+	if (bw > bitmap->behind_writes_used)
+		bitmap->behind_writes_used = bw;
+
+	pr_debug("inc write-behind count %d/%lu\n",
+		 bw, bitmap->mddev->bitmap_info.max_write_behind);
+}
+EXPORT_SYMBOL(md_bitmap_start_behind_write);
+
+void md_bitmap_end_behind_write(struct mddev *mddev)
+{
+	struct bitmap *bitmap = mddev->bitmap;
+
+	if (!bitmap)
+		return;
+
+	if (atomic_dec_and_test(&bitmap->behind_writes))
+		wake_up(&bitmap->behind_wait);
+	pr_debug("dec write-behind count %d/%lu\n",
+		 atomic_read(&bitmap->behind_writes),
+		 bitmap->mddev->bitmap_info.max_write_behind);
+}
+EXPORT_SYMBOL(md_bitmap_end_behind_write);
+
 void md_bitmap_wait_behind_writes(struct mddev *mddev)
 {
 	struct bitmap *bitmap = mddev->bitmap;
diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h
index 8a3788c9bf..9264c52fdd 100644
--- a/drivers/md/md-bitmap.h
+++ b/drivers/md/md-bitmap.h
@@ -252,9 +252,9 @@ void md_bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long
 
 /* these are exported */
 int md_bitmap_startwrite(struct bitmap *bitmap, sector_t offset,
-			 unsigned long sectors, int behind);
+			 unsigned long sectors);
 void md_bitmap_endwrite(struct bitmap *bitmap, sector_t offset,
-			unsigned long sectors, int success, int behind);
+			unsigned long sectors);
 int md_bitmap_start_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int degraded);
 void md_bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, int aborted);
 void md_bitmap_close_sync(struct bitmap *bitmap);
@@ -274,6 +274,8 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot,
 			     sector_t *lo, sector_t *hi, bool clear_bits);
 void md_bitmap_free(struct bitmap *bitmap);
 void md_bitmap_wait_behind_writes(struct mddev *mddev);
+void md_bitmap_start_behind_write(struct mddev *mddev);
+void md_bitmap_end_behind_write(struct mddev *mddev);
 
 static inline bool md_bitmap_enabled(struct bitmap *bitmap)
 {
diff --git a/drivers/md/md-linear.c b/drivers/md/md-linear.c
index 2462adea9b..682e67264d 100644
--- a/drivers/md/md-linear.c
+++ b/drivers/md/md-linear.c
@@ -279,6 +279,7 @@ static bool linear_make_request(struct mddev *mddev, struct bio *bio)
 		bio = split;
 	}
 
+	md_account_bio(mddev, &bio);
 	bio_set_dev(bio, tmp_dev->rdev->bdev);
 	bio->bi_iter.bi_sector = bio->bi_iter.bi_sector -
 		start_sector + data_offset;
diff --git a/drivers/md/md-multipath.c b/drivers/md/md-multipath.c
index 73259bcaf1..e24da4c08a 100644
--- a/drivers/md/md-multipath.c
+++ b/drivers/md/md-multipath.c
@@ -116,6 +116,7 @@ static bool multipath_make_request(struct mddev *mddev, struct bio * bio)
 	    && md_flush_request(mddev, bio))
 		return true;
 
+	md_account_bio(mddev, &bio);
 	mp_bh = mempool_alloc(&conf->pool, GFP_NOIO);
 
 	mp_bh->master_bio = bio;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d05a2ca0fc..8c71050495 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2367,7 +2367,7 @@ int md_integrity_register(struct mddev *mddev)
 	pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
 	if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE) ||
 	    (mddev->level != 1 && mddev->level != 10 &&
-	     bioset_integrity_create(&mddev->io_acct_set, BIO_POOL_SIZE))) {
+	     bioset_integrity_create(&mddev->io_clone_set, BIO_POOL_SIZE))) {
 		/*
 		 * No need to handle the failure of bioset_integrity_create,
 		 * because the function is called by md_run() -> pers->run(),
@@ -5984,9 +5984,9 @@ int md_run(struct mddev *mddev)
 			goto exit_bio_set;
 	}
 
-	if (!bioset_initialized(&mddev->io_acct_set)) {
-		err = bioset_init(&mddev->io_acct_set, BIO_POOL_SIZE,
-				  offsetof(struct md_io_acct, bio_clone), 0);
+	if (!bioset_initialized(&mddev->io_clone_set)) {
+		err = bioset_init(&mddev->io_clone_set, BIO_POOL_SIZE,
+				  offsetof(struct md_io_clone, bio_clone), 0);
 		if (err)
 			goto exit_sync_set;
 	}
@@ -6172,7 +6172,7 @@ bitmap_abort:
 	module_put(pers->owner);
 	md_bitmap_destroy(mddev);
 abort:
-	bioset_exit(&mddev->io_acct_set);
+	bioset_exit(&mddev->io_clone_set);
 exit_sync_set:
 	bioset_exit(&mddev->sync_set);
 exit_bio_set:
@@ -6398,7 +6398,7 @@ static void __md_stop(struct mddev *mddev)
 	percpu_ref_exit(&mddev->active_io);
 	bioset_exit(&mddev->bio_set);
 	bioset_exit(&mddev->sync_set);
-	bioset_exit(&mddev->io_acct_set);
+	bioset_exit(&mddev->io_clone_set);
 }
 
 void md_stop(struct mddev *mddev)
@@ -8768,44 +8768,69 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
 }
 EXPORT_SYMBOL_GPL(md_submit_discard_bio);
 
-static void md_end_io_acct(struct bio *bio)
+static void md_bitmap_start(struct mddev *mddev,
+			    struct md_io_clone *md_io_clone)
 {
-	struct md_io_acct *md_io_acct = bio->bi_private;
-	struct bio *orig_bio = md_io_acct->orig_bio;
-	struct mddev *mddev = md_io_acct->mddev;
+	if (mddev->pers->bitmap_sector)
+		mddev->pers->bitmap_sector(mddev, &md_io_clone->offset,
+					   &md_io_clone->sectors);
+
+	md_bitmap_startwrite(mddev->bitmap, md_io_clone->offset,
+			     md_io_clone->sectors);
+}
+
+static void md_bitmap_end(struct mddev *mddev, struct md_io_clone *md_io_clone)
+{
+	md_bitmap_endwrite(mddev->bitmap, md_io_clone->offset,
+			   md_io_clone->sectors);
+}
+
+static void md_end_clone_io(struct bio *bio)
+{
+	struct md_io_clone *md_io_clone = bio->bi_private;
+	struct bio *orig_bio = md_io_clone->orig_bio;
+	struct mddev *mddev = md_io_clone->mddev;
+
+	if (bio_data_dir(orig_bio) == WRITE && mddev->bitmap)
+		md_bitmap_end(mddev, md_io_clone);
 
 	orig_bio->bi_status = bio->bi_status;
-	bio_end_io_acct(orig_bio, md_io_acct->start_time);
+	if (md_io_clone->start_time)
+		bio_end_io_acct(orig_bio, md_io_clone->start_time);
+
 	bio_put(bio);
 	bio_endio(orig_bio);
-	percpu_ref_put(&mddev->active_io);
 }
 
-/*
- * Used by personalities that don't already clone the bio and thus can't
- * easily add the timestamp to their extended bio structure.
- */
+static void md_clone_bio(struct mddev *mddev, struct bio **bio)
+{
+	struct md_io_clone *md_io_clone;
+	struct bio *clone =
+		bio_clone_fast(*bio, GFP_NOIO, &mddev->io_clone_set);
+
+	md_io_clone = container_of(clone, struct md_io_clone, bio_clone);
+	md_io_clone->orig_bio = *bio;
+	md_io_clone->mddev = mddev;
+	if (blk_queue_io_stat((*bio)->bi_disk->queue))
+		md_io_clone->start_time = bio_start_io_acct(*bio);
+
+	if (bio_data_dir(*bio) == WRITE && mddev->bitmap) {
+		md_io_clone->offset = (*bio)->bi_iter.bi_sector;
+		md_io_clone->sectors = bio_sectors(*bio);
+		md_bitmap_start(mddev, md_io_clone);
+	}
+
+	clone->bi_end_io = md_end_clone_io;
+	clone->bi_private = md_io_clone;
+	*bio = clone;
+}
+
 void md_account_bio(struct mddev *mddev, struct bio **bio)
 {
-	struct md_io_acct *md_io_acct;
-	struct bio *clone;
-
-	if (!blk_queue_io_stat((*bio)->bi_disk->queue))
-		return;
-	percpu_ref_get(&mddev->active_io);
-
-	clone = bio_clone_fast(*bio, GFP_NOIO, &mddev->io_acct_set);
-	md_io_acct = container_of(clone, struct md_io_acct, bio_clone);
-	md_io_acct->orig_bio = *bio;
-	md_io_acct->start_time = bio_start_io_acct(*bio);
-	md_io_acct->mddev = mddev;
-
-	clone->bi_end_io = md_end_io_acct;
-	clone->bi_private = md_io_acct;
-	*bio = clone;
+	md_clone_bio(mddev, bio);
 }
 EXPORT_SYMBOL_GPL(md_account_bio);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 40ddb28bd4..85ce058c49 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -517,7 +517,7 @@ struct mddev {
 	struct bio_set			sync_set; /* for sync operations like
 						   * metadata and bitmap writes
 						   */
-	struct bio_set			io_acct_set; /* for raid0 and raid5 io accounting */
+	struct bio_set			io_clone_set;
 
 	/* Generic flush handling.
 	 * The last to finish preflush schedules a worker to submit
@@ -662,6 +662,9 @@ struct md_personality
 	int (*congested)(struct mddev *mddev, int bits);
 	/* Changes the consistency policy of an active array. */
 	int (*change_consistency_policy)(struct mddev *mddev, const char *buf);
+	/* convert io ranges from array to bitmap */
+	void (*bitmap_sector)(struct mddev *mddev, sector_t *offset,
+			      unsigned long *sectors);
 };
 
 struct md_sysfs_entry {
@@ -740,10 +743,12 @@ struct md_thread {
 	void *private;
 };
 
-struct md_io_acct {
+struct md_io_clone {
 	struct mddev	*mddev;
 	struct bio	*orig_bio;
 	unsigned long	start_time;
+	sector_t	offset;
+	unsigned long	sectors;
 	struct bio	bio_clone;
 };
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index be9f5fe29b..e4cd9a86ac 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -308,8 +308,6 @@ static void call_bio_endio(struct r1bio *r1_bio)
 	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
 		bio->bi_status = BLK_STS_IOERR;
 
-	if (blk_queue_io_stat(bio->bi_disk->queue))
-		bio_end_io_acct(bio, r1_bio->start_time);
 	bio_endio(bio);
 }
 
@@ -426,11 +424,8 @@ static void close_write(struct r1bio *r1_bio)
 		bio_put(r1_bio->behind_master_bio);
 		r1_bio->behind_master_bio = NULL;
 	}
-	/* clear the bitmap if all writes complete successfully */
-	md_bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
-			   r1_bio->sectors,
-			   !test_bit(R1BIO_Degraded, &r1_bio->state),
-			   test_bit(R1BIO_BehindIO, &r1_bio->state));
+	if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+		md_bitmap_end_behind_write(r1_bio->mddev);
 	md_write_end(r1_bio->mddev);
 }
 
@@ -487,8 +482,6 @@ static void raid1_end_write_request(struct bio *bio)
 		if (!test_bit(Faulty, &rdev->flags))
 			set_bit(R1BIO_WriteError, &r1_bio->state);
 		else {
-			/* Fail the request */
-			set_bit(R1BIO_Degraded, &r1_bio->state);
 			/* Finished with this branch */
 			r1_bio->bios[mirror] = NULL;
 			to_put = bio;
@@ -1342,10 +1335,10 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
 	}
 
 	r1_bio->read_disk = rdisk;
-
-	if (!r1bio_existed && blk_queue_io_stat(bio->bi_disk->queue))
-		r1_bio->start_time = bio_start_io_acct(bio);
-
+	if (!r1bio_existed) {
+		md_account_bio(mddev, &bio);
+		r1_bio->master_bio = bio;
+	}
 	read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
 
 	r1_bio->bios[rdisk] = read_bio;
@@ -1454,11 +1447,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			break;
 		}
 		r1_bio->bios[i] = NULL;
-		if (!rdev || test_bit(Faulty, &rdev->flags)) {
-			if (i < conf->raid_disks)
-				set_bit(R1BIO_Degraded, &r1_bio->state);
+		if (!rdev || test_bit(Faulty, &rdev->flags))
 			continue;
-		}
 
 		atomic_inc(&rdev->nr_pending);
 		if (test_bit(WriteErrorSeen, &rdev->flags)) {
@@ -1484,16 +1474,6 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 				 */
 				max_sectors = bad_sectors;
 				rdev_dec_pending(rdev, mddev);
-				/* We don't set R1BIO_Degraded as that
-				 * only applies if the disk is
-				 * missing, so it might be re-added,
-				 * and we want to know to recover this
-				 * chunk.
-				 * In this case the device is here,
-				 * and the fact that this chunk is not
-				 * in-sync is recorded in the bad
-				 * block log
-				 */
 				continue;
 			}
 			if (is_bad) {
@@ -1545,8 +1525,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 		r1_bio->sectors = max_sectors;
 	}
 
-	if (blk_queue_io_stat(bio->bi_disk->queue))
-		r1_bio->start_time = bio_start_io_acct(bio);
+	md_account_bio(mddev, &bio);
+	r1_bio->master_bio = bio;
 
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
@@ -1570,8 +1550,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
 			alloc_behind_master_bio(r1_bio, bio);
 		}
 
-		md_bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors,
-				     test_bit(R1BIO_BehindIO, &r1_bio->state));
+		if (test_bit(R1BIO_BehindIO, &r1_bio->state))
+			md_bitmap_start_behind_write(mddev);
 		first_clone = 0;
 	}
 
@@ -2525,12 +2505,10 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 			 * errors.
 			 */
 			fail = true;
-			if (!narrow_write_error(r1_bio, m)) {
+			if (!narrow_write_error(r1_bio, m))
 				md_error(conf->mddev,
 					 conf->mirrors[m].rdev);
 				/* an I/O failed, we can't clear the bitmap */
-				set_bit(R1BIO_Degraded, &r1_bio->state);
-			}
 			rdev_dec_pending(conf->mirrors[m].rdev,
 					 conf->mddev);
 
@@ -2622,8 +2600,6 @@ static void raid1d(struct md_thread *thread)
 			list_del(&r1_bio->retry_list);
 			idx = sector_to_idx(r1_bio->sector);
 			atomic_dec(&conf->nr_queued[idx]);
-			if (mddev->degraded)
-				set_bit(R1BIO_Degraded, &r1_bio->state);
 			if (test_bit(R1BIO_WriteError, &r1_bio->state))
 				close_write(r1_bio);
 			raid_end_bio_io(r1_bio);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index ccf10e59b1..e1f3cb73fa 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -158,7 +158,6 @@ struct r1bio {
 	sector_t		sector;
 	int			sectors;
 	unsigned long		state;
-	unsigned long		start_time;
 	struct mddev		*mddev;
 	/*
 	 * original bio going to /dev/mdx
@@ -189,7 +188,6 @@ enum r1bio_state {
 	R1BIO_Uptodate,
 	R1BIO_IsSync,
-	R1BIO_Degraded,
 	R1BIO_BehindIO,
 /* Set ReadError on bios that experience a readerror so that
  * raid1d knows what to do with them.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index b1a170bf1f..56d0246c18 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -327,8 +327,6 @@ static void raid_end_bio_io(struct r10bio *r10_bio)
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
 		bio->bi_status = BLK_STS_IOERR;
 
-	if (blk_queue_io_stat(bio->bi_disk->queue))
-		bio_end_io_acct(bio, r10_bio->start_time);
 	bio_endio(bio);
 	/*
 	 * Wake up any possible resync thread that waits for the device
@@ -432,11 +430,6 @@ static void raid10_end_read_request(struct bio *bio)
 
 static void close_write(struct r10bio *r10_bio)
 {
-	/* clear the bitmap if all writes complete successfully */
-	md_bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
-			   r10_bio->sectors,
-			   !test_bit(R10BIO_Degraded, &r10_bio->state),
-			   0);
 	md_write_end(r10_bio->mddev);
 }
 
@@ -506,7 +499,6 @@ static void raid10_end_write_request(struct bio *bio)
 				set_bit(R10BIO_WriteError, &r10_bio->state);
 			else {
 				/* Fail the request */
-				set_bit(R10BIO_Degraded, &r10_bio->state);
 				r10_bio->devs[slot].bio = NULL;
 				to_put = bio;
 				dec_rdev = 1;
@@ -1206,7 +1198,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
 }
 
 static void raid10_read_request(struct mddev *mddev, struct bio *bio,
-				struct r10bio *r10_bio)
+				struct r10bio *r10_bio, bool io_accounting)
 {
 	struct r10conf *conf = mddev->private;
 	struct bio *read_bio;
@@ -1277,8 +1269,10 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
 	}
 	slot = r10_bio->read_slot;
 
-	if (blk_queue_io_stat(bio->bi_disk->queue))
-		r10_bio->start_time = bio_start_io_acct(bio);
+	if (io_accounting) {
+		md_account_bio(mddev, &bio);
+		r10_bio->master_bio = bio;
+	}
 	read_bio = bio_clone_fast(bio, gfp, &mddev->bio_set);
 
 	r10_bio->devs[slot].bio = read_bio;
@@ -1526,10 +1520,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		r10_bio->devs[i].bio = NULL;
 		r10_bio->devs[i].repl_bio = NULL;
 
-		if (!rdev && !rrdev) {
-			set_bit(R10BIO_Degraded, &r10_bio->state);
+		if (!rdev && !rrdev)
 			continue;
-		}
 		if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
 			sector_t first_bad;
 			sector_t dev_sector = r10_bio->devs[i].addr;
@@ -1546,14 +1538,6 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 				 * to other devices yet
 				 */
 				max_sectors = bad_sectors;
-				/* We don't set R10BIO_Degraded as that
-				 * only applies if the disk is missing,
-				 * so it might be re-added, and we want to
-				 * know to recover this chunk.
-				 * In this case the device is here, and the
-				 * fact that this chunk is not in-sync is
-				 * recorded in the bad block log.
-				 */
 				continue;
 			}
 			if (is_bad) {
@@ -1587,10 +1571,9 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
 		r10_bio->master_bio = bio;
 	}
 
-	if (blk_queue_io_stat(bio->bi_disk->queue))
-		r10_bio->start_time = bio_start_io_acct(bio);
+	md_account_bio(mddev, &bio);
+	r10_bio->master_bio = bio;
 	atomic_set(&r10_bio->remaining, 1);
-	md_bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
 
 	for (i = 0; i < conf->copies; i++) {
 		if (r10_bio->devs[i].bio)
@@ -1619,7 +1602,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
 		    conf->geo.raid_disks);
 
 	if (bio_data_dir(bio) == READ)
-		raid10_read_request(mddev, bio, r10_bio);
+		raid10_read_request(mddev, bio, r10_bio, true);
 	else
 		raid10_write_request(mddev, bio, r10_bio);
 }
@@ -3040,7 +3023,7 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 
 	rdev_dec_pending(rdev, mddev);
 	r10_bio->state = 0;
-	raid10_read_request(mddev, r10_bio->master_bio, r10_bio);
+	raid10_read_request(mddev, r10_bio->master_bio, r10_bio, false);
 	/*
 	 * allow_barrier after re-submit to ensure no sync io
 	 * can be issued while regular io pending.
@@ -3112,11 +3095,8 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				rdev_dec_pending(rdev, conf->mddev);
 			} else if (bio != NULL && bio->bi_status) {
 				fail = true;
-				if (!narrow_write_error(r10_bio, m)) {
+				if (!narrow_write_error(r10_bio, m))
 					md_error(conf->mddev, rdev);
-					set_bit(R10BIO_Degraded,
-						&r10_bio->state);
-				}
 				rdev_dec_pending(rdev, conf->mddev);
 			}
 			bio = r10_bio->devs[m].repl_bio;
@@ -3175,8 +3155,6 @@ static void raid10d(struct md_thread *thread)
 			r10_bio = list_first_entry(&tmp, struct r10bio,
 						   retry_list);
 			list_del(&r10_bio->retry_list);
-			if (mddev->degraded)
-				set_bit(R10BIO_Degraded, &r10_bio->state);
 
 			if (test_bit(R10BIO_WriteError,
 				     &r10_bio->state))
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index e9a0efc5a8..5fb8b984c7 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -124,7 +124,6 @@ struct r10bio {
 	sector_t		sector;	/* virtual sector number */
 	int			sectors;
 	unsigned long		state;
-	unsigned long		start_time;
 	struct mddev		*mddev;
 	/*
 	 * original bio going to /dev/mdx
@@ -163,7 +162,6 @@ enum r10bio_state {
 	R10BIO_IsSync,
 	R10BIO_IsRecover,
 	R10BIO_IsReshape,
-	R10BIO_Degraded,
 /* Set ReadError on bios that experience a read error
  * so that raid10d knows what to do with them.
  */
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index b96cdbb7b1..d2bf4154c7 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -322,10 +322,6 @@ void r5c_handle_cached_data_endio(struct r5conf *conf,
 		if (sh->dev[i].written) {
 			set_bit(R5_UPTODATE, &sh->dev[i].flags);
 			r5c_return_dev_pending_writes(conf, &sh->dev[i]);
-			md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-					   RAID5_STRIPE_SECTORS(conf),
-					   !test_bit(STRIPE_DEGRADED, &sh->state),
-					   0);
 		}
 	}
 }
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a8595925e1..45e187821f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -915,8 +915,7 @@ static bool stripe_can_batch(struct stripe_head *sh)
 	if (raid5_has_log(conf) || raid5_has_ppl(conf))
 		return false;
 	return test_bit(STRIPE_BATCH_READY, &sh->state) &&
-		!test_bit(STRIPE_BITMAP_PENDING, &sh->state) &&
-		is_full_stripe_write(sh);
+	       is_full_stripe_write(sh);
 }
 
 /* we only do back search */
@@ -1372,8 +1371,6 @@ again:
 			generic_make_request(rbi);
 		}
 		if (!rdev && !rrdev) {
-			if (op_is_write(op))
-				set_bit(STRIPE_DEGRADED, &sh->state);
 			pr_debug("skip op %d on disc %d for sector %llu\n",
 				 bi->bi_opf, i, (unsigned long long)sh->sector);
 			clear_bit(R5_LOCKED, &sh->dev[i].flags);
@@ -2949,7 +2946,6 @@ static void raid5_end_write_request(struct bio *bi)
 			set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
 	} else {
 		if (bi->bi_status) {
-			set_bit(STRIPE_DEGRADED, &sh->state);
 			set_bit(WriteErrorSeen, &rdev->flags);
 			set_bit(R5_WriteError, &sh->dev[i].flags);
 			if (!test_and_set_bit(WantReplacement, &rdev->flags))
@@ -3616,29 +3612,9 @@ static void __add_stripe_bio(struct stripe_head *sh, struct bio *bi,
 		 (*bip)->bi_iter.bi_sector, sh->sector, dd_idx,
 		 sh->dev[dd_idx].sector);
 
-	if (conf->mddev->bitmap && firstwrite) {
-		/* Cannot hold spinlock over bitmap_startwrite,
-		 * but must ensure this isn't added to a batch until
-		 * we have added to the bitmap and set bm_seq.
-		 * So set STRIPE_BITMAP_PENDING to prevent
-		 * batching.
-		 * If multiple __add_stripe_bio() calls race here they
-		 * much all set STRIPE_BITMAP_PENDING. So only the first one
-		 * to complete "bitmap_startwrite" gets to set
-		 * STRIPE_BIT_DELAY. This is important as once a stripe
-		 * is added to a batch, STRIPE_BIT_DELAY cannot be changed
-		 * any more.
-		 */
-		set_bit(STRIPE_BITMAP_PENDING, &sh->state);
-		spin_unlock_irq(&sh->stripe_lock);
-		md_bitmap_startwrite(conf->mddev->bitmap, sh->sector,
-				     RAID5_STRIPE_SECTORS(conf), 0);
-		spin_lock_irq(&sh->stripe_lock);
-		clear_bit(STRIPE_BITMAP_PENDING, &sh->state);
-		if (!sh->batch_head) {
-			sh->bm_seq = conf->seq_flush+1;
-			set_bit(STRIPE_BIT_DELAY, &sh->state);
-		}
+	if (conf->mddev->bitmap && firstwrite && !sh->batch_head) {
+		sh->bm_seq = conf->seq_flush+1;
+		set_bit(STRIPE_BIT_DELAY, &sh->state);
 	}
 }
 
@@ -3689,7 +3665,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 	BUG_ON(sh->batch_head);
 	for (i = disks; i--; ) {
 		struct bio *bi;
-		int bitmap_end = 0;
 
 		if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
 			struct md_rdev *rdev;
@@ -3716,8 +3691,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 		sh->dev[i].towrite = NULL;
 		sh->overwrite_disks = 0;
 		spin_unlock_irq(&sh->stripe_lock);
-		if (bi)
-			bitmap_end = 1;
 
 		log_stripe_write_finished(sh);
 
@@ -3732,10 +3705,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			bio_io_error(bi);
 			bi = nextbi;
 		}
-		if (bitmap_end)
-			md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-					   RAID5_STRIPE_SECTORS(conf), 0, 0);
-		bitmap_end = 0;
 		/* and fail all 'written' */
 		bi = sh->dev[i].written;
 		sh->dev[i].written = NULL;
@@ -3744,7 +3713,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			sh->dev[i].page = sh->dev[i].orig_page;
 		}
 
-		if (bi) bitmap_end = 1;
 		while (bi && bi->bi_iter.bi_sector <
 		       sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) {
 			struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector);
@@ -3778,9 +3746,6 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				bi = nextbi;
 			}
 		}
-		if (bitmap_end)
-			md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-					   RAID5_STRIPE_SECTORS(conf), 0, 0);
 		/* If we were in the middle of a write the parity block might
 		 * still be locked - so just clear all R5_LOCKED flags
 		 */
@@ -4131,10 +4096,6 @@ returnbi:
 					bio_endio(wbi);
 					wbi = wbi2;
 				}
-				md_bitmap_endwrite(conf->mddev->bitmap, sh->sector,
-						   RAID5_STRIPE_SECTORS(conf),
-						   !test_bit(STRIPE_DEGRADED, &sh->state),
-						   0);
 				if (head_sh->batch_head) {
 					sh = list_first_entry(&sh->batch_list,
 							      struct stripe_head,
@@ -4411,7 +4372,6 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh,
 			s->locked++;
 			set_bit(R5_Wantwrite, &dev->flags);
 
-			clear_bit(STRIPE_DEGRADED, &sh->state);
 			set_bit(STRIPE_INSYNC, &sh->state);
 			break;
 		case check_state_run:
@@ -4568,7 +4528,6 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh,
 				clear_bit(R5_Wantwrite, &dev->flags);
 				s->locked--;
 			}
-		clear_bit(STRIPE_DEGRADED, &sh->state);
 
 		set_bit(STRIPE_INSYNC, &sh->state);
 		break;
@@ -4968,8 +4927,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
 					  (1 << STRIPE_COMPUTE_RUN)  |
 					  (1 << STRIPE_DISCARD) |
 					  (1 << STRIPE_BATCH_READY) |
-					  (1 << STRIPE_BATCH_ERR) |
-					  (1 << STRIPE_BITMAP_PENDING)),
+					  (1 << STRIPE_BATCH_ERR)),
 			"stripe state: %lx\n", sh->state);
 		WARN_ONCE(head_sh->state & ((1 << STRIPE_DISCARD) |
 					    (1 << STRIPE_REPLACED)),
@@ -4977,7 +4935,6 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
 
 		set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS |
 					    (1 << STRIPE_PREREAD_ACTIVE) |
-					    (1 << STRIPE_DEGRADED) |
 					    (1 << STRIPE_ON_UNPLUG_LIST)),
 			      head_sh->state & (1 << STRIPE_INSYNC));
 
@@ -5522,26 +5479,17 @@ static struct bio *remove_bio_from_retry(struct r5conf *conf,
  */
 static void raid5_align_endio(struct bio *bi)
 {
-	struct md_io_acct *md_io_acct = bi->bi_private;
-	struct bio *raid_bi = md_io_acct->orig_bio;
-	struct mddev *mddev;
-	struct r5conf *conf;
-	struct md_rdev *rdev;
+	struct bio *raid_bi = bi->bi_private;
+	struct md_rdev *rdev = (void *)raid_bi->bi_next;
+	struct mddev *mddev = rdev->mddev;
+	struct r5conf *conf = mddev->private;
 	blk_status_t error = bi->bi_status;
-	unsigned long start_time = md_io_acct->start_time;
 
 	bio_put(bi);
-
-	rdev = (void*)raid_bi->bi_next;
 	raid_bi->bi_next = NULL;
-	mddev = rdev->mddev;
-	conf = mddev->private;
-
 	rdev_dec_pending(rdev, conf->mddev);
 
 	if (!error) {
-		if (blk_queue_io_stat(raid_bi->bi_disk->queue))
-			bio_end_io_acct(raid_bi, start_time);
 		bio_endio(raid_bi);
 		if (atomic_dec_and_test(&conf->active_aligned_reads))
 			wake_up(&conf->wait_for_quiescent);
@@ -5560,7 +5508,6 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 	struct md_rdev *rdev;
 	sector_t sector, end_sector, first_bad;
 	int bad_sectors, dd_idx;
-	struct md_io_acct *md_io_acct;
 	bool did_inc;
 
 	if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5597,17 +5544,14 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
 		return 0;
 	}
 
-	align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->io_acct_set);
-	md_io_acct = container_of(align_bio, struct md_io_acct, bio_clone);
+	md_account_bio(mddev, &raid_bio);
 	raid_bio->bi_next = (void *)rdev;
 
-	if (blk_queue_io_stat(raid_bio->bi_disk->queue))
-		md_io_acct->start_time = bio_start_io_acct(raid_bio);
-	md_io_acct->orig_bio = raid_bio;
-	bio_set_dev(align_bio, rdev->bdev);
+	align_bio = bio_clone_fast(raid_bio, GFP_NOIO, &mddev->io_clone_set);
 	align_bio->bi_end_io = raid5_align_endio;
-	align_bio->bi_private = md_io_acct;
+	align_bio->bi_private = raid_bio;
 	align_bio->bi_iter.bi_sector = sector;
+	bio_set_dev(align_bio, rdev->bdev);
 
 	/* No reshape active, so we can trust rdev->data_offset */
 	align_bio->bi_iter.bi_sector += rdev->data_offset;
@@ -5909,13 +5853,6 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi)
 		}
 		spin_unlock_irq(&sh->stripe_lock);
 		if (conf->mddev->bitmap) {
-			for (d = 0;
-			     d < conf->raid_disks - conf->max_degraded;
-			     d++)
-				md_bitmap_startwrite(mddev->bitmap,
-						     sh->sector,
-						     RAID5_STRIPE_SECTORS(conf),
-						     0);
 			sh->bm_seq = conf->seq_flush + 1;
 			set_bit(STRIPE_BIT_DELAY, &sh->state);
 		}
@@ -6033,6 +5970,87 @@ static bool reshape_disabled(struct mddev *mddev)
 	return is_md_suspended(mddev) || !md_is_rdwr(mddev);
 }
 
+enum reshape_loc {
+	LOC_NO_RESHAPE,
+	LOC_AHEAD_OF_RESHAPE,
+	LOC_INSIDE_RESHAPE,
+	LOC_BEHIND_RESHAPE,
+};
+
+static enum reshape_loc get_reshape_loc(struct mddev *mddev,
+		struct r5conf *conf, sector_t logical_sector)
+{
+	sector_t reshape_progress, reshape_safe;
+	/*
+	 * Spinlock is needed as reshape_progress may be
+	 * 64bit on a 32bit platform, and so it might be
+	 * possible to see a half-updated value
+	 * Of course reshape_progress could change after
+	 * the lock is dropped, so once we get a reference
+	 * to the stripe that we think it is, we will have
+	 * to check again.
+	 */
+	spin_lock_irq(&conf->device_lock);
+	reshape_progress = conf->reshape_progress;
+	reshape_safe = conf->reshape_safe;
+	spin_unlock_irq(&conf->device_lock);
+	if (reshape_progress == MaxSector)
+		return LOC_NO_RESHAPE;
+	if (ahead_of_reshape(mddev, logical_sector, reshape_progress))
+		return LOC_AHEAD_OF_RESHAPE;
+	if (ahead_of_reshape(mddev, logical_sector, reshape_safe))
+		return LOC_INSIDE_RESHAPE;
+	return LOC_BEHIND_RESHAPE;
+}
+
+static void raid5_bitmap_sector(struct mddev *mddev, sector_t *offset,
+				unsigned long *sectors)
+{
+	struct r5conf *conf = mddev->private;
+	sector_t start = *offset;
+	sector_t end = start + *sectors;
+	sector_t prev_start = start;
+	sector_t prev_end = end;
+	int sectors_per_chunk;
+	enum reshape_loc loc;
+	int dd_idx;
+
+	sectors_per_chunk = conf->chunk_sectors *
+		(conf->raid_disks - conf->max_degraded);
+	start = round_down(start, sectors_per_chunk);
+	end = round_up(end, sectors_per_chunk);
+
+	start = raid5_compute_sector(conf, start, 0, &dd_idx, NULL);
+	end = raid5_compute_sector(conf, end, 0, &dd_idx, NULL);
+
+	/*
+	 * For LOC_INSIDE_RESHAPE, this IO will wait for reshape to make
+	 * progress, hence it's the same as LOC_BEHIND_RESHAPE.
+	 */
+	loc = get_reshape_loc(mddev, conf, prev_start);
+	if (likely(loc != LOC_AHEAD_OF_RESHAPE)) {
+		*offset = start;
+		*sectors = end - start;
+		return;
+	}
+
+	sectors_per_chunk = conf->prev_chunk_sectors *
+		(conf->previous_raid_disks - conf->max_degraded);
+	prev_start = round_down(prev_start, sectors_per_chunk);
+	prev_end = round_down(prev_end, sectors_per_chunk);
+
+	prev_start = raid5_compute_sector(conf, prev_start, 1, &dd_idx, NULL);
+	prev_end = raid5_compute_sector(conf, prev_end, 1, &dd_idx, NULL);
+
+	/*
+	 * for LOC_AHEAD_OF_RESHAPE, reshape can make progress before this IO
+	 * is handled in make_stripe_request(), we can't know this here hence
+	 * we set bits for both.
+	 */
+	*offset = min(start, prev_start);
+	*sectors = max(end, prev_end) - *offset;
+}
+
 static enum stripe_result make_stripe_request(struct mddev *mddev,
 		struct r5conf *conf, struct stripe_request_ctx *ctx,
 		sector_t logical_sector, struct bio *bi)
@@ -6047,28 +6065,14 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
 	seq = read_seqcount_begin(&conf->gen_lock);
 
 	if (unlikely(conf->reshape_progress != MaxSector)) {
-		/*
-		 * Spinlock is needed as reshape_progress may be
-		 * 64bit on a 32bit platform, and so it might be
-		 * possible to see a half-updated value
-		 * Of course reshape_progress could change after
-		 * the lock is dropped, so once we get a reference
-		 * to the stripe that we think it is, we will have
-		 * to check again.
-		 */
-		spin_lock_irq(&conf->device_lock);
-		if (ahead_of_reshape(mddev, logical_sector,
-				     conf->reshape_progress)) {
-			previous = 1;
-		} else {
-			if (ahead_of_reshape(mddev, logical_sector,
-					     conf->reshape_safe)) {
-				spin_unlock_irq(&conf->device_lock);
-				ret = STRIPE_SCHEDULE_AND_RETRY;
-				goto out;
-			}
+		enum reshape_loc loc = get_reshape_loc(mddev, conf,
+						       logical_sector);
+		if (loc == LOC_INSIDE_RESHAPE) {
+			ret = STRIPE_SCHEDULE_AND_RETRY;
+			goto out;
 		}
-		spin_unlock_irq(&conf->device_lock);
+		if (loc == LOC_AHEAD_OF_RESHAPE)
+			previous = 1;
 	}
 
 	new_sector = raid5_compute_sector(conf, logical_sector, previous,
@@ -6250,8 +6254,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
 	/* Bail out if conflicts with reshape and REQ_NOWAIT is set */
 	if ((bi->bi_opf & REQ_NOWAIT) &&
 	    (conf->reshape_progress != MaxSector) &&
-	    !ahead_of_reshape(mddev, logical_sector, conf->reshape_progress) &&
-	    ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) {
+	    get_reshape_loc(mddev, conf, logical_sector) == LOC_INSIDE_RESHAPE) {
 		bio_wouldblock_error(bi);
 		if (rw == WRITE)
 			md_write_end(mddev);
@@ -9158,6 +9161,7 @@ static struct md_personality raid6_personality =
 	.takeover	= raid6_takeover,
 	.congested	= raid5_congested,
 	.change_consistency_policy = raid5_change_consistency_policy,
+	.bitmap_sector	= raid5_bitmap_sector,
 };
 
 static struct md_personality raid5_personality =
 {
@@ -9184,6 +9188,7 @@ static struct md_personality raid5_personality =
 	.takeover	= raid5_takeover,
 	.congested	= raid5_congested,
 	.change_consistency_policy = raid5_change_consistency_policy,
+	.bitmap_sector	= raid5_bitmap_sector,
 };
 
 static struct md_personality raid4_personality =
@@ -9211,6 +9216,7 @@ static struct md_personality raid4_personality =
 	.takeover	= raid4_takeover,
 	.congested	= raid5_congested,
 	.change_consistency_policy = raid5_change_consistency_policy,
+	.bitmap_sector	= raid5_bitmap_sector,
 };
 
 static int __init raid5_init(void)
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index daddd30534..c85963443a 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -358,7 +358,6 @@ enum {
 	STRIPE_REPLACED,
 	STRIPE_PREREAD_ACTIVE,
 	STRIPE_DELAYED,
-	STRIPE_DEGRADED,
 	STRIPE_BIT_DELAY,
 	STRIPE_EXPANDING,
 	STRIPE_EXPAND_SOURCE,
@@ -372,9 +371,6 @@ enum {
 	STRIPE_ON_RELEASE_LIST,
 	STRIPE_BATCH_READY,
 	STRIPE_BATCH_ERR,
-	STRIPE_BITMAP_PENDING,	/* Being added to bitmap, don't add
-				 * to batch yet.
-				 */
 	STRIPE_LOG_TRAPPED,	/* trapped into log (see raid5-cache.c)
 				 * this bit is used in two scenarios:
 				 *
diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c
index 19600d35aa..40ac37fe9d 100644
--- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c
+++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c
@@ -520,11 +520,11 @@ static int pcan_usb_fd_decode_canmsg(struct pcan_usb_fd_if *usb_if,
 	else
 		memcpy(cfd->data, rm->d, cfd->len);
 
-	peak_usb_netif_rx(skb, &usb_if->time_ref, le32_to_cpu(rm->ts_low));
-
 	netdev->stats.rx_packets++;
 	netdev->stats.rx_bytes += cfd->len;
 
+	peak_usb_netif_rx(skb, &usb_if->time_ref, le32_to_cpu(rm->ts_low));
+
 	return 0;
 }
 
@@ -586,11 +586,11 @@ static int pcan_usb_fd_decode_status(struct pcan_usb_fd_if *usb_if,
 	if (!skb)
 		return -ENOMEM;
 
-	peak_usb_netif_rx(skb, &usb_if->time_ref, le32_to_cpu(sm->ts_low));
-
 	netdev->stats.rx_packets++;
 	netdev->stats.rx_bytes += cf->can_dlc;
 
+	peak_usb_netif_rx(skb, &usb_if->time_ref, le32_to_cpu(sm->ts_low));
+
 	return 0;
 }
diff --git a/drivers/net/wireless/realtek/rtw88/coex.c b/drivers/net/wireless/realtek/rtw88/coex.c
index 86467d2f88..d0ed6a4e86 100644
--- a/drivers/net/wireless/realtek/rtw88/coex.c
+++ b/drivers/net/wireless/realtek/rtw88/coex.c
@@ -309,7 +309,7 @@ static void rtw_coex_tdma_timer_base(struct rtw_dev *rtwdev, u8 type)
 {
 	struct rtw_coex *coex = &rtwdev->coex;
 	struct rtw_coex_stat *coex_stat = &coex->stat;
-	u8 para[2] = {0};
+	u8 para[6] = {};
 	u8 times;
 	u16 tbtt_interval = coex_stat->wl_beacon_interval;
 
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index f40e5aab73..55400db2b1 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -227,12 +227,16 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 #define tlb_needs_table_invalidate() (true)
 #endif
 
+void tlb_remove_table_sync_one(void);
+
 #else
 
 #ifdef tlb_needs_table_invalidate
 #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
 #endif
 
+static inline void tlb_remove_table_sync_one(void) { }
+
 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
diff --git a/kernel/padata.c b/kernel/padata.c
index e83259bb24..6915280bb8 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -1213,12 +1213,16 @@ EXPORT_SYMBOL(padata_alloc_shell);
  */
 void padata_free_shell(struct padata_shell *ps)
 {
-	struct padata_instance *pinst = ps->pinst;
+	/*
+	 * Wait for all _do_serial calls to finish to avoid touching
+	 * freed pd's and ps's.
+	 */
+	synchronize_rcu();
 
-	mutex_lock(&pinst->lock);
+	mutex_lock(&ps->pinst->lock);
 	list_del(&ps->list);
 	padata_free_pd(rcu_dereference_protected(ps->pd, 1));
-	mutex_unlock(&pinst->lock);
+	mutex_unlock(&ps->pinst->lock);
 
 	kfree(ps);
 }
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index baef70a50e..30bc44843d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4314,7 +4314,7 @@ int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
  * @ip: The instruction pointer address to remove the data from
  *
  * Returns the data if it is found, otherwise NULL.
- * Note, if the data pointer is used as the data itself, (see 
+ * Note, if the data pointer is used as the data itself, (see
  * ftrace_func_mapper_find_ip(), then the return value may be meaningless,
  * if the data pointer was set to zero.
  */
@@ -5037,8 +5037,6 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
 	__add_hash_entry(direct_functions, entry);
 
 	ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
-	if (ret)
-		remove_hash_entry(direct_functions, entry);
 
 	if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
 		ret = register_ftrace_function(&direct_ops);
@@ -5047,6 +5045,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
 	}
 
 	if (ret) {
+		remove_hash_entry(direct_functions, entry);
 		kfree(entry);
 		if (!direct->count) {
 			list_del_rcu(&direct->next);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1a5f45092b..f1385ebb3b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5655,6 +5655,13 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 		return 0;
 
 	pud_clear(pud);
+	/*
+	 * Once our caller drops the rmap lock, some other process might be
+	 * using this page table as a normal, non-hugetlb page table.
+	 * Wait for pending gup_fast() in other threads to finish before letting
+	 * that happen.
+	 */
+	tlb_remove_table_sync_one();
 	put_page(virt_to_page(ptep));
 	mm_dec_nr_pmds(mm);
 	/*
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 24f8d29af0..8fe629e536 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1118,6 +1118,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 	mmu_notifier_invalidate_range_end(&range);
+	tlb_remove_table_sync_one();
 
 	spin_lock(pte_ptl);
 	isolated = __collapse_huge_page_isolate(vma, address, pte,
@@ -1383,6 +1384,42 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
 	return 0;
 }
 
+/*
+ * A note about locking:
+ * Trying to take the page table spinlocks would be useless here because those
+ * are only used to synchronize:
+ *
+ *  - modifying terminal entries (ones that point to a data page, not to another
+ *    page table)
+ *  - installing *new* non-terminal entries
+ *
+ * Instead, we need roughly the same kind of protection as free_pgtables() or
+ * mm_take_all_locks() (but only for a single VMA):
+ * The mmap lock together with this VMA's rmap locks covers all paths towards
+ * the page table entries we're messing with here, except for hardware page
+ * table walks and lockless_pages_from_mm().
+ */
+static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+				  unsigned long addr, pmd_t *pmdp)
+{
+	pmd_t pmd;
+
+	mmap_assert_write_locked(mm);
+	if (vma->vm_file)
+		lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
+	/*
+	 * All anon_vmas attached to the VMA have the same root and are
+	 * therefore locked by the same lock.
+	 */
+	if (vma->anon_vma)
+		lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
+
+	pmd = pmdp_collapse_flush(vma, addr, pmdp);
+	tlb_remove_table_sync_one();
+	mm_dec_nr_ptes(mm);
+	pte_free(mm, pmd_pgtable(pmd));
+}
+
 /**
  * Try to collapse a pte-mapped THP for mm at address haddr.
  *
@@ -1396,7 +1433,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	struct vm_area_struct *vma = find_vma(mm, haddr);
 	struct page *hpage;
 	pte_t *start_pte, *pte;
-	pmd_t *pmd, _pmd;
+	pmd_t *pmd;
 	spinlock_t *ptl;
 	int count = 0;
 	int i;
@@ -1426,6 +1463,20 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	if (!pmd)
 		goto drop_hpage;
 
+	/*
+	 * We need to lock the mapping so that from here on, only GUP-fast and
+	 * hardware page walks can access the parts of the page tables that
+	 * we're operating on.
+	 * See collapse_and_free_pmd().
+	 */
+	i_mmap_lock_write(vma->vm_file->f_mapping);
+
+	/*
+	 * This spinlock should be unnecessary: Nobody else should be accessing
+	 * the page tables under spinlock protection here, only
+	 * lockless_pages_from_mm() and the hardware page walker can access page
+	 * tables while all the high-level locks are held in write mode.
+	 */
 	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
 
 	/* step 1: check all mapped PTEs are to the right huge page */
@@ -1472,11 +1523,15 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	}
 
 	/* step 4: collapse pmd */
-	ptl = pmd_lock(vma->vm_mm, pmd);
-	_pmd = pmdp_collapse_flush(vma, haddr, pmd);
-	spin_unlock(ptl);
-	mm_dec_nr_ptes(mm);
-	pte_free(mm, pmd_pgtable(_pmd));
+	/* we make no change to anon, but protect concurrent anon page lookup */
+	if (vma->anon_vma)
+		anon_vma_lock_write(vma->anon_vma);
+
+	collapse_and_free_pmd(mm, vma, haddr, pmd);
+
+	if (vma->anon_vma)
+		anon_vma_unlock_write(vma->anon_vma);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
 
 drop_hpage:
 	unlock_page(hpage);
@@ -1485,6 +1540,7 @@ drop_hpage:
 
 abort:
 	pte_unmap_unlock(start_pte, ptl);
+	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	goto drop_hpage;
 }
 
@@ -1516,7 +1572,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 	struct vm_area_struct *vma;
 	struct mm_struct *mm;
 	unsigned long addr;
-	pmd_t *pmd, _pmd;
+	pmd_t *pmd;
 
 	i_mmap_lock_write(mapping);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@@ -1534,7 +1590,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 * An alternative would be drop the check, but check that page
 		 * table is clear before calling pmdp_collapse_flush() under
 		 * ptl. It has higher chance to recover THP for the VMA, but
-		 * has higher cost too.
+		 * has higher cost too. It would also probably require locking
+		 * the anon_vma.
 		 */
 		if (vma->anon_vma)
 			continue;
@@ -1555,14 +1612,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
 		 * reverse order. Trylock is a way to avoid deadlock.
 		 */
 		if (mmap_write_trylock(mm)) {
-			if (!khugepaged_test_exit(mm)) {
-				spinlock_t *ptl = pmd_lock(mm, pmd);
-				/* assume page table is clear */
-				_pmd = pmdp_collapse_flush(vma, addr, pmd);
-				spin_unlock(ptl);
-				mm_dec_nr_ptes(mm);
-				pte_free(mm, pmd_pgtable(_pmd));
-			}
+			if (!khugepaged_test_exit(mm))
+				collapse_and_free_pmd(mm, vma, addr, pmd);
 			mmap_write_unlock(mm);
 		} else {
 			/* Try again later */
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 5569524bc9..cabe7d7f14 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -139,7 +139,7 @@ static void tlb_remove_table_smp_sync(void *arg)
 	/* Simply deliver the interrupt */
 }
 
-static void tlb_remove_table_sync_one(void)
+void tlb_remove_table_sync_one(void)
 {
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
@@ -163,8 +163,6 @@ static void tlb_remove_table_free(struct mmu_table_batch *batch)
 
 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 
-static void tlb_remove_table_sync_one(void) { }
-
 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 {
 	__tlb_remove_table_free(batch);
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 109d8cd704..088006e7eb 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1984,21 +1984,21 @@ static void mld_send_cr(struct inet6_dev *idev)
 
 static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 {
-	struct net *net = dev_net(dev);
-	struct sock *sk = net->ipv6.igmp_sk;
+	const struct in6_addr *snd_addr, *saddr;
+	int err, len, payload_len, full_len;
+	struct in6_addr addr_buf;
 	struct inet6_dev *idev;
 	struct sk_buff *skb;
 	struct mld_msg *hdr;
-	const struct in6_addr *snd_addr, *saddr;
-	struct in6_addr addr_buf;
 	int hlen = LL_RESERVED_SPACE(dev);
 	int tlen = dev->needed_tailroom;
-	int err, len, payload_len, full_len;
 	u8 ra[8] = { IPPROTO_ICMPV6, 0,
 		     IPV6_TLV_ROUTERALERT, 2, 0, 0,
 		     IPV6_TLV_PADN, 0 };
-	struct flowi6 fl6;
 	struct dst_entry *dst;
+	struct flowi6 fl6;
+	struct net *net;
+	struct sock *sk;
 
 	if (type == ICMPV6_MGM_REDUCTION)
 		snd_addr = &in6addr_linklocal_allrouters;
@@ -2009,20 +2009,21 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 	payload_len = len + sizeof(ra);
 	full_len = sizeof(struct ipv6hdr) + payload_len;
 
+	skb = alloc_skb(hlen + tlen + full_len, GFP_ATOMIC);
+
 	rcu_read_lock();
-	IP6_UPD_PO_STATS(net, __in6_dev_get(dev),
-			 IPSTATS_MIB_OUT, full_len);
-	rcu_read_unlock();
-
-	skb = sock_alloc_send_skb(sk, hlen + tlen + full_len, 1, &err);
 
+	net = dev_net_rcu(dev);
+	idev = __in6_dev_get(dev);
+	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, full_len);
 	if (!skb) {
-		rcu_read_lock();
-		IP6_INC_STATS(net, __in6_dev_get(dev),
-			      IPSTATS_MIB_OUTDISCARDS);
+		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 		rcu_read_unlock();
 		return;
 	}
+	sk = net->ipv6.igmp_sk;
+	skb_set_owner_w(skb, sk);
+
 	skb->priority = TC_PRIO_CONTROL;
 
 	skb_reserve(skb, hlen);
@@ -2047,9 +2048,6 @@ static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type)
 			   IPPROTO_ICMPV6,
 			   csum_partial(hdr, len, 0));
 
-	rcu_read_lock();
-	idev = __in6_dev_get(skb->dev);
-
 	icmpv6_flow_init(sk, &fl6, type,
 			 &ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr,
 			 skb->dev->ifindex);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 9415dd1aea..c05fa560f9 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2408,10 +2408,10 @@ static void ip6_negative_advice(struct sock *sk,
 	if (rt->rt6i_flags & RTF_CACHE) {
 		rcu_read_lock();
 		if (rt6_check_expired(rt)) {
-			/* counteract the dst_release() in sk_dst_reset() */
-			dst_hold(dst);
+			/* rt/dst can not be destroyed yet,
+			 * because of rcu_read_lock()
+			 */
 			sk_dst_reset(sk);
-			rt6_remove_exception_rt(rt);
 		}
 		rcu_read_unlock();