author     Mike Pagano <mpagano@gentoo.org>  2015-01-09 13:56:42 -0500
committer  Mike Pagano <mpagano@gentoo.org>  2015-01-09 13:56:42 -0500
commit     42d91f1cec2de8a0cd48296616204322fdaf9449 (patch)
tree       466e9e064f74d1ead2ebb4f5e6356b774f877c23
parent     Add DEVPTS_MULTIPLE_INSTANCES when GENTOO_LINUX_INIT_SYSTEMD is selected. See... (diff)
-rw-r--r--  0000_README                                                                                                                             |   27
-rw-r--r--  1063_linux-3.10.64.patch                                                                                                                | 1539
-rwxr-xr-x  5000_BFQ-4-block-Switch-from-v6r2-for-3.10.0-v6r2-for-3.10.patch                                                                        |   59
-rw-r--r--  5001_BFQ-1-block-cgroups-kconfig-build-bits-for-v7r7-3.10.patch (renamed from 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.10.patch)  |   34
-rw-r--r--  5002_BFQ-2-block-introduce-the-v7r7-I-O-sched-for-3.10.patch1 (renamed from 5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.10.patch1)      | 2674
-rw-r--r--  5003_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r7-for-3.10.patch1 (renamed from 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.10.patch1)  |  739
6 files changed, 3943 insertions(+), 1129 deletions(-)
diff --git a/0000_README b/0000_README index 689a2fe3..940fe8ce 100644 --- a/0000_README +++ b/0000_README @@ -294,6 +294,10 @@ Patch: 1062_linux-3.10.63.patch From: http://www.kernel.org Desc: Linux 3.10.63 +Patch: 1063_linux-3.10.64.patch +From: http://www.kernel.org +Desc: Linux 3.10.64 + Patch: 1500_XATTR_USER_PREFIX.patch From: https://bugs.gentoo.org/show_bug.cgi?id=470644 Desc: Support for namespace user.pax.* on tmpfs. @@ -338,22 +342,19 @@ Patch: 4567_distro-Gentoo-Kconfig.patch From: Tom Wijsman <TomWij@gentoo.org Desc: Add Gentoo Linux support config settings and defaults. -Patch: 5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.10.patch -From: http://algo.ing.unimo.it/people/paolo/disk_sched/ -Desc: BFQ v6r2 patch 1 for 3.10: Build, cgroups and kconfig bits +Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch +From: https://github.com/graysky2/kernel_gcc_patch/ +Desc: Kernel patch enables gcc optimizations for additional CPUs. -Patch: 5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.10.patch1 +Patch: 5001_BFQ-1-block-cgroups-kconfig-build-bits-for-v7r7-3.10.patch From: http://algo.ing.unimo.it/people/paolo/disk_sched/ -Desc: BFQ v6r2 patch 2 for 3.10: BFQ Scheduler +Desc: BFQ v7r7 patch 1 for 3.10: Build, cgroups and kconfig bits -Patch: 5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.10.patch1 +Patch: 5002_BFQ-2-block-introduce-the-v7r7-I-O-sched-for-3.10.patch1 From: http://algo.ing.unimo.it/people/paolo/disk_sched/ -Desc: BFQ v6r2 patch 3 for 3.10: Early Queue Merge (EQM) +Desc: BFQ v7r7 patch 2 for 3.10: BFQ Scheduler -Patch: 5000_BFQ-4-block-Switch-from-v6r2-for-3.10.0-v6r2-for-3.10.patch -From: https://groups.google.com/forum/#!topic/bfq-iosched/f4Lg5INzQ-k -Desc: BFQ v6r2 patch 4 for 3.10: Switch to 3.10.8-rc1. +Patch: 5003_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r7-for-3.10.0.patch +From: http://algo.ing.unimo.it/people/paolo/disk_sched/ +Desc: BFQ v7r7 patch 3 for 3.10: Early Queue Merge (EQM) -Patch: 5000_enable-additional-cpu-optimizations-for-gcc.patch -From: https://github.com/graysky2/kernel_gcc_patch/ -Desc: Kernel patch enables gcc optimizations for additional CPUs. diff --git a/1063_linux-3.10.64.patch b/1063_linux-3.10.64.patch new file mode 100644 index 00000000..053f529d --- /dev/null +++ b/1063_linux-3.10.64.patch @@ -0,0 +1,1539 @@ +diff --git a/Makefile b/Makefile +index 9383fe24baa9..e5b63fb3d0e1 100644 +--- a/Makefile ++++ b/Makefile +@@ -1,6 +1,6 @@ + VERSION = 3 + PATCHLEVEL = 10 +-SUBLEVEL = 63 ++SUBLEVEL = 64 + EXTRAVERSION = + NAME = TOSSUG Baby Fish + +diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c +index 8b6e4f5288a2..a98afed9348b 100644 +--- a/arch/s390/kernel/compat_linux.c ++++ b/arch/s390/kernel/compat_linux.c +@@ -248,7 +248,7 @@ asmlinkage long sys32_setgroups16(int gidsetsize, u16 __user *grouplist) + struct group_info *group_info; + int retval; + +- if (!capable(CAP_SETGID)) ++ if (!may_setgroups()) + return -EPERM; + if ((unsigned)gidsetsize > NGROUPS_MAX) + return -EINVAL; +diff --git a/arch/x86/include/uapi/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h +index 46727eb37bfe..6e1aaf73852a 100644 +--- a/arch/x86/include/uapi/asm/ldt.h ++++ b/arch/x86/include/uapi/asm/ldt.h +@@ -28,6 +28,13 @@ struct user_desc { + unsigned int seg_not_present:1; + unsigned int useable:1; + #ifdef __x86_64__ ++ /* ++ * Because this bit is not present in 32-bit user code, user ++ * programs can pass uninitialized values here. 
Therefore, in ++ * any context in which a user_desc comes from a 32-bit program, ++ * the kernel must act as though lm == 0, regardless of the ++ * actual value. ++ */ + unsigned int lm:1; + #endif + }; +diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c +index cd6d9a5a42f6..c4ff2a916139 100644 +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -279,7 +279,14 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) + static void __init paravirt_ops_setup(void) + { + pv_info.name = "KVM"; +- pv_info.paravirt_enabled = 1; ++ ++ /* ++ * KVM isn't paravirt in the sense of paravirt_enabled. A KVM ++ * guest kernel works like a bare metal kernel with additional ++ * features, and paravirt_enabled is about features that are ++ * missing. ++ */ ++ pv_info.paravirt_enabled = 0; + + if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) + pv_cpu_ops.io_delay = kvm_io_delay; +diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c +index 3dd37ebd591b..41514f56c241 100644 +--- a/arch/x86/kernel/kvmclock.c ++++ b/arch/x86/kernel/kvmclock.c +@@ -265,7 +265,6 @@ void __init kvmclock_init(void) + #endif + kvm_get_preset_lpj(); + clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); +- pv_info.paravirt_enabled = 1; + pv_info.name = "KVM"; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) +diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c +index f99a242730e9..7099ab1e075b 100644 +--- a/arch/x86/kernel/process_64.c ++++ b/arch/x86/kernel/process_64.c +@@ -279,24 +279,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + + fpu = switch_fpu_prepare(prev_p, next_p, cpu); + +- /* +- * Reload esp0, LDT and the page table pointer: +- */ ++ /* Reload esp0 and ss1. */ + load_sp0(tss, next); + +- /* +- * Switch DS and ES. +- * This won't pick up thread selector changes, but I guess that is ok. +- */ +- savesegment(es, prev->es); +- if (unlikely(next->es | prev->es)) +- loadsegment(es, next->es); +- +- savesegment(ds, prev->ds); +- if (unlikely(next->ds | prev->ds)) +- loadsegment(ds, next->ds); +- +- + /* We must save %fs and %gs before load_TLS() because + * %fs and %gs may be cleared by load_TLS(). + * +@@ -305,41 +290,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) + savesegment(fs, fsindex); + savesegment(gs, gsindex); + ++ /* ++ * Load TLS before restoring any segments so that segment loads ++ * reference the correct GDT entries. ++ */ + load_TLS(next, cpu); + + /* +- * Leave lazy mode, flushing any hypercalls made here. +- * This must be done before restoring TLS segments so +- * the GDT and LDT are properly updated, and must be +- * done before math_state_restore, so the TS bit is up +- * to date. ++ * Leave lazy mode, flushing any hypercalls made here. This ++ * must be done after loading TLS entries in the GDT but before ++ * loading segments that might reference them, and and it must ++ * be done before math_state_restore, so the TS bit is up to ++ * date. + */ + arch_end_context_switch(next_p); + ++ /* Switch DS and ES. ++ * ++ * Reading them only returns the selectors, but writing them (if ++ * nonzero) loads the full descriptor from the GDT or LDT. The ++ * LDT for next is loaded in switch_mm, and the GDT is loaded ++ * above. ++ * ++ * We therefore need to write new values to the segment ++ * registers on every context switch unless both the new and old ++ * values are zero. 
++ * ++ * Note that we don't need to do anything for CS and SS, as ++ * those are saved and restored as part of pt_regs. ++ */ ++ savesegment(es, prev->es); ++ if (unlikely(next->es | prev->es)) ++ loadsegment(es, next->es); ++ ++ savesegment(ds, prev->ds); ++ if (unlikely(next->ds | prev->ds)) ++ loadsegment(ds, next->ds); ++ + /* + * Switch FS and GS. + * +- * Segment register != 0 always requires a reload. Also +- * reload when it has changed. When prev process used 64bit +- * base always reload to avoid an information leak. ++ * These are even more complicated than FS and GS: they have ++ * 64-bit bases are that controlled by arch_prctl. Those bases ++ * only differ from the values in the GDT or LDT if the selector ++ * is 0. ++ * ++ * Loading the segment register resets the hidden base part of ++ * the register to 0 or the value from the GDT / LDT. If the ++ * next base address zero, writing 0 to the segment register is ++ * much faster than using wrmsr to explicitly zero the base. ++ * ++ * The thread_struct.fs and thread_struct.gs values are 0 ++ * if the fs and gs bases respectively are not overridden ++ * from the values implied by fsindex and gsindex. They ++ * are nonzero, and store the nonzero base addresses, if ++ * the bases are overridden. ++ * ++ * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should ++ * be impossible. ++ * ++ * Therefore we need to reload the segment registers if either ++ * the old or new selector is nonzero, and we need to override ++ * the base address if next thread expects it to be overridden. ++ * ++ * This code is unnecessarily slow in the case where the old and ++ * new indexes are zero and the new base is nonzero -- it will ++ * unnecessarily write 0 to the selector before writing the new ++ * base address. ++ * ++ * Note: This all depends on arch_prctl being the only way that ++ * user code can override the segment base. Once wrfsbase and ++ * wrgsbase are enabled, most of this code will need to change. + */ + if (unlikely(fsindex | next->fsindex | prev->fs)) { + loadsegment(fs, next->fsindex); ++ + /* +- * Check if the user used a selector != 0; if yes +- * clear 64bit base, since overloaded base is always +- * mapped to the Null selector ++ * If user code wrote a nonzero value to FS, then it also ++ * cleared the overridden base address. ++ * ++ * XXX: if user code wrote 0 to FS and cleared the base ++ * address itself, we won't notice and we'll incorrectly ++ * restore the prior base address next time we reschdule ++ * the process. + */ + if (fsindex) + prev->fs = 0; + } +- /* when next process has a 64bit base use it */ + if (next->fs) + wrmsrl(MSR_FS_BASE, next->fs); + prev->fsindex = fsindex; + + if (unlikely(gsindex | next->gsindex | prev->gs)) { + load_gs_index(next->gsindex); ++ ++ /* This works (and fails) the same way as fsindex above. */ + if (gsindex) + prev->gs = 0; + } +diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c +index f7fec09e3e3a..4e942f31b1a7 100644 +--- a/arch/x86/kernel/tls.c ++++ b/arch/x86/kernel/tls.c +@@ -27,6 +27,37 @@ static int get_free_idx(void) + return -ESRCH; + } + ++static bool tls_desc_okay(const struct user_desc *info) ++{ ++ if (LDT_empty(info)) ++ return true; ++ ++ /* ++ * espfix is required for 16-bit data segments, but espfix ++ * only works for LDT segments. ++ */ ++ if (!info->seg_32bit) ++ return false; ++ ++ /* Only allow data segments in the TLS array. 
*/ ++ if (info->contents > 1) ++ return false; ++ ++ /* ++ * Non-present segments with DPL 3 present an interesting attack ++ * surface. The kernel should handle such segments correctly, ++ * but TLS is very difficult to protect in a sandbox, so prevent ++ * such segments from being created. ++ * ++ * If userspace needs to remove a TLS entry, it can still delete ++ * it outright. ++ */ ++ if (info->seg_not_present) ++ return false; ++ ++ return true; ++} ++ + static void set_tls_desc(struct task_struct *p, int idx, + const struct user_desc *info, int n) + { +@@ -66,6 +97,9 @@ int do_set_thread_area(struct task_struct *p, int idx, + if (copy_from_user(&info, u_info, sizeof(info))) + return -EFAULT; + ++ if (!tls_desc_okay(&info)) ++ return -EINVAL; ++ + if (idx == -1) + idx = info.entry_number; + +@@ -192,6 +226,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, + { + struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; + const struct user_desc *info; ++ int i; + + if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || + (pos % sizeof(struct user_desc)) != 0 || +@@ -205,6 +240,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, + else + info = infobuf; + ++ for (i = 0; i < count / sizeof(struct user_desc); i++) ++ if (!tls_desc_okay(info + i)) ++ return -EINVAL; ++ + set_tls_desc(target, + GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), + info, count / sizeof(struct user_desc)); +diff --git a/crypto/af_alg.c b/crypto/af_alg.c +index bf948e134981..6ef6e2ad344e 100644 +--- a/crypto/af_alg.c ++++ b/crypto/af_alg.c +@@ -449,6 +449,9 @@ void af_alg_complete(struct crypto_async_request *req, int err) + { + struct af_alg_completion *completion = req->data; + ++ if (err == -EINPROGRESS) ++ return; ++ + completion->err = err; + complete(&completion->completion); + } +diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c +index 5a2c75499824..a79cbd6038f6 100644 +--- a/drivers/md/bitmap.c ++++ b/drivers/md/bitmap.c +@@ -883,7 +883,6 @@ void bitmap_unplug(struct bitmap *bitmap) + { + unsigned long i; + int dirty, need_write; +- int wait = 0; + + if (!bitmap || !bitmap->storage.filemap || + test_bit(BITMAP_STALE, &bitmap->flags)) +@@ -901,16 +900,13 @@ void bitmap_unplug(struct bitmap *bitmap) + clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); + write_page(bitmap, bitmap->storage.filemap[i], 0); + } +- if (dirty) +- wait = 1; +- } +- if (wait) { /* if any writes were performed, we need to wait on them */ +- if (bitmap->storage.file) +- wait_event(bitmap->write_wait, +- atomic_read(&bitmap->pending_writes)==0); +- else +- md_super_wait(bitmap->mddev); + } ++ if (bitmap->storage.file) ++ wait_event(bitmap->write_wait, ++ atomic_read(&bitmap->pending_writes)==0); ++ else ++ md_super_wait(bitmap->mddev); ++ + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) + bitmap_file_kick(bitmap); + } +diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c +index c9b4ca9e0696..e855a190270d 100644 +--- a/drivers/md/dm-bufio.c ++++ b/drivers/md/dm-bufio.c +@@ -529,6 +529,19 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block, + end_io(&b->bio, r); + } + ++static void inline_endio(struct bio *bio, int error) ++{ ++ bio_end_io_t *end_fn = bio->bi_private; ++ ++ /* ++ * Reset the bio to free any attached resources ++ * (e.g. bio integrity profiles). 
++ */ ++ bio_reset(bio); ++ ++ end_fn(bio, error); ++} ++ + static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, + bio_end_io_t *end_io) + { +@@ -540,7 +553,12 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, + b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; + b->bio.bi_sector = block << b->c->sectors_per_block_bits; + b->bio.bi_bdev = b->c->bdev; +- b->bio.bi_end_io = end_io; ++ b->bio.bi_end_io = inline_endio; ++ /* ++ * Use of .bi_private isn't a problem here because ++ * the dm_buffer's inline bio is local to bufio. ++ */ ++ b->bio.bi_private = end_io; + + /* + * We assume that if len >= PAGE_SIZE ptr is page-aligned. +diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c +index afb419e514bf..056d09c33af1 100644 +--- a/drivers/md/persistent-data/dm-space-map-metadata.c ++++ b/drivers/md/persistent-data/dm-space-map-metadata.c +@@ -493,7 +493,9 @@ static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count + { + struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); + +- return smm->ll.nr_blocks; ++ *count = smm->ll.nr_blocks; ++ ++ return 0; + } + + static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count) +diff --git a/drivers/mfd/tc6393xb.c b/drivers/mfd/tc6393xb.c +index 15e1463e5e13..17fe83e81ea4 100644 +--- a/drivers/mfd/tc6393xb.c ++++ b/drivers/mfd/tc6393xb.c +@@ -263,6 +263,17 @@ static int tc6393xb_ohci_disable(struct platform_device *dev) + return 0; + } + ++static int tc6393xb_ohci_suspend(struct platform_device *dev) ++{ ++ struct tc6393xb_platform_data *tcpd = dev_get_platdata(dev->dev.parent); ++ ++ /* We can't properly store/restore OHCI state, so fail here */ ++ if (tcpd->resume_restore) ++ return -EBUSY; ++ ++ return tc6393xb_ohci_disable(dev); ++} ++ + static int tc6393xb_fb_enable(struct platform_device *dev) + { + struct tc6393xb *tc6393xb = dev_get_drvdata(dev->dev.parent); +@@ -403,7 +414,7 @@ static struct mfd_cell tc6393xb_cells[] = { + .num_resources = ARRAY_SIZE(tc6393xb_ohci_resources), + .resources = tc6393xb_ohci_resources, + .enable = tc6393xb_ohci_enable, +- .suspend = tc6393xb_ohci_disable, ++ .suspend = tc6393xb_ohci_suspend, + .resume = tc6393xb_ohci_enable, + .disable = tc6393xb_ohci_disable, + }, +diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c +index 9aca9462a12f..7ad66823d022 100644 +--- a/drivers/mmc/card/block.c ++++ b/drivers/mmc/card/block.c +@@ -257,7 +257,7 @@ static ssize_t force_ro_show(struct device *dev, struct device_attribute *attr, + int ret; + struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); + +- ret = snprintf(buf, PAGE_SIZE, "%d", ++ ret = snprintf(buf, PAGE_SIZE, "%d\n", + get_disk_ro(dev_to_disk(dev)) ^ + md->read_only); + mmc_blk_put(md); +diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c +index 4956c99ed90e..78b4fe845245 100644 +--- a/drivers/scsi/megaraid/megaraid_sas_base.c ++++ b/drivers/scsi/megaraid/megaraid_sas_base.c +@@ -933,7 +933,7 @@ megasas_issue_blocked_abort_cmd(struct megasas_instance *instance, + abort_fr->abort_mfi_phys_addr_hi = 0; + + cmd->sync_cmd = 1; +- cmd->cmd_status = 0xFF; ++ cmd->cmd_status = ENODATA; + + instance->instancet->issue_dcmd(instance, cmd); + +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index abecce399354..7360f03ddbe1 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -3857,12 +3857,6 @@ again: + if (ret) + break; + +- /* opt_discard */ +- 
if (btrfs_test_opt(root, DISCARD)) +- ret = btrfs_error_discard_extent(root, start, +- end + 1 - start, +- NULL); +- + clear_extent_dirty(unpin, start, end, GFP_NOFS); + btrfs_error_unpin_extent_range(root, start, end); + cond_resched(); +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index bbafa05519da..f99c71e40f8b 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -5277,7 +5277,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, + update_global_block_rsv(fs_info); + } + +-static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) ++static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, ++ const bool return_free_space) + { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_block_group_cache *cache = NULL; +@@ -5301,7 +5302,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) + + if (start < cache->last_byte_to_unpin) { + len = min(len, cache->last_byte_to_unpin - start); +- btrfs_add_free_space(cache, start, len); ++ if (return_free_space) ++ btrfs_add_free_space(cache, start, len); + } + + start += len; +@@ -5364,7 +5366,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + end + 1 - start, NULL); + + clear_extent_dirty(unpin, start, end, GFP_NOFS); +- unpin_extent_range(root, start, end); ++ unpin_extent_range(root, start, end, true); + cond_resched(); + } + +@@ -8564,7 +8566,7 @@ out: + + int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) + { +- return unpin_extent_range(root, start, end); ++ return unpin_extent_range(root, start, end, false); + } + + int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, +diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c +index a4a7a1a8da95..0a3809500599 100644 +--- a/fs/btrfs/extent_map.c ++++ b/fs/btrfs/extent_map.c +@@ -263,8 +263,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, + if (!em) + goto out; + +- if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) +- list_move(&em->list, &tree->modified_extents); + em->generation = gen; + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->mod_start = em->start; +diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c +index f71ec125290d..1da2446bf6b0 100644 +--- a/fs/ecryptfs/crypto.c ++++ b/fs/ecryptfs/crypto.c +@@ -2102,7 +2102,6 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, + break; + case 2: + dst[dst_byte_offset++] |= (src_byte); +- dst[dst_byte_offset] = 0; + current_bit_offset = 0; + break; + } +diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c +index a7abbea2c096..9ff3664bb3ea 100644 +--- a/fs/ecryptfs/file.c ++++ b/fs/ecryptfs/file.c +@@ -196,23 +196,11 @@ static int ecryptfs_open(struct inode *inode, struct file *file) + { + int rc = 0; + struct ecryptfs_crypt_stat *crypt_stat = NULL; +- struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct dentry *ecryptfs_dentry = file->f_path.dentry; + /* Private value of ecryptfs_dentry allocated in + * ecryptfs_lookup() */ + struct ecryptfs_file_info *file_info; + +- mount_crypt_stat = &ecryptfs_superblock_to_private( +- ecryptfs_dentry->d_sb)->mount_crypt_stat; +- if ((mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) +- && ((file->f_flags & O_WRONLY) || (file->f_flags & O_RDWR) +- || (file->f_flags & O_CREAT) || (file->f_flags & O_TRUNC) +- || (file->f_flags & O_APPEND))) { +- printk(KERN_WARNING "Mount has encrypted view enabled; " +- "files may only be read\n"); +- rc = -EPERM; +- goto 
out; +- } + /* Released in ecryptfs_release or end of function if failure */ + file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); + ecryptfs_set_file_private(file, file_info); +diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c +index e924cf45aad9..329a9cc2b2eb 100644 +--- a/fs/ecryptfs/main.c ++++ b/fs/ecryptfs/main.c +@@ -494,6 +494,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags + { + struct super_block *s; + struct ecryptfs_sb_info *sbi; ++ struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct ecryptfs_dentry_info *root_info; + const char *err = "Getting sb failed"; + struct inode *inode; +@@ -512,6 +513,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags + err = "Error parsing options"; + goto out; + } ++ mount_crypt_stat = &sbi->mount_crypt_stat; + + s = sget(fs_type, NULL, set_anon_super, flags, NULL); + if (IS_ERR(s)) { +@@ -558,11 +560,19 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags + + /** + * Set the POSIX ACL flag based on whether they're enabled in the lower +- * mount. Force a read-only eCryptfs mount if the lower mount is ro. +- * Allow a ro eCryptfs mount even when the lower mount is rw. ++ * mount. + */ + s->s_flags = flags & ~MS_POSIXACL; +- s->s_flags |= path.dentry->d_sb->s_flags & (MS_RDONLY | MS_POSIXACL); ++ s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL; ++ ++ /** ++ * Force a read-only eCryptfs mount when: ++ * 1) The lower mount is ro ++ * 2) The ecryptfs_encrypted_view mount option is specified ++ */ ++ if (path.dentry->d_sb->s_flags & MS_RDONLY || ++ mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) ++ s->s_flags |= MS_RDONLY; + + s->s_maxbytes = path.dentry->d_sb->s_maxbytes; + s->s_blocksize = path.dentry->d_sb->s_blocksize; +diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c +index f488bbae541a..735d7522a3a9 100644 +--- a/fs/isofs/rock.c ++++ b/fs/isofs/rock.c +@@ -30,6 +30,7 @@ struct rock_state { + int cont_size; + int cont_extent; + int cont_offset; ++ int cont_loops; + struct inode *inode; + }; + +@@ -73,6 +74,9 @@ static void init_rock_state(struct rock_state *rs, struct inode *inode) + rs->inode = inode; + } + ++/* Maximum number of Rock Ridge continuation entries */ ++#define RR_MAX_CE_ENTRIES 32 ++ + /* + * Returns 0 if the caller should continue scanning, 1 if the scan must end + * and -ve on error. +@@ -105,6 +109,8 @@ static int rock_continue(struct rock_state *rs) + goto out; + } + ret = -EIO; ++ if (++rs->cont_loops >= RR_MAX_CE_ENTRIES) ++ goto out; + bh = sb_bread(rs->inode->i_sb, rs->cont_extent); + if (bh) { + memcpy(rs->buffer, bh->b_data + rs->cont_offset, +@@ -356,6 +362,9 @@ repeat: + rs.cont_size = isonum_733(rr->u.CE.size); + break; + case SIG('E', 'R'): ++ /* Invalid length of ER tag id? 
*/ ++ if (rr->u.ER.len_id + offsetof(struct rock_ridge, u.ER.data) > rr->len) ++ goto out; + ISOFS_SB(inode->i_sb)->s_rock = 1; + printk(KERN_DEBUG "ISO 9660 Extensions: "); + { +diff --git a/fs/namespace.c b/fs/namespace.c +index 154822397780..d0244c8ba09c 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -1342,6 +1342,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) + goto dput_and_out; + if (!check_mnt(mnt)) + goto dput_and_out; ++ retval = -EPERM; ++ if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) ++ goto dput_and_out; + + retval = do_umount(mnt, flags); + dput_and_out: +@@ -1816,7 +1819,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags, + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && + !(mnt_flags & MNT_NODEV)) { +- return -EPERM; ++ /* Was the nodev implicitly added in mount? */ ++ if ((mnt->mnt_ns->user_ns != &init_user_ns) && ++ !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) { ++ mnt_flags |= MNT_NODEV; ++ } else { ++ return -EPERM; ++ } + } + if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && + !(mnt_flags & MNT_NOSUID)) { +diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c +index 60426ccb3b65..2f970de02b16 100644 +--- a/fs/ncpfs/ioctl.c ++++ b/fs/ncpfs/ioctl.c +@@ -448,7 +448,6 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg + result = -EIO; + } + } +- result = 0; + } + mutex_unlock(&server->root_setup_lock); + +diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c +index 78787948f69d..20ebcfa3c92e 100644 +--- a/fs/nfs/nfs4proc.c ++++ b/fs/nfs/nfs4proc.c +@@ -6418,6 +6418,9 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) + + dprintk("--> %s\n", __func__); + ++ /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ ++ pnfs_get_layout_hdr(NFS_I(inode)->layout); ++ + lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); + if (!lgp->args.layout.pages) { + nfs4_layoutget_release(lgp); +@@ -6430,9 +6433,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) + lgp->res.seq_res.sr_slot = NULL; + nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); + +- /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ +- pnfs_get_layout_hdr(NFS_I(inode)->layout); +- + task = rpc_run_task(&task_setup_data); + if (IS_ERR(task)) + return ERR_CAST(task); +diff --git a/fs/proc/base.c b/fs/proc/base.c +index de12b8128b95..8fc784aef0b8 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -2612,6 +2612,57 @@ static const struct file_operations proc_projid_map_operations = { + .llseek = seq_lseek, + .release = proc_id_map_release, + }; ++ ++static int proc_setgroups_open(struct inode *inode, struct file *file) ++{ ++ struct user_namespace *ns = NULL; ++ struct task_struct *task; ++ int ret; ++ ++ ret = -ESRCH; ++ task = get_proc_task(inode); ++ if (task) { ++ rcu_read_lock(); ++ ns = get_user_ns(task_cred_xxx(task, user_ns)); ++ rcu_read_unlock(); ++ put_task_struct(task); ++ } ++ if (!ns) ++ goto err; ++ ++ if (file->f_mode & FMODE_WRITE) { ++ ret = -EACCES; ++ if (!ns_capable(ns, CAP_SYS_ADMIN)) ++ goto err_put_ns; ++ } ++ ++ ret = single_open(file, &proc_setgroups_show, ns); ++ if (ret) ++ goto err_put_ns; ++ ++ return 0; ++err_put_ns: ++ put_user_ns(ns); ++err: ++ return ret; ++} ++ ++static int proc_setgroups_release(struct inode *inode, struct file *file) ++{ ++ struct seq_file *seq = file->private_data; ++ struct user_namespace *ns = seq->private; ++ int ret = single_release(inode, file); ++ put_user_ns(ns); ++ return ret; ++} ++ ++static const struct file_operations 
proc_setgroups_operations = { ++ .open = proc_setgroups_open, ++ .write = proc_setgroups_write, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = proc_setgroups_release, ++}; + #endif /* CONFIG_USER_NS */ + + static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, +@@ -2720,6 +2771,7 @@ static const struct pid_entry tgid_base_stuff[] = { + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), + REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), ++ REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), + #endif + #ifdef CONFIG_CHECKPOINT_RESTORE + REG("timers", S_IRUGO, proc_timers_operations), +@@ -3073,6 +3125,7 @@ static const struct pid_entry tid_base_stuff[] = { + REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), + REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), + REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), ++ REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), + #endif + }; + +diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c +index d7c6dbe4194b..d89f324bc387 100644 +--- a/fs/udf/symlink.c ++++ b/fs/udf/symlink.c +@@ -80,11 +80,17 @@ static int udf_symlink_filler(struct file *file, struct page *page) + struct inode *inode = page->mapping->host; + struct buffer_head *bh = NULL; + unsigned char *symlink; +- int err = -EIO; ++ int err; + unsigned char *p = kmap(page); + struct udf_inode_info *iinfo; + uint32_t pos; + ++ /* We don't support symlinks longer than one block */ ++ if (inode->i_size > inode->i_sb->s_blocksize) { ++ err = -ENAMETOOLONG; ++ goto out_unmap; ++ } ++ + iinfo = UDF_I(inode); + pos = udf_block_map(inode, 0); + +@@ -94,8 +100,10 @@ static int udf_symlink_filler(struct file *file, struct page *page) + } else { + bh = sb_bread(inode->i_sb, pos); + +- if (!bh) +- goto out; ++ if (!bh) { ++ err = -EIO; ++ goto out_unlock_inode; ++ } + + symlink = bh->b_data; + } +@@ -109,9 +117,10 @@ static int udf_symlink_filler(struct file *file, struct page *page) + unlock_page(page); + return 0; + +-out: ++out_unlock_inode: + up_read(&iinfo->i_data_sem); + SetPageError(page); ++out_unmap: + kunmap(page); + unlock_page(page); + return err; +diff --git a/include/linux/cred.h b/include/linux/cred.h +index 04421e825365..6c58dd7cb9ac 100644 +--- a/include/linux/cred.h ++++ b/include/linux/cred.h +@@ -68,6 +68,7 @@ extern void groups_free(struct group_info *); + extern int set_current_groups(struct group_info *); + extern int set_groups(struct cred *, struct group_info *); + extern int groups_search(const struct group_info *, kgid_t); ++extern bool may_setgroups(void); + + /* access the groups "array" with this macro */ + #define GROUP_AT(gi, i) \ +diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h +index 14105c26a836..a37081cf59da 100644 +--- a/include/linux/user_namespace.h ++++ b/include/linux/user_namespace.h +@@ -17,6 +17,10 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */ + } extent[UID_GID_MAP_MAX_EXTENTS]; + }; + ++#define USERNS_SETGROUPS_ALLOWED 1UL ++ ++#define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED ++ + struct user_namespace { + struct uid_gid_map uid_map; + struct uid_gid_map gid_map; +@@ -27,6 +31,7 @@ struct user_namespace { + kuid_t owner; + kgid_t group; + unsigned int proc_inum; ++ unsigned long flags; + bool may_mount_sysfs; + bool may_mount_proc; + }; +@@ -59,6 +64,9 @@ extern struct seq_operations proc_projid_seq_operations; + extern ssize_t proc_uid_map_write(struct file *, 
const char __user *, size_t, loff_t *); + extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); + extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); ++extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); ++extern int proc_setgroups_show(struct seq_file *m, void *v); ++extern bool userns_may_setgroups(const struct user_namespace *ns); + #else + + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) +@@ -83,6 +91,10 @@ static inline void put_user_ns(struct user_namespace *ns) + { + } + ++static inline bool userns_may_setgroups(const struct user_namespace *ns) ++{ ++ return true; ++} + #endif + + void update_mnt_policy(struct user_namespace *userns); +diff --git a/kernel/groups.c b/kernel/groups.c +index 6b2588dd04ff..67b4ba30475f 100644 +--- a/kernel/groups.c ++++ b/kernel/groups.c +@@ -6,6 +6,7 @@ + #include <linux/slab.h> + #include <linux/security.h> + #include <linux/syscalls.h> ++#include <linux/user_namespace.h> + #include <asm/uaccess.h> + + /* init to 2 - one for init_task, one to ensure it is never freed */ +@@ -223,6 +224,14 @@ out: + return i; + } + ++bool may_setgroups(void) ++{ ++ struct user_namespace *user_ns = current_user_ns(); ++ ++ return ns_capable(user_ns, CAP_SETGID) && ++ userns_may_setgroups(user_ns); ++} ++ + /* + * SMP: Our groups are copy-on-write. We can set them safely + * without another task interfering. +@@ -233,7 +242,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) + struct group_info *group_info; + int retval; + +- if (!nsown_capable(CAP_SETGID)) ++ if (!may_setgroups()) + return -EPERM; + if ((unsigned)gidsetsize > NGROUPS_MAX) + return -EINVAL; +diff --git a/kernel/pid.c b/kernel/pid.c +index 0eb6d8e8b1da..3cdba5173600 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -335,6 +335,8 @@ out: + + out_unlock: + spin_unlock_irq(&pidmap_lock); ++ put_pid_ns(ns); ++ + out_free: + while (++i <= ns->level) + free_pidmap(pid->numbers + i); +diff --git a/kernel/uid16.c b/kernel/uid16.c +index f6c83d7ef000..d58cc4d8f0d1 100644 +--- a/kernel/uid16.c ++++ b/kernel/uid16.c +@@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) + struct group_info *group_info; + int retval; + +- if (!nsown_capable(CAP_SETGID)) ++ if (!may_setgroups()) + return -EPERM; + if ((unsigned)gidsetsize > NGROUPS_MAX) + return -EINVAL; +diff --git a/kernel/user.c b/kernel/user.c +index 69b4c3d48cde..6bbef5604101 100644 +--- a/kernel/user.c ++++ b/kernel/user.c +@@ -51,6 +51,7 @@ struct user_namespace init_user_ns = { + .owner = GLOBAL_ROOT_UID, + .group = GLOBAL_ROOT_GID, + .proc_inum = PROC_USER_INIT_INO, ++ .flags = USERNS_INIT_FLAGS, + .may_mount_sysfs = true, + .may_mount_proc = true, + }; +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 9bea1d7dd21f..3f2fb33d291a 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -24,6 +24,7 @@ + #include <linux/fs_struct.h> + + static struct kmem_cache *user_ns_cachep __read_mostly; ++static DEFINE_MUTEX(userns_state_mutex); + + static bool new_idmap_permitted(const struct file *file, + struct user_namespace *ns, int cap_setid, +@@ -99,6 +100,11 @@ int create_user_ns(struct cred *new) + ns->owner = owner; + ns->group = group; + ++ /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ ++ mutex_lock(&userns_state_mutex); ++ ns->flags = parent_ns->flags; ++ mutex_unlock(&userns_state_mutex); ++ + 
set_cred_user_ns(new, ns); + + update_mnt_policy(ns); +@@ -577,9 +583,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent + return false; + } + +- +-static DEFINE_MUTEX(id_map_mutex); +- + static ssize_t map_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos, + int cap_setid, +@@ -596,7 +599,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, + ssize_t ret = -EINVAL; + + /* +- * The id_map_mutex serializes all writes to any given map. ++ * The userns_state_mutex serializes all writes to any given map. + * + * Any map is only ever written once. + * +@@ -614,7 +617,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, + * order and smp_rmb() is guaranteed that we don't have crazy + * architectures returning stale data. + */ +- mutex_lock(&id_map_mutex); ++ mutex_lock(&userns_state_mutex); + + ret = -EPERM; + /* Only allow one successful write to the map */ +@@ -741,7 +744,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, + *ppos = count; + ret = count; + out: +- mutex_unlock(&id_map_mutex); ++ mutex_unlock(&userns_state_mutex); + if (page) + free_page(page); + return ret; +@@ -800,17 +803,21 @@ static bool new_idmap_permitted(const struct file *file, + struct user_namespace *ns, int cap_setid, + struct uid_gid_map *new_map) + { +- /* Allow mapping to your own filesystem ids */ +- if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { ++ const struct cred *cred = file->f_cred; ++ /* Don't allow mappings that would allow anything that wouldn't ++ * be allowed without the establishment of unprivileged mappings. ++ */ ++ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && ++ uid_eq(ns->owner, cred->euid)) { + u32 id = new_map->extent[0].lower_first; + if (cap_setid == CAP_SETUID) { + kuid_t uid = make_kuid(ns->parent, id); +- if (uid_eq(uid, file->f_cred->fsuid)) ++ if (uid_eq(uid, cred->euid)) + return true; +- } +- else if (cap_setid == CAP_SETGID) { ++ } else if (cap_setid == CAP_SETGID) { + kgid_t gid = make_kgid(ns->parent, id); +- if (gid_eq(gid, file->f_cred->fsgid)) ++ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && ++ gid_eq(gid, cred->egid)) + return true; + } + } +@@ -830,6 +837,100 @@ static bool new_idmap_permitted(const struct file *file, + return false; + } + ++int proc_setgroups_show(struct seq_file *seq, void *v) ++{ ++ struct user_namespace *ns = seq->private; ++ unsigned long userns_flags = ACCESS_ONCE(ns->flags); ++ ++ seq_printf(seq, "%s\n", ++ (userns_flags & USERNS_SETGROUPS_ALLOWED) ? ++ "allow" : "deny"); ++ return 0; ++} ++ ++ssize_t proc_setgroups_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct seq_file *seq = file->private_data; ++ struct user_namespace *ns = seq->private; ++ char kbuf[8], *pos; ++ bool setgroups_allowed; ++ ssize_t ret; ++ ++ /* Only allow a very narrow range of strings to be written */ ++ ret = -EINVAL; ++ if ((*ppos != 0) || (count >= sizeof(kbuf))) ++ goto out; ++ ++ /* What was written? */ ++ ret = -EFAULT; ++ if (copy_from_user(kbuf, buf, count)) ++ goto out; ++ kbuf[count] = '\0'; ++ pos = kbuf; ++ ++ /* What is being requested? 
*/ ++ ret = -EINVAL; ++ if (strncmp(pos, "allow", 5) == 0) { ++ pos += 5; ++ setgroups_allowed = true; ++ } ++ else if (strncmp(pos, "deny", 4) == 0) { ++ pos += 4; ++ setgroups_allowed = false; ++ } ++ else ++ goto out; ++ ++ /* Verify there is not trailing junk on the line */ ++ pos = skip_spaces(pos); ++ if (*pos != '\0') ++ goto out; ++ ++ ret = -EPERM; ++ mutex_lock(&userns_state_mutex); ++ if (setgroups_allowed) { ++ /* Enabling setgroups after setgroups has been disabled ++ * is not allowed. ++ */ ++ if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) ++ goto out_unlock; ++ } else { ++ /* Permanently disabling setgroups after setgroups has ++ * been enabled by writing the gid_map is not allowed. ++ */ ++ if (ns->gid_map.nr_extents != 0) ++ goto out_unlock; ++ ns->flags &= ~USERNS_SETGROUPS_ALLOWED; ++ } ++ mutex_unlock(&userns_state_mutex); ++ ++ /* Report a successful write */ ++ *ppos = count; ++ ret = count; ++out: ++ return ret; ++out_unlock: ++ mutex_unlock(&userns_state_mutex); ++ goto out; ++} ++ ++bool userns_may_setgroups(const struct user_namespace *ns) ++{ ++ bool allowed; ++ ++ mutex_lock(&userns_state_mutex); ++ /* It is not safe to use setgroups until a gid mapping in ++ * the user namespace has been established. ++ */ ++ allowed = ns->gid_map.nr_extents != 0; ++ /* Is setgroups allowed? */ ++ allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); ++ mutex_unlock(&userns_state_mutex); ++ ++ return allowed; ++} ++ + static void *userns_get(struct task_struct *task) + { + struct user_namespace *user_ns; +diff --git a/net/mac80211/key.c b/net/mac80211/key.c +index 67059b88fea5..635d0972b688 100644 +--- a/net/mac80211/key.c ++++ b/net/mac80211/key.c +@@ -607,7 +607,7 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local, + int i; + + mutex_lock(&local->key_mtx); +- for (i = 0; i < NUM_DEFAULT_KEYS; i++) { ++ for (i = 0; i < ARRAY_SIZE(sta->gtk); i++) { + key = key_mtx_dereference(local, sta->gtk[i]); + if (!key) + continue; +diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c +index 85bc6d498b46..9299a38c372e 100644 +--- a/net/mac80211/rx.c ++++ b/net/mac80211/rx.c +@@ -1585,14 +1585,14 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) + sc = le16_to_cpu(hdr->seq_ctrl); + frag = sc & IEEE80211_SCTL_FRAG; + +- if (likely(!ieee80211_has_morefrags(fc) && frag == 0)) +- goto out; +- + if (is_multicast_ether_addr(hdr->addr1)) { + rx->local->dot11MulticastReceivedFrameCount++; +- goto out; ++ goto out_no_led; + } + ++ if (likely(!ieee80211_has_morefrags(fc) && frag == 0)) ++ goto out; ++ + I802_DEBUG_INC(rx->local->rx_handlers_fragments); + + if (skb_linearize(rx->skb)) +@@ -1683,9 +1683,10 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) + status->rx_flags |= IEEE80211_RX_FRAGMENTED; + + out: ++ ieee80211_led_rx(rx->local); ++ out_no_led: + if (rx->sta) + rx->sta->rx_packets++; +- ieee80211_led_rx(rx->local); + return RX_CONTINUE; + } + +diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c +index 9e1e005c7596..c4c8df4b214d 100644 +--- a/security/keys/encrypted-keys/encrypted.c ++++ b/security/keys/encrypted-keys/encrypted.c +@@ -1018,10 +1018,13 @@ static int __init init_encrypted(void) + ret = encrypted_shash_alloc(); + if (ret < 0) + return ret; ++ ret = aes_get_sizes(); ++ if (ret < 0) ++ goto out; + ret = register_key_type(&key_type_encrypted); + if (ret < 0) + goto out; +- return aes_get_sizes(); ++ return 0; + out: + encrypted_shash_release(); + return ret; +diff --git 
a/tools/testing/selftests/mount/unprivileged-remount-test.c b/tools/testing/selftests/mount/unprivileged-remount-test.c +index 1b3ff2fda4d0..517785052f1c 100644 +--- a/tools/testing/selftests/mount/unprivileged-remount-test.c ++++ b/tools/testing/selftests/mount/unprivileged-remount-test.c +@@ -6,6 +6,8 @@ + #include <sys/types.h> + #include <sys/mount.h> + #include <sys/wait.h> ++#include <sys/vfs.h> ++#include <sys/statvfs.h> + #include <stdlib.h> + #include <unistd.h> + #include <fcntl.h> +@@ -32,11 +34,14 @@ + # define CLONE_NEWPID 0x20000000 + #endif + ++#ifndef MS_REC ++# define MS_REC 16384 ++#endif + #ifndef MS_RELATIME +-#define MS_RELATIME (1 << 21) ++# define MS_RELATIME (1 << 21) + #endif + #ifndef MS_STRICTATIME +-#define MS_STRICTATIME (1 << 24) ++# define MS_STRICTATIME (1 << 24) + #endif + + static void die(char *fmt, ...) +@@ -48,17 +53,14 @@ static void die(char *fmt, ...) + exit(EXIT_FAILURE); + } + +-static void write_file(char *filename, char *fmt, ...) ++static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap) + { + char buf[4096]; + int fd; + ssize_t written; + int buf_len; +- va_list ap; + +- va_start(ap, fmt); + buf_len = vsnprintf(buf, sizeof(buf), fmt, ap); +- va_end(ap); + if (buf_len < 0) { + die("vsnprintf failed: %s\n", + strerror(errno)); +@@ -69,6 +71,8 @@ static void write_file(char *filename, char *fmt, ...) + + fd = open(filename, O_WRONLY); + if (fd < 0) { ++ if ((errno == ENOENT) && enoent_ok) ++ return; + die("open of %s failed: %s\n", + filename, strerror(errno)); + } +@@ -87,6 +91,65 @@ static void write_file(char *filename, char *fmt, ...) + } + } + ++static void maybe_write_file(char *filename, char *fmt, ...) ++{ ++ va_list ap; ++ ++ va_start(ap, fmt); ++ vmaybe_write_file(true, filename, fmt, ap); ++ va_end(ap); ++ ++} ++ ++static void write_file(char *filename, char *fmt, ...) 
++{ ++ va_list ap; ++ ++ va_start(ap, fmt); ++ vmaybe_write_file(false, filename, fmt, ap); ++ va_end(ap); ++ ++} ++ ++static int read_mnt_flags(const char *path) ++{ ++ int ret; ++ struct statvfs stat; ++ int mnt_flags; ++ ++ ret = statvfs(path, &stat); ++ if (ret != 0) { ++ die("statvfs of %s failed: %s\n", ++ path, strerror(errno)); ++ } ++ if (stat.f_flag & ~(ST_RDONLY | ST_NOSUID | ST_NODEV | \ ++ ST_NOEXEC | ST_NOATIME | ST_NODIRATIME | ST_RELATIME | \ ++ ST_SYNCHRONOUS | ST_MANDLOCK)) { ++ die("Unrecognized mount flags\n"); ++ } ++ mnt_flags = 0; ++ if (stat.f_flag & ST_RDONLY) ++ mnt_flags |= MS_RDONLY; ++ if (stat.f_flag & ST_NOSUID) ++ mnt_flags |= MS_NOSUID; ++ if (stat.f_flag & ST_NODEV) ++ mnt_flags |= MS_NODEV; ++ if (stat.f_flag & ST_NOEXEC) ++ mnt_flags |= MS_NOEXEC; ++ if (stat.f_flag & ST_NOATIME) ++ mnt_flags |= MS_NOATIME; ++ if (stat.f_flag & ST_NODIRATIME) ++ mnt_flags |= MS_NODIRATIME; ++ if (stat.f_flag & ST_RELATIME) ++ mnt_flags |= MS_RELATIME; ++ if (stat.f_flag & ST_SYNCHRONOUS) ++ mnt_flags |= MS_SYNCHRONOUS; ++ if (stat.f_flag & ST_MANDLOCK) ++ mnt_flags |= ST_MANDLOCK; ++ ++ return mnt_flags; ++} ++ + static void create_and_enter_userns(void) + { + uid_t uid; +@@ -100,13 +163,10 @@ static void create_and_enter_userns(void) + strerror(errno)); + } + ++ maybe_write_file("/proc/self/setgroups", "deny"); + write_file("/proc/self/uid_map", "0 %d 1", uid); + write_file("/proc/self/gid_map", "0 %d 1", gid); + +- if (setgroups(0, NULL) != 0) { +- die("setgroups failed: %s\n", +- strerror(errno)); +- } + if (setgid(0) != 0) { + die ("setgid(0) failed %s\n", + strerror(errno)); +@@ -118,7 +178,8 @@ static void create_and_enter_userns(void) + } + + static +-bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags) ++bool test_unpriv_remount(const char *fstype, const char *mount_options, ++ int mount_flags, int remount_flags, int invalid_flags) + { + pid_t child; + +@@ -151,9 +212,11 @@ bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags) + strerror(errno)); + } + +- if (mount("testing", "/tmp", "ramfs", mount_flags, NULL) != 0) { +- die("mount of /tmp failed: %s\n", +- strerror(errno)); ++ if (mount("testing", "/tmp", fstype, mount_flags, mount_options) != 0) { ++ die("mount of %s with options '%s' on /tmp failed: %s\n", ++ fstype, ++ mount_options? 
mount_options : "", ++ strerror(errno)); + } + + create_and_enter_userns(); +@@ -181,62 +244,127 @@ bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags) + + static bool test_unpriv_remount_simple(int mount_flags) + { +- return test_unpriv_remount(mount_flags, mount_flags, 0); ++ return test_unpriv_remount("ramfs", NULL, mount_flags, mount_flags, 0); + } + + static bool test_unpriv_remount_atime(int mount_flags, int invalid_flags) + { +- return test_unpriv_remount(mount_flags, mount_flags, invalid_flags); ++ return test_unpriv_remount("ramfs", NULL, mount_flags, mount_flags, ++ invalid_flags); ++} ++ ++static bool test_priv_mount_unpriv_remount(void) ++{ ++ pid_t child; ++ int ret; ++ const char *orig_path = "/dev"; ++ const char *dest_path = "/tmp"; ++ int orig_mnt_flags, remount_mnt_flags; ++ ++ child = fork(); ++ if (child == -1) { ++ die("fork failed: %s\n", ++ strerror(errno)); ++ } ++ if (child != 0) { /* parent */ ++ pid_t pid; ++ int status; ++ pid = waitpid(child, &status, 0); ++ if (pid == -1) { ++ die("waitpid failed: %s\n", ++ strerror(errno)); ++ } ++ if (pid != child) { ++ die("waited for %d got %d\n", ++ child, pid); ++ } ++ if (!WIFEXITED(status)) { ++ die("child did not terminate cleanly\n"); ++ } ++ return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false; ++ } ++ ++ orig_mnt_flags = read_mnt_flags(orig_path); ++ ++ create_and_enter_userns(); ++ ret = unshare(CLONE_NEWNS); ++ if (ret != 0) { ++ die("unshare(CLONE_NEWNS) failed: %s\n", ++ strerror(errno)); ++ } ++ ++ ret = mount(orig_path, dest_path, "bind", MS_BIND | MS_REC, NULL); ++ if (ret != 0) { ++ die("recursive bind mount of %s onto %s failed: %s\n", ++ orig_path, dest_path, strerror(errno)); ++ } ++ ++ ret = mount(dest_path, dest_path, "none", ++ MS_REMOUNT | MS_BIND | orig_mnt_flags , NULL); ++ if (ret != 0) { ++ /* system("cat /proc/self/mounts"); */ ++ die("remount of /tmp failed: %s\n", ++ strerror(errno)); ++ } ++ ++ remount_mnt_flags = read_mnt_flags(dest_path); ++ if (orig_mnt_flags != remount_mnt_flags) { ++ die("Mount flags unexpectedly changed during remount of %s originally mounted on %s\n", ++ dest_path, orig_path); ++ } ++ exit(EXIT_SUCCESS); + } + + int main(int argc, char **argv) + { +- if (!test_unpriv_remount_simple(MS_RDONLY|MS_NODEV)) { ++ if (!test_unpriv_remount_simple(MS_RDONLY)) { + die("MS_RDONLY malfunctions\n"); + } +- if (!test_unpriv_remount_simple(MS_NODEV)) { ++ if (!test_unpriv_remount("devpts", "newinstance", MS_NODEV, MS_NODEV, 0)) { + die("MS_NODEV malfunctions\n"); + } +- if (!test_unpriv_remount_simple(MS_NOSUID|MS_NODEV)) { ++ if (!test_unpriv_remount_simple(MS_NOSUID)) { + die("MS_NOSUID malfunctions\n"); + } +- if (!test_unpriv_remount_simple(MS_NOEXEC|MS_NODEV)) { ++ if (!test_unpriv_remount_simple(MS_NOEXEC)) { + die("MS_NOEXEC malfunctions\n"); + } +- if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODEV, +- MS_NOATIME|MS_NODEV)) ++ if (!test_unpriv_remount_atime(MS_RELATIME, ++ MS_NOATIME)) + { + die("MS_RELATIME malfunctions\n"); + } +- if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODEV, +- MS_NOATIME|MS_NODEV)) ++ if (!test_unpriv_remount_atime(MS_STRICTATIME, ++ MS_NOATIME)) + { + die("MS_STRICTATIME malfunctions\n"); + } +- if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODEV, +- MS_STRICTATIME|MS_NODEV)) ++ if (!test_unpriv_remount_atime(MS_NOATIME, ++ MS_STRICTATIME)) + { +- die("MS_RELATIME malfunctions\n"); ++ die("MS_NOATIME malfunctions\n"); + } +- if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODIRATIME|MS_NODEV, +- 
MS_NOATIME|MS_NODEV)) ++ if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODIRATIME, ++ MS_NOATIME)) + { +- die("MS_RELATIME malfunctions\n"); ++ die("MS_RELATIME|MS_NODIRATIME malfunctions\n"); + } +- if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODIRATIME|MS_NODEV, +- MS_NOATIME|MS_NODEV)) ++ if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODIRATIME, ++ MS_NOATIME)) + { +- die("MS_RELATIME malfunctions\n"); ++ die("MS_STRICTATIME|MS_NODIRATIME malfunctions\n"); + } +- if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODIRATIME|MS_NODEV, +- MS_STRICTATIME|MS_NODEV)) ++ if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODIRATIME, ++ MS_STRICTATIME)) + { +- die("MS_RELATIME malfunctions\n"); ++ die("MS_NOATIME|MS_DIRATIME malfunctions\n"); + } +- if (!test_unpriv_remount(MS_STRICTATIME|MS_NODEV, MS_NODEV, +- MS_NOATIME|MS_NODEV)) ++ if (!test_unpriv_remount("ramfs", NULL, MS_STRICTATIME, 0, MS_NOATIME)) + { + die("Default atime malfunctions\n"); + } ++ if (!test_priv_mount_unpriv_remount()) { ++ die("Mount flags unexpectedly changed after remount\n"); ++ } + return EXIT_SUCCESS; + } diff --git a/5000_BFQ-4-block-Switch-from-v6r2-for-3.10.0-v6r2-for-3.10.patch b/5000_BFQ-4-block-Switch-from-v6r2-for-3.10.0-v6r2-for-3.10.patch deleted file mode 100755 index 8f850c6c..00000000 --- a/5000_BFQ-4-block-Switch-from-v6r2-for-3.10.0-v6r2-for-3.10.patch +++ /dev/null @@ -1,59 +0,0 @@ -From 994451c7668678f1bf3ec86345bef1c1d549ba45 Mon Sep 17 00:00:00 2001 -From: Arianna Avanzini <avanzini.arianna@gmail.com> -Date: Wed, 24 Jul 2013 21:43:47 +0200 -Subject: [PATCH] block: Switch from BFQ-v6r2 for 3.10.0 to BFQ-v6r2 for - 3.10.8-rc1. - ---- - block/bfq-iosched.c | 18 +++++++++++++++--- - 1 file changed, 15 insertions(+), 3 deletions(-) - -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index bc57923..bbe79fb 100644 ---- a/block/bfq-iosched.c -+++ b/block/bfq-iosched.c -@@ -2930,14 +2930,22 @@ static void bfq_exit_queue(struct elevator_queue *e) - kfree(bfqd); - } - --static int bfq_init_queue(struct request_queue *q) -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - { - struct bfq_group *bfqg; - struct bfq_data *bfqd; -+ struct elevator_queue *eq; -+ -+ eq = elevator_alloc(q, e); -+ if (eq == NULL) -+ return -ENOMEM; - - bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node); -- if (bfqd == NULL) -+ if (bfqd == NULL) { -+ kobject_put(&eq->kobj); - return -ENOMEM; -+ } -+ eq->elevator_data = bfqd; - - /* - * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 
-@@ -2948,11 +2956,15 @@ static int bfq_init_queue(struct request_queue *q) - atomic_inc(&bfqd->oom_bfqq.ref); - - bfqd->queue = q; -- q->elevator->elevator_data = bfqd; -+ -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); - - bfqg = bfq_alloc_root_group(bfqd, q->node); - if (bfqg == NULL) { - kfree(bfqd); -+ kobject_put(&eq->kobj); - return -ENOMEM; - } - --- -1.8.1.4 - diff --git a/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.10.patch b/5001_BFQ-1-block-cgroups-kconfig-build-bits-for-v7r7-3.10.patch index 6d842d8e..c76d0e98 100644 --- a/5000_BFQ-1-block-cgroups-kconfig-build-bits-for-v6r2-3.10.patch +++ b/5001_BFQ-1-block-cgroups-kconfig-build-bits-for-v7r7-3.10.patch @@ -1,7 +1,7 @@ -From 13fa5ddac2963e304e90c5beb4bc996e3557479d Mon Sep 17 00:00:00 2001 -From: Matteo Bernardini <matteo.bernardini@gmail.com> -Date: Thu, 9 May 2013 18:58:50 +0200 -Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v6r2-3.10 +From d8a4dc281659c63154708bfd1a66c7ad0fdd2f09 Mon Sep 17 00:00:00 2001 +From: Arianna Avanzini <avanzini.arianna@gmail.com> +Date: Mon, 27 Jan 2014 23:50:08 +0100 +Subject: [PATCH 1/3] block: cgroups, kconfig, build bits for BFQ-v7r7-3.10 Update Kconfig.iosched and do the related Makefile changes to include kernel configuration options for BFQ. Also add the bfqio controller @@ -9,15 +9,14 @@ to the cgroups subsystem. Signed-off-by: Paolo Valente <paolo.valente@unimore.it> Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com> -Signed-off-by: Matteo Bernardini <matteo.bernardini@gmail.com> --- - block/Kconfig.iosched | 25 +++++++++++++++++++++++++ + block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++ block/Makefile | 1 + include/linux/cgroup_subsys.h | 6 ++++++ - 3 files changed, 32 insertions(+) + 3 files changed, 39 insertions(+) diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index 421bef9..695e064 100644 +index 421bef9..0ee5f0f 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED @@ -33,7 +32,7 @@ index 421bef9..695e064 100644 + It aims at distributing the bandwidth as desired, independently of + the disk parameters and with any workload. It also tries to + guarantee low latency to interactive and soft real-time -+ applications. If compiled built-in (saying Y here), BFQ can ++ applications. If compiled built-in (saying Y here), BFQ can + be configured to support hierarchical scheduling. + +config CGROUP_BFQIO @@ -48,17 +47,24 @@ index 421bef9..695e064 100644 choice prompt "Default I/O scheduler" default DEFAULT_CFQ -@@ -52,6 +73,9 @@ choice +@@ -52,6 +73,16 @@ choice config DEFAULT_CFQ bool "CFQ" if IOSCHED_CFQ=y + config DEFAULT_BFQ + bool "BFQ" if IOSCHED_BFQ=y ++ help ++ Selects BFQ as the default I/O scheduler which will be ++ used by default for all block devices. ++ The BFQ I/O scheduler aims at distributing the bandwidth ++ as desired, independently of the disk parameters and with ++ any workload. It also tries to guarantee low latency to ++ interactive and soft real-time applications. 
+ config DEFAULT_NOOP bool "No-op" -@@ -61,6 +85,7 @@ config DEFAULT_IOSCHED +@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED string default "deadline" if DEFAULT_DEADLINE default "cfq" if DEFAULT_CFQ @@ -79,7 +85,7 @@ index 39b76ba..c0d20fa 100644 obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h -index 6e7ec64..ffa1d1f 100644 +index 6e7ec64..e5e6b0d 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -84,3 +84,9 @@ SUBSYS(bcache) @@ -87,11 +93,11 @@ index 6e7ec64..ffa1d1f 100644 /* */ + -+#ifdef CONFIG_CGROUP_BFQIO ++#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO) +SUBSYS(bfqio) +#endif + +/* */ -- -1.8.1.4 +2.1.3 diff --git a/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.10.patch1 b/5002_BFQ-2-block-introduce-the-v7r7-I-O-sched-for-3.10.patch1 index a9bcc372..07fd62f9 100644 --- a/5000_BFQ-2-block-introduce-the-v6r2-I-O-sched-for-3.10.patch1 +++ b/5002_BFQ-2-block-introduce-the-v7r7-I-O-sched-for-3.10.patch1 @@ -1,13 +1,15 @@ -From 2e949c3d4d8ba2af46dcedc80707ebba277d759f Mon Sep 17 00:00:00 2001 -From: Arianna Avanzini <avanzini.arianna@gmail.com> +From db537b3062665d5442c516b31d396e61dc4c145c Mon Sep 17 00:00:00 2001 +From: Paolo Valente <paolo.valente@unimore.it> Date: Thu, 9 May 2013 19:10:02 +0200 -Subject: [PATCH 2/3] block: introduce the BFQ-v6r2 I/O sched for 3.10 +Subject: [PATCH 2/3] block: introduce the BFQ-v7r7 I/O sched for 3.10 -Add the BFQ-v6r2 I/O scheduler to 3.10. -The general structure is borrowed from CFQ, as much code. A (bfq_)queue -is associated to each task doing I/O on a device, and each time a -scheduling decision has to be made a queue is selected and served until -it expires. +Add the BFQ-v7r7 I/O scheduler to 3.10. +The general structure is borrowed from CFQ, as much of the code for +handling I/O contexts Over time, several useful features have been +ported from CFQ as well (details in the changelog in README.BFQ). A +(bfq_)queue is associated to each task doing I/O on a device, and each +time a scheduling decision has to be made a queue is selected and served +until it expires. - Slices are given in the service domain: tasks are assigned budgets, measured in number of sectors. Once got the disk, a task @@ -22,25 +24,27 @@ it expires. preserving an O(log N) overall complexity. - A low-latency tunable is provided; if enabled, both interactive - and soft real-time applications are guaranteed very low latency. + and soft real-time applications are guaranteed a very low latency. - - Latency guarantees are preserved also in presence of NCQ. + - Latency guarantees are preserved also in the presence of NCQ. - - Also with flash-based devices, a high throughput is achieved while - still preserving latency guarantees. + - Also with flash-based devices, a high throughput is achieved + while still preserving latency guarantees. - - Useful features borrowed from CFQ: cooperating-queues merging (with - some additional optimizations with respect to the original CFQ version), - static fallback queue for OOM. + - BFQ features Early Queue Merge (EQM), a sort of fusion of the + cooperating-queue-merging and the preemption mechanisms present + in CFQ. EQM is in fact a unified mechanism that tries to get a + sequential read pattern, and hence a high throughput, with any + set of processes performing interleaved I/O over a contiguous + sequence of sectors. - BFQ supports full hierarchical scheduling, exporting a cgroups - interface. 
Each node has a full scheduler, so each group can - be assigned its own ioprio (mapped to a weight, see next point) - and an ioprio_class. + interface. Since each node has a full scheduler, each group can + be assigned its own weight. - - If the cgroups interface is used, weights can be explictly - assigned, otherwise ioprio values are mapped to weights using the - relation weight = IOPRIO_BE_NR - ioprio. + - If the cgroups interface is not used, only I/O priorities can be + assigned to processes, with ioprio values mapped to weights + with the relation weight = IOPRIO_BE_NR - ioprio. - ioprio classes are served in strict priority order, i.e., lower priority queues are not served as long as there are higher @@ -52,13 +56,12 @@ it expires. Signed-off-by: Paolo Valente <paolo.valente@unimore.it> Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com> --- - block/bfq-cgroup.c | 881 ++++++++++++ - block/bfq-ioc.c | 36 + - block/bfq-iosched.c | 3070 +++++++++++++++++++++++++++++++++++++++++ - block/bfq-sched.c | 1072 ++++++++++++++ - block/bfq.h | 603 ++++++++ - include/linux/cgroup_subsys.h | 2 +- - 6 files changed, 5663 insertions(+), 1 deletion(-) + block/bfq-cgroup.c | 913 ++++++++++++ + block/bfq-ioc.c | 36 + + block/bfq-iosched.c | 3890 +++++++++++++++++++++++++++++++++++++++++++++++++++ + block/bfq-sched.c | 1214 ++++++++++++++++ + block/bfq.h | 773 ++++++++++ + 5 files changed, 6826 insertions(+) create mode 100644 block/bfq-cgroup.c create mode 100644 block/bfq-ioc.c create mode 100644 block/bfq-iosched.c @@ -67,10 +70,10 @@ Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com> diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c new file mode 100644 -index 0000000..6d57239 +index 0000000..e4d7b8a --- /dev/null +++ b/block/bfq-cgroup.c -@@ -0,0 +1,881 @@ +@@ -0,0 +1,913 @@ +/* + * BFQ: CGROUPS support. + * @@ -82,7 +85,8 @@ index 0000000..6d57239 + * + * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it> + * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ ++ * file. + */ + +#ifdef CONFIG_CGROUP_BFQIO @@ -151,6 +155,12 @@ index 0000000..6d57239 + entity->new_weight = bfq_ioprio_to_weight(bgrp->ioprio); + entity->new_ioprio = bgrp->ioprio; + } else { ++ if (bgrp->weight < BFQ_MIN_WEIGHT || ++ bgrp->weight > BFQ_MAX_WEIGHT) { ++ printk(KERN_CRIT "bfq_group_init_entity: " ++ "bgrp->weight %d\n", bgrp->weight); ++ BUG(); ++ } + entity->new_weight = bgrp->weight; + entity->new_ioprio = bfq_weight_to_ioprio(bgrp->weight); + } @@ -158,6 +168,7 @@ index 0000000..6d57239 + entity->ioprio = entity->new_ioprio; + entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; + entity->my_sched_data = &bfqg->sched_data; ++ bfqg->active_entities = 0; +} + +static inline void bfq_group_set_parent(struct bfq_group *bfqg, @@ -215,8 +226,9 @@ index 0000000..6d57239 + bfq_group_set_parent(prev, bfqg); + /* + * Build a list of allocated nodes using the bfqd -+ * filed, that is still unused and will be initialized -+ * only after the node will be connected. ++ * filed, that is still unused and will be ++ * initialized only after the node will be ++ * connected. + */ + prev->bfqd = bfqg; + prev = bfqg; @@ -236,7 +248,8 @@ index 0000000..6d57239 +} + +/** -+ * bfq_group_chain_link - link an allocatd group chain to a cgroup hierarchy. ++ * bfq_group_chain_link - link an allocated group chain to a cgroup ++ * hierarchy. + * @bfqd: the queue descriptor. 
+ * @cgroup: the leaf cgroup to start from. + * @leaf: the leaf group (to be associated to @cgroup). @@ -296,7 +309,7 @@ index 0000000..6d57239 + * to the root have a group associated to @bfqd. + * + * If the allocation fails, return the root group: this breaks guarantees -+ * but is a safe fallbak. If this loss becames a problem it can be ++ * but is a safe fallback. If this loss becomes a problem it can be + * mitigated using the equivalent weight (given by the product of the + * weights of the groups in the path from @group to the root) in the + * root scheduler. @@ -347,7 +360,8 @@ index 0000000..6d57239 + resume = !RB_EMPTY_ROOT(&bfqq->sort_list); + + BUG_ON(resume && !entity->on_st); -+ BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue); ++ BUG_ON(busy && !resume && entity->on_st && ++ bfqq != bfqd->in_service_queue); + + if (busy) { + BUG_ON(atomic_read(&bfqq->ref) < 2); @@ -370,7 +384,7 @@ index 0000000..6d57239 + if (busy && resume) + bfq_activate_bfqq(bfqd, bfqq); + -+ if (bfqd->active_queue == NULL && !bfqd->rq_in_driver) ++ if (bfqd->in_service_queue == NULL && !bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); +} + @@ -435,7 +449,8 @@ index 0000000..6d57239 + struct bfq_data *bfqd; + unsigned long uninitialized_var(flags); + -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags); ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), ++ &flags); + if (bfqd != NULL) { + __bfq_bic_change_cgroup(bfqd, bic, cgroup); + bfq_put_bfqd_unlock(bfqd, &flags); @@ -507,7 +522,8 @@ index 0000000..6d57239 +} + +/** -+ * bfq_reparent_active_entities - move to the root group all active entities. ++ * bfq_reparent_active_entities - move to the root group all active ++ * entities. + * @bfqd: the device data structure with the root group. + * @bfqg: the group to move from. + * @st: the service tree with the entities. @@ -524,11 +540,12 @@ index 0000000..6d57239 + if (!RB_EMPTY_ROOT(&st->active)) + entity = bfq_entity_of(rb_first(active)); + -+ for (; entity != NULL ; entity = bfq_entity_of(rb_first(active))) ++ for (; entity != NULL; entity = bfq_entity_of(rb_first(active))) + bfq_reparent_leaf_entity(bfqd, entity); + -+ if (bfqg->sched_data.active_entity != NULL) -+ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity); ++ if (bfqg->sched_data.in_service_entity != NULL) ++ bfq_reparent_leaf_entity(bfqd, ++ bfqg->sched_data.in_service_entity); + + return; +} @@ -551,8 +568,8 @@ index 0000000..6d57239 + hlist_del(&bfqg->group_node); + + /* -+ * Empty all service_trees belonging to this group before deactivating -+ * the group itself. ++ * Empty all service_trees belonging to this group before ++ * deactivating the group itself. + */ + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { + st = bfqg->sched_data.service_tree + i; @@ -560,7 +577,7 @@ index 0000000..6d57239 + /* + * The idle tree may still contain bfq_queues belonging + * to exited task because they never migrated to a different -+ * cgroup from the one being destroyed now. Noone else ++ * cgroup from the one being destroyed now. No one else + * can access them so it's safe to act without any lock. + */ + bfq_flush_idle_tree(st); @@ -572,7 +589,7 @@ index 0000000..6d57239 + * all the leaf entities corresponding to these queues + * to the root_group. + * Also, it may happen that the group has an entity -+ * under service, which is disconnected from the active ++ * in service, which is disconnected from the active + * tree: it must be moved, too. 
+ * There is no need to put the sync queues, as the + * scheduler has taken no reference. @@ -585,8 +602,8 @@ index 0000000..6d57239 + BUG_ON(!RB_EMPTY_ROOT(&st->active)); + BUG_ON(!RB_EMPTY_ROOT(&st->idle)); + } -+ BUG_ON(bfqg->sched_data.next_active != NULL); -+ BUG_ON(bfqg->sched_data.active_entity != NULL); ++ BUG_ON(bfqg->sched_data.next_in_service != NULL); ++ BUG_ON(bfqg->sched_data.in_service_entity != NULL); + + /* + * We may race with device destruction, take extra care when @@ -604,23 +621,24 @@ index 0000000..6d57239 + /* + * No need to defer the kfree() to the end of the RCU grace + * period: we are called from the destroy() callback of our -+ * cgroup, so we can be sure that noone is a) still using ++ * cgroup, so we can be sure that no one is a) still using + * this cgroup or b) doing lookups in it. + */ + kfree(bfqg); +} + -+static void bfq_end_raising_async(struct bfq_data *bfqd) ++static void bfq_end_wr_async(struct bfq_data *bfqd) +{ + struct hlist_node *tmp; + struct bfq_group *bfqg; + + hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) -+ bfq_end_raising_async_queues(bfqd, bfqg); ++ bfq_end_wr_async_queues(bfqd, bfqg); ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group); +} + +/** -+ * bfq_disconnect_groups - diconnect @bfqd from all its groups. ++ * bfq_disconnect_groups - disconnect @bfqd from all its groups. + * @bfqd: the device descriptor being exited. + * + * When the device exits we just make sure that no lookup can return @@ -632,7 +650,7 @@ index 0000000..6d57239 + struct hlist_node *tmp; + struct bfq_group *bfqg; + -+ bfq_log(bfqd, "disconnect_groups beginning") ; ++ bfq_log(bfqd, "disconnect_groups beginning"); + hlist_for_each_entry_safe(bfqg, tmp, &bfqd->group_list, bfqd_node) { + hlist_del(&bfqg->bfqd_node); + @@ -648,7 +666,7 @@ index 0000000..6d57239 + rcu_assign_pointer(bfqg->bfqd, NULL); + + bfq_log(bfqd, "disconnect_groups: put async for group %p", -+ bfqg) ; ++ bfqg); + bfq_put_async_queues(bfqd, bfqg); + } +} @@ -677,7 +695,7 @@ index 0000000..6d57239 + struct bfqio_cgroup *bgrp; + int i; + -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); ++ bfqg = kzalloc_node(sizeof(*bfqg), GFP_KERNEL, node); + if (bfqg == NULL) + return NULL; + @@ -744,16 +762,31 @@ index 0000000..6d57239 + bgrp->__VAR = (unsigned short)val; \ + hlist_for_each_entry(bfqg, &bgrp->group_data, group_node) { \ + /* \ -+ * Setting the ioprio_changed flag of the entity \ -+ * to 1 with new_##__VAR == ##__VAR would re-set \ -+ * the value of the weight to its ioprio mapping. \ -+ * Set the flag only if necessary. \ -+ */ \ -+ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \ -+ bfqg->entity.new_##__VAR = (unsigned short)val; \ -+ smp_wmb(); \ -+ bfqg->entity.ioprio_changed = 1; \ -+ } \ ++ * Setting the ioprio_changed flag of the entity \ ++ * to 1 with new_##__VAR == ##__VAR would re-set \ ++ * the value of the weight to its ioprio mapping. \ ++ * Set the flag only if necessary. \ ++ */ \ ++ if ((unsigned short)val != bfqg->entity.new_##__VAR) { \ ++ bfqg->entity.new_##__VAR = (unsigned short)val; \ ++ /* \ ++ * Make sure that the above new value has been \ ++ * stored in bfqg->entity.new_##__VAR before \ ++ * setting the ioprio_changed flag. 
In fact, \ ++ * this flag may be read asynchronously (in \ ++ * critical sections protected by a different \ ++ * lock than that held here), and finding this \ ++ * flag set may cause the execution of the code \ ++ * for updating parameters whose value may \ ++ * depend also on bfqg->entity.new_##__VAR (in \ ++ * __bfq_entity_update_weight_prio). \ ++ * This barrier makes sure that the new value \ ++ * of bfqg->entity.new_##__VAR is correctly \ ++ * seen in that code. \ ++ */ \ ++ smp_wmb(); \ ++ bfqg->entity.ioprio_changed = 1; \ ++ } \ + } \ + spin_unlock_irq(&bgrp->lock); \ + \ @@ -825,10 +858,11 @@ index 0000000..6d57239 + ioc = task->io_context; + if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) + /* -+ * ioc == NULL means that the task is either too young or -+ * exiting: if it has still no ioc the ioc can't be shared, -+ * if the task is exiting the attach will fail anyway, no -+ * matter what we return here. ++ * ioc == NULL means that the task is either too ++ * young or exiting: if it has still no ioc the ++ * ioc can't be shared, if the task is exiting the ++ * attach will fail anyway, no matter what we ++ * return here. + */ + ret = -EINVAL; + task_unlock(task); @@ -857,8 +891,9 @@ index 0000000..6d57239 + */ + rcu_read_lock(); + hlist_for_each_entry_rcu(icq, &ioc->icq_list, ioc_node) -+ if (!strncmp(icq->q->elevator->type->elevator_name, -+ "bfq", ELV_NAME_MAX)) ++ if (!strncmp( ++ icq->q->elevator->type->elevator_name, ++ "bfq", ELV_NAME_MAX)) + bfq_bic_change_cgroup(icq_to_bic(icq), + cgroup); + rcu_read_unlock(); @@ -922,9 +957,9 @@ index 0000000..6d57239 +{ +} + -+static void bfq_end_raising_async(struct bfq_data *bfqd) ++static void bfq_end_wr_async(struct bfq_data *bfqd) +{ -+ bfq_end_raising_async_queues(bfqd, bfqd->root_group); ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group); +} + +static inline void bfq_disconnect_groups(struct bfq_data *bfqd) @@ -954,7 +989,7 @@ index 0000000..6d57239 +#endif diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c new file mode 100644 -index 0000000..326e3ec +index 0000000..7f6b000 --- /dev/null +++ b/block/bfq-ioc.c @@ -0,0 +1,36 @@ @@ -990,18 +1025,18 @@ index 0000000..326e3ec +static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + struct io_context *ioc) +{ -+ if(ioc) ++ if (ioc) + return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); + return NULL; +} diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c new file mode 100644 -index 0000000..b230927 +index 0000000..9de51e3 --- /dev/null +++ b/block/bfq-iosched.c -@@ -0,0 +1,3070 @@ +@@ -0,0 +1,3890 @@ +/* -+ * BFQ, or Budget Fair Queueing, disk scheduler. ++ * Budget Fair Queueing (BFQ) disk scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> @@ -1011,50 +1046,55 @@ index 0000000..b230927 + * + * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it> + * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ ++ * file. + * -+ * BFQ is a proportional share disk scheduling algorithm based on the -+ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, -+ * measured in number of sectors, to tasks instead of time slices. -+ * The disk is not granted to the active task for a given time slice, -+ * but until it has exahusted its assigned budget. 
This change from -+ * the time to the service domain allows BFQ to distribute the disk -+ * bandwidth among tasks as desired, without any distortion due to -+ * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc -+ * internal scheduler, called B-WF2Q+, to schedule tasks according to -+ * their budgets. Thanks to this accurate scheduler, BFQ can afford -+ * to assign high budgets to disk-bound non-seeky tasks (to boost the -+ * throughput), and yet guarantee low latencies to interactive and -+ * soft real-time applications. ++ * BFQ is a proportional-share storage-I/O scheduling algorithm based on ++ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, ++ * measured in number of sectors, to processes instead of time slices. The ++ * device is not granted to the in-service process for a given time slice, ++ * but until it has exhausted its assigned budget. This change from the time ++ * to the service domain allows BFQ to distribute the device throughput ++ * among processes as desired, without any distortion due to ZBR, workload ++ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, ++ * called B-WF2Q+, to schedule processes according to their budgets. More ++ * precisely, BFQ schedules queues associated to processes. Thanks to the ++ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to ++ * I/O-bound processes issuing sequential requests (to boost the ++ * throughput), and yet guarantee a low latency to interactive and soft ++ * real-time applications. + * -+ * BFQ has been introduced in [1], where the interested reader can -+ * find an accurate description of the algorithm, the bandwidth -+ * distribution and latency guarantees it provides, plus formal proofs -+ * of all the properties. With respect to the algorithm presented in -+ * the paper, this implementation adds several little heuristics, and -+ * a hierarchical extension, based on H-WF2Q+. ++ * BFQ is described in [1], where also a reference to the initial, more ++ * theoretical paper on BFQ can be found. The interested reader can find ++ * in the latter paper full details on the main algorithm, as well as ++ * formulas of the guarantees and formal proofs of all the properties. ++ * With respect to the version of BFQ presented in these papers, this ++ * implementation adds a few more heuristics, such as the one that ++ * guarantees a low latency to soft real-time applications, and a ++ * hierarchical extension based on H-WF2Q+. + * + * B-WF2Q+ is based on WF2Q+, that is described in [2], together with + * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) + * complexity derives from the one introduced with EEVDF in [3]. + * -+ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling -+ * with Deterministic Guarantees on Bandwidth Distribution,'', -+ * IEEE Transactions on Computer, May 2010. ++ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness ++ * with the BFQ Disk I/O Scheduler'', ++ * Proceedings of the 5th Annual International Systems and Storage ++ * Conference (SYSTOR '12), June 2012. + * -+ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf + * + * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing + * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, + * Oct 1997. 
+ * -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz + * + * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline + * First: A Flexible and Accurate Mechanism for Proportional Share + * Resource Allocation,'' technical report. + * -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf + */ +#include <linux/module.h> +#include <linux/slab.h> @@ -1089,7 +1129,7 @@ index 0000000..b230927 +/* + * Async to sync throughput distribution is controlled as follows: + * when an async request is served, the entity is charged the number -+ * of sectors of the request, multipled by the factor below ++ * of sectors of the request, multiplied by the factor below + */ +static const int bfq_async_charge_factor = 10; + @@ -1116,21 +1156,46 @@ index 0000000..b230927 +#define BFQ_RATE_SHIFT 16 + +/* -+ * The duration of the weight raising for interactive applications is -+ * computed automatically (as default behaviour), using the following -+ * formula: duration = (R / r) * T, where r is the peak rate of the -+ * disk, and R and T are two reference parameters. In particular, R is -+ * the peak rate of a reference disk, and T is about the maximum time -+ * for starting popular large applications on that disk, under BFQ and -+ * while reading two files in parallel. Finally, BFQ uses two -+ * different pairs (R, T) depending on whether the disk is rotational -+ * or non-rotational. ++ * By default, BFQ computes the duration of the weight raising for ++ * interactive applications automatically, using the following formula: ++ * duration = (R / r) * T, where r is the peak rate of the device, and ++ * R and T are two reference parameters. ++ * In particular, R is the peak rate of the reference device (see below), ++ * and T is a reference time: given the systems that are likely to be ++ * installed on the reference device according to its speed class, T is ++ * about the maximum time needed, under BFQ and while reading two files in ++ * parallel, to load typical large applications on these systems. ++ * In practice, the slower/faster the device at hand is, the more/less it ++ * takes to load applications with respect to the reference device. ++ * Accordingly, the longer/shorter BFQ grants weight raising to interactive ++ * applications. ++ * ++ * BFQ uses four different reference pairs (R, T), depending on: ++ * . whether the device is rotational or non-rotational; ++ * . whether the device is slow, such as old or portable HDDs, as well as ++ * SD cards, or fast, such as newer HDDs and SSDs. ++ * ++ * The device's speed class is dynamically (re)detected in ++ * bfq_update_peak_rate() every time the estimated peak rate is updated. ++ * ++ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] ++ * are the reference values for a slow/fast rotational device, whereas ++ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for ++ * a slow/fast non-rotational device. Finally, device_speed_thresh are the ++ * thresholds used to switch between speed classes. ++ * Both the reference peak rates and the thresholds are measured in ++ * sectors/usec, left-shifted by BFQ_RATE_SHIFT. 
++ */ ++static int R_slow[2] = {1536, 10752}; ++static int R_fast[2] = {17415, 34791}; ++/* ++ * To improve readability, a conversion function is used to initialize the ++ * following arrays, which entails that they can be initialized only in a ++ * function. + */ -+#define T_rot (msecs_to_jiffies(5500)) -+#define T_nonrot (msecs_to_jiffies(2000)) -+/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */ -+#define R_rot 17415 -+#define R_nonrot 34791 ++static int T_slow[2]; ++static int T_fast[2]; ++static int device_speed_thresh[2]; + +#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) @@ -1336,6 +1401,125 @@ index 0000000..b230927 + bfqq->pos_root = NULL; +} + ++/* ++ * Tell whether there are active queues or groups with differentiated weights. ++ */ ++static inline bool bfq_differentiated_weights(struct bfq_data *bfqd) ++{ ++ BUG_ON(!bfqd->hw_tag); ++ /* ++ * For weights to differ, at least one of the trees must contain ++ * at least two nodes. ++ */ ++ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && ++ (bfqd->queue_weights_tree.rb_node->rb_left || ++ bfqd->queue_weights_tree.rb_node->rb_right) ++#ifdef CONFIG_CGROUP_BFQIO ++ ) || ++ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && ++ (bfqd->group_weights_tree.rb_node->rb_left || ++ bfqd->group_weights_tree.rb_node->rb_right) ++#endif ++ ); ++} ++ ++/* ++ * If the weight-counter tree passed as input contains no counter for ++ * the weight of the input entity, then add that counter; otherwise just ++ * increment the existing counter. ++ * ++ * Note that weight-counter trees contain few nodes in mostly symmetric ++ * scenarios. For example, if all queues have the same weight, then the ++ * weight-counter tree for the queues may contain at most one node. ++ * This holds even if low_latency is on, because weight-raised queues ++ * are not inserted in the tree. ++ * In most scenarios, the rate at which nodes are created/destroyed ++ * should be low too. ++ */ ++static void bfq_weights_tree_add(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root) ++{ ++ struct rb_node **new = &(root->rb_node), *parent = NULL; ++ ++ /* ++ * Do not insert if: ++ * - the device does not support queueing; ++ * - the entity is already associated with a counter, which happens if: ++ * 1) the entity is associated with a queue, 2) a request arrival ++ * has caused the queue to become both non-weight-raised, and hence ++ * change its weight, and backlogged; in this respect, each ++ * of the two events causes an invocation of this function, ++ * 3) this is the invocation of this function caused by the second ++ * event. This second invocation is actually useless, and we handle ++ * this fact by exiting immediately. More efficient or clearer ++ * solutions might possibly be adopted. 
++ */ ++ if (!bfqd->hw_tag || entity->weight_counter) ++ return; ++ ++ while (*new) { ++ struct bfq_weight_counter *__counter = container_of(*new, ++ struct bfq_weight_counter, ++ weights_node); ++ parent = *new; ++ ++ if (entity->weight == __counter->weight) { ++ entity->weight_counter = __counter; ++ goto inc_counter; ++ } ++ if (entity->weight < __counter->weight) ++ new = &((*new)->rb_left); ++ else ++ new = &((*new)->rb_right); ++ } ++ ++ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), ++ GFP_ATOMIC); ++ entity->weight_counter->weight = entity->weight; ++ rb_link_node(&entity->weight_counter->weights_node, parent, new); ++ rb_insert_color(&entity->weight_counter->weights_node, root); ++ ++inc_counter: ++ entity->weight_counter->num_active++; ++} ++ ++/* ++ * Decrement the weight counter associated with the entity, and, if the ++ * counter reaches 0, remove the counter from the tree. ++ * See the comments to the function bfq_weights_tree_add() for considerations ++ * about overhead. ++ */ ++static void bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root) ++{ ++ /* ++ * Check whether the entity is actually associated with a counter. ++ * In fact, the device may not be considered NCQ-capable for a while, ++ * which implies that no insertion in the weight trees is performed, ++ * after which the device may start to be deemed NCQ-capable, and hence ++ * this function may start to be invoked. This may cause the function ++ * to be invoked for entities that are not associated with any counter. ++ */ ++ if (!entity->weight_counter) ++ return; ++ ++ BUG_ON(RB_EMPTY_ROOT(root)); ++ BUG_ON(entity->weight_counter->weight != entity->weight); ++ ++ BUG_ON(!entity->weight_counter->num_active); ++ entity->weight_counter->num_active--; ++ if (entity->weight_counter->num_active > 0) ++ goto reset_entity_pointer; ++ ++ rb_erase(&entity->weight_counter->weights_node, root); ++ kfree(entity->weight_counter); ++ ++reset_entity_pointer: ++ entity->weight_counter = NULL; ++} ++ +static struct request *bfq_find_next_rq(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *last) @@ -1360,37 +1544,12 @@ index 0000000..b230927 + return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); +} + -+static void bfq_del_rq_rb(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); -+ -+ BUG_ON(bfqq->queued[sync] == 0); -+ bfqq->queued[sync]--; -+ bfqd->queued--; -+ -+ elv_rb_del(&bfqq->sort_list, rq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue) -+ bfq_del_bfqq_busy(bfqd, bfqq, 1); -+ /* -+ * Remove queue from request-position tree as it is empty. -+ */ -+ if (bfqq->pos_root != NULL) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ } -+} -+ +/* see the definition of bfq_async_charge_factor for details */ +static inline unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) +{ + return blk_rq_sectors(rq) * -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) * ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * + bfq_async_charge_factor)); +} + @@ -1416,7 +1575,7 @@ index 0000000..b230927 + if (next_rq == NULL) + return; + -+ if (bfqq == bfqd->active_queue) ++ if (bfqq == bfqd->in_service_queue) + /* + * In order not to break guarantees, budgets cannot be + * changed after an entity has been selected. 
@@ -1424,21 +1583,24 @@ index 0000000..b230927 + return; + + BUG_ON(entity->tree != &st->active); -+ BUG_ON(entity == entity->sched_data->active_entity); ++ BUG_ON(entity == entity->sched_data->in_service_entity); + + new_budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); -+ entity->budget = new_budget; -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); -+ bfq_activate_bfqq(bfqd, bfqq); ++ if (entity->budget != new_budget) { ++ entity->budget = new_budget; ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", ++ new_budget); ++ bfq_activate_bfqq(bfqd, bfqq); ++ } +} + -+static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) ++static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + -+ if (bfqd->bfq_raising_max_time > 0) -+ return bfqd->bfq_raising_max_time; ++ if (bfqd->bfq_wr_max_time > 0) ++ return bfqd->bfq_wr_max_time; + + dur = bfqd->RT_prod; + do_div(dur, bfqd->peak_rate); @@ -1446,17 +1608,230 @@ index 0000000..b230927 + return dur; +} + -+static void bfq_add_rq_rb(struct request *rq) ++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ ++static inline void bfq_reset_burst_list(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *item; ++ struct hlist_node *n; ++ ++ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) ++ hlist_del_init(&item->burst_list_node); ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++ bfqd->burst_size = 1; ++} ++ ++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ ++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ /* Increment burst size to take into account also bfqq */ ++ bfqd->burst_size++; ++ ++ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { ++ struct bfq_queue *pos, *bfqq_item; ++ struct hlist_node *n; ++ ++ /* ++ * Enough queues have been activated shortly after each ++ * other to consider this burst as large. ++ */ ++ bfqd->large_burst = true; ++ ++ /* ++ * We can now mark all queues in the burst list as ++ * belonging to a large burst. ++ */ ++ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, ++ burst_list_node) ++ bfq_mark_bfqq_in_large_burst(bfqq_item); ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ ++ /* ++ * From now on, and until the current burst finishes, any ++ * new queue being activated shortly after the last queue ++ * was inserted in the burst can be immediately marked as ++ * belonging to a large burst. So the burst list is not ++ * needed any more. Remove it. ++ */ ++ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, ++ burst_list_node) ++ hlist_del_init(&pos->burst_list_node); ++ } else /* burst not yet large: add bfqq to the burst list */ ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++} ++ ++/* ++ * If many queues happen to become active shortly after each other, then, ++ * to help the processes associated to these queues get their job done as ++ * soon as possible, it is usually better to not grant either weight-raising ++ * or device idling to these queues. In this comment we describe, firstly, ++ * the reasons why this fact holds, and, secondly, the next function, which ++ * implements the main steps needed to properly mark these queues so that ++ * they can then be treated in a different way. 
++ * ++ * As for the terminology, we say that a queue becomes active, i.e., ++ * switches from idle to backlogged, either when it is created (as a ++ * consequence of the arrival of an I/O request), or, if already existing, ++ * when a new request for the queue arrives while the queue is idle. ++ * Bursts of activations, i.e., activations of different queues occurring ++ * shortly after each other, are typically caused by services or applications ++ * that spawn or reactivate many parallel threads/processes. Examples are ++ * systemd during boot or git grep. ++ * ++ * These services or applications benefit mostly from a high throughput: ++ * the quicker the requests of the activated queues are cumulatively served, ++ * the sooner the target job of these queues gets completed. As a consequence, ++ * weight-raising any of these queues, which also implies idling the device ++ * for it, is almost always counterproductive: in most cases it just lowers ++ * throughput. ++ * ++ * On the other hand, a burst of activations may be also caused by the start ++ * of an application that does not consist in a lot of parallel I/O-bound ++ * threads. In fact, with a complex application, the burst may be just a ++ * consequence of the fact that several processes need to be executed to ++ * start-up the application. To start an application as quickly as possible, ++ * the best thing to do is to privilege the I/O related to the application ++ * with respect to all other I/O. Therefore, the best strategy to start as ++ * quickly as possible an application that causes a burst of activations is ++ * to weight-raise all the queues activated during the burst. This is the ++ * exact opposite of the best strategy for the other type of bursts. ++ * ++ * In the end, to take the best action for each of the two cases, the two ++ * types of bursts need to be distinguished. Fortunately, this seems ++ * relatively easy to do, by looking at the sizes of the bursts. In ++ * particular, we found a threshold such that bursts with a larger size ++ * than that threshold are apparently caused only by services or commands ++ * such as systemd or git grep. For brevity, hereafter we call just 'large' ++ * these bursts. BFQ *does not* weight-raise queues whose activations occur ++ * in a large burst. In addition, for each of these queues BFQ performs or ++ * does not perform idling depending on which choice boosts the throughput ++ * most. The exact choice depends on the device and request pattern at ++ * hand. ++ * ++ * Turning back to the next function, it implements all the steps needed ++ * to detect the occurrence of a large burst and to properly mark all the ++ * queues belonging to it (so that they can then be treated in a different ++ * way). This goal is achieved by maintaining a special "burst list" that ++ * holds, temporarily, the queues that belong to the burst in progress. The ++ * list is then used to mark these queues as belonging to a large burst if ++ * the burst does become large. The main steps are the following. ++ * ++ * . when the very first queue is activated, the queue is inserted into the ++ * list (as it could be the first queue in a possible burst) ++ * ++ * . if the current burst has not yet become large, and a queue Q that does ++ * not yet belong to the burst is activated shortly after the last time ++ * at which a new queue entered the burst list, then the function appends ++ * Q to the burst list ++ * ++ * . 
if, as a consequence of the previous step, the burst size reaches ++ * the large-burst threshold, then ++ * ++ * . all the queues in the burst list are marked as belonging to a ++ * large burst ++ * ++ * . the burst list is deleted; in fact, the burst list already served ++ * its purpose (keeping temporarily track of the queues in a burst, ++ * so as to be able to mark them as belonging to a large burst in the ++ * previous sub-step), and now is not needed any more ++ * ++ * . the device enters a large-burst mode ++ * ++ * . if a queue Q that does not belong to the burst is activated while ++ * the device is in large-burst mode and shortly after the last time ++ * at which a queue either entered the burst list or was marked as ++ * belonging to the current large burst, then Q is immediately marked ++ * as belonging to a large burst. ++ * ++ * . if a queue Q that does not belong to the burst is activated a while ++ * later, i.e., not shortly after, than the last time at which a queue ++ * either entered the burst list or was marked as belonging to the ++ * current large burst, then the current burst is deemed as finished and: ++ * ++ * . the large-burst mode is reset if set ++ * ++ * . the burst list is emptied ++ * ++ * . Q is inserted in the burst list, as Q may be the first queue ++ * in a possible new burst (then the burst list contains just Q ++ * after this step). ++ */ ++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool idle_for_long_time) ++{ ++ /* ++ * If bfqq happened to be activated in a burst, but has been idle ++ * for at least as long as an interactive queue, then we assume ++ * that, in the overall I/O initiated in the burst, the I/O ++ * associated to bfqq is finished. So bfqq does not need to be ++ * treated as a queue belonging to a burst anymore. Accordingly, ++ * we reset bfqq's in_large_burst flag if set, and remove bfqq ++ * from the burst list if it's there. We do not decrement instead ++ * burst_size, because the fact that bfqq does not need to belong ++ * to the burst list any more does not invalidate the fact that ++ * bfqq may have been activated during the current burst. ++ */ ++ if (idle_for_long_time) { ++ hlist_del_init(&bfqq->burst_list_node); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ } ++ ++ /* ++ * If bfqq is already in the burst list or is part of a large ++ * burst, then there is nothing else to do. ++ */ ++ if (!hlist_unhashed(&bfqq->burst_list_node) || ++ bfq_bfqq_in_large_burst(bfqq)) ++ return; ++ ++ /* ++ * If bfqq's activation happens late enough, then the current ++ * burst is finished, and related data structures must be reset. ++ * ++ * In this respect, consider the special case where bfqq is the very ++ * first queue being activated. In this case, last_ins_in_burst is ++ * not yet significant when we get here. But it is easy to verify ++ * that, whether or not the following condition is true, bfqq will ++ * end up being inserted into the burst list. In particular the ++ * list will happen to contain only bfqq. And this is exactly what ++ * has to happen, as bfqq may be the first queue in a possible ++ * burst. ++ */ ++ if (time_is_before_jiffies(bfqd->last_ins_in_burst + ++ bfqd->bfq_burst_interval)) { ++ bfqd->large_burst = false; ++ bfq_reset_burst_list(bfqd, bfqq); ++ return; ++ } ++ ++ /* ++ * If we get here, then bfqq is being activated shortly after the ++ * last queue. So, if the current burst is also large, we can mark ++ * bfqq as belonging to this large burst immediately. 
++ */ ++ if (bfqd->large_burst) { ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ return; ++ } ++ ++ /* ++ * If we get here, then a large-burst state has not yet been ++ * reached, but bfqq is being activated shortly after the last ++ * queue. Then we add bfqq to the burst. ++ */ ++ bfq_add_to_burst(bfqd, bfqq); ++} ++ ++static void bfq_add_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_entity *entity = &bfqq->entity; + struct bfq_data *bfqd = bfqq->bfqd; + struct request *next_rq, *prev; -+ unsigned long old_raising_coeff = bfqq->raising_coeff; -+ int idle_for_long_time = bfqq->budget_timeout + -+ bfqd->bfq_raising_min_idle_time < jiffies; ++ unsigned long old_wr_coeff = bfqq->wr_coeff; ++ bool interactive = false; + -+ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq)); ++ bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); + bfqq->queued[rq_is_sync(rq)]++; + bfqd->queued++; + @@ -1477,83 +1852,165 @@ index 0000000..b230927 + bfq_rq_pos_tree_add(bfqd, bfqq); + + if (!bfq_bfqq_busy(bfqq)) { -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && -+ bfqq->soft_rt_next_start < jiffies; ++ bool soft_rt, ++ idle_for_long_time = time_is_before_jiffies( ++ bfqq->budget_timeout + ++ bfqd->bfq_wr_min_idle_time); ++ ++ if (bfq_bfqq_sync(bfqq)) { ++ bool already_in_burst = ++ !hlist_unhashed(&bfqq->burst_list_node) || ++ bfq_bfqq_in_large_burst(bfqq); ++ bfq_handle_burst(bfqd, bfqq, idle_for_long_time); ++ /* ++ * If bfqq was not already in the current burst, ++ * then, at this point, bfqq either has been ++ * added to the current burst or has caused the ++ * current burst to terminate. In particular, in ++ * the second case, bfqq has become the first ++ * queue in a possible new burst. ++ * In both cases last_ins_in_burst needs to be ++ * moved forward. ++ */ ++ if (!already_in_burst) ++ bfqd->last_ins_in_burst = jiffies; ++ } ++ ++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && ++ !bfq_bfqq_in_large_burst(bfqq) && ++ time_is_before_jiffies(bfqq->soft_rt_next_start); ++ interactive = !bfq_bfqq_in_large_burst(bfqq) && ++ idle_for_long_time; + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + -+ if (! 
bfqd->low_latency) ++ if (!bfq_bfqq_IO_bound(bfqq)) { ++ if (time_before(jiffies, ++ RQ_BIC(rq)->ttime.last_end_request + ++ bfqd->bfq_slice_idle)) { ++ bfqq->requests_within_timer++; ++ if (bfqq->requests_within_timer >= ++ bfqd->bfq_requests_within_timer) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ } else ++ bfqq->requests_within_timer = 0; ++ } ++ ++ if (!bfqd->low_latency) + goto add_bfqq_busy; + + /* + * If the queue is not being boosted and has been idle + * for enough time, start a weight-raising period + */ -+ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) { -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff; -+ if (idle_for_long_time) -+ bfqq->raising_cur_max_time = -+ bfq_wrais_duration(bfqd); ++ if (old_wr_coeff == 1 && (interactive || soft_rt)) { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ if (interactive) ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + else -+ bfqq->raising_cur_max_time = -+ bfqd->bfq_raising_rt_max_time; ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; + bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %llu msec," -+ "rais_max_time %u", -+ bfqq->last_rais_start_finish, -+ jiffies_to_msecs(bfqq-> -+ raising_cur_max_time)); -+ } else if (old_raising_coeff > 1) { -+ if (idle_for_long_time) -+ bfqq->raising_cur_max_time = -+ bfq_wrais_duration(bfqd); -+ else if (bfqq->raising_cur_max_time == -+ bfqd->bfq_raising_rt_max_time && -+ !soft_rt) { -+ bfqq->raising_coeff = 1; ++ "wrais starting at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } else if (old_wr_coeff > 1) { ++ if (interactive) ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ else if (bfq_bfqq_in_large_burst(bfqq) || ++ (bfqq->wr_cur_max_time == ++ bfqd->bfq_wr_rt_max_time && ++ !soft_rt)) { ++ bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %llu msec," -+ "rais_max_time %u", -+ bfqq->last_rais_start_finish, -+ jiffies_to_msecs(bfqq-> -+ raising_cur_max_time)); -+ } ++ "wrais ending at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq-> ++ wr_cur_max_time)); ++ } else if (time_before( ++ bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time, ++ jiffies + ++ bfqd->bfq_wr_rt_max_time) && ++ soft_rt) { ++ /* ++ * ++ * The remaining weight-raising time is lower ++ * than bfqd->bfq_wr_rt_max_time, which ++ * means that the application is enjoying ++ * weight raising either because deemed soft- ++ * rt in the near past, or because deemed ++ * interactive a long ago. In both cases, ++ * resetting now the current remaining weight- ++ * raising time for the application to the ++ * weight-raising duration for soft rt ++ * applications would not cause any latency ++ * increase for the application (as the new ++ * duration would be higher than the remaining ++ * time). ++ * ++ * In addition, the application is now meeting ++ * the requirements for being deemed soft rt. ++ * In the end we can correctly and safely ++ * (re)charge the weight-raising duration for ++ * the application with the weight-raising ++ * duration for soft rt applications. 
++ * ++ * In particular, doing this recharge now, i.e., ++ * before the weight-raising period for the ++ * application finishes, reduces the probability ++ * of the following negative scenario: ++ * 1) the weight of a soft rt application is ++ * raised at startup (as for any newly ++ * created application), ++ * 2) since the application is not interactive, ++ * at a certain time weight-raising is ++ * stopped for the application, ++ * 3) at that time the application happens to ++ * still have pending requests, and hence ++ * is destined to not have a chance to be ++ * deemed soft rt before these requests are ++ * completed (see the comments to the ++ * function bfq_bfqq_softrt_next_start() ++ * for details on soft rt detection), ++ * 4) these pending requests experience a high ++ * latency because the application is not ++ * weight-raised while they are pending. ++ */ ++ bfqq->last_wr_start_finish = jiffies; ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ } + } -+ if (old_raising_coeff != bfqq->raising_coeff) ++ if (old_wr_coeff != bfqq->wr_coeff) + entity->ioprio_changed = 1; +add_bfqq_busy: ++ bfqq->last_idle_bklogged = jiffies; ++ bfqq->service_from_backlogged = 0; ++ bfq_clear_bfqq_softrt_update(bfqq); + bfq_add_bfqq_busy(bfqd, bfqq); -+ } else { -+ if(bfqd->low_latency && old_raising_coeff == 1 && -+ !rq_is_sync(rq) && -+ bfqq->last_rais_start_finish + -+ bfqd->bfq_raising_min_inter_arr_async < jiffies) { -+ bfqq->raising_coeff = bfqd->bfq_raising_coeff; -+ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd); -+ ++ } else { ++ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && ++ time_is_before_jiffies( ++ bfqq->last_wr_start_finish + ++ bfqd->bfq_wr_min_inter_arr_async)) { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ ++ bfqd->wr_busy_queues++; + entity->ioprio_changed = 1; + bfq_log_bfqq(bfqd, bfqq, -+ "non-idle wrais starting at %llu msec," -+ "rais_max_time %u", -+ bfqq->last_rais_start_finish, -+ jiffies_to_msecs(bfqq-> -+ raising_cur_max_time)); -+ } -+ bfq_updated_next_req(bfqd, bfqq); ++ "non-idle wrais starting at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ if (prev != bfqq->next_rq) ++ bfq_updated_next_req(bfqd, bfqq); + } + -+ if(bfqd->low_latency && -+ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 || -+ idle_for_long_time)) -+ bfqq->last_rais_start_finish = jiffies; -+} -+ -+static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq) -+{ -+ elv_rb_del(&bfqq->sort_list, rq); -+ bfqq->queued[rq_is_sync(rq)]--; -+ bfqq->bfqd->queued--; -+ bfq_add_rq_rb(rq); ++ if (bfqd->low_latency && ++ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) ++ bfqq->last_wr_start_finish = jiffies; +} + +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, @@ -1568,11 +2025,8 @@ index 0000000..b230927 + return NULL; + + bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); -+ if (bfqq != NULL) { -+ sector_t sector = bio->bi_sector + bio_sectors(bio); -+ -+ return elv_rb_find(&bfqq->sort_list, sector); -+ } ++ if (bfqq != NULL) ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); + + return NULL; +} @@ -1587,11 +2041,12 @@ index 0000000..b230927 + (long long unsigned)bfqd->last_position); +} + -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) ++static inline void bfq_deactivate_request(struct request_queue *q, ++ struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + -+ 
WARN_ON(bfqd->rq_in_driver == 0); ++ BUG_ON(bfqd->rq_in_driver == 0); + bfqd->rq_in_driver--; +} + @@ -1599,6 +2054,7 @@ index 0000000..b230927 +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); + + if (bfqq->next_rq == rq) { + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); @@ -1606,10 +2062,25 @@ index 0000000..b230927 + } + + list_del_init(&rq->queuelist); -+ bfq_del_rq_rb(rq); ++ BUG_ON(bfqq->queued[sync] == 0); ++ bfqq->queued[sync]--; ++ bfqd->queued--; ++ elv_rb_del(&bfqq->sort_list, rq); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) ++ bfq_del_bfqq_busy(bfqd, bfqq, 1); ++ /* ++ * Remove queue from request-position tree as it is empty. ++ */ ++ if (bfqq->pos_root != NULL) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ } + + if (rq->cmd_flags & REQ_META) { -+ WARN_ON(bfqq->meta_pending == 0); ++ BUG_ON(bfqq->meta_pending == 0); + bfqq->meta_pending--; + } +} @@ -1632,10 +2103,33 @@ index 0000000..b230927 +static void bfq_merged_request(struct request_queue *q, struct request *req, + int type) +{ -+ if (type == ELEVATOR_FRONT_MERGE) { ++ if (type == ELEVATOR_FRONT_MERGE && ++ rb_prev(&req->rb_node) && ++ blk_rq_pos(req) < ++ blk_rq_pos(container_of(rb_prev(&req->rb_node), ++ struct request, rb_node))) { + struct bfq_queue *bfqq = RQ_BFQQ(req); -+ -+ bfq_reposition_rq_rb(bfqq, req); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *prev, *next_rq; ++ ++ /* Reposition request in its sort_list */ ++ elv_rb_del(&bfqq->sort_list, req); ++ elv_rb_add(&bfqq->sort_list, req); ++ /* Choose next request to be served for bfqq */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, ++ bfqd->last_position); ++ BUG_ON(next_rq == NULL); ++ bfqq->next_rq = next_rq; ++ /* ++ * If next_rq changes, update both the queue's budget to ++ * fit the new request and the queue's position in its ++ * rq_pos_tree. 
++ */ ++ if (prev != bfqq->next_rq) { ++ bfq_updated_next_req(bfqd, bfqq); ++ bfq_rq_pos_tree_add(bfqd, bfqq); ++ } + } +} + @@ -1660,39 +2154,41 @@ index 0000000..b230927 +} + +/* Must be called with bfqq != NULL */ -+static inline void bfq_bfqq_end_raising(struct bfq_queue *bfqq) ++static inline void bfq_bfqq_end_wr(struct bfq_queue *bfqq) +{ + BUG_ON(bfqq == NULL); -+ bfqq->raising_coeff = 1; -+ bfqq->raising_cur_max_time = 0; ++ if (bfq_bfqq_busy(bfqq)) ++ bfqq->bfqd->wr_busy_queues--; ++ bfqq->wr_coeff = 1; ++ bfqq->wr_cur_max_time = 0; + /* Trigger a weight change on the next activation of the queue */ + bfqq->entity.ioprio_changed = 1; +} + -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg) ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + if (bfqg->async_bfqq[i][j] != NULL) -+ bfq_bfqq_end_raising(bfqg->async_bfqq[i][j]); ++ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); + if (bfqg->async_idle_bfqq != NULL) -+ bfq_bfqq_end_raising(bfqg->async_idle_bfqq); ++ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); +} + -+static void bfq_end_raising(struct bfq_data *bfqd) ++static void bfq_end_wr(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + + spin_lock_irq(bfqd->queue->queue_lock); + + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ bfq_bfqq_end_raising(bfqq); ++ bfq_bfqq_end_wr(bfqq); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) -+ bfq_bfqq_end_raising(bfqq); -+ bfq_end_raising_async(bfqd); ++ bfq_bfqq_end_wr(bfqq); ++ bfq_end_wr_async(bfqd); + + spin_unlock_irq(bfqd->queue->queue_lock); +} @@ -1723,8 +2219,8 @@ index 0000000..b230927 + return bfqq == RQ_BFQQ(rq); +} + -+static void __bfq_set_active_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) +{ + if (bfqq != NULL) { + bfq_mark_bfqq_must_alloc(bfqq); @@ -1733,25 +2229,26 @@ index 0000000..b230927 + + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + -+ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu", ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_in_service_queue, cur-budget = %lu", + bfqq->entity.budget); + } + -+ bfqd->active_queue = bfqq; ++ bfqd->in_service_queue = bfqq; +} + +/* -+ * Get and set a new active queue for service. ++ * Get and set a new queue for service. + */ -+static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) +{ + if (!bfqq) + bfqq = bfq_get_next_queue(bfqd); + else + bfq_get_next_queue_forced(bfqd, bfqq); + -+ __bfq_set_active_queue(bfqd, bfqq); ++ __bfq_set_in_service_queue(bfqd, bfqq); + return bfqq; +} + @@ -1794,8 +2291,8 @@ index 0000000..b230927 + + /* + * If the exact sector wasn't found, the parent of the NULL leaf -+ * will contain the closest sector (rq_pos_tree sorted by next_request -+ * position). ++ * will contain the closest sector (rq_pos_tree sorted by ++ * next_request position). + */ + __bfqq = rb_entry(parent, struct bfq_queue, pos_node); + if (bfq_rq_close(bfqd, __bfqq->next_rq)) @@ -1897,36 +2394,16 @@ index 0000000..b230927 + return bfqd->bfq_max_budget / 32; +} + -+/* -+ * Decides whether idling should be done for given device and -+ * given active queue. 
-+ */ -+static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd, -+ struct bfq_queue *active_bfqq) -+{ -+ if (active_bfqq == NULL) -+ return false; -+ /* -+ * If device is SSD it has no seek penalty, disable idling; but -+ * do so only if: -+ * - device does not support queuing, otherwise we still have -+ * a problem with sync vs async workloads; -+ * - the queue is not weight-raised, to preserve guarantees. -+ */ -+ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag && -+ active_bfqq->raising_coeff == 1); -+} -+ +static void bfq_arm_slice_timer(struct bfq_data *bfqd) +{ -+ struct bfq_queue *bfqq = bfqd->active_queue; ++ struct bfq_queue *bfqq = bfqd->in_service_queue; + struct bfq_io_cq *bic; + unsigned long sl; + -+ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); + -+ /* Tasks have exited, don't wait. */ -+ bic = bfqd->active_bic; ++ /* Processes have exited, don't wait. */ ++ bic = bfqd->in_service_bic; + if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0) + return; + @@ -1943,11 +2420,17 @@ index 0000000..b230927 + * BFQ_MIN_TT. This happened to help reduce latency. + */ + sl = bfqd->bfq_slice_idle; -+ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) && -+ bfqq->entity.service > bfq_max_budget(bfqd) / 8 && -+ bfqq->raising_coeff == 1) ++ /* ++ * Unless the queue is being weight-raised, grant only minimum idle ++ * time if the queue either has been seeky for long enough or has ++ * already proved to be constantly seeky. ++ */ ++ if (bfq_sample_valid(bfqq->seek_samples) && ++ ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > ++ bfq_max_budget(bfqq->bfqd) / 8) || ++ bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1) + sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); -+ else if (bfqq->raising_coeff > 1) ++ else if (bfqq->wr_coeff > 1) + sl = sl * 3; + bfqd->last_idling_start = ktime_get(); + mod_timer(&bfqd->idle_slice_timer, jiffies + sl); @@ -1956,15 +2439,15 @@ index 0000000..b230927 +} + +/* -+ * Set the maximum time for the active queue to consume its ++ * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the disk + * throughput (always guaranteed with a time slice scheme as in CFQ). + */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd) +{ -+ struct bfq_queue *bfqq = bfqd->active_queue; ++ struct bfq_queue *bfqq = bfqd->in_service_queue; + unsigned int timeout_coeff; -+ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time) ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) + timeout_coeff = 1; + else + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; @@ -1988,8 +2471,18 @@ index 0000000..b230927 + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq = RQ_BFQQ(rq); + -+ bfq_remove_request(rq); ++ /* ++ * For consistency, the next instruction should have been executed ++ * after removing the request from the queue and dispatching it. ++ * We execute instead this instruction before bfq_remove_request() ++ * (and hence introduce a temporary inconsistency), for efficiency. ++ * In fact, in a forced_dispatch, this prevents two counters related ++ * to bfqq->dispatched to risk to be uselessly decremented if bfqq ++ * is not in service, and then to be incremented again after ++ * incrementing bfqq->dispatched. 
++ */ + bfqq->dispatched++; ++ bfq_remove_request(rq); + elv_dispatch_sort(q, rq); + + if (bfq_bfqq_sync(bfqq)) @@ -2019,9 +2512,7 @@ index 0000000..b230927 + return rq; +} + -+/* -+ * Must be called with the queue_lock held. -+ */ ++/* Must be called with the queue_lock held. */ +static int bfqq_process_refs(struct bfq_queue *bfqq) +{ + int process_refs, io_refs; @@ -2084,9 +2575,9 @@ index 0000000..b230927 + +static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ -+ BUG_ON(bfqq != bfqd->active_queue); ++ BUG_ON(bfqq != bfqd->in_service_queue); + -+ __bfq_bfqd_reset_active(bfqd); ++ __bfq_bfqd_reset_in_service(bfqd); + + /* + * If this bfqq is shared between multiple processes, check @@ -2099,11 +2590,11 @@ index 0000000..b230927 + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + /* -+ * overloading budget_timeout field to store when -+ * the queue remains with no backlog, used by -+ * the weight-raising mechanism ++ * Overloading budget_timeout field to store the time ++ * at which the queue remains with no backlog; used by ++ * the weight-raising mechanism. + */ -+ bfqq->budget_timeout = jiffies ; ++ bfqq->budget_timeout = jiffies; + bfq_del_bfqq_busy(bfqd, bfqq, 1); + } else { + bfq_activate_bfqq(bfqd, bfqq); @@ -2133,14 +2624,14 @@ index 0000000..b230927 + budget = bfqq->max_budget; + min_budget = bfq_min_budget(bfqd); + -+ BUG_ON(bfqq != bfqd->active_queue); ++ BUG_ON(bfqq != bfqd->in_service_queue); + + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", + budget, bfq_min_budget(bfqd)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue)); ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); + + if (bfq_bfqq_sync(bfqq)) { + switch (reason) { @@ -2151,7 +2642,7 @@ index 0000000..b230927 + case BFQ_BFQQ_TOO_IDLE: + /* + * This is the only case where we may reduce -+ * the budget: if there is no requets of the ++ * the budget: if there is no request of the + * process still waiting for completion, then + * we assume (tentatively) that the timer has + * expired because the batch of requests of @@ -2167,13 +2658,13 @@ index 0000000..b230927 + * requests, then the process may have not yet + * issued its next request just because it is + * still waiting for the completion of some of -+ * the still oustanding ones. So in this ++ * the still outstanding ones. So in this + * subcase we do not reduce its budget, on the + * contrary we increase it to possibly boost + * the throughput, as discussed in the + * comments to the BUDGET_TIMEOUT case. 
+ */ -+ if (bfqq->dispatched > 0) /* still oustanding reqs */ ++ if (bfqq->dispatched > 0) /* still outstanding reqs */ + budget = min(budget * 2, bfqd->bfq_max_budget); + else { + if (budget > 5 * min_budget) @@ -2329,11 +2820,26 @@ index 0000000..b230927 + bfqd->peak_rate_samples++; + + if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && -+ update && bfqd->bfq_user_max_budget == 0) { -+ bfqd->bfq_max_budget = -+ bfq_calc_max_budget(bfqd->peak_rate, timeout); -+ bfq_log(bfqd, "new max_budget=%lu", -+ bfqd->bfq_max_budget); ++ update) { ++ int dev_type = blk_queue_nonrot(bfqd->queue); ++ if (bfqd->bfq_user_max_budget == 0) { ++ bfqd->bfq_max_budget = ++ bfq_calc_max_budget(bfqd->peak_rate, ++ timeout); ++ bfq_log(bfqd, "new max_budget=%lu", ++ bfqd->bfq_max_budget); ++ } ++ if (bfqd->device_speed == BFQ_BFQD_FAST && ++ bfqd->peak_rate < device_speed_thresh[dev_type]) { ++ bfqd->device_speed = BFQ_BFQD_SLOW; ++ bfqd->RT_prod = R_slow[dev_type] * ++ T_slow[dev_type]; ++ } else if (bfqd->device_speed == BFQ_BFQD_SLOW && ++ bfqd->peak_rate > device_speed_thresh[dev_type]) { ++ bfqd->device_speed = BFQ_BFQD_FAST; ++ bfqd->RT_prod = R_fast[dev_type] * ++ T_fast[dev_type]; ++ } + } + } + @@ -2368,6 +2874,77 @@ index 0000000..b230927 + return expected > (4 * bfqq->entity.budget) / 3; +} + ++/* ++ * To be deemed as soft real-time, an application must meet two ++ * requirements. First, the application must not require an average ++ * bandwidth higher than the approximate bandwidth required to playback or ++ * record a compressed high-definition video. ++ * The next function is invoked on the completion of the last request of a ++ * batch, to compute the next-start time instant, soft_rt_next_start, such ++ * that, if the next request of the application does not arrive before ++ * soft_rt_next_start, then the above requirement on the bandwidth is met. ++ * ++ * The second requirement is that the request pattern of the application is ++ * isochronous, i.e., that, after issuing a request or a batch of requests, ++ * the application stops issuing new requests until all its pending requests ++ * have been completed. After that, the application may issue a new batch, ++ * and so on. ++ * For this reason the next function is invoked to compute ++ * soft_rt_next_start only for applications that meet this requirement, ++ * whereas soft_rt_next_start is set to infinity for applications that do ++ * not. ++ * ++ * Unfortunately, even a greedy application may happen to behave in an ++ * isochronous way if the CPU load is high. In fact, the application may ++ * stop issuing requests while the CPUs are busy serving other processes, ++ * then restart, then stop again for a while, and so on. In addition, if ++ * the disk achieves a low enough throughput with the request pattern ++ * issued by the application (e.g., because the request pattern is random ++ * and/or the device is slow), then the application may meet the above ++ * bandwidth requirement too. To prevent such a greedy application to be ++ * deemed as soft real-time, a further rule is used in the computation of ++ * soft_rt_next_start: soft_rt_next_start must be higher than the current ++ * time plus the maximum time for which the arrival of a request is waited ++ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. 
++ * This filters out greedy applications, as the latter issue instead their ++ * next request as soon as possible after the last one has been completed ++ * (in contrast, when a batch of requests is completed, a soft real-time ++ * application spends some time processing data). ++ * ++ * Unfortunately, the last filter may easily generate false positives if ++ * only bfqd->bfq_slice_idle is used as a reference time interval and one ++ * or both the following cases occur: ++ * 1) HZ is so low that the duration of a jiffy is comparable to or higher ++ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with ++ * HZ=100. ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing ++ * for a while, then suddenly 'jump' by several units to recover the lost ++ * increments. This seems to happen, e.g., inside virtual machines. ++ * To address this issue, we do not use as a reference time interval just ++ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In ++ * particular we add the minimum number of jiffies for which the filter ++ * seems to be quite precise also in embedded systems and KVM/QEMU virtual ++ * machines. ++ */ ++static inline unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ return max(bfqq->last_idle_bklogged + ++ HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies + bfqq->bfqd->bfq_slice_idle + 4); ++} ++ ++/* ++ * Return the largest-possible time instant such that, for as long as possible, ++ * the current time will be lower than this time instant according to the macro ++ * time_is_before_jiffies(). ++ */ ++static inline unsigned long bfq_infinity_from_now(unsigned long now) ++{ ++ return now + ULONG_MAX / 2; ++} ++ +/** + * bfq_bfqq_expire - expire a queue. + * @bfqd: device owning the queue. @@ -2404,7 +2981,7 @@ index 0000000..b230927 + enum bfqq_expiration reason) +{ + int slow; -+ BUG_ON(bfqq != bfqd->active_queue); ++ BUG_ON(bfqq != bfqd->in_service_queue); + + /* Update disk peak rate for autotuning and check whether the + * process is slow (see bfq_update_peak_rate). @@ -2415,7 +2992,7 @@ index 0000000..b230927 + * As above explained, 'punish' slow (i.e., seeky), timed-out + * and async queues, to favor sequential sync workloads. + * -+ * Processes doing IO in the slower disk zones will tend to be ++ * Processes doing I/O in the slower disk zones will tend to be + * slow(er) even if not seeky. Hence, since the estimated peak + * rate is actually an average over the disk surface, these + * processes may timeout just for bad luck. 
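Written out as arithmetic, bfq_bfqq_softrt_next_start() above is just the maximum of a bandwidth-based deadline and an isochrony deadline. The sketch below (plain C, not part of the patch) works in milliseconds instead of jiffies and uses invented numbers; only the 7000 default for bfq_wr_max_softrt_rate comes from the patch (it is set in bfq_init_queue() further down).

/* The two terms combined by bfq_bfqq_softrt_next_start(), with made-up
 * inputs. The larger of the two becomes the earliest instant at which the
 * queue may again be considered soft real-time. */
#include <stdio.h>

int main(void)
{
	unsigned long last_idle_bklogged_ms = 10000; /* last time bfqq emptied while backlogged */
	unsigned long service_from_backlogged = 700; /* service units accumulated since then */
	unsigned long wr_max_softrt_rate = 7000;     /* default soft-rt rate */
	unsigned long now_ms = 10050;
	unsigned long slice_idle_ms = 8;             /* placeholder for bfqd->bfq_slice_idle */
	unsigned long slack_ms = 4;                  /* the "+ 4" jiffies, expressed in ms here */

	/* Bandwidth term: when the average rate since last_idle_bklogged
	 * falls back below wr_max_softrt_rate. */
	unsigned long bw_term = last_idle_bklogged_ms +
		1000 * service_from_backlogged / wr_max_softrt_rate;

	/* Isochrony term: now, plus the idling window, plus a little slack
	 * against coarse or jumpy jiffies. */
	unsigned long iso_term = now_ms + slice_idle_ms + slack_ms;

	printf("bw=%lu iso=%lu -> soft_rt_next_start=%lu ms\n",
	       bw_term, iso_term, bw_term > iso_term ? bw_term : iso_term);
	return 0;
}

Here the bandwidth term (10100) wins over the isochrony term (10062), so a request arriving before t = 10100 ms would disqualify the queue from soft real-time treatment.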
To avoid punishing @@ -2426,23 +3003,71 @@ index 0000000..b230927 + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) + bfq_bfqq_charge_full_budget(bfqq); + -+ if (bfqd->low_latency && bfqq->raising_coeff == 1) -+ bfqq->last_rais_start_finish = jiffies; ++ bfqq->service_from_backlogged += bfqq->entity.service; + -+ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) { -+ if(reason != BFQ_BFQQ_BUDGET_TIMEOUT) -+ bfqq->soft_rt_next_start = -+ jiffies + -+ HZ * bfqq->entity.service / -+ bfqd->bfq_raising_max_softrt_rate; -+ else -+ bfqq->soft_rt_next_start = -1; /* infinity */ ++ if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ !bfq_bfqq_constantly_seeky(bfqq)) { ++ bfq_mark_bfqq_constantly_seeky(bfqq); ++ if (!blk_queue_nonrot(bfqd->queue)) ++ bfqd->const_seeky_busy_in_flight_queues++; ++ } ++ ++ if (reason == BFQ_BFQQ_TOO_IDLE && ++ bfqq->entity.service <= 2 * bfqq->entity.budget / 10 ) ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ if (bfqd->low_latency && bfqq->wr_coeff == 1) ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ /* ++ * If we get here, and there are no outstanding requests, ++ * then the request pattern is isochronous (see the comments ++ * to the function bfq_bfqq_softrt_next_start()). Hence we ++ * can compute soft_rt_next_start. If, instead, the queue ++ * still has outstanding requests, then we have to wait ++ * for the completion of all the outstanding requests to ++ * discover whether the request pattern is actually ++ * isochronous. ++ */ ++ if (bfqq->dispatched == 0) ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ else { ++ /* ++ * The application is still waiting for the ++ * completion of one or more requests: ++ * prevent it from possibly being incorrectly ++ * deemed as soft real-time by setting its ++ * soft_rt_next_start to infinity. In fact, ++ * without this assignment, the application ++ * would be incorrectly deemed as soft ++ * real-time if: ++ * 1) it issued a new request before the ++ * completion of all its in-flight ++ * requests, and ++ * 2) at that time, its soft_rt_next_start ++ * happened to be in the past. ++ */ ++ bfqq->soft_rt_next_start = ++ bfq_infinity_from_now(jiffies); ++ /* ++ * Schedule an update of soft_rt_next_start to when ++ * the task may be discovered to be isochronous. ++ */ ++ bfq_mark_bfqq_softrt_update(bfqq); ++ } + } ++ + bfq_log_bfqq(bfqd, bfqq, -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, -+ bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, ++ slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); + -+ /* Increase, decrease or leave budget unchanged according to reason */ ++ /* ++ * Increase, decrease or leave budget unchanged according to ++ * reason. 
++ */ + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + __bfq_bfqq_expire(bfqd, bfqq); +} @@ -2454,18 +3079,15 @@ index 0000000..b230927 + */ +static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +{ -+ if (bfq_bfqq_budget_new(bfqq)) -+ return 0; -+ -+ if (time_before(jiffies, bfqq->budget_timeout)) ++ if (bfq_bfqq_budget_new(bfqq) || ++ time_before(jiffies, bfqq->budget_timeout)) + return 0; -+ + return 1; +} + +/* + * If we expire a queue that is waiting for the arrival of a new -+ * request, we may prevent the fictitious timestamp backshifting that ++ * request, we may prevent the fictitious timestamp back-shifting that + * allows the guarantees of the queue to be preserved (see [1] for + * this tricky aspect). Hence we return true only if this condition + * does not hold, or if the queue is slow enough to deserve only to be @@ -2474,7 +3096,7 @@ index 0000000..b230927 +static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "may_budget_timeout: wr %d left %d timeout %d", ++ "may_budget_timeout: wait_request %d left %d timeout %d", + bfq_bfqq_wait_request(bfqq), + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, + bfq_bfqq_budget_timeout(bfqq)); @@ -2486,37 +3108,194 @@ index 0000000..b230927 +} + +/* -+ * If the active queue is empty, but it is sync and either of the following -+ * conditions holds, then: 1) the queue must remain active and cannot be -+ * expired, and 2) the disk must be idled to wait for the possible arrival -+ * of a new request for the queue. The conditions are: -+ * - the device is rotational and not performing NCQ, and the queue has its -+ * idle window set (in this case, waiting for a new request for the queue -+ * is likely to boost the disk throughput); -+ * - the queue is weight-raised (waiting for the request is necessary for -+ * providing the queue with fairness and latency guarantees). ++ * Device idling is allowed only for the queues for which this function ++ * returns true. For this reason, the return value of this function plays a ++ * critical role for both throughput boosting and service guarantees. The ++ * return value is computed through a logical expression. In this rather ++ * long comment, we try to briefly describe all the details and motivations ++ * behind the components of this logical expression. ++ * ++ * First, the expression is false if bfqq is not sync, or if: bfqq happened ++ * to become active during a large burst of queue activations, and the ++ * pattern of requests bfqq contains boosts the throughput if bfqq is ++ * expired. In fact, queues that became active during a large burst benefit ++ * only from throughput, as discussed in the comments to bfq_handle_burst. ++ * In this respect, expiring bfqq certainly boosts the throughput on NCQ- ++ * capable flash-based devices, whereas, on rotational devices, it boosts ++ * the throughput only if bfqq contains random requests. ++ * ++ * On the opposite end, if (a) bfqq is sync, (b) the above burst-related ++ * condition does not hold, and (c) bfqq is being weight-raised, then the ++ * expression always evaluates to true, as device idling is instrumental ++ * for preserving low-latency guarantees (see [1]). If, instead, conditions ++ * (a) and (b) do hold, but (c) does not, then the expression evaluates to ++ * true only if: (1) bfqq is I/O-bound and has a non-null idle window, and ++ * (2) at least one of the following two conditions holds. 
++ * The first condition is that the device is not performing NCQ, because ++ * idling the device most certainly boosts the throughput if this condition ++ * holds and bfqq is I/O-bound and has been granted a non-null idle window. ++ * The second compound condition is made of the logical AND of two components. ++ * ++ * The first component is true only if there is no weight-raised busy ++ * queue. This guarantees that the device is not idled for a sync non- ++ * weight-raised queue when there are busy weight-raised queues. The former ++ * is then expired immediately if empty. Combined with the timestamping ++ * rules of BFQ (see [1] for details), this causes sync non-weight-raised ++ * queues to get a lower number of requests served, and hence to ask for a ++ * lower number of requests from the request pool, before the busy weight- ++ * raised queues get served again. ++ * ++ * This is beneficial for the processes associated with weight-raised ++ * queues, when the request pool is saturated (e.g., in the presence of ++ * write hogs). In fact, if the processes associated with the other queues ++ * ask for requests at a lower rate, then weight-raised processes have a ++ * higher probability to get a request from the pool immediately (or at ++ * least soon) when they need one. Hence they have a higher probability to ++ * actually get a fraction of the disk throughput proportional to their ++ * high weight. This is especially true with NCQ-capable drives, which ++ * enqueue several requests in advance and further reorder internally- ++ * queued requests. ++ * ++ * In the end, mistreating non-weight-raised queues when there are busy ++ * weight-raised queues seems to mitigate starvation problems in the ++ * presence of heavy write workloads and NCQ, and hence to guarantee a ++ * higher application and system responsiveness in these hostile scenarios. ++ * ++ * If the first component of the compound condition is instead true, i.e., ++ * there is no weight-raised busy queue, then the second component of the ++ * compound condition takes into account service-guarantee and throughput ++ * issues related to NCQ (recall that the compound condition is evaluated ++ * only if the device is detected as supporting NCQ). ++ * ++ * As for service guarantees, allowing the drive to enqueue more than one ++ * request at a time, and hence delegating de facto final scheduling ++ * decisions to the drive's internal scheduler, causes loss of control on ++ * the actual request service order. In this respect, when the drive is ++ * allowed to enqueue more than one request at a time, the service ++ * distribution enforced by the drive's internal scheduler is likely to ++ * coincide with the desired device-throughput distribution only in the ++ * following, perfectly symmetric, scenario: ++ * 1) all active queues have the same weight, ++ * 2) all active groups at the same level in the groups tree have the same ++ * weight, ++ * 3) all active groups at the same level in the groups tree have the same ++ * number of children. ++ * ++ * Even in such a scenario, sequential I/O may still receive a preferential ++ * treatment, but this is not likely to be a big issue with flash-based ++ * devices, because of their non-dramatic loss of throughput with random ++ * I/O. Things do differ with HDDs, for which additional care is taken, as ++ * explained after completing the discussion for flash-based devices. 
++ * ++ * Unfortunately, keeping the necessary state for evaluating exactly the ++ * above symmetry conditions would be quite complex and time-consuming. ++ * Therefore BFQ evaluates instead the following stronger sub-conditions, ++ * for which it is much easier to maintain the needed state: ++ * 1) all active queues have the same weight, ++ * 2) all active groups have the same weight, ++ * 3) all active groups have at most one active child each. ++ * In particular, the last two conditions are always true if hierarchical ++ * support and the cgroups interface are not enabled, hence no state needs ++ * to be maintained in this case. ++ * ++ * According to the above considerations, the second component of the ++ * compound condition evaluates to true if any of the above symmetry ++ * sub-condition does not hold, or the device is not flash-based. Therefore, ++ * if also the first component is true, then idling is allowed for a sync ++ * queue. These are the only sub-conditions considered if the device is ++ * flash-based, as, for such a device, it is sensible to force idling only ++ * for service-guarantee issues. In fact, as for throughput, idling ++ * NCQ-capable flash-based devices would not boost the throughput even ++ * with sequential I/O; rather it would lower the throughput in proportion ++ * to how fast the device is. In the end, (only) if all the three ++ * sub-conditions hold and the device is flash-based, the compound ++ * condition evaluates to false and therefore no idling is performed. ++ * ++ * As already said, things change with a rotational device, where idling ++ * boosts the throughput with sequential I/O (even with NCQ). Hence, for ++ * such a device the second component of the compound condition evaluates ++ * to true also if the following additional sub-condition does not hold: ++ * the queue is constantly seeky. Unfortunately, this different behavior ++ * with respect to flash-based devices causes an additional asymmetry: if ++ * some sync queues enjoy idling and some other sync queues do not, then ++ * the latter get a low share of the device throughput, simply because the ++ * former get many requests served after being set as in service, whereas ++ * the latter do not. As a consequence, to guarantee the desired throughput ++ * distribution, on HDDs the compound expression evaluates to true (and ++ * hence device idling is performed) also if the following last symmetry ++ * condition does not hold: no other queue is benefiting from idling. Also ++ * this last condition is actually replaced with a simpler-to-maintain and ++ * stronger condition: there is no busy queue which is not constantly seeky ++ * (and hence may also benefit from idling). ++ * ++ * To sum up, when all the required symmetry and throughput-boosting ++ * sub-conditions hold, the second component of the compound condition ++ * evaluates to false, and hence no idling is performed. This helps to ++ * keep the drives' internal queues full on NCQ-capable devices, and hence ++ * to boost the throughput, without causing 'almost' any loss of service ++ * guarantees. The 'almost' follows from the fact that, if the internal ++ * queue of one such device is filled while all the sub-conditions hold, ++ * but at some point in time some sub-condition stops to hold, then it may ++ * become impossible to let requests be served in the new desired order ++ * until all the requests already queued in the device have been served. 
+ */ -+static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq, -+ int budg_timeout) ++static inline bool bfq_bfqq_must_not_expire(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; ++#ifdef CONFIG_CGROUP_BFQIO ++#define symmetric_scenario (!bfqd->active_numerous_groups && \ ++ !bfq_differentiated_weights(bfqd)) ++#else ++#define symmetric_scenario (!bfq_differentiated_weights(bfqd)) ++#endif ++#define cond_for_seeky_on_ncq_hdd (bfq_bfqq_constantly_seeky(bfqq) && \ ++ bfqd->busy_in_flight_queues == \ ++ bfqd->const_seeky_busy_in_flight_queues) + -+ return (bfq_bfqq_sync(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) && -+ bfqd->bfq_slice_idle != 0 && -+ ((bfq_bfqq_idle_window(bfqq) && !bfqd->hw_tag && -+ !blk_queue_nonrot(bfqd->queue)) -+ || bfqq->raising_coeff > 1) && -+ (bfqd->rq_in_driver == 0 || -+ budg_timeout || -+ bfqq->raising_coeff > 1) && -+ !bfq_close_cooperator(bfqd, bfqq) && -+ (!bfq_bfqq_coop(bfqq) || -+ !bfq_bfqq_some_coop_idle(bfqq)) && -+ !bfq_queue_nonrot_noidle(bfqd, bfqq)); ++#define cond_for_expiring_in_burst (bfq_bfqq_in_large_burst(bfqq) && \ ++ bfqd->hw_tag && \ ++ (blk_queue_nonrot(bfqd->queue) || \ ++ bfq_bfqq_constantly_seeky(bfqq))) ++ ++/* ++ * Condition for expiring a non-weight-raised queue (and hence not idling ++ * the device). ++ */ ++#define cond_for_expiring_non_wr (bfqd->hw_tag && \ ++ (bfqd->wr_busy_queues > 0 || \ ++ (symmetric_scenario && \ ++ (blk_queue_nonrot(bfqd->queue) || \ ++ cond_for_seeky_on_ncq_hdd)))) ++ ++ return bfq_bfqq_sync(bfqq) && ++ !cond_for_expiring_in_burst && ++ (bfqq->wr_coeff > 1 || ++ (bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_idle_window(bfqq) && ++ !cond_for_expiring_non_wr) ++ ); ++} ++ ++/* ++ * If the in-service queue is empty but sync, and the function ++ * bfq_bfqq_must_not_expire returns true, then: ++ * 1) the queue must remain in service and cannot be expired, and ++ * 2) the disk must be idled to wait for the possible arrival of a new ++ * request for the queue. ++ * See the comments to the function bfq_bfqq_must_not_expire for the reasons ++ * why performing device idling is the best choice to boost the throughput ++ * and preserve service guarantees when bfq_bfqq_must_not_expire itself ++ * returns true. ++ */ ++static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && ++ bfq_bfqq_must_not_expire(bfqq); +} + +/* -+ * Select a queue for service. If we have a current active queue, ++ * Select a queue for service. If we have a current queue in service, + * check whether to continue servicing it, or retrieve and set a new one. 
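The compound condition described at length above collapses into a fairly small boolean expression once every queue and device property is reduced to a flag. The skeleton below (plain C, not part of the patch, non-cgroup case only) mirrors the structure of bfq_bfqq_must_not_expire() and its helper macros; it is meant purely to make the decision tree visible.

/* Flattened decision structure of bfq_bfqq_must_not_expire(). All inputs
 * are plain flags; the kernel derives them from queue and device state. */
#include <stdbool.h>
#include <stdio.h>

struct idling_inputs {
	bool sync, in_large_burst, constantly_seeky, io_bound, idle_window;
	bool hw_tag;			/* NCQ detected */
	bool nonrot;			/* flash-based device */
	bool weight_raised;		/* wr_coeff > 1 */
	bool other_wr_busy_queues;	/* wr_busy_queues > 0 */
	bool symmetric_scenario;	/* no differentiated weights */
	bool all_busy_in_flight_seeky;	/* busy == const-seeky busy, in flight */
};

static bool must_not_expire(const struct idling_inputs *i)
{
	bool expiring_in_burst = i->in_large_burst && i->hw_tag &&
				 (i->nonrot || i->constantly_seeky);
	bool seeky_on_ncq_hdd = i->constantly_seeky &&
				i->all_busy_in_flight_seeky;
	bool expiring_non_wr = i->hw_tag &&
			       (i->other_wr_busy_queues ||
				(i->symmetric_scenario &&
				 (i->nonrot || seeky_on_ncq_hdd)));

	return i->sync && !expiring_in_burst &&
	       (i->weight_raised ||
		(i->io_bound && i->idle_window && !expiring_non_wr));
}

int main(void)
{
	/* Sync, I/O-bound queue on an NCQ flash device, symmetric scenario:
	 * idling buys nothing, so the queue may be expired (prints 0). */
	struct idling_inputs ssd = {
		.sync = true, .io_bound = true, .idle_window = true,
		.hw_tag = true, .nonrot = true, .symmetric_scenario = true,
	};
	printf("NCQ SSD, symmetric: must_not_expire=%d\n",
	       must_not_expire(&ssd));
	return 0;
}

Flip .nonrot to false while leaving .all_busy_in_flight_seeky false (an HDD where not every busy queue is constantly seeky) and the function returns true instead: the queue stays in service and the device is idled.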
+ */ +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) @@ -2524,13 +3303,12 @@ index 0000000..b230927 + struct bfq_queue *bfqq, *new_bfqq = NULL; + struct request *next_rq; + enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ int budg_timeout; + -+ bfqq = bfqd->active_queue; ++ bfqq = bfqd->in_service_queue; + if (bfqq == NULL) + goto new_queue; + -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue"); ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); + + /* + * If another queue has a request waiting within our mean seek @@ -2543,9 +3321,9 @@ index 0000000..b230927 + if (new_bfqq != NULL && bfqq->new_bfqq == NULL) + bfq_setup_merge(bfqq, new_bfqq); + -+ budg_timeout = bfq_may_expire_for_budg_timeout(bfqq); -+ if (budg_timeout && -+ !bfq_bfqq_must_idle(bfqq, budg_timeout)) ++ if (bfq_may_expire_for_budg_timeout(bfqq) && ++ !timer_pending(&bfqd->idle_slice_timer) && ++ !bfq_bfqq_must_idle(bfqq)) + goto expire; + + next_rq = bfqq->next_rq; @@ -2560,16 +3338,18 @@ index 0000000..b230927 + goto expire; + } else { + /* -+ * The idle timer may be pending because we may not -+ * disable disk idling even when a new request arrives ++ * The idle timer may be pending because we may ++ * not disable disk idling even when a new request ++ * arrives. + */ + if (timer_pending(&bfqd->idle_slice_timer)) { + /* + * If we get here: 1) at least a new request + * has arrived but we have not disabled the + * timer because the request was too small, -+ * 2) then the block layer has unplugged the -+ * device, causing the dispatch to be invoked. ++ * 2) then the block layer has unplugged ++ * the device, causing the dispatch to be ++ * invoked. + * + * Since the device is unplugged, now the + * requests are probably large enough to @@ -2587,14 +3367,12 @@ index 0000000..b230927 + } + + /* -+ * No requests pending. If there is no cooperator, and the active -+ * queue still has requests in flight or is idling for a new request, -+ * then keep it. ++ * No requests pending. If the in-service queue still has requests ++ * in flight (possibly waiting for a completion) or is idling for a ++ * new request, then keep it. + */ + if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) || -+ (bfqq->dispatched != 0 && -+ (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) && -+ !bfq_queue_nonrot_noidle(bfqd, bfqq)))) { ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) { + bfqq = NULL; + goto keep_queue; + } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) { @@ -2610,58 +3388,48 @@ index 0000000..b230927 +expire: + bfq_bfqq_expire(bfqd, bfqq, 0, reason); +new_queue: -+ bfqq = bfq_set_active_queue(bfqd, new_bfqq); ++ bfqq = bfq_set_in_service_queue(bfqd, new_bfqq); + bfq_log(bfqd, "select_queue: new queue %d returned", + bfqq != NULL ? 
bfqq->pid : 0); +keep_queue: + return bfqq; +} + -+static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++static void bfq_update_wr_data(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) +{ -+ if (bfqq->raising_coeff > 1) { /* queue is being boosted */ ++ if (bfqq->wr_coeff > 1) { /* queue is being boosted */ + struct bfq_entity *entity = &bfqq->entity; + + bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, " -+ "old raising coeff %u, w %d(%d)", ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - -+ bfqq->last_rais_start_finish), -+ jiffies_to_msecs(bfqq->raising_cur_max_time), -+ bfqq->raising_coeff, ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + -+ BUG_ON(bfqq != bfqd->active_queue && entity->weight != -+ entity->orig_weight * bfqq->raising_coeff); -+ if(entity->ioprio_changed) -+ bfq_log_bfqq(bfqd, bfqq, -+ "WARN: pending prio change"); ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != ++ entity->orig_weight * bfqq->wr_coeff); ++ if (entity->ioprio_changed) ++ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); + /* -+ * If too much time has elapsed from the beginning -+ * of this weight-raising period and process is not soft -+ * real-time, stop it ++ * If the queue was activated in a burst, or ++ * too much time has elapsed from the beginning ++ * of this weight-raising, then end weight raising. + */ -+ if (jiffies - bfqq->last_rais_start_finish > -+ bfqq->raising_cur_max_time) { -+ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && -+ bfqq->soft_rt_next_start < jiffies; -+ -+ bfqq->last_rais_start_finish = jiffies; -+ if (soft_rt) -+ bfqq->raising_cur_max_time = -+ bfqd->bfq_raising_rt_max_time; -+ else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %llu msec," -+ "rais_max_time %u", -+ bfqq->last_rais_start_finish, -+ jiffies_to_msecs(bfqq-> -+ raising_cur_max_time)); -+ bfq_bfqq_end_raising(bfqq); -+ __bfq_entity_update_weight_prio( -+ bfq_entity_service_tree(entity), -+ entity); -+ } ++ if (bfq_bfqq_in_large_burst(bfqq) || ++ time_is_before_jiffies(bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time)) { ++ bfqq->last_wr_start_finish = jiffies; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ bfqq->last_wr_start_finish, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ bfq_bfqq_end_wr(bfqq); ++ __bfq_entity_update_weight_prio( ++ bfq_entity_service_tree(entity), ++ entity); + } + } +} @@ -2687,20 +3455,18 @@ index 0000000..b230927 + + if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { + /* -+ * This may happen if the next rq is chosen -+ * in fifo order instead of sector order. -+ * The budget is properly dimensioned -+ * to be always sufficient to serve the next request -+ * only if it is chosen in sector order. The reason is -+ * that it would be quite inefficient and little useful -+ * to always make sure that the budget is large enough -+ * to serve even the possible next rq in fifo order. ++ * This may happen if the next rq is chosen in fifo order ++ * instead of sector order. The budget is properly ++ * dimensioned to be always sufficient to serve the next ++ * request only if it is chosen in sector order. The reason ++ * is that it would be quite inefficient and little useful ++ * to always make sure that the budget is large enough to ++ * serve even the possible next rq in fifo order. + * In fact, requests are seldom served in fifo order. 
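The BUG_ON in bfq_update_wr_data() above encodes the weight-raising invariant: while raising is in effect the entity weight is simply the original weight multiplied by wr_coeff, and it drops back when bfq_bfqq_end_wr() runs. A small worked example (plain C, not part of the patch; the original weight of 4 is an arbitrary value, the coefficient 20 is the default set in bfq_init_queue() further down):

/* Effective weight of a queue before, during and after weight raising. */
#include <stdio.h>

int main(void)
{
	unsigned int orig_weight = 4;	/* example original weight */
	unsigned int wr_coeff = 20;	/* default bfq_wr_coeff */

	printf("normal weight:          %u\n", orig_weight);
	printf("while weight-raised:    %u\n", orig_weight * wr_coeff);
	printf("after bfq_bfqq_end_wr:  %u\n", orig_weight);
	return 0;
}

During the raising period the queue therefore competes as if it were twenty times heavier, which is what gives newly backlogged and soft real-time queues their short-term latency advantage.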
+ * -+ * Expire the queue for budget exhaustion, and -+ * make sure that the next act_budget is enough -+ * to serve the next request, even if it comes -+ * from the fifo expired path. ++ * Expire the queue for budget exhaustion, and make sure ++ * that the next act_budget is enough to serve the next ++ * request, even if it comes from the fifo expired path. + */ + bfqq->next_rq = rq; + /* @@ -2716,19 +3482,19 @@ index 0000000..b230927 + bfq_bfqq_served(bfqq, service_to_charge); + bfq_dispatch_insert(bfqd->queue, rq); + -+ update_raising_data(bfqd, bfqq); ++ bfq_update_wr_data(bfqd, bfqq); + -+ bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), " -+ "budg left %lu", ++ bfq_log_bfqq(bfqd, bfqq, ++ "dispatched %u sec req (%llu), budg left %lu", + blk_rq_sectors(rq), + (long long unsigned)blk_rq_pos(rq), + bfq_bfqq_budget_left(bfqq)); + + dispatched++; + -+ if (bfqd->active_bic == NULL) { ++ if (bfqd->in_service_bic == NULL) { + atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); -+ bfqd->active_bic = RQ_BIC(rq); ++ bfqd->in_service_bic = RQ_BIC(rq); + } + + if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && @@ -2757,8 +3523,8 @@ index 0000000..b230927 +} + +/* -+ * Drain our current requests. Used for barriers and when switching -+ * io schedulers on-the-fly. ++ * Drain our current requests. ++ * Used for barriers and when switching io schedulers on-the-fly. + */ +static int bfq_forced_dispatch(struct bfq_data *bfqd) +{ @@ -2766,7 +3532,7 @@ index 0000000..b230927 + struct bfq_service_tree *st; + int dispatched = 0; + -+ bfqq = bfqd->active_queue; ++ bfqq = bfqd->in_service_queue; + if (bfqq != NULL) + __bfq_bfqq_expire(bfqd, bfqq); + @@ -2802,7 +3568,8 @@ index 0000000..b230927 + if (unlikely(force)) + return bfq_forced_dispatch(bfqd); + -+ if((bfqq = bfq_select_queue(bfqd)) == NULL) ++ bfqq = bfq_select_queue(bfqd); ++ if (bfqq == NULL) + return 0; + + max_dispatch = bfqd->bfq_quantum; @@ -2825,11 +3592,11 @@ index 0000000..b230927 + bfq_clear_bfqq_wait_request(bfqq); + BUG_ON(timer_pending(&bfqd->idle_slice_timer)); + -+ if (! bfq_dispatch_request(bfqd, bfqq)) ++ if (!bfq_dispatch_request(bfqd, bfqq)) + return 0; + -+ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d" -+ "(max_disp %d)", bfqq->pid, max_dispatch); ++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d (max_disp %d)", ++ bfqq->pid, max_dispatch); + + return 1; +} @@ -2855,7 +3622,18 @@ index 0000000..b230927 + BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); + BUG_ON(bfqq->entity.tree != NULL); + BUG_ON(bfq_bfqq_busy(bfqq)); -+ BUG_ON(bfqd->active_queue == bfqq); ++ BUG_ON(bfqd->in_service_queue == bfqq); ++ ++ if (bfq_bfqq_sync(bfqq)) ++ /* ++ * The fact that this queue is being destroyed does not ++ * invalidate the fact that this queue may have been ++ * activated during the current burst. As a consequence, ++ * although the queue does not exist anymore, and hence ++ * needs to be removed from the burst list if there, ++ * the burst size has not to be decremented. 
++ */ ++ hlist_del_init(&bfqq->burst_list_node); + + bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); + @@ -2873,10 +3651,8 @@ index 0000000..b230927 + */ + __bfqq = bfqq->new_bfqq; + while (__bfqq) { -+ if (__bfqq == bfqq) { -+ WARN(1, "bfqq->new_bfqq loop detected.\n"); ++ if (__bfqq == bfqq) + break; -+ } + next = __bfqq->new_bfqq; + bfq_put_queue(__bfqq); + __bfqq = next; @@ -2885,7 +3661,7 @@ index 0000000..b230927 + +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ -+ if (bfqq == bfqd->active_queue) { ++ if (bfqq == bfqd->in_service_queue) { + __bfq_bfqq_expire(bfqd, bfqq); + bfq_schedule_dispatch(bfqd); + } @@ -2898,7 +3674,7 @@ index 0000000..b230927 + bfq_put_queue(bfqq); +} + -+static void bfq_init_icq(struct io_cq *icq) ++static inline void bfq_init_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + @@ -2936,7 +3712,8 @@ index 0000000..b230927 + ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + switch (ioprio_class) { + default: -+ printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class); ++ dev_err(bfqq->bfqd->queue->backing_dev_info.dev, ++ "bfq: bad prio class %d\n", ioprio_class); + case IOPRIO_CLASS_NONE: + /* + * No prio set, inherit CPU scheduling settings. @@ -2959,13 +3736,15 @@ index 0000000..b230927 + break; + } + ++ if (bfqq->entity.new_ioprio < 0 || ++ bfqq->entity.new_ioprio >= IOPRIO_BE_NR) { ++ printk(KERN_CRIT "bfq_init_prio_data: new_ioprio %d\n", ++ bfqq->entity.new_ioprio); ++ BUG(); ++ } ++ + bfqq->entity.ioprio_changed = 1; + -+ /* -+ * Keep track of original prio settings in case we have to temporarily -+ * elevate the priority of this queue. -+ */ -+ bfqq->org_ioprio = bfqq->entity.new_ioprio; + bfq_clear_bfqq_prio_changed(bfqq); +} + @@ -2977,10 +3756,11 @@ index 0000000..b230927 + unsigned long uninitialized_var(flags); + int ioprio = bic->icq.ioc->ioprio; + -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags); ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), ++ &flags); + /* -+ * This condition may trigger on a newly created bic, be sure to drop the -+ * lock before returning. ++ * This condition may trigger on a newly created bic, be sure to ++ * drop the lock before returning. + */ + if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio)) + goto out; @@ -3015,6 +3795,7 @@ index 0000000..b230927 +{ + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); ++ INIT_HLIST_NODE(&bfqq->burst_list_node); + + atomic_set(&bfqq->ref, 0); + bfqq->bfqd = bfqd; @@ -3026,14 +3807,19 @@ index 0000000..b230927 + bfq_mark_bfqq_idle_window(bfqq); + bfq_mark_bfqq_sync(bfqq); + } ++ bfq_mark_bfqq_IO_bound(bfqq); + + /* Tentative initial value to trade off between thr and lat */ + bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; + bfqq->pid = pid; + -+ bfqq->raising_coeff = 1; -+ bfqq->last_rais_start_finish = 0; -+ bfqq->soft_rt_next_start = -1; ++ bfqq->wr_coeff = 1; ++ bfqq->last_wr_start_finish = 0; ++ /* ++ * Set to the value for which bfqq will not be deemed as ++ * soft rt when it becomes backlogged. 
++ */ ++ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); +} + +static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, @@ -3073,14 +3859,13 @@ index 0000000..b230927 + + if (bfqq != NULL) { + bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync); ++ bfq_init_prio_data(bfqq, bic); ++ bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + } -+ -+ bfq_init_prio_data(bfqq, bic); -+ bfq_init_entity(&bfqq->entity, bfqg); + } + + if (new_bfqq != NULL) @@ -3127,7 +3912,8 @@ index 0000000..b230927 + bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask); + + /* -+ * Pin the queue now that it's allocated, scheduler exit will prune it. ++ * Pin the queue now that it's allocated, scheduler exit will ++ * prune it. + */ + if (!is_sync && *async_bfqq == NULL) { + atomic_inc(&bfqq->ref); @@ -3150,7 +3936,8 @@ index 0000000..b230927 + + bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; + bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples; ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / ++ bic->ttime.ttime_samples; +} + +static void bfq_update_io_seektime(struct bfq_data *bfqd, @@ -3180,19 +3967,6 @@ index 0000000..b230927 + bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; + total = bfqq->seek_total + (bfqq->seek_samples/2); + do_div(total, bfqq->seek_samples); -+ if (bfq_bfqq_coop(bfqq)) { -+ /* -+ * If the mean seektime increases for a (non-seeky) shared -+ * queue, some cooperator is likely to be idling too much. -+ * On the contrary, if it decreases, some cooperator has -+ * probably waked up. -+ * -+ */ -+ if ((sector_t)total < bfqq->seek_mean) -+ bfq_mark_bfqq_some_coop_idle(bfqq) ; -+ else if ((sector_t)total > bfqq->seek_mean) -+ bfq_clear_bfqq_some_coop_idle(bfqq) ; -+ } + bfqq->seek_mean = (sector_t)total; + + bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, @@ -3218,11 +3992,11 @@ index 0000000..b230927 + if (atomic_read(&bic->icq.ioc->active_ref) == 0 || + bfqd->bfq_slice_idle == 0 || + (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && -+ bfqq->raising_coeff == 1)) ++ bfqq->wr_coeff == 1)) + enable_idle = 0; + else if (bfq_sample_valid(bic->ttime.ttime_samples)) { + if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && -+ bfqq->raising_coeff == 1) ++ bfqq->wr_coeff == 1) + enable_idle = 0; + else + enable_idle = 1; @@ -3250,6 +4024,13 @@ index 0000000..b230927 + + bfq_update_io_thinktime(bfqd, bic); + bfq_update_io_seektime(bfqd, bfqq, rq); ++ if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { ++ bfq_clear_bfqq_constantly_seeky(bfqq); ++ if (!blk_queue_nonrot(bfqd->queue)) { ++ BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); ++ bfqd->const_seeky_busy_in_flight_queues--; ++ } ++ } + if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || + !BFQQ_SEEKY(bfqq)) + bfq_update_idle_window(bfqd, bfqq, bic); @@ -3261,43 +4042,52 @@ index 0000000..b230927 + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + -+ if (bfqq == bfqd->active_queue) { ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { ++ int small_req = bfqq->queued[rq_is_sync(rq)] == 1 && ++ blk_rq_sectors(rq) < 32; ++ int budget_timeout = bfq_bfqq_budget_timeout(bfqq); ++ + /* -+ * If there is just this request queued and the request -+ * is small, just exit. 
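The think-time and seek statistics updated in bfq_update_io_thinktime() and bfq_update_io_seektime() above are decayed averages kept in fixed point: each update retains 7/8 of the history and folds in the new sample scaled by 256, and the sample counter itself converges towards 256. A standalone sketch of the think-time variant (plain C, not part of the patch, made-up time units):

/* Fixed-point decayed average as used for per-context think time. The same
 * 7/8 + 1/8 weighting, with 256 as the fixed-point scale, is used for the
 * per-queue seek distance. */
#include <stdio.h>

struct ttime_avg {
	unsigned long samples;	/* converges towards 256 */
	unsigned long total;	/* roughly 256 * decayed mean */
	unsigned long mean;
};

static void ttime_update(struct ttime_avg *t, unsigned long ttime)
{
	t->samples = (7 * t->samples + 256) / 8;
	t->total   = (7 * t->total + 256 * ttime) / 8;
	t->mean    = (t->total + 128) / t->samples;
}

int main(void)
{
	struct ttime_avg t = { 0, 0, 0 };
	unsigned long trace[] = { 4, 4, 4, 40, 40, 40, 40, 40 };

	for (unsigned int i = 0; i < sizeof(trace) / sizeof(trace[0]); i++) {
		ttime_update(&t, trace[i]);
		printf("sample=%2lu  samples=%3lu  mean=%2lu\n",
		       trace[i], t.samples, t.mean);
	}
	return 0;
}

The mean drifts from the old value towards the new one over a handful of samples instead of jumping, which is what lets the idle-window heuristic above compare a reasonably stable think-time estimate against bfq_slice_idle.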
-+ * In this way, if the disk is being idled to wait for a new -+ * request from the active queue, we avoid unplugging the -+ * device now. ++ * There is just this request queued: if the request ++ * is small and the queue is not to be expired, then ++ * just exit. + * -+ * By doing so, we spare the disk to be committed -+ * to serve just a small request. On the contrary, we wait for ++ * In this way, if the disk is being idled to wait for ++ * a new request from the in-service queue, we avoid ++ * unplugging the device and committing the disk to serve ++ * just a small request. On the contrary, we wait for + * the block layer to decide when to unplug the device: -+ * hopefully, new requests will be merged to this -+ * one quickly, then the device will be unplugged -+ * and larger requests will be dispatched. ++ * hopefully, new requests will be merged to this one ++ * quickly, then the device will be unplugged and ++ * larger requests will be dispatched. + */ -+ if (bfqq->queued[rq_is_sync(rq)] == 1 && -+ blk_rq_sectors(rq) < 32) { -+ return; -+ } -+ if (bfq_bfqq_wait_request(bfqq)) { -+ /* -+ * If we are waiting for a request for this queue, let -+ * it rip immediately and flag that we must not expire -+ * this queue just now. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ del_timer(&bfqd->idle_slice_timer); -+ /* -+ * Here we can safely expire the queue, in -+ * case of budget timeout, without wasting -+ * guarantees -+ */ -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, 0, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ __blk_run_queue(bfqd->queue); -+ } ++ if (small_req && !budget_timeout) ++ return; ++ ++ /* ++ * A large enough request arrived, or the queue is to ++ * be expired: in both cases disk idling is to be ++ * stopped, so clear wait_request flag and reset ++ * timer. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ del_timer(&bfqd->idle_slice_timer); ++ ++ /* ++ * The queue is not empty, because a new request just ++ * arrived. Hence we can safely expire the queue, in ++ * case of budget timeout, without risking that the ++ * timestamps of the queue are not updated correctly. ++ * See [1] for more details. ++ */ ++ if (budget_timeout) ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); ++ ++ /* ++ * Let the request rip immediately, or let a new queue be ++ * selected if bfqq has just been expired. 
++ */ ++ __blk_run_queue(bfqd->queue); + } +} + @@ -3309,7 +4099,7 @@ index 0000000..b230927 + assert_spin_locked(bfqd->queue->queue_lock); + bfq_init_prio_data(bfqq, RQ_BIC(rq)); + -+ bfq_add_rq_rb(rq); ++ bfq_add_request(rq); + + rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); + list_add_tail(&rq->queuelist, &bfqq->fifo); @@ -3346,50 +4136,74 @@ index 0000000..b230927 +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); ++ bool sync = bfq_bfqq_sync(bfqq); + -+ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)", -+ blk_rq_sectors(rq), sync); ++ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", ++ blk_rq_sectors(rq), sync); + + bfq_update_hw_tag(bfqd); + -+ WARN_ON(!bfqd->rq_in_driver); -+ WARN_ON(!bfqq->dispatched); ++ BUG_ON(!bfqd->rq_in_driver); ++ BUG_ON(!bfqq->dispatched); + bfqd->rq_in_driver--; + bfqq->dispatched--; + -+ if (bfq_bfqq_sync(bfqq)) -+ bfqd->sync_flight--; ++ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { ++ bfq_weights_tree_remove(bfqd, &bfqq->entity, ++ &bfqd->queue_weights_tree); ++ if (!blk_queue_nonrot(bfqd->queue)) { ++ BUG_ON(!bfqd->busy_in_flight_queues); ++ bfqd->busy_in_flight_queues--; ++ if (bfq_bfqq_constantly_seeky(bfqq)) { ++ BUG_ON(!bfqd-> ++ const_seeky_busy_in_flight_queues); ++ bfqd->const_seeky_busy_in_flight_queues--; ++ } ++ } ++ } + -+ if (sync) ++ if (sync) { ++ bfqd->sync_flight--; + RQ_BIC(rq)->ttime.last_end_request = jiffies; ++ } + + /* -+ * If this is the active queue, check if it needs to be expired, ++ * If we are waiting to discover whether the request pattern of the ++ * task associated with the queue is actually isochronous, and ++ * both requisites for this condition to hold are satisfied, then ++ * compute soft_rt_next_start (see the comments to the function ++ * bfq_bfqq_softrt_next_start()). ++ */ ++ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list)) ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ ++ /* ++ * If this is the in-service queue, check if it needs to be expired, + * or if we want to idle in case it has no pending requests. 
+ */ -+ if (bfqd->active_queue == bfqq) { -+ int budg_timeout = bfq_may_expire_for_budg_timeout(bfqq); ++ if (bfqd->in_service_queue == bfqq) { + if (bfq_bfqq_budget_new(bfqq)) + bfq_set_budget_timeout(bfqd); + -+ /* Idling is disabled also for cooperation issues: -+ * 1) there is a close cooperator for the queue, or -+ * 2) the queue is shared and some cooperator is likely -+ * to be idle (in this case, by not arming the idle timer, -+ * we try to slow down the queue, to prevent the zones -+ * of the disk accessed by the active cooperators to become -+ * too distant from the zone that will be accessed by the -+ * currently idle cooperators) -+ */ -+ if (bfq_bfqq_must_idle(bfqq, budg_timeout)) ++ if (bfq_bfqq_must_idle(bfqq)) { + bfq_arm_slice_timer(bfqd); -+ else if (budg_timeout) ++ goto out; ++ } else if (bfq_may_expire_for_budg_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && ++ (bfqq->dispatched == 0 || ++ !bfq_bfqq_must_not_expire(bfqq))) ++ bfq_bfqq_expire(bfqd, bfqq, 0, ++ BFQ_BFQQ_NO_MORE_REQUESTS); + } + + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); ++ ++out: ++ return; +} + +static inline int __bfq_may_queue(struct bfq_queue *bfqq) @@ -3411,9 +4225,9 @@ index 0000000..b230927 + + /* + * Don't force setup of a queue from here, as a call to may_queue -+ * does not necessarily imply that a request actually will be queued. -+ * So just lookup a possibly existing queue, or return 'may queue' -+ * if that fails. ++ * does not necessarily imply that a request actually will be ++ * queued. So just lookup a possibly existing queue, or return ++ * 'may queue' if that fails. + */ + bic = bfq_bic_lookup(bfqd, tsk->io_context); + if (bic == NULL) @@ -3453,14 +4267,14 @@ index 0000000..b230927 + +static struct bfq_queue * +bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq) ++ struct bfq_queue *bfqq) +{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", + (long unsigned)bfqq->new_bfqq->pid); -+ bic_set_bfqq(bic, bfqq->new_bfqq, 1); -+ bfq_mark_bfqq_coop(bfqq->new_bfqq); -+ bfq_put_queue(bfqq); -+ return bic_to_bfqq(bic, 1); ++ bic_set_bfqq(bic, bfqq->new_bfqq, 1); ++ bfq_mark_bfqq_coop(bfqq->new_bfqq); ++ bfq_put_queue(bfqq); ++ return bic_to_bfqq(bic, 1); +} + +/* @@ -3473,7 +4287,6 @@ index 0000000..b230927 + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); + if (bfqq_process_refs(bfqq) == 1) { + bfqq->pid = current->pid; -+ bfq_clear_bfqq_some_coop_idle(bfqq); + bfq_clear_bfqq_coop(bfqq); + bfq_clear_bfqq_split_coop(bfqq); + return bfqq; @@ -3569,7 +4382,7 @@ index 0000000..b230927 +} + +/* -+ * Handler of the expiration of the timer running if the active_queue ++ * Handler of the expiration of the timer running if the in-service queue + * is idling inside its time slice. + */ +static void bfq_idle_slice_timer(unsigned long data) @@ -3581,14 +4394,14 @@ index 0000000..b230927 + + spin_lock_irqsave(bfqd->queue->queue_lock, flags); + -+ bfqq = bfqd->active_queue; ++ bfqq = bfqd->in_service_queue; + /* -+ * Theoretical race here: active_queue can be NULL or different -+ * from the queue that was idling if the timer handler spins on -+ * the queue_lock and a new request arrives for the current -+ * queue and there is a full dispatch cycle that changes the -+ * active_queue. This can hardly happen, but in the worst case -+ * we just expire a queue too early. 
++ * Theoretical race here: the in-service queue can be NULL or ++ * different from the queue that was idling if the timer handler ++ * spins on the queue_lock and a new request arrives for the ++ * current queue and there is a full dispatch cycle that changes ++ * the in-service queue. This can hardly happen, but in the worst ++ * case we just expire a queue too early. + */ + if (bfqq != NULL) { + bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); @@ -3602,9 +4415,9 @@ index 0000000..b230927 + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) + /* + * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the first -+ * request of the active queue arrives during -+ * disk idling ++ * because we may not disable the timer when the ++ * first request of the in-service queue arrives ++ * during disk idling. + */ + reason = BFQ_BFQQ_TOO_IDLE; + else @@ -3645,7 +4458,7 @@ index 0000000..b230927 + * Release all the bfqg references to its async queues. If we are + * deallocating the group these queues may still contain requests, so + * we reparent them to the root cgroup (i.e., the only one that will -+ * exist for sure untill all the requests on a device are gone). ++ * exist for sure until all the requests on a device are gone). + */ +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) +{ @@ -3668,7 +4481,7 @@ index 0000000..b230927 + + spin_lock_irq(q->queue_lock); + -+ BUG_ON(bfqd->active_queue != NULL); ++ BUG_ON(bfqd->in_service_queue != NULL); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, 0); + @@ -3690,7 +4503,7 @@ index 0000000..b230927 + struct bfq_group *bfqg; + struct bfq_data *bfqd; + -+ bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node); ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); + if (bfqd == NULL) + return -ENOMEM; + @@ -3701,6 +4514,14 @@ index 0000000..b230927 + */ + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0); + atomic_inc(&bfqd->oom_bfqq.ref); ++ bfqd->oom_bfqq.entity.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; ++ bfqd->oom_bfqq.entity.new_ioprio_class = IOPRIO_CLASS_BE; ++ /* ++ * Trigger weight initialization, according to ioprio, at the ++ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio ++ * class won't be changed any more. 
++ */ ++ bfqd->oom_bfqq.entity.ioprio_changed = 1; + + bfqd->queue = q; + q->elevator->elevator_data = bfqd; @@ -3712,17 +4533,24 @@ index 0000000..b230927 + } + + bfqd->root_group = bfqg; ++ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); ++#ifdef CONFIG_CGROUP_BFQIO ++ bfqd->active_numerous_groups = 0; ++#endif + + init_timer(&bfqd->idle_slice_timer); + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + bfqd->idle_slice_timer.data = (unsigned long)bfqd; + + bfqd->rq_pos_tree = RB_ROOT; ++ bfqd->queue_weights_tree = RB_ROOT; ++ bfqd->group_weights_tree = RB_ROOT; + + INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); + + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); ++ INIT_HLIST_HEAD(&bfqd->burst_list); + + bfqd->hw_tag = -1; + @@ -3739,23 +4567,38 @@ index 0000000..b230927 + bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; + bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; + ++ bfqd->bfq_coop_thresh = 2; ++ bfqd->bfq_failed_cooperations = 7000; ++ bfqd->bfq_requests_within_timer = 120; ++ ++ bfqd->bfq_large_burst_thresh = 11; ++ bfqd->bfq_burst_interval = msecs_to_jiffies(500); ++ + bfqd->low_latency = true; + -+ bfqd->bfq_raising_coeff = 20; -+ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300); -+ bfqd->bfq_raising_max_time = 0; -+ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000); -+ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500); -+ bfqd->bfq_raising_max_softrt_rate = 7000; -+ -+ /* Initially estimate the device's peak rate as the reference rate */ -+ if (blk_queue_nonrot(bfqd->queue)) { -+ bfqd->RT_prod = R_nonrot * T_nonrot; -+ bfqd->peak_rate = R_nonrot; -+ } else { -+ bfqd->RT_prod = R_rot * T_rot; -+ bfqd->peak_rate = R_rot; -+ } ++ bfqd->bfq_wr_coeff = 20; ++ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); ++ bfqd->bfq_wr_max_time = 0; ++ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); ++ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); ++ bfqd->bfq_wr_max_softrt_rate = 7000; /* ++ * Approximate rate required ++ * to playback or record a ++ * high-definition compressed ++ * video. ++ */ ++ bfqd->wr_busy_queues = 0; ++ bfqd->busy_in_flight_queues = 0; ++ bfqd->const_seeky_busy_in_flight_queues = 0; ++ ++ /* ++ * Begin by assuming, optimistically, that the device peak rate is ++ * equal to the highest reference rate. ++ */ ++ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * ++ T_fast[blk_queue_nonrot(bfqd->queue)]; ++ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; ++ bfqd->device_speed = BFQ_BFQD_FAST; + + return 0; +} @@ -3779,10 +4622,11 @@ index 0000000..b230927 + return sprintf(page, "%d\n", var); +} + -+static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count) ++static ssize_t bfq_var_store(unsigned long *var, const char *page, ++ size_t count) +{ + unsigned long new_val; -+ int ret = strict_strtoul(page, 10, &new_val); ++ int ret = kstrtoul(page, 10, &new_val); + + if (ret == 0) + *var = new_val; @@ -3790,12 +4634,12 @@ index 0000000..b230927 + return count; +} + -+static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page) ++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) +{ + struct bfq_data *bfqd = e->elevator_data; -+ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ? -+ jiffies_to_msecs(bfqd->bfq_raising_max_time) : -+ jiffies_to_msecs(bfq_wrais_duration(bfqd))); ++ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
++ jiffies_to_msecs(bfqd->bfq_wr_max_time) : ++ jiffies_to_msecs(bfq_wr_duration(bfqd))); +} + +static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) @@ -3812,15 +4656,13 @@ index 0000000..b230927 + num_char += sprintf(page + num_char, "Active:\n"); + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { + num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, nr_queued %d %d," -+ " dur %d/%u\n", ++ "pid%d: weight %hu, nr_queued %d %d, dur %d/%u\n", + bfqq->pid, + bfqq->entity.weight, + bfqq->queued[0], + bfqq->queued[1], -+ jiffies_to_msecs(jiffies - -+ bfqq->last_rais_start_finish), -+ jiffies_to_msecs(bfqq->raising_cur_max_time)); ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + num_char += sprintf(page + num_char, "Idle:\n"); @@ -3830,8 +4672,8 @@ index 0000000..b230927 + bfqq->pid, + bfqq->entity.weight, + jiffies_to_msecs(jiffies - -+ bfqq->last_rais_start_finish), -+ jiffies_to_msecs(bfqq->raising_cur_max_time)); ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); + } + + spin_unlock_irq(bfqd->queue->queue_lock); @@ -3855,19 +4697,17 @@ index 0000000..b230927 +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); -+SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0); ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, ++ bfqd->bfq_max_budget_async_rq, 0); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); +SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -+SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0); -+SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1); -+SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time, -+ 1); -+SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show, -+ bfqd->bfq_raising_min_inter_arr_async, ++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); ++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); ++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); ++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, + 1); -+SHOW_FUNCTION(bfq_raising_max_softrt_rate_show, -+ bfqd->bfq_raising_max_softrt_rate, 0); ++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ @@ -3900,18 +4740,16 @@ index 0000000..b230927 + 1, INT_MAX, 0); +STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, + INT_MAX, 1); -+STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1, -+ INT_MAX, 0); -+STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0, ++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, ++ 1); ++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, + INT_MAX, 1); -+STORE_FUNCTION(bfq_raising_min_idle_time_store, -+ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1); 
-+STORE_FUNCTION(bfq_raising_min_inter_arr_async_store, -+ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_raising_max_softrt_rate_store, -+ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0); ++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, ++ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, ++ INT_MAX, 0); +#undef STORE_FUNCTION + +/* do nothing for the moment */ @@ -3980,7 +4818,7 @@ index 0000000..b230927 + if (__data > 1) + __data = 1; + if (__data == 0 && bfqd->low_latency != 0) -+ bfq_end_raising(bfqd); ++ bfq_end_wr(bfqd); + bfqd->low_latency = __data; + + return ret; @@ -4001,12 +4839,12 @@ index 0000000..b230927 + BFQ_ATTR(timeout_sync), + BFQ_ATTR(timeout_async), + BFQ_ATTR(low_latency), -+ BFQ_ATTR(raising_coeff), -+ BFQ_ATTR(raising_max_time), -+ BFQ_ATTR(raising_rt_max_time), -+ BFQ_ATTR(raising_min_idle_time), -+ BFQ_ATTR(raising_min_inter_arr_async), -+ BFQ_ATTR(raising_max_softrt_rate), ++ BFQ_ATTR(wr_coeff), ++ BFQ_ATTR(wr_max_time), ++ BFQ_ATTR(wr_rt_max_time), ++ BFQ_ATTR(wr_min_idle_time), ++ BFQ_ATTR(wr_min_inter_arr_async), ++ BFQ_ATTR(wr_max_softrt_rate), + BFQ_ATTR(weights), + __ATTR_NULL +}; @@ -4053,7 +4891,25 @@ index 0000000..b230927 + if (bfq_slab_setup()) + return -ENOMEM; + ++ /* ++ * Times to load large popular applications for the typical systems ++ * installed on the reference devices (see the comments before the ++ * definitions of the two arrays). ++ */ ++ T_slow[0] = msecs_to_jiffies(2600); ++ T_slow[1] = msecs_to_jiffies(1000); ++ T_fast[0] = msecs_to_jiffies(5500); ++ T_fast[1] = msecs_to_jiffies(2000); ++ ++ /* ++ * Thresholds that determine the switch between speed classes (see ++ * the comments before the definition of the array). ++ */ ++ device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; ++ device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; ++ + elv_register(&iosched_bfq); ++ pr_info("BFQ I/O-scheduler version: v7r7"); + + return 0; +} @@ -4069,13 +4925,12 @@ index 0000000..b230927 + +MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); +MODULE_LICENSE("GPL"); -+MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler"); diff --git a/block/bfq-sched.c b/block/bfq-sched.c new file mode 100644 -index 0000000..03f8061 +index 0000000..2931563 --- /dev/null +++ b/block/bfq-sched.c -@@ -0,0 +1,1072 @@ +@@ -0,0 +1,1214 @@ +/* + * BFQ: Hierarchical B-WF2Q+ scheduler. + * @@ -4099,32 +4954,32 @@ index 0000000..03f8061 + int extract, + struct bfq_data *bfqd); + -+static inline void bfq_update_budget(struct bfq_entity *next_active) ++static inline void bfq_update_budget(struct bfq_entity *next_in_service) +{ + struct bfq_entity *bfqg_entity; + struct bfq_group *bfqg; + struct bfq_sched_data *group_sd; + -+ BUG_ON(next_active == NULL); ++ BUG_ON(next_in_service == NULL); + -+ group_sd = next_active->sched_data; ++ group_sd = next_in_service->sched_data; + + bfqg = container_of(group_sd, struct bfq_group, sched_data); + /* + * bfq_group's my_entity field is not NULL only if the group + * is not the root group. We must not touch the root entity -+ * as it must never become an active entity. ++ * as it must never become an in-service entity. 
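Stepping back to the device-speed estimate: bfq_init() above seeds the per-class boot times (T_slow, T_fast) and puts the switching threshold halfway between the slow and fast reference rates, while bfq_update_peak_rate() earlier in the patch moves the device between the two classes as the measured peak rate crosses that threshold. A compressed sketch of that state machine (plain C, not part of the patch; the R_slow/R_fast numbers are placeholders, only the T values and the threshold formula come from the hunks above):

/* Two-class device-speed estimate. Index 0 = rotational, 1 = flash. */
#include <stdio.h>

enum bfq_device_speed { BFQ_BFQD_FAST, BFQ_BFQD_SLOW };

int main(void)
{
	unsigned long R_slow[2] = { 1000, 10000 };	/* placeholder reference rates */
	unsigned long R_fast[2] = { 7000, 70000 };
	unsigned long T_slow[2] = { 2600, 1000 };	/* ms, as in bfq_init() */
	unsigned long T_fast[2] = { 5500, 2000 };
	unsigned long thresh[2];
	int d = 0;					/* a rotational device */

	for (int i = 0; i < 2; i++)
		thresh[i] = (R_fast[i] + R_slow[i]) / 2;

	/* Optimistic start, as in bfq_init_queue(). */
	enum bfq_device_speed speed = BFQ_BFQD_FAST;
	unsigned long RT_prod = R_fast[d] * T_fast[d];

	/* A measured peak rate below the threshold demotes the device. */
	unsigned long peak_rate = 2500;
	if (speed == BFQ_BFQD_FAST && peak_rate < thresh[d]) {
		speed = BFQ_BFQD_SLOW;
		RT_prod = R_slow[d] * T_slow[d];
	}

	printf("thresh=%lu  speed=%s  RT_prod=%lu\n", thresh[d],
	       speed == BFQ_BFQD_SLOW ? "slow" : "fast", RT_prod);
	return 0;
}

The class switch in bfq_update_peak_rate() swaps RT_prod in exactly this way, alongside the max-budget recalculation that uses the measured peak rate.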
+ */ + bfqg_entity = bfqg->my_entity; + if (bfqg_entity != NULL) -+ bfqg_entity->budget = next_active->budget; ++ bfqg_entity->budget = next_in_service->budget; +} + -+static int bfq_update_next_active(struct bfq_sched_data *sd) ++static int bfq_update_next_in_service(struct bfq_sched_data *sd) +{ -+ struct bfq_entity *next_active; ++ struct bfq_entity *next_in_service; + -+ if (sd->active_entity != NULL) ++ if (sd->in_service_entity != NULL) + /* will update/requeue at the end of service */ + return 0; + @@ -4135,19 +4990,19 @@ index 0000000..03f8061 + * next from this subtree. By now we worry more about + * correctness than about performance... + */ -+ next_active = bfq_lookup_next_entity(sd, 0, NULL); -+ sd->next_active = next_active; ++ next_in_service = bfq_lookup_next_entity(sd, 0, NULL); ++ sd->next_in_service = next_in_service; + -+ if (next_active != NULL) -+ bfq_update_budget(next_active); ++ if (next_in_service != NULL) ++ bfq_update_budget(next_in_service); + + return 1; +} + -+static inline void bfq_check_next_active(struct bfq_sched_data *sd, -+ struct bfq_entity *entity) ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, ++ struct bfq_entity *entity) +{ -+ BUG_ON(sd->next_active != entity); ++ BUG_ON(sd->next_in_service != entity); +} +#else +#define for_each_entity(entity) \ @@ -4156,17 +5011,17 @@ index 0000000..03f8061 +#define for_each_entity_safe(entity, parent) \ + for (parent = NULL; entity != NULL; entity = parent) + -+static inline int bfq_update_next_active(struct bfq_sched_data *sd) ++static inline int bfq_update_next_in_service(struct bfq_sched_data *sd) +{ + return 0; +} + -+static inline void bfq_check_next_active(struct bfq_sched_data *sd, -+ struct bfq_entity *entity) ++static inline void bfq_check_next_in_service(struct bfq_sched_data *sd, ++ struct bfq_entity *entity) +{ +} + -+static inline void bfq_update_budget(struct bfq_entity *next_active) ++static inline void bfq_update_budget(struct bfq_entity *next_in_service) +{ +} +#endif @@ -4175,7 +5030,8 @@ index 0000000..03f8061 + * Shift for timestamp calculations. This actually limits the maximum + * service allowed in one timestamp delta (small shift values increase it), + * the maximum total weight that can be used for the queues in the system -+ * (big shift values increase it), and the period of virtual time wraparounds. ++ * (big shift values increase it), and the period of virtual time ++ * wraparounds. + */ +#define WFQ_SERVICE_SHIFT 22 + @@ -4407,8 +5263,18 @@ index 0000000..03f8061 + goto up; +} + ++static void bfq_weights_tree_add(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root); ++ ++static void bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_entity *entity, ++ struct rb_root *root); ++ ++ +/** -+ * bfq_active_insert - insert an entity in the active tree of its group/device. ++ * bfq_active_insert - insert an entity in the active tree of its ++ * group/device. + * @st: the service tree of the entity. + * @entity: the entity being inserted. 
+ * @@ -4422,6 +5288,11 @@ index 0000000..03f8061 +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node = &entity->rb_node; ++#ifdef CONFIG_CGROUP_BFQIO ++ struct bfq_sched_data *sd = NULL; ++ struct bfq_group *bfqg = NULL; ++ struct bfq_data *bfqd = NULL; ++#endif + + bfq_insert(&st->active, entity); + @@ -4432,17 +5303,36 @@ index 0000000..03f8061 + + bfq_update_active_tree(node); + ++#ifdef CONFIG_CGROUP_BFQIO ++ sd = entity->sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++#endif + if (bfqq != NULL) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); ++#ifdef CONFIG_CGROUP_BFQIO ++ else { /* bfq_group */ ++ BUG_ON(!bfqd); ++ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); ++ } ++ if (bfqg != bfqd->root_group) { ++ BUG_ON(!bfqg); ++ BUG_ON(!bfqd); ++ bfqg->active_entities++; ++ if (bfqg->active_entities == 2) ++ bfqd->active_numerous_groups++; ++ } ++#endif +} + +/** + * bfq_ioprio_to_weight - calc a weight from an ioprio. + * @ioprio: the ioprio value to convert. + */ -+static unsigned short bfq_ioprio_to_weight(int ioprio) ++static inline unsigned short bfq_ioprio_to_weight(int ioprio) +{ -+ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); ++ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); + return IOPRIO_BE_NR - ioprio; +} + @@ -4454,19 +5344,17 @@ index 0000000..03f8061 + * 0 is used as an escape ioprio value for weights (numerically) equal or + * larger than IOPRIO_BE_NR + */ -+static unsigned short bfq_weight_to_ioprio(int weight) ++static inline unsigned short bfq_weight_to_ioprio(int weight) +{ -+ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); ++ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); + return IOPRIO_BE_NR - weight < 0 ? 
0 : IOPRIO_BE_NR - weight; +} + +static inline void bfq_get_entity(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_sched_data *sd; + + if (bfqq != NULL) { -+ sd = entity->sched_data; + atomic_inc(&bfqq->ref); + bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", + bfqq, atomic_read(&bfqq->ref)); @@ -4513,6 +5401,11 @@ index 0000000..03f8061 +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node; ++#ifdef CONFIG_CGROUP_BFQIO ++ struct bfq_sched_data *sd = NULL; ++ struct bfq_group *bfqg = NULL; ++ struct bfq_data *bfqd = NULL; ++#endif + + node = bfq_find_deepest(&entity->rb_node); + bfq_extract(&st->active, entity); @@ -4520,8 +5413,31 @@ index 0000000..03f8061 + if (node != NULL) + bfq_update_active_tree(node); + ++#ifdef CONFIG_CGROUP_BFQIO ++ sd = entity->sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++#endif + if (bfqq != NULL) + list_del(&bfqq->bfqq_list); ++#ifdef CONFIG_CGROUP_BFQIO ++ else { /* bfq_group */ ++ BUG_ON(!bfqd); ++ bfq_weights_tree_remove(bfqd, entity, ++ &bfqd->group_weights_tree); ++ } ++ if (bfqg != bfqd->root_group) { ++ BUG_ON(!bfqg); ++ BUG_ON(!bfqd); ++ BUG_ON(!bfqg->active_entities); ++ bfqg->active_entities--; ++ if (bfqg->active_entities == 1) { ++ BUG_ON(!bfqd->active_numerous_groups); ++ bfqd->active_numerous_groups--; ++ } ++ } ++#endif +} + +/** @@ -4619,11 +5535,37 @@ index 0000000..03f8061 + + if (entity->ioprio_changed) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned short prev_weight, new_weight; ++ struct bfq_data *bfqd = NULL; ++ struct rb_root *root; ++#ifdef CONFIG_CGROUP_BFQIO ++ struct bfq_sched_data *sd; ++ struct bfq_group *bfqg; ++#endif ++ ++ if (bfqq != NULL) ++ bfqd = bfqq->bfqd; ++#ifdef CONFIG_CGROUP_BFQIO ++ else { ++ sd = entity->my_sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++ BUG_ON(!bfqd); ++ } ++#endif + + BUG_ON(old_st->wsum < entity->weight); + old_st->wsum -= entity->weight; + + if (entity->new_weight != entity->orig_weight) { ++ if (entity->new_weight < BFQ_MIN_WEIGHT || ++ entity->new_weight > BFQ_MAX_WEIGHT) { ++ printk(KERN_CRIT "update_weight_prio: " ++ "new_weight %d\n", ++ entity->new_weight); ++ BUG(); ++ } + entity->orig_weight = entity->new_weight; + entity->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); @@ -4646,8 +5588,31 @@ index 0000000..03f8061 + * when entity->finish <= old_st->vtime). + */ + new_st = bfq_entity_service_tree(entity); -+ entity->weight = entity->orig_weight * -+ (bfqq != NULL ? bfqq->raising_coeff : 1); ++ ++ prev_weight = entity->weight; ++ new_weight = entity->orig_weight * ++ (bfqq != NULL ? bfqq->wr_coeff : 1); ++ /* ++ * If the weight of the entity changes, remove the entity ++ * from its old weight counter (if there is a counter ++ * associated with the entity), and add it to the counter ++ * associated with its new weight. ++ */ ++ if (prev_weight != new_weight) { ++ root = bfqq ? &bfqd->queue_weights_tree : ++ &bfqd->group_weights_tree; ++ bfq_weights_tree_remove(bfqd, entity, root); ++ } ++ entity->weight = new_weight; ++ /* ++ * Add the entity to its weights tree only if it is ++ * not associated with a weight-raised queue. ++ */ ++ if (prev_weight != new_weight && ++ (bfqq ? bfqq->wr_coeff == 1 : 1)) ++ /* If we get here, root has been initialized. 
*/ ++ bfq_weights_tree_add(bfqd, entity, root); ++ + new_st->wsum += entity->weight; + + if (new_st != old_st) @@ -4658,7 +5623,8 @@ index 0000000..03f8061 +} + +/** -+ * bfq_bfqq_served - update the scheduler status after selection for service. ++ * bfq_bfqq_served - update the scheduler status after selection for ++ * service. + * @bfqq: the queue being served. + * @served: bytes to transfer. + * @@ -4718,7 +5684,7 @@ index 0000000..03f8061 + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + -+ if (entity == sd->active_entity) { ++ if (entity == sd->in_service_entity) { + BUG_ON(entity->tree != NULL); + /* + * If we are requeueing the current entity we have @@ -4727,12 +5693,12 @@ index 0000000..03f8061 + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; -+ sd->active_entity = NULL; ++ sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) { + /* + * Requeueing an entity due to a change of some -+ * next_active entity below it. We reuse the old -+ * start time. ++ * next_in_service entity below it. We reuse the ++ * old start time. + */ + bfq_active_extract(st, entity); + } else if (entity->tree == &st->idle) { @@ -4776,11 +5742,11 @@ index 0000000..03f8061 + __bfq_activate_entity(entity); + + sd = entity->sched_data; -+ if (!bfq_update_next_active(sd)) ++ if (!bfq_update_next_in_service(sd)) + /* + * No need to propagate the activation to the + * upper entities, as they will be updated when -+ * the active entity is rescheduled. ++ * the in-service entity is rescheduled. + */ + break; + } @@ -4797,24 +5763,24 @@ index 0000000..03f8061 + * and if the caller did not specify @requeue, put it on the idle tree. + * + * Return %1 if the caller should update the entity hierarchy, i.e., -+ * if the entity was under service or if it was the next_active for ++ * if the entity was in service or if it was the next_in_service for + * its sched_data; return %0 otherwise. + */ +static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ int was_active = entity == sd->active_entity; ++ int was_in_service = entity == sd->in_service_entity; + int ret = 0; + + if (!entity->on_st) + return 0; + -+ BUG_ON(was_active && entity->tree != NULL); ++ BUG_ON(was_in_service && entity->tree != NULL); + -+ if (was_active) { ++ if (was_in_service) { + bfq_calc_finish(entity, entity->service); -+ sd->active_entity = NULL; ++ sd->in_service_entity = NULL; + } else if (entity->tree == &st->active) + bfq_active_extract(st, entity); + else if (entity->tree == &st->idle) @@ -4822,16 +5788,16 @@ index 0000000..03f8061 + else if (entity->tree != NULL) + BUG(); + -+ if (was_active || sd->next_active == entity) -+ ret = bfq_update_next_active(sd); ++ if (was_in_service || sd->next_in_service == entity) ++ ret = bfq_update_next_in_service(sd); + + if (!requeue || !bfq_gt(entity->finish, st->vtime)) + bfq_forget_entity(st, entity); + else + bfq_idle_insert(st, entity); + -+ BUG_ON(sd->active_entity == entity); -+ BUG_ON(sd->next_active == entity); ++ BUG_ON(sd->in_service_entity == entity); ++ BUG_ON(sd->next_in_service == entity); + + return ret; +} @@ -4853,11 +5819,11 @@ index 0000000..03f8061 + /* + * The parent entity is still backlogged, and + * we don't need to update it as it is still -+ * under service. ++ * in service. 
+ */ + break; + -+ if (sd->next_active != NULL) ++ if (sd->next_in_service != NULL) + /* + * The parent entity is still backlogged and + * the budgets on the path towards the root @@ -4880,7 +5846,7 @@ index 0000000..03f8061 + __bfq_activate_entity(entity); + + sd = entity->sched_data; -+ if (!bfq_update_next_active(sd)) ++ if (!bfq_update_next_in_service(sd)) + break; + } +} @@ -4894,7 +5860,7 @@ index 0000000..03f8061 + * active tree of the device is not empty. + * + * NOTE: this hierarchical implementation updates vtimes quite often, -+ * we may end up with reactivated tasks getting timestamps after a ++ * we may end up with reactivated processes getting timestamps after a + * vtime skip done because we needed a ->first_active entity on some + * intermediate node. + */ @@ -4911,13 +5877,14 @@ index 0000000..03f8061 +} + +/** -+ * bfq_first_active - find the eligible entity with the smallest finish time ++ * bfq_first_active_entity - find the eligible entity with ++ * the smallest finish time + * @st: the service tree to select from. + * + * This function searches the first schedulable entity, starting from the + * root of the tree and going on the left every time on this side there is -+ * a subtree with at least one eligible (start >= vtime) entity. The path -+ * on the right is followed only if a) the left subtree contains no eligible ++ * a subtree with at least one eligible (start >= vtime) entity. The path on ++ * the right is followed only if a) the left subtree contains no eligible + * entities and b) no eligible entity has been found yet. + */ +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) @@ -4960,7 +5927,7 @@ index 0000000..03f8061 +static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, + bool force) +{ -+ struct bfq_entity *entity, *new_next_active = NULL; ++ struct bfq_entity *entity, *new_next_in_service = NULL; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; @@ -4971,13 +5938,13 @@ index 0000000..03f8061 + + /* + * If the chosen entity does not match with the sched_data's -+ * next_active and we are forcedly serving the IDLE priority ++ * next_in_service and we are forcedly serving the IDLE priority + * class tree, bubble up budget update. + */ -+ if (unlikely(force && entity != entity->sched_data->next_active)) { -+ new_next_active = entity; -+ for_each_entity(new_next_active) -+ bfq_update_budget(new_next_active); ++ if (unlikely(force && entity != entity->sched_data->next_in_service)) { ++ new_next_in_service = entity; ++ for_each_entity(new_next_in_service) ++ bfq_update_budget(new_next_in_service); + } + + return entity; @@ -4988,9 +5955,9 @@ index 0000000..03f8061 + * @sd: the sched_data. + * @extract: if true the returned entity will be also extracted from @sd. + * -+ * NOTE: since we cache the next_active entity at each level of the ++ * NOTE: since we cache the next_in_service entity at each level of the + * hierarchy, the complexity of the lookup can be decreased with -+ * absolutely no effort just returning the cached next_active value; ++ * absolutely no effort just returning the cached next_in_service value; + * we prefer to do full lookups to test the consistency of * the data + * structures. 
+ */ @@ -5000,27 +5967,28 @@ index 0000000..03f8061 +{ + struct bfq_service_tree *st = sd->service_tree; + struct bfq_entity *entity; -+ int i=0; ++ int i = 0; + -+ BUG_ON(sd->active_entity != NULL); ++ BUG_ON(sd->in_service_entity != NULL); + + if (bfqd != NULL && + jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, ++ true); + if (entity != NULL) { + i = BFQ_IOPRIO_CLASSES - 1; + bfqd->bfq_class_idle_last_service = jiffies; -+ sd->next_active = entity; ++ sd->next_in_service = entity; + } + } + for (; i < BFQ_IOPRIO_CLASSES; i++) { + entity = __bfq_lookup_next_entity(st + i, false); + if (entity != NULL) { + if (extract) { -+ bfq_check_next_active(sd, entity); ++ bfq_check_next_in_service(sd, entity); + bfq_active_extract(st + i, entity); -+ sd->active_entity = entity; -+ sd->next_active = NULL; ++ sd->in_service_entity = entity; ++ sd->next_in_service = NULL; + } + break; + } @@ -5038,7 +6006,7 @@ index 0000000..03f8061 + struct bfq_sched_data *sd; + struct bfq_queue *bfqq; + -+ BUG_ON(bfqd->active_queue != NULL); ++ BUG_ON(bfqd->in_service_queue != NULL); + + if (bfqd->busy_queues == 0) + return NULL; @@ -5065,7 +6033,7 @@ index 0000000..03f8061 + struct bfq_entity *entity; + struct bfq_sched_data *sd; + -+ BUG_ON(bfqd->active_queue != NULL); ++ BUG_ON(bfqd->in_service_queue != NULL); + + entity = &bfqq->entity; + /* @@ -5076,22 +6044,22 @@ index 0000000..03f8061 + bfq_update_budget(entity); + bfq_update_vtime(bfq_entity_service_tree(entity)); + bfq_active_extract(bfq_entity_service_tree(entity), entity); -+ sd->active_entity = entity; -+ sd->next_active = NULL; ++ sd->in_service_entity = entity; ++ sd->next_in_service = NULL; + entity->service = 0; + } + + return; +} + -+static void __bfq_bfqd_reset_active(struct bfq_data *bfqd) ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) +{ -+ if (bfqd->active_bic != NULL) { -+ put_io_context(bfqd->active_bic->icq.ioc); -+ bfqd->active_bic = NULL; ++ if (bfqd->in_service_bic != NULL) { ++ put_io_context(bfqd->in_service_bic->icq.ioc); ++ bfqd->in_service_bic = NULL; + } + -+ bfqd->active_queue = NULL; ++ bfqd->in_service_queue = NULL; + del_timer(&bfqd->idle_slice_timer); +} + @@ -5100,8 +6068,8 @@ index 0000000..03f8061 +{ + struct bfq_entity *entity = &bfqq->entity; + -+ if (bfqq == bfqd->active_queue) -+ __bfq_bfqd_reset_active(bfqd); ++ if (bfqq == bfqd->in_service_queue) ++ __bfq_bfqd_reset_in_service(bfqd); + + bfq_deactivate_entity(entity, requeue); +} @@ -5130,6 +6098,22 @@ index 0000000..03f8061 + BUG_ON(bfqd->busy_queues == 0); + bfqd->busy_queues--; + ++ if (!bfqq->dispatched) { ++ bfq_weights_tree_remove(bfqd, &bfqq->entity, ++ &bfqd->queue_weights_tree); ++ if (!blk_queue_nonrot(bfqd->queue)) { ++ BUG_ON(!bfqd->busy_in_flight_queues); ++ bfqd->busy_in_flight_queues--; ++ if (bfq_bfqq_constantly_seeky(bfqq)) { ++ BUG_ON(!bfqd-> ++ const_seeky_busy_in_flight_queues); ++ bfqd->const_seeky_busy_in_flight_queues--; ++ } ++ } ++ } ++ if (bfqq->wr_coeff > 1) ++ bfqd->wr_busy_queues--; ++ + bfq_deactivate_bfqq(bfqd, bfqq, requeue); +} + @@ -5139,7 +6123,7 @@ index 0000000..03f8061 +static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + BUG_ON(bfq_bfqq_busy(bfqq)); -+ BUG_ON(bfqq == bfqd->active_queue); ++ BUG_ON(bfqq == bfqd->in_service_queue); + + bfq_log_bfqq(bfqd, bfqq, "add to busy"); + @@ -5147,15 +6131,28 @@ index 0000000..03f8061 + + 
bfq_mark_bfqq_busy(bfqq); + bfqd->busy_queues++; ++ ++ if (!bfqq->dispatched) { ++ if (bfqq->wr_coeff == 1) ++ bfq_weights_tree_add(bfqd, &bfqq->entity, ++ &bfqd->queue_weights_tree); ++ if (!blk_queue_nonrot(bfqd->queue)) { ++ bfqd->busy_in_flight_queues++; ++ if (bfq_bfqq_constantly_seeky(bfqq)) ++ bfqd->const_seeky_busy_in_flight_queues++; ++ } ++ } ++ if (bfqq->wr_coeff > 1) ++ bfqd->wr_busy_queues++; +} diff --git a/block/bfq.h b/block/bfq.h new file mode 100644 -index 0000000..48ecde9 +index 0000000..84c7861 --- /dev/null +++ b/block/bfq.h -@@ -0,0 +1,603 @@ +@@ -0,0 +1,773 @@ +/* -+ * BFQ-v6r2 for 3.10.0: data structures and common functions prototypes. ++ * BFQ-v7r7 for 3.10.0: data structures and common functions prototypes. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> @@ -5175,11 +6172,13 @@ index 0000000..48ecde9 +#include <linux/rbtree.h> + +#define BFQ_IOPRIO_CLASSES 3 -+#define BFQ_CL_IDLE_TIMEOUT HZ/5 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5) + +#define BFQ_MIN_WEIGHT 1 +#define BFQ_MAX_WEIGHT 1000 + ++#define BFQ_DEFAULT_QUEUE_IOPRIO 4 ++ +#define BFQ_DEFAULT_GRP_WEIGHT 10 +#define BFQ_DEFAULT_GRP_IOPRIO 0 +#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE @@ -5213,15 +6212,15 @@ index 0000000..48ecde9 + +/** + * struct bfq_sched_data - multi-class scheduler. -+ * @active_entity: entity under service. -+ * @next_active: head-of-the-line entity in the scheduler. ++ * @in_service_entity: entity in service. ++ * @next_in_service: head-of-the-line entity in the scheduler. + * @service_tree: array of service trees, one per ioprio_class. + * + * bfq_sched_data is the basic scheduler queue. It supports three + * ioprio_classes, and can be used either as a toplevel queue or as + * an intermediate queue on a hierarchical setup. -+ * @next_active points to the active entity of the sched_data service -+ * trees that will be scheduled next. ++ * @next_in_service points to the active entity of the sched_data ++ * service trees that will be scheduled next. + * + * The supported ioprio_classes are the same as in CFQ, in descending + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. @@ -5231,14 +6230,29 @@ index 0000000..48ecde9 + * All the fields are protected by the queue lock of the containing bfqd. + */ +struct bfq_sched_data { -+ struct bfq_entity *active_entity; -+ struct bfq_entity *next_active; ++ struct bfq_entity *in_service_entity; ++ struct bfq_entity *next_in_service; + struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; +}; + +/** ++ * struct bfq_weight_counter - counter of the number of all active entities ++ * with a given weight. ++ * @weight: weight of the entities that this counter refers to. ++ * @num_active: number of active entities with this weight. ++ * @weights_node: weights tree member (see bfq_data's @queue_weights_tree ++ * and @group_weights_tree). ++ */ ++struct bfq_weight_counter { ++ short int weight; ++ unsigned int num_active; ++ struct rb_node weights_node; ++}; ++ ++/** + * struct bfq_entity - schedulable entity. + * @rb_node: service_tree member. ++ * @weight_counter: pointer to the weight counter associated with this entity. + * @on_st: flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree). + * @finish: B-WF2Q+ finish timestamp (aka F_i). 
@@ -5289,6 +6303,7 @@ index 0000000..48ecde9 + */ +struct bfq_entity { + struct rb_node rb_node; ++ struct bfq_weight_counter *weight_counter; + + int on_st; + @@ -5334,22 +6349,40 @@ index 0000000..48ecde9 + * @max_budget: maximum budget allowed from the feedback mechanism. + * @budget_timeout: budget expiration (in jiffies). + * @dispatched: number of requests on the dispatch list or inside driver. -+ * @org_ioprio: saved ioprio during boosted periods. + * @flags: status flags. + * @bfqq_list: node for active/idle bfqq list inside our bfqd. ++ * @burst_list_node: node for the device's burst list. + * @seek_samples: number of seeks sampled + * @seek_total: sum of the distances of the seeks sampled + * @seek_mean: mean seek distance + * @last_request_pos: position of the last request enqueued ++ * @requests_within_timer: number of consecutive pairs of request completion ++ * and arrival, such that the queue becomes idle ++ * after the completion, but the next request arrives ++ * within an idle time slice; used only if the queue's ++ * IO_bound has been cleared. + * @pid: pid of the process owning the queue, used for logging purposes. -+ * @last_rais_start_time: last (idle -> weight-raised) transition attempt -+ * @raising_cur_max_time: current max raising time for this queue ++ * @last_wr_start_finish: start time of the current weight-raising period if ++ * the @bfq-queue is being weight-raised, otherwise ++ * finish time of the last weight-raising period ++ * @wr_cur_max_time: current max raising time for this queue ++ * @soft_rt_next_start: minimum time instant such that, only if a new ++ * request is enqueued after this time instant in an ++ * idle @bfq_queue with no outstanding requests, then ++ * the task associated with the queue it is deemed as ++ * soft real-time (see the comments to the function ++ * bfq_bfqq_softrt_next_start()). ++ * @last_idle_bklogged: time of the last transition of the @bfq_queue from ++ * idle to backlogged ++ * @service_from_backlogged: cumulative service received from the @bfq_queue ++ * since the last transition from idle to ++ * backlogged + * -+ * A bfq_queue is a leaf request queue; it can be associated to an io_context -+ * or more (if it is an async one). @cgroup holds a reference to the -+ * cgroup, to be sure that it does not disappear while a bfqq still -+ * references it (mostly to avoid races between request issuing and task -+ * migration followed by cgroup distruction). ++ * A bfq_queue is a leaf request queue; it can be associated with an io_context ++ * or more, if it is async or shared between cooperating processes. @cgroup ++ * holds a reference to the cgroup, to be sure that it does not disappear while ++ * a bfqq still references it (mostly to avoid races between request issuing and ++ * task migration followed by cgroup destruction). + * All the fields are protected by the queue lock of the containing bfqd. 
+ */ +struct bfq_queue { @@ -5375,23 +6408,28 @@ index 0000000..48ecde9 + + int dispatched; + -+ unsigned short org_ioprio; -+ + unsigned int flags; + + struct list_head bfqq_list; + ++ struct hlist_node burst_list_node; ++ + unsigned int seek_samples; + u64 seek_total; + sector_t seek_mean; + sector_t last_request_pos; + ++ unsigned int requests_within_timer; ++ + pid_t pid; + + /* weight-raising fields */ -+ unsigned int raising_cur_max_time; -+ u64 last_rais_start_finish, soft_rt_next_start; -+ unsigned int raising_coeff; ++ unsigned long wr_cur_max_time; ++ unsigned long soft_rt_next_start; ++ unsigned long last_wr_start_finish; ++ unsigned int wr_coeff; ++ unsigned long last_idle_bklogged; ++ unsigned long service_from_backlogged; +}; + +/** @@ -5421,34 +6459,82 @@ index 0000000..48ecde9 + int ioprio; +}; + ++enum bfq_device_speed { ++ BFQ_BFQD_FAST, ++ BFQ_BFQD_SLOW, ++}; ++ +/** + * struct bfq_data - per device data structure. + * @queue: request queue for the managed device. + * @root_group: root bfq_group for the device. -+ * @rq_pos_tree: rbtree sorted by next_request position, -+ * used when determining if two or more queues -+ * have interleaving requests (see bfq_close_cooperator). ++ * @rq_pos_tree: rbtree sorted by next_request position, used when ++ * determining if two or more queues have interleaving ++ * requests (see bfq_close_cooperator()). ++ * @active_numerous_groups: number of bfq_groups containing more than one ++ * active @bfq_entity. ++ * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by ++ * weight. Used to keep track of whether all @bfq_queues ++ * have the same weight. The tree contains one counter ++ * for each distinct weight associated to some active ++ * and not weight-raised @bfq_queue (see the comments to ++ * the functions bfq_weights_tree_[add|remove] for ++ * further details). ++ * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted ++ * by weight. Used to keep track of whether all ++ * @bfq_groups have the same weight. The tree contains ++ * one counter for each distinct weight associated to ++ * some active @bfq_group (see the comments to the ++ * functions bfq_weights_tree_[add|remove] for further ++ * details). + * @busy_queues: number of bfq_queues containing requests (including the -+ * queue under service, even if it is idling). ++ * queue in service, even if it is idling). ++ * @busy_in_flight_queues: number of @bfq_queues containing pending or ++ * in-flight requests, plus the @bfq_queue in ++ * service, even if idle but waiting for the ++ * possible arrival of its next sync request. This ++ * field is updated only if the device is rotational, ++ * but used only if the device is also NCQ-capable. ++ * The reason why the field is updated also for non- ++ * NCQ-capable rotational devices is related to the ++ * fact that the value of @hw_tag may be set also ++ * later than when busy_in_flight_queues may need to ++ * be incremented for the first time(s). Taking also ++ * this possibility into account, to avoid unbalanced ++ * increments/decrements, would imply more overhead ++ * than just updating busy_in_flight_queues ++ * regardless of the value of @hw_tag. ++ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues ++ * (that is, seeky queues that expired ++ * for budget timeout at least once) ++ * containing pending or in-flight ++ * requests, including the in-service ++ * @bfq_queue if constantly seeky. 
This ++ * field is updated only if the device ++ * is rotational, but used only if the ++ * device is also NCQ-capable (see the ++ * comments to @busy_in_flight_queues). ++ * @wr_busy_queues: number of weight-raised busy @bfq_queues. + * @queued: number of queued requests. + * @rq_in_driver: number of requests dispatched and waiting for completion. + * @sync_flight: number of sync requests in the driver. -+ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples -+ * completed requests . ++ * @max_rq_in_driver: max number of reqs in driver in the last ++ * @hw_tag_samples completed requests. + * @hw_tag_samples: nr of samples used to calculate hw_tag. + * @hw_tag: flag set to one if the driver is showing a queueing behavior. + * @budgets_assigned: number of budgets assigned. + * @idle_slice_timer: timer set when idling for the next sequential request -+ * from the queue under service. ++ * from the queue in service. + * @unplug_work: delayed work to restart dispatching on the request queue. -+ * @active_queue: bfq_queue under service. -+ * @active_bic: bfq_io_cq (bic) associated with the @active_queue. ++ * @in_service_queue: bfq_queue in service. ++ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. + * @last_position: on-disk position of the last served request. + * @last_budget_start: beginning of the last budget. + * @last_idling_start: beginning of the last idle slice. + * @peak_rate: peak transfer rate observed for a budget. + * @peak_rate_samples: number of samples used to calculate @peak_rate. -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling. ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before ++ * rescheduling. + * @group_list: list of all the bfq_groups active on the device. + * @active_list: list of all the bfq_queues active on the device. + * @idle_list: list of all the bfq_queues idle on the device. @@ -5458,7 +6544,8 @@ index 0000000..48ecde9 + * @bfq_back_penalty: weight of backward seeks wrt forward ones. + * @bfq_back_max: maximum allowed backward seek. + * @bfq_slice_idle: maximum idling time. -+ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning). ++ * @bfq_user_max_budget: user-configured max budget value ++ * (0 for auto-tuning). + * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to + * async queues. + * @bfq_timeout: timeout for bfq_queues to consume their budget; used to @@ -5468,21 +6555,49 @@ index 0000000..48ecde9 + * they are charged for the whole allocated budget, to try + * to preserve a behavior reasonably fair among them, but + * without service-domain guarantees). -+ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted -+ * queue is multiplied -+ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies) -+ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes -+ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising -+ * may be reactivated for a queue (in jiffies) -+ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals -+ * after which weight-raising may be -+ * reactivated for an already busy queue -+ * (in jiffies) -+ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue, -+ * sectors per seconds ++ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is ++ * no more granted any weight-raising. 
++ * @bfq_failed_cooperations: number of consecutive failed cooperation ++ * chances after which weight-raising is restored ++ * to a queue subject to more than bfq_coop_thresh ++ * queue merges. ++ * @bfq_requests_within_timer: number of consecutive requests that must be ++ * issued within the idle time slice to set ++ * again idling to a queue which was marked as ++ * non-I/O-bound (see the definition of the ++ * IO_bound flag for further details). ++ * @last_ins_in_burst: last time at which a queue entered the current ++ * burst of queues being activated shortly after ++ * each other; for more details about this and the ++ * following parameters related to a burst of ++ * activations, see the comments to the function ++ * @bfq_handle_burst. ++ * @bfq_burst_interval: reference time interval used to decide whether a ++ * queue has been activated shortly after ++ * @last_ins_in_burst. ++ * @burst_size: number of queues in the current burst of queue activations. ++ * @bfq_large_burst_thresh: maximum burst size above which the current ++ * queue-activation burst is deemed as 'large'. ++ * @large_burst: true if a large queue-activation burst is in progress. ++ * @burst_list: head of the burst list (as for the above fields, more details ++ * in the comments to the function bfq_handle_burst). ++ * @low_latency: if set to true, low-latency heuristics are enabled. ++ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised ++ * queue is multiplied. ++ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). ++ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. ++ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising ++ * may be reactivated for a queue (in jiffies). ++ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals ++ * after which weight-raising may be ++ * reactivated for an already busy queue ++ * (in jiffies). ++ * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, ++ * sectors per seconds. + * @RT_prod: cached value of the product R*T used for computing the maximum -+ * duration of the weight raising automatically -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions ++ * duration of the weight raising automatically. ++ * @device_speed: device-speed class for the low-latency heuristic. ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. + * + * All the fields are protected by the @queue lock. 
+ */ @@ -5490,10 +6605,19 @@ index 0000000..48ecde9 + struct request_queue *queue; + + struct bfq_group *root_group; -+ + struct rb_root rq_pos_tree; + ++#ifdef CONFIG_CGROUP_BFQIO ++ int active_numerous_groups; ++#endif ++ ++ struct rb_root queue_weights_tree; ++ struct rb_root group_weights_tree; ++ + int busy_queues; ++ int busy_in_flight_queues; ++ int const_seeky_busy_in_flight_queues; ++ int wr_busy_queues; + int queued; + int rq_in_driver; + int sync_flight; @@ -5507,8 +6631,8 @@ index 0000000..48ecde9 + struct timer_list idle_slice_timer; + struct work_struct unplug_work; + -+ struct bfq_queue *active_queue; -+ struct bfq_io_cq *active_bic; ++ struct bfq_queue *in_service_queue; ++ struct bfq_io_cq *in_service_bic; + + sector_t last_position; + @@ -5533,22 +6657,34 @@ index 0000000..48ecde9 + unsigned int bfq_max_budget_async_rq; + unsigned int bfq_timeout[2]; + ++ unsigned int bfq_coop_thresh; ++ unsigned int bfq_failed_cooperations; ++ unsigned int bfq_requests_within_timer; ++ ++ unsigned long last_ins_in_burst; ++ unsigned long bfq_burst_interval; ++ int burst_size; ++ unsigned long bfq_large_burst_thresh; ++ bool large_burst; ++ struct hlist_head burst_list; ++ + bool low_latency; + + /* parameters of the low_latency heuristics */ -+ unsigned int bfq_raising_coeff; -+ unsigned int bfq_raising_max_time; -+ unsigned int bfq_raising_rt_max_time; -+ unsigned int bfq_raising_min_idle_time; -+ unsigned int bfq_raising_min_inter_arr_async; -+ unsigned int bfq_raising_max_softrt_rate; ++ unsigned int bfq_wr_coeff; ++ unsigned int bfq_wr_max_time; ++ unsigned int bfq_wr_rt_max_time; ++ unsigned int bfq_wr_min_idle_time; ++ unsigned long bfq_wr_min_inter_arr_async; ++ unsigned int bfq_wr_max_softrt_rate; + u64 RT_prod; ++ enum bfq_device_speed device_speed; + + struct bfq_queue oom_bfqq; +}; + +enum bfqq_state_flags { -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ + BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ + BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ + BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ @@ -5556,9 +6692,25 @@ index 0000000..48ecde9 + BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ + BFQ_BFQQ_FLAG_sync, /* synchronous queue */ + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ ++ BFQ_BFQQ_FLAG_IO_bound, /* ++ * bfqq has timed-out at least once ++ * having consumed at most 2/10 of ++ * its budget ++ */ ++ BFQ_BFQQ_FLAG_in_large_burst, /* ++ * bfqq activated in a large burst, ++ * see comments to bfq_handle_burst. ++ */ ++ BFQ_BFQQ_FLAG_constantly_seeky, /* ++ * bfqq has proved to be slow and ++ * seeky until budget timeout ++ */ ++ BFQ_BFQQ_FLAG_softrt_update, /* ++ * may need softrt-next-start ++ * update ++ */ + BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ + BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */ -+ BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */ +}; + +#define BFQ_BFQQ_FNS(name) \ @@ -5583,9 +6735,12 @@ index 0000000..48ecde9 +BFQ_BFQQ_FNS(prio_changed); +BFQ_BFQQ_FNS(sync); +BFQ_BFQQ_FNS(budget_new); ++BFQ_BFQQ_FNS(IO_bound); ++BFQ_BFQQ_FNS(in_large_burst); ++BFQ_BFQQ_FNS(constantly_seeky); +BFQ_BFQQ_FNS(coop); +BFQ_BFQQ_FNS(split_coop); -+BFQ_BFQQ_FNS(some_coop_idle); ++BFQ_BFQQ_FNS(softrt_update); +#undef BFQ_BFQQ_FNS + +/* Logging facilities. */ @@ -5597,7 +6752,10 @@ index 0000000..48ecde9 + +/* Expiration reasons. 
*/ +enum bfqq_expiration { -+ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */ ++ BFQ_BFQQ_TOO_IDLE = 0, /* ++ * queue has been idling for ++ * too long ++ */ + BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ + BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ + BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ @@ -5619,7 +6777,13 @@ index 0000000..48ecde9 + * except for the idle class that has only one queue. + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). + * @my_entity: pointer to @entity, %NULL for the toplevel group; used -+ * to avoid too many special cases during group creation/migration. ++ * to avoid too many special cases during group creation/ ++ * migration. ++ * @active_entities: number of active entities belonging to the group; ++ * unused for the root group. Used to know whether there ++ * are groups with more than one active @bfq_entity ++ * (see the comments to the function ++ * bfq_bfqq_must_not_expire()). + * + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup + * there is a set of bfq_groups, each one collecting the lower-level @@ -5645,6 +6809,8 @@ index 0000000..48ecde9 + struct bfq_queue *async_idle_bfqq; + + struct bfq_entity *my_entity; ++ ++ int active_entities; +}; + +/** @@ -5689,15 +6855,15 @@ index 0000000..48ecde9 +} + +static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, -+ int is_sync) ++ bool is_sync) +{ -+ return bic->bfqq[!!is_sync]; ++ return bic->bfqq[is_sync]; +} + +static inline void bic_set_bfqq(struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, int is_sync) ++ struct bfq_queue *bfqq, bool is_sync) +{ -+ bic->bfqq[!!is_sync] = bfqq; ++ bic->bfqq[is_sync] = bfqq; +} + +static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) @@ -5752,24 +6918,12 @@ index 0000000..48ecde9 +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bfq_group *bfqg, int is_sync, + struct bfq_io_cq *bic, gfp_t gfp_mask); -+static void bfq_end_raising_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg); ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg); +static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -+#endif -diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h -index ffa1d1f..e5e6b0d 100644 ---- a/include/linux/cgroup_subsys.h -+++ b/include/linux/cgroup_subsys.h -@@ -85,7 +85,7 @@ SUBSYS(bcache) - - /* */ - --#ifdef CONFIG_CGROUP_BFQIO -+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BFQIO) - SUBSYS(bfqio) - #endif - ++ ++#endif /* _BFQ_H */ -- -1.8.1.4 +2.1.3 diff --git a/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.10.patch1 b/5003_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r7-for-3.10.patch1 index 5c8fdd35..dab6ad07 100644 --- a/5000_BFQ-3-block-add-Early-Queue-Merge-EQM-v6r2-for-3.10.patch1 +++ b/5003_BFQ-3-block-add-Early-Queue-Merge-EQM-v7r7-for-3.10.patch1 @@ -1,52 +1,58 @@ -From 9204dcb026a40cd2cb4310fecf788924d0fbec8d Mon Sep 17 00:00:00 2001 +From 7f87efae9622a8be88fa7e7a705ea26a1342ea47 Mon Sep 17 00:00:00 2001 From: Mauro Andreolini <mauro.andreolini@unimore.it> -Date: Fri, 14 Jun 2013 13:46:47 +0200 -Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v6r2 for +Date: Fri, 19 Dec 2014 20:09:48 +0100 +Subject: [PATCH 3/3] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r7 for 3.10.0 -A set of processes may happen to perform interleaved reads, 
i.e., requests -whose union would give rise to a sequential read pattern. There are two -typical cases: in the first case, processes read fixed-size chunks of +A set of processes may happen to perform interleaved reads, i.e.,requests +whose union would give rise to a sequential read pattern. There are two +typical cases: in the first case, processes read fixed-size chunks of data at a fixed distance from each other, while in the second case processes may read variable-size chunks at variable distances. The latter case occurs -for example with KVM, which splits the I/O generated by the guest into +for example with QEMU, which splits the I/O generated by the guest into multiple chunks, and lets these chunks be served by a pool of cooperating -processes, iteratively assigning the next chunk of I/O to the first -available process. CFQ uses actual queue merging for the first type of -processes, whereas it uses preemption to get a sequential read pattern out +processes, iteratively assigning the next chunk of I/O to the first +available process. CFQ uses actual queue merging for the first type of +rocesses, whereas it uses preemption to get a sequential read pattern out of the read requests performed by the second type of processes. In the end -it uses two different mechanisms to achieve the same goal: boosting the +it uses two different mechanisms to achieve the same goal: boosting the throughput with interleaved I/O. This patch introduces Early Queue Merge (EQM), a unified mechanism to get a -sequential read pattern with both types of processes. The main idea is +sequential read pattern with both types of processes. The main idea is checking newly arrived requests against the next request of the active queue both in case of actual request insert and in case of request merge. By doing so, both the types of processes can be handled by just merging their queues. -EQM is then simpler and more compact than the pair of mechanisms used in +EQM is then simpler and more compact than the pair of mechanisms used in CFQ. -Finally, EQM also preserves the typical low-latency properties of BFQ, by +Finally, EQM also preserves the typical low-latency properties of BFQ, by properly restoring the weight-raising state of a queue when it gets back to a non-merged state. Signed-off-by: Mauro Andreolini <mauro.andreolini@unimore.it> Signed-off-by: Arianna Avanzini <avanzini.arianna@gmail.com> -Reviewed-by: Paolo Valente <paolo.valente@unimore.it> +Signed-off-by: Paolo Valente <paolo.valente@unimore.it> --- - block/bfq-iosched.c | 653 ++++++++++++++++++++++++++++++++++++---------------- - block/bfq-sched.c | 28 --- - block/bfq.h | 16 ++ - 3 files changed, 466 insertions(+), 231 deletions(-) + block/bfq-iosched.c | 751 +++++++++++++++++++++++++++++++++++++--------------- + block/bfq-sched.c | 28 -- + block/bfq.h | 54 +++- + 3 files changed, 581 insertions(+), 252 deletions(-) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index b230927..bc57923 100644 +index 9de51e3..e0d5c63 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c -@@ -444,6 +444,43 @@ static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) +@@ -571,6 +571,57 @@ static inline unsigned int bfq_wr_duration(struct bfq_data *bfqd) return dur; } ++static inline unsigned ++bfq_bfqq_cooperations(struct bfq_queue *bfqq) ++{ ++ return bfqq->bic ? 
bfqq->bic->cooperations : 0; ++} ++ +static inline void +bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +{ @@ -54,26 +60,34 @@ index b230927..bc57923 100644 + bfq_mark_bfqq_idle_window(bfqq); + else + bfq_clear_bfqq_idle_window(bfqq); -+ if (bic->raising_time_left && bfqq->bfqd->low_latency) { ++ if (bic->saved_IO_bound) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ else ++ bfq_clear_bfqq_IO_bound(bfqq); ++ /* Assuming that the flag in_large_burst is already correctly set */ ++ if (bic->wr_time_left && bfqq->bfqd->low_latency && ++ !bfq_bfqq_in_large_burst(bfqq) && ++ bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { + /* + * Start a weight raising period with the duration given by + * the raising_time_left snapshot. + */ -+ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff; -+ bfqq->raising_cur_max_time = bic->raising_time_left; -+ bfqq->last_rais_start_finish = jiffies; ++ if (bfq_bfqq_busy(bfqq)) ++ bfqq->bfqd->wr_busy_queues++; ++ bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bic->wr_time_left; ++ bfqq->last_wr_start_finish = jiffies; ++ bfqq->entity.ioprio_changed = 1; + } + /* -+ * Clear raising_time_left to prevent bfq_bfqq_save_state() from ++ * Clear wr_time_left to prevent bfq_bfqq_save_state() from + * getting confused about the queue's need of a weight-raising + * period. + */ -+ bic->raising_time_left = 0; ++ bic->wr_time_left = 0; +} + -+/* -+ * Must be called with the queue_lock held. -+ */ ++/* Must be called with the queue_lock held. */ +static int bfqq_process_refs(struct bfq_queue *bfqq) +{ + int process_refs, io_refs; @@ -84,11 +98,36 @@ index b230927..bc57923 100644 + return process_refs; +} + - static void bfq_add_rq_rb(struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq); -@@ -483,11 +520,20 @@ static void bfq_add_rq_rb(struct request *rq) - if (! bfqd->low_latency) + /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ + static inline void bfq_reset_burst_list(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +@@ -815,7 +866,7 @@ static void bfq_add_request(struct request *rq) + bfq_rq_pos_tree_add(bfqd, bfqq); + + if (!bfq_bfqq_busy(bfqq)) { +- bool soft_rt, ++ bool soft_rt, coop_or_in_burst, + idle_for_long_time = time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); +@@ -839,11 +890,12 @@ static void bfq_add_request(struct request *rq) + bfqd->last_ins_in_burst = jiffies; + } + ++ coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || ++ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && +- !bfq_bfqq_in_large_burst(bfqq) && ++ !coop_or_in_burst && + time_is_before_jiffies(bfqq->soft_rt_next_start); +- interactive = !bfq_bfqq_in_large_burst(bfqq) && +- idle_for_long_time; ++ interactive = !coop_or_in_burst && idle_for_long_time; + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + +@@ -862,11 +914,20 @@ static void bfq_add_request(struct request *rq) + if (!bfqd->low_latency) goto add_bfqq_busy; + if (bfq_bfqq_just_split(bfqq)) @@ -105,21 +144,61 @@ index b230927..bc57923 100644 + * requests have not been redirected to a shared queue) + * start a weight-raising period. 
*/ -- if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt)) { -+ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt) && -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { - bfqq->raising_coeff = bfqd->bfq_raising_coeff; - if (idle_for_long_time) - bfqq->raising_cur_max_time = -@@ -517,6 +563,7 @@ static void bfq_add_rq_rb(struct request *rq) - raising_cur_max_time)); - } +- if (old_wr_coeff == 1 && (interactive || soft_rt)) { ++ if (old_wr_coeff == 1 && (interactive || soft_rt) && ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + if (interactive) + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); +@@ -880,7 +941,7 @@ static void bfq_add_request(struct request *rq) + } else if (old_wr_coeff > 1) { + if (interactive) + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); +- else if (bfq_bfqq_in_large_burst(bfqq) || ++ else if (coop_or_in_burst || + (bfqq->wr_cur_max_time == + bfqd->bfq_wr_rt_max_time && + !soft_rt)) { +@@ -899,18 +960,18 @@ static void bfq_add_request(struct request *rq) + /* + * + * The remaining weight-raising time is lower +- * than bfqd->bfq_wr_rt_max_time, which +- * means that the application is enjoying +- * weight raising either because deemed soft- +- * rt in the near past, or because deemed +- * interactive a long ago. In both cases, +- * resetting now the current remaining weight- +- * raising time for the application to the +- * weight-raising duration for soft rt +- * applications would not cause any latency +- * increase for the application (as the new +- * duration would be higher than the remaining +- * time). ++ * than bfqd->bfq_wr_rt_max_time, which means ++ * that the application is enjoying weight ++ * raising either because deemed soft-rt in ++ * the near past, or because deemed interactive ++ * a long ago. ++ * In both cases, resetting now the current ++ * remaining weight-raising time for the ++ * application to the weight-raising duration ++ * for soft rt applications would not cause any ++ * latency increase for the application (as the ++ * new duration would be higher than the ++ * remaining time). + * + * In addition, the application is now meeting + * the requirements for being deemed soft rt. +@@ -945,6 +1006,7 @@ static void bfq_add_request(struct request *rq) + bfqd->bfq_wr_rt_max_time; + } } +set_ioprio_changed: - if (old_raising_coeff != bfqq->raising_coeff) + if (old_wr_coeff != bfqq->wr_coeff) entity->ioprio_changed = 1; add_bfqq_busy: -@@ -695,89 +742,35 @@ static void bfq_end_raising(struct bfq_data *bfqd) +@@ -1156,90 +1218,35 @@ static void bfq_end_wr(struct bfq_data *bfqd) spin_unlock_irq(bfqd->queue->queue_lock); } @@ -150,8 +229,8 @@ index b230927..bc57923 100644 - return bfqq == RQ_BFQQ(rq); -} - --static void __bfq_set_active_queue(struct bfq_data *bfqd, -- struct bfq_queue *bfqq) +-static void __bfq_set_in_service_queue(struct bfq_data *bfqd, +- struct bfq_queue *bfqq) -{ - if (bfqq != NULL) { - bfq_mark_bfqq_must_alloc(bfqq); @@ -160,18 +239,19 @@ index b230927..bc57923 100644 - - bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; - -- bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu", +- bfq_log_bfqq(bfqd, bfqq, +- "set_in_service_queue, cur-budget = %lu", - bfqq->entity.budget); - } - -- bfqd->active_queue = bfqq; +- bfqd->in_service_queue = bfqq; -} - -/* -- * Get and set a new active queue for service. +- * Get and set a new queue for service. 
- */ --static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd, -- struct bfq_queue *bfqq) +-static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd, +- struct bfq_queue *bfqq) -{ - if (!bfqq) - bfqq = bfq_get_next_queue(bfqd); @@ -180,7 +260,7 @@ index b230927..bc57923 100644 else - bfq_get_next_queue_forced(bfqd, bfqq); - -- __bfq_set_active_queue(bfqd, bfqq); +- __bfq_set_in_service_queue(bfqd, bfqq); - return bfqq; + return ((struct bio *)io_struct)->bi_sector; } @@ -223,8 +303,8 @@ index b230927..bc57923 100644 if (RB_EMPTY_ROOT(root)) return NULL; -@@ -796,7 +789,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) - * position). +@@ -1258,7 +1265,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) + * next_request position). */ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); - if (bfq_rq_close(bfqd, __bfqq->next_rq)) @@ -232,7 +312,7 @@ index b230927..bc57923 100644 return __bfqq; if (blk_rq_pos(__bfqq->next_rq) < sector) -@@ -807,7 +800,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) +@@ -1269,7 +1276,7 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) return NULL; __bfqq = rb_entry(node, struct bfq_queue, pos_node); @@ -241,7 +321,7 @@ index b230927..bc57923 100644 return __bfqq; return NULL; -@@ -816,14 +809,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) +@@ -1278,14 +1285,12 @@ static struct bfq_queue *bfqq_close(struct bfq_data *bfqd) /* * bfqd - obvious * cur_bfqq - passed in so that we don't decide that the current queue @@ -260,7 +340,7 @@ index b230927..bc57923 100644 { struct bfq_queue *bfqq; -@@ -843,7 +834,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, +@@ -1305,7 +1310,7 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, * working closely on the same area of the disk. In that case, * we can group them together and don't waste time idling. */ @@ -269,7 +349,7 @@ index b230927..bc57923 100644 if (bfqq == NULL || bfqq == cur_bfqq) return NULL; -@@ -870,6 +861,275 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, +@@ -1332,6 +1337,315 @@ static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, return bfqq; } @@ -308,23 +388,26 @@ index b230927..bc57923 100644 + new_bfqq->pid); + + /* -+ * Merging is just a redirection: the requests of the process owning -+ * one of the two queues are redirected to the other queue. The latter -+ * queue, in its turn, is set as shared if this is the first time that -+ * the requests of some process are redirected to it. ++ * Merging is just a redirection: the requests of the process ++ * owning one of the two queues are redirected to the other queue. ++ * The latter queue, in its turn, is set as shared if this is the ++ * first time that the requests of some process are redirected to ++ * it. + * + * We redirect bfqq to new_bfqq and not the opposite, because we -+ * are in the context of the process owning bfqq, hence we have the -+ * io_cq of this process. So we can immediately configure this io_cq -+ * to redirect the requests of the process to new_bfqq. ++ * are in the context of the process owning bfqq, hence we have ++ * the io_cq of this process. So we can immediately configure this ++ * io_cq to redirect the requests of the process to new_bfqq. 
+ * -+ * NOTE, even if new_bfqq coincides with the active queue, the io_cq of -+ * new_bfqq is not available, because, if the active queue is shared, -+ * bfqd->active_bic may not point to the io_cq of the active queue. -+ * Redirecting the requests of the process owning bfqq to the currently -+ * active queue is in any case the best option, as we feed the active queue -+ * with new requests close to the last request served and, by doing so, -+ * hopefully increase the throughput. ++ * NOTE, even if new_bfqq coincides with the in-service queue, the ++ * io_cq of new_bfqq is not available, because, if the in-service ++ * queue is shared, bfqd->in_service_bic may not point to the ++ * io_cq of the in-service queue. ++ * Redirecting the requests of the process owning bfqq to the ++ * currently in-service queue is in any case the best option, as ++ * we feed the in-service queue with new requests close to the ++ * last request served and, by doing so, hopefully increase the ++ * throughput. + */ + bfqq->new_bfqq = new_bfqq; + atomic_add(process_refs, &new_bfqq->ref); @@ -332,41 +415,52 @@ index b230927..bc57923 100644 +} + +/* -+ * Attempt to schedule a merge of bfqq with the currently active queue or -+ * with a close queue among the scheduled queues. ++ * Attempt to schedule a merge of bfqq with the currently in-service queue ++ * or with a close queue among the scheduled queues. + * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue + * structure otherwise. ++ * ++ * The OOM queue is not allowed to participate to cooperation: in fact, since ++ * the requests temporarily redirected to the OOM queue could be redirected ++ * again to dedicated queues at any time, the state needed to correctly ++ * handle merging with the OOM queue would be quite complex and expensive ++ * to maintain. Besides, in such a critical condition as an out of memory, ++ * the benefits of queue merging may be little relevant, or even negligible. 
+ */ +static struct bfq_queue * +bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + void *io_struct, bool request) +{ -+ struct bfq_queue *active_bfqq, *new_bfqq; ++ struct bfq_queue *in_service_bfqq, *new_bfqq; + + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + -+ if (!io_struct) ++ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) + return NULL; + -+ active_bfqq = bfqd->active_queue; ++ in_service_bfqq = bfqd->in_service_queue; + -+ if (active_bfqq == NULL || active_bfqq == bfqq || !bfqd->active_bic) ++ if (in_service_bfqq == NULL || in_service_bfqq == bfqq || ++ !bfqd->in_service_bic || ++ unlikely(in_service_bfqq == &bfqd->oom_bfqq)) + goto check_scheduled; + -+ if (bfq_class_idle(active_bfqq) || bfq_class_idle(bfqq)) ++ if (bfq_class_idle(in_service_bfqq) || bfq_class_idle(bfqq)) + goto check_scheduled; + -+ if (bfq_class_rt(active_bfqq) != bfq_class_rt(bfqq)) ++ if (bfq_class_rt(in_service_bfqq) != bfq_class_rt(bfqq)) + goto check_scheduled; + -+ if (active_bfqq->entity.parent != bfqq->entity.parent) ++ if (in_service_bfqq->entity.parent != bfqq->entity.parent) + goto check_scheduled; + + if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ bfq_bfqq_sync(active_bfqq) && bfq_bfqq_sync(bfqq)) -+ if ((new_bfqq = bfq_setup_merge(bfqq, active_bfqq))) -+ return new_bfqq; /* Merge with the active queue */ ++ bfq_bfqq_sync(in_service_bfqq) && bfq_bfqq_sync(bfqq)) { ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); ++ if (new_bfqq != NULL) ++ return new_bfqq; /* Merge with in-service queue */ ++ } + + /* + * Check whether there is a cooperator among currently scheduled @@ -376,7 +470,7 @@ index b230927..bc57923 100644 +check_scheduled: + new_bfqq = bfq_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); -+ if (new_bfqq) ++ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq)) + return bfq_setup_merge(bfqq, new_bfqq); + + return NULL; @@ -392,41 +486,46 @@ index b230927..bc57923 100644 + */ + if (bfqq->bic == NULL) + return; -+ if (bfqq->bic->raising_time_left) ++ if (bfqq->bic->wr_time_left) + /* + * This is the queue of a just-started process, and would -+ * deserve weight raising: we set raising_time_left to the full -+ * weight-raising duration to trigger weight-raising when and -+ * if the queue is split and the first request of the queue -+ * is enqueued. ++ * deserve weight raising: we set wr_time_left to the full ++ * weight-raising duration to trigger weight-raising when ++ * and if the queue is split and the first request of the ++ * queue is enqueued. + */ -+ bfqq->bic->raising_time_left = bfq_wrais_duration(bfqq->bfqd); -+ else if (bfqq->raising_coeff > 1) { -+ unsigned long wrais_duration = -+ jiffies - bfqq->last_rais_start_finish; ++ bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); ++ else if (bfqq->wr_coeff > 1) { ++ unsigned long wr_duration = ++ jiffies - bfqq->last_wr_start_finish; + /* + * It may happen that a queue's weight raising period lasts -+ * longer than its raising_cur_max_time, as weight raising is ++ * longer than its wr_cur_max_time, as weight raising is + * handled only when a request is enqueued or dispatched (it + * does not use any timer). If the weight raising period is + * about to end, don't save it. 
+ */ -+ if (bfqq->raising_cur_max_time <= wrais_duration) -+ bfqq->bic->raising_time_left = 0; ++ if (bfqq->wr_cur_max_time <= wr_duration) ++ bfqq->bic->wr_time_left = 0; + else -+ bfqq->bic->raising_time_left = -+ bfqq->raising_cur_max_time - wrais_duration; ++ bfqq->bic->wr_time_left = ++ bfqq->wr_cur_max_time - wr_duration; + /* + * The bfq_queue is becoming shared or the requests of the + * process owning the queue are being redirected to a shared + * queue. Stop the weight raising period of the queue, as in -+ * both cases it should not be owned by an interactive or soft -+ * real-time application. ++ * both cases it should not be owned by an interactive or ++ * soft real-time application. + */ -+ bfq_bfqq_end_raising(bfqq); ++ bfq_bfqq_end_wr(bfqq); + } else -+ bfqq->bic->raising_time_left = 0; ++ bfqq->bic->wr_time_left = 0; + bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); ++ bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); ++ bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); ++ bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); ++ bfqq->bic->cooperations++; ++ bfqq->bic->failed_cooperations = 0; +} + +static inline void @@ -442,35 +541,52 @@ index b230927..bc57923 100644 + +static void +bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", + (long unsigned)new_bfqq->pid); + /* Save weight raising and idle window of the merged queues */ + bfq_bfqq_save_state(bfqq); + bfq_bfqq_save_state(new_bfqq); ++ if (bfq_bfqq_IO_bound(bfqq)) ++ bfq_mark_bfqq_IO_bound(new_bfqq); ++ bfq_clear_bfqq_IO_bound(bfqq); + /* + * Grab a reference to the bic, to prevent it from being destroyed + * before being possibly touched by a bfq_split_bfqq(). + */ + bfq_get_bic_reference(bfqq); + bfq_get_bic_reference(new_bfqq); -+ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */ -+ bic_set_bfqq(bic, new_bfqq, 1); -+ bfq_mark_bfqq_coop(new_bfqq); + /* -+ * new_bfqq now belongs to at least two bics (it is a shared queue): set -+ * new_bfqq->bic to NULL. bfqq either: ++ * Merge queues (that is, let bic redirect its requests to new_bfqq) ++ */ ++ bic_set_bfqq(bic, new_bfqq, 1); ++ bfq_mark_bfqq_coop(new_bfqq); ++ /* ++ * new_bfqq now belongs to at least two bics (it is a shared queue): ++ * set new_bfqq->bic to NULL. bfqq either: + * - does not belong to any bic any more, and hence bfqq->bic must + * be set to NULL, or + * - is a queue whose owning bics have already been redirected to a -+ * different queue, hence the queue is destined to not belong to any -+ * bic soon and bfqq->bic is already NULL (therefore the next ++ * different queue, hence the queue is destined to not belong to ++ * any bic soon and bfqq->bic is already NULL (therefore the next + * assignment causes no harm). 
+ */ + new_bfqq->bic = NULL; + bfqq->bic = NULL; -+ bfq_put_queue(bfqq); ++ bfq_put_queue(bfqq); ++} ++ ++static inline void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) ++{ ++ struct bfq_io_cq *bic = bfqq->bic; ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { ++ bic->failed_cooperations++; ++ if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) ++ bic->cooperations = 0; ++ } +} + +static int bfq_allow_merge(struct request_queue *q, struct request *rq, @@ -500,22 +616,25 @@ index b230927..bc57923 100644 + * We take advantage of this function to perform an early merge + * of the queues of possible cooperating processes. + */ -+ if (bfqq != NULL && -+ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false))) { -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -+ /* -+ * If we get here, the bio will be queued in the shared queue, -+ * i.e., new_bfqq, so use new_bfqq to decide whether bio and -+ * rq can be merged. -+ */ -+ bfqq = new_bfqq; ++ if (bfqq != NULL) { ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); ++ if (new_bfqq != NULL) { ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); ++ /* ++ * If we get here, the bio will be queued in the ++ * shared queue, i.e., new_bfqq, so use new_bfqq ++ * to decide whether bio and rq can be merged. ++ */ ++ bfqq = new_bfqq; ++ } else ++ bfq_bfqq_increase_failed_cooperations(bfqq); + } + + return bfqq == RQ_BFQQ(rq); +} + -+static void __bfq_set_active_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) +{ + if (bfqq != NULL) { + bfq_mark_bfqq_must_alloc(bfqq); @@ -524,34 +643,33 @@ index b230927..bc57923 100644 + + bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; + -+ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu", ++ bfq_log_bfqq(bfqd, bfqq, ++ "set_in_service_queue, cur-budget = %lu", + bfqq->entity.budget); + } + -+ bfqd->active_queue = bfqq; ++ bfqd->in_service_queue = bfqq; +} + +/* -+ * Get and set a new active queue for service. ++ * Get and set a new queue for service. + */ -+static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd) ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); + -+ __bfq_set_active_queue(bfqd, bfqq); ++ __bfq_set_in_service_queue(bfqd, bfqq); + return bfqq; +} + /* * If enough samples have been computed, return the current max budget * stored in bfqd, which is dynamically updated according to the -@@ -1017,63 +1277,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq) +@@ -1475,61 +1789,6 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq) return rq; } --/* -- * Must be called with the queue_lock held. -- */ +-/* Must be called with the queue_lock held. */ -static int bfqq_process_refs(struct bfq_queue *bfqq) -{ - int process_refs, io_refs; @@ -609,31 +727,7 @@ index b230927..bc57923 100644 static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) { struct bfq_entity *entity = &bfqq->entity; -@@ -1493,6 +1696,14 @@ static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) - * is likely to boost the disk throughput); - * - the queue is weight-raised (waiting for the request is necessary for - * providing the queue with fairness and latency guarantees). 
-+ * -+ * In any case, idling can be disabled for cooperation issues, if -+ * 1) there is a close cooperator for the queue, or -+ * 2) the queue is shared and some cooperator is likely to be idle (in this -+ * case, by not arming the idle timer, we try to slow down the queue, to -+ * prevent the zones of the disk accessed by the active cooperators to -+ * become too distant from the zone that will be accessed by the currently -+ * idle cooperators). - */ - static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq, - int budg_timeout) -@@ -1507,7 +1718,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq, - (bfqd->rq_in_driver == 0 || - budg_timeout || - bfqq->raising_coeff > 1) && -- !bfq_close_cooperator(bfqd, bfqq) && -+ !bfq_close_cooperator(bfqd, bfqq, bfqd->last_position) && - (!bfq_bfqq_coop(bfqq) || - !bfq_bfqq_some_coop_idle(bfqq)) && - !bfq_queue_nonrot_noidle(bfqd, bfqq)); -@@ -1519,7 +1730,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq, +@@ -2263,7 +2522,7 @@ static inline bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) */ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) { @@ -641,10 +735,10 @@ index b230927..bc57923 100644 + struct bfq_queue *bfqq; struct request *next_rq; enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; - int budg_timeout; -@@ -1530,17 +1741,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue"); +@@ -2273,17 +2532,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); - /* - * If another queue has a request waiting within our mean seek @@ -657,10 +751,10 @@ index b230927..bc57923 100644 - if (new_bfqq != NULL && bfqq->new_bfqq == NULL) - bfq_setup_merge(bfqq, new_bfqq); - - budg_timeout = bfq_may_expire_for_budg_timeout(bfqq); - if (budg_timeout && - !bfq_bfqq_must_idle(bfqq, budg_timeout)) -@@ -1577,10 +1777,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + if (bfq_may_expire_for_budg_timeout(bfqq) && + !timer_pending(&bfqd->idle_slice_timer) && + !bfq_bfqq_must_idle(bfqq)) +@@ -2322,10 +2570,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) bfq_clear_bfqq_wait_request(bfqq); del_timer(&bfqd->idle_slice_timer); } @@ -672,16 +766,14 @@ index b230927..bc57923 100644 } } -@@ -1589,26 +1786,19 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * queue still has requests in flight or is idling for a new request, - * then keep it. +@@ -2334,40 +2579,30 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) + * in flight (possibly waiting for a completion) or is idling for a + * new request, then keep it. 
*/ - if (new_bfqq == NULL && (timer_pending(&bfqd->idle_slice_timer) || +- (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq)))) { + if (timer_pending(&bfqd->idle_slice_timer) || - (bfqq->dispatched != 0 && - (bfq_bfqq_idle_window(bfqq) || bfqq->raising_coeff > 1) && -- !bfq_queue_nonrot_noidle(bfqd, bfqq)))) { -+ !bfq_queue_nonrot_noidle(bfqd, bfqq))) { ++ (bfqq->dispatched != 0 && bfq_bfqq_must_not_expire(bfqq))) { bfqq = NULL; goto keep_queue; - } else if (new_bfqq != NULL && timer_pending(&bfqd->idle_slice_timer)) { @@ -697,41 +789,67 @@ index b230927..bc57923 100644 expire: bfq_bfqq_expire(bfqd, bfqq, 0, reason); new_queue: -- bfqq = bfq_set_active_queue(bfqd, new_bfqq); -+ bfqq = bfq_set_active_queue(bfqd); +- bfqq = bfq_set_in_service_queue(bfqd, new_bfqq); ++ bfqq = bfq_set_in_service_queue(bfqd); bfq_log(bfqd, "select_queue: new queue %d returned", bfqq != NULL ? bfqq->pid : 0); keep_queue: -@@ -1617,9 +1807,8 @@ keep_queue: + return bfqq; + } - static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) +-static void bfq_update_wr_data(struct bfq_data *bfqd, +- struct bfq_queue *bfqq) ++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) { -+ struct bfq_entity *entity = &bfqq->entity; - if (bfqq->raising_coeff > 1) { /* queue is being boosted */ +- if (bfqq->wr_coeff > 1) { /* queue is being boosted */ - struct bfq_entity *entity = &bfqq->entity; - ++ struct bfq_entity *entity = &bfqq->entity; ++ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ bfq_log_bfqq(bfqd, bfqq, - "raising period dur %u/%u msec, " - "old raising coeff %u, w %d(%d)", -@@ -1656,12 +1845,14 @@ static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - jiffies_to_msecs(bfqq-> - raising_cur_max_time)); - bfq_bfqq_end_raising(bfqq); -- __bfq_entity_update_weight_prio( -- bfq_entity_service_tree(entity), -- entity); - } + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", +- jiffies_to_msecs(jiffies - +- bfqq->last_wr_start_finish), ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); +@@ -2376,12 +2611,16 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, + entity->orig_weight * bfqq->wr_coeff); + if (entity->ioprio_changed) + bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); ++ + /* + * If the queue was activated in a burst, or + * too much time has elapsed from the beginning +- * of this weight-raising, then end weight raising. ++ * of this weight-raising period, or the queue has ++ * exceeded the acceptable number of cooperations, ++ * then end weight raising. 
+ */ + if (bfq_bfqq_in_large_burst(bfqq) || ++ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || + time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time)) { + bfqq->last_wr_start_finish = jiffies; +@@ -2390,11 +2629,13 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, + bfqq->last_wr_start_finish, + jiffies_to_msecs(bfqq->wr_cur_max_time)); + bfq_bfqq_end_wr(bfqq); +- __bfq_entity_update_weight_prio( +- bfq_entity_service_tree(entity), +- entity); } } + /* Update weight both if it must be raised and if it must be lowered */ -+ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1)) ++ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) + __bfq_entity_update_weight_prio( + bfq_entity_service_tree(entity), + entity); } /* -@@ -1901,6 +2092,25 @@ static void bfq_init_icq(struct io_cq *icq) +@@ -2642,6 +2883,25 @@ static inline void bfq_init_icq(struct io_cq *icq) struct bfq_io_cq *bic = icq_to_bic(icq); bic->ttime.last_end_request = jiffies; @@ -753,11 +871,11 @@ index b230927..bc57923 100644 + * the field raising_time_left and assign 1 to it, to mark the queue + * as needing weight raising. + */ -+ bic->raising_time_left = 1; ++ bic->wr_time_left = 1; } static void bfq_exit_icq(struct io_cq *icq) -@@ -1914,6 +2124,13 @@ static void bfq_exit_icq(struct io_cq *icq) +@@ -2655,6 +2915,13 @@ static void bfq_exit_icq(struct io_cq *icq) } if (bic->bfqq[BLK_RW_SYNC]) { @@ -771,7 +889,7 @@ index b230927..bc57923 100644 bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); bic->bfqq[BLK_RW_SYNC] = NULL; } -@@ -2211,6 +2428,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, +@@ -2950,6 +3217,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) return; @@ -782,7 +900,7 @@ index b230927..bc57923 100644 enable_idle = bfq_bfqq_idle_window(bfqq); if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -@@ -2251,6 +2472,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, +@@ -2997,6 +3268,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || !BFQQ_SEEKY(bfqq)) bfq_update_idle_window(bfqd, bfqq, bic); @@ -790,7 +908,7 @@ index b230927..bc57923 100644 bfq_log_bfqq(bfqd, bfqq, "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", -@@ -2302,13 +2524,45 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, +@@ -3057,13 +3329,49 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, static void bfq_insert_request(struct request_queue *q, struct request *rq) { struct bfq_data *bfqd = q->elevator->elevator_data; @@ -804,27 +922,31 @@ index b230927..bc57923 100644 + * driver: make sure we are in process context while trying to + * merge two bfq_queues. + */ -+ if (!in_interrupt() && -+ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true))) { -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); -+ /* -+ * Release the request's reference to the old bfqq -+ * and make sure one is taken to the shared queue. 
-+ */ -+ new_bfqq->allocated[rq_data_dir(rq)]++; -+ bfqq->allocated[rq_data_dir(rq)]--; -+ atomic_inc(&new_bfqq->ref); -+ bfq_put_queue(bfqq); -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); -+ rq->elv.priv[1] = new_bfqq; -+ bfqq = new_bfqq; ++ if (!in_interrupt()) { ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); ++ if (new_bfqq != NULL) { ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); ++ /* ++ * Release the request's reference to the old bfqq ++ * and make sure one is taken to the shared queue. ++ */ ++ new_bfqq->allocated[rq_data_dir(rq)]++; ++ bfqq->allocated[rq_data_dir(rq)]--; ++ atomic_inc(&new_bfqq->ref); ++ bfq_put_queue(bfqq); ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), ++ bfqq, new_bfqq); ++ rq->elv.priv[1] = new_bfqq; ++ bfqq = new_bfqq; ++ } else ++ bfq_bfqq_increase_failed_cooperations(bfqq); + } + bfq_init_prio_data(bfqq, RQ_BIC(rq)); - bfq_add_rq_rb(rq); + bfq_add_request(rq); + /* + * Here a newly-created bfq_queue has already started a weight-raising @@ -833,46 +955,30 @@ index b230927..bc57923 100644 + * comments about this field in bfq_init_icq(). + */ + if (bfqq->bic != NULL) -+ bfqq->bic->raising_time_left = 0; ++ bfqq->bic->wr_time_left = 0; rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); list_add_tail(&rq->queuelist, &bfqq->fifo); -@@ -2371,15 +2625,6 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - if (bfq_bfqq_budget_new(bfqq)) - bfq_set_budget_timeout(bfqd); - -- /* Idling is disabled also for cooperation issues: -- * 1) there is a close cooperator for the queue, or -- * 2) the queue is shared and some cooperator is likely -- * to be idle (in this case, by not arming the idle timer, -- * we try to slow down the queue, to prevent the zones -- * of the disk accessed by the active cooperators to become -- * too distant from the zone that will be accessed by the -- * currently idle cooperators) -- */ - if (bfq_bfqq_must_idle(bfqq, budg_timeout)) - bfq_arm_slice_timer(bfqd); - else if (budg_timeout) -@@ -2449,18 +2694,6 @@ static void bfq_put_request(struct request *rq) +@@ -3228,18 +3536,6 @@ static void bfq_put_request(struct request *rq) } } -static struct bfq_queue * -bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -- struct bfq_queue *bfqq) +- struct bfq_queue *bfqq) -{ -- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", +- bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", - (long unsigned)bfqq->new_bfqq->pid); -- bic_set_bfqq(bic, bfqq->new_bfqq, 1); -- bfq_mark_bfqq_coop(bfqq->new_bfqq); -- bfq_put_queue(bfqq); -- return bic_to_bfqq(bic, 1); +- bic_set_bfqq(bic, bfqq->new_bfqq, 1); +- bfq_mark_bfqq_coop(bfqq->new_bfqq); +- bfq_put_queue(bfqq); +- return bic_to_bfqq(bic, 1); -} - /* * Returns NULL if a new bfqq should be allocated, or the old bfqq if this * was the last process referring to said bfqq. 
-@@ -2469,6 +2702,9 @@ static struct bfq_queue * +@@ -3248,6 +3544,9 @@ static struct bfq_queue * bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) { bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); @@ -881,8 +987,8 @@ index b230927..bc57923 100644 + if (bfqq_process_refs(bfqq) == 1) { bfqq->pid = current->pid; - bfq_clear_bfqq_some_coop_idle(bfqq); -@@ -2498,6 +2734,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, + bfq_clear_bfqq_coop(bfqq); +@@ -3276,6 +3575,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, struct bfq_queue *bfqq; struct bfq_group *bfqg; unsigned long flags; @@ -890,9 +996,21 @@ index b230927..bc57923 100644 might_sleep_if(gfp_mask & __GFP_WAIT); -@@ -2516,24 +2753,14 @@ new_queue: +@@ -3293,25 +3593,26 @@ new_queue: + if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask); bic_set_bfqq(bic, bfqq, is_sync); ++ if (split && is_sync) { ++ if ((bic->was_in_burst_list && bfqd->large_burst) || ++ bic->saved_in_large_burst) ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ else { ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ if (bic->was_in_burst_list) ++ hlist_add_head(&bfqq->burst_list_node, ++ &bfqd->burst_list); ++ } ++ } } else { - /* - * If the queue was seeky for too long, break it apart. @@ -917,7 +1035,7 @@ index b230927..bc57923 100644 } bfqq->allocated[rw]++; -@@ -2544,6 +2771,26 @@ new_queue: +@@ -3322,6 +3623,26 @@ new_queue: rq->elv.priv[0] = bic; rq->elv.priv[1] = bfqq; @@ -928,14 +1046,14 @@ index b230927..bc57923 100644 + * queue has just been split, mark a flag so that the + * information is available to the other scheduler hooks. + */ -+ if (bfqq_process_refs(bfqq) == 1) { ++ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + bfqq->bic = bic; + if (split) { + bfq_mark_bfqq_just_split(bfqq); + /* -+ * If the queue has just been split from a shared queue, -+ * restore the idle window and the possible weight -+ * raising period. ++ * If the queue has just been split from a shared ++ * queue, restore the idle window and the possible ++ * weight raising period. 
+ */ + bfq_bfqq_resume_state(bfqq, bic); + } @@ -945,10 +1063,10 @@ index b230927..bc57923 100644 return 0; diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index 03f8061..a0edaa2 100644 +index 2931563..6764a7e 100644 --- a/block/bfq-sched.c +++ b/block/bfq-sched.c -@@ -978,34 +978,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) +@@ -1091,34 +1091,6 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) return bfqq; } @@ -961,7 +1079,7 @@ index 03f8061..a0edaa2 100644 - struct bfq_entity *entity; - struct bfq_sched_data *sd; - -- BUG_ON(bfqd->active_queue != NULL); +- BUG_ON(bfqd->in_service_queue != NULL); - - entity = &bfqq->entity; - /* @@ -972,50 +1090,81 @@ index 03f8061..a0edaa2 100644 - bfq_update_budget(entity); - bfq_update_vtime(bfq_entity_service_tree(entity)); - bfq_active_extract(bfq_entity_service_tree(entity), entity); -- sd->active_entity = entity; -- sd->next_active = NULL; +- sd->in_service_entity = entity; +- sd->next_in_service = NULL; - entity->service = 0; - } - - return; -} - - static void __bfq_bfqd_reset_active(struct bfq_data *bfqd) + static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) { - if (bfqd->active_bic != NULL) { + if (bfqd->in_service_bic != NULL) { diff --git a/block/bfq.h b/block/bfq.h -index 48ecde9..bb52975 100644 +index 84c7861..0a40b4b 100644 --- a/block/bfq.h +++ b/block/bfq.h -@@ -188,6 +188,8 @@ struct bfq_group; - * @pid: pid of the process owning the queue, used for logging purposes. - * @last_rais_start_time: last (idle -> weight-raised) transition attempt - * @raising_cur_max_time: current max raising time for this queue +@@ -218,18 +218,21 @@ struct bfq_group; + * idle @bfq_queue with no outstanding requests, then + * the task associated with the queue it is deemed as + * soft real-time (see the comments to the function +- * bfq_bfqq_softrt_next_start()). ++ * bfq_bfqq_softrt_next_start()) + * @last_idle_bklogged: time of the last transition of the @bfq_queue from + * idle to backlogged + * @service_from_backlogged: cumulative service received from the @bfq_queue + * since the last transition from idle to + * backlogged + * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the + * queue is shared * - * A bfq_queue is a leaf request queue; it can be associated to an io_context - * or more (if it is an async one). @cgroup holds a reference to the -@@ -231,6 +233,7 @@ struct bfq_queue { - sector_t last_request_pos; +- * A bfq_queue is a leaf request queue; it can be associated with an io_context +- * or more, if it is async or shared between cooperating processes. @cgroup +- * holds a reference to the cgroup, to be sure that it does not disappear while +- * a bfqq still references it (mostly to avoid races between request issuing and +- * task migration followed by cgroup destruction). ++ * A bfq_queue is a leaf request queue; it can be associated with an ++ * io_context or more, if it is async or shared between cooperating ++ * processes. @cgroup holds a reference to the cgroup, to be sure that it ++ * does not disappear while a bfqq still references it (mostly to avoid ++ * races between request issuing and task migration followed by cgroup ++ * destruction). + * All the fields are protected by the queue lock of the containing bfqd. 
+ */ + struct bfq_queue { +@@ -269,6 +272,7 @@ struct bfq_queue { + unsigned int requests_within_timer; pid_t pid; + struct bfq_io_cq *bic; /* weight-raising fields */ - unsigned int raising_cur_max_time; -@@ -257,12 +260,23 @@ struct bfq_ttime { + unsigned long wr_cur_max_time; +@@ -298,12 +302,42 @@ struct bfq_ttime { * @icq: associated io_cq structure * @bfqq: array of two process queues, the sync and the async * @ttime: associated @bfq_ttime struct -+ * @raising_time_left: snapshot of the time left before weight raising ends -+ * for the sync queue associated to this process; this -+ * snapshot is taken to remember this value while the weight -+ * raising is suspended because the queue is merged with a -+ * shared queue, and is used to set @raising_cur_max_time -+ * when the queue is split from the shared queue and its -+ * weight is raised again -+ * @saved_idle_window: same purpose as the previous field for the idle window ++ * @wr_time_left: snapshot of the time left before weight raising ends ++ * for the sync queue associated to this process; this ++ * snapshot is taken to remember this value while the weight ++ * raising is suspended because the queue is merged with a ++ * shared queue, and is used to set @raising_cur_max_time ++ * when the queue is split from the shared queue and its ++ * weight is raised again ++ * @saved_idle_window: same purpose as the previous field for the idle ++ * window ++ * @saved_IO_bound: same purpose as the previous two fields for the I/O ++ * bound classification of a queue ++ * @saved_in_large_burst: same purpose as the previous fields for the ++ * value of the field keeping the queue's belonging ++ * to a large burst ++ * @was_in_burst_list: true if the queue belonged to a burst list ++ * before its merge with another cooperating queue ++ * @cooperations: counter of consecutive successful queue merges underwent ++ * by any of the process' @bfq_queues ++ * @failed_cooperations: counter of consecutive failed queue merges of any ++ * of the process' @bfq_queues */ struct bfq_io_cq { struct io_cq icq; /* must be the first member */ @@ -1023,27 +1172,51 @@ index 48ecde9..bb52975 100644 struct bfq_ttime ttime; int ioprio; + -+ unsigned int raising_time_left; -+ unsigned int saved_idle_window; ++ unsigned int wr_time_left; ++ bool saved_idle_window; ++ bool saved_IO_bound; ++ ++ bool saved_in_large_burst; ++ bool was_in_burst_list; ++ ++ unsigned int cooperations; ++ unsigned int failed_cooperations; }; - /** -@@ -403,6 +417,7 @@ enum bfqq_state_flags { + enum bfq_device_speed { +@@ -539,7 +573,7 @@ enum bfqq_state_flags { + BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ + BFQ_BFQQ_FLAG_sync, /* synchronous queue */ + BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ +- BFQ_BFQQ_FLAG_IO_bound, /* ++ BFQ_BFQQ_FLAG_IO_bound, /* + * bfqq has timed-out at least once + * having consumed at most 2/10 of + * its budget +@@ -552,12 +586,13 @@ enum bfqq_state_flags { + * bfqq has proved to be slow and + * seeky until budget timeout + */ +- BFQ_BFQQ_FLAG_softrt_update, /* ++ BFQ_BFQQ_FLAG_softrt_update, /* + * may need softrt-next-start + * update + */ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ - BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */ - BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */ +- BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */ ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ + BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ }; #define BFQ_BFQQ_FNS(name) \ -@@ 
-430,6 +445,7 @@ BFQ_BFQQ_FNS(budget_new); +@@ -587,6 +622,7 @@ BFQ_BFQQ_FNS(in_large_burst); + BFQ_BFQQ_FNS(constantly_seeky); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); - BFQ_BFQQ_FNS(some_coop_idle); +BFQ_BFQQ_FNS(just_split); + BFQ_BFQQ_FNS(softrt_update); #undef BFQ_BFQQ_FNS - /* Logging facilities. */ -- -1.8.1.4 +2.1.3 |
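
The hunks above show the EQM bookkeeping that v7r7 adds around queue merging: when a per-process bfq_queue is merged into a shared queue, bfq_bfqq_save_state() snapshots the remaining weight-raising time into the bic (wr_time_left), and that snapshot is later used to resume weight raising if the queue is split off again. The following is a minimal userspace sketch of that save/restore idea only, not the scheduler code itself; every identifier here (wr_state, save_wr_state, resume_wr_state, now_ms) and the example coefficient value are hypothetical stand-ins chosen for illustration.

#define _POSIX_C_SOURCE 199309L
#include <stdio.h>
#include <time.h>

struct wr_state {
	int  wr_coeff;                  /* >1 while the queue is weight-raised */
	long wr_cur_max_time_ms;        /* full duration of the raising period */
	long last_wr_start_finish_ms;   /* when the current period started */
	long saved_wr_time_left_ms;     /* snapshot taken at merge time */
};

static long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000L + ts.tv_nsec / 1000000L;
}

/* Merge path: remember how much of the raising period is left, then stop it. */
static void save_wr_state(struct wr_state *s)
{
	long elapsed = now_ms() - s->last_wr_start_finish_ms;

	if (s->wr_coeff > 1 && elapsed < s->wr_cur_max_time_ms)
		s->saved_wr_time_left_ms = s->wr_cur_max_time_ms - elapsed;
	else
		s->saved_wr_time_left_ms = 0;  /* period about to end: don't save it */
	s->wr_coeff = 1;                       /* the shared queue is not raised */
}

/* Split path: resume weight raising for the time that was left at merge. */
static void resume_wr_state(struct wr_state *s)
{
	if (s->saved_wr_time_left_ms > 0) {
		s->wr_coeff = 20;  /* arbitrary example coefficient */
		s->wr_cur_max_time_ms = s->saved_wr_time_left_ms;
		s->last_wr_start_finish_ms = now_ms();
		s->saved_wr_time_left_ms = 0;
	}
}

int main(void)
{
	struct wr_state s = {
		.wr_coeff = 20,
		.wr_cur_max_time_ms = 6000,
		.last_wr_start_finish_ms = now_ms(),
	};

	save_wr_state(&s);    /* queue merged into a shared queue */
	resume_wr_state(&s);  /* queue split off again later */
	printf("coeff %d, remaining raising period %ld ms\n",
	       s.wr_coeff, s.wr_cur_max_time_ms);
	return 0;
}

The same hand-off pattern is what the new saved_idle_window, saved_IO_bound and saved_in_large_burst fields implement for the other per-queue flags in the hunks above.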