diff --git b/Documentation/ABI/testing/debugfs-aufs b/Documentation/ABI/testing/debugfs-aufs
new file mode 100644
index 0000000..99642d1
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-aufs
@@ -0,0 +1,50 @@
+What:		/debug/aufs/si_<id>/
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		Under /debug/aufs, a directory named si_<id> is created
+		per aufs mount, where <id> is a unique id generated
+		internally.
+
+What:		/debug/aufs/si_<id>/plink
+Date:		Apr 2013
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It has three lines and shows information about the
+		pseudo-links. The first line is a single number
+		representing the number of buckets. The second line is
+		the number of pseudo-links per bucket (separated by
+		blanks). The last line is a single number representing
+		the total number of pseudo-links.
+		When the aufs mount option 'noplink' is specified, it
+		will show "1\n0\n0\n".
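+		For example, with 'noplink' the file reads as follows
+		(the si_<id> directory name differs per mount, and this
+		assumes debugfs is mounted on /debug):
+
+		  # cat /debug/aufs/si_*/plink
+		  1
+		  0
+		  0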
+
+What:		/debug/aufs/si_<id>/xib
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It shows the number of blocks consumed by the xib
+		(External Inode Number Bitmap), its block size and its
+		file size.
+		When the aufs mount option 'noxino' is specified, it
+		will be empty. About XINO files, see the aufs manual.
+
+What:		/debug/aufs/si_<id>/xino0, xino1 ... xinoN
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It shows the number of blocks consumed by the xino
+		(External Inode Number Translation Table), its link
+		count, block size and file size.
+		When the aufs mount option 'noxino' is specified, it
+		will be empty. About XINO files, see the aufs manual.
+
+What:		/debug/aufs/si_<id>/xigen
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It shows the number of blocks consumed by the xigen
+		(External Inode Generation Table), its block size and
+		file size.
+		If CONFIG_AUFS_EXPORT is disabled, this entry will not
+		be created.
+		When the aufs mount option 'noxino' is specified, it
+		will be empty. About XINO files, see the aufs manual.
diff --git b/Documentation/ABI/testing/sysfs-aufs b/Documentation/ABI/testing/sysfs-aufs
new file mode 100644
index 0000000..82f9518
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-aufs
@@ -0,0 +1,31 @@
+What:		/sys/fs/aufs/si_<id>/
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		Under /sys/fs/aufs, a directory named si_<id> is created
+		per aufs mount, where <id> is a unique id generated
+		internally.
+
+What:		/sys/fs/aufs/si_<id>/br0, br1 ... brN
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It shows the absolute path of a member directory (which
+		is called a branch) in aufs, and its permission.
+
+What:		/sys/fs/aufs/si_<id>/brid0, brid1 ... bridN
+Date:		July 2013
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It shows the id of a member directory (which is called
+		a branch) in aufs.
+
+What:		/sys/fs/aufs/si_<id>/xi_path
+Date:		March 2009
+Contact:	J. R. Okajima <hooanon05g@gmail.com>
+Description:
+		It shows the absolute path of the XINO (External Inode
+		Number Bitmap, Translation Table and Generation Table)
+		file, even if it is the default path.
+		When the aufs mount option 'noxino' is specified, it
+		will be empty. About XINO files, see the aufs manual.
diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX
index e55103a..8d55b4b 100644
--- a/Documentation/block/00-INDEX
+++ b/Documentation/block/00-INDEX
@@ -1,5 +1,7 @@
 00-INDEX
 	- This file
+bfq-iosched.txt
+	- BFQ IO scheduler and its tunables
 biodoc.txt
 	- Notes on the Generic Block Layer Rewrite in Linux 2.5
 biovecs.txt
diff --git b/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt
new file mode 100644
index 0000000..5ba67af
--- /dev/null
+++ b/Documentation/block/bfq-iosched.txt
@@ -0,0 +1,516 @@
+BFQ (Budget Fair Queueing)
+==========================
+
+BFQ is a proportional-share I/O scheduler, with some extra
+low-latency capabilities. In addition to cgroups support (blkio or io
+controllers), BFQ's main features are:
+- BFQ guarantees a high system and application responsiveness, and a
+  low latency for time-sensitive applications, such as audio or video
+  players;
+- BFQ distributes bandwidth, and not just time, among processes or
+  groups (switching back to time distribution when needed to keep
+  throughput high).
+
+On average CPUs, the current version of BFQ can handle devices
+performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a
+reference, 30-50 KIOPS correspond to very high bandwidths with
+sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and
+to 120-200 MB/s with 4KB random I/O.
+
+The table of contents follows. Impatient readers can jump straight to
+Section 3.
+
+CONTENTS
+
+1. When may BFQ be useful?
+ 1-1 Personal systems
+ 1-2 Server systems
+2. How does BFQ work?
+3. What are BFQ's tunables?
+4. BFQ group scheduling
+ 4-1 Service guarantees provided
+ 4-2 Interface
+
+1. When may BFQ be useful?
+==========================
+
+BFQ provides the following benefits on personal and server systems.
+
+1-1 Personal systems
+--------------------
+
+Low latency for interactive applications
+
+Regardless of the actual background workload, BFQ guarantees that, for
+interactive tasks, the storage device is virtually as responsive as if
+it was idle. For example, even if one or more of the following
+background workloads are being executed:
+- one or more large files are being read, written or copied,
+- a tree of source files is being compiled,
+- one or more virtual machines are performing I/O,
+- a software update is in progress,
+- indexing daemons are scanning filesystems and updating their
+  databases,
+starting an application or loading a file from within an application
+takes about the same time as if the storage device was idle. As a
+comparison, with CFQ, NOOP or DEADLINE, and in the same conditions,
+applications experience high latencies, or even become unresponsive
+until the background workload terminates (also on SSDs).
+
+Low latency for soft real-time applications
+
+Soft real-time applications, such as audio and video
+players/streamers, also enjoy a low latency and a low drop rate,
+regardless of the background I/O workload. As a consequence, these
+applications suffer from almost no glitches caused by the background
+workload.
+
+Higher speed for code-development tasks
+
+If some additional workload happens to be executed in parallel, then
+BFQ executes the I/O-related components of typical code-development
+tasks (compilation, checkout, merge, ...) much more quickly than CFQ,
+NOOP or DEADLINE.
+
+High throughput
+
+On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and
+up to 150% higher throughput than DEADLINE and NOOP, with all the
+sequential workloads considered in our tests. With random workloads,
+and with all the workloads on flash-based devices, BFQ achieves,
+instead, about the same throughput as the other schedulers.
+
+Strong fairness, bandwidth and delay guarantees
+
+BFQ distributes the device throughput, and not just the device time,
+among I/O-bound applications in proportion to their weights, with any
+workload and regardless of the device parameters. From these bandwidth
+guarantees, it is possible to compute tight per-I/O-request delay
+guarantees by a simple formula. If not configured for strict service
+guarantees, BFQ switches to time-based resource sharing (only) for
+applications that would otherwise cause a throughput loss.
+
+1-2 Server systems
+------------------
+
+Most benefits for server systems follow from the same service
+properties as above. In particular, regardless of whether additional,
+possibly heavy workloads are being served, BFQ guarantees:
+
+. audio and video-streaming with zero or very low jitter and drop
+  rate;
+
+. fast retrieval of WEB pages and embedded objects;
+
+. real-time recording of data in live-dumping applications (e.g.,
+  packet logging);
+
+. responsiveness in local and remote access to a server.
+
+
+2. How does BFQ work?
+=====================
+
+BFQ is a proportional-share I/O scheduler, whose general structure,
+plus a lot of code, are borrowed from CFQ.
+
+- Each process doing I/O on a device is associated with a weight and a
+  (bfq_)queue.
+
+- BFQ grants exclusive access to the device, for a while, to one queue
+  (process) at a time, and implements this service model by
+  associating every queue with a budget, measured in number of
+  sectors.
+
+  - After a queue is granted access to the device, the budget of the
+    queue is decremented, on each request dispatch, by the size of the
+    request.
+
+  - The in-service queue is expired, i.e., its service is suspended,
+    only if one of the following events occurs: 1) the queue finishes
+    its budget, 2) the queue empties, 3) a "budget timeout" fires.
+
+    - The budget timeout prevents processes doing random I/O from
+      holding the device for too long and dramatically reducing
+      throughput.
+
+    - Actually, as in CFQ, a queue associated with a process issuing
+      sync requests may not be expired immediately when it empties.
+      Instead, BFQ may idle the device for a short time interval,
+      giving the process the chance to go on being served if it issues
+      a new request in time. Device idling typically boosts the
+      throughput on rotational devices, if processes do synchronous
+      and sequential I/O. In addition, under BFQ, device idling is
+      also instrumental in guaranteeing the desired throughput
+      fraction to processes issuing sync requests (see the description
+      of the slice_idle tunable in this document, or [1, 2], for more
+      details).
+
+      - With respect to idling for service guarantees, if several
+	processes are competing for the device at the same time, but
+	all processes (and groups, after the following commit) have
+	the same weight, then BFQ guarantees the expected throughput
+	distribution without ever idling the device. Throughput is
+	thus as high as possible in this common scenario.
+
+  - If low-latency mode is enabled (default configuration), BFQ
+    executes some special heuristics to detect interactive and soft
+    real-time applications (e.g., video or audio players/streamers),
+    and to reduce their latency. The most important action taken to
+    achieve this goal is to give to the queues associated with these
+    applications more than their fair share of the device
+    throughput. For brevity, we simply call "weight-raising" the whole
+    set of actions taken by BFQ to privilege these queues. In
+    particular, BFQ provides a milder form of weight-raising for
+    interactive applications, and a stronger form for soft real-time
+    applications.
+
+  - BFQ automatically deactivates idling for queues born in a burst of
+    queue creations. In fact, these queues are usually associated with
+    the processes of applications and services that benefit mostly
+    from a high throughput. Examples are systemd during boot, or git
+    grep.
+
+  - Like CFQ, BFQ merges queues performing interleaved I/O, i.e.,
+    random I/O that becomes mostly sequential if merged. Unlike CFQ,
+    BFQ achieves this goal with a more
+    reactive mechanism, called Early Queue Merge (EQM). EQM is so
+    responsive in detecting interleaved I/O (cooperating processes),
+    that it enables BFQ to achieve a high throughput, by queue
+    merging, even for queues for which CFQ needs a different
+    mechanism, preemption, to get a high throughput. As such, EQM is a
+    unified mechanism to achieve a high throughput with interleaved
+    I/O.
+
+  - Queues are scheduled according to a variant of WF2Q+, named
+    B-WF2Q+, and implemented using an augmented rb-tree to preserve an
+    O(log N) overall complexity.  See [2] for more details. B-WF2Q+ is
+    also ready for hierarchical scheduling. However, for a cleaner
+    logical breakdown, the code that enables and completes
+    hierarchical support is provided in the next commit, which focuses
+    exactly on this feature.
+
+  - B-WF2Q+ guarantees a tight deviation with respect to an ideal,
+    perfectly fair, and smooth service. In particular, B-WF2Q+
+    guarantees that each queue receives a fraction of the device
+    throughput proportional to its weight, even if the throughput
+    fluctuates, and regardless of: the device parameters, the current
+    workload and the budgets assigned to the queue.
+
+  - The last, budget-independence, property (although probably
+    counterintuitive in the first place) is definitely beneficial, for
+    the following reasons:
+
+    - First, with any proportional-share scheduler, the maximum
+      deviation with respect to an ideal service is proportional to
+      the maximum budget (slice) assigned to queues. As a consequence,
+      BFQ can keep this deviation tight not only because of the
+      accurate service of B-WF2Q+, but also because BFQ *does not*
+      need to assign a larger budget to a queue to let the queue
+      receive a higher fraction of the device throughput.
+
+    - Second, BFQ is free to choose, for every process (queue), the
+      budget that best fits the needs of the process, or best
+      leverages the I/O pattern of the process. In particular, BFQ
+      updates queue budgets with a simple feedback-loop algorithm that
+      allows a high throughput to be achieved, while still providing
+      tight latency guarantees to time-sensitive applications. When
+      the in-service queue expires, this algorithm computes the next
+      budget of the queue so as to:
+
+      - Let large budgets be eventually assigned to the queues
+	associated with I/O-bound applications performing sequential
+	I/O: in fact, the longer these applications are served once
+	they get access to the device, the higher the throughput is.
+
+      - Let small budgets be eventually assigned to the queues
+	associated with time-sensitive applications (which typically
+	perform sporadic and short I/O), because, the smaller the
+	budget assigned to a queue waiting for service is, the sooner
+	B-WF2Q+ will serve that queue (Subsec 3.3 in [2]).
+
+- If several processes are competing for the device at the same time,
+  but all processes and groups have the same weight, then BFQ
+  guarantees the expected throughput distribution without ever idling
+  the device. It uses preemption instead. Throughput is then much
+  higher in this common scenario.
+
+- ioprio classes are served in strict priority order, i.e.,
+  lower-priority queues are not served as long as there are
+  higher-priority queues.  Among queues in the same class, the
+  bandwidth is distributed in proportion to the weight of each
+  queue. A very thin extra bandwidth is however guaranteed to
+  the Idle class, to prevent it from starving.
+
+
+3. What are BFQ's tunables?
+==========================
+
+The tunables back_seek_max, back_seek_penalty, fifo_expire_async and
+fifo_expire_sync below are the same as in CFQ. Their description is
+just copied from that for CFQ. Some considerations in the description
+of slice_idle are copied from CFQ too.
+
+per-process ioprio and weight
+-----------------------------
+
+Unless the cgroups interface is used, weights can be assigned to
+processes only indirectly, through I/O priorities, and according to
+the relation: weight = (IOPRIO_BE_NR - ioprio) * 10.
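+
+As an illustrative sketch of this mapping (IOPRIO_BE_NR is 8 on Linux,
+so best-effort priorities 0..7 correspond to weights 80..10), the I/O
+priority of a process can be changed with ionice(1); the PID below is
+only an example:
+
+$ ionice -c 2 -n 2 -p 1234   # best-effort, priority 2 -> BFQ weight 60
+$ ionice -p 1234             # prints "best-effort: prio 2"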
+
+slice_idle
+----------
+
+This parameter specifies how long BFQ should idle for the next I/O
+request when certain sync BFQ queues become empty. By default
+slice_idle is a non-zero value. Idling has a double purpose: boosting
+throughput and making sure that the desired throughput distribution is
+respected (see the description of how BFQ works, and, if needed, the
+papers referred there).
+
+As for throughput, idling can be very helpful on highly seeky media
+like single-spindle SATA/SAS disks, where we can cut down on the
+overall number of seeks and see improved throughput.
+
+Setting slice_idle to 0 will remove all idling on queues, and one
+should see an overall throughput improvement on faster storage devices
+like multiple SATA/SAS disks in a hardware RAID configuration.
+
+So, depending on storage and workload, it might be useful to set
+slice_idle=0. In general, for SATA/SAS disks and software RAID of
+SATA/SAS disks, keeping slice_idle enabled should be useful. For any
+configuration where there are multiple spindles behind a single LUN
+(host-based hardware RAID controller or storage arrays), setting
+slice_idle=0 might end up in better throughput and acceptable
+latencies.
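+
+For example (the device name sdb is illustrative, the shown value is
+only a typical one, and the path assumes BFQ is the active scheduler
+for that device), the tunable can be inspected and changed via sysfs:
+
+$ cat /sys/block/sdb/queue/iosched/slice_idle
+8
+# echo 0 > /sys/block/sdb/queue/iosched/slice_idle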
+
+Idling is however necessary to have service guarantees enforced in
+case of differentiated weights or differentiated I/O-request lengths.
+To see why, suppose that a given BFQ queue A must get several I/O
+requests served for each request served for another queue B. Idling
+ensures that, if A makes a new I/O request slightly after becoming
+empty, then no request of B is dispatched in the middle, and thus A
+does not lose the possibility to get more than one request dispatched
+before the next request of B is dispatched. Note that idling
+guarantees the desired differentiated treatment of queues only in
+terms of I/O-request dispatches. To guarantee that the actual service
+order then corresponds to the dispatch order, the strict_guarantees
+tunable must be set too.
+
+There is an important flipside to idling: apart from the above cases,
+where it is beneficial also for throughput, idling can severely impact
+throughput. One important case is a random workload. Because of this
+issue, BFQ tends to avoid idling as much as possible, when it is not
+beneficial also for throughput. As a consequence of this behavior, and
+of further issues described for the strict_guarantees tunable,
+short-term service guarantees may be occasionally violated. And, in
+some cases, these guarantees may be more important than guaranteeing
+maximum throughput. For example, in video playing/streaming, a very
+low drop rate may be more important than maximum throughput. In these
+cases, consider setting the strict_guarantees parameter.
+
+strict_guarantees
+-----------------
+
+If this parameter is set (default: unset), then BFQ
+
+- always performs idling when the in-service queue becomes empty;
+
+- forces the device to serve one I/O request at a time, by dispatching a
+  new request only if there is no outstanding request.
+
+In the presence of differentiated weights or I/O-request sizes, both
+the above conditions are needed to guarantee that every BFQ queue
+receives its allotted share of the bandwidth. The first condition is
+needed for the reasons explained in the description of the slice_idle
+tunable.  The second condition is needed because all modern storage
+devices reorder internally-queued requests, which may trivially break
+the service guarantees enforced by the I/O scheduler.
+
+Setting strict_guarantees may evidently affect throughput.
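+
+For example (the device name is illustrative), strict service
+guarantees can be toggled at run time:
+
+# echo 1 > /sys/block/sdb/queue/iosched/strict_guarantees
+# echo 0 > /sys/block/sdb/queue/iosched/strict_guarantees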
+
+back_seek_max
+-------------
+
+This specifies, in Kbytes, the maximum "distance" for backward seeking.
+The distance is the amount of space from the current head location to
+the sectors that lie behind the head.
+
+This parameter allows the scheduler to anticipate requests in the "backward"
+direction and consider them as being the "next" if they are within this
+distance from the current head location.
+
+back_seek_penalty
+-----------------
+
+This parameter is used to compute the cost of backward seeking. If the
+backward distance of a request is just 1/back_seek_penalty of the
+distance of a "front" request, then the seek costs of the two requests
+are considered equivalent.
+
+So the scheduler will not bias toward one or the other request
+(otherwise the scheduler would bias toward the front request). The
+default value of back_seek_penalty is 2.
+
+fifo_expire_async
+-----------------
+
+This parameter is used to set the timeout of asynchronous requests. The
+default value is 248 ms.
+
+fifo_expire_sync
+----------------
+
+This parameter is used to set the timeout of synchronous requests. The
+default value is 124 ms. To favor synchronous requests over
+asynchronous ones, this value should be decreased relative to
+fifo_expire_async.
+
+low_latency
+-----------
+
+This parameter is used to enable/disable BFQ's low latency mode. By
+default, low latency mode is enabled. If enabled, interactive and soft
+real-time applications are privileged and experience a lower latency,
+as explained in more detail in the description of how BFQ works.
+
+timeout_sync
+------------
+
+Maximum amount of device time that can be given to a task (queue) once
+it has been selected for service. On devices with costly seeks,
+increasing this time usually increases maximum throughput. On the
+opposite end, increasing this time coarsens the granularity of the
+short-term bandwidth and latency guarantees, especially if the
+following parameter is set to zero.
+
+max_budget
+----------
+
+Maximum amount of service, measured in sectors, that can be provided
+to a BFQ queue once it is set in service (of course within the limits
+of the above timeout). As explained in the description of the
+algorithm, larger values increase the throughput in proportion to
+the percentage of sequential I/O requests issued. The price of larger
+values is that they coarsen the granularity of short-term bandwidth
+and latency guarantees.
+
+The default value is 0, which enables auto-tuning: BFQ sets max_budget
+to the maximum number of sectors that can be served during
+timeout_sync, according to the estimated peak rate.
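+
+For example (the device name and value are illustrative), reading 0
+means that auto-tuning is active, while writing a number of 512-byte
+sectors sets a fixed budget:
+
+$ cat /sys/block/sdb/queue/iosched/max_budget
+0
+# echo 16384 > /sys/block/sdb/queue/iosched/max_budget   # 8 MiB budget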
+
+weights
+-------
+
+Read-only parameter, used to show the weights of the currently active
+BFQ queues.
+
+
+wr_ tunables
+------------
+
+BFQ exports a few parameters to control/tune the behavior of
+low-latency heuristics.
+
+wr_coeff
+
+Factor by which the weight of a weight-raised queue is multiplied. If
+the queue is deemed soft real-time, then the weight is further
+multiplied by an additional, constant factor.
+
+wr_max_time
+
+Maximum duration of a weight-raising period for an interactive task
+(ms). If set to zero (default value), then this value is computed
+automatically, as a function of the peak rate of the device. In any
+case, when the value of this parameter is read, it always reports the
+current duration, regardless of whether it has been set manually or
+computed automatically.
+
+wr_max_softrt_rate
+
+Maximum service rate below which a queue is deemed to be associated
+with a soft real-time application, and is then weight-raised
+accordingly (sectors/sec).
+
+wr_min_idle_time
+
+Minimum idle period after which interactive weight-raising may be
+reactivated for a queue (in ms).
+
+wr_rt_max_time
+
+Maximum weight-raising duration for soft real-time queues (in ms). The
+start time from which this duration is considered is automatically
+moved forward if the queue is detected to be still soft real-time
+before the current soft real-time weight-raising period finishes.
+
+wr_min_inter_arr_async
+
+Minimum period between I/O request arrivals after which weight-raising
+may be reactivated for an already busy async queue (in ms).
+
+
+4. Group scheduling with BFQ
+============================
+
+BFQ supports both cgroup-v1 and cgroup-v2 io controllers, namely blkio
+and io. In particular, BFQ supports weight-based proportional
+share.
+
+4-1 Service guarantees provided
+-------------------------------
+
+With BFQ, proportional share means true proportional share of the
+device bandwidth, according to group weights. For example, a group
+with weight 200 gets twice the bandwidth, and not just twice the time,
+of a group with weight 100.
+
+BFQ supports hierarchies (group trees) of any depth. Bandwidth is
+distributed among groups and processes in the expected way: for each
+group, the children of the group share the whole bandwidth of the
+group in proportion to their weights. In particular, this implies
+that, for each leaf group, every process of the group receives the
+same share of the whole group bandwidth, unless the ioprio of the
+process is modified.
+
+The resource-sharing guarantee for a group may partially or totally
+switch from bandwidth to time, if providing bandwidth guarantees to
+the group lowers the throughput too much. This switch occurs on a
+per-process basis: if a process of a leaf group causes throughput loss
+if served in such a way to receive its share of the bandwidth, then
+BFQ switches back to just time-based proportional share for that
+process.
+
+4-2 Interface
+-------------
+
+To get proportional sharing of bandwidth with BFQ for a given device,
+BFQ must of course be the active scheduler for that device.
+
+Within each group directory, the names of the files associated with
+BFQ-specific cgroup parameters and stats begin with the "bfq."
+prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for
+BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group
+parameter to set the weight of a group with BFQ is blkio.bfq.weight
+or io.bfq.weight.
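+
+As a quick sketch (the device name is an example, and this assumes the
+scheduler is registered under the name "bfq"), BFQ can be selected for
+a device and its tunables listed as follows:
+
+# echo bfq > /sys/block/sdb/queue/scheduler
+$ cat /sys/block/sdb/queue/scheduler
+$ grep . /sys/block/sdb/queue/iosched/*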
+
+Parameters to set
+-----------------
+
+For each group, there is only the following parameter to set.
+
+weight (namely blkio.bfq.weight or io.bfq.weight): the weight of the
+group inside its parent. Available values: 1..10000 (default 100). The
+linear mapping between ioprio and weights, described at the beginning
+of the tunables section, is still valid, but all weights higher than
+IOPRIO_BE_NR*10 are mapped to ioprio 0.
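+
+For instance, with cgroups-v2 (the group name and value are
+illustrative, and this assumes the io controller is enabled for the
+group):
+
+# mkdir /sys/fs/cgroup/mygroup
+# echo 300 > /sys/fs/cgroup/mygroup/io.bfq.weight
+
+With cgroups-v1 and the blkio controller, the equivalent file is
+blkio.bfq.weight in the group's directory.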
+
+
+[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O
+    Scheduler", Proceedings of the First Workshop on Mobile System
+    Technologies (MST-2015), May 2015.
+    http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf
+
+[2] P. Valente and M. Andreolini, "Improving Application
+    Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of
+    the 5th Annual International Systems and Storage Conference
+    (SYSTOR '12), June 2012.
+    Slightly extended version:
+    http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite-results.pdf
diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt
index 2a39040..2847219 100644
--- a/Documentation/block/queue-sysfs.txt
+++ b/Documentation/block/queue-sysfs.txt
@@ -169,5 +169,18 @@ This is the number of bytes the device can write in a single write-same
 command.  A value of '0' means write-same is not supported by this
 device.
 
+wb_lat_usec (RW)
+----------------
+If the device is registered for writeback throttling, then this file shows
+the target minimum read latency. If this latency is exceeded in a given
+window of time (see wb_window_usec), then the writeback throttling will start
+scaling back writes.
+
+wb_window_usec (RW)
+-------------------
+If the device is registered for writeback throttling, then this file shows
+the value of the monitoring window in which we'll look at the target
+latency. See wb_lat_usec.
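+
+For example (the device name and values are illustrative):
+
+$ cat /sys/block/sdb/queue/wb_lat_usec
+75000
+# echo 50000 > /sys/block/sdb/queue/wb_lat_usec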
+
 
 Jens Axboe <jens.axboe@oracle.com>, February 2009
diff --git b/Documentation/filesystems/aufs/README b/Documentation/filesystems/aufs/README
new file mode 100644
index 0000000..36df674
--- /dev/null
+++ b/Documentation/filesystems/aufs/README
@@ -0,0 +1,392 @@
+
+Aufs4 -- advanced multi layered unification filesystem version 4.x
+http://aufs.sf.net
+Junjiro R. Okajima
+
+
+0. Introduction
+----------------------------------------
+In the early days, aufs was an entirely re-designed and re-implemented
+version of the Unionfs 1.x series. Adding many original ideas,
+approaches, improvements and implementations, it became totally
+different from Unionfs while keeping the basic features.
+More recently, the Unionfs 2.x series began taking some of the same
+approaches as aufs1.
+Unionfs is being developed by Professor Erez Zadok at Stony Brook
+University and his team.
+
+Aufs4 supports linux-4.0 and later, and for linux-3.x series try aufs3.
+If you want older kernel version support, try aufs2-2.6.git or
+aufs2-standalone.git repository, aufs1 from CVS on SourceForge.
+
+Note: it became clear that "Aufs was rejected. Let's give it up."
+      According to Christoph Hellwig, Linux rejects all union-type
+      filesystems but UnionMount.
+<http://marc.info/?l=linux-kernel&m=123938533724484&w=2>
+
+PS. Al Viro seems to have a plan to merge aufs as well as overlayfs and
+    UnionMount, and he pointed out an issue around a directory mutex
+    lock, which aufs addressed. But it is still uncertain whether aufs
+    (or any other union solution) will be merged.
+<http://marc.info/?l=linux-kernel&m=136312705029295&w=1>
+
+
+1. Features
+----------------------------------------
+- unite several directories into a single virtual filesystem. The member
+  directory is called a branch.
+- you can specify the permission flags of a branch, which are 'readonly',
+  'readwrite' and 'whiteout-able.'
+- through the upper writable branch, internal copy-up and whiteout,
+  files/dirs on a readonly branch become logically modifiable.
+- dynamic branch manipulation: add, del.
+- etc...
+
+Also there are many enhancements in aufs, such as:
+- test only the highest one for the directory permission (dirperm1)
+- copyup on open (coo=)
+- 'move' policy for copy-up between two writable branches, after
+  checking free space.
+- xattr, acl
+- readdir(3) in userspace.
+- keep inode number by external inode number table
+- keep the timestamps of file/dir in internal copyup operation
+- seekable directory, supporting NFS readdir.
+- whiteouts are hardlinked in order to reduce the consumption of inodes
+  on the branch
+- do not copy up, nor create a whiteout, when it is unnecessary
+- revert a single systemcall when an error occurs in aufs
+- remount interface instead of ioctl
+- maintain /etc/mtab by an external command, /sbin/mount.aufs.
+- loopback mounted filesystem as a branch
+- kernel thread for removing a dir which has plenty of whiteouts
+- support copy-up of sparse files (files which have a 'hole' in them)
+- default permission flags for branches
+- selectable permission flags for ro branch, whether whiteout can
+  exist or not
+- export via NFS.
+- support <sysfs>/fs/aufs and <debugfs>/aufs.
+- support multiple writable branches, some policies to select one
+  among multiple writable branches.
+- a new semantics for link(2) and rename(2) to support multiple
+  writable branches.
+- no glibc changes are required.
+- pseudo hardlink (hardlink over branches)
+- allow direct manual access to a file on a branch, i.e. bypassing aufs,
+  including NFS or remote filesystem branches.
+- userspace wrapper for pathconf(3)/fpathconf(3) with _PC_LINK_MAX.
+- and more...
+
+Currently these features are temporarily dropped from aufs4.
+See design/08plan.txt for details.
+- nested mount, i.e. aufs as readonly no-whiteout branch of another aufs
+  (robr)
+- statistics of aufs thread (/sys/fs/aufs/stat)
+
+Features or ideas for the future (see also design/*.txt):
+- reorder the branch index without del/re-add.
+- permanent xino files for NFSD
+- an option for refreshing the opened files after add/del branches
+- light version, without branch manipulation. (unnecessary?)
+- copyup in userspace
+- inotify in userspace
+- readv/writev
+
+
+2. Download
+----------------------------------------
+There are three GIT trees for aufs4, aufs4-linux.git,
+aufs4-standalone.git, and aufs-util.git. Note that there is no "4" in
+"aufs-util.git."
+While aufs-util is always necessary, you need either aufs4-linux or
+aufs4-standalone.
+
+The aufs4-linux tree includes the whole linux mainline GIT tree,
+git://git.kernel.org/.../torvalds/linux.git.
+You cannot select CONFIG_AUFS_FS=m for this version, i.e. you cannot
+build aufs4 as an external kernel module.
+Several extra patches are not included in this tree. Only
+aufs4-standalone tree contains them. They are described in the later
+section "Configuration and Compilation."
+
+On the other hand, the aufs4-standalone tree has only aufs source files
+and necessary patches, and you can select CONFIG_AUFS_FS=m.
+But you need to apply all aufs patches manually.
+
+You will find GIT branches whose names are in the form "aufs4.x", where
+"x" represents the linux kernel version, "linux-4.x". For instance,
+"aufs4.0" is for linux-4.0. For the latest "linux-4.x-rcN", use the
+"aufs4.x-rcN" branch.
+
+o aufs4-linux tree
+$ git clone --reference /your/linux/git/tree \
+	git://github.com/sfjro/aufs4-linux.git aufs4-linux.git
+- if you don't have linux GIT tree, then remove "--reference ..."
+$ cd aufs4-linux.git
+$ git checkout origin/aufs4.0
+
+Or you may want to directly git-pull aufs into your linux GIT tree and
+leave the patch-work to GIT.
+$ cd /your/linux/git/tree
+$ git remote add aufs4 git://github.com/sfjro/aufs4-linux.git
+$ git fetch aufs4
+$ git checkout -b my4.0 v4.0
+$ (add your local change...)
+$ git pull aufs4 aufs4.0
+- now you have v4.0 + your_changes + aufs4.0 in your my4.0 branch.
+- you may need to solve some conflicts between your_changes and
+  aufs4.0. In this case, git-rerere is recommended so that you can
+  solve similar conflicts automatically when you upgrade to 4.1 or
+  later in the future.
+
+o aufs4-standalone tree
+$ git clone git://github.com/sfjro/aufs4-standalone.git aufs4-standalone.git
+$ cd aufs4-standalone.git
+$ git checkout origin/aufs4.0
+
+o aufs-util tree
+$ git clone git://git.code.sf.net/p/aufs/aufs-util aufs-util.git
+- note that the public aufs-util.git is on SourceForge instead of
+  GitHub.
+$ cd aufs-util.git
+$ git checkout origin/aufs4.0
+
+Note: The 4.x-rcN branch is to be used with `rc' kernel versions ONLY.
+The minor version number, 'x' in '4.x', of aufs may not always follow
+the minor version number of the kernel, because changes in the kernel
+that cause the use of a new minor version number do not always require
+changes to aufs-util.
+
+Since aufs-util has its own minor version number, you may not be
+able to find a GIT branch in aufs-util for your kernel's
+exact minor version number.
+In this case, you should git-checkout the branch for the
+nearest lower number.
+
+For (an unreleased) example:
+If you are using "linux-4.10" and the "aufs4.10" branch
+does not exist in the aufs-util repository, then "aufs4.9", "aufs4.8"
+or something numerically smaller is the branch for your kernel.
+
+You can also view all branches with
+	$ git branch -a
+
+
+3. Configuration and Compilation
+----------------------------------------
+Make sure you have git-checkout'ed the correct branch.
+
+For aufs4-linux tree,
+- enable CONFIG_AUFS_FS.
+- set other aufs configurations if necessary.
+
+For aufs4-standalone tree,
+There are several ways to build.
+
+1.
+- apply ./aufs4-kbuild.patch to your kernel source files.
+- apply ./aufs4-base.patch too.
+- apply ./aufs4-mmap.patch too.
+- apply ./aufs4-standalone.patch too, if you plan to set
+  CONFIG_AUFS_FS=m. Otherwise you don't need ./aufs4-standalone.patch.
+- copy ./{Documentation,fs,include/uapi/linux/aufs_type.h} files to your
+  kernel source tree. Never copy $PWD/include/uapi/linux/Kbuild.
+- enable CONFIG_AUFS_FS, you can select either
+  =m or =y.
+- and build your kernel as usual.
+- install the built kernel.
+  Note: Since linux-3.9, every filesystem module requires an alias
+  "fs-<fsname>". You should make sure that "fs-aufs" is listed in your
+  modules.aliases file if you set CONFIG_AUFS_FS=m.
+- install the header files too, by "make headers_install", to the
+  directory you specify. By default, it is $PWD/usr.
+  "make help" shows a brief note on headers_install.
+- and reboot your system.
+
+2.
+- module only (CONFIG_AUFS_FS=m).
+- apply ./aufs4-base.patch to your kernel source files.
+- apply ./aufs4-mmap.patch too.
+- apply ./aufs4-standalone.patch too.
+- build your kernel, don't forget "make headers_install", and reboot.
+- edit ./config.mk and set other aufs configurations if necessary.
+  Note: You should read $PWD/fs/aufs/Kconfig carefully, as it describes
+  every aufs configuration option.
+- build the module by simple "make".
+  Note: Since linux-3.9, every filesystem module requires an alias
+  "fs-<fsname>". You should make sure that "fs-aufs" is listed in your
+  modules.aliases file.
+- you can specify the ${KDIR} make variable, which points to your kernel
+  source tree.
+- install the files
+  + run "make install" to install the aufs module, or copy the built
+    $PWD/aufs.ko to /lib/modules/... and run depmod -a (or reboot simply).
+  + run "make install_headers" (instead of headers_install) to install
+    the modified aufs header file (you can specify DESTDIR which is
+    available in aufs standalone version's Makefile only), or copy
+    $PWD/usr/include/linux/aufs_type.h to /usr/include/linux or wherever
+    you like manually. By default, the target directory is $PWD/usr.
+- no need to apply aufs4-kbuild.patch, nor copying source files to your
+  kernel source tree.
+
+Note: The header file aufs_type.h is necessary to build aufs-util,
+      as well as "make headers_install" in the kernel source tree.
+      headers_install tends to be forgotten, but it is essentially
+      necessary, and not only for building aufs-util.
+      You may not encounter problems without headers_install with some
+      older versions, though.
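+
+As a condensed, illustrative sketch of the module-only build described
+above (the paths are examples; it assumes the aufs patches have already
+been applied to the kernel and the kernel rebuilt with headers_install):
+
+$ cd aufs4-standalone.git
+$ git checkout origin/aufs4.0
+$ make KDIR=/lib/modules/$(uname -r)/build
+# make install
+# make install_headers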
+
+And then,
+- read README in aufs-util, build and install it
+- note that your distribution may contain an obsolete version of
+  aufs_type.h in /usr/include/linux or somewhere similar. When you build
+  the aufs utilities, make sure that your compiler refers to the correct
+  aufs header file, which is built by "make headers_install."
+- if you want to use readdir(3) in userspace or the pathconf(3) wrapper,
+  then run "make install_ulib" too, and refer to the aufs manual for
+  details.
+
+There are several other patches in aufs4-standalone.git. They are all
+optional. When you meet some problems, they may help you.
+- aufs4-loopback.patch
+  Supports a nested loopback mount in a branch-fs. This patch is
+  unnecessary until aufs produces a message like "you may want to try
+  another patch for loopback file".
+- vfs-ino.patch
+  Modifies the system-global kernel-internal function get_next_ino() in
+  order to stop assigning 0 as an inode number. Not directly related to
+  aufs, but generally recommended.
+- tmpfs-idr.patch
+  Keeps the tmpfs inode number as the lowest value. Effective to reduce
+  the size of aufs XINO files for tmpfs branch. Also it prevents the
+  duplication of inode number, which is important for backup tools and
+  other utilities. When you find aufs XINO files for tmpfs branch
+  growing too much, try this patch.
+- lockdep-debug.patch
+  Because aufs is not only an ordinary filesystem (callee of VFS), but
+  also a caller of VFS functions for branch filesystems, subclassing of
+  the internal locks for LOCKDEP is necessary. LOCKDEP is a debugging
+  feature of the linux kernel. If you enable CONFIG_LOCKDEP, then you
+  will need to apply this debug patch to expand several constant values.
+  If you don't know what LOCKDEP is, then you don't have to apply this
+  patch.
+
+
+4. Usage
+----------------------------------------
+First, make sure aufs-util is installed, and please read the aufs
+manual, aufs.5 in the aufs-util.git tree.
+$ man -l aufs.5
+
+And then,
+$ mkdir /tmp/rw /tmp/aufs
+# mount -t aufs -o br=/tmp/rw:${HOME} none /tmp/aufs
+
+Here is another example. The result is equivalent.
+# mount -t aufs -o br=/tmp/rw=rw:${HOME}=ro none /tmp/aufs
+  Or
+# mount -t aufs -o br:/tmp/rw none /tmp/aufs
+# mount -o remount,append:${HOME} /tmp/aufs
+
+Then you can see the whole tree of your home dir through /tmp/aufs. If
+you modify a file under /tmp/aufs, the one in your home directory is
+not affected; instead a file with the same name is newly created under
+/tmp/rw, and all of your modifications to the file are applied to the
+one under /tmp/rw. This is called the file-based Copy-on-Write (COW)
+method.
+Aufs mount options are described in aufs.5.
+If you run chroot or something similar and make your aufs the root
+directory, then you need to customize the shutdown script. See the aufs
+manual for details.
+
+Additionally, there are some sample usages of aufs, such as a diskless
+system with network booting, and a LiveCD over NFS.
+See the sample dir in the CVS tree on SourceForge.
+
+
+5. Contact
+----------------------------------------
+When you have any problems or see strange behaviour in aufs, please let
+me know, providing:
+- /proc/mounts (instead of the output of mount(8))
+- /sys/module/aufs/*
+- /sys/fs/aufs/* (if you have them)
+- /debug/aufs/* (if you have them)
+- linux kernel version
+  if your kernel is not plain, for example modified by a distributor,
+  the URL where I can download its source is necessary too.
+- the aufs version which was printed when loading the module or booting
+  the system, rather than the date you downloaded it.
+- configuration (define/undefine CONFIG_AUFS_xxx)
+- kernel configuration or /proc/config.gz (if you have it)
+- the behaviour which you think is incorrect
+- actual operation, reproducible one is better
+- mailto: aufs-users at lists.sourceforge.net
+
+Usually, I don't watch the Public Areas (Bugs, Support Requests,
+Patches, and Feature Requests) on SourceForge. Please join and write to
+the aufs-users ML.
+
+
+6. Acknowledgements
+----------------------------------------
+Thanks to everyone who has tried and is using aufs, and to whoever has
+reported a bug or given any feedback.
+
+Especially donators:
+Tomas Matejicek (slax.org) made a donation (much more than once).
+	From Apr 2010, Tomas M (the author of Slax and Linux Live
+	scripts) made "doubling" donations.
+	Unfortunately I cannot list all of the donators, but I really
+	appreciate it.
+	That ended in Aug 2010, but the ordinary donation URL is still
+	available.
+	<http://sourceforge.net/donate/index.php?group_id=167503>
+Dai Itasaka made a donation (2007/8).
+Chuck Smith made a donation (2008/4, 10 and 12).
+Henk Schoneveld made a donation (2008/9).
+Chih-Wei Huang, ASUS, CTC donated Eee PC 4G (2008/10).
+Francois Dupoux made a donation (2008/11).
+Bruno Cesar Ribas and Luis Carlos Erpen de Bona, C3SL serves public
+	aufs2 GIT tree (2009/2).
+William Grant made a donation (2009/3).
+Patrick Lane made a donation (2009/4).
+The Mail Archive (mail-archive.com) made donations (2009/5).
+Nippy Networks (Ed Wildgoose) made a donation (2009/7).
+New Dream Network, LLC (www.dreamhost.com) made a donation (2009/11).
+Pavel Pronskiy made a donation (2011/2).
+Iridium and Inmarsat satellite phone retailer (www.mailasail.com), Nippy
+	Networks (Ed Wildgoose) made a donation for hardware (2011/3).
+Max Lekomcev (DOM-TV project) made a donation (2011/7, 12, 2012/3, 6 and
+11).
+Sam Liddicott made a donation (2011/9).
+Era Scarecrow made a donation (2013/4).
+Bor Ratajc made a donation (2013/4).
+Alessandro Gorreta made a donation (2013/4).
+POIRETTE Marc made a donation (2013/4).
+Alessandro Gorreta made a donation (2013/4).
+lauri kasvandik made a donation (2013/5).
+"pemasu from Finland" made a donation (2013/7).
+The Parted Magic Project made a donation (2013/9 and 11).
+Pavel Barta made a donation (2013/10).
+Nikolay Pertsev made a donation (2014/5).
+James B made a donation (2014/7 and 2015/7).
+Stefano Di Biase made a donation (2014/8).
+Daniel Epellei made a donation (2015/1).
+OmegaPhil made a donation (2016/1).
+Tomasz Szewczyk made a donation (2016/4).
+
+Thank you very much.
+Donations, including future ones, are always very important and helpful
+for me to keep on developing aufs.
+
+
+7.
+----------------------------------------
+If you are an experienced user, no explanation is needed. Aufs is
+just a linux filesystem.
+
+
+Enjoy!
+
+# Local variables: ;
+# mode: text;
+# End: ;
diff --git b/Documentation/filesystems/aufs/design/01intro.txt b/Documentation/filesystems/aufs/design/01intro.txt
new file mode 100644
index 0000000..5d01214
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/01intro.txt
@@ -0,0 +1,157 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Introduction
+----------------------------------------
+
+aufs [ei ju: ef es] | [a u f s]
+1. abbrev. for "advanced multi-layered unification filesystem".
+2. abbrev. for "another unionfs".
+3. abbrev. for "auf das" in German which means "on the" in English.
+   Ex. "Butter aufs Brot"(G) means "butter onto bread"(E).
+   But "Filesystem aufs Filesystem" is hard to understand.
+
+AUFS is a filesystem with the following features:
+- multi-layered stackable unification filesystem; a member directory
+  is called a branch.
+- branch permission and attribute, 'readonly', 'real-readonly',
+  'readwrite', 'whiteout-able', 'link-able whiteout', etc. and their
+  combination.
+- internal "file copy-on-write".
+- logical deletion, whiteout.
+- dynamic branch manipulation, adding, deleting and changing permission.
+- allow bypassing aufs, user's direct branch access.
+- external inode number translation table and bitmap which maintains the
+  persistent aufs inode number.
+- seekable directory, including NFS readdir.
+- file mapping, mmap and sharing pages.
+- pseudo-link, hardlink over branches.
+- loopback mounted filesystem as a branch.
+- several policies to select one among multiple writable branches.
+- revert a single systemcall when an error occurs in aufs.
+- and more...
+
+
+Multi Layered Stackable Unification Filesystem
+----------------------------------------------------------------------
+Most people already know what it is.
+It is a filesystem which unifies several directories and provides a
+merged single directory. When users access a file, the access is
+passed/redirected/converted (sorry, I am not sure which English word is
+correct) to the real file on the member filesystem. The member
+filesystem is called a 'lower filesystem' or 'branch' and has a mode,
+'readonly' or 'readwrite.' The deletion of a file on a lower readonly
+branch is handled by creating a 'whiteout' on the upper writable
+branch.
+
+On LKML, there have been discussions about UnionMount (Jan Blunck,
+Bharata B Rao and Valerie Aurora) and Unionfs (Erez Zadok). They took
+different approaches to implement the merged-view.
+The former tries putting it into VFS, and the latter implements as a
+separate filesystem.
+(If I have misunderstood these implementations, please let me know and
+I shall correct it; it has been a long time since I last read their
+source files.)
+
+UnionMount's approach may be able to stay small, but it may be hard to
+share branches between several UnionMount instances, since the whiteout
+in it is implemented in the inode on the branch filesystem and is
+always shared. According to Bharata's post, readdir does not seem to be
+finished yet.
+There are several known missing features in these implementations, such
+as:
+- for users, the inode number may change silently, e.g. on copy-up.
+- link(2) may break by copy-up.
+- read(2) may get an obsoleted filedata (fstat(2) too).
+- fcntl(F_SETLK) may be broken by copy-up.
+- unnecessary copy-up may happen, for example mmap(MAP_PRIVATE) after
+  open(O_RDWR).
+
+In linux-3.18, the "overlay" filesystem (formerly known as "overlayfs")
+was merged into mainline. This is another implementation of UnionMount
+as a separate filesystem. All the limitations and known problems of
+UnionMount are equally inherited by the "overlay" filesystem.
+
+Unionfs has a longer history. When I started implementing a stackable
+filesystem (Aug 2005), it already existed. It has virtual super_block,
+inode, dentry and file objects, and they have an array pointing to the
+lower objects of the same kind. After contributing many patches to
+Unionfs, I re-started my project AUFS (Jun 2006).
+
+In AUFS, the filesystem structure resembles Unionfs, but I implemented
+my own ideas, approaches and enhancements, and it became a totally
+different filesystem.
+
+Comparing a DM snapshot and an fs-based implementation:
+- the number of bytes to be copied between devices is much smaller.
+- the type of filesystem must be one and the same.
+- the fs must be writable; no readonly fs, even for the lower original
+  device. So a compression fs will not be usable. But if we use a
+  loopback mount, we may address this issue.
+  for instance,
+	mount /cdrom/squashfs.img /sq
+	losetup /sq/ext2.img
+	losetup /somewhere/cow
+	dmsetup "snapshot /dev/loop0 /dev/loop1 ..."
+- it will be difficult (or needs more operations) to extract the
+  difference between the original device and COW.
+- DM snapshot-merge may help a lot when users try merging. In the
+  fs-layer union, users will use rsync(1).
+
+You may want to read my old paper "Filesystems in LiveCD"
+(http://aufs.sourceforge.net/aufs2/report/sq/sq.pdf).
+
+
+Several characters/aspects/persona of aufs
+----------------------------------------------------------------------
+
+Aufs has several characters, aspects or persona.
+1. a filesystem, callee of VFS helper
+2. sub-VFS, caller of VFS helper for branches
+3. a virtual filesystem which maintains persistent inode number
+4. reader/writer of files on branches, much like an application
+
+1. Callee of VFS Helper
+As an ordinary linux filesystem, aufs is a callee of VFS. For instance,
+unlink(2) from an application reaches the sys_unlink() kernel function
+and then vfs_unlink() is called. vfs_unlink() is one of the VFS helpers
+and it calls the filesystem-specific unlink operation. Aufs actually
+implements the unlink operation, but it behaves like a redirector.
+
+2. Caller of VFS Helper for Branches
+aufs_unlink() passes the unlink request to the branch filesystem as if
+it were called from VFS. So the called unlink operation of the branch
+filesystem acts as usual. As a caller of VFS helper, aufs should handle
+every necessary pre/post operation for the branch filesystem.
+- acquire the lock for the parent dir on a branch
+- lookup in a branch
+- revalidate dentry on a branch
+- mnt_want_write() for a branch
+- vfs_unlink() for a branch
+- mnt_drop_write() for a branch
+- release the lock on a branch
+
+3. Persistent Inode Number
+One of the most important issues for a filesystem is maintaining inode
+numbers. This is particularly important to support exporting a
+filesystem via NFS. Aufs is a virtual filesystem which doesn't have a
+backend block device of its own, but some storage is necessary to keep
+and maintain the inode numbers. It may be a large space and may not be
+suitable to keep in memory. Aufs rents some space from its first
+writable branch filesystem (by default) and creates file(s) on it.
+These files are created by aufs internally and removed soon (currently)
+while being kept open.
+Note: Because these files are removed, they are totally gone after
+      unmounting aufs. It means the inode numbers are not persistent
+      across unmount or reboot. I have a plan to make them really
+      persistent which will be important for aufs on NFS server.
+
+4. Read/Write Files Internally (copy-on-write)
+Because a branch can be readonly, when you write to a file on it, aufs
+will internally "copy up" the file to the upper writable branch, and
+then write the originally requested data to that file. Generally the
+kernel doesn't open/read/write files actively. In aufs, even a single
+write may cause an internal "file copy". This behaviour is very similar
+to the cp(1) command.
+
+Some people may think it is better to pass such work to a user-space
+helper, instead of doing it in kernel space. Actually I am still
+thinking about it, but currently I have implemented it in kernel space.
diff --git b/Documentation/filesystems/aufs/design/02struct.txt b/Documentation/filesystems/aufs/design/02struct.txt
new file mode 100644
index 0000000..783328a
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/02struct.txt
@@ -0,0 +1,245 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Basic Aufs Internal Structure
+
+Superblock/Inode/Dentry/File Objects
+----------------------------------------------------------------------
+Like an ordinary filesystem, aufs has its own
+superblock/inode/dentry/file objects. All these objects have a
+dynamically allocated array and store pointers to the corresponding
+objects on the lower filesystems, the branches.
+For example, suppose you build a union with one readwrite branch and
+one readonly branch, mounted at /au, /rw and /ro respectively:
+- /au = /rw + /ro
+- /ro/fileA exists but /rw/fileA does not
+
+The aufs lookup operation finds /ro/fileA and gets a dentry for it.
+These pointers are stored in an aufs dentry. The array in the aufs
+dentry will be:
+- [0] = NULL (because /rw/fileA doesn't exist)
+- [1] = /ro/fileA
+
+This style of array is essentially the same for all the aufs
+superblock/inode/dentry/file objects.
+
+Because aufs supports manipulating branches, i.e. add/delete/change
+branches dynamically, these objects have their own generation. When
+branches are changed, the generation in the aufs superblock is
+incremented, and the generation in the other objects is compared when
+they are accessed. When the generation of an object is obsolete, aufs
+refreshes its internal array.
+
+
+Superblock
+----------------------------------------------------------------------
+Additionally, the aufs superblock has some data for the policies to
+select one among multiple writable branches, for XIB files, pseudo-links
+and a kobject.
+See below for details.
+About the policies which support copying down a directory, see
+wbr_policy.txt too.
+
+
+Branch and XINO(External Inode Number Translation Table)
+----------------------------------------------------------------------
+Every branch has its own xino (external inode number translation table)
+file. The xino file is created and unlinked by aufs internally. When two
+members of a union exist on the same filesystem, they share the single
+xino file.
+The structure of a xino file is simple: just a sequence of aufs inode
+numbers, indexed by the lower inode number.
+In the above example, assume the inode number of /ro/fileA is i111 and
+aufs assigns the inode number i999 to fileA. Then aufs writes 999 as a
+4 (or 8) byte value at the offset 111 * 4 (or 8) bytes in the xino file.
+
+When the inode numbers are not contiguous, the xino file will be sparse,
+i.e. have holes in it, and doesn't consume as much disk space as it
+might appear. If your branch filesystem consumes disk space for such
+holes, then you should specify 'xino=' option at mounting aufs.
+
+Aufs has a mount option to free the disk blocks for such holes in XINO
+files on tmpfs or a ramdisk, but it is actually not so effective. If you
+meet a problem of disk shortage due to XINO files, then you should try
+"tmpfs-ino.patch" (and "vfs-ino.patch" too) in aufs4-standalone.git.
+The patch localizes the assignment of inode numbers per tmpfs mount and
+avoids the holes in XINO files.
+
+Also, a writable branch has three kinds of "whiteout bases". All of
+these exist when the branch is joined to aufs, and their names are
+doubly whiteout-ed, so that users will never see their names in the
+aufs hierarchy.
+1. a regular file which will be hardlinked to all whiteouts.
+2. a directory to store pseudo-links.
+3. a directory to store an "orphan"-ed file temporarily.
+
+1. Whiteout Base
+   When you remove a file on a readonly branch, aufs handles it as a
+   logical deletion and creates a whiteout on the upper writable branch
+   as a hardlink of this file in order not to consume inode on the
+   writable branch.
+2. Pseudo-link Dir
+   See below, Pseudo-link.
+3. Step-Parent Dir
+   When "fileC" exists only on the lower readonly branch, is opened and
+   removed together with its parent dir, and then the user writes
+   something into it, aufs copies up fileC to this directory, because
+   there is no other dir to store fileC in. After creating a file under
+   this dir, the file is unlinked.
+
+Because aufs supports manipulating branches, i.e. add/delete/change
+dynamically, a branch has its own id. When the branch order changes,
+aufs finds the new index by searching for the branch id.
+
+
+Pseudo-link
+----------------------------------------------------------------------
+Assume "fileA" exists only on the lower readonly branch and it is
+hardlinked to "fileB" on the same branch. When you write something to
+fileA, aufs copies it up to the upper writable branch. Additionally aufs
+creates a hardlink under the Pseudo-link Directory of the writable
+branch. The inode of a pseudo-link is kept in the aufs super_block as a
+simple list. If fileB is read after unlinking fileA, aufs returns the
+file data from the pseudo-link instead of the lower readonly branch.
+Because the pseudo-link is based upon the inode, keeping the inode
+number via xino (see above) is essential.
+
+All the hardlinks under the Pseudo-link Directory of the writable branch
+should be restored to a proper location later. Aufs provides a utility
+to do this. The userspace helpers are executed when remounting and
+unmounting aufs by default.
+While this utility is running, it puts aufs into the pseudo-link
+maintenance mode. In this mode, only the process which began the
+maintenance mode (and its child processes) is allowed to operate in
+aufs. Some other processes which are not related to the pseudo-links
+will be allowed to run too, but the rest have to return an error or wait
+until the maintenance mode ends. If a process has already acquired an
+inode mutex (in VFS), it has to return an error.
+
+
+XIB(external inode number bitmap)
+----------------------------------------------------------------------
+In addition to the xino file per branch, aufs has an external inode
+number bitmap in the superblock object. It is also an internal file,
+much like a xino file.
+It is a simple bitmap which marks whether each aufs inode number is in
+use or not.
+To reduce the file I/O, aufs prepares a single memory page to cache xib.
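+
+A hedged sketch of the idea (not the aufs code; the names and the
+single-page assumption are simplifications): allocate and release aufs
+inode numbers by searching and flipping bits in the cached page.
+
+	#include <linux/bitops.h>
+	#include <linux/mm.h>
+
+	#define XIB_BITS	(PAGE_SIZE * BITS_PER_BYTE)	/* bits in one cached page */
+
+	/* returns XIB_BITS when no free number is found in this page */
+	static unsigned long xib_alloc(unsigned long *xib_page)
+	{
+		unsigned long bit = find_first_zero_bit(xib_page, XIB_BITS);
+
+		if (bit < XIB_BITS)
+			set_bit(bit, xib_page);	/* mark the inode number as in-use */
+		return bit;
+	}
+
+	static void xib_free(unsigned long *xib_page, unsigned long bit)
+	{
+		clear_bit(bit, xib_page);	/* the number can be reused later */
+	}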
+
+As with the XINO files, aufs has a feature to truncate/refresh the XIB
+to reduce the number of disk blocks consumed by these files.
+
+
+Virtual or Vertical Dir, and Readdir in Userspace
+----------------------------------------------------------------------
+In order to support multiple layers (branches), the aufs readdir
+operation constructs a virtual dir block in memory. For readdir, aufs
+calls vfs_readdir() internally for each dir on the branches, merges
+their entries while eliminating the whiteout-ed ones, and attaches the
+result to the file (dir) object. So the file object keeps its entry
+list until it is closed. The entry list will be updated when the file
+position is zero and the list has become obsolete. This decision is
+made by aufs automatically.
+
+The memory blocks dynamically allocated for the entry names have a unit
+of 512 bytes (by default) and store the names contiguously (no
+padding). Another block for each entry is handled by kmem_cache too.
+While building dir blocks, aufs creates a hash list and judges whether
+each entry is whiteout-ed by an upper branch or is already listed.
+The merged result is cached in the corresponding inode object and
+maintained by a customizable lifetime option.
+
+Some people may say this can be a security hole or invite a DoS attack,
+since an opened and once readdir-ed dir (file object) holds its entry
+list and puts pressure on system memory. But I'd say it is similar to
+files under /proc or /sys. The virtual files there also hold a memory
+page (generally) while they are opened. When an idea to reduce memory
+for them is introduced, it will be applied to aufs too.
+For those who really hate this situation, I've developed a readdir(3)
+library which performs this merging in userspace. You just need to set
+the LD_PRELOAD environment variable, and aufs will not consume any
+memory in kernel space for readdir(3).
+
+
+Workqueue
+----------------------------------------------------------------------
+Aufs sometimes requires privileged access to a branch, for instance in
+a copy-up/down operation. When a user process is going to make changes
+to a file which exists only in the lower readonly branch, the mode of
+one of the ancestor directories may not be writable by the user
+process. Here aufs copies up the file along with its ancestors, and
+this may require privilege to set their owner/group/mode/etc.
+This is a typical case of the application character of aufs (see
+Introduction).
+
+Aufs uses a workqueue synchronously for this case. It creates its own
+workqueue. The workqueue is a kernel thread and has privilege. Aufs
+passes the request to call mkdir or write (for example), and waits for
+its completion. This approach simply solves the problem of signal
+handling.
+If aufs didn't adopt the workqueue and instead changed the privilege of
+the process, then the process might receive an unexpected SIGXFSZ or
+other signals.
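+
+As a rough illustration of this pattern (not the actual aufs code; all
+names below are made up), a request can be queued to a dedicated
+workqueue and the caller simply sleeps until it completes:
+
+	#include <linux/completion.h>
+	#include <linux/kernel.h>
+	#include <linux/workqueue.h>
+
+	/* created once, eg. alloc_workqueue("au_wkq", WQ_MEM_RECLAIM, 0) */
+	static struct workqueue_struct *au_wkq;
+
+	struct au_wkq_req {
+		struct work_struct work;
+		struct completion done;
+		int (*func)(void *arg);		/* eg. a copy-up helper */
+		void *arg;
+		int err;
+	};
+
+	static void au_wkq_func(struct work_struct *work)
+	{
+		struct au_wkq_req *req = container_of(work, struct au_wkq_req, work);
+
+		/* runs in the kernel thread, ie. with its privilege */
+		req->err = req->func(req->arg);
+		complete(&req->done);
+	}
+
+	static int au_wkq_wait(int (*func)(void *), void *arg)
+	{
+		struct au_wkq_req req = { .func = func, .arg = arg };
+
+		INIT_WORK_ONSTACK(&req.work, au_wkq_func);
+		init_completion(&req.done);
+		queue_work(au_wkq, &req.work);
+		wait_for_completion(&req.done);	/* the caller just sleeps here */
+		destroy_work_on_stack(&req.work);
+		return req.err;
+	}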
+
+Aufs also uses the system global workqueue (the "events" kernel thread)
+for asynchronous tasks, such as handling inotify/fsnotify, re-creating
+a whiteout base, etc. This is unrelated to privilege.
+Most aufs operations try to acquire a rw_semaphore for the aufs
+superblock at the beginning, and at the same time wait for the
+completion of all queued asynchronous tasks.
+
+
+Whiteout
+----------------------------------------------------------------------
+The whiteout in aufs is very similar to Unionfs's; it is represented by
+its filename. UnionMount takes the approach of a special file mode, but
+I am afraid several utilities (find(1) or something) would have to
+support it.
+
+Basically a whiteout represents "logical deletion", which stops aufs
+from looking up further, but it also represents "this dir is opaque",
+which likewise stops further lookup.
+
+In aufs, rmdir(2) and rename(2) for a dir make use of whiteouts as
+well. In order to make the several operations within a single
+systemcall revertible, aufs adopts the approach of renaming a directory
+to a temporary unique whiteout-ed name.
+For example, in rename(2) of a dir where the target dir already exists,
+aufs renames the target dir to a temporary unique whiteout-ed name
+before the actual rename on a branch, and then handles the other
+actions (make it opaque, update the attributes, etc). If an error
+happens in these actions, aufs simply renames the whiteout-ed name back
+and returns an error. If all of them succeed, aufs registers a function
+with the system global workqueue to remove the whiteout-ed unique
+temporary name completely and asynchronously.
+
+
+Copy-up
+----------------------------------------------------------------------
+It is a well-known feature or concept.
+When a user modifies a file on a readonly branch, aufs performs
+"copy-up" internally and makes the change to the new file on the upper
+writable branch.
+When the triggering systemcall does not update the timestamps of the
+parent dir, aufs reverts them after the copy-up.
+
+
+Move-down (aufs3.9 and later)
+----------------------------------------------------------------------
+"Copy-up" is one of the essential features in aufs. It copies a file
+from the lower readonly branch to the upper writable branch when a user
+changes something about the file.
+"Move-down" is the opposite action of copy-up. Basically this action is
+run manually instead of automatically and internally.
+For the design and implementation, aufs has to consider these issues.
+- a whiteout for the file may exist on the lower branch.
+- ancestor directories may not exist on the lower branch.
+- diropq for the ancestor directories may exist on the upper branch.
+- free space on the lower branch will be reduced.
+- another access to the file may happen during the move-down, including
+  UDBA (see "Revalidate Dentry and UDBA").
+- the file should be neither hard-linked nor pseudo-linked. they should
+  be handled by the auplink utility later.
+
+Sometimes users want to move a file down from the upper writable branch
+to a lower readonly or writable branch. For instance,
+- the free space of the upper writable branch is about to run out.
+- a new intermediate branch is being created between the upper and
+  lower branches.
+- etc.
+
+For this purpose, use the "aumvdown" command in aufs-util.git.
diff --git b/Documentation/filesystems/aufs/design/03atomic_open.txt b/Documentation/filesystems/aufs/design/03atomic_open.txt
new file mode 100644
index 0000000..741ad6d
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/03atomic_open.txt
@@ -0,0 +1,72 @@
+
+# Copyright (C) 2015-2016 Junjiro R. Okajima
+
+Support for a branch which has its own ->atomic_open()
+----------------------------------------------------------------------
+The filesystems which implement their own ->atomic_open() are not the
+majority. For example NFSv4 does, and aufs should call NFSv4's
+->atomic_open(), particularly for the open(O_CREAT|O_EXCL, 0400)
+case. Without ->atomic_open(), NFSv4 returns an error for this
+open(2). I am not sure whether all filesystems which have
+->atomic_open() behave like this, but NFSv4 surely returns the error.
+
+In order to support ->atomic_open() for aufs, there are a few
+approaches.
+
+A. Introduce aufs_atomic_open()
+   - calls one of VFS:do_last(), lookup_open() or atomic_open() for
+     branch fs.
+B. Introduce aufs_atomic_open() calling create, open and chmod. This is
+   the approach of an aufs user, Pip Cet.
+   - calls aufs_create(), VFS finish_open() and notify_change().
+   - pass fake-mode to finish_open(), and then correct the mode by
+     notify_change().
+C. Extend aufs_open() to call branch fs's ->atomic_open()
+   - no aufs_atomic_open().
+   - aufs_lookup() registers the TID to an aufs internal object.
+   - aufs_create() does nothing when the matching TID is registered, but
+     registers the mode.
+   - aufs_open() calls branch fs's ->atomic_open() when the matching
+     TID is registered.
+D. Extend aufs_open() to re-try branch fs's ->open() with superuser's
+   credential
+   - no aufs_atomic_open().
+   - aufs_create() registers the TID to an internal object. this info
+     represents "this process created this file just now."
+   - when aufs gets EACCES from branch fs's ->open(), then confirm the
+     registered TID and re-try open() with superuser's credential.
+
+Pros and cons for each approach.
+
+A.
+   - straightforward but highly depends upon VFS internals.
+   - the atomic behaviour is kept.
+   - some of the parameters such as nameidata are hard to reproduce for
+     the branch fs.
+   - large overhead.
+B.
+   - easy to implement.
+   - the atomic behaviour is lost.
+C.
+   - the atomic behaviour is kept.
+   - dirty and tricky.
+   - VFS checks whether the file is created correctly after calling
+     ->create(), which means this approach doesn't work.
+D.
+   - easy to implement.
+   - the atomic behaviour is lost.
+   - opening a file with the superuser's credential and giving it to a
+     user process is a bad idea, since the file object keeps the
+     credential in it. It may affect LSM or something. This approach
+     doesn't work either.
+
+Approach A is ideal, but it is hard to implement. So here is a
+variation of A, which is to be implemented.
+
+A-1. Introduce aufs_atomic_open()
+     - calls the branch fs's ->atomic_open() if it exists, otherwise
+       calls vfs_create() and finish_open().
+     - the demerit is that the several checks after the branch fs's
+       ->atomic_open() are lost. In the ordinary case, the checks are
+       done by VFS:do_last(), lookup_open() and atomic_open(). Some can
+       be implemented in aufs, but not all, I am afraid.
diff --git b/Documentation/filesystems/aufs/design/03lookup.txt b/Documentation/filesystems/aufs/design/03lookup.txt
new file mode 100644
index 0000000..5b6b000
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/03lookup.txt
@@ -0,0 +1,100 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Lookup in a Branch
+----------------------------------------------------------------------
+Since aufs has the character of a sub-VFS (see Introduction), it
+operates lookup on branches as VFS does. It may be heavy work, but
+almost all lookup operations in aufs are the simplest case, ie. looking
+up only an entry directly connected to its parent. Digging down the
+directory hierarchy is unnecessary. VFS has a function lookup_one_len()
+for that use, and aufs calls it.
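+
+As a hedged sketch (not the aufs code; the helper name is made up and
+error handling is minimal), looking up a direct child of a known parent
+with lookup_one_len() looks roughly like this:
+
+	#include <linux/fs.h>
+	#include <linux/namei.h>
+	#include <linux/string.h>
+
+	static struct dentry *lookup_child(struct dentry *parent, const char *name)
+	{
+		struct dentry *child;
+
+		/* lookup_one_len() requires the parent inode lock to be held */
+		inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
+		child = lookup_one_len(name, parent, strlen(name));
+		inode_unlock(d_inode(parent));
+		return child;	/* ERR_PTR() on error, possibly a negative dentry */
+	}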
+
+When a branch is a remote filesystem, aufs basically relies upon its
+->d_revalidate(); additionally, aufs forces the hardest revalidation
+tests for it.
+For d_revalidate, aufs implements three levels of revalidation tests.
+See "Revalidate Dentry and UDBA" below for details.
+
+
+Test Only the Highest One for the Directory Permission (dirperm1 option)
+----------------------------------------------------------------------
+Let's try a case study.
+- aufs has two branches, the upper readwrite and the lower readonly.
+  /au = /rw + /ro
+- "dirA" exists under /ro, but not under /rw, and its mode is 0700.
+- a user invokes "chmod a+rx /au/dirA"
+- the internal copy-up is activated, "/rw/dirA" is created, and its
+  permission bits are set to world readable.
+- then, does "/au/dirA" become world readable?
+
+In this case, /ro/dirA is still 0700 since it exists on the readonly
+branch, or it may even be a natively readonly filesystem. If aufs
+respects the lower branch, it should not respond to readdir requests
+from other users. But the user allowed it by chmod. Should aufs really
+reject showing the entries under /ro/dirA?
+
+To be honest, I don't have a good solution for this case. So aufs
+implements the 'dirperm1' and 'nodirperm1' mount options, and leaves it
+to users.
+When dirperm1 is specified, aufs checks only the highest branch for the
+directory permission and shows the entries. Otherwise, as usual, it
+checks every dir existing on all branches and rejects the request.
+
+As a side effect, the dirperm1 option improves the performance of aufs
+because the number of permission checks is reduced when there are many
+branches.
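+
+For illustration only, a mount with dirperm1 might be requested from C
+via mount(2) as below; the branch paths and the mount point are made-up
+examples, and the option string should be checked against the aufs
+manual.
+
+	#include <stdio.h>
+	#include <sys/mount.h>
+
+	int main(void)
+	{
+		/* check only the topmost dir for the directory permission */
+		const char *opts = "br:/rw=rw:/ro=ro,dirperm1";
+
+		if (mount("aufs", "/au", "aufs", 0, opts) < 0) {
+			perror("mount");
+			return 1;
+		}
+		return 0;
+	}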
+
+
+Revalidate Dentry and UDBA (User's Direct Branch Access)
+----------------------------------------------------------------------
+Generally VFS helpers re-validate a dentry as a part of lookup.
+0. digging down the directory hierarchy.
+1. lock the parent dir by its i_mutex.
+2. lookup the final (child) entry.
+3. revalidate it.
+4. call the actual operation (create, unlink, etc.)
+5. unlock the parent dir
+
+If the filesystem implements its own ->d_revalidate() (step 3), then it
+is called. Actually aufs implements it and checks whether the dentry on
+a branch is still valid.
+But it is not enough, because aufs has to release the lock on the
+parent dir on a branch at the end of ->lookup() (step 2) and
+->d_revalidate() (step 3), while the i_mutex of the aufs dir is still
+held by VFS.
+If the file on a branch is changed directly, eg. bypassing aufs, after
+aufs releases the lock, then the subsequent operation may cause some
+unpleasant result.
+
+This situation is a result of the VFS architecture, where ->lookup()
+and ->d_revalidate() are separated. But I would never say it is wrong;
+it is a good design from VFS's point of view. It is just not suitable
+for the sub-VFS character of aufs.
+
+Aufs supports such cases by three levels of revalidation, which are
+selectable by the user.
+1. Simple Revalidate
+   In addition to the native flow in VFS, confirm the child-parent
+   relationship on the branch just after locking the parent dir on the
+   branch in the "actual operation" (step 4). When this validation
+   fails, aufs returns EBUSY. ->d_revalidate() (step 3) in aufs still
+   checks the validity of the dentry on branches.
+2. Monitor Changes Internally by Inotify/Fsnotify
+   In addition to the above, in the "actual operation" (step 4) aufs
+   re-looks-up the dentry on the branch, and returns EBUSY if it finds
+   a different dentry.
+   Additionally, aufs sets an inotify/fsnotify watch for every dir on
+   the branches while it is in the cache. When an event is notified,
+   aufs registers a function to the kernel 'events' thread by
+   schedule_work(), and the function sets a special status in the
+   cached aufs dentry and inode private data. If they are not cached,
+   then aufs has nothing to do. When the same file is accessed through
+   aufs (steps 0-3) later, aufs will detect the status and refresh all
+   the necessary data.
+   In this mode, aufs has to ignore events which are fired by aufs
+   itself.
+3. No Extra Validation
+   This is the simplest level; it doesn't add any additional
+   revalidation test, and skips the revalidation in step 4. It is
+   useful and improves aufs performance when the system surely hides
+   the aufs branches from users, by over-mounting something (or another
+   method).
diff --git b/Documentation/filesystems/aufs/design/04branch.txt b/Documentation/filesystems/aufs/design/04branch.txt
new file mode 100644
index 0000000..e68f4d3
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/04branch.txt
@@ -0,0 +1,61 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Branch Manipulation
+
+Since aufs supports dynamic branch manipulation, ie. adding/removing a
+branch and changing its permission/attribute, there is a lot of work to
+do.
+
+
+Add a Branch
+----------------------------------------------------------------------
+o Confirm the dir being added exists outside of aufs (including the
+  loopback-mount case), and check its various attributes.
+o Initialize the xino file and whiteout bases if necessary.
+  See struct.txt.
+
+o Check the owner/group/mode of the directory
+  When the owner/group/mode of the directory being added differs from
+  the existing branches, aufs issues a warning because it may impose a
+  security risk.
+  For example, when an upper writable branch has a world-writable empty
+  top directory, a malicious user can create any file on the writable
+  branch directly, as if copying-up and modifying it manually. If
+  something like /etc/{passwd,shadow} exists on the lower readonly
+  branch but not on the upper writable branch, and the writable branch
+  is world-writable, then a malicious user may create /etc/passwd on
+  the writable branch directly and the infected file will be valid in
+  aufs.
+  I am afraid it can be a security issue, but aufs can do nothing
+  except produce a warning.
+
+
+Delete a Branch
+----------------------------------------------------------------------
+o Confirm the branch being deleted is not busy
+  Generally speaking, one merit of adopting the "remount" interface to
+  manipulate branches is that it discards caches. When deleting a
+  branch, aufs checks the still cached (and connected) dentries and
+  inodes. If there are any, then they are all in use. An inode without
+  its corresponding dentry can be alive alone (for example, in the
+  inotify/fsnotify case).
+
+  For each cached one, aufs checks whether a same-named entry exists on
+  other branches.
+  If the cached one is a directory, because aufs provides a merged view
+  to users, as long as one dir is left on any branch aufs can show the
+  dir to users. In this case, the branch can be removed from aufs.
+  Otherwise aufs rejects deleting the branch.
+
+  If any file on the branch being deleted is opened by aufs, then aufs
+  rejects deleting it.
+
+
+Modify the Permission of a Branch
+----------------------------------------------------------------------
+o Re-initialize or remove the xino file and whiteout bases if necessary.
+  See struct.txt.
+
+o rw --> ro: Confirm the branch being modified is not busy
+  Aufs rejects the request if any of these conditions is true.
+  - a file on the branch is mmap-ed.
+  - a regular file on the branch is opened for write and there is no
+    same-named entry on an upper branch.
diff --git b/Documentation/filesystems/aufs/design/05wbr_policy.txt b/Documentation/filesystems/aufs/design/05wbr_policy.txt
new file mode 100644
index 0000000..1726d5d
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/05wbr_policy.txt
@@ -0,0 +1,51 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Policies to Select One among Multiple Writable Branches
+----------------------------------------------------------------------
+When the number of writable branches is more than one, aufs has to
+decide the target branch for file creation or copy-up. By default, the
+highest writable branch which has the parent (or an ancestor) dir of
+the target file is chosen (the top-down-parent policy).
+By users' request, aufs implements some other policies to select the
+writable branch: for file creation, round-robin, most-free-space and
+other policies; for copy-up, top-down-parent, bottom-up-parent,
+bottom-up and others.
+
+As expected, the round-robin policy selects the branches in a circular
+manner. When you have two writable branches and create 10 new files, 5
+files will be created on each branch. The mkdir(2) systemcall is an
+exception: when you create 10 new directories, all will be created on
+the same branch.
+The most-free-space policy selects the one which has the most free
+space among the writable branches. The amount of free space is checked
+by aufs internally, and users can specify the refresh time interval.
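+
+The decision itself can be illustrated from userspace with statfs(2).
+This is only a toy sketch of a most-free-space style comparison, not
+the aufs code, and the branch paths are made up.
+
+	#include <stdio.h>
+	#include <sys/vfs.h>
+
+	int main(void)
+	{
+		const char *branches[] = { "/rw0", "/rw1" };
+		unsigned long long best_avail = 0;
+		int i, best = -1;
+
+		for (i = 0; i < 2; i++) {
+			struct statfs st;
+			unsigned long long avail;
+
+			if (statfs(branches[i], &st) < 0)
+				continue;
+			avail = (unsigned long long)st.f_bavail * st.f_bsize;
+			if (avail > best_avail) {	/* keep the emptiest branch */
+				best_avail = avail;
+				best = i;
+			}
+		}
+		if (best >= 0)
+			printf("create on %s (%llu bytes free)\n",
+			       branches[best], best_avail);
+		return 0;
+	}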
+
+The policies for copy-up are simpler:
+top-down-parent is equivalent to the same-named one in the create
+policy,
+bottom-up-parent selects the writable branch where the parent dir
+exists and which is the nearest upper one from the copyup-source,
+bottom-up selects the nearest upper writable branch from the
+copyup-source, regardless of the existence of the parent dir.
+
+There are some rules or exceptions in applying these policies.
+- If there is a readonly branch above the policy-selected branch and
+  the parent dir is marked as opaque (a variation of whiteout), or the
+  target (created) file is whiteout-ed on the upper readonly branch,
+  then the result of the policy is ignored and the target file will be
+  created on the nearest writable branch above the readonly branch.
+- If there is a writable branch above the policy-selected branch and
+  the parent dir is marked as opaque or the target file is whiteout-ed
+  on that branch, then the result of the policy is ignored and the
+  target file will be created on the highest one among the upper
+  writable branches which has the diropq or whiteout. In case of a
+  whiteout, aufs removes it as usual.
+- The link(2) and rename(2) systemcalls are exceptions in every policy.
+  They try to select the branch where the source exists, if possible,
+  since copying-up a large file takes a long time. If that is not
+  possible, ie. the branch where the source exists is readonly, then
+  they follow the copy-up policy.
+- There is an exception for rename(2) when the target exists.
+  If the rename target exists, aufs compares the indices of the
+  branches where the source and the target exist and selects the higher
+  one. If the selected branch is readonly, then aufs follows the
+  copy-up policy.
diff --git b/Documentation/filesystems/aufs/design/06fhsm.txt b/Documentation/filesystems/aufs/design/06fhsm.txt
new file mode 100644
index 0000000..84b46dc
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/06fhsm.txt
@@ -0,0 +1,105 @@
+
+# Copyright (C) 2011-2016 Junjiro R. Okajima
+
+File-based Hierarchical Storage Management (FHSM)
+----------------------------------------------------------------------
+Hierarchical Storage Management (or HSM) is a well-known feature in the
+storage world. Aufs provides this feature as a file-based one with
+multiple writable branches, based upon the principle of "the Colder,
+the Lower".
+Here the word "colder" refers to less used files, and "lower" to the
+position in the vertical order of the stacked branches.
+These multiple writable branches are prioritized, ie. the topmost one
+should be the fastest drive and be used heavily.
+
+o Characters in aufs FHSM story
+- aufs itself and a new branch attribute.
+- a new ioctl interface to move-down and to establish a connection with
+  the daemon ("move-down" is a converse of "copy-up").
+- userspace tool and daemon.
+
+The userspace daemon establishes a connection with aufs and waits for
+the notification. The notified information is very similar to struct
+statfs containing the number of consumed blocks and inodes.
+When the consumed blocks/inodes of a branch exceed the user-specified
+upper watermark, the daemon activates its move-down process until the
+consumed blocks/inodes reach the user-specified lower watermark.
+
+The actual move-down is done by aufs based upon the request from
+user-space since we need to maintain the inode number and the internal
+pointer arrays in aufs.
+
+Currently aufs FHSM handles regular files only. Additionally they must
+be neither hard-linked nor pseudo-linked.
+
+
+o Cowork of aufs and the user-space daemon
+  While the userspace daemon keeps the connection established, aufs
+  sends a small notification to it whenever aufs writes something into
+  a writable branch. But this may be costly since aufs issues statfs(2)
+  internally. So the user can specify a new option to cache the
+  info. Actually the notification is controlled by these factors.
+  + the specified cache time.
+  + whether it is classified as "force" by aufs internally.
+  Until the specified time expires, aufs doesn't send the info
+  except in the forced cases. When aufs decides to force, the info is
+  always notified to userspace.
+  For example, the number of free inodes is generally large enough and
+  a shortage of them happens rarely. So aufs doesn't force the
+  notification when creating a new file, directory and so on. This is
+  the typical case in which aufs doesn't force.
+  When aufs writes the actual file data and the file consumes any new
+  blocks, aufs forces the notification.
+
+
+o Interfaces in aufs
+- New branch attribute.
+  + fhsm
+    Specifies that the branch is managed by the FHSM feature; in other
+    words, it is a participant in FHSM.
+    When nofhsm is set on the branch, it will not be the source/target
+    branch of a move-down operation. This attribute is set
+    independently of the coo and moo attributes, and if you want full
+    FHSM, you should specify them as well.
+- New mount option.
+  + fhsm_sec
+    Specifies a time in seconds to suppress the notification of less
+    important info.
+- New ioctl.
+  + AUFS_CTL_FHSM_FD
+    creates a new file descriptor from which userspace can read the
+    notification (a subset of struct statfs) from aufs. See the sketch
+    after this list.
+- Module parameter 'brs'
+  It has to be set to 1. Otherwise the new mount option 'fhsm' will not
+  be set.
+- mount helpers /sbin/mount.aufs and /sbin/umount.aufs
+  When there are two or more branches with fhsm attributes,
+  /sbin/mount.aufs invokes the user-space daemon and /sbin/umount.aufs
+  terminates it. As a result of remounting and branch manipulation, the
+  number of branches with the fhsm attribute can become one. In this
+  case, /sbin/mount.aufs will terminate the user-space daemon.
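+
+Below is a rough userspace sketch of the AUFS_CTL_FHSM_FD ioctl
+mentioned above. The header location, the ioctl argument and the format
+of the records read back are assumptions here; the aufs-util daemon is
+the reference implementation.
+
+	#include <fcntl.h>
+	#include <stdio.h>
+	#include <sys/ioctl.h>
+	#include <unistd.h>
+	#include <linux/aufs_type.h>	/* assumed location of AUFS_CTL_FHSM_FD */
+
+	int main(void)
+	{
+		char buf[4096];
+		ssize_t len;
+		int fhsm, root = open("/mnt/aufs", O_RDONLY);	/* the aufs mount point */
+
+		if (root < 0)
+			return 1;
+		fhsm = ioctl(root, AUFS_CTL_FHSM_FD, 0 /* argument: assumption */);
+		if (fhsm < 0) {
+			perror("AUFS_CTL_FHSM_FD");
+			return 1;
+		}
+		/* blocks until aufs notifies the consumed blocks/inodes of a branch */
+		len = read(fhsm, buf, sizeof(buf));
+		printf("received %zd bytes of statfs-like notification\n", len);
+		return 0;
+	}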
+
+
+Finally, the operation is done in kernel-space by these steps.
+- make sure that,
+  + no one else is using the file.
+  + the file is not hard-linked.
+  + the file is not pseudo-linked.
+  + the file is a regular file.
+  + the parent dir is not opaqued.
+- find the target writable branch.
+- make sure the file is not whiteout-ed by the upper (than the target)
+  branch.
+- make the parent dir on the target branch.
+- mutex lock the inode on the branch.
+- unlink the whiteout on the target branch (if exists).
+- lookup and create the whiteout-ed temporary name on the target branch.
+- copy the file as the whiteout-ed temporary name on the target branch.
+- rename the whiteout-ed temporary name to the original name.
+- unlink the file on the source branch.
+- maintain the internal pointer array and the external inode number
+  table (XINO).
+- maintain the timestamps and other attributes of the parent dir and the
+  file.
+
+And of course, in every step, an error may happen. So the operation
+should restore the original file state after an error happens.
diff --git b/Documentation/filesystems/aufs/design/06mmap.txt b/Documentation/filesystems/aufs/design/06mmap.txt
new file mode 100644
index 0000000..991c0b1
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/06mmap.txt
@@ -0,0 +1,59 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+mmap(2) -- File Memory Mapping
+----------------------------------------------------------------------
+In aufs, file-mapped pages are handled by the branch fs directly, with
+no interaction with aufs. It means aufs_mmap() calls the branch fs's
+->mmap().
+This approach is simple and good, but there is one problem.
+Under /proc, several entries show the mmapped files by their path (with
+device and inode number), and the printed path will be the path on the
+branch fs instead of on the virtual aufs.
+This is not a problem in most cases, but some utilities such as lsof(1)
+(and their users) may expect the path on aufs.
+
+To address this issue, aufs adds a new member called vm_prfile in struct
+vm_area_struct (and struct vm_region). The original vm_file points to
+the file on the branch fs in order to handle everything correctly as
+usual. The new vm_prfile points to a virtual file in aufs, and the
+show-functions in procfs refer to vm_prfile if it is set.
+We also need to maintain several other places which touch vm_file,
+such as:
+- fork()/clone() copies the vma and the reference count of vm_file is
+  incremented.
+- merging vmas maintains the ref count too.
+
+This is not an elegant approach; it just fakes the printed path. But it
+leaves all behaviour around f_mapping unchanged, which is surely an
+advantage.
+Actually aufs had previously adopted another, complicated approach
+which calls generic_file_mmap() and handles struct
+vm_operations_struct. With that approach, aufs met a hard problem and I
+could not solve it without switching approaches.
+
+There may be yet another approach, which is:
+- bind-mount the branch-root onto the aufs-root internally
+- grab the new vfsmount (ie. struct mount)
+- lazy-umount the branch-root internally
+- in open(2) the aufs-file, open the branch-file with the hidden
+  vfsmount (instead of the original branch's vfsmount)
+- ideally this "bind-mount and lazy-umount" should be done atomically,
+  but it may be possible from userspace by the mount helper.
+
+By adding the internal hidden vfsmount and using it when opening a
+file, the file path under /proc would be printed correctly. This
+approach looks smarter, but is not possible, I am afraid.
+- the aufs-root may be bind-mounted later. when that happens, another
+  hidden vfsmount will be required.
+- it is hard to get the chance to bind-mount and lazy-umount
+  + in kernel-space, an FS can get the vfsmount in open(2) via
+    file->f_path, and aufs can know its vfsmount. But several locks are
+    already acquired, and if aufs tries to bind-mount and lazy-umount
+    here, then it may cause a deadlock.
+  + in user-space, bind-mount doesn't invoke the mount helper.
+- since /proc shows dev and ino, aufs has to give the vma this info. it
+  means a new member vm_prinode would be necessary. this is essentially
+  equivalent to vm_prfile described above.
+
+I have to give up this "looks-smarter" approach.
diff --git b/Documentation/filesystems/aufs/design/06xattr.txt b/Documentation/filesystems/aufs/design/06xattr.txt
new file mode 100644
index 0000000..7bfa94f
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/06xattr.txt
@@ -0,0 +1,81 @@
+
+# Copyright (C) 2014-2016 Junjiro R. Okajima
+
+Listing XATTR/EA and getting the value
+----------------------------------------------------------------------
+For the standard inode attributes (owner, group, timestamps, etc.),
+aufs shows the values from the topmost existing file. This behaviour is
+good for non-dir entries since the behaviour exactly matches the shown
+information. But for directories, aufs considers all the same-named
+entries on the lower branches, which means that if one of the lower
+entries rejects a readdir call, then aufs returns an error even if the
+topmost entry allows it. This behaviour is necessary to respect the
+branch fs's security, but can confuse users since the user-visible
+standard attributes don't match the behaviour.
+To address this issue, aufs has a mount option called dirperm1 which
+checks the permission of the topmost entry only and ignores the lower
+entries' permissions.
+
+A similar issue can happen around XATTR.
+The getxattr(2) and listxattr(2) families behave as if the dirperm1
+option were always set. Otherwise these very unpleasant situations
+would happen.
+- listxattr(2) may return duplicated entries.
+- users may never be able to remove or reset the XATTR.
+
+
+XATTR/EA support in the internal (copy,move)-(up,down)
+----------------------------------------------------------------------
+Generally the extended attributes of an inode are categorized as
+follows.
+- "security" for LSM and capabilities.
+- "system" for POSIX ACLs; the 'acl' mount option is generally required
+  for the branch fs.
+- "trusted" for userspace; CAP_SYS_ADMIN is required.
+- "user" for userspace; the 'user_xattr' mount option is generally
+  required for the branch fs.
+
+Moreover there are some other categories. Aufs handles these rather
+unpopular categories as ordinary ones, ie. there is no special
+condition or exception.
+
+In copy-up, the support for XATTR on the dst branch may differ from
+that on the src branch. In this case, the copy-up operation will get an
+error and the original user operation which triggered the copy-up will
+fail. It can even happen that every copy-up will fail.
+When both the src and dst branches support XATTR and an error occurs
+while copying XATTR, then the copy-up should obviously fail. That is a
+good reason, and aufs should return an error to userspace. But when
+only the src branch supports that XATTR, aufs should not return an
+error.
+For example, the src branch supports ACLs but the dst branch doesn't,
+because the dst branch may natively not support them or may temporarily
+not support them due to the "noacl" mount option. Of course, the dst
+branch fs may NOT return an error even if the XATTR is not supported;
+that is totally up to the branch fs.
+
+Anyway, when the aufs internal copy-up gets an error from the dst
+branch fs, aufs tries removing the just-copied entry and returns the
+error to userspace. The worst case of this situation is that every
+copy-up will fail.
+
+For the copy-up operation, there are two basic approaches.
+- copy only the specified XATTR (by the categories above), and return
+  the error unconditionally if one happens.
+- copy all XATTR, and ignore errors on the specified categories only.
+
+In order to support XATTR and to implement the correct behaviour, aufs
+chooses the latter approach and introduces some new branch attributes,
+"icexsec", "icexsys", "icextr", "icexusr", and "icexoth".
+They correspond to the XATTR namespaces (see above). Additionally, to be
+convenient, "icex" is also provided which means all "icex*" attributes
+are set (here the word "icex" stands for "ignore copy-error on XATTR").
+
+The meaning of these attributes is to ignore the error from setting
+XATTR on that branch.
+Note that aufs tries copying all XATTR unconditionally, and ignores the
+error from the dst branch according to the specified attributes.
+
+Some XATTR may have a default value. The default value may come from
+the parent dir or the environment. If the default value is set at
+file-creation time, it will be overwritten by copy-up.
+Some contradiction may happen, I am afraid.
+Do we need another attribute to stop copying XATTR? I am unsure. For
+now, aufs implements the branch attributes to ignore the error.
diff --git b/Documentation/filesystems/aufs/design/07export.txt b/Documentation/filesystems/aufs/design/07export.txt
new file mode 100644
index 0000000..c23930b
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/07export.txt
@@ -0,0 +1,45 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Export Aufs via NFS
+----------------------------------------------------------------------
+Here is an approach.
+- like xino/xib, add a new file 'xigen' which stores aufs inode
+  generation.
+- iget_locked(): initialize the aufs inode generation for a new inode,
+  and store it in the xigen file.
+- destroy_inode(): increment the aufs inode generation and store it in
+  the xigen file. it is necessary even if the inode is not unlinked,
+  because any data of the inode may be changed by UDBA.
+- encode_fh(): for a root dir, simply return FILEID_ROOT. otherwise
+  build the file handle from (see the sketch at the end of this file):
+  + branch id (4 bytes)
+  + superblock generation (4 bytes)
+  + inode number (4 or 8 bytes)
+  + parent dir inode number (4 or 8 bytes)
+  + inode generation (4 bytes)
+  + return value of exportfs_encode_fh() for the parent on a branch (4
+    bytes)
+  + file handle for a branch (by exportfs_encode_fh())
+- fh_to_dentry():
+  + find the index of a branch from its id in the handle, and check it
+    still exists in aufs.
+  + 1st level: get the inode number from the handle and search it in
+    the cache.
+  + 2nd level: if not found in the cache, get the parent inode number
+    from the handle and search it in the cache. and then open the found
+    parent dir, find the matching inode number by vfs_readdir() and get
+    its name, and call lookup_one_len() for the target dentry.
+  + 3rd level: if the parent dir is not cached, call
+    exportfs_decode_fh() for a branch and get the parent on the branch,
+    build a pathname of it, convert it to a pathname in aufs, and call
+    path_lookup(). now aufs gets a parent dir dentry, then handles it
+    as in the 2nd level.
+  + to open the dir, aufs needs struct vfsmount. aufs keeps a vfsmount
+    for every branch, but not for itself. to get this, (currently) aufs
+    searches in the current->nsproxy->mnt_ns list. it may not be a good
+    idea, but I didn't find another approach.
+  + test the generation of the obtained inode.
+- every inode operation: they may get EBUSY due to UDBA. in this case,
+  convert it into ESTALE for NFSD.
+- readdir(): call lockdep_on/off() because filldir in NFSD calls
+  lookup_one_len(), vfs_getattr(), encode_fh() and others.
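+
+As referenced in the encode_fh step above, the handle layout can be
+pictured roughly as the following struct; the struct name and the exact
+field types are assumptions, not the aufs definition.
+
+	#include <stdint.h>
+
+	struct aufs_nfs_fh_sketch {
+		uint32_t brid;		/* branch id */
+		uint32_t sigen;		/* superblock generation */
+		uint64_t ino;		/* aufs inode number (4 or 8 bytes) */
+		uint64_t dir_ino;	/* parent dir inode number (4 or 8 bytes) */
+		uint32_t igen;		/* aufs inode generation */
+		uint32_t h_type;	/* return value of exportfs_encode_fh() for the parent */
+		uint32_t h_fh[];	/* file handle for the branch */
+	};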
diff --git b/Documentation/filesystems/aufs/design/08shwh.txt b/Documentation/filesystems/aufs/design/08shwh.txt
new file mode 100644
index 0000000..ad58ebe
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/08shwh.txt
@@ -0,0 +1,39 @@
+
+# Copyright (C) 2005-2016 Junjiro R. Okajima
+
+Show Whiteout Mode (shwh)
+----------------------------------------------------------------------
+Generally aufs hides the names of whiteouts. But in some cases, showing
+them is very useful for users. For instance, when creating a new middle
+layer (branch) by merging existing layers.
+
+(borrowing aufs1 HOW-TO from a user, Michael Towers)
+When you have three branches,
+- Bottom: 'system', squashfs (underlying base system), read-only
+- Middle: 'mods', squashfs, read-only
+- Top: 'overlay', ram (tmpfs), read-write
+
+The top layer is loaded at boot time and saved at shutdown, to preserve
+the changes made to the system during the session.
+When larger changes have been made, or smaller changes have accumulated,
+the size of the saved top layer data grows. At this point, it would be
+nice to be able to merge the two overlay branches ('mods' and 'overlay')
+and rewrite the 'mods' squashfs, clearing the top layer and thus
+restoring save and load speed.
+
+This merging is simplified by the use of another aufs mount, of just the
+two overlay branches using the 'shwh' option.
+# mount -t aufs -o ro,shwh,br:/livesys/overlay=ro+wh:/livesys/mods=rr+wh \
+	aufs /livesys/merge_union
+
+A merged view of these two branches is then available at
+/livesys/merge_union, and the new feature is that the whiteouts are
+visible!
+Note that in 'shwh' mode the aufs mount must be 'ro', which will disable
+writing to all branches. Also the default mode for all branches is 'ro'.
+It is now possible to save the combined contents of the two overlay
+branches to a new squashfs, e.g.:
+# mksquashfs /livesys/merge_union /path/to/newmods.squash
+
+This new squashfs archive can be stored on the boot device and the
+initramfs will use it to replace the old one at the next boot.
diff --git b/Documentation/filesystems/aufs/design/10dynop.txt b/Documentation/filesystems/aufs/design/10dynop.txt
new file mode 100644
index 0000000..49afc58
--- /dev/null
+++ b/Documentation/filesystems/aufs/design/10dynop.txt
@@ -0,0 +1,34 @@
+
+# Copyright (C) 2010-2016 Junjiro R. Okajima
+
+Dynamically customizable FS operations
+----------------------------------------------------------------------
+Generally FS operations (struct inode_operations, struct
+address_space_operations, struct file_operations, etc.) are defined as
+"static const", but that never means an FS has only one set of
+operations. Some FSs have multiple sets of them. For instance, ext2 has
+three sets: one for XIP, one for NOBH, and one for normal.
+Since aufs overrides and redirects these operations, sometimes aufs has
+to change its behaviour according to the branch FS type. More
+importantly, VFS acts differently depending on whether a function
+(member in the struct) is set or not. It means aufs should have several
+sets of operations and select one among them according to the branch FS
+definition.
+
+In order to solve this problem without affecting the behaviour of VFS,
+aufs defines these operations dynamically. For instance, aufs defines a
+dummy direct_IO function for struct address_space_operations, but it
+may not actually be set in the address_space_operations. When the
+branch FS doesn't have it, aufs doesn't set it in its
+address_space_operations, while the function definition itself is still
+alive. So the behaviour itself will not change, and an error will be
+returned when direct_IO is not set.
+
+The lifetime of these dynamically generated operation objects is
+maintained by the aufs branch object. When the branch is removed from
+aufs, the reference counter of the object is decremented, and when it
+reaches zero, the dynamically generated operation object is freed.
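+
+A hedged sketch of this mechanism (not the aufs code; all names are
+made up): copy an operation template, clear the members the branch FS
+lacks, and refcount the object with a kref.
+
+	#include <linux/fs.h>
+	#include <linux/kref.h>
+	#include <linux/slab.h>
+
+	struct dyop {
+		struct kref kref;
+		struct address_space_operations aop;
+	};
+
+	/* hypothetical template with every member (including direct_IO) set */
+	extern const struct address_space_operations dyop_template;
+
+	static void dyop_release(struct kref *kref)
+	{
+		kfree(container_of(kref, struct dyop, kref));
+	}
+
+	static struct dyop *dyop_create(const struct address_space_operations *h_aop)
+	{
+		struct dyop *p = kmalloc(sizeof(*p), GFP_NOFS);
+
+		if (!p)
+			return NULL;
+		kref_init(&p->kref);
+		p->aop = dyop_template;
+		if (!h_aop->direct_IO)
+			p->aop.direct_IO = NULL;	/* keep VFS's "unsupported" behaviour */
+		return p;
+	}
+
+	static void dyop_put(struct dyop *p)
+	{
+		kref_put(&p->kref, dyop_release);
+	}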
+
+This approach is designed mainly to support AIO (io_submit), Direct I/O
+and XIP (DAX).
+Currently this approach is applied to the address_space_operations of
+regular files only.
diff --git b/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt
new file mode 100644
index 0000000..c028200
--- /dev/null
+++ b/Documentation/scheduler/sched-BFS.txt
@@ -0,0 +1,351 @@
+BFS - The Brain Fuck Scheduler by Con Kolivas.
+
+Goals.
+
+The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to
+completely do away with the complex designs of the past for the cpu process
+scheduler and instead implement one that is very simple in basic design.
+The main focus of BFS is to achieve excellent desktop interactivity and
+responsiveness without heuristics and tuning knobs that are difficult to
+understand, impossible to model and predict the effect of, and when tuned to
+one workload cause massive detriment to another.
+
+
+Design summary.
+
+BFS is best described as a single runqueue, O(n) lookup, earliest effective
+virtual deadline first design, loosely based on EEVDF (earliest eligible virtual
+deadline first) and my previous Staircase Deadline scheduler. Each component
+shall be described in order to understand the significance of, and reasoning for
+it. The codebase when the first stable version was released was approximately
+9000 lines less code than the existing mainline linux kernel scheduler (in
+2.6.31). This does not even take into account the removal of documentation and
+the cgroups code that is not used.
+
+Design reasoning.
+
+The single runqueue refers to the queued but not running processes for the
+entire system, regardless of the number of CPUs. The reason for going back to
+a single runqueue design is that once multiple runqueues are introduced,
+per-CPU or otherwise, there will be complex interactions as each runqueue will
+be responsible for the scheduling latency and fairness of the tasks only on its
+own runqueue, and to achieve fairness and low latency across multiple CPUs, any
+advantage in throughput of having CPU local tasks causes other disadvantages.
+This is due to requiring a very complex balancing system to at best achieve some
+semblance of fairness across CPUs and can only maintain relatively low latency
+for tasks bound to the same CPUs, not across them. To increase said fairness
+and latency across CPUs, the advantage of local runqueue locking, which makes
+for better scalability, is lost due to having to grab multiple locks.
+
+A significant feature of BFS is that all accounting is done purely based on CPU
+used and nowhere is sleep time used in any way to determine entitlement or
+interactivity. Interactivity "estimators" that use some kind of sleep/run
+algorithm are doomed to fail to detect all interactive tasks, and to falsely tag
+tasks that aren't interactive as being so. The reason for this is that it is
+close to impossible to determine that when a task is sleeping, whether it is
+doing it voluntarily, as in a userspace application waiting for input in the
+form of a mouse click or otherwise, or involuntarily, because it is waiting for
+another thread, process, I/O, kernel activity or whatever. Thus, such an
+estimator will introduce corner cases, and more heuristics will be required to
+cope with those corner cases, introducing more corner cases and failed
+interactivity detection and so on. Interactivity in BFS is built into the design
+by virtue of the fact that tasks that are waking up have not used up their quota
+of CPU time, and have earlier effective deadlines, thereby making it very likely
+they will preempt any CPU bound task of equivalent nice level. See below for
+more information on the virtual deadline mechanism. Even if they do not preempt
+a running task, because the rr interval is guaranteed to have a bounded upper
+limit on how long a task will wait for, it will be scheduled within a timeframe
+that will not cause visible interface jitter.
+
+
+Design details.
+
+Task insertion.
+
+BFS inserts tasks into each relevant queue as an O(1) insertion into a double
+linked list. On insertion, *every* running queue is checked to see if the newly
+queued task can run on any idle queue, or preempt the lowest running task on the
+system. This is how the cross-CPU scheduling of BFS achieves significantly lower
+latency per extra CPU the system has. In this case the lookup is, in the worst
+case scenario, O(n) where n is the number of CPUs on the system.
+
+Data protection.
+
+BFS has one single lock protecting the process local data of every task in the
+global queue. Thus every insertion, removal and modification of task data in the
+global runqueue needs to grab the global lock. However, once a task is taken by
+a CPU, the CPU has its own local data copy of the running process' accounting
+information which only that CPU accesses and modifies (such as during a
+timer tick) thus allowing the accounting data to be updated lockless. Once a
+CPU has taken a task to run, it removes it from the global queue. Thus the
+global queue only ever has, at most,
+
+	(number of tasks requesting cpu time) - (number of logical CPUs) + 1
+
+tasks in the global queue. This value is relevant for the time taken to look up
+tasks during scheduling. This will increase if many tasks have CPU affinity set
+in their policy to limit which CPUs they're allowed to run on, and those tasks
+outnumber the number of CPUs. The +1 is because when rescheduling a task, the CPU's
+currently running task is put back on the queue. Lookup will be described after
+the virtual deadline mechanism is explained.
+
+Virtual deadline.
+
+The key to achieving low latency, scheduling fairness, and "nice level"
+distribution in BFS is entirely in the virtual deadline mechanism. The one
+tunable in BFS is the rr_interval, or "round robin interval". This is the
+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
+tasks of the same nice level will be running for, or looking at it the other
+way around, the longest duration two tasks of the same nice level will be
+delayed for. When a task requests cpu time, it is given a quota (time_slice)
+equal to the rr_interval and a virtual deadline. The virtual deadline is
+offset from the current time in jiffies by this equation:
+
+	jiffies + (prio_ratio * rr_interval)
+
+The prio_ratio is determined as a ratio compared to the baseline of nice -20
+and increases by 10% per nice level. The deadline is a virtual one only in that
+no guarantee is placed that a task will actually be scheduled by this time, but
+it is used to compare which task should go next. There are three components to
+how a task is next chosen. First is time_slice expiration. If a task runs out
+of its time_slice, it is descheduled, the time_slice is refilled, and the
+deadline reset to that formula above. Second is sleep, where a task no longer
+is requesting CPU for whatever reason. The time_slice and deadline are _not_
+adjusted in this case and are just carried over for when the task is next
+scheduled. Third is preemption, and that is when a newly waking task is deemed
+higher priority than a currently running task on any cpu by virtue of the fact
+that it has an earlier virtual deadline than the currently running task. The
+earlier deadline is the key to which task is next chosen for the first and
+second cases. Once a task is descheduled, it is put back on the queue, and an
+O(n) lookup of all queued-but-not-running tasks is done to determine which has
+the earliest deadline and that task is chosen to receive CPU next.
+
+The CPU proportion of different nice tasks works out to differ by approximately
+
+	(prio_ratio difference)^2
+
+The reason it is squared is that a task's deadline does not change while it is
+running unless it runs out of time_slice. Thus, even if the time actually
+passes the deadline of another task that is queued, it will not get CPU time
+unless the current running task deschedules, and the time "base" (jiffies) is
+constantly moving.
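+
+As a toy illustration of the numbers above (not the BFS code), assuming
+a baseline prio_ratio of 1.0 at nice -20 that compounds by 10% per nice
+level and the default rr_interval of 6ms:
+
+	#include <math.h>
+	#include <stdio.h>
+
+	int main(void)
+	{
+		const double rr_interval = 6.0;	/* ms, the default value */
+		int nice;
+
+		for (nice = -20; nice <= 19; nice += 13) {
+			/* +10% per nice level above the nice -20 baseline */
+			double prio_ratio = pow(1.10, nice + 20);
+
+			printf("nice %3d: deadline offset ~ %7.1f ms\n",
+			       nice, prio_ratio * rr_interval);
+		}
+		/*
+		 * Per the note above, the resulting CPU proportion goes
+		 * roughly as the square of the prio_ratio difference.
+		 */
+		return 0;
+	}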
+
+Task lookup.
+
+BFS has 103 priority queues. 100 of these are dedicated to the static priority
+of realtime tasks, and the remaining 3 are, in order of best to worst priority,
+SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority
+scheduling). When a task of these priorities is queued, a bitmap of running
+priorities is set showing which of these priorities has tasks waiting for CPU
+time. When a CPU is made to reschedule, the lookup for the next task to get
+CPU time is performed in the following way:
+
+First the bitmap is checked to see what static priority tasks are queued. If
+any realtime priorities are found, the corresponding queue is checked and the
+first task listed there is taken (provided CPU affinity is suitable) and lookup
+is complete. If the priority corresponds to a SCHED_ISO task, they are also
+taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds
+to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this
+stage, every task in the runlist that corresponds to that priority is checked
+to see which has the earliest set deadline, and (provided it has suitable CPU
+affinity) it is taken off the runqueue and given the CPU. If a task has an
+expired deadline, it is taken and the rest of the lookup aborted (as they are
+chosen in FIFO order).
+
+Thus, the lookup is O(n) in the worst case only, where n is as described
+earlier, as tasks may be chosen before the whole task list is looked over.
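+
+A hedged sketch of the O(n) part of that lookup (not the BFS code; the
+names are made up): scan the queued tasks for the earliest, or an
+already expired, virtual deadline.
+
+	#include <linux/jiffies.h>
+	#include <linux/list.h>
+
+	struct toy_task {
+		struct list_head run_list;
+		unsigned long deadline;		/* virtual deadline, in jiffies */
+	};
+
+	/* pick the queued task with the earliest (or an expired) deadline */
+	static struct toy_task *earliest_deadline_task(struct list_head *queue)
+	{
+		struct toy_task *t, *best = NULL;
+
+		list_for_each_entry(t, queue, run_list) {
+			if (time_is_before_jiffies(t->deadline))
+				return t;	/* expired: take it, FIFO-style */
+			if (!best || time_before(t->deadline, best->deadline))
+				best = t;
+		}
+		return best;			/* O(n) over the queued tasks */
+	}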
+
+
+Scalability.
+
+The major limitations of BFS will be that of scalability, as the separate
+runqueue designs will have less lock contention as the number of CPUs rises.
+However they do not scale linearly even with separate runqueues as multiple
+runqueues will need to be locked concurrently on such designs to be able to
+achieve fair CPU balancing, to try and achieve some sort of nice-level fairness
+across CPUs, and to achieve low enough latency for tasks on a busy CPU when
+other CPUs would be more suited. BFS has the advantage that it requires no
+balancing algorithm whatsoever, as balancing occurs by proxy simply because
+all CPUs draw off the global runqueue, in priority and deadline order. Despite
+the fact that scalability is _not_ the prime concern of BFS, it both shows very
+good scalability to smaller numbers of CPUs and is likely a more scalable design
+at these numbers of CPUs.
+
+It also has some very low overhead scalability features built into the design
+when it has been deemed their overhead is so marginal that they're worth adding.
+The first is the local copy of the running process' data to the CPU it's running
+on to allow that data to be updated lockless where possible. Then there is
+deference paid to the last CPU a task was running on, by trying that CPU first
+when looking for an idle CPU to use the next time it's scheduled. Finally there
+is the notion of cache locality beyond the last running CPU. The sched_domains
+information is used to determine the relative virtual "cache distance" that
+other CPUs have from the last CPU a task was running on. CPUs with shared
+caches, such as SMT siblings, or multicore CPUs with shared caches, are treated
+as cache local. CPUs without shared caches are treated as not cache local, and
+CPUs on different NUMA nodes are treated as very distant. This "relative cache
+distance" is used by modifying the virtual deadline value when doing lookups.
+Effectively, the deadline is unaltered between "cache local" CPUs, doubled for
+"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning
+behind the doubling of deadlines is as follows. The real cost of migrating a
+task from one CPU to another is entirely dependant on the cache footprint of
+the task, how cache intensive the task is, how long it's been running on that
+CPU to take up the bulk of its cache, how big the CPU cache is, how fast and
+how layered the CPU cache is, how fast a context switch is... and so on. In
+other words, it's close to random in the real world where we do more than just
+one sole workload. The only thing we can be sure of is that it's not free. So
+BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs
+is more important than cache locality, and cache locality only plays a part
+after that. Doubling the effective deadline is based on the premise that the
+"cache local" CPUs will tend to work on the same tasks up to double the number
+of cache local CPUs, and once the workload is beyond that amount, it is likely
+that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA
+is a value I pulled out of my arse.
+
+When choosing an idle CPU for a waking task, the cache locality is determined
+according to where the task last ran and then idle CPUs are ranked from best
+to worst to choose the most suitable idle CPU based on cache locality, NUMA
+node locality and hyperthread sibling busyness. They are chosen in the
+following preference (if idle):
+
+* Same core, idle or busy cache, idle threads
+* Other core, same cache, idle or busy cache, idle threads.
+* Same node, other CPU, idle cache, idle threads.
+* Same node, other CPU, busy cache, idle threads.
+* Same core, busy threads.
+* Other core, same cache, busy threads.
+* Same node, other CPU, busy threads.
+* Other node, other CPU, idle cache, idle threads.
+* Other node, other CPU, busy cache, idle threads.
+* Other node, other CPU, busy threads.
+
+This shows the SMT or "hyperthread" awareness in the design as well which will
+choose a real idle core first before a logical SMT sibling which already has
+tasks on the physical CPU.
+
+Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark.
+However this benchmarking was performed on an earlier design that was far less
+scalable than the current one so it's hard to know how scalable it is in terms
+of both CPUs (due to the global runqueue) and heavily loaded machines (due to
+O(n) lookup) at this stage. Note that in terms of scalability, the number of
+_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x)
+quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark
+results are very promising indeed, without needing to tweak any knobs, features
+or options. Benchmark contributions are most welcome.
+
+
+Features
+
+As the initial prime target audience for BFS was the average desktop user, it
+was designed to not need tweaking, tuning or have features set to obtain benefit
+from it. Thus the number of knobs and features has been kept to an absolute
+minimum and should not require extra user input for the vast majority of cases.
+There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval
+and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition
+to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is
+support for CGROUPS. The average user should neither need to know what these
+are, nor should they need to be using them to have good desktop behaviour.
+
+rr_interval
+
+There is only one "scheduler" tunable, the round robin interval. This can be
+accessed in
+
+	/proc/sys/kernel/rr_interval
+
+The value is in milliseconds, and the default value is set to 6 on a
+uniprocessor machine, and automatically set to a progressively higher value on
+multiprocessor machines. The reasoning behind increasing the value on more CPUs
+is that the effective latency is decreased by virtue of there being more CPUs on
+BFS (for reasons explained above), and increasing the value allows for less
+cache contention and more throughput. Valid values are from 1 to 1000.
+Decreasing the value will decrease latencies at the cost of decreasing
+throughput, while increasing it will improve throughput, but at the cost of
+worsening latencies. The accuracy of the rr interval is limited by HZ resolution
+of the kernel configuration. Thus, the worst case latencies are usually slightly
+higher than this actual value. The default value of 6 is not an arbitrary one.
+It is based on the fact that humans can detect jitter at approximately 7ms, so
+aiming for much lower latencies is pointless under most circumstances. It is
+worth noting this fact when comparing the latency performance of BFS to other
+schedulers. Worst case latencies being higher than 7ms are far worse than
+average latencies not being in the microsecond range.
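+
+As an illustrative example only (the default shown will vary with the number
+of CPUs), the current value can be inspected and lowered at runtime like so:
+
+	cat /proc/sys/kernel/rr_interval
+	echo 3 > /proc/sys/kernel/rr_interval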
+
+Isochronous scheduling.
+
+Isochronous scheduling is a unique scheduling policy designed to provide
+near-real-time performance to unprivileged (ie non-root) users without the
+ability to starve the machine indefinitely. Isochronous tasks (which means
+"same time") are set using, for example, the schedtool application like so:
+
+	schedtool -I -e amarok
+
+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
+is that it has a priority level between true realtime tasks and SCHED_NORMAL
+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
+rate). However if ISO tasks run for more than a tunable finite amount of time,
+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
+time is the percentage of _total CPU_ available across the machine, configurable
+as a percentage in the following "resource handling" tunable (as opposed to a
+scheduler tunable):
+
+	/proc/sys/kernel/iso_cpu
+
+and is set to 70% by default. It is calculated over a rolling 5 second average.
+Because it is the total CPU available, it means that on a multi CPU machine, it
+is possible to have an ISO task running as realtime scheduling indefinitely on
+just one CPU, as the other CPUs will be available. Setting this to 100 is the
+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
+ability to run any pseudo-realtime tasks.
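+
+For example, to restrict unprivileged pseudo-realtime tasks to half the total
+CPU, or to disable them entirely (illustrative values only):
+
+	echo 50 > /proc/sys/kernel/iso_cpu
+	echo 0 > /proc/sys/kernel/iso_cpu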
+
+A feature of BFS is that it detects when an application tries to obtain a
+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
+appropriate privileges to use those policies. When it detects this, it will
+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
+Because some applications constantly set their policy as well as their nice
+level, there is potential for them to undo the override specified by the user
+on the command line of setting the policy to SCHED_ISO. To counter this, once
+a task has been set to SCHED_ISO policy, it needs superuser privileges to set
+it back to SCHED_NORMAL. This will ensure the task remains ISO and all child
+processes and threads will also inherit the ISO policy.
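+
+For example, an unprivileged user starting an application with something like
+
+	schedtool -R -p 1 -e amarok
+
+will, under BFS, simply end up with the task running as SCHED_ISO rather than
+having the realtime request rejected.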
+
+Idleprio scheduling.
+
+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
+ultra low priority tasks to be run in the background that have virtually no
+effect on the foreground tasks. This is ideally suited to distributed computing
+clients (like setiathome, folding, mprime etc) but can also be used to start
+a video encode and so on without any slowdown of other tasks. To prevent this
+policy from grabbing shared resources and holding them indefinitely, if it
+detects a state where the task is waiting on I/O, the machine is about to
+suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As
+per the Isochronous task management, once a task has been scheduled as IDLEPRIO,
+it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can
+be set to start as SCHED_IDLEPRIO with the schedtool command like so:
+
+	schedtool -D -e ./mprime
+
+Subtick accounting.
+
+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
+the accounting is done by simply determining what is happening at the precise
+moment a timer tick fires off. This becomes increasingly inaccurate as the
+timer tick frequency (HZ) is lowered. It is possible to create an application
+which uses almost 100% CPU, yet by being descheduled at the right time, records
+zero CPU usage. While the main problem with this is that there are possible
+security implications, it is also difficult to determine how much CPU a task
+really does use. BFS tries to use the sub-tick accounting from the TSC clock,
+where possible, to determine real CPU usage. This is not entirely reliable, but
+is far more likely to produce accurate CPU usage data than the existing designs
+and will not show tasks as consuming no CPU usage when they actually are. Thus,
+the amount of CPU reported as being used by BFS will more accurately represent
+how much CPU the task itself is using (as is shown for example by the 'time'
+application), so the reported values may be quite different to other schedulers.
+Values reported as the 'load' are more prone to problems with this design, but
+per process values are closer to real usage. When comparing throughput of BFS
+to other designs, it is important to compare the actual completed work in terms
+of total wall clock time taken and total work done, rather than the reported
+"cpu usage".
+
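+For example, when comparing schedulers with a kernel compile, it is the
+elapsed ("real") wall clock time of a run such as
+
+	time make -j4
+
+that should be compared, rather than the per-process CPU percentages.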
+
+Con Kolivas <kernel@kolivas.org> Fri Aug 27 2010
diff --git b/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt
new file mode 100644
index 0000000..bbd6980
--- /dev/null
+++ b/Documentation/scheduler/sched-MuQSS.txt
@@ -0,0 +1,345 @@
+MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas.
+
+MuQSS is a per-cpu runqueue variant of the original BFS scheduler with
+one 8 level skiplist per runqueue, and fine grained locking for much more
+scalability.
+
+
+Goals.
+
+The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from
+here on (pronounced mux) is to completely do away with the complex designs of
+the past for the cpu process scheduler and instead implement one that is very
+simple in basic design. The main focus of MuQSS is to achieve excellent desktop
+interactivity and responsiveness without heuristics and tuning knobs that are
+difficult to understand, impossible to model and predict the effect of, and when
+tuned to one workload cause massive detriment to another, while still being
+scalable to many CPUs and processes.
+
+
+Design summary.
+
+MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1)
+lookup, earliest effective virtual deadline first tickless design, loosely based
+on EEVDF (earliest eligible virtual deadline first) and my previous Staircase
+Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler.
+Each component shall be described in order to understand the significance of,
+and reasoning for it.
+
+
+Design reasoning.
+
+In BFS, the use of a single runqueue across all CPUs meant that each CPU would
+need to scan the entire runqueue looking for the process with the earliest
+deadline and schedule that next, regardless of which CPU it originally came
+from. This made BFS deterministic with respect to latency and provided
+guaranteed latencies dependent on number of processes and CPUs. The single
+runqueue, however, meant that all CPUs would compete for the single lock
+protecting it, which would lead to increasing lock contention as the number of
+CPUs rose and appeared to limit scalability of common workloads beyond 16
+logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously
+increased overhead proportionate to the number of queued processes and led to
+cache thrashing while iterating over the linked list.
+
+MuQSS is an evolution of BFS, designed to maintain the same scheduling
+decision mechanism and be virtually deterministic without relying on the
+constrained design of the single runqueue by splitting out the single runqueue
+to be per-CPU and use skiplists instead of linked lists.
+
+The original reason for going back to a single runqueue design for BFS was that
+once multiple runqueues are introduced, per-CPU or otherwise, there will be
+complex interactions, as each runqueue is responsible for the scheduling
+latency and fairness of only the tasks on its own runqueue; to achieve
+fairness and low latency across multiple CPUs, any throughput advantage of
+having CPU local tasks must be traded away. This is due to requiring a
+very complex balancing system to at best achieve some semblance of fairness
+across CPUs and can only maintain relatively low latency for tasks bound to the
+same CPUs, not across them. To increase said fairness and latency across CPUs,
+the advantage of local runqueue locking, which makes for better scalability, is
+lost due to having to grab multiple locks.
+
+MuQSS works around the problems inherent in multiple runqueue designs by
+making its skip lists priority ordered and through novel lockless examination
+of every other runqueue, which lets it decide whether to take the earliest
+deadline task from another runqueue for latency reasons, or for CPU balancing
+reasons. It still does not have a balancing system; instead, balancing happens
+by virtue of the next task scheduling decision and the task wakeup CPU choice.
+
+
+Design details.
+
+Custom skip list implementation:
+
+To avoid the overhead of building up and tearing down skip list structures,
+the variant used by MuQSS has a number of optimisations making it specific for
+its use case in the scheduler. It uses static arrays of 8 'levels' instead of
+building up and tearing down structures dynamically. This makes each runqueue
+only scale O(log N) up to 256 tasks. However as there is one runqueue per CPU
+it means that it scales O(log N) up to 256 x number of logical CPUs which is
+far beyond the realistic task limits each CPU could handle. By being 8 levels
+it also makes the array exactly one cacheline in size. Additionally, each
+skip list node is bidirectional making insertion and removal amortised O(1),
+being O(k) where k is 1-8. Uniquely, we are only ever interested in the very
+first entry in each list at all times with MuQSS, so there is never a need to
+do a search and thus look up is always O(1).
+
+Task insertion:
+
+MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into
+a custom skip list as described above (based on the original design by William
+Pugh). Insertion is ordered in such a way that there is never a need to do a
+search by ordering tasks according to static priority primarily, and then
+virtual deadline at the time of insertion.
+
+Niffies:
+
+Niffies are a monotonic forward moving timer not unlike the "jiffies" but are
+of nanosecond resolution. Niffies are calculated per-runqueue from the high
+resolution TSC timers, and in order to maintain fairness are synchronised
+between CPUs whenever both runqueues are locked concurrently.
+
+Virtual deadline:
+
+The key to achieving low latency, scheduling fairness, and "nice level"
+distribution in MuQSS is entirely in the virtual deadline mechanism. The one
+tunable in MuQSS is the rr_interval, or "round robin interval". This is the
+maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy)
+tasks of the same nice level will be running for, or looking at it the other
+way around, the longest duration two tasks of the same nice level will be
+delayed for. When a task requests cpu time, it is given a quota (time_slice)
+equal to the rr_interval and a virtual deadline. The virtual deadline is
+offset from the current time in niffies by this equation:
+
+	niffies + (prio_ratio * rr_interval)
+
+The prio_ratio is determined as a ratio compared to the baseline of nice -20
+and increases by 10% per nice level. The deadline is a virtual one only in that
+no guarantee is placed that a task will actually be scheduled by this time, but
+it is used to compare which task should go next. There are three components to
+how a task is next chosen. First is time_slice expiration. If a task runs out
+of its time_slice, it is descheduled, the time_slice is refilled, and the
+deadline reset to that formula above. Second is sleep, where a task no longer
+is requesting CPU for whatever reason. The time_slice and deadline are _not_
+adjusted in this case and are just carried over for when the task is next
+scheduled. Third is preemption, and that is when a newly waking task is deemed
+higher priority than a currently running task on any cpu by virtue of the fact
+that it has an earlier virtual deadline than the currently running task. The
+earlier deadline is the key to which task is next chosen for the first and
+second cases.
+
+The CPU proportion of different nice tasks works out to be approximately:
+
+	(prio_ratio difference)^2
+
+The reason it is squared is that a task's deadline does not change while it is
+running unless it runs out of time_slice. Thus, even if the time actually
+passes the deadline of another task that is queued, it will not get CPU time
+unless the current running task deschedules, and the time "base" (niffies) is
+constantly moving.
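+
+As a rough illustration, if one task's prio_ratio is twice that of another,
+its deadlines are offset twice as far into the future, and with the niffies
+base constantly moving this works out to approximately a 4:1 split of CPU
+time in favour of the task with the smaller prio_ratio.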
+
+Task lookup:
+
+As tasks are already pre-ordered according to anticipated scheduling order in
+the skip lists, lookup for the next suitable task per-runqueue is always a
+matter of simply selecting the first task in the 0th level skip list entry.
+In order to maintain optimal latency and fairness across CPUs, MuQSS does a
+novel examination of every other runqueue in cache locality order, choosing the
+best task across all runqueues. This provides near-determinism of how long any
+task across the entire system may wait before receiving CPU time. The other
+runqueues are first examined locklessly and, if they are likely to have a
+suitably better task, then trylocked to minimise the potential lock contention.
+Each other runqueue lock is only held for as long as it takes to examine the
+entry for suitability. In "interactive" mode, the default setting, MuQSS will
+look for the best deadline task across all CPUs, while in !interactive mode,
+it will only select a better deadline task from another CPU if it is more
+heavily laden than the current one.
+
+Lookup is therefore O(k) where k is number of CPUs.
+
+
+Latency.
+
+Through the use of virtual deadlines to govern the scheduling order of normal
+tasks, queue-to-activation latency per runqueue is guaranteed to be bound by
+the rr_interval tunable which is set to 6ms by default. This means that the
+longest a CPU bound task will wait for more CPU is proportional to the number
+of running tasks and in the common case of 0-2 running tasks per CPU, will be
+under the 7ms threshold for human perception of jitter. Additionally, as newly
+woken tasks will have an early deadline from their previous runtime, the very
+tasks that are usually latency sensitive will have the shortest interval for
+activation, usually preempting any existing CPU bound tasks.
+
+Tickless expiry:
+
+A feature of MuQSS is that it is not tied to the resolution of the chosen tick
+rate in Hz, instead depending entirely on the high resolution timers where
+possible for sub-millisecond accuracy on timeouts regardless of the underlying
+tick rate. This allows MuQSS to be run with the low overhead of low Hz rates
+such as 100 by default, benefiting from the improved throughput and lower
+power usage it provides. Another advantage of this approach is that in
+combination with the Full No HZ option, which disables ticks on running task
+CPUs instead of just idle CPUs, the tick can be disabled at all times
+regardless of how many tasks are running instead of being limited to just one
+running task. Note that this option is NOT recommended for regular desktop
+users.
+
+
+Scalability and balancing.
+
+Unlike traditional approaches where balancing is a combination of CPU selection
+at task wakeup and intermittent balancing based on a vast array of rules set
+according to architecture, busyness calculations and special case management,
+MuQSS indirectly balances on the fly at task wakeup and next task selection.
+During initialisation, MuQSS creates a cache coherency ordered list of CPUs for
+each logical CPU and uses this to aid task/CPU selection when CPUs are busy.
+Additionally it selects any idle CPUs, if they are available, at any time over
+busy CPUs according to the following preference:
+
+ * Same thread, idle or busy cache, idle or busy threads
+ * Other core, same cache, idle or busy cache, idle threads.
+ * Same node, other CPU, idle cache, idle threads.
+ * Same node, other CPU, busy cache, idle threads.
+ * Other core, same cache, busy threads.
+ * Same node, other CPU, busy threads.
+ * Other node, other CPU, idle cache, idle threads.
+ * Other node, other CPU, busy cache, idle threads.
+ * Other node, other CPU, busy threads.
+
+Mux is therefore SMT, MC and NUMA aware without the need for extra
+intermittent balancing to keep CPUs busy and make the most of cache
+coherency.
+
+
+Features
+
+As the initial prime target audience for MuQSS was the average desktop user, it
+was designed to not need tweaking, tuning or have features set to obtain benefit
+from it. Thus the number of knobs and features has been kept to an absolute
+minimum and should not require extra user input for the vast majority of cases.
+There are 3 optional tunables and 2 extra scheduling policies: the rr_interval,
+interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO
+policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS
+does _not_ now feature is support for CGROUPS. The average user should neither
+need to know what these are, nor should they need to be using them to have good
+desktop behaviour. However, since some applications refuse to work without
+cgroups, they can be enabled with MuQSS as a stub; the cgroup filesystem will
+be created, which allows such applications to work.
+
+rr_interval:
+
+	/proc/sys/kernel/rr_interval
+
+The value is in milliseconds, and the default value is set to 6. Valid values
+are from 1 to 1000. Decreasing the value will decrease latencies at the cost of
+decreasing throughput, while increasing it will improve throughput, but at the
+cost of worsening latencies. It is based on the fact that humans can detect
+jitter at approximately 7ms, so aiming for much lower latencies is pointless
+under most circumstances. It is worth noting this fact when comparing the
+latency performance of MuQSS to other schedulers. Worst case latencies being
+higher than 7ms are far worse than average latencies not being in the
+microsecond range.
+
+interactive:
+
+	/proc/sys/kernel/interactive
+
+The value is a simple boolean of 1 for on and 0 for off and is set to on by
+default. Disabling this will disable the near-determinism of MuQSS when
+selecting the next task by not examining all CPUs for the earliest deadline
+task, or which CPU to wake to, instead prioritising CPU balancing for improved
+throughput. Latency will still be bound by rr_interval, but on a per-CPU basis
+instead of across the whole system.
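+
+For example, on a throughput-oriented machine it can be turned off and back
+on at runtime (illustrative only):
+
+	echo 0 > /proc/sys/kernel/interactive
+	echo 1 > /proc/sys/kernel/interactive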
+
+Isochronous scheduling:
+
+Isochronous scheduling is a unique scheduling policy designed to provide
+near-real-time performance to unprivileged (ie non-root) users without the
+ability to starve the machine indefinitely. Isochronous tasks (which means
+"same time") are set using, for example, the schedtool application like so:
+
+	schedtool -I -e amarok
+
+This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works
+is that it has a priority level between true realtime tasks and SCHED_NORMAL
+which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie,
+if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval
+rate). However if ISO tasks run for more than a tunable finite amount of time,
+they are then demoted back to SCHED_NORMAL scheduling. This finite amount of
+time is the percentage of CPU available per CPU, configurable as a percentage in
+the following "resource handling" tunable (as opposed to a scheduler tunable):
+
+iso_cpu:
+
+	/proc/sys/kernel/iso_cpu
+
+and is set to 70% by default. It is calculated over a rolling 5 second average.
+Because it is the total CPU available, it means that on a multi CPU machine, it
+is possible to have an ISO task running as realtime scheduling indefinitely on
+just one CPU, as the other CPUs will be available. Setting this to 100 is the
+equivalent of giving all users SCHED_RR access and setting it to 0 removes the
+ability to run any pseudo-realtime tasks.
+
+A feature of MuQSS is that it detects when an application tries to obtain a
+realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the
+appropriate privileges to use those policies. When it detects this, it will
+give the task SCHED_ISO policy instead. Thus it is transparent to the user.
+
+
+Idleprio scheduling:
+
+Idleprio scheduling is a scheduling policy designed to give out CPU to a task
+_only_ when the CPU would be otherwise idle. The idea behind this is to allow
+ultra low priority tasks to be run in the background that have virtually no
+effect on the foreground tasks. This is ideally suited to distributed computing
+clients (like setiathome, folding, mprime etc) but can also be used to start a
+video encode and so on without any slowdown of other tasks. To prevent this policy
+from grabbing shared resources and holding them indefinitely, if it detects a
+state where the task is waiting on I/O, the machine is about to suspend to ram
+and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has
+been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without
+superuser privileges since it is effectively a lower scheduling policy. Tasks
+can be set to start as SCHED_IDLEPRIO with the schedtool command like so:
+
+	schedtool -D -e ./mprime
+
+Subtick accounting:
+
+It is surprisingly difficult to get accurate CPU accounting, and in many cases,
+the accounting is done by simply determining what is happening at the precise
+moment a timer tick fires off. This becomes increasingly inaccurate as the timer
+tick frequency (HZ) is lowered. It is possible to create an application which
+uses almost 100% CPU, yet by being descheduled at the right time, records zero
+CPU usage. While the main problem with this is that there are possible security
+implications, it is also difficult to determine how much CPU a task really does
+use. Mux uses sub-tick accounting from the TSC clock to determine real CPU
+usage. Thus, the amount of CPU reported as being used by MuQSS will more
+accurately represent how much CPU the task itself is using (as is shown for
+example by the 'time' application), so the reported values may be quite
+different to other schedulers. When comparing throughput of MuQSS to other
+designs, it is important to compare the actual completed work in terms of total
+wall clock time taken and total work done, rather than the reported "cpu usage".
+
+Symmetric MultiThreading (SMT) aware nice:
+
+SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the
+logical CPU count rises by adding thread units to each CPU core, allowing more
+than one task to be run simultaneously on the same core, the disadvantage of it
+is that the CPU power is shared between the tasks, not summating to the power
+of two CPUs. The practical upshot of this is that two tasks running on
+separate threads of the same core run significantly slower than if they had one
+core each to run on. While smart CPU selection allows each task to have a core
+to itself whenever available (as is done on MuQSS), it cannot offset the
+slowdown that occurs when the cores are all loaded and only a thread is left.
+Most of the time this is harmless as the CPU is effectively overloaded at this
+point and the extra thread is of benefit. However when running a niced task in
+the presence of an un-niced task (say nice 19 v nice 0), the nice task gets
+precisely the same amount of CPU power as the unniced one. MuQSS has an
+optional configuration feature known as SMT-NICE which selectively idles the
+secondary niced thread for a period proportional to the nice difference,
+allowing CPU distribution according to nice level to be maintained, at the
+expense of a small amount of extra overhead. If this is configured in on a
+machine without SMT threads, the overhead is minimal.
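+
+The option appears as SMT_NICE in the kernel configuration (it depends on
+SCHED_MUQSS and SCHED_SMT and defaults to on), so in a generated .config it
+shows up as:
+
+	CONFIG_SMT_NICE=y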
+
+
+Con Kolivas <kernel@kolivas.org> Sat, 29th October 2016
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index ffab8b5..01e0a4a 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -39,6 +39,7 @@ show up in /proc/sys/kernel:
 - hung_task_timeout_secs
 - hung_task_warnings
+- iso_cpu
 - kexec_load_disabled
 - kptr_restrict
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
@@ -73,6 +74,7 @@ show up in /proc/sys/kernel:
 - randomize_va_space
 - real-root-dev               ==> Documentation/initrd.txt
 - reboot-cmd                  [ SPARC only ]
+- rr_interval
 - rtsig-max
 - rtsig-nr
 - sem
@@ -402,6 +404,16 @@ kernel stack.
 
 ==============================================================
 
+iso_cpu: (MuQSS CPU scheduler only).
+
+This sets the percentage of cpu time that unprivileged SCHED_ISO tasks
+may use while running effectively at realtime priority, averaged over a
+rolling five seconds over the -whole- system, meaning all cpus.
+
+Set to 70 (percent) by default.
+
+==============================================================
+
 l2cr: (PPC only)
 
 This flag controls the L2 cache of G3 processor boards. If
@@ -818,6 +830,20 @@ rebooting. ???
 
 ==============================================================
 
+rr_interval: (MuQSS CPU scheduler only)
+
+This is the smallest duration that any cpu process scheduling unit
+will run for. Increasing this value can increase throughput of cpu
+bound tasks substantially but at the expense of increased latencies
+overall. Conversely decreasing it will decrease average and maximum
+latencies but at the expense of throughput. This value is in
+milliseconds and the default value chosen depends on the number of
+cpus available at scheduler initialisation with a minimum of 6.
+
+Valid values are from 1-1000.
+
+==============================================================
+
 rtsig-max & rtsig-nr:
 
 The file rtsig-max can be used to tune the maximum number
diff --git b/Documentation/tp_smapi.txt b/Documentation/tp_smapi.txt
new file mode 100644
index 0000000..d037301
--- /dev/null
+++ b/Documentation/tp_smapi.txt
@@ -0,0 +1,267 @@
+tp_smapi version 0.40
+IBM ThinkPad hardware functions driver
+
+Author:  Shem Multinymous <multinymous@gmail.com>
+Project: http://sourceforge.net/projects/tpctl
+Wiki:    http://thinkwiki.org/wiki/tp_smapi
+List:    linux-thinkpad@linux-thinkpad.org
+         (http://mailman.linux-thinkpad.org/mailman/listinfo/linux-thinkpad)
+
+Description
+-----------
+
+ThinkPad laptops include a proprietary interface called SMAPI BIOS
+(System Management Application Program Interface) which provides some
+hardware control functionality that is not accessible by other means.
+
+This driver exposes some features of the SMAPI BIOS through a sysfs
+interface. It is suitable for newer models, on which SMAPI is invoked
+through IO port writes. Older models use a different SMAPI interface;
+for those, try the "thinkpad" module from the "tpctl" package.
+
+WARNING:
+This driver uses undocumented features and direct hardware access.
+It thus cannot be guaranteed to work, and may cause arbitrary damage
+(especially on models it wasn't tested on).
+
+
+Module parameters
+-----------------
+
+thinkpad_ec module:
+  force_io=1 lets thinkpad_ec load on some recent ThinkPad models
+  (e.g., T400 and T500) whose BIOS's ACPI DSDT reserves the ports we need.
+tp_smapi module:
+  debug=1    enables verbose dmesg output.
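+
+For example, to load the drivers with both parameters set (illustrative only):
+
+# modprobe thinkpad_ec force_io=1
+# modprobe tp_smapi debug=1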
+
+
+Usage
+-----
+
+Control of battery charging thresholds (in percent of current full charge
+capacity):
+
+# echo 40 > /sys/devices/platform/smapi/BAT0/start_charge_thresh
+# echo 70 > /sys/devices/platform/smapi/BAT0/stop_charge_thresh
+# cat /sys/devices/platform/smapi/BAT0/*_charge_thresh
+
+    (This is useful since Li-Ion batteries wear out much faster at very
+     high or low charge levels. The driver will also keep the thresholds
+     across suspend-to-disk with AC disconnected; this isn't done
+     automatically by the hardware.)
+
+Inhibiting battery charging for 17 minutes (overrides thresholds):
+
+# echo 17 > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes
+# echo 0  > /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes  # stop
+# cat /sys/devices/platform/smapi/BAT0/inhibit_charge_minutes
+
+    (This can be used to control which battery is charged when using an
+     Ultrabay battery.)
+
+Forcing battery discharging even if AC power available:
+
+# echo 1 > /sys/devices/platform/smapi/BAT0/force_discharge  # start discharge
+# echo 0 > /sys/devices/platform/smapi/BAT0/force_discharge  # stop discharge
+# cat /sys/devices/platform/smapi/BAT0/force_discharge
+
+    (When AC is connected, forced discharging will automatically stop
+     when battery is fully depleted -- this is useful for calibration.
+     Also, this attribute can be used to control which battery is discharged
+     when both a system battery and an Ultrabay battery are connected.)
+
+Misc read-only battery status attributes (see note about HDAPS below):
+
+/sys/devices/platform/smapi/BAT0/installed   # 0 or 1
+/sys/devices/platform/smapi/BAT0/state       # idle/charging/discharging
+/sys/devices/platform/smapi/BAT0/cycle_count # integer counter
+/sys/devices/platform/smapi/BAT0/current_now # instantaneous current
+/sys/devices/platform/smapi/BAT0/current_avg # last minute average
+/sys/devices/platform/smapi/BAT0/power_now   # instantaneous power
+/sys/devices/platform/smapi/BAT0/power_avg   # last minute average
+/sys/devices/platform/smapi/BAT0/last_full_capacity         # in mWh
+/sys/devices/platform/smapi/BAT0/remaining_percent          # remaining percent of energy (set by calibration)
+/sys/devices/platform/smapi/BAT0/remaining_percent_error    # error range of remaining_percent (not reset by calibration)
+/sys/devices/platform/smapi/BAT0/remaining_running_time     # in minutes, by last minute average power
+/sys/devices/platform/smapi/BAT0/remaining_running_time_now # in minutes, by instantaneous power
+/sys/devices/platform/smapi/BAT0/remaining_charging_time    # in minutes
+/sys/devices/platform/smapi/BAT0/remaining_capacity         # in mWh
+/sys/devices/platform/smapi/BAT0/design_capacity            # in mWh
+/sys/devices/platform/smapi/BAT0/voltage           # in mV
+/sys/devices/platform/smapi/BAT0/design_voltage    # in mV
+/sys/devices/platform/smapi/BAT0/charging_max_current  # max charging current
+/sys/devices/platform/smapi/BAT0/charging_max_voltage  # max charging voltage
+/sys/devices/platform/smapi/BAT0/group{0,1,2,3}_voltage # see below
+/sys/devices/platform/smapi/BAT0/manufacturer      # string
+/sys/devices/platform/smapi/BAT0/model             # string
+/sys/devices/platform/smapi/BAT0/barcoding         # string
+/sys/devices/platform/smapi/BAT0/chemistry         # string
+/sys/devices/platform/smapi/BAT0/serial            # integer
+/sys/devices/platform/smapi/BAT0/manufacture_date  # YYYY-MM-DD
+/sys/devices/platform/smapi/BAT0/first_use_date    # YYYY-MM-DD
+/sys/devices/platform/smapi/BAT0/temperature  # in milli-Celsius
+/sys/devices/platform/smapi/BAT0/dump         # see below
+/sys/devices/platform/smapi/ac_connected      # 0 or 1
+
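+For example, a quick check of the main battery's state and charge level
+(output values will vary):
+
+# cat /sys/devices/platform/smapi/BAT0/state
+# cat /sys/devices/platform/smapi/BAT0/remaining_percent
+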
+The BAT0/group{0,1,2,3}_voltage attribute refers to the separate cell groups
+in each battery. For example, on the ThinkPad 600, X3x, T4x and R5x models,
+the battery contains 3 cell groups in series, where each group consists of 2
+or 3 cells connected in parallel. The voltage of each group is given by these
+attributes, and their sum (roughly) equals the "voltage" attribute.
+(The effective performance of the battery is determined by the weakest group,
+i.e., the one whose voltage changes most rapidly during dis/charging.)
+
+The "BAT0/dump" attribute gives a hex dump of the raw status data, which
+contains additional data not in the above (if you can figure it out). Some
+unused values are autodetected and replaced by "--".
+
+In all of the above, replace BAT0 with BAT1 to address the 2nd battery (e.g.
+in the UltraBay).
+
+
+Raw SMAPI calls:
+
+/sys/devices/platform/smapi/smapi_request
+This performs raw SMAPI calls. It uses a bad interface that cannot handle
+multiple simultaneous accesses. Don't touch it, it's for development only.
+If you did touch it, you would do something like
+# echo '211a 100 0 0' > /sys/devices/platform/smapi/smapi_request
+# cat /sys/devices/platform/smapi/smapi_request
+and notice that in the output "211a 34b b2 0 0 0 'OK'", the "4b" in the 2nd
+value, converted to decimal is 75: the current charge stop threshold.
+
+
+Model-specific status
+---------------------
+
+Works (at least partially) on the following ThinkPad models:
+* A30
+* G41
+* R40, R50p, R51, R52
+* T23, T40, T40p, T41, T41p, T42, T42p, T43, T43p, T60
+* X24, X31, X32, X40, X41, X60
+* Z60t, Z61m
+
+Not all functions are available on all models; for detailed status, see:
+  http://thinkwiki.org/wiki/tp_smapi
+
+Please report success/failure by e-mail or on the Wiki.
+If you get a "not implemented" or "not supported" message, your laptop
+probably just can't do that (at least not via the SMAPI BIOS).
+For negative reports, follow the bug reporting guidelines below.
+If you send me the necessary technical data (i.e., SMAPI function
+interfaces), I will support additional models.
+
+
+Additional HDAPS features
+-------------------------
+
+The modified hdaps driver has several improvements on the one in mainline
+(beyond resolving the conflict with thinkpad_ec and tp_smapi):
+
+- Fixes reliability and improves support for recent ThinkPad models
+  (especially *60 and newer). Unlike the mainline driver, the modified hdaps
+  correctly follows the Embedded Controller communication protocol.
+
+- Extends the "invert" parameter to cover all possible axis orientations.
+  The possible values are as follows.
+  Let X,Y denote the hardware readouts.
+  Let R denote the laptop's roll (tilt left/right).
+  Let P denote the laptop's pitch (tilt forward/backward).
+    invert=0:   R= X  P= Y   (same as mainline)
+    invert=1:   R=-X  P=-Y   (same as mainline)
+    invert=2:   R=-X  P= Y   (new)
+    invert=3:   R= X  P=-Y   (new)
+    invert=4:   R= Y  P= X   (new)
+    invert=5:   R=-Y  P=-X   (new)
+    invert=6:   R=-Y  P= X   (new)
+    invert=7:   R= Y  P=-X   (new)
+  It's probably easiest to just try all 8 possibilities and see which yields
+  correct results (e.g., in the hdaps-gl visualisation).
+
+- Adds a whitelist which automatically sets the correct axis orientation for
+  some models. If the value for your model is wrong or missing, you can override
+  it using the "invert" parameter. Please also update the tables at
+  http://www.thinkwiki.org/wiki/tp_smapi and
+  http://www.thinkwiki.org/wiki/List_of_DMI_IDs
+  and submit a patch for the whitelist in hdaps.c.
+
+- Provides new attributes:
+  /sys/devices/platform/hdaps/sampling_rate:
+    This determines the frequency at which the host queries the embedded
+    controller for accelerometer data (and informs the hdaps input devices).
+    Default=50.
+  /sys/devices/platform/hdaps/oversampling_ratio:
+    When set to X, the embedded controller is told to do physical accelerometer
+    measurements at a rate that is X times higher than the rate at which
+    the driver reads those measurements (i.e., X*sampling_rate). This
+    makes the readouts from the embedded controller more fresh, and is also
+    useful for the running average filter (see next). Default=5.
+  /sys/devices/platform/hdaps/running_avg_filter_order:
+    When set to X, reported readouts will be the average of the last X physical
+    accelerometer measurements. Current firmware allows 1<=X<=8. Setting to a
+    high value decreases readout fluctuations. The averaging is handled by the
+    embedded controller, so no CPU resources are used. Higher values make the
+    readouts smoother, since it averages out both sensor noise (good) and abrupt
+    changes (bad). Default=2.
+
+- Provides a second input device, which publishes the raw accelerometer
+  measurements (without the fuzzing needed for joystick emulation). This input
+  device can be matched by a udev rule such as the following (all on one line):
+    KERNEL=="event[0-9]*", ATTRS{phys}=="hdaps/input1",
+    ATTRS{modalias}=="input:b0019v1014p5054e4801-*",
+    SYMLINK+="input/hdaps/accelerometer-event"
+
+A new version of the hdapsd userspace daemon, which uses the input device
+interface instead of polling sysfs, is available separately. Using this reduces
+the total interrupts per second generated by hdaps+hdapsd (on tickless kernels)
+to 50, down from a value that fluctuates between 50 and 100. Set the
+sampling_rate sysfs attribute to a lower value to further reduce interrupts,
+at the expense of response latency.
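+
+For example, to further lower the query rate (and interrupt load) at some
+cost in responsiveness (illustrative value only):
+
+# echo 25 > /sys/devices/platform/hdaps/sampling_rate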
+
+Licensing note: all my changes to the HDAPS driver are licensed under the
+GPL version 2 or, at your option and to the extent allowed by derivation from
+prior works, any later version. My version of hdaps is derived work from the
+mainline version, which at the time of writing is available only under
+GPL version 2.
+
+Bug reporting
+-------------
+
+Mail <multinymous@gmail.com>. Please include:
+* Details about your model,
+* Relevant "dmesg" output. Make sure thinkpad_ec and tp_smapi are loaded with
+  the "debug=1" parameter (e.g., use "make load HDAPS=1 DEBUG=1").
+* Output of "dmidecode | grep -C5 Product"
+* Does the failed functionality work under Windows?
+
+
+More about SMAPI
+----------------
+
+For hints about what may be possible via the SMAPI BIOS and how, see:
+
+* IBM Technical Reference Manual for the ThinkPad 770
+  (http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD)
+* Exported symbols in PWRMGRIF.DLL or TPPWRW32.DLL (e.g., use "objdump -x").
+* drivers/char/mwave/smapi.c in the Linux kernel tree.
+* The "thinkpad" SMAPI module (http://tpctl.sourceforge.net).
+* The SMAPI_* constants in tp_smapi.c.
+
+Note that in the above Technical Reference and in the "thinkpad" module,
+SMAPI is invoked through a function call to some physical address. However,
+the interface used by tp_smapi and the above mwave driver, and apparently
+required by newer ThinkPads, is different: you set the parameters up in the
+CPU's registers and write to ports 0xB2 (the APM control port) and 0x4F; this
+triggers an SMI (System Management Interrupt), causing the CPU to enter
+SMM (System Management Mode) and run the BIOS firmware; the results are
+returned in the CPU's registers. It is not clear what the relation is between
+the two variants of SMAPI, though the assignment of error codes seems to be
+similar.
+
+In addition, the embedded controller on ThinkPad laptops has a non-standard
+interface at IO ports 0x1600-0x161F (mapped to LCP channel 3 of the H8S chip).
+The interface provides various system management services (currently known:
+battery information and accelerometer readouts). For more information see the
+thinkpad_ec module and the H8S hardware documentation:
+http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf
diff --git a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX
index 6a5e2a1..09eaa9a 100644
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -18,6 +18,8 @@ idle_page_tracking.txt
 	- description of the idle page tracking feature.
 ksm.txt
 	- how to use the Kernel Samepage Merging feature.
+uksm.txt
+	- Introduction to Ultra KSM
 numa
 	- information about NUMA specific code in the Linux vm.
 numa_memory_policy.txt
diff --git b/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt
new file mode 100644
index 0000000..8fce86f
--- /dev/null
+++ b/Documentation/vm/uksm.txt
@@ -0,0 +1,60 @@
+The Ultra Kernel Samepage Merging feature
+----------------------------------------------
+/*
+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
+ *
+ * This is an improvement upon KSM. Some basic data structures and routines
+ * are borrowed from ksm.c .
+ *
+ * Its new features:
+ * 1. Full system scan:
+ *      It automatically scans all user processes' anonymous VMAs. Kernel-user
+ *      interaction to submit a memory area to KSM is no longer needed.
+ *
+ * 2. Rich area detection:
+ *      It automatically detects rich areas containing abundant duplicated
+ *      pages. Rich areas are given a full scan speed. Poor areas are
+ *      sampled at a reasonable speed with very low CPU consumption.
+ *
+ * 3. Ultra Per-page scan speed improvement:
+ *      A new hash algorithm is proposed. As a result, on a machine with
+ *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it
+ *      can scan memory areas that do not contain duplicated pages at a speed of
+ *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
+ *      477MB/sec ~ 923MB/sec.
+ *
+ * 4. Thrashing area avoidance:
+ *      Thrashing areas (VMAs that have frequent KSM page break-outs) can be
+ *      filtered out. My benchmark shows it's more efficient than KSM's per-page
+ *      hash value based volatile page detection.
+ *
+ *
+ * 5. Misc changes upon KSM:
+ *      * It has a fully x86-optimized memcmp dedicated for 4-byte-aligned page
+ *        comparison. It's much faster than default C version on x86.
+ *      * rmap_item now has a struct page * member to loosely cache an
+ *        address-->page mapping, which avoids many time-costly
+ *        follow_page() calls.
+ *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
+ *      * try_to_merge_two_pages() now can revert a pte if it fails. No break_
+ *        ksm is needed for this case.
+ *
+ * 6. Full Zero Page consideration (contributed by Figo Zhang)
+ *    Now uksmd considers full zero pages as special pages and merges them
+ *    into a special unswappable uksm zero page.
+ */
+
+ChangeLog:
+
+2012-05-05 The creation of this Doc
+2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up.
+2012-05-28 UKSM 0.1.1.2 bug fix release
+2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2
+2012-07-2  UKSM 0.1.2-beta2
+2012-07-10 UKSM 0.1.2-beta3
+2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization.
+2012-10-13 UKSM 0.1.2.1 Bug fixes.
+2012-12-31 UKSM 0.1.2.2 Minor bug fixes.
+2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug".
+2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger annoying warnings.
+2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation.
diff --git a/MAINTAINERS b/MAINTAINERS
index babaf82..69604e4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2256,6 +2256,19 @@ F:	include/linux/audit.h
 F:	include/uapi/linux/audit.h
 F:	kernel/audit*
 
+AUFS (advanced multi layered unification filesystem) FILESYSTEM
+M:	"J. R. Okajima" <hooanon05g@gmail.com>
+L:	linux-unionfs@vger.kernel.org
+L:	aufs-users@lists.sourceforge.net (members only)
+W:	http://aufs.sourceforge.net
+T:	git://github.com/sfjro/aufs4-linux.git
+S:	Supported
+F:	Documentation/filesystems/aufs/
+F:	Documentation/ABI/testing/debugfs-aufs
+F:	Documentation/ABI/testing/sysfs-aufs
+F:	fs/aufs/
+F:	include/uapi/linux/aufs_type.h
+
 AUXILIARY DISPLAY DRIVERS
 M:	Miguel Ojeda Sandonis <miguel.ojeda.sandonis@gmail.com>
 W:	http://miguelojeda.es/auxdisplay.htm
diff --git a/Makefile b/Makefile
index 23fde10..333e40a 100644
--- a/Makefile
+++ b/Makefile
@@ -613,6 +613,12 @@ endif # $(dot-config)
 # Defaults to vmlinux, but the arch makefile usually adds further targets
 all: vmlinux
 
+# force no-pie for distro compilers that enable pie by default
+KBUILD_CFLAGS += $(call cc-option, -fno-pie)
+KBUILD_CFLAGS += $(call cc-option, -no-pie)
+KBUILD_AFLAGS += $(call cc-option, -fno-pie)
+KBUILD_CPPFLAGS += $(call cc-option, -fno-pie)
+
 # The arch Makefile can set ARCH_{CPP,A,C}FLAGS to override the default
 # values of the respective KBUILD_* variables
 ARCH_CPPFLAGS :=
@@ -626,6 +632,8 @@ KBUILD_CFLAGS	+= $(call cc-disable-warning,frame-address,)
 
 ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS	+= -Os
+else ifdef CONFIG_CC_OPTIMIZE_HARDER
+KBUILD_CFLAGS	+= -O3
 else
 ifdef CONFIG_PROFILE_ALL_BRANCHES
 KBUILD_CFLAGS	+= -O2
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 460f5f3..eeb3e32 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -64,11 +64,6 @@ static struct timer_list spusched_timer;
 static struct timer_list spuloadavg_timer;
 
 /*
- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0).
- */
-#define NORMAL_PRIO		120
-
-/*
  * Frequency of the spu scheduler tick.  By default we do one SPU scheduler
  * tick for every 10 CPU scheduler ticks.
  */
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2a1f0ce..7767a5f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -914,10 +914,26 @@ config SCHED_SMT
 	depends on SMP
 	---help---
 	  SMT scheduler support improves the CPU scheduler's decision making
-	  when dealing with Intel Pentium 4 chips with HyperThreading at a
+	  when dealing with Intel P4/Core 2 chips with HyperThreading at a
 	  cost of slightly increased overhead in some places. If unsure say
 	  N here.
 
+config SMT_NICE
+	bool "SMT (Hyperthreading) aware nice priority and policy support"
+	depends on SCHED_MUQSS && SCHED_SMT
+	default y
+	---help---
+	  Enabling Hyperthreading on Intel CPUs decreases the effectiveness
+	  of the use of 'nice' levels and different scheduling policies
+	  (e.g. realtime) due to sharing of CPU power between hyperthreads.
+	  SMT nice support makes each logical CPU aware of what is running on
+	  its hyperthread siblings, maintaining appropriate distribution of
+	  CPU according to nice levels and scheduling policies at the expense
+	  of slightly increased overhead.
+
+	  If unsure say Y here.
+
 config SCHED_MC
 	def_bool y
 	prompt "Multi-core scheduler support"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 3ba5ff2..f7b4016 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -147,9 +147,8 @@ config MPENTIUM4
 		-Paxville
 		-Dempsey
 
-
 config MK6
-	bool "K6/K6-II/K6-III"
+	bool "AMD K6/K6-II/K6-III"
 	depends on X86_32
 	---help---
 	  Select this for an AMD K6-family processor.  Enables use of
@@ -157,7 +156,7 @@ config MK6
 	  flags to GCC.
 
 config MK7
-	bool "Athlon/Duron/K7"
+	bool "AMD Athlon/Duron/K7"
 	depends on X86_32
 	---help---
 	  Select this for an AMD Athlon K7-family processor.  Enables use of
@@ -165,12 +164,69 @@ config MK7
 	  flags to GCC.
 
 config MK8
-	bool "Opteron/Athlon64/Hammer/K8"
+	bool "AMD Opteron/Athlon64/Hammer/K8"
 	---help---
 	  Select this for an AMD Opteron or Athlon64 Hammer-family processor.
 	  Enables use of some extended instructions, and passes appropriate
 	  optimization flags to GCC.
 
+config MK8SSE3
+	bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3"
+	---help---
+	  Select this for improved AMD Opteron or Athlon64 Hammer-family processors.
+	  Enables use of some extended instructions, and passes appropriate
+	  optimization flags to GCC.
+
+config MK10
+	bool "AMD 61xx/7x50/PhenomX3/X4/II/K10"
+	---help---
+	  Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50,
+		Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor.
+	  Enables use of some extended instructions, and passes appropriate
+	  optimization flags to GCC.
+
+config MBARCELONA
+	bool "AMD Barcelona"
+	---help---
+	  Select this for AMD Barcelona and newer processors.
+
+	  Enables -march=barcelona
+
+config MBOBCAT
+	bool "AMD Bobcat"
+	---help---
+	  Select this for AMD Bobcat processors.
+
+	  Enables -march=btver1
+
+config MBULLDOZER
+	bool "AMD Bulldozer"
+	---help---
+	  Select this for AMD Bulldozer processors.
+
+	  Enables -march=bdver1
+
+config MPILEDRIVER
+	bool "AMD Piledriver"
+	---help---
+	  Select this for AMD Piledriver processors.
+
+	  Enables -march=bdver2
+
+config MSTEAMROLLER
+	bool "AMD Steamroller"
+	---help---
+	  Select this for AMD Steamroller processors.
+
+	  Enables -march=bdver3
+
+config MJAGUAR
+	bool "AMD Jaguar"
+	---help---
+	  Select this for AMD Jaguar processors.
+
+	  Enables -march=btver2
+
 config MCRUSOE
 	bool "Crusoe"
 	depends on X86_32
@@ -261,8 +317,17 @@ config MPSC
 	  using the cpu family field
 	  in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one.
 
+config MATOM
+	bool "Intel Atom"
+	---help---
+
+	  Select this for the Intel Atom platform. Intel Atom CPUs have an
+	  in-order pipelining architecture and thus can benefit from
+	  accordingly optimized code. Use a recent GCC with specific Atom
+	  support in order to fully benefit from selecting this option.
+
 config MCORE2
-	bool "Core 2/newer Xeon"
+	bool "Intel Core 2"
 	---help---
 
 	  Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and
@@ -270,14 +335,71 @@ config MCORE2
 	  family in /proc/cpuinfo. Newer ones have 6 and older ones 15
 	  (not a typo)
 
-config MATOM
-	bool "Intel Atom"
+	  Enables -march=core2
+
+config MNEHALEM
+	bool "Intel Nehalem"
 	---help---
 
-	  Select this for the Intel Atom platform. Intel Atom CPUs have an
-	  in-order pipelining architecture and thus can benefit from
-	  accordingly optimized code. Use a recent GCC with specific Atom
-	  support in order to fully benefit from selecting this option.
+	  Select this for 1st Gen Core processors in the Nehalem family.
+
+	  Enables -march=nehalem
+
+config MWESTMERE
+	bool "Intel Westmere"
+	---help---
+
+	  Select this for the Intel Westmere formerly Nehalem-C family.
+
+	  Enables -march=westmere
+
+config MSILVERMONT
+	bool "Intel Silvermont"
+	---help---
+
+	  Select this for the Intel Silvermont platform.
+
+	  Enables -march=silvermont
+
+config MSANDYBRIDGE
+	bool "Intel Sandy Bridge"
+	---help---
+
+	  Select this for 2nd Gen Core processors in the Sandy Bridge family.
+
+	  Enables -march=sandybridge
+
+config MIVYBRIDGE
+	bool "Intel Ivy Bridge"
+	---help---
+
+	  Select this for 3rd Gen Core processors in the Ivy Bridge family.
+
+	  Enables -march=ivybridge
+
+config MHASWELL
+	bool "Intel Haswell"
+	---help---
+
+	  Select this for 4th Gen Core processors in the Haswell family.
+
+	  Enables -march=haswell
+
+config MBROADWELL
+	bool "Intel Broadwell"
+	---help---
+
+	  Select this for 5th Gen Core processors in the Broadwell family.
+
+	  Enables -march=broadwell
+
+config MSKYLAKE
+	bool "Intel Skylake"
+	---help---
+
+	  Select this for 6th Gen Core processors in the Skylake family.
+
+	  Enables -march=skylake
 
 config GENERIC_CPU
 	bool "Generic-x86-64"
@@ -286,6 +408,19 @@ config GENERIC_CPU
 	  Generic x86-64 CPU.
 	  Run equally well on all x86-64 CPUs.
 
+config MNATIVE
+	bool "Native optimizations autodetected by GCC"
+	---help---
+
+	  GCC 4.2 and above support -march=native, which automatically detects
+	  the optimum settings to use based on your processor. -march=native
+	  also detects and applies additional settings beyond -march specific
+	  to your CPU (e.g. -msse4). Unless you have a specific reason not to
+	  (e.g. distcc cross-compiling), you should probably be using
+	  -march=native rather than anything listed below.
+
+	  Enables -march=native
+
 endchoice
 
 config X86_GENERIC
@@ -310,7 +445,7 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
-	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
+	default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
 	default "4" if MELAN || M486 || MGEODEGX1
 	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
 
@@ -341,11 +476,11 @@ config X86_ALIGNMENT_16
 
 config X86_INTEL_USERCOPY
 	def_bool y
-	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2
+	depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE
 
 config X86_USE_PPRO_CHECKSUM
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MATOM || MNATIVE
 
 config X86_USE_3DNOW
 	def_bool y
@@ -369,17 +504,17 @@ config X86_P6_NOP
 
 config X86_TSC
 	def_bool y
-	depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64
+	depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM) || X86_64
 
 config X86_CMPXCHG64
 	def_bool y
-	depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM
+	depends on X86_PAE || X86_64 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM || MNATIVE
 
 # this should be set for all -march=.. options where the compiler
 # generates cmov.
 config X86_CMOV
 	def_bool y
-	depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
+	depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX)
 
 config X86_MINIMUM_CPU_FAMILY
 	int
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 830ed39..7ac254b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -104,13 +104,38 @@ else
 	KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup)
 
         # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu)
+        cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native)
         cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8)
+        cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-mtune=k8)
+        cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10)
+        cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona)
+        cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1)
+        cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1)
+        cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2)
+        cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3)
+        cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2)
         cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona)
 
         cflags-$(CONFIG_MCORE2) += \
-                $(call cc-option,-march=core2,$(call cc-option,-mtune=generic))
-	cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \
-		$(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
+                $(call cc-option,-march=core2,$(call cc-option,-mtune=core2))
+        cflags-$(CONFIG_MNEHALEM) += \
+                $(call cc-option,-march=nehalem,$(call cc-option,-mtune=nehalem))
+        cflags-$(CONFIG_MWESTMERE) += \
+                $(call cc-option,-march=westmere,$(call cc-option,-mtune=westmere))
+        cflags-$(CONFIG_MSILVERMONT) += \
+                $(call cc-option,-march=silvermont,$(call cc-option,-mtune=silvermont))
+        cflags-$(CONFIG_MSANDYBRIDGE) += \
+                $(call cc-option,-march=sandybridge,$(call cc-option,-mtune=sandybridge))
+        cflags-$(CONFIG_MIVYBRIDGE) += \
+                $(call cc-option,-march=ivybridge,$(call cc-option,-mtune=ivybridge))
+        cflags-$(CONFIG_MHASWELL) += \
+                $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell))
+        cflags-$(CONFIG_MBROADWELL) += \
+                $(call cc-option,-march=broadwell,$(call cc-option,-mtune=broadwell))
+        cflags-$(CONFIG_MSKYLAKE) += \
+                $(call cc-option,-march=skylake,$(call cc-option,-mtune=skylake))
+        cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \
+                $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic))
         cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic)
         KBUILD_CFLAGS += $(cflags-y)
 
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 6647ed4..56aa88a 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -23,7 +23,16 @@ cflags-$(CONFIG_MK6)		+= -march=k6
 # Please note, that patches that add -march=athlon-xp and friends are pointless.
 # They make zero difference whatsosever to performance at this time.
 cflags-$(CONFIG_MK7)		+= -march=athlon
+cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native)
 cflags-$(CONFIG_MK8)		+= $(call cc-option,-march=k8,-march=athlon)
+cflags-$(CONFIG_MK8SSE3)		+= $(call cc-option,-march=k8-sse3,-march=athlon)
+cflags-$(CONFIG_MK10)	+= $(call cc-option,-march=amdfam10,-march=athlon)
+cflags-$(CONFIG_MBARCELONA)	+= $(call cc-option,-march=barcelona,-march=athlon)
+cflags-$(CONFIG_MBOBCAT)	+= $(call cc-option,-march=btver1,-march=athlon)
+cflags-$(CONFIG_MBULLDOZER)	+= $(call cc-option,-march=bdver1,-march=athlon)
+cflags-$(CONFIG_MPILEDRIVER)	+= $(call cc-option,-march=bdver2,-march=athlon)
+cflags-$(CONFIG_MSTEAMROLLER)	+= $(call cc-option,-march=bdver3,-march=athlon)
+cflags-$(CONFIG_MJAGUAR)	+= $(call cc-option,-march=btver2,-march=athlon)
 cflags-$(CONFIG_MCRUSOE)	+= -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
 cflags-$(CONFIG_MEFFICEON)	+= -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
 cflags-$(CONFIG_MWINCHIPC6)	+= $(call cc-option,-march=winchip-c6,-march=i586)
@@ -32,8 +41,16 @@ cflags-$(CONFIG_MCYRIXIII)	+= $(call cc-option,-march=c3,-march=i486) $(align)-f
 cflags-$(CONFIG_MVIAC3_2)	+= $(call cc-option,-march=c3-2,-march=i686)
 cflags-$(CONFIG_MVIAC7)		+= -march=i686
 cflags-$(CONFIG_MCORE2)		+= -march=i686 $(call tune,core2)
-cflags-$(CONFIG_MATOM)		+= $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \
-	$(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
+cflags-$(CONFIG_MNEHALEM)	+= -march=i686 $(call tune,nehalem)
+cflags-$(CONFIG_MWESTMERE)	+= -march=i686 $(call tune,westmere)
+cflags-$(CONFIG_MSILVERMONT)	+= -march=i686 $(call tune,silvermont)
+cflags-$(CONFIG_MSANDYBRIDGE)	+= -march=i686 $(call tune,sandybridge)
+cflags-$(CONFIG_MIVYBRIDGE)	+= -march=i686 $(call tune,ivybridge)
+cflags-$(CONFIG_MHASWELL)	+= -march=i686 $(call tune,haswell)
+cflags-$(CONFIG_MBROADWELL)	+= -march=i686 $(call tune,broadwell)
+cflags-$(CONFIG_MSKYLAKE)	+= -march=i686 $(call tune,skylake)
+cflags-$(CONFIG_MATOM)		+= $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \
+	$(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic))
 
 # AMD Elan support
 cflags-$(CONFIG_MELAN)		+= -march=i486
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index e3b7819..90f3c76 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -15,6 +15,24 @@
 #define MODULE_PROC_FAMILY "586MMX "
 #elif defined CONFIG_MCORE2
 #define MODULE_PROC_FAMILY "CORE2 "
+#elif defined CONFIG_MNATIVE
+#define MODULE_PROC_FAMILY "NATIVE "
+#elif defined CONFIG_MNEHALEM
+#define MODULE_PROC_FAMILY "NEHALEM "
+#elif defined CONFIG_MWESTMERE
+#define MODULE_PROC_FAMILY "WESTMERE "
+#elif defined CONFIG_MSILVERMONT
+#define MODULE_PROC_FAMILY "SILVERMONT "
+#elif defined CONFIG_MSANDYBRIDGE
+#define MODULE_PROC_FAMILY "SANDYBRIDGE "
+#elif defined CONFIG_MIVYBRIDGE
+#define MODULE_PROC_FAMILY "IVYBRIDGE "
+#elif defined CONFIG_MHASWELL
+#define MODULE_PROC_FAMILY "HASWELL "
+#elif defined CONFIG_MBROADWELL
+#define MODULE_PROC_FAMILY "BROADWELL "
+#elif defined CONFIG_MSKYLAKE
+#define MODULE_PROC_FAMILY "SKYLAKE "
 #elif defined CONFIG_MATOM
 #define MODULE_PROC_FAMILY "ATOM "
 #elif defined CONFIG_M686
@@ -33,6 +51,22 @@
 #define MODULE_PROC_FAMILY "K7 "
 #elif defined CONFIG_MK8
 #define MODULE_PROC_FAMILY "K8 "
+#elif defined CONFIG_MK8SSE3
+#define MODULE_PROC_FAMILY "K8SSE3 "
+#elif defined CONFIG_MK10
+#define MODULE_PROC_FAMILY "K10 "
+#elif defined CONFIG_MBARCELONA
+#define MODULE_PROC_FAMILY "BARCELONA "
+#elif defined CONFIG_MBOBCAT
+#define MODULE_PROC_FAMILY "BOBCAT "
+#elif defined CONFIG_MBULLDOZER
+#define MODULE_PROC_FAMILY "BULLDOZER "
+#elif defined CONFIG_MPILEDRIVER
+#define MODULE_PROC_FAMILY "STEAMROLLER "
+#elif defined CONFIG_MSTEAMROLLER
+#define MODULE_PROC_FAMILY "PILEDRIVER "
+#elif defined CONFIG_MJAGUAR
+#define MODULE_PROC_FAMILY "JAGUAR "
 #elif defined CONFIG_MELAN
 #define MODULE_PROC_FAMILY "ELAN "
 #elif defined CONFIG_MCRUSOE
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index c289e2f..3fc03a0 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -517,14 +517,14 @@ void fpu__clear(struct fpu *fpu)
 {
 	WARN_ON_FPU(fpu != &current->thread.fpu); /* Almost certainly an anomaly */
 
-	fpu__drop(fpu);
-
-	/*
-	 * Make sure fpstate is cleared and initialized.
-	 */
-	if (static_cpu_has(X86_FEATURE_FPU)) {
-		fpu__activate_curr(fpu);
-		user_fpu_begin();
+	if (!use_eager_fpu() || !static_cpu_has(X86_FEATURE_FPU)) {
+		/* FPU state will be reallocated lazily at the first use. */
+		fpu__drop(fpu);
+	} else {
+		if (!fpu->fpstate_active) {
+			fpu__activate_curr(fpu);
+			user_fpu_begin();
+		}
 		copy_init_fpstate_to_fpregs();
 	}
 }
diff --git a/block/Kconfig b/block/Kconfig
index 161491d..6da79e6 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -4,6 +4,7 @@
 menuconfig BLOCK
        bool "Enable the block layer" if EXPERT
        default y
+       select WBT
        help
 	 Provide block layer support for the kernel.
 
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 421bef9..48145be 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -39,9 +39,25 @@ config CFQ_GROUP_IOSCHED
 	---help---
 	  Enable group IO scheduling in CFQ.
 
+config IOSCHED_BFQ
+	tristate "BFQ I/O scheduler"
+	default y
+	---help---
+	  The BFQ I/O scheduler distributes bandwidth among all
+	  processes according to their weights, regardless of the
+	  device parameters and with any workload. It also guarantees
+	  a low latency to interactive and soft real-time applications.
+
+config BFQ_GROUP_IOSCHED
+	bool "BFQ hierarchical scheduling support"
+	depends on IOSCHED_BFQ && BLK_CGROUP
+	default n
+	---help---
+	  Enable hierarchical scheduling in BFQ, using the blkio controller.
+
 choice
 	prompt "Default I/O scheduler"
-	default DEFAULT_CFQ
+	default DEFAULT_BFQ
 	help
 	  Select the I/O scheduler which will be used by default for all
 	  block devices.
@@ -52,6 +68,16 @@ choice
 	config DEFAULT_CFQ
 		bool "CFQ" if IOSCHED_CFQ=y
 
+	config DEFAULT_BFQ
+		bool "BFQ" if IOSCHED_BFQ=y
+		help
+		  Selects BFQ as the default I/O scheduler which will be
+		  used by default for all block devices.
+		  The BFQ I/O scheduler aims at distributing the bandwidth
+		  as desired, independently of the disk parameters and with
+		  any workload. It also tries to guarantee low latency to
+		  interactive and soft real-time applications.
+
 	config DEFAULT_NOOP
 		bool "No-op"
 
@@ -61,6 +87,7 @@ config DEFAULT_IOSCHED
 	string
 	default "deadline" if DEFAULT_DEADLINE
 	default "cfq" if DEFAULT_CFQ
+	default "bfq" if DEFAULT_BFQ
 	default "noop" if DEFAULT_NOOP
 
 endmenu
diff --git a/block/Makefile b/block/Makefile
index 9eda232..b7aa613 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
 obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
 			blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
-			blk-lib.o blk-mq.o blk-mq-tag.o \
+			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
 			blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
 			genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
 			badblocks.o partitions/
@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
+obj-$(CONFIG_IOSCHED_BFQ)	+= bfq-iosched.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_CMDLINE_PARSER)	+= cmdline-parser.o
diff --git b/block/bfq-cgroup.c b/block/bfq-cgroup.c
new file mode 100644
index 0000000..569988b
--- /dev/null
+++ b/block/bfq-cgroup.c
@@ -0,0 +1,1208 @@
+/*
+ * BFQ: CGROUPS support.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
+ *
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
+ * file.
+ */
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+/* bfqg stats flags */
+enum bfqg_stats_flags {
+	BFQG_stats_waiting = 0,
+	BFQG_stats_idling,
+	BFQG_stats_empty,
+};
+
+#define BFQG_FLAG_FNS(name)						\
+static void bfqg_stats_mark_##name(struct bfqg_stats *stats)	\
+{									\
+	stats->flags |= (1 << BFQG_stats_##name);			\
+}									\
+static void bfqg_stats_clear_##name(struct bfqg_stats *stats)	\
+{									\
+	stats->flags &= ~(1 << BFQG_stats_##name);			\
+}									\
+static int bfqg_stats_##name(struct bfqg_stats *stats)		\
+{									\
+	return (stats->flags & (1 << BFQG_stats_##name)) != 0;		\
+}									\
+
+BFQG_FLAG_FNS(waiting)
+BFQG_FLAG_FNS(idling)
+BFQG_FLAG_FNS(empty)
+#undef BFQG_FLAG_FNS
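+
+/*
+ * For reference (spelled out here only as an illustration): the
+ * invocation BFQG_FLAG_FNS(waiting) above generates the three helpers
+ * bfqg_stats_mark_waiting(), bfqg_stats_clear_waiting() and
+ * bfqg_stats_waiting(), which respectively set, clear and test the
+ * BFQG_stats_waiting bit in stats->flags; the idling and empty
+ * variants follow the same pattern.
+ */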
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
+{
+	unsigned long long now;
+
+	if (!bfqg_stats_waiting(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_group_wait_time))
+		blkg_stat_add(&stats->group_wait_time,
+			      now - stats->start_group_wait_time);
+	bfqg_stats_clear_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+						 struct bfq_group *curr_bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (bfqg_stats_waiting(stats))
+		return;
+	if (bfqg == curr_bfqg)
+		return;
+	stats->start_group_wait_time = sched_clock();
+	bfqg_stats_mark_waiting(stats);
+}
+
+/* This should be called with the queue_lock held. */
+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
+{
+	unsigned long long now;
+
+	if (!bfqg_stats_empty(stats))
+		return;
+
+	now = sched_clock();
+	if (time_after64(now, stats->start_empty_time))
+		blkg_stat_add(&stats->empty_time,
+			      now - stats->start_empty_time);
+	bfqg_stats_clear_empty(stats);
+}
+
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
+{
+	blkg_stat_add(&bfqg->stats.dequeue, 1);
+}
+
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (blkg_rwstat_total(&stats->queued))
+		return;
+
+	/*
+	 * group is already marked empty. This can happen if bfqq got new
+	 * request in parent group and moved to this group while being added
+	 * to service tree. Just ignore the event and move on.
+	 */
+	if (bfqg_stats_empty(stats))
+		return;
+
+	stats->start_empty_time = sched_clock();
+	bfqg_stats_mark_empty(stats);
+}
+
+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	if (bfqg_stats_idling(stats)) {
+		unsigned long long now = sched_clock();
+
+		if (time_after64(now, stats->start_idle_time))
+			blkg_stat_add(&stats->idle_time,
+				      now - stats->start_idle_time);
+		bfqg_stats_clear_idling(stats);
+	}
+}
+
+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	stats->start_idle_time = sched_clock();
+	bfqg_stats_mark_idling(stats);
+}
+
+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+
+	blkg_stat_add(&stats->avg_queue_size_sum,
+		      blkg_rwstat_total(&stats->queued));
+	blkg_stat_add(&stats->avg_queue_size_samples, 1);
+	bfqg_stats_update_group_wait_time(stats);
+}
+
+static struct blkcg_policy blkcg_policy_bfq;
+
+/*
+ * blk-cgroup policy-related handlers
+ * The following functions help in converting between blk-cgroup
+ * internal structures and BFQ-specific structures.
+ */
+
+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd)
+{
+	return pd ? container_of(pd, struct bfq_group, pd) : NULL;
+}
+
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg)
+{
+	return pd_to_blkg(&bfqg->pd);
+}
+
+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg)
+{
+	struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq);
+
+	return pd_to_bfqg(pd);
+}
+
+/*
+ * bfq_group handlers
+ * The following functions help in navigating the bfq_group hierarchy
+ * by allowing to find the parent of a bfq_group or the bfq_group
+ * associated to a bfq_queue.
+ */
+
+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg)
+{
+	struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent;
+
+	return pblkg ? blkg_to_bfqg(pblkg) : NULL;
+}
+
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *group_entity = bfqq->entity.parent;
+
+	return group_entity ? container_of(group_entity, struct bfq_group,
+					   entity) :
+			      bfqq->bfqd->root_group;
+}
+
+/*
+ * The following two functions handle get and put of a bfq_group by
+ * wrapping the related blk-cgroup hooks.
+ */
+
+static void bfqg_get(struct bfq_group *bfqg)
+{
+	return blkg_get(bfqg_to_blkg(bfqg));
+}
+
+static void bfqg_put(struct bfq_group *bfqg)
+{
+	return blkg_put(bfqg_to_blkg(bfqg));
+}
+
+static void bfqg_stats_update_io_add(struct bfq_group *bfqg,
+				     struct bfq_queue *bfqq,
+				     int op, int op_flags)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, 1);
+	bfqg_stats_end_empty_time(&bfqg->stats);
+	if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+		bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
+}
+
+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op,
+					int op_flags)
+{
+	blkg_rwstat_add(&bfqg->stats.queued, op, op_flags, -1);
+}
+
+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg,  int op,
+					int op_flags)
+{
+	blkg_rwstat_add(&bfqg->stats.merged, op, op_flags, 1);
+}
+
+static void bfqg_stats_update_completion(struct bfq_group *bfqg,
+			uint64_t start_time, uint64_t io_start_time, int op,
+			int op_flags)
+{
+	struct bfqg_stats *stats = &bfqg->stats;
+	unsigned long long now = sched_clock();
+
+	if (time_after64(now, io_start_time))
+		blkg_rwstat_add(&stats->service_time, op, op_flags,
+				now - io_start_time);
+	if (time_after64(io_start_time, start_time))
+		blkg_rwstat_add(&stats->wait_time, op, op_flags,
+				io_start_time - start_time);
+}
+
+/* @stats = 0 */
+static void bfqg_stats_reset(struct bfqg_stats *stats)
+{
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_reset(&stats->merged);
+	blkg_rwstat_reset(&stats->service_time);
+	blkg_rwstat_reset(&stats->wait_time);
+	blkg_stat_reset(&stats->time);
+	blkg_stat_reset(&stats->avg_queue_size_sum);
+	blkg_stat_reset(&stats->avg_queue_size_samples);
+	blkg_stat_reset(&stats->dequeue);
+	blkg_stat_reset(&stats->group_wait_time);
+	blkg_stat_reset(&stats->idle_time);
+	blkg_stat_reset(&stats->empty_time);
+}
+
+/* @to += @from */
+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
+{
+	if (!to || !from)
+		return;
+
+	/* queued stats shouldn't be cleared */
+	blkg_rwstat_add_aux(&to->merged, &from->merged);
+	blkg_rwstat_add_aux(&to->service_time, &from->service_time);
+	blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
+	blkg_stat_add_aux(&to->time, &from->time);
+	blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+	blkg_stat_add_aux(&to->avg_queue_size_samples,
+			  &from->avg_queue_size_samples);
+	blkg_stat_add_aux(&to->dequeue, &from->dequeue);
+	blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+	blkg_stat_add_aux(&to->idle_time, &from->idle_time);
+	blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+}
+
+/*
+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors'
+ * recursive stats can still account for the amount used by this bfqg after
+ * it's gone.
+ */
+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg)
+{
+	struct bfq_group *parent;
+
+	if (!bfqg) /* root_group */
+		return;
+
+	parent = bfqg_parent(bfqg);
+
+	lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock);
+
+	if (unlikely(!parent))
+		return;
+
+	bfqg_stats_add_aux(&parent->stats, &bfqg->stats);
+	bfqg_stats_reset(&bfqg->stats);
+}
+
+static void bfq_init_entity(struct bfq_entity *entity,
+			    struct bfq_group *bfqg)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	entity->weight = entity->new_weight;
+	entity->orig_weight = entity->new_weight;
+	if (bfqq) {
+		bfqq->ioprio = bfqq->new_ioprio;
+		bfqq->ioprio_class = bfqq->new_ioprio_class;
+		bfqg_get(bfqg);
+	}
+	entity->parent = bfqg->my_entity;
+	entity->sched_data = &bfqg->sched_data;
+}
+
+static void bfqg_stats_exit(struct bfqg_stats *stats)
+{
+	blkg_rwstat_exit(&stats->merged);
+	blkg_rwstat_exit(&stats->service_time);
+	blkg_rwstat_exit(&stats->wait_time);
+	blkg_rwstat_exit(&stats->queued);
+	blkg_stat_exit(&stats->time);
+	blkg_stat_exit(&stats->avg_queue_size_sum);
+	blkg_stat_exit(&stats->avg_queue_size_samples);
+	blkg_stat_exit(&stats->dequeue);
+	blkg_stat_exit(&stats->group_wait_time);
+	blkg_stat_exit(&stats->idle_time);
+	blkg_stat_exit(&stats->empty_time);
+}
+
+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
+{
+	if (blkg_rwstat_init(&stats->merged, gfp) ||
+	    blkg_rwstat_init(&stats->service_time, gfp) ||
+	    blkg_rwstat_init(&stats->wait_time, gfp) ||
+	    blkg_rwstat_init(&stats->queued, gfp) ||
+	    blkg_stat_init(&stats->time, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
+	    blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
+	    blkg_stat_init(&stats->dequeue, gfp) ||
+	    blkg_stat_init(&stats->group_wait_time, gfp) ||
+	    blkg_stat_init(&stats->idle_time, gfp) ||
+	    blkg_stat_init(&stats->empty_time, gfp)) {
+		bfqg_stats_exit(stats);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
+{
+	return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL;
+}
+
+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg)
+{
+	return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq));
+}
+
+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
+{
+	struct bfq_group_data *bgd;
+
+	bgd = kzalloc(sizeof(*bgd), GFP_KERNEL);
+	if (!bgd)
+		return NULL;
+	return &bgd->pd;
+}
+
+static void bfq_cpd_init(struct blkcg_policy_data *cpd)
+{
+	struct bfq_group_data *d = cpd_to_bfqgd(cpd);
+
+	d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
+		CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
+}
+
+static void bfq_cpd_free(struct blkcg_policy_data *cpd)
+{
+	kfree(cpd_to_bfqgd(cpd));
+}
+
+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
+{
+	struct bfq_group *bfqg;
+
+	bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
+	if (!bfqg)
+		return NULL;
+
+	if (bfqg_stats_init(&bfqg->stats, gfp)) {
+		kfree(bfqg);
+		return NULL;
+	}
+
+	return &bfqg->pd;
+}
+
+static void bfq_pd_init(struct blkg_policy_data *pd)
+{
+	struct blkcg_gq *blkg;
+	struct bfq_group *bfqg;
+	struct bfq_data *bfqd;
+	struct bfq_entity *entity;
+	struct bfq_group_data *d;
+
+	blkg = pd_to_blkg(pd);
+	BUG_ON(!blkg);
+	bfqg = blkg_to_bfqg(blkg);
+	bfqd = blkg->q->elevator->elevator_data;
+	entity = &bfqg->entity;
+	d = blkcg_to_bfqgd(blkg->blkcg);
+
+	entity->orig_weight = entity->weight = entity->new_weight = d->weight;
+	entity->my_sched_data = &bfqg->sched_data;
+	bfqg->my_entity = entity; /*
+				   * the root_group's will be set to NULL
+				   * in bfq_init_queue()
+				   */
+	bfqg->bfqd = bfqd;
+	bfqg->active_entities = 0;
+	bfqg->rq_pos_tree = RB_ROOT;
+}
+
+static void bfq_pd_free(struct blkg_policy_data *pd)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+	bfqg_stats_exit(&bfqg->stats);
+	return kfree(bfqg);
+}
+
+static void bfq_pd_reset_stats(struct blkg_policy_data *pd)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+	bfqg_stats_reset(&bfqg->stats);
+}
+
+static void bfq_group_set_parent(struct bfq_group *bfqg,
+					struct bfq_group *parent)
+{
+	struct bfq_entity *entity;
+
+	BUG_ON(!parent);
+	BUG_ON(!bfqg);
+	BUG_ON(bfqg == parent);
+
+	entity = &bfqg->entity;
+	entity->parent = parent->my_entity;
+	entity->sched_data = &parent->sched_data;
+}
+
+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
+					 struct blkcg *blkcg)
+{
+	struct blkcg_gq *blkg;
+
+	blkg = blkg_lookup(blkcg, bfqd->queue);
+	if (likely(blkg))
+		return blkg_to_bfqg(blkg);
+	return NULL;
+}
+
+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+					    struct blkcg *blkcg)
+{
+	struct bfq_group *bfqg, *parent;
+	struct bfq_entity *entity;
+
+	assert_spin_locked(bfqd->queue->queue_lock);
+
+	bfqg = bfq_lookup_bfqg(bfqd, blkcg);
+
+	if (unlikely(!bfqg))
+		return NULL;
+
+	/*
+	 * Update chain of bfq_groups as we might be handling a leaf group
+	 * which, along with some of its relatives, has not been hooked yet
+	 * to the private hierarchy of BFQ.
+	 */
+	entity = &bfqg->entity;
+	for_each_entity(entity) {
+		bfqg = container_of(entity, struct bfq_group, entity);
+		BUG_ON(!bfqg);
+		if (bfqg != bfqd->root_group) {
+			parent = bfqg_parent(bfqg);
+			if (!parent)
+				parent = bfqd->root_group;
+			BUG_ON(!parent);
+			bfq_group_set_parent(bfqg, parent);
+		}
+	}
+
+	return bfqg;
+}
+
+static void bfq_pos_tree_add_move(struct bfq_data *bfqd,
+				  struct bfq_queue *bfqq);
+
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
+			    struct bfq_queue *bfqq,
+			    bool compensate,
+			    enum bfqq_expiration reason);
+
+/**
+ * bfq_bfqq_move - migrate @bfqq to @bfqg.
+ * @bfqd: queue descriptor.
+ * @bfqq: the queue to move.
+ * @bfqg: the group to move to.
+ *
+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating
+ * it on the new one.  Avoid putting the entity on the old group idle tree.
+ *
+ * Must be called under the queue lock; the cgroup owning @bfqg must
+ * not disappear (by now this just means that we are called under
+ * rcu_read_lock()).
+ */
+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  struct bfq_group *bfqg)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list));
+	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st);
+	BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list)
+	       && entity->on_st &&
+	       bfqq != bfqd->in_service_queue);
+	BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue);
+
+	/* If bfqq is empty, then bfq_bfqq_expire also invokes
+	 * bfq_del_bfqq_busy, thereby removing bfqq and its entity
+	 * from data structures related to current group. Otherwise we
+	 * need to remove bfqq explicitly with bfq_deactivate_bfqq, as
+	 * we do below.
+	 */
+	if (bfqq == bfqd->in_service_queue)
+		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+				false, BFQ_BFQQ_PREEMPTED);
+
+	BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
+	    && &bfq_entity_service_tree(entity)->idle !=
+	       entity->tree);
+
+	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
+
+	if (bfq_bfqq_busy(bfqq))
+		bfq_deactivate_bfqq(bfqd, bfqq, 0);
+	else if (entity->on_st) {
+		BUG_ON(&bfq_entity_service_tree(entity)->idle !=
+		       entity->tree);
+		bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
+	}
+	bfqg_put(bfqq_group(bfqq));
+
+	/*
+	 * Here we use a reference to bfqg.  We don't need a refcounter
+	 * as the cgroup reference will not be dropped, so that its
+	 * destroy() callback will not be invoked.
+	 */
+	entity->parent = bfqg->my_entity;
+	entity->sched_data = &bfqg->sched_data;
+	bfqg_get(bfqg);
+
+	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq));
+	if (bfq_bfqq_busy(bfqq)) {
+		bfq_pos_tree_add_move(bfqd, bfqq);
+		bfq_activate_bfqq(bfqd, bfqq);
+	}
+
+	if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+		bfq_schedule_dispatch(bfqd);
+	BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq)
+	       && &bfq_entity_service_tree(entity)->idle !=
+	       entity->tree);
+}
+
+/**
+ * __bfq_bic_change_cgroup - move @bic to @cgroup.
+ * @bfqd: the queue descriptor.
+ * @bic: the bic to move.
+ * @blkcg: the blk-cgroup to move to.
+ *
+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller
+ * has to make sure that the reference to cgroup is valid across the call.
+ *
+ * NOTE: an alternative approach might have been to store the current
+ * cgroup in bfqq and getting a reference to it, reducing the lookup
+ * time here, at the price of slightly more complex code.
+ */
+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
+						struct bfq_io_cq *bic,
+						struct blkcg *blkcg)
+{
+	struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
+	struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
+	struct bfq_group *bfqg;
+	struct bfq_entity *entity;
+
+	lockdep_assert_held(bfqd->queue->queue_lock);
+
+	bfqg = bfq_find_set_group(bfqd, blkcg);
+
+	if (unlikely(!bfqg))
+		bfqg = bfqd->root_group;
+
+	if (async_bfqq) {
+		entity = &async_bfqq->entity;
+
+		if (entity->sched_data != &bfqg->sched_data) {
+			bic_set_bfqq(bic, NULL, 0);
+			bfq_log_bfqq(bfqd, async_bfqq,
+				     "bic_change_group: %p %d",
+				     async_bfqq,
+				     async_bfqq->ref);
+			bfq_put_queue(async_bfqq);
+		}
+	}
+
+	if (sync_bfqq) {
+		entity = &sync_bfqq->entity;
+		if (entity->sched_data != &bfqg->sched_data)
+			bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+	}
+
+	return bfqg;
+}
+
+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+{
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+	struct bfq_group *bfqg = NULL;
+	uint64_t serial_nr;
+
+	rcu_read_lock();
+	serial_nr = bio_blkcg(bio)->css.serial_nr;
+
+	/*
+	 * Check whether blkcg has changed.  The condition may trigger
+	 * spuriously on a newly created cic but there's no harm.
+	 */
+	if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
+		goto out;
+
+	/*
+	 * If we have a non-root cgroup, we can depend on that to
+	 * do proper throttling of writes. Turn off wbt for that
+	 * case.
+	 */
+	if (bio_blkcg(bio) != &blkcg_root) {
+		struct request_queue *q = bfqd->queue;
+
+		if (q->rq_wb)
+			wbt_disable(q->rq_wb);
+	}
+
+	bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio));
+	bic->blkcg_serial_nr = serial_nr;
+out:
+	rcu_read_unlock();
+}
+
+/**
+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st.
+ * @st: the service tree being flushed.
+ */
+static void bfq_flush_idle_tree(struct bfq_service_tree *st)
+{
+	struct bfq_entity *entity = st->first_idle;
+
+	for (; entity ; entity = st->first_idle)
+		__bfq_deactivate_entity(entity, 0);
+}
+
+/**
+ * bfq_reparent_leaf_entity - move leaf entity to the root_group.
+ * @bfqd: the device data structure with the root group.
+ * @entity: the entity to move.
+ */
+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
+				     struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	BUG_ON(!bfqq);
+	bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
+}
+
+/**
+ * bfq_reparent_active_entities - move to the root group all active
+ *                                entities.
+ * @bfqd: the device data structure with the root group.
+ * @bfqg: the group to move from.
+ * @st: the service tree with the entities.
+ *
+ * Needs queue_lock to be taken and reference to be valid over the call.
+ */
+static void bfq_reparent_active_entities(struct bfq_data *bfqd,
+					 struct bfq_group *bfqg,
+					 struct bfq_service_tree *st)
+{
+	struct rb_root *active = &st->active;
+	struct bfq_entity *entity = NULL;
+
+	if (!RB_EMPTY_ROOT(&st->active))
+		entity = bfq_entity_of(rb_first(active));
+
+	for (; entity ; entity = bfq_entity_of(rb_first(active)))
+		bfq_reparent_leaf_entity(bfqd, entity);
+
+	if (bfqg->sched_data.in_service_entity)
+		bfq_reparent_leaf_entity(bfqd,
+			bfqg->sched_data.in_service_entity);
+}
+
+/**
+ * bfq_pd_offline - deactivate the entity associated with @pd,
+ *		    and reparent its children entities.
+ * @pd: descriptor of the policy going offline.
+ *
+ * blkio already grabs the queue_lock for us, so no need to use
+ * RCU-based magic
+ */
+static void bfq_pd_offline(struct blkg_policy_data *pd)
+{
+	struct bfq_service_tree *st;
+	struct bfq_group *bfqg;
+	struct bfq_data *bfqd;
+	struct bfq_entity *entity;
+	int i;
+
+	BUG_ON(!pd);
+	bfqg = pd_to_bfqg(pd);
+	BUG_ON(!bfqg);
+	bfqd = bfqg->bfqd;
+	BUG_ON(bfqd && !bfqd->root_group);
+
+	entity = bfqg->my_entity;
+
+	if (!entity) /* root group */
+		return;
+
+	/*
+	 * Empty all service_trees belonging to this group before
+	 * deactivating the group itself.
+	 */
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) {
+		BUG_ON(!bfqg->sched_data.service_tree);
+		st = bfqg->sched_data.service_tree + i;
+		/*
+		 * The idle tree may still contain bfq_queues belonging
+		 * to exited tasks because they never migrated to a different
+		 * cgroup from the one being destroyed now.  No one else
+		 * can access them so it's safe to act without any lock.
+		 */
+		bfq_flush_idle_tree(st);
+
+		/*
+		 * It may happen that some queues are still active
+		 * (busy) upon group destruction (if the corresponding
+		 * processes have been forced to terminate). We move
+		 * all the leaf entities corresponding to these queues
+		 * to the root_group.
+		 * Also, it may happen that the group has an entity
+		 * in service, which is disconnected from the active
+		 * tree: it must be moved, too.
+		 * There is no need to put the sync queues, as the
+		 * scheduler has taken no reference.
+		 */
+		bfq_reparent_active_entities(bfqd, bfqg, st);
+		BUG_ON(!RB_EMPTY_ROOT(&st->active));
+		BUG_ON(!RB_EMPTY_ROOT(&st->idle));
+	}
+	BUG_ON(bfqg->sched_data.next_in_service);
+	BUG_ON(bfqg->sched_data.in_service_entity);
+
+	__bfq_deactivate_entity(entity, 0);
+	bfq_put_async_queues(bfqd, bfqg);
+	BUG_ON(entity->tree);
+
+	/*
+	 * @blkg is going offline and will be ignored by
+	 * blkg_[rw]stat_recursive_sum().  Transfer stats to the parent so
+	 * that they don't get lost.  If IOs complete after this point, the
+	 * stats for them will be lost.  Oh well...
+	 */
+	bfqg_stats_xfer_dead(bfqg);
+}
+
+static void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+	struct blkcg_gq *blkg;
+
+	list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) {
+		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+		BUG_ON(!bfqg);
+
+		bfq_end_wr_async_queues(bfqd, bfqg);
+	}
+	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+static int bfq_io_show_weight(struct seq_file *sf, void *v)
+{
+	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	unsigned int val = 0;
+
+	if (bfqgd)
+		val = bfqgd->weight;
+
+	seq_printf(sf, "%u\n", val);
+
+	return 0;
+}
+
+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
+				    struct cftype *cftype,
+				    u64 val)
+{
+	struct blkcg *blkcg = css_to_blkcg(css);
+	struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+	struct blkcg_gq *blkg;
+	int ret = -ERANGE;
+
+	if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT)
+		return ret;
+
+	ret = 0;
+	spin_lock_irq(&blkcg->lock);
+	bfqgd->weight = (unsigned short)val;
+	hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+		struct bfq_group *bfqg = blkg_to_bfqg(blkg);
+
+		if (!bfqg)
+			continue;
+		/*
+		 * Setting the prio_changed flag of the entity
+		 * to 1 with new_weight == weight would re-set
+		 * the value of the weight to its ioprio mapping.
+		 * Set the flag only if necessary.
+		 */
+		if ((unsigned short)val != bfqg->entity.new_weight) {
+			bfqg->entity.new_weight = (unsigned short)val;
+			/*
+			 * Make sure that the above new value has been
+			 * stored in bfqg->entity.new_weight before
+			 * setting the prio_changed flag. In fact,
+			 * this flag may be read asynchronously (in
+			 * critical sections protected by a different
+			 * lock than that held here), and finding this
+			 * flag set may cause the execution of the code
+			 * for updating parameters whose value may
+			 * depend also on bfqg->entity.new_weight (in
+			 * __bfq_entity_update_weight_prio).
+			 * This barrier makes sure that the new value
+			 * of bfqg->entity.new_weight is correctly
+			 * seen in that code.
+			 */
+			smp_wmb();
+			bfqg->entity.prio_changed = 1;
+		}
+	}
+	spin_unlock_irq(&blkcg->lock);
+
+	return ret;
+}
+
+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
+				 char *buf, size_t nbytes,
+				 loff_t off)
+{
+	u64 weight;
+	/* First unsigned long found in the file is used */
+	int ret = kstrtoull(strim(buf), 0, &weight);
+
+	if (ret)
+		return ret;
+
+	return bfq_io_set_weight_legacy(of_css(of), NULL, weight);
+}
+
+static int bfqg_print_stat(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+			  &blkcg_policy_bfq, seq_cft(sf)->private, false);
+	return 0;
+}
+
+static int bfqg_print_rwstat(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat,
+			  &blkcg_policy_bfq, seq_cft(sf)->private, true);
+	return 0;
+}
+
+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
+					  &blkcg_policy_bfq, off);
+	return __blkg_prfill_u64(sf, pd, sum);
+}
+
+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
+					struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
+							   &blkcg_policy_bfq,
+							   off);
+	return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
+			  seq_cft(sf)->private, false);
+	return 0;
+}
+
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
+			  seq_cft(sf)->private, true);
+	return 0;
+}
+
+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
+			       int off)
+{
+	u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
+
+	return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false);
+	return 0;
+}
+
+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
+					 struct blkg_policy_data *pd, int off)
+{
+	struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
+					offsetof(struct blkcg_gq, stat_bytes));
+	u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+	return __blkg_prfill_u64(sf, pd, sum >> 9);
+}
+
+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0,
+			  false);
+	return 0;
+}
+
+
+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
+				      struct blkg_policy_data *pd, int off)
+{
+	struct bfq_group *bfqg = pd_to_bfqg(pd);
+	u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+	u64 v = 0;
+
+	if (samples) {
+		v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
+		v = div64_u64(v, samples);
+	}
+	__blkg_prfill_u64(sf, pd, v);
+	return 0;
+}
+
+/* print avg_queue_size */
+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
+{
+	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+			  bfqg_prfill_avg_queue_size, &blkcg_policy_bfq,
+			  0, false);
+	return 0;
+}
+
+static struct bfq_group *
+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+	int ret;
+
+	ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
+	if (ret)
+		return NULL;
+
+	return blkg_to_bfqg(bfqd->queue->root_blkg);
+}
+
+static struct cftype bfq_blkcg_legacy_files[] = {
+	{
+		.name = "bfq.weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = bfq_io_show_weight,
+		.write_u64 = bfq_io_set_weight_legacy,
+	},
+
+	/* statistics, covers only the tasks in the bfqg */
+	{
+		.name = "bfq.time",
+		.private = offsetof(struct bfq_group, stats.time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.sectors",
+		.seq_show = bfqg_print_stat_sectors,
+	},
+	{
+		.name = "bfq.io_service_bytes",
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_bytes,
+	},
+	{
+		.name = "bfq.io_serviced",
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_ios,
+	},
+	{
+		.name = "bfq.io_service_time",
+		.private = offsetof(struct bfq_group, stats.service_time),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_wait_time",
+		.private = offsetof(struct bfq_group, stats.wait_time),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_merged",
+		.private = offsetof(struct bfq_group, stats.merged),
+		.seq_show = bfqg_print_rwstat,
+	},
+	{
+		.name = "bfq.io_queued",
+		.private = offsetof(struct bfq_group, stats.queued),
+		.seq_show = bfqg_print_rwstat,
+	},
+
+	/* the same statistics which cover the bfqg and its descendants */
+	{
+		.name = "bfq.time_recursive",
+		.private = offsetof(struct bfq_group, stats.time),
+		.seq_show = bfqg_print_stat_recursive,
+	},
+	{
+		.name = "bfq.sectors_recursive",
+		.seq_show = bfqg_print_stat_sectors_recursive,
+	},
+	{
+		.name = "bfq.io_service_bytes_recursive",
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_bytes_recursive,
+	},
+	{
+		.name = "bfq.io_serviced_recursive",
+		.private = (unsigned long)&blkcg_policy_bfq,
+		.seq_show = blkg_print_stat_ios_recursive,
+	},
+	{
+		.name = "bfq.io_service_time_recursive",
+		.private = offsetof(struct bfq_group, stats.service_time),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_wait_time_recursive",
+		.private = offsetof(struct bfq_group, stats.wait_time),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_merged_recursive",
+		.private = offsetof(struct bfq_group, stats.merged),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.io_queued_recursive",
+		.private = offsetof(struct bfq_group, stats.queued),
+		.seq_show = bfqg_print_rwstat_recursive,
+	},
+	{
+		.name = "bfq.avg_queue_size",
+		.seq_show = bfqg_print_avg_queue_size,
+	},
+	{
+		.name = "bfq.group_wait_time",
+		.private = offsetof(struct bfq_group, stats.group_wait_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.idle_time",
+		.private = offsetof(struct bfq_group, stats.idle_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.empty_time",
+		.private = offsetof(struct bfq_group, stats.empty_time),
+		.seq_show = bfqg_print_stat,
+	},
+	{
+		.name = "bfq.dequeue",
+		.private = offsetof(struct bfq_group, stats.dequeue),
+		.seq_show = bfqg_print_stat,
+	},
+	{ }	/* terminate */
+};
+
+static struct cftype bfq_blkg_files[] = {
+	{
+		.name = "bfq.weight",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.seq_show = bfq_io_show_weight,
+		.write = bfq_io_set_weight,
+	},
+	{} /* terminate */
+};
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg,
+			struct bfq_queue *bfqq, int op, int op_flags) { }
+static inline void
+bfqg_stats_update_io_remove(struct bfq_group *bfqg, int op, int op_flags) { }
+static inline void
+bfqg_stats_update_io_merged(struct bfq_group *bfqg, int op, int op_flags) { }
+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg,
+			uint64_t start_time, uint64_t io_start_time, int op,
+			int op_flags) { }
+static inline void
+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
+				     struct bfq_group *curr_bfqg) { }
+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { }
+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
+
+static void bfq_init_entity(struct bfq_entity *entity,
+			    struct bfq_group *bfqg)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	entity->weight = entity->new_weight;
+	entity->orig_weight = entity->new_weight;
+	if (bfqq) {
+		bfqq->ioprio = bfqq->new_ioprio;
+		bfqq->ioprio_class = bfqq->new_ioprio_class;
+	}
+	entity->sched_data = &bfqg->sched_data;
+}
+
+static struct bfq_group *
+bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
+{
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+
+	return bfqd->root_group;
+}
+
+static void bfq_end_wr_async(struct bfq_data *bfqd)
+{
+	bfq_end_wr_async_queues(bfqd, bfqd->root_group);
+}
+
+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
+					    struct blkcg *blkcg)
+{
+	return bfqd->root_group;
+}
+
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
+{
+	return bfqq->bfqd->root_group;
+}
+
+static struct bfq_group *
+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
+{
+	struct bfq_group *bfqg;
+	int i;
+
+	bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node);
+	if (!bfqg)
+		return NULL;
+
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+		bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+
+	return bfqg;
+}
+#endif
diff --git b/block/bfq-ioc.c b/block/bfq-ioc.c
new file mode 100644
index 0000000..fb7bb8f
--- /dev/null
+++ b/block/bfq-ioc.c
@@ -0,0 +1,36 @@
+/*
+ * BFQ: I/O context handling.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it>
+ */
+
+/**
+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq.
+ * @icq: the iocontext queue.
+ */
+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
+{
+	/* bic->icq is the first member, %NULL will convert to %NULL */
+	return container_of(icq, struct bfq_io_cq, icq);
+}
+
+/**
+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
+ * @bfqd: the lookup key.
+ * @ioc: the io_context of the process doing I/O.
+ *
+ * Queue lock must be held.
+ */
+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
+					struct io_context *ioc)
+{
+	if (ioc)
+		return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue));
+	return NULL;
+}
diff --git b/block/bfq-iosched.c b/block/bfq-iosched.c
new file mode 100644
index 0000000..0f3081d
--- /dev/null
+++ b/block/bfq-iosched.c
@@ -0,0 +1,5291 @@
+/*
+ * Budget Fair Queueing (BFQ) I/O scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
+ *
+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ
+ * file.
+ *
+ * BFQ is a proportional-share storage-I/O scheduling algorithm based
+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns
+ * budgets, measured in number of sectors, to processes instead of
+ * time slices. The device is not granted to the in-service process
+ * for a given time slice, but until it has exhausted its assigned
+ * budget. This change from the time to the service domain enables BFQ
+ * to distribute the device throughput among processes as desired,
+ * without any distortion due to throughput fluctuations, or to device
+ * internal queueing. BFQ uses an ad hoc internal scheduler, called
+ * B-WF2Q+, to schedule processes according to their budgets. More
+ * precisely, BFQ schedules queues associated with processes. Thanks to
+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high
+ * budgets to I/O-bound processes issuing sequential requests (to
+ * boost the throughput), and yet guarantee a low latency to
+ * interactive and soft real-time applications.
+ *
+ * BFQ is described in [1], where also a reference to the initial, more
+ * theoretical paper on BFQ can be found. The interested reader can find
+ * in the latter paper full details on the main algorithm, as well as
+ * formulas of the guarantees and formal proofs of all the properties.
+ * With respect to the version of BFQ presented in these papers, this
+ * implementation adds a few more heuristics, such as the one that
+ * guarantees a low latency to soft real-time applications, and a
+ * hierarchical extension based on H-WF2Q+.
+ *
+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with
+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N)
+ * complexity derives from the one introduced with EEVDF in [3].
+ *
+ * [1] P. Valente and M. Andreolini, ``Improving Application Responsiveness
+ *     with the BFQ Disk I/O Scheduler'',
+ *     Proceedings of the 5th Annual International Systems and Storage
+ *     Conference (SYSTOR '12), June 2012.
+ *
+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf
+ *
+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing
+ *     Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689,
+ *     Oct 1997.
+ *
+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz
+ *
+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline
+ *     First: A Flexible and Accurate Mechanism for Proportional Share
+ *     Resource Allocation,'' technical report.
+ *
+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/cgroup.h>
+#include <linux/elevator.h>
+#include <linux/jiffies.h>
+#include <linux/rbtree.h>
+#include <linux/ioprio.h>
+#include "bfq.h"
+#include "blk.h"
+
+/* Expiration time of sync (0) and async (1) requests, in ns. */
+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
+
+/* Maximum backwards seek, in KiB. */
+static const int bfq_back_max = (16 * 1024);
+
+/* Penalty of a backwards seek, in number of sectors. */
+static const int bfq_back_penalty = 2;
+
+/* Idling period duration, in ns. */
+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125);
+
+/* Minimum number of assigned budgets for which stats are safe to compute. */
+static const int bfq_stats_min_budgets = 194;
+
+/* Default maximum budget values, in sectors and number of requests. */
+static const int bfq_default_max_budget = (16 * 1024);
+
+/*
+ * Async to sync throughput distribution is controlled as follows:
+ * when an async request is served, the entity is charged the number
+ * of sectors of the request, multiplied by the factor below
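+ * (for instance, with the default factor of 10, a 512-sector async
+ * request is charged 5120 sectors of budget)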
+ */
+static const int bfq_async_charge_factor = 10;
+
+/* Default timeout values, in jiffies, approximating CFQ defaults. */
+static const int bfq_timeout = (HZ / 8);
+
+struct kmem_cache *bfq_pool;
+
+/* Below this threshold (in ns), we consider thinktime immediate. */
+#define BFQ_MIN_TT		(2 * NSEC_PER_MSEC)
+
+/* hw_tag detection: parallel requests threshold and min samples needed. */
+#define BFQ_HW_QUEUE_THRESHOLD	4
+#define BFQ_HW_QUEUE_SAMPLES	32
+
+#define BFQQ_SEEK_THR		(sector_t)(8 * 100)
+#define BFQQ_CLOSE_THR		(sector_t)(8 * 1024)
+#define BFQQ_SEEKY(bfqq)	(hweight32(bfqq->seek_history) > 32/8)
+
+/* Min number of samples required to perform peak-rate update */
+#define BFQ_RATE_MIN_SAMPLES	32
+/* Min observation time interval required to perform a peak-rate update (ns) */
+#define BFQ_RATE_MIN_INTERVAL	(300*NSEC_PER_MSEC)
+/* Target observation time interval for a peak-rate update (ns) */
+#define BFQ_RATE_REF_INTERVAL	NSEC_PER_SEC
+
+/* Shift used for peak rate fixed precision calculations. */
+#define BFQ_RATE_SHIFT		16
+
+/*
+ * By default, BFQ computes the duration of the weight raising for
+ * interactive applications automatically, using the following formula:
+ * duration = (R / r) * T, where r is the peak rate of the device, and
+ * R and T are two reference parameters.
+ * In particular, R is the peak rate of the reference device (see below),
+ * and T is a reference time: given the systems that are likely to be
+ * installed on the reference device according to its speed class, T is
+ * about the maximum time needed, under BFQ and while reading two files in
+ * parallel, to load typical large applications on these systems.
+ * In practice, the slower/faster the device at hand is, the more/less it
+ * takes to load applications with respect to the reference device.
+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive
+ * applications.
+ *
+ * BFQ uses four different reference pairs (R, T), depending on:
+ * . whether the device is rotational or non-rotational;
+ * . whether the device is slow, such as old or portable HDDs, as well as
+ *   SD cards, or fast, such as newer HDDs and SSDs.
+ *
+ * The device's speed class is dynamically (re)detected in
+ * bfq_update_peak_rate() every time the estimated peak rate is updated.
+ *
+ * In the following definitions, R_slow[0]/R_fast[0] and
+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast
+ * rotational device, whereas R_slow[1]/R_fast[1] and
+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast
+ * non-rotational device. Finally, device_speed_thresh are the
+ * thresholds used to switch between speed classes. The reference
+ * rates are not the actual peak rates of the devices used as a
+ * reference, but slightly lower values. The reason for using these
+ * slightly lower values is that the peak-rate estimator tends to
+ * yield slightly lower values than the actual peak rate (it can yield
+ * the actual peak rate only if there is only one process doing I/O,
+ * and the process does sequential I/O).
+ *
+ * Both the reference peak rates and the thresholds are measured in
+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT.
+ */
+static int R_slow[2] = {1000, 10700};
+static int R_fast[2] = {14000, 33000};
+/*
+ * To improve readability, a conversion function is used to initialize the
+ * following arrays, which entails that they can be initialized only in a
+ * function.
+ */
+static int T_slow[2];
+static int T_fast[2];
+static int device_speed_thresh[2];
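+
+/*
+ * Illustrative sketch only (not used by BFQ itself): how a
+ * weight-raising duration following the formula above,
+ * duration = (R / r) * T, could be computed from a reference pair
+ * (R, T) and an estimated peak rate r. The name
+ * bfq_wr_duration_sketch is hypothetical; the scheduler derives the
+ * actual value internally from its own state.
+ */
+static inline u64 bfq_wr_duration_sketch(u64 r, u64 R, u64 T)
+{
+	/* Guard against a not-yet-estimated (zero) peak rate. */
+	if (!r)
+		return T;
+	return div64_u64(R * T, r);
+}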
+
+#define BFQ_SERVICE_TREE_INIT	((struct bfq_service_tree)		\
+				{ RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 })
+
+#define RQ_BIC(rq)		((struct bfq_io_cq *) (rq)->elv.priv[0])
+#define RQ_BFQQ(rq)		((rq)->elv.priv[1])
+
+static void bfq_schedule_dispatch(struct bfq_data *bfqd);
+
+#include "bfq-ioc.c"
+#include "bfq-sched.c"
+#include "bfq-cgroup.c"
+
+#define bfq_class_idle(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
+#define bfq_class_rt(bfqq)	((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
+
+#define bfq_sample_valid(samples)	((samples) > 80)
+
+/*
+ * We regard a request as SYNC, if either it's a read or has the SYNC bit
+ * set (in which case it could also be a direct WRITE).
+ */
+static int bfq_bio_sync(struct bio *bio)
+{
+	return bio_data_dir(bio) == READ || (bio->bi_opf & REQ_SYNC);
+}
+
+/*
+ * Scheduler run of queue, if there are requests pending and no one in the
+ * driver that will restart queueing.
+ */
+static void bfq_schedule_dispatch(struct bfq_data *bfqd)
+{
+	if (bfqd->queued != 0) {
+		bfq_log(bfqd, "schedule dispatch");
+		kblockd_schedule_work(&bfqd->unplug_work);
+	}
+}
+
+/*
+ * Lifted from AS - choose which of rq1 and rq2 is best served now.
+ * We choose the request that is closest to the head right now.  Distance
+ * behind the head is penalized and only allowed to a certain extent.
+ */
+static struct request *bfq_choose_req(struct bfq_data *bfqd,
+				      struct request *rq1,
+				      struct request *rq2,
+				      sector_t last)
+{
+	sector_t s1, s2, d1 = 0, d2 = 0;
+	unsigned long back_max;
+#define BFQ_RQ1_WRAP	0x01 /* request 1 wraps */
+#define BFQ_RQ2_WRAP	0x02 /* request 2 wraps */
+	unsigned int wrap = 0; /* bit mask: requests behind the disk head? */
+
+	if (!rq1 || rq1 == rq2)
+		return rq2;
+	if (!rq2)
+		return rq1;
+
+	if (rq_is_sync(rq1) && !rq_is_sync(rq2))
+		return rq1;
+	else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
+		return rq2;
+	if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META))
+		return rq1;
+	else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META))
+		return rq2;
+
+	s1 = blk_rq_pos(rq1);
+	s2 = blk_rq_pos(rq2);
+
+	/*
+	 * By definition, 1KiB is 2 sectors.
+	 */
+	back_max = bfqd->bfq_back_max * 2;
+
+	/*
+	 * Strict one way elevator _except_ in the case where we allow
+	 * short backward seeks which are biased as twice the cost of a
+	 * similar forward seek.
+	 */
+	if (s1 >= last)
+		d1 = s1 - last;
+	else if (s1 + back_max >= last)
+		d1 = (last - s1) * bfqd->bfq_back_penalty;
+	else
+		wrap |= BFQ_RQ1_WRAP;
+
+	if (s2 >= last)
+		d2 = s2 - last;
+	else if (s2 + back_max >= last)
+		d2 = (last - s2) * bfqd->bfq_back_penalty;
+	else
+		wrap |= BFQ_RQ2_WRAP;
+
+	/* Found required data */
+
+	/*
+	 * By doing switch() on the bit mask "wrap" we avoid having to
+	 * check two variables for all permutations: --> faster!
+	 */
+	switch (wrap) {
+	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
+		if (d1 < d2)
+			return rq1;
+		else if (d2 < d1)
+			return rq2;
+
+		if (s1 >= s2)
+			return rq1;
+		else
+			return rq2;
+
+	case BFQ_RQ2_WRAP:
+		return rq1;
+	case BFQ_RQ1_WRAP:
+		return rq2;
+	case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */
+	default:
+		/*
+		 * Since both rqs are wrapped,
+		 * start with the one that's further behind head
+		 * (--> only *one* back seek required),
+		 * since back seek takes more time than forward.
+		 */
+		if (s1 <= s2)
+			return rq1;
+		else
+			return rq2;
+	}
+}
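+
+/*
+ * Worked example of the backward-seek penalty above (illustrative
+ * numbers only): with bfq_back_penalty = 2, last = 1000, s1 = 1100
+ * and s2 = 960, we get d1 = 100 and d2 = (1000 - 960) * 2 = 80, so
+ * the slightly backward rq2 is still preferred over the farther
+ * forward rq1.
+ */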
+
+static struct bfq_queue *
+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root,
+		     sector_t sector, struct rb_node **ret_parent,
+		     struct rb_node ***rb_link)
+{
+	struct rb_node **p, *parent;
+	struct bfq_queue *bfqq = NULL;
+
+	parent = NULL;
+	p = &root->rb_node;
+	while (*p) {
+		struct rb_node **n;
+
+		parent = *p;
+		bfqq = rb_entry(parent, struct bfq_queue, pos_node);
+
+		/*
+		 * Sort strictly based on sector. Smallest to the left,
+		 * largest to the right.
+		 */
+		if (sector > blk_rq_pos(bfqq->next_rq))
+			n = &(*p)->rb_right;
+		else if (sector < blk_rq_pos(bfqq->next_rq))
+			n = &(*p)->rb_left;
+		else
+			break;
+		p = n;
+		bfqq = NULL;
+	}
+
+	*ret_parent = parent;
+	if (rb_link)
+		*rb_link = p;
+
+	bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d",
+		(unsigned long long) sector,
+		bfqq ? bfqq->pid : 0);
+
+	return bfqq;
+}
+
+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct rb_node **p, *parent;
+	struct bfq_queue *__bfqq;
+
+	if (bfqq->pos_root) {
+		rb_erase(&bfqq->pos_node, bfqq->pos_root);
+		bfqq->pos_root = NULL;
+	}
+
+	if (bfq_class_idle(bfqq))
+		return;
+	if (!bfqq->next_rq)
+		return;
+
+	bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+	__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
+			blk_rq_pos(bfqq->next_rq), &parent, &p);
+	if (!__bfqq) {
+		rb_link_node(&bfqq->pos_node, parent, p);
+		rb_insert_color(&bfqq->pos_node, bfqq->pos_root);
+	} else
+		bfqq->pos_root = NULL;
+}
+
+/*
+ * Tell whether there are active queues or groups with differentiated weights.
+ */
+static bool bfq_differentiated_weights(struct bfq_data *bfqd)
+{
+	/*
+	 * For weights to differ, at least one of the trees must contain
+	 * at least two nodes.
+	 */
+	return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) &&
+		(bfqd->queue_weights_tree.rb_node->rb_left ||
+		 bfqd->queue_weights_tree.rb_node->rb_right)
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	       ) ||
+	       (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) &&
+		(bfqd->group_weights_tree.rb_node->rb_left ||
+		 bfqd->group_weights_tree.rb_node->rb_right)
+#endif
+	       );
+}
+
+/*
+ * The following function returns true if every queue must receive the
+ * same share of the throughput (this condition is used when deciding
+ * whether idling may be disabled, see the comments in the function
+ * bfq_bfqq_may_idle()).
+ *
+ * Such a scenario occurs when:
+ * 1) all active queues have the same weight,
+ * 2) all active groups at the same level in the groups tree have the same
+ *    weight,
+ * 3) all active groups at the same level in the groups tree have the same
+ *    number of children.
+ *
+ * Unfortunately, keeping the necessary state for evaluating exactly the
+ * above symmetry conditions would be quite complex and time-consuming.
+ * Therefore this function evaluates, instead, the following stronger
+ * sub-conditions, for which it is much easier to maintain the needed
+ * state:
+ * 1) all active queues have the same weight,
+ * 2) all active groups have the same weight,
+ * 3) all active groups have at most one active child each.
+ * In particular, the last two conditions are always true if hierarchical
+ * support and the cgroups interface are not enabled, thus no state needs
+ * to be maintained in this case.
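+ *
+ * For instance, if one active queue has weight 100 and another has
+ * weight 200, then the queue weights tree contains two nodes, the
+ * scenario is reported as asymmetric by bfq_differentiated_weights(),
+ * and this function returns false.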
+ */
+static bool bfq_symmetric_scenario(struct bfq_data *bfqd)
+{
+	return !bfq_differentiated_weights(bfqd);
+}
+
+/*
+ * If the weight-counter tree passed as input contains no counter for
+ * the weight of the input entity, then add that counter; otherwise just
+ * increment the existing counter.
+ *
+ * Note that weight-counter trees contain few nodes in mostly symmetric
+ * scenarios. For example, if all queues have the same weight, then the
+ * weight-counter tree for the queues may contain at most one node.
+ * This holds even if low_latency is on, because weight-raised queues
+ * are not inserted in the tree.
+ * In most scenarios, the rate at which nodes are created/destroyed
+ * should be low too.
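+ *
+ * For instance, if the active queues have weights 100, 100 and 200,
+ * then the tree contains only two counters: one with weight 100 and
+ * num_active == 2, and one with weight 200 and num_active == 1.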
+ */
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
+				 struct bfq_entity *entity,
+				 struct rb_root *root)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/*
+	 * Do not insert if the entity is already associated with a
+	 * counter, which happens if:
+	 *   1) the entity is associated with a queue,
+	 *   2) a request arrival has caused the queue to become both
+	 *      non-weight-raised, and hence change its weight, and
+	 *      backlogged; in this respect, each of the two events
+	 *      causes an invocation of this function,
+	 *   3) this is the invocation of this function caused by the
+	 *      second event. This second invocation is actually useless,
+	 *      and we handle this fact by exiting immediately. More
+	 *      efficient or clearer solutions might possibly be adopted.
+	 */
+	if (entity->weight_counter)
+		return;
+
+	while (*new) {
+		struct bfq_weight_counter *__counter = container_of(*new,
+						struct bfq_weight_counter,
+						weights_node);
+		parent = *new;
+
+		if (entity->weight == __counter->weight) {
+			entity->weight_counter = __counter;
+			goto inc_counter;
+		}
+		if (entity->weight < __counter->weight)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter),
+					 GFP_ATOMIC);
+	/* on allocation failure, just leave the weight unaccounted for */
+	if (unlikely(!entity->weight_counter))
+		return;
+
+	entity->weight_counter->weight = entity->weight;
+	rb_link_node(&entity->weight_counter->weights_node, parent, new);
+	rb_insert_color(&entity->weight_counter->weights_node, root);
+
+inc_counter:
+	entity->weight_counter->num_active++;
+}
+
+/*
+ * Decrement the weight counter associated with the entity, and, if the
+ * counter reaches 0, remove the counter from the tree.
+ * See the comments to the function bfq_weights_tree_add() for considerations
+ * about overhead.
+ */
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
+				    struct bfq_entity *entity,
+				    struct rb_root *root)
+{
+	if (!entity->weight_counter)
+		return;
+
+	BUG_ON(RB_EMPTY_ROOT(root));
+	BUG_ON(entity->weight_counter->weight != entity->weight);
+
+	BUG_ON(!entity->weight_counter->num_active);
+	entity->weight_counter->num_active--;
+	if (entity->weight_counter->num_active > 0)
+		goto reset_entity_pointer;
+
+	rb_erase(&entity->weight_counter->weights_node, root);
+	kfree(entity->weight_counter);
+
+reset_entity_pointer:
+	entity->weight_counter = NULL;
+}
+
+static struct request *bfq_find_next_rq(struct bfq_data *bfqd,
+					struct bfq_queue *bfqq,
+					struct request *last)
+{
+	struct rb_node *rbnext = rb_next(&last->rb_node);
+	struct rb_node *rbprev = rb_prev(&last->rb_node);
+	struct request *next = NULL, *prev = NULL;
+
+	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
+
+	if (rbprev)
+		prev = rb_entry_rq(rbprev);
+
+	if (rbnext)
+		next = rb_entry_rq(rbnext);
+	else {
+		rbnext = rb_first(&bfqq->sort_list);
+		if (rbnext && rbnext != &last->rb_node)
+			next = rb_entry_rq(rbnext);
+	}
+
+	return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last));
+}
+
+/* see the definition of bfq_async_charge_factor for details */
+static unsigned long bfq_serv_to_charge(struct request *rq,
+					struct bfq_queue *bfqq)
+{
+	if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1)
+		return blk_rq_sectors(rq);
+
+	/*
+	 * If there are no weight-raised queues, then amplify service
+	 * by just the async charge factor; otherwise amplify service
+	 * by twice the async charge factor, to further reduce latency
+	 * for weight-raised queues.
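+	 *
+	 * For example, an async request of 8 sectors is charged
+	 * 8 * bfq_async_charge_factor sectors in the former case, and
+	 * twice that amount in the latter case.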
+	 */
+	if (bfqq->bfqd->wr_busy_queues == 0)
+		return blk_rq_sectors(rq) * bfq_async_charge_factor;
+
+	return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor;
+}
+
+/**
+ * bfq_updated_next_req - update the queue after a new next_rq selection.
+ * @bfqd: the device data the queue belongs to.
+ * @bfqq: the queue to update.
+ *
+ * If the first request of a queue changes we make sure that the queue
+ * has enough budget to serve at least its first request (if the
+ * request has grown).  We do this because if the queue has not enough
+ * budget for its first request, it has to go through two dispatch
+ * rounds to actually get it dispatched.
+ */
+static void bfq_updated_next_req(struct bfq_data *bfqd,
+				 struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	struct request *next_rq = bfqq->next_rq;
+	unsigned long new_budget;
+
+	if (!next_rq)
+		return;
+
+	if (bfqq == bfqd->in_service_queue)
+		/*
+		 * In order not to break guarantees, budgets cannot be
+		 * changed after an entity has been selected.
+		 */
+		return;
+
+	BUG_ON(entity->tree != &st->active);
+	BUG_ON(entity == entity->sched_data->in_service_entity);
+
+	new_budget = max_t(unsigned long, bfqq->max_budget,
+			   bfq_serv_to_charge(next_rq, bfqq));
+	if (entity->budget != new_budget) {
+		entity->budget = new_budget;
+		bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu",
+					 new_budget);
+		bfq_activate_bfqq(bfqd, bfqq);
+	}
+}
+
+static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
+{
+	u64 dur;
+
+	if (bfqd->bfq_wr_max_time > 0)
+		return bfqd->bfq_wr_max_time;
+
+	dur = bfqd->RT_prod;
+	do_div(dur, bfqd->peak_rate);
+
+	/*
+	 * Limit the duration to between 3 and 13 seconds. Tests show
+	 * that values higher than 13 seconds often yield the opposite
+	 * of the desired result, i.e., worsen responsiveness by
+	 * letting non-interactive and non-soft-real-time applications
+	 * preserve weight raising for too long a time interval.
+	 *
+	 * On the other hand, values lower than 3 seconds make it
+	 * difficult for most interactive tasks to complete their jobs
+	 * before weight-raising finishes.
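+	 *
+	 * For example, a computed duration of 20000 ms is clamped
+	 * down to 13000 ms, while one of 1000 ms is raised to 3000 ms.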
+	 */
+	if (dur > msecs_to_jiffies(13000))
+		dur = msecs_to_jiffies(13000);
+	else if (dur < msecs_to_jiffies(3000))
+		dur = msecs_to_jiffies(3000);
+
+	return dur;
+}
+
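+/*
+ * Restore into bfqq the per-queue state (idle window, IO_bound flag
+ * and weight-raising parameters) previously saved in the bic by
+ * bfq_bfqq_save_state().
+ */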
+static void
+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
+{
+	if (bic->saved_idle_window)
+		bfq_mark_bfqq_idle_window(bfqq);
+	else
+		bfq_clear_bfqq_idle_window(bfqq);
+
+	if (bic->saved_IO_bound)
+		bfq_mark_bfqq_IO_bound(bfqq);
+	else
+		bfq_clear_bfqq_IO_bound(bfqq);
+
+	bfqq->wr_coeff = bic->saved_wr_coeff;
+	bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
+	BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt));
+	bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
+	BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
+
+	if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
+	    time_is_before_jiffies(bfqq->last_wr_start_finish +
+				   bfqq->wr_cur_max_time))) {
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+		    "resume state: switching off wr");
+
+		bfqq->wr_coeff = 1;
+	}
+	/* make sure weight will be updated, regardless of how we got here */
+	bfqq->entity.prio_changed = 1;
+}
+
+static int bfqq_process_refs(struct bfq_queue *bfqq)
+{
+	int process_refs, io_refs;
+
+	lockdep_assert_held(bfqq->bfqd->queue->queue_lock);
+
+	io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
+	process_refs = bfqq->ref - io_refs - bfqq->entity.on_st;
+	BUG_ON(process_refs < 0);
+	return process_refs;
+}
+
+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */
+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_queue *item;
+	struct hlist_node *n;
+
+	hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node)
+		hlist_del_init(&item->burst_list_node);
+	hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+	bfqd->burst_size = 1;
+	bfqd->burst_parent_entity = bfqq->entity.parent;
+}
+
+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */
+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	/* Increment burst size to take into account also bfqq */
+	bfqd->burst_size++;
+
+	bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size);
+
+	BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh);
+
+	if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) {
+		struct bfq_queue *pos, *bfqq_item;
+		struct hlist_node *n;
+
+		/*
+		 * Enough queues have been activated shortly after each
+		 * other to consider this burst as large.
+		 */
+		bfqd->large_burst = true;
+		bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started");
+
+		/*
+		 * We can now mark all queues in the burst list as
+		 * belonging to a large burst.
+		 */
+		hlist_for_each_entry(bfqq_item, &bfqd->burst_list,
+				     burst_list_node) {
+			bfq_mark_bfqq_in_large_burst(bfqq_item);
+			bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst");
+		}
+		bfq_mark_bfqq_in_large_burst(bfqq);
+		bfq_log_bfqq(bfqd, bfqq, "marked in large burst");
+
+		/*
+		 * From now on, and until the current burst finishes, any
+		 * new queue being activated shortly after the last queue
+		 * was inserted in the burst can be immediately marked as
+		 * belonging to a large burst. So the burst list is not
+		 * needed any more. Remove it.
+		 */
+		hlist_for_each_entry_safe(pos, n, &bfqd->burst_list,
+					  burst_list_node)
+			hlist_del_init(&pos->burst_list_node);
+	} else /*
+		* Burst not yet large: add bfqq to the burst list. Do
+		* not increment the ref counter for bfqq, because bfqq
+		* is removed from the burst list before freeing bfqq
+		* in put_queue.
+		*/
+		hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list);
+}
+
+/*
+ * If many queues belonging to the same group happen to be created
+ * shortly after each other, then the processes associated with these
+ * queues have typically a common goal. In particular, bursts of queue
+ * creations are usually caused by services or applications that spawn
+ * many parallel threads/processes. Examples are systemd during boot,
+ * or git grep. To help these processes get their job done as soon as
+ * possible, it is usually better to not grant either weight-raising
+ * or device idling to their queues.
+ *
+ * In this comment we describe, firstly, the reasons why this fact
+ * holds, and, secondly, the next function, which implements the main
+ * steps needed to properly mark these queues so that they can then be
+ * treated in a different way.
+ *
+ * The above services or applications benefit mostly from a high
+ * throughput: the quicker the requests of the activated queues are
+ * cumulatively served, the sooner the target job of these queues gets
+ * completed. As a consequence, weight-raising any of these queues,
+ * which also implies idling the device for it, is almost always
+ * counterproductive. In most cases it just lowers throughput.
+ *
+ * On the other hand, a burst of queue creations may be caused also by
+ * the start of an application that does not consist of a lot of
+ * parallel I/O-bound threads. In fact, with a complex application,
+ * several short processes may need to be executed to start-up the
+ * application. In this respect, to start an application as quickly as
+ * possible, the best thing to do is in any case to privilege the I/O
+ * related to the application with respect to all other
+ * I/O. Therefore, the best strategy to start as quickly as possible
+ * an application that causes a burst of queue creations is to
+ * weight-raise all the queues created during the burst. This is the
+ * exact opposite of the best strategy for the other type of bursts.
+ *
+ * In the end, to take the best action for each of the two cases, the
+ * two types of bursts need to be distinguished. Fortunately, this
+ * seems relatively easy, by looking at the sizes of the bursts. In
+ * particular, we found a threshold such that only bursts with a
+ * larger size than that threshold are apparently caused by
+ * services or commands such as systemd or git grep. For brevity,
+ * hereafter we call just 'large' these bursts. BFQ *does not*
+ * weight-raise queues whose creation occurs in a large burst. In
+ * addition, for each of these queues BFQ performs or does not perform
+ * idling depending on which choice boosts the throughput more. The
+ * exact choice depends on the device and request pattern at
+ * hand.
+ *
+ * Unfortunately, false positives may occur while an interactive task
+ * is starting (e.g., an application is being started). The
+ * consequence is that the queues associated with the task do not
+ * enjoy weight raising as expected. Fortunately these false positives
+ * are very rare. They typically occur if some service happens to
+ * start doing I/O exactly when the interactive task starts.
+ *
+ * Turning back to the next function, it implements all the steps
+ * needed to detect the occurrence of a large burst and to properly
+ * mark all the queues belonging to it (so that they can then be
+ * treated in a different way). This goal is achieved by maintaining a
+ * "burst list" that holds, temporarily, the queues that belong to the
+ * burst in progress. The list is then used to mark these queues as
+ * belonging to a large burst if the burst does become large. The main
+ * steps are the following.
+ *
+ * . when the very first queue is created, the queue is inserted into the
+ *   list (as it could be the first queue in a possible burst)
+ *
+ * . if the current burst has not yet become large, and a queue Q that does
+ *   not yet belong to the burst is activated shortly after the last time
+ *   at which a new queue entered the burst list, then the function appends
+ *   Q to the burst list
+ *
+ * . if, as a consequence of the previous step, the burst size reaches
+ *   the large-burst threshold, then
+ *
+ *     . all the queues in the burst list are marked as belonging to a
+ *       large burst
+ *
+ *     . the burst list is deleted; in fact, the burst list already served
+ *       its purpose (keeping temporarily track of the queues in a burst,
+ *       so as to be able to mark them as belonging to a large burst in the
+ *       previous sub-step), and now is not needed any more
+ *
+ *     . the device enters a large-burst mode
+ *
+ * . if a queue Q that does not belong to the burst is created while
+ *   the device is in large-burst mode and shortly after the last time
+ *   at which a queue either entered the burst list or was marked as
+ *   belonging to the current large burst, then Q is immediately marked
+ *   as belonging to a large burst.
+ *
+ * . if a queue Q that does not belong to the burst is created a while
+ *   after, i.e., not shortly after, the last time at which a queue
+ *   either entered the burst list or was marked as belonging to the
+ *   current large burst, then the current burst is deemed as finished and:
+ *
+ *        . the large-burst mode is reset if set
+ *
+ *        . the burst list is emptied
+ *
+ *        . Q is inserted in the burst list, as Q may be the first queue
+ *          in a possible new burst (then the burst list contains just Q
+ *          after this step).
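+ *
+ * As a concrete example, consider a boot sequence during which many
+ * services issue their first I/O within bfq_burst_interval of each
+ * other: each newly created queue is appended to the burst list and,
+ * as soon as the list size reaches bfq_large_burst_thresh, all the
+ * queues in the list are marked as belonging to a large burst and
+ * are consequently denied weight-raising.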
+ */
+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	/*
+	 * If bfqq is already in the burst list or is part of a large
+	 * burst, or finally has just been split, then there is
+	 * nothing else to do.
+	 */
+	if (!hlist_unhashed(&bfqq->burst_list_node) ||
+	    bfq_bfqq_in_large_burst(bfqq) ||
+	    time_is_after_eq_jiffies(bfqq->split_time +
+				     msecs_to_jiffies(10)))
+		return;
+
+	/*
+	 * If bfqq's creation happens late enough, or bfqq belongs to
+	 * a different group than the burst group, then the current
+	 * burst is finished, and related data structures must be
+	 * reset.
+	 *
+	 * In this respect, consider the special case where bfqq is
+	 * the very first queue created after BFQ is selected for this
+	 * device. In this case, last_ins_in_burst and
+	 * burst_parent_entity are not yet significant when we get
+	 * here. But it is easy to verify that, whether or not the
+	 * following condition is true, bfqq will end up being
+	 * inserted into the burst list. In particular the list will
+	 * happen to contain only bfqq. And this is exactly what has
+	 * to happen, as bfqq may be the first queue of the first
+	 * burst.
+	 */
+	if (time_is_before_jiffies(bfqd->last_ins_in_burst +
+	    bfqd->bfq_burst_interval) ||
+	    bfqq->entity.parent != bfqd->burst_parent_entity) {
+		bfqd->large_burst = false;
+		bfq_reset_burst_list(bfqd, bfqq);
+		bfq_log_bfqq(bfqd, bfqq,
+			"handle_burst: late activation or different group");
+		goto end;
+	}
+
+	/*
+	 * If we get here, then bfqq is being activated shortly after the
+	 * last queue. So, if the current burst is also large, we can mark
+	 * bfqq as belonging to this large burst immediately.
+	 */
+	if (bfqd->large_burst) {
+		bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst");
+		bfq_mark_bfqq_in_large_burst(bfqq);
+		goto end;
+	}
+
+	/*
+	 * If we get here, then a large-burst state has not yet been
+	 * reached, but bfqq is being activated shortly after the last
+	 * queue. Then we add bfqq to the burst.
+	 */
+	bfq_add_to_burst(bfqd, bfqq);
+end:
+	/*
+	 * At this point, bfqq either has been added to the current
+	 * burst or has caused the current burst to terminate and a
+	 * possible new burst to start. In particular, in the second
+	 * case, bfqq has become the first queue in the possible new
+	 * burst.  In both cases last_ins_in_burst needs to be moved
+	 * forward.
+	 */
+	bfqd->last_ins_in_burst = jiffies;
+
+}
+
+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	return entity->budget - entity->service;
+}
+
+/*
+ * If enough samples have been computed, return the current max budget
+ * stored in bfqd, which is dynamically updated according to the
+ * estimated disk peak rate; otherwise return the default max budget
+ */
+static int bfq_max_budget(struct bfq_data *bfqd)
+{
+	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+		return bfq_default_max_budget;
+	else
+		return bfqd->bfq_max_budget;
+}
+
+/*
+ * Return min budget, which is a fraction of the current or default
+ * max budget (trying with 1/32)
+ */
+static int bfq_min_budget(struct bfq_data *bfqd)
+{
+	if (bfqd->budgets_assigned < bfq_stats_min_budgets)
+		return bfq_default_max_budget / 32;
+	else
+		return bfqd->bfq_max_budget / 32;
+}
+
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
+			    struct bfq_queue *bfqq,
+			    bool compensate,
+			    enum bfqq_expiration reason);
+
+/*
+ * The next function, invoked after the input queue bfqq switches from
+ * idle to busy, updates the budget of bfqq. The function also tells
+ * whether the in-service queue should be expired, by returning
+ * true. The purpose of expiring the in-service queue is to give bfqq
+ * the chance to possibly preempt the in-service queue, and the reason
+ * for preempting the in-service queue is to achieve one of the two
+ * goals below.
+ *
+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has
+ * expired because it has remained idle. In particular, bfqq may have
+ * expired for one of the following two reasons:
+ *
+ * - BFQ_BFQQ_NO_MORE_REQUESTS bfqq did not enjoy any device idling and
+ *   did not make it to issue a new request before its last request
+ *   was served;
+ *
+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue
+ *   a new request before the expiration of the idling-time.
+ *
+ * Even if bfqq has expired for one of the above reasons, the process
+ * associated with the queue may however be issuing requests greedily,
+ * and thus be sensitive to the bandwidth it receives (bfqq may have
+ * remained idle for other reasons: CPU high load, bfqq not enjoying
+ * idling, I/O throttling somewhere in the path from the process to
+ * the I/O scheduler, ...). But if, after every expiration for one of
+ * the above two reasons, bfqq has to wait for the service of at least
+ * one full budget of another queue before being served again, then
+ * bfqq is likely to get a much lower bandwidth or resource time than
+ * its reserved ones. To address this issue, two countermeasures need
+ * to be taken.
+ *
+ * First, the budget and the timestamps of bfqq need to be updated in
+ * a special way on bfqq reactivation: they need to be updated as if
+ * bfqq did not remain idle and did not expire. In fact, if they are
+ * computed as if bfqq expired and remained idle until reactivation,
+ * then the process associated with bfqq is treated as if, instead of
+ * being greedy, it stopped issuing requests when bfqq remained idle,
+ * and restarts issuing requests only on this reactivation. In other
+ * words, the scheduler does not help the process recover the "service
+ * hole" between bfqq expiration and reactivation. As a consequence,
+ * the process receives a lower bandwidth than its reserved one. In
+ * contrast, to recover this hole, the budget must be updated as if
+ * bfqq was not expired at all before this reactivation, i.e., it must
+ * be set to the value of the remaining budget when bfqq was
+ * expired. Along the same line, timestamps need to be assigned the
+ * value they had the last time bfqq was selected for service, i.e.,
+ * before last expiration. Thus timestamps need to be back-shifted
+ * with respect to their normal computation (see [1] for more details
+ * on this tricky aspect).
+ *
+ * Secondly, to allow the process to recover the hole, the in-service
+ * queue must be expired too, to give bfqq the chance to preempt it
+ * immediately. In fact, if bfqq has to wait for a full budget of the
+ * in-service queue to be completed, then it may become impossible to
+ * let the process recover the hole, even if the back-shifted
+ * timestamps of bfqq are lower than those of the in-service queue. If
+ * this happens for most or all of the holes, then the process may not
+ * receive its reserved bandwidth. In this respect, it is worth noting
+ * that, since the service of outstanding requests cannot be preempted,
+ * a small fraction of the holes may nevertheless be unrecoverable,
+ * thereby causing a small loss of bandwidth.
+ *
+ * The last important point is detecting whether bfqq does need this
+ * bandwidth recovery. In this respect, the next function deems the
+ * process associated with bfqq greedy, and thus allows it to recover
+ * the hole, if: 1) the process is waiting for the arrival of a new
+ * request (which implies that bfqq expired for one of the above two
+ * reasons), and 2) such a request has arrived soon. The first
+ * condition is controlled through the flag non_blocking_wait_rq,
+ * while the second through the flag arrived_in_time. If both
+ * conditions hold, then the function computes the budget in the
+ * above-described special way, and signals that the in-service queue
+ * should be expired. Timestamp back-shifting is done later in
+ * __bfq_activate_entity.
+ *
+ * 2. Reduce latency. Even if timestamps are not backshifted to let
+ * the process associated with bfqq recover a service hole, bfqq may
+ * however happen to have, after being (re)activated, a lower finish
+ * timestamp than the in-service queue.  That is, the next budget of
+ * bfqq may have to be completed before the one of the in-service
+ * queue. If this is the case, then preempting the in-service queue
+ * allows this goal to be achieved, apart from the unpreemptible,
+ * outstanding requests mentioned above.
+ *
+ * Unfortunately, regardless of which of the above two goals one wants
+ * to achieve, service trees need first to be updated to know whether
+ * the in-service queue must be preempted. To have service trees
+ * correctly updated, the in-service queue must be expired and
+ * rescheduled, and bfqq must be scheduled too. This is one of the
+ * most costly operations (in future versions, the scheduling
+ * mechanism may be re-designed in such a way to make it possible to
+ * know whether preemption is needed without needing to update service
+ * trees). In addition, queue preemptions almost always cause random
+ * I/O, and thus loss of throughput. Because of these facts, the next
+ * function adopts the following simple scheme to avoid both costly
+ * operations and too frequent preemptions: it requests the expiration
+ * of the in-service queue (unconditionally) only for queues that need
+ * to recover a hole, or that either are weight-raised or deserve to
+ * be weight-raised.
+ */
+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
+						struct bfq_queue *bfqq,
+						bool arrived_in_time,
+						bool wr_or_deserves_wr)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) {
+		/*
+		 * We do not clear the flag non_blocking_wait_rq here, as
+		 * the latter is used in bfq_activate_bfqq to signal
+		 * that timestamps need to be back-shifted (and is
+		 * cleared right after).
+		 */
+
+		/*
+		 * In the next assignment we rely on the fact that
+		 * neither entity->service nor entity->budget is
+		 * updated on expiration if bfqq is empty (see
+		 * __bfq_bfqq_recalc_budget). Thus both quantities
+		 * remain unchanged after such an expiration, and the
+		 * following statement therefore assigns to
+		 * entity->budget the remaining budget on such an
+		 * expiration. For clarity, entity->service is not
+		 * updated on expiration in any case, and, in normal
+		 * operation, is reset only when bfqq is selected for
+		 * service (see bfq_get_next_queue).
+		 */
+		BUG_ON(bfqq->max_budget < 0);
+		entity->budget = min_t(unsigned long,
+				       bfq_bfqq_budget_left(bfqq),
+				       bfqq->max_budget);
+
+		BUG_ON(entity->budget < 0);
+		return true;
+	}
+
+	BUG_ON(bfqq->max_budget < 0);
+	entity->budget = max_t(unsigned long, bfqq->max_budget,
+			       bfq_serv_to_charge(bfqq->next_rq, bfqq));
+	BUG_ON(entity->budget < 0);
+
+	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+	return wr_or_deserves_wr;
+}
+
+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd,
+					     struct bfq_queue *bfqq,
+					     unsigned int old_wr_coeff,
+					     bool wr_or_deserves_wr,
+					     bool interactive,
+					     bool in_burst,
+					     bool soft_rt)
+{
+	if (old_wr_coeff == 1 && wr_or_deserves_wr) {
+		/* start a weight-raising period */
+		if (interactive) {
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+		} else {
+			bfqq->wr_start_at_switch_to_srt = jiffies;
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+				BFQ_SOFTRT_WEIGHT_FACTOR;
+			bfqq->wr_cur_max_time =
+				bfqd->bfq_wr_rt_max_time;
+		}
+		/*
+		 * If needed, further reduce budget to make sure it is
+		 * close to bfqq's backlog, so as to reduce the
+		 * scheduling-error component due to a too large
+		 * budget. Do not care about throughput consequences,
+		 * but only about latency. Finally, do not assign a
+		 * too small budget either, to avoid increasing
+		 * latency by causing too frequent expirations.
+		 */
+		bfqq->entity.budget = min_t(unsigned long,
+					    bfqq->entity.budget,
+					    2 * bfq_min_budget(bfqd));
+
+		bfq_log_bfqq(bfqd, bfqq,
+			     "wrais starting at %lu, rais_max_time %u",
+			     jiffies,
+			     jiffies_to_msecs(bfqq->wr_cur_max_time));
+	} else if (old_wr_coeff > 1) {
+		if (interactive) { /* update wr coeff and duration */
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+		} else if (in_burst) {
+			bfqq->wr_coeff = 1;
+			bfq_log_bfqq(bfqd, bfqq,
+				     "wrais ending at %lu, rais_max_time %u",
+				     jiffies,
+				     jiffies_to_msecs(bfqq->
+						      wr_cur_max_time));
+		} else if (soft_rt) {
+			/*
+			 * The application is now or still meeting the
+			 * requirements for being deemed soft rt.  We
+			 * can then correctly and safely (re)charge
+			 * the weight-raising duration for the
+			 * application with the weight-raising
+			 * duration for soft rt applications.
+			 *
+			 * In particular, doing this recharge now, i.e.,
+			 * before the weight-raising period for the
+			 * application finishes, reduces the probability
+			 * of the following negative scenario:
+			 * 1) the weight of a soft rt application is
+			 *    raised at startup (as for any newly
+			 *    created application),
+			 * 2) since the application is not interactive,
+			 *    at a certain time weight-raising is
+			 *    stopped for the application,
+			 * 3) at that time the application happens to
+			 *    still have pending requests, and hence
+			 *    is destined to not have a chance to be
+			 *    deemed soft rt before these requests are
+			 *    completed (see the comments to the
+			 *    function bfq_bfqq_softrt_next_start()
+			 *    for details on soft rt detection),
+			 * 4) these pending requests experience a high
+			 *    latency because the application is not
+			 *    weight-raised while they are pending.
+			 */
+			if (bfqq->wr_cur_max_time !=
+				bfqd->bfq_wr_rt_max_time) {
+				bfqq->wr_start_at_switch_to_srt =
+					bfqq->last_wr_start_finish;
+				BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
+
+				bfqq->wr_cur_max_time =
+					bfqd->bfq_wr_rt_max_time;
+				bfqq->wr_coeff = bfqd->bfq_wr_coeff *
+					BFQ_SOFTRT_WEIGHT_FACTOR;
+				bfq_log_bfqq(bfqd, bfqq,
+					     "switching to soft_rt wr");
+			} else
+				bfq_log_bfqq(bfqd, bfqq,
+					"moving forward soft_rt wr duration");
+			bfqq->last_wr_start_finish = jiffies;
+		}
+	}
+}
+
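+/*
+ * Tell whether bfqq has no dispatched requests and its budget timeout
+ * elapsed more than bfq_wr_min_idle_time ago: such a long idle period
+ * is one of the conditions for deeming a (re)activation interactive.
+ */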
+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
+					struct bfq_queue *bfqq)
+{
+	return bfqq->dispatched == 0 &&
+		time_is_before_jiffies(
+			bfqq->budget_timeout +
+			bfqd->bfq_wr_min_idle_time);
+}
+
+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
+					     struct bfq_queue *bfqq,
+					     int old_wr_coeff,
+					     struct request *rq,
+					     bool *interactive)
+{
+	bool soft_rt, in_burst, wr_or_deserves_wr,
+		bfqq_wants_to_preempt,
+		idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq),
+		/*
+		 * See the comments on
+		 * bfq_bfqq_update_budg_for_activation for
+		 * details on the usage of the next variable.
+		 */
+		arrived_in_time = ktime_get_ns() <=
+			RQ_BIC(rq)->ttime.last_end_request +
+			bfqd->bfq_slice_idle * 3;
+
+	bfq_log_bfqq(bfqd, bfqq,
+		     "bfq_add_request non-busy: "
+		     "jiffies %lu, in_time %d, idle_long %d busyw %d "
+		     "wr_coeff %u",
+		     jiffies, arrived_in_time,
+		     idle_for_long_time,
+		     bfq_bfqq_non_blocking_wait_rq(bfqq),
+		     old_wr_coeff);
+
+	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+
+	BUG_ON(bfqq == bfqd->in_service_queue);
+	bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq,
+				 req_op(rq), rq->cmd_flags);
+
+	/*
+	 * bfqq deserves to be weight-raised if:
+	 * - it is sync,
+	 * - it does not belong to a large burst,
+	 * - it has been idle for enough time or is soft real-time,
+	 * - it is linked to a bfq_io_cq (it is not shared in any sense)
+	 */
+	in_burst = bfq_bfqq_in_large_burst(bfqq);
+	soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
+		!in_burst &&
+		time_is_before_jiffies(bfqq->soft_rt_next_start);
+	*interactive =
+		!in_burst &&
+		idle_for_long_time;
+	wr_or_deserves_wr = bfqd->low_latency &&
+		(bfqq->wr_coeff > 1 ||
+		 (bfq_bfqq_sync(bfqq) &&
+		  bfqq->bic && (*interactive || soft_rt)));
+
+	bfq_log_bfqq(bfqd, bfqq,
+		     "bfq_add_request: "
+		     "in_burst %d, "
+		     "soft_rt %d (next %lu), inter %d, bic %p",
+		     bfq_bfqq_in_large_burst(bfqq), soft_rt,
+		     bfqq->soft_rt_next_start,
+		     *interactive,
+		     bfqq->bic);
+
+	/*
+	 * Using the last flag, update budget and check whether bfqq
+	 * may want to preempt the in-service queue.
+	 */
+	bfqq_wants_to_preempt =
+		bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
+						    arrived_in_time,
+						    wr_or_deserves_wr);
+
+	/*
+	 * If bfqq happened to be activated in a burst, but has been
+	 * idle for much more than an interactive queue, then we
+	 * assume that, in the overall I/O initiated in the burst, the
+	 * I/O associated with bfqq is finished. So bfqq does not need
+	 * to be treated as a queue belonging to a burst
+	 * anymore. Accordingly, we reset bfqq's in_large_burst flag
+	 * if set, and remove bfqq from the burst list if it's
+	 * there. We do not decrement burst_size, because the fact
+	 * that bfqq does not need to belong to the burst list any
+	 * more does not invalidate the fact that bfqq was created in
+	 * a burst.
+	 */
+	if (likely(!bfq_bfqq_just_created(bfqq)) &&
+	    idle_for_long_time &&
+	    time_is_before_jiffies(
+		    bfqq->budget_timeout +
+		    msecs_to_jiffies(10000))) {
+		hlist_del_init(&bfqq->burst_list_node);
+		bfq_clear_bfqq_in_large_burst(bfqq);
+	}
+
+	bfq_clear_bfqq_just_created(bfqq);
+
+	if (!bfq_bfqq_IO_bound(bfqq)) {
+		if (arrived_in_time) {
+			bfqq->requests_within_timer++;
+			if (bfqq->requests_within_timer >=
+			    bfqd->bfq_requests_within_timer)
+				bfq_mark_bfqq_IO_bound(bfqq);
+		} else
+			bfqq->requests_within_timer = 0;
+		bfq_log_bfqq(bfqd, bfqq, "requests in time %d",
+			     bfqq->requests_within_timer);
+	}
+
+	if (bfqd->low_latency) {
+		if (unlikely(time_is_after_jiffies(bfqq->split_time)))
+			/* wraparound */
+			bfqq->split_time =
+				jiffies - bfqd->bfq_wr_min_idle_time - 1;
+
+		if (time_is_before_jiffies(bfqq->split_time +
+					   bfqd->bfq_wr_min_idle_time)) {
+			bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq,
+							 old_wr_coeff,
+							 wr_or_deserves_wr,
+							 *interactive,
+							 in_burst,
+							 soft_rt);
+
+			if (old_wr_coeff != bfqq->wr_coeff)
+				bfqq->entity.prio_changed = 1;
+		}
+	}
+
+	bfqq->last_idle_bklogged = jiffies;
+	bfqq->service_from_backlogged = 0;
+	bfq_clear_bfqq_softrt_update(bfqq);
+
+	bfq_add_bfqq_busy(bfqd, bfqq);
+
+	/*
+	 * Expire in-service queue only if preemption may be needed
+	 * for guarantees. In this respect, the function
+	 * next_queue_may_preempt just checks a simple, necessary
+	 * condition, and not a sufficient condition based on
+	 * timestamps. In fact, for the latter condition to be
+	 * evaluated, timestamps would need first to be updated, and
+	 * this operation is quite costly (see the comments on the
+	 * function bfq_bfqq_update_budg_for_activation).
+	 */
+	if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
+	    bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
+	    next_queue_may_preempt(bfqd)) {
+		struct bfq_queue *in_serv =
+			bfqd->in_service_queue;
+		BUG_ON(in_serv == bfqq);
+
+		bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
+				false, BFQ_BFQQ_PREEMPTED);
+		BUG_ON(in_serv->entity.budget < 0);
+	}
+}
+
+static void bfq_add_request(struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_data *bfqd = bfqq->bfqd;
+	struct request *next_rq, *prev;
+	unsigned int old_wr_coeff = bfqq->wr_coeff;
+	bool interactive = false;
+
+	bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s",
+		     blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A");
+
+	if (bfqq->wr_coeff > 1) /* queue is being weight-raised */
+		bfq_log_bfqq(bfqd, bfqq,
+			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
+			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+			jiffies_to_msecs(bfqq->wr_cur_max_time),
+			bfqq->wr_coeff,
+			bfqq->entity.weight, bfqq->entity.orig_weight);
+
+	bfqq->queued[rq_is_sync(rq)]++;
+	bfqd->queued++;
+
+	elv_rb_add(&bfqq->sort_list, rq);
+
+	/*
+	 * Check if this request is a better next-to-serve candidate.
+	 */
+	prev = bfqq->next_rq;
+	next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
+	BUG_ON(!next_rq);
+	bfqq->next_rq = next_rq;
+
+	/*
+	 * Adjust priority tree position, if next_rq changes.
+	 */
+	if (prev != bfqq->next_rq)
+		bfq_pos_tree_add_move(bfqd, bfqq);
+
+	if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */
+		bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff,
+						 rq, &interactive);
+	else {
+		if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) &&
+		    time_is_before_jiffies(
+				bfqq->last_wr_start_finish +
+				bfqd->bfq_wr_min_inter_arr_async)) {
+			bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+			bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+
+			bfqd->wr_busy_queues++;
+			bfqq->entity.prio_changed = 1;
+			bfq_log_bfqq(bfqd, bfqq,
+				     "non-idle wrais starting, "
+				     "wr_max_time %u wr_busy %d",
+				     jiffies_to_msecs(bfqq->wr_cur_max_time),
+				     bfqd->wr_busy_queues);
+		}
+		if (prev != bfqq->next_rq)
+			bfq_updated_next_req(bfqd, bfqq);
+	}
+
+	/*
+	 * Assign jiffies to last_wr_start_finish in the following
+	 * cases:
+	 *
+	 * . if bfqq is not going to be weight-raised, because, for
+	 *   non weight-raised queues, last_wr_start_finish stores the
+	 *   arrival time of the last request; as of now, this piece
+	 *   of information is used only for deciding whether to
+	 *   weight-raise async queues
+	 *
+	 * . if bfqq is not weight-raised, because, if bfqq is now
+	 *   switching to weight-raised, then last_wr_start_finish
+	 *   stores the time when weight-raising starts
+	 *
+	 * . if bfqq is interactive, because, regardless of whether
+	 *   bfqq is currently weight-raised, the weight-raising
+	 *   period must start or restart (this case is considered
+	 *   separately because it is not detected by the above
+	 *   conditions, if bfqq is already weight-raised)
+	 *
+	 * last_wr_start_finish has to be updated also if bfqq is soft
+	 * real-time, because the weight-raising period is constantly
+	 * restarted on idle-to-busy transitions for these queues, but
+	 * this is already done in bfq_bfqq_handle_idle_busy_switch if
+	 * needed.
+	 */
+	if (bfqd->low_latency &&
+		(old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive))
+		bfqq->last_wr_start_finish = jiffies;
+}
+
+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd,
+					  struct bio *bio)
+{
+	struct task_struct *tsk = current;
+	struct bfq_io_cq *bic;
+	struct bfq_queue *bfqq;
+
+	bic = bfq_bic_lookup(bfqd, tsk->io_context);
+	if (!bic)
+		return NULL;
+
+	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
+	if (bfqq)
+		return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio));
+
+	return NULL;
+}
+
+static sector_t get_sdist(sector_t last_pos, struct request *rq)
+{
+	sector_t sdist = 0;
+
+	if (last_pos) {
+		if (last_pos < blk_rq_pos(rq))
+			sdist = blk_rq_pos(rq) - last_pos;
+		else
+			sdist = last_pos - blk_rq_pos(rq);
+	}
+
+	return sdist;
+}
+
+static void bfq_activate_request(struct request_queue *q, struct request *rq)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	bfqd->rq_in_driver++;
+}
+
+static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+
+	BUG_ON(bfqd->rq_in_driver == 0);
+	bfqd->rq_in_driver--;
+}
+
+static void bfq_remove_request(struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_data *bfqd = bfqq->bfqd;
+	const int sync = rq_is_sync(rq);
+
+	BUG_ON(bfqq->entity.service > bfqq->entity.budget &&
+	       bfqq == bfqd->in_service_queue);
+
+	if (bfqq->next_rq == rq) {
+		bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
+		bfq_updated_next_req(bfqd, bfqq);
+	}
+
+	if (rq->queuelist.prev != &rq->queuelist)
+		list_del_init(&rq->queuelist);
+	BUG_ON(bfqq->queued[sync] == 0);
+	bfqq->queued[sync]--;
+	bfqd->queued--;
+	elv_rb_del(&bfqq->sort_list, rq);
+
+	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+		BUG_ON(bfqq->entity.budget < 0);
+
+		if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
+			bfq_del_bfqq_busy(bfqd, bfqq, 1);
+
+			/* bfqq emptied. In normal operation, when
+			 * bfqq is empty, bfqq->entity.service and
+			 * bfqq->entity.budget must contain,
+			 * respectively, the service received and the
+			 * budget used last time bfqq emptied. These
+			 * facts do not hold in this case, as at least
+			 * this last removal occurred while bfqq is
+			 * not in service. To avoid inconsistencies,
+			 * reset both bfqq->entity.service and
+			 * bfqq->entity.budget.
+			 */
+			bfqq->entity.budget = bfqq->entity.service = 0;
+		}
+
+		/*
+		 * Remove queue from request-position tree as it is empty.
+		 */
+		if (bfqq->pos_root) {
+			rb_erase(&bfqq->pos_node, bfqq->pos_root);
+			bfqq->pos_root = NULL;
+		}
+	}
+
+	if (rq->cmd_flags & REQ_META) {
+		BUG_ON(bfqq->meta_pending == 0);
+		bfqq->meta_pending--;
+	}
+	bfqg_stats_update_io_remove(bfqq_group(bfqq), req_op(rq),
+				    rq->cmd_flags);
+}
+
+static int bfq_merge(struct request_queue *q, struct request **req,
+		     struct bio *bio)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct request *__rq;
+
+	__rq = bfq_find_rq_fmerge(bfqd, bio);
+	if (__rq && elv_bio_merge_ok(__rq, bio)) {
+		*req = __rq;
+		return ELEVATOR_FRONT_MERGE;
+	}
+
+	return ELEVATOR_NO_MERGE;
+}
+
+static void bfq_merged_request(struct request_queue *q, struct request *req,
+			       int type)
+{
+	if (type == ELEVATOR_FRONT_MERGE &&
+	    rb_prev(&req->rb_node) &&
+	    blk_rq_pos(req) <
+	    blk_rq_pos(container_of(rb_prev(&req->rb_node),
+				    struct request, rb_node))) {
+		struct bfq_queue *bfqq = RQ_BFQQ(req);
+		struct bfq_data *bfqd = bfqq->bfqd;
+		struct request *prev, *next_rq;
+
+		/* Reposition request in its sort_list */
+		elv_rb_del(&bfqq->sort_list, req);
+		elv_rb_add(&bfqq->sort_list, req);
+		/* Choose next request to be served for bfqq */
+		prev = bfqq->next_rq;
+		next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
+					 bfqd->last_position);
+		BUG_ON(!next_rq);
+		bfqq->next_rq = next_rq;
+		/*
+		 * If next_rq changes, update both the queue's budget to
+		 * fit the new request and the queue's position in its
+		 * rq_pos_tree.
+		 */
+		if (prev != bfqq->next_rq) {
+			bfq_updated_next_req(bfqd, bfqq);
+			bfq_pos_tree_add_move(bfqd, bfqq);
+		}
+	}
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static void bfq_bio_merged(struct request_queue *q, struct request *req,
+			   struct bio *bio)
+{
+	bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio_op(bio),
+				    bio->bi_opf);
+}
+#endif
+
+static void bfq_merged_requests(struct request_queue *q, struct request *rq,
+				struct request *next)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);
+
+	/*
+	 * If next and rq belong to the same bfq_queue and next is older
+	 * than rq, then reposition rq in the fifo (by substituting next
+	 * with rq). Otherwise, if next and rq belong to different
+	 * bfq_queues, never reposition rq: in fact, we would have to
+	 * reposition it with respect to next's position in its own fifo,
+	 * which would most certainly be too expensive with respect to
+	 * the benefits.
+	 */
+	if (bfqq == next_bfqq &&
+	    !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
+	    next->fifo_time < rq->fifo_time) {
+		list_del_init(&rq->queuelist);
+		list_replace_init(&next->queuelist, &rq->queuelist);
+		rq->fifo_time = next->fifo_time;
+	}
+
+	if (bfqq->next_rq == next)
+		bfqq->next_rq = rq;
+
+	bfq_remove_request(next);
+	bfqg_stats_update_io_merged(bfqq_group(bfqq), req_op(next),
+				    next->cmd_flags);
+}
+
+/* Must be called with bfqq != NULL */
+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
+{
+	BUG_ON(!bfqq);
+
+	if (bfq_bfqq_busy(bfqq))
+		bfqq->bfqd->wr_busy_queues--;
+	bfqq->wr_coeff = 1;
+	bfqq->wr_cur_max_time = 0;
+	bfqq->last_wr_start_finish = jiffies;
+	/*
+	 * Trigger a weight change on the next invocation of
+	 * __bfq_entity_update_weight_prio.
+	 */
+	bfqq->entity.prio_changed = 1;
+	bfq_log_bfqq(bfqq->bfqd, bfqq,
+		     "end_wr: wrais ending at %lu, rais_max_time %u",
+		     bfqq->last_wr_start_finish,
+		     jiffies_to_msecs(bfqq->wr_cur_max_time));
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d",
+		     bfqq->bfqd->wr_busy_queues);
+}
+
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+				    struct bfq_group *bfqg)
+{
+	int i, j;
+
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < IOPRIO_BE_NR; j++)
+			if (bfqg->async_bfqq[i][j])
+				bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
+	if (bfqg->async_idle_bfqq)
+		bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
+}
+
+static void bfq_end_wr(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq;
+
+	spin_lock_irq(bfqd->queue->queue_lock);
+
+	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
+		bfq_bfqq_end_wr(bfqq);
+	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
+		bfq_bfqq_end_wr(bfqq);
+	bfq_end_wr_async(bfqd);
+
+	spin_unlock_irq(bfqd->queue->queue_lock);
+}
+
+static sector_t bfq_io_struct_pos(void *io_struct, bool request)
+{
+	if (request)
+		return blk_rq_pos(io_struct);
+	else
+		return ((struct bio *)io_struct)->bi_iter.bi_sector;
+}
+
+static int bfq_rq_close_to_sector(void *io_struct, bool request,
+				  sector_t sector)
+{
+	return abs(bfq_io_struct_pos(io_struct, request) - sector) <=
+	       BFQQ_CLOSE_THR;
+}
+
+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
+					 struct bfq_queue *bfqq,
+					 sector_t sector)
+{
+	struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+	struct rb_node *parent, *node;
+	struct bfq_queue *__bfqq;
+
+	if (RB_EMPTY_ROOT(root))
+		return NULL;
+
+	/*
+	 * First, if we find a request starting at the end of the last
+	 * request, choose it.
+	 */
+	__bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL);
+	if (__bfqq)
+		return __bfqq;
+
+	/*
+	 * If the exact sector wasn't found, the parent of the NULL leaf
+	 * will contain the closest sector (rq_pos_tree sorted by
+	 * next_request position).
+	 */
+	__bfqq = rb_entry(parent, struct bfq_queue, pos_node);
+	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
+		return __bfqq;
+
+	if (blk_rq_pos(__bfqq->next_rq) < sector)
+		node = rb_next(&__bfqq->pos_node);
+	else
+		node = rb_prev(&__bfqq->pos_node);
+	if (!node)
+		return NULL;
+
+	__bfqq = rb_entry(node, struct bfq_queue, pos_node);
+	if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector))
+		return __bfqq;
+
+	return NULL;
+}
+
+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd,
+						   struct bfq_queue *cur_bfqq,
+						   sector_t sector)
+{
+	struct bfq_queue *bfqq;
+
+	/*
+	 * We shall notice if some of the queues are cooperating,
+	 * e.g., working closely on the same area of the device. In
+	 * that case, we can group them together and: 1) don't waste
+	 * time idling, and 2) serve the union of their requests in
+	 * the best possible order for throughput.
+	 */
+	bfqq = bfqq_find_close(bfqd, cur_bfqq, sector);
+	if (!bfqq || bfqq == cur_bfqq)
+		return NULL;
+
+	return bfqq;
+}
+
+static struct bfq_queue *
+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+	int process_refs, new_process_refs;
+	struct bfq_queue *__bfqq;
+
+	/*
+	 * If there are no process references on the new_bfqq, then it is
+	 * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain
+	 * may have dropped their last reference (not just their last process
+	 * reference).
+	 */
+	if (!bfqq_process_refs(new_bfqq))
+		return NULL;
+
+	/* Avoid a circular list and skip interim queue merges. */
+	while ((__bfqq = new_bfqq->new_bfqq)) {
+		if (__bfqq == bfqq)
+			return NULL;
+		new_bfqq = __bfqq;
+	}
+
+	process_refs = bfqq_process_refs(bfqq);
+	new_process_refs = bfqq_process_refs(new_bfqq);
+	/*
+	 * If the process for the bfqq has gone away, there is no
+	 * sense in merging the queues.
+	 */
+	if (process_refs == 0 || new_process_refs == 0)
+		return NULL;
+
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
+		new_bfqq->pid);
+
+	/*
+	 * Merging is just a redirection: the requests of the process
+	 * owning one of the two queues are redirected to the other queue.
+	 * The latter queue, in its turn, is set as shared if this is the
+	 * first time that the requests of some process are redirected to
+	 * it.
+	 *
+	 * We redirect bfqq to new_bfqq and not the opposite, because we
+	 * are in the context of the process owning bfqq, hence we have
+	 * the io_cq of this process. So we can immediately configure this
+	 * io_cq to redirect the requests of the process to new_bfqq.
+	 *
+	 * NOTE, even if new_bfqq coincides with the in-service queue, the
+	 * io_cq of new_bfqq is not available, because, if the in-service
+	 * queue is shared, bfqd->in_service_bic may not point to the
+	 * io_cq of the in-service queue.
+	 * Redirecting the requests of the process owning bfqq to the
+	 * currently in-service queue is in any case the best option, as
+	 * we feed the in-service queue with new requests close to the
+	 * last request served and, by doing so, hopefully increase the
+	 * throughput.
+	 */
+	bfqq->new_bfqq = new_bfqq;
+	new_bfqq->ref += process_refs;
+	return new_bfqq;
+}
+
+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
+					struct bfq_queue *new_bfqq)
+{
+	if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) ||
+	    (bfqq->ioprio_class != new_bfqq->ioprio_class))
+		return false;
+
+	/*
+	 * If either of the queues has already been detected as seeky,
+	 * then merging it with the other queue is unlikely to lead to
+	 * sequential I/O.
+	 */
+	if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq))
+		return false;
+
+	/*
+	 * Interleaved I/O is known to be done by (some) applications
+	 * only for reads, so it does not make sense to merge async
+	 * queues.
+	 */
+	if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq))
+		return false;
+
+	return true;
+}
+
+/*
+ * If this function returns true, then bfqq cannot be merged. The idea
+ * is that true cooperation happens very early after processes start
+ * to do I/O. Usually, late cooperations are just accidental false
+ * positives. In case bfqq is weight-raised, such false positives
+ * would evidently degrade latency guarantees for bfqq.
+ */
+static bool wr_from_too_long(struct bfq_queue *bfqq)
+{
+	return bfqq->wr_coeff > 1 &&
+		time_is_before_jiffies(bfqq->last_wr_start_finish +
+				       msecs_to_jiffies(100));
+}
+
+/*
+ * Attempt to schedule a merge of bfqq with the currently in-service
+ * queue or with a close queue among the scheduled queues.  Return
+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue
+ * structure otherwise.
+ *
+ * The OOM queue is not allowed to participate in cooperation: in fact, since
+ * the requests temporarily redirected to the OOM queue could be redirected
+ * again to dedicated queues at any time, the state needed to correctly
+ * handle merging with the OOM queue would be quite complex and expensive
+ * to maintain. Besides, in such a critical condition as an out of memory,
+ * the benefits of queue merging may be of little relevance, or even negligible.
+ *
+ * Weight-raised queues can be merged only if their weight-raising
+ * period has just started. In fact cooperating processes are usually
+ * started together. Thus, with this filter we avoid false positives
+ * that would jeopardize low-latency guarantees.
+ *
+ * WARNING: queue merging may impair fairness among non-weight raised
+ * queues, for at least two reasons: 1) the original weight of a
+ * merged queue may change during the merged state, 2) even if the
+ * weight stays the same, a merged queue may be bloated with many more
+ * requests than the ones produced by its originally-associated
+ * process.
+ */
+static struct bfq_queue *
+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		     void *io_struct, bool request)
+{
+	struct bfq_queue *in_service_bfqq, *new_bfqq;
+
+	if (bfqq->new_bfqq)
+		return bfqq->new_bfqq;
+
+	if (io_struct && wr_from_too_long(bfqq) &&
+	    likely(bfqq != &bfqd->oom_bfqq))
+		bfq_log_bfqq(bfqd, bfqq,
+			     "would have looked for coop, but bfq%d wr",
+			bfqq->pid);
+
+	if (!io_struct ||
+	    wr_from_too_long(bfqq) ||
+	    unlikely(bfqq == &bfqd->oom_bfqq))
+		return NULL;
+
+	/* If there is only one backlogged queue, don't search. */
+	if (bfqd->busy_queues == 1)
+		return NULL;
+
+	in_service_bfqq = bfqd->in_service_queue;
+
+	if (in_service_bfqq && in_service_bfqq != bfqq &&
+	    bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) &&
+	    likely(in_service_bfqq != &bfqd->oom_bfqq))
+		bfq_log_bfqq(bfqd, bfqq,
+		"would have tried merge with in-service-queue, but wr");
+
+	if (!in_service_bfqq || in_service_bfqq == bfqq ||
+	    !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) ||
+	    unlikely(in_service_bfqq == &bfqd->oom_bfqq))
+		goto check_scheduled;
+
+	if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) &&
+	    bfqq->entity.parent == in_service_bfqq->entity.parent &&
+	    bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) {
+		new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq);
+		if (new_bfqq)
+			return new_bfqq;
+	}
+	/*
+	 * Check whether there is a cooperator among currently scheduled
+	 * queues. The only thing we need is that the bio/request is not
+	 * NULL, as we need it to establish whether a cooperator exists.
+	 */
+check_scheduled:
+	new_bfqq = bfq_find_close_cooperator(bfqd, bfqq,
+			bfq_io_struct_pos(io_struct, request));
+
+	BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent);
+
+	if (new_bfqq && wr_from_too_long(new_bfqq) &&
+	    likely(new_bfqq != &bfqd->oom_bfqq) &&
+	    bfq_may_be_close_cooperator(bfqq, new_bfqq))
+		bfq_log_bfqq(bfqd, bfqq,
+			     "would have merged with bfq%d, but wr",
+			     new_bfqq->pid);
+
+	if (new_bfqq && !wr_from_too_long(new_bfqq) &&
+	    likely(new_bfqq != &bfqd->oom_bfqq) &&
+	    bfq_may_be_close_cooperator(bfqq, new_bfqq))
+		return bfq_setup_merge(bfqq, new_bfqq);
+
+	return NULL;
+}
+
+static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
+{
+	struct bfq_io_cq *bic = bfqq->bic;
+
+	/*
+	 * If !bfqq->bic, the queue is already shared or its requests
+	 * have already been redirected to a shared queue; both idle window
+	 * and weight raising state have already been saved. Do nothing.
+	 */
+	if (!bic)
+		return;
+
+	bic->saved_idle_window = bfq_bfqq_idle_window(bfqq);
+	bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+	bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+	bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+	bic->saved_wr_coeff = bfqq->wr_coeff;
+	bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
+	bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
+	BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish));
+}
+
+static void bfq_get_bic_reference(struct bfq_queue *bfqq)
+{
+	/*
+	 * If bfqq->bic has a non-NULL value, the bic to which it belongs
+	 * is about to begin using a shared bfq_queue.
+	 */
+	if (bfqq->bic)
+		atomic_long_inc(&bfqq->bic->icq.ioc->refcount);
+}
+
+static void
+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
+		struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
+{
+	bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu",
+		     (unsigned long) new_bfqq->pid);
+	/* Save weight raising and idle window of the merged queues */
+	bfq_bfqq_save_state(bfqq);
+	bfq_bfqq_save_state(new_bfqq);
+	if (bfq_bfqq_IO_bound(bfqq))
+		bfq_mark_bfqq_IO_bound(new_bfqq);
+	bfq_clear_bfqq_IO_bound(bfqq);
+
+	/*
+	 * If bfqq is weight-raised, then let new_bfqq inherit
+	 * weight-raising. To reduce false positives, neglect the case
+	 * where bfqq has just been created, but has not yet made it
+	 * to be weight-raised (which may happen because EQM may merge
+	 * bfqq even before bfq_add_request is executed for the first
+	 * time for bfqq). Handling this case would however be very
+	 * easy, thanks to the flag just_created.
+	 */
+	if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) {
+		new_bfqq->wr_coeff = bfqq->wr_coeff;
+		new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time;
+		new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish;
+		new_bfqq->wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt;
+		if (bfq_bfqq_busy(new_bfqq))
+			bfqd->wr_busy_queues++;
+		new_bfqq->entity.prio_changed = 1;
+		bfq_log_bfqq(bfqd, new_bfqq,
+			     "wr start after merge with %d, rais_max_time %u",
+			     bfqq->pid,
+			     jiffies_to_msecs(bfqq->wr_cur_max_time));
+	}
+
+	if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */
+		bfqq->wr_coeff = 1;
+		bfqq->entity.prio_changed = 1;
+		if (bfq_bfqq_busy(bfqq))
+			bfqd->wr_busy_queues--;
+	}
+
+	bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d",
+		     bfqd->wr_busy_queues);
+
+	/*
+	 * Grab a reference to the bic, to prevent it from being destroyed
+	 * before being possibly touched by a bfq_split_bfqq().
+	 */
+	bfq_get_bic_reference(bfqq);
+	bfq_get_bic_reference(new_bfqq);
+	/*
+	 * Merge queues (that is, let bic redirect its requests to new_bfqq)
+	 */
+	bic_set_bfqq(bic, new_bfqq, 1);
+	bfq_mark_bfqq_coop(new_bfqq);
+	/*
+	 * new_bfqq now belongs to at least two bics (it is a shared queue):
+	 * set new_bfqq->bic to NULL. bfqq either:
+	 * - does not belong to any bic any more, and hence bfqq->bic must
+	 *   be set to NULL, or
+	 * - is a queue whose owning bics have already been redirected to a
+	 *   different queue, hence the queue is destined to not belong to
+	 *   any bic soon and bfqq->bic is already NULL (therefore the next
+	 *   assignment causes no harm).
+	 */
+	new_bfqq->bic = NULL;
+	bfqq->bic = NULL;
+	bfq_put_queue(bfqq);
+}
+
+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
+			       struct bio *bio)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_io_cq *bic;
+	struct bfq_queue *bfqq, *new_bfqq;
+
+	/*
+	 * Disallow merge of a sync bio into an async request.
+	 */
+	if (bfq_bio_sync(bio) && !rq_is_sync(rq))
+		return false;
+
+	/*
+	 * Lookup the bfqq that this bio will be queued with. Allow
+	 * merge only if rq is queued there.
+	 * Queue lock is held here.
+	 */
+	bic = bfq_bic_lookup(bfqd, current->io_context);
+	if (!bic)
+		return false;
+
+	bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio));
+	/*
+	 * We take advantage of this function to perform an early merge
+	 * of the queues of possible cooperating processes.
+	 */
+	if (bfqq) {
+		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
+		if (new_bfqq) {
+			bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
+			/*
+			 * If we get here, the bio will be queued in the
+			 * shared queue, i.e., new_bfqq, so use new_bfqq
+			 * to decide whether bio and rq can be merged.
+			 */
+			bfqq = new_bfqq;
+		}
+	}
+
+	return bfqq == RQ_BFQQ(rq);
+}
+
+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq,
+			      struct request *next)
+{
+	return RQ_BFQQ(rq) == RQ_BFQQ(next);
+}
+
+/*
+ * Set the maximum time for the in-service queue to consume its
+ * budget. This prevents seeky processes from lowering the throughput.
+ * In practice, a time-slice service scheme is used with seeky
+ * processes.
+ */
+static void bfq_set_budget_timeout(struct bfq_data *bfqd,
+				   struct bfq_queue *bfqq)
+{
+	unsigned int timeout_coeff;
+
+	if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time)
+		timeout_coeff = 1;
+	else
+		timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
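+
+	/*
+	 * For example, since entity.weight equals orig_weight times
+	 * wr_coeff for a weight-raised queue, an interactive queue
+	 * raised with, e.g., wr_coeff = 30 gets a budget timeout 30
+	 * times as long as the base bfq_timeout, whereas a queue in the
+	 * soft real-time weight-raising period keeps the base timeout.
+	 */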
+
+	bfqd->last_budget_start = ktime_get();
+
+	bfqq->budget_timeout = jiffies +
+		bfqd->bfq_timeout * timeout_coeff;
+
+	bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u",
+		jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff));
+}
+
+static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
+				       struct bfq_queue *bfqq)
+{
+	if (bfqq) {
+		bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
+		bfq_mark_bfqq_must_alloc(bfqq);
+		bfq_clear_bfqq_fifo_expire(bfqq);
+
+		bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
+
+		BUG_ON(bfqq == bfqd->in_service_queue);
+		BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
+
+		if (time_is_before_jiffies(bfqq->last_wr_start_finish) &&
+		    bfqq->wr_coeff > 1 &&
+		    bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+		    time_is_before_jiffies(bfqq->budget_timeout)) {
+			/*
+			 * For soft real-time queues, move the start
+			 * of the weight-raising period forward by the
+			 * time the queue has not received any
+			 * service. Otherwise, a relatively long
+			 * service delay is likely to cause the
+			 * weight-raising period of the queue to end,
+			 * because of the short duration of the
+			 * weight-raising period of a soft real-time
+			 * queue.  It is worth noting that this move
+			 * is not so dangerous for the other queues,
+			 * because soft real-time queues are not
+			 * greedy.
+			 *
+			 * To not add a further variable, we use the
+			 * overloaded field budget_timeout to
+			 * determine for how long the queue has not
+			 * received service, i.e., how much time has
+			 * elapsed since the queue expired. However,
+			 * this is a little imprecise, because
+			 * budget_timeout is set to jiffies if bfqq
+			 * not only expires, but also remains with no
+			 * request.
+			 */
+			if (time_after(bfqq->budget_timeout,
+				       bfqq->last_wr_start_finish))
+				bfqq->last_wr_start_finish +=
+					jiffies - bfqq->budget_timeout;
+			else
+				bfqq->last_wr_start_finish = jiffies;
+
+			if (time_is_after_jiffies(bfqq->last_wr_start_finish)) {
+			       pr_crit(
+			       "BFQ WARNING:last %lu budget %lu jiffies %lu",
+			       bfqq->last_wr_start_finish,
+			       bfqq->budget_timeout,
+			       jiffies);
+			       pr_crit("diff %lu", jiffies -
+				       max_t(unsigned long,
+					     bfqq->last_wr_start_finish,
+					     bfqq->budget_timeout));
+			       bfqq->last_wr_start_finish = jiffies;
+			}
+		}
+
+		bfq_set_budget_timeout(bfqd, bfqq);
+		bfq_log_bfqq(bfqd, bfqq,
+			     "set_in_service_queue, cur-budget = %d",
+			     bfqq->entity.budget);
+	} else
+		bfq_log(bfqd, "set_in_service_queue: NULL");
+
+	bfqd->in_service_queue = bfqq;
+}
+
+/*
+ * Get and set a new queue for service.
+ */
+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq = bfq_get_next_queue(bfqd);
+
+	__bfq_set_in_service_queue(bfqd, bfqq);
+	return bfqq;
+}
+
+static void bfq_arm_slice_timer(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq = bfqd->in_service_queue;
+	struct bfq_io_cq *bic;
+	u32 sl;
+
+	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
+
+	/* Processes have exited, don't wait. */
+	bic = bfqd->in_service_bic;
+	if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0)
+		return;
+
+	bfq_mark_bfqq_wait_request(bfqq);
+
+	/*
+	 * We don't want to idle for seeks, but we do want to allow
+	 * fair distribution of slice time for a process doing back-to-back
+	 * seeks. So allow a little bit of time for it to submit a new rq.
+	 *
+	 * To prevent processes with (partly) seeky workloads from
+	 * being too ill-treated, grant them a small fraction of the
+	 * assigned budget before reducing the waiting time to
+	 * BFQ_MIN_TT. In practice, this proved to help reduce latency.
+	 */
+	sl = bfqd->bfq_slice_idle;
+	/*
+	 * Unless the queue is being weight-raised or the scenario is
+	 * asymmetric, grant only minimum idle time if the queue
+	 * is seeky. A long idling is preserved for a weight-raised
+	 * queue, or, more generally, in an asymmetric scenario,
+	 * because a long idling is needed for guaranteeing to a queue
+	 * its reserved share of the throughput (in particular, it is
+	 * needed if the queue has a higher weight than some other
+	 * queue).
+	 */
+	if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 &&
+	    bfq_symmetric_scenario(bfqd))
+		sl = min_t(u32, sl, BFQ_MIN_TT);
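+
+	/*
+	 * For instance (values purely for illustration): with a
+	 * slice_idle of 8 ms and a BFQ_MIN_TT of 2 ms, a seeky,
+	 * non-weight-raised queue in a symmetric scenario is idled for
+	 * at most 2 ms instead of 8 ms.
+	 */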
+
+	bfqd->last_idling_start = ktime_get();
+	hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
+		      HRTIMER_MODE_REL);
+	bfqg_stats_set_start_idle_time(bfqq_group(bfqq));
+	bfq_log(bfqd, "arm idle: %ld/%ld ms",
+		sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC);
+}
+
+/*
+ * In autotuning mode, max_budget is dynamically recomputed as the
+ * number of sectors that can be transferred in bfq_timeout at the
+ * estimated peak rate. This enables BFQ to utilize a full timeslice
+ * with a full
+ * budget, even if the in-service queue is served at peak rate. And
+ * this maximises throughput with sequential workloads.
+ */
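+/*
+ * Illustrative example (figures made up): with an estimated peak rate
+ * of about 200000 sectors/sec (~100 MB/s) and a bfq_timeout of 125 ms,
+ * the resulting max_budget is about 25000 sectors (~12 MB).
+ */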
+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd)
+{
+	return (u64)bfqd->peak_rate * USEC_PER_MSEC *
+		jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT;
+}
+
+/*
+ * Update parameters related to throughput and responsiveness, as a
+ * function of the estimated peak rate. See comments on
+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays.
+ */
+void update_thr_responsiveness_params(struct bfq_data *bfqd)
+{
+	int dev_type = blk_queue_nonrot(bfqd->queue);
+
+	if (bfqd->bfq_user_max_budget == 0) {
+		bfqd->bfq_max_budget =
+			bfq_calc_max_budget(bfqd);
+		BUG_ON(bfqd->bfq_max_budget < 0);
+		bfq_log(bfqd, "new max_budget = %d",
+			bfqd->bfq_max_budget);
+	}
+
+	if (bfqd->device_speed == BFQ_BFQD_FAST &&
+	    bfqd->peak_rate < device_speed_thresh[dev_type]) {
+		bfqd->device_speed = BFQ_BFQD_SLOW;
+		bfqd->RT_prod = R_slow[dev_type] *
+			T_slow[dev_type];
+	} else if (bfqd->device_speed == BFQ_BFQD_SLOW &&
+		   bfqd->peak_rate > device_speed_thresh[dev_type]) {
+		bfqd->device_speed = BFQ_BFQD_FAST;
+		bfqd->RT_prod = R_fast[dev_type] *
+			T_fast[dev_type];
+	}
+
+	bfq_log(bfqd,
+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec",
+		dev_type == 0 ? "ROT" : "NONROT",
+		bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW",
+		bfqd->device_speed == BFQ_BFQD_FAST ?
+		(USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT :
+		(USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT,
+		(USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>>
+		BFQ_RATE_SHIFT);
+}
+
+void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq)
+{
+	if (rq != NULL) { /* new rq dispatch now, reset accordingly */
+		bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
+		bfqd->peak_rate_samples = 1;
+		bfqd->sequential_samples = 0;
+		bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
+			blk_rq_sectors(rq);
+	} else /* no new rq dispatched, just reset the number of samples */
+		bfqd->peak_rate_samples = 0; /* full re-init on next disp. */
+
+	bfq_log(bfqd,
+		"reset_rate_computation at end, sample %u/%u tot_sects %llu",
+		bfqd->peak_rate_samples, bfqd->sequential_samples,
+		bfqd->tot_sectors_dispatched);
+}
+
+void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq)
+{
+	u32 rate, weight, divisor;
+
+	/*
+	 * For the convergence property to hold (see comments on
+	 * bfq_update_peak_rate()) and for the assessment to be
+	 * reliable, a minimum number of samples must be present, and
+	 * a minimum amount of time must have elapsed. If not so, do
+	 * not compute new rate. Just reset parameters, to get ready
+	 * for a new evaluation attempt.
+	 */
+	if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES ||
+	    bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) {
+		bfq_log(bfqd,
+	"update_rate_reset: only resetting, delta_first %lluus samples %d",
+			bfqd->delta_from_first>>10, bfqd->peak_rate_samples);
+		goto reset_computation;
+	}
+
+	/*
+	 * If a new request completion has occurred after last
+	 * dispatch, then, to approximate the rate at which requests
+	 * have been served by the device, it is more precise to
+	 * extend the observation interval to the last completion.
+	 */
+	bfqd->delta_from_first =
+		max_t(u64, bfqd->delta_from_first,
+		      bfqd->last_completion - bfqd->first_dispatch);
+
+	BUG_ON(bfqd->delta_from_first == 0);
+	/*
+	 * Rate computed in sects/usec, and not sects/nsec, for
+	 * precision issues.
+	 */
+	rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT,
+			div_u64(bfqd->delta_from_first, NSEC_PER_USEC));
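+	/*
+	 * rate is thus expressed in the same fixed-point unit as
+	 * bfqd->peak_rate: (sectors/usec) << BFQ_RATE_SHIFT.
+	 */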
+
+	bfq_log(bfqd,
+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)",
+		bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10,
+		((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+		rate > 20<<BFQ_RATE_SHIFT);
+
+	/*
+	 * Peak rate not updated if:
+	 * - the percentage of sequential dispatches is below 3/4 of the
+	 *   total, and rate is below the current estimated peak rate
+	 * - rate is unreasonably high (> 20M sectors/sec)
+	 */
+	if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 &&
+	     rate <= bfqd->peak_rate) ||
+		rate > 20<<BFQ_RATE_SHIFT) {
+		bfq_log(bfqd,
+		"update_rate_reset: goto reset, samples %u/%u rate/peak %llu/%llu",
+		bfqd->peak_rate_samples, bfqd->sequential_samples,
+		((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+		goto reset_computation;
+	} else {
+		bfq_log(bfqd,
+		"update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu",
+		bfqd->peak_rate_samples, bfqd->sequential_samples,
+		((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT),
+		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+	}
+
+	/*
+	 * We have to update the peak rate, at last! To this purpose,
+	 * we use a low-pass filter. We compute the smoothing constant
+	 * of the filter as a function of the 'weight' of the new
+	 * measured rate.
+	 *
+	 * As can be seen in the next formulas, we define this weight as a
+	 * quantity proportional to how sequential the workload is,
+	 * and to how long the observation time interval is.
+	 *
+	 * The weight runs from 0 to 8. The maximum value of the
+	 * weight, 8, yields the minimum value for the smoothing
+	 * constant. At this minimum value for the smoothing constant,
+	 * the measured rate contributes for half of the next value of
+	 * the estimated peak rate.
+	 *
+	 * So, the first step is to compute the weight as a function
+	 * of how sequential the workload is. Note that the weight
+	 * cannot reach 9, because bfqd->sequential_samples cannot
+	 * become equal to bfqd->peak_rate_samples, which, in its
+	 * turn, holds true because bfqd->sequential_samples is not
+	 * incremented for the first sample.
+	 */
+	weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples;
+
+	/*
+	 * Second step: further refine the weight as a function of the
+	 * duration of the observation interval.
+	 */
+	weight = min_t(u32, 8,
+		       div_u64(weight * bfqd->delta_from_first,
+			       BFQ_RATE_REF_INTERVAL));
+
+	/*
+	 * Divisor ranging from 10, for minimum weight, to 2, for
+	 * maximum weight.
+	 */
+	divisor = 10 - weight;
+	BUG_ON(divisor == 0);
+
+	/*
+	 * Finally, update peak rate:
+	 *
+	 * peak_rate = peak_rate * (divisor-1) / divisor  +  rate / divisor
+	 */
+	bfqd->peak_rate *= divisor-1;
+	bfqd->peak_rate /= divisor;
+	rate /= divisor; /* smoothing constant alpha = 1/divisor */
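+
+	/*
+	 * For instance, a mostly sequential workload (at least 8/9 of
+	 * the samples sequential) observed over a whole reference
+	 * interval yields weight = 8, hence divisor = 2 and alpha =
+	 * 1/2: the new estimate is then the average of the previous
+	 * estimate and the new sample.
+	 */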
+
+	bfq_log(bfqd,
+		"update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u",
+		divisor,
+		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT),
+		(u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT));
+
+	BUG_ON(bfqd->peak_rate == 0);
+	BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
+
+	bfqd->peak_rate += rate;
+	update_thr_responsiveness_params(bfqd);
+	BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT);
+
+reset_computation:
+	bfq_reset_rate_computation(bfqd, rq);
+}
+
+/*
+ * Update the read/write peak rate (the main quantity used for
+ * auto-tuning, see update_thr_responsiveness_params()).
+ *
+ * It is not trivial to estimate the peak rate (correctly): because of
+ * the presence of sw and hw queues between the scheduler and the
+ * device components that finally serve I/O requests, it is hard to
+ * say exactly when a given dispatched request is served inside the
+ * device, and for how long. As a consequence, it is hard to know
+ * precisely at what rate a given set of requests is actually served
+ * by the device.
+ *
+ * On the opposite end, the dispatch time of any request is trivially
+ * available, and, from this piece of information, the "dispatch rate"
+ * of requests can be immediately computed. So, the idea in the next
+ * function is to use what is known, namely request dispatch times
+ * (plus, when useful, request completion times), to estimate what is
+ * unknown, namely in-device request service rate.
+ *
+ * The main issue is that, because of the above facts, the rate at
+ * which a certain set of requests is dispatched over a certain time
+ * interval can vary greatly with respect to the rate at which the
+ * same requests are then served. But, since the size of any
+ * intermediate queue is limited, and the service scheme is lossless
+ * (no request is silently dropped), the following obvious convergence
+ * property holds: the number of requests dispatched MUST become
+ * closer and closer to the number of requests completed as the
+ * observation interval grows. This is the key property used in
+ * the next function to estimate the peak service rate as a function
+ * of the observed dispatch rate. The function is assumed to be invoked
+ * on every request dispatch.
+ */
+void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
+{
+	u64 now_ns = ktime_get_ns();
+
+	if (bfqd->peak_rate_samples == 0) { /* first dispatch */
+		bfq_log(bfqd,
+		"update_peak_rate: goto reset, samples %d",
+				bfqd->peak_rate_samples);
+		bfq_reset_rate_computation(bfqd, rq);
+		goto update_last_values; /* will add one sample */
+	}
+
+	/*
+	 * Device idle for very long: the observation interval lasting
+	 * up to this dispatch cannot be a valid observation interval
+	 * for computing a new peak rate (similarly to the late-
+	 * completion event in bfq_completed_request()). Go to
+	 * update_rate_and_reset to have the following three steps
+	 * taken:
+	 * - close the observation interval at the last (previous)
+	 *   request dispatch or completion
+	 * - compute rate, if possible, for that observation interval
+	 * - start a new observation interval with this dispatch
+	 */
+	if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
+	    bfqd->rq_in_driver == 0) {
+		bfq_log(bfqd,
+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d",
+			(now_ns - bfqd->last_dispatch)>>10,
+			bfqd->peak_rate_samples);
+		goto update_rate_and_reset;
+	}
+
+	/* Update sampling information */
+	bfqd->peak_rate_samples++;
+
+	if ((bfqd->rq_in_driver > 0 ||
+		now_ns - bfqd->last_completion < BFQ_MIN_TT)
+	     && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR)
+		bfqd->sequential_samples++;
+
+	bfqd->tot_sectors_dispatched += blk_rq_sectors(rq);
+
+	/* Reset max observed rq size every 32 dispatches */
+	if (likely(bfqd->peak_rate_samples % 32))
+		bfqd->last_rq_max_size =
+			max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size);
+	else
+		bfqd->last_rq_max_size = blk_rq_sectors(rq);
+
+	bfqd->delta_from_first = now_ns - bfqd->first_dispatch;
+
+	bfq_log(bfqd,
+	"update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus",
+		bfqd->peak_rate_samples, bfqd->sequential_samples,
+		bfqd->tot_sectors_dispatched,
+		bfqd->delta_from_first>>10);
+
+	/* Target observation interval not yet reached, go on sampling */
+	if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL)
+		goto update_last_values;
+
+update_rate_and_reset:
+	bfq_update_rate_reset(bfqd, rq);
+update_last_values:
+	bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
+	bfqd->last_dispatch = now_ns;
+
+	bfq_log(bfqd,
+	"update_peak_rate: delta_first %lluus last_pos %llu peak_rate %llu",
+		(now_ns - bfqd->first_dispatch)>>10,
+		(unsigned long long) bfqd->last_position,
+		((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT));
+	bfq_log(bfqd,
+	"update_peak_rate: samples at end %d", bfqd->peak_rate_samples);
+}
+
+/*
+ * Move request from internal lists to the dispatch list of the request queue
+ */
+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+
+	/*
+	 * For consistency, the next instruction should have been executed
+	 * after removing the request from the queue and dispatching it.
+	 * We execute instead this instruction before bfq_remove_request()
+	 * (and hence introduce a temporary inconsistency), for efficiency.
+	 * In fact, in a forced_dispatch, this prevents two counters related
+	 * to bfqq->dispatched from being uselessly decremented if bfqq
+	 * is not in service, and then incremented again after
+	 * incrementing bfqq->dispatched.
+	 */
+	bfqq->dispatched++;
+	bfq_update_peak_rate(q->elevator->elevator_data, rq);
+
+	bfq_remove_request(rq);
+	elv_dispatch_sort(q, rq);
+}
+
+/*
+ * Return expired entry, or NULL to just start from scratch in rbtree.
+ */
+static struct request *bfq_check_fifo(struct bfq_queue *bfqq)
+{
+	struct request *rq = NULL;
+
+	if (bfq_bfqq_fifo_expire(bfqq))
+		return NULL;
+
+	bfq_mark_bfqq_fifo_expire(bfqq);
+
+	if (list_empty(&bfqq->fifo))
+		return NULL;
+
+	rq = rq_entry_fifo(bfqq->fifo.next);
+
+	if (ktime_get_ns() < rq->fifo_time)
+		return NULL;
+
+	return rq;
+}
+
+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	BUG_ON(bfqq != bfqd->in_service_queue);
+
+	__bfq_bfqd_reset_in_service(bfqd);
+
+	/*
+	 * If this bfqq is shared between multiple processes, check
+	 * to make sure that those processes are still issuing I/Os
+	 * within the mean seek distance. If not, it may be time to
+	 * break the queues apart again.
+	 */
+	if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
+		bfq_mark_bfqq_split_coop(bfqq);
+
+	if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+		if (bfqq->dispatched == 0)
+			/*
+			 * Overloading budget_timeout field to store
+			 * the time at which the queue remains with no
+			 * backlog and no outstanding request; used by
+			 * the weight-raising mechanism.
+			 */
+			bfqq->budget_timeout = jiffies;
+
+		bfq_del_bfqq_busy(bfqd, bfqq, 1);
+	} else {
+		bfq_activate_bfqq(bfqd, bfqq);
+		/*
+		 * Resort priority tree of potential close cooperators.
+		 */
+		bfq_pos_tree_add_move(bfqd, bfqq);
+	}
+}
+
+/**
+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior.
+ * @bfqd: device data.
+ * @bfqq: queue to update.
+ * @reason: reason for expiration.
+ *
+ * Handle the feedback on @bfqq budget at queue expiration.
+ * See the body for detailed comments.
+ */
+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
+				     struct bfq_queue *bfqq,
+				     enum bfqq_expiration reason)
+{
+	struct request *next_rq;
+	int budget, min_budget;
+
+	BUG_ON(bfqq != bfqd->in_service_queue);
+
+	min_budget = bfq_min_budget(bfqd);
+
+	if (bfqq->wr_coeff == 1)
+		budget = bfqq->max_budget;
+	else /*
+	      * Use a constant, low budget for weight-raised queues,
+	      * to help achieve a low latency. Keep it slightly higher
+	      * than the minimum possible budget, to cause a little
+	      * bit fewer expirations.
+	      */
+		budget = 2 * min_budget;
+
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d",
+		bfqq->entity.budget, bfq_bfqq_budget_left(bfqq));
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d",
+		budget, bfq_min_budget(bfqd));
+	bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d",
+		bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue));
+
+	if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) {
+		switch (reason) {
+		/*
+		 * Caveat: in all the following cases we trade latency
+		 * for throughput.
+		 */
+		case BFQ_BFQQ_TOO_IDLE:
+			/*
+			 * This is the only case where we may reduce
+			 * the budget: if there is no request of the
+			 * process still waiting for completion, then
+			 * we assume (tentatively) that the timer has
+			 * expired because the batch of requests of
+			 * the process could have been served with a
+			 * smaller budget.  Hence, betting that
+			 * the process will behave in the same way when it
+			 * becomes backlogged again, we reduce its
+			 * next budget.  As long as we guess right,
+			 * this budget cut reduces the latency
+			 * experienced by the process.
+			 *
+			 * However, if there are still outstanding
+			 * requests, then the process may have not yet
+			 * issued its next request just because it is
+			 * still waiting for the completion of some of
+			 * the still outstanding ones.  So in this
+			 * subcase we do not reduce its budget, on the
+			 * contrary we increase it to possibly boost
+			 * the throughput, as discussed in the
+			 * comments to the BUDGET_TIMEOUT case.
+			 */
+			if (bfqq->dispatched > 0) /* still outstanding reqs */
+				budget = min(budget * 2, bfqd->bfq_max_budget);
+			else {
+				if (budget > 5 * min_budget)
+					budget -= 4 * min_budget;
+				else
+					budget = min_budget;
+			}
+			break;
+		case BFQ_BFQQ_BUDGET_TIMEOUT:
+			/*
+			 * We double the budget here because it gives
+			 * the chance to boost the throughput if this
+			 * is not a seeky process (and has bumped into
+			 * this timeout because of, e.g., ZBR).
+			 */
+			budget = min(budget * 2, bfqd->bfq_max_budget);
+			break;
+		case BFQ_BFQQ_BUDGET_EXHAUSTED:
+			/*
+			 * The process still has backlog, and did not
+			 * let either the budget timeout or the disk
+			 * idling timeout expire. Hence it is not
+			 * seeky, has a short thinktime and may be
+			 * happy with a higher budget too. So
+			 * definitely increase the budget of this good
+			 * candidate to boost the disk throughput.
+			 */
+			budget = min(budget * 4, bfqd->bfq_max_budget);
+			break;
+		case BFQ_BFQQ_NO_MORE_REQUESTS:
+			/*
+			 * For queues that expire for this reason, it
+			 * is particularly important to keep the
+			 * budget close to the actual service they
+			 * need. Doing so reduces the timestamp
+			 * misalignment problem described in the
+			 * comments in the body of
+			 * __bfq_activate_entity. In fact, suppose
+			 * that a queue systematically expires for
+			 * BFQ_BFQQ_NO_MORE_REQUESTS and presents a
+			 * new request in time to enjoy timestamp
+			 * back-shifting. The larger the budget of the
+			 * queue is with respect to the service the
+			 * queue actually requests in each service
+			 * slot, the more times the queue can be
+			 * reactivated with the same virtual finish
+			 * time. It follows that, even if this finish
+			 * time is pushed to the system virtual time
+			 * to reduce the consequent timestamp
+			 * misalignment, the queue unjustly enjoys for
+			 * many re-activations a lower finish time
+			 * than all newly activated queues.
+			 *
+			 * The service needed by bfqq is measured
+			 * quite precisely by bfqq->entity.service.
+			 * Since bfqq does not enjoy device idling,
+			 * bfqq->entity.service is equal to the number
+			 * of sectors that the process associated with
+			 * bfqq requested to read/write before waiting
+			 * for request completions, or blocking for
+			 * other reasons.
+			 */
+			budget = max_t(int, bfqq->entity.service, min_budget);
+			break;
+		default:
+			return;
+		}
+	} else if (!bfq_bfqq_sync(bfqq))
+		/*
+		 * Async queues always get the maximum possible
+		 * budget, as for them we do not care about latency
+		 * (in addition, their ability to dispatch is limited
+		 * by the charging factor).
+		 */
+		budget = bfqd->bfq_max_budget;
+
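+	/*
+	 * Illustrative numbers (made up): with min_budget = 256 sectors
+	 * and bfq_max_budget = 16384 sectors, a sync, non-weight-raised
+	 * queue expiring for BFQ_BFQQ_BUDGET_EXHAUSTED with a budget of
+	 * 4096 sectors is granted 16384 sectors next time (4x, capped),
+	 * whereas one expiring for BFQ_BFQQ_TOO_IDLE with no outstanding
+	 * requests is cut down to 4096 - 4 * 256 = 3072 sectors.
+	 */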
+	bfqq->max_budget = budget;
+
+	if (bfqd->budgets_assigned >= bfq_stats_min_budgets &&
+	    !bfqd->bfq_user_max_budget)
+		bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget);
+
+	/*
+	 * If there is still backlog, then assign a new budget, making
+	 * sure that it is large enough for the next request.  Since
+	 * the finish time of bfqq must be kept in sync with the
+	 * budget, be sure to call __bfq_bfqq_expire() *after* this
+	 * update.
+	 *
+	 * If there is no backlog, then no need to update the budget;
+	 * it will be updated on the arrival of a new request.
+	 */
+	next_rq = bfqq->next_rq;
+	if (next_rq) {
+		BUG_ON(reason == BFQ_BFQQ_TOO_IDLE ||
+		       reason == BFQ_BFQQ_NO_MORE_REQUESTS);
+		bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget,
+					    bfq_serv_to_charge(next_rq, bfqq));
+		BUG_ON(!bfq_bfqq_busy(bfqq));
+		BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
+	}
+
+	bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d",
+			next_rq ? blk_rq_sectors(next_rq) : 0,
+			bfqq->entity.budget);
+}
+
+/*
+ * Return true if the process associated with bfqq is "slow". The slow
+ * flag is used, in addition to the budget timeout, to reduce the
+ * amount of service provided to seeky processes, and thus reduce
+ * their chances to lower the throughput. More details in the comments
+ * on the function bfq_bfqq_expire().
+ *
+ * An important observation is in order: as discussed in the comments
+ * on the function bfq_update_peak_rate(), with devices with internal
+ * queues, it is hard, if possible at all, to know when and for how long
+ * an I/O request is processed by the device (apart from the trivial
+ * I/O pattern where a new request is dispatched only after the
+ * previous one has been completed). This makes it hard to evaluate
+ * the real rate at which the I/O requests of each bfq_queue are
+ * served.  In fact, for an I/O scheduler like BFQ, serving a
+ * bfq_queue means just dispatching its requests during its service
+ * slot (i.e., until the budget of the queue is exhausted, or the
+ * queue remains idle, or, finally, a timeout fires). But, during the
+ * service slot of a bfq_queue, around 100 ms at most, the device may
+ * be even still processing requests of bfq_queues served in previous
+ * service slots. On the opposite end, the requests of the in-service
+ * bfq_queue may be completed after the service slot of the queue
+ * finishes.
+ *
+ * Anyway, unless more sophisticated solutions are used
+ * (where possible), the sum of the sizes of the requests dispatched
+ * during the service slot of a bfq_queue is probably the only
+ * approximation available for the service received by the bfq_queue
+ * during its service slot. And this sum is the quantity used in this
+ * function to evaluate the I/O speed of a process.
+ */
+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+				 bool compensate, enum bfqq_expiration reason,
+				 unsigned long *delta_ms)
+{
+	ktime_t delta_ktime;
+	u32 delta_usecs;
+	bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekiness */
+
+	if (!bfq_bfqq_sync(bfqq))
+		return false;
+
+	if (compensate)
+		delta_ktime = bfqd->last_idling_start;
+	else
+		delta_ktime = ktime_get();
+	delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
+	delta_usecs = ktime_to_us(delta_ktime);
+
+	/* don't trust short/unrealistic values. */
+	if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) {
+		if (blk_queue_nonrot(bfqd->queue))
+			 /*
+			  * give same worst-case guarantees as idling
+			  * for seeky
+			  */
+			*delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC;
+		else /* charge at least one seek */
+			*delta_ms = bfq_slice_idle / NSEC_PER_MSEC;
+
+		bfq_log(bfqd, "bfq_bfqq_is_slow: unrealistic %u", delta_usecs);
+
+		return slow;
+	}
+
+	*delta_ms = delta_usecs / USEC_PER_MSEC;
+
+	/*
+	 * Use only long (> 20ms) intervals to filter out excessive
+	 * spikes in service rate estimation.
+	 */
+	if (delta_usecs > 20000) {
+		/*
+		 * Caveat for rotational devices: processes doing I/O
+		 * in the slower disk zones tend to be slow(er) even
+		 * if not seeky. In this respect, the estimated peak
+		 * rate is likely to be an average over the disk
+		 * surface. Accordingly, to not be too harsh with
+		 * unlucky processes, a process is deemed slow only if
+		 * its rate has been lower than half of the estimated
+		 * peak rate.
+		 */
+		slow = bfqq->entity.service < bfqd->bfq_max_budget / 2;
+		bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d",
+			bfqq->entity.service, bfqd->bfq_max_budget);
+	}
+
+	bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow);
+
+	return slow;
+}
+
+/*
+ * To be deemed as soft real-time, an application must meet two
+ * requirements. First, the application must not require an average
+ * bandwidth higher than the approximate bandwidth required to play back or
+ * record a compressed high-definition video.
+ * The next function is invoked on the completion of the last request of a
+ * batch, to compute the next-start time instant, soft_rt_next_start, such
+ * that, if the next request of the application does not arrive before
+ * soft_rt_next_start, then the above requirement on the bandwidth is met.
+ *
+ * The second requirement is that the request pattern of the application is
+ * isochronous, i.e., that, after issuing a request or a batch of requests,
+ * the application stops issuing new requests until all its pending requests
+ * have been completed. After that, the application may issue a new batch,
+ * and so on.
+ * For this reason the next function is invoked to compute
+ * soft_rt_next_start only for applications that meet this requirement,
+ * whereas soft_rt_next_start is set to infinity for applications that do
+ * not.
+ *
+ * Unfortunately, even a greedy application may happen to behave in an
+ * isochronous way if the CPU load is high. In fact, the application may
+ * stop issuing requests while the CPUs are busy serving other processes,
+ * then restart, then stop again for a while, and so on. In addition, if
+ * the disk achieves a low enough throughput with the request pattern
+ * issued by the application (e.g., because the request pattern is random
+ * and/or the device is slow), then the application may meet the above
+ * bandwidth requirement too. To prevent such a greedy application from
+ * being deemed soft real-time, a further rule is used in the computation of
+ * soft_rt_next_start: soft_rt_next_start must be higher than the current
+ * time plus the maximum time for which BFQ waits for the arrival of a
+ * request when a sync queue becomes idle, namely bfqd->bfq_slice_idle.
+ * This filters out greedy applications, as the latter issue instead their
+ * next request as soon as possible after the last one has been completed
+ * (in contrast, when a batch of requests is completed, a soft real-time
+ * application spends some time processing data).
+ *
+ * Unfortunately, the last filter may easily generate false positives if
+ * only bfqd->bfq_slice_idle is used as a reference time interval and one
+ * or both the following cases occur:
+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher
+ *    than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with
+ *    HZ=100.
+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing
+ *    for a while, then suddenly 'jump' by several units to recover the lost
+ *    increments. This seems to happen, e.g., inside virtual machines.
+ * To address this issue, we do not use as a reference time interval just
+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In
+ * particular we add the minimum number of jiffies for which the filter
+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual
+ * machines.
+ */
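+/*
+ * Illustrative example (figures made up): if a queue received 2048
+ * sectors of service while backlogged and bfq_wr_max_softrt_rate is
+ * 7000 sectors/sec, then its next request must not arrive earlier than
+ * about 290 ms after last_idle_bklogged (or than the jiffies-based
+ * lower bound above, whichever is larger) for the queue to keep being
+ * considered soft real-time.
+ */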
+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd,
+						struct bfq_queue *bfqq)
+{
+	bfq_log_bfqq(bfqd, bfqq,
+"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u",
+		     bfqq->service_from_backlogged,
+		     bfqd->bfq_wr_max_softrt_rate,
+		     jiffies_to_msecs(HZ * bfqq->service_from_backlogged /
+				      bfqd->bfq_wr_max_softrt_rate));
+
+	return max(bfqq->last_idle_bklogged +
+		   HZ * bfqq->service_from_backlogged /
+		   bfqd->bfq_wr_max_softrt_rate,
+		   jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4);
+}
+
+/*
+ * Return the farthest future time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_greatest_from_now(void)
+{
+	return jiffies + MAX_JIFFY_OFFSET;
+}
+
+/*
+ * Return the farthest past time instant according to jiffies
+ * macros.
+ */
+static unsigned long bfq_smallest_from_now(void)
+{
+	return jiffies - MAX_JIFFY_OFFSET;
+}
+
+/**
+ * bfq_bfqq_expire - expire a queue.
+ * @bfqd: device owning the queue.
+ * @bfqq: the queue to expire.
+ * @compensate: if true, compensate for the time spent idling.
+ * @reason: the reason causing the expiration.
+ *
+ * If the process associated with bfqq does slow I/O (e.g., because it
+ * issues random requests), we charge bfqq with the time it has been
+ * in service instead of the service it has received (see
+ * bfq_bfqq_charge_time for details on how this goal is achieved). As
+ * a consequence, bfqq will typically get higher timestamps upon
+ * reactivation, and hence it will be rescheduled as if it had
+ * received more service than what it has actually received. In the
+ * end, bfqq receives less service in proportion to how slowly its
+ * associated process consumes its budgets (and hence how seriously it
+ * tends to lower the throughput). In addition, this time-charging
+ * strategy guarantees time fairness among slow processes. In
+ * contrast, if the process associated with bfqq is not slow, we
+ * charge bfqq exactly with the service it has received.
+ *
+ * Charging time to the first type of queues and the exact service to
+ * the other has the effect of using the WF2Q+ policy to schedule the
+ * former on a timeslice basis, without violating service domain
+ * guarantees among the latter.
+ */
+static void bfq_bfqq_expire(struct bfq_data *bfqd,
+			    struct bfq_queue *bfqq,
+			    bool compensate,
+			    enum bfqq_expiration reason)
+{
+	bool slow;
+	unsigned long delta = 0;
+	struct bfq_entity *entity = &bfqq->entity;
+
+	BUG_ON(bfqq != bfqd->in_service_queue);
+
+	/*
+	 * Check whether the process is slow (see bfq_bfqq_is_slow).
+	 */
+	slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
+
+	/*
+	 * Increase service_from_backlogged before next statement,
+	 * because the possible next invocation of
+	 * bfq_bfqq_charge_time would likely inflate
+	 * entity->service. In contrast, service_from_backlogged must
+	 * contain real service, to enable the soft real-time
+	 * heuristic to correctly compute the bandwidth consumed by
+	 * bfqq.
+	 */
+	bfqq->service_from_backlogged += entity->service;
+
+	/*
+	 * As above explained, charge slow (typically seeky) and
+	 * timed-out queues with the time and not the service
+	 * received, to favor sequential workloads.
+	 *
+	 * Processes doing I/O in the slower disk zones will tend to
+	 * be slow(er) even if not seeky. Therefore, since the
+	 * estimated peak rate is actually an average over the disk
+	 * surface, these processes may timeout just for bad luck. To
+	 * avoid punishing them, do not charge time to processes that
+	 * succeeded in consuming at least 2/3 of their budget. This
+	 * allows BFQ to preserve enough elasticity to still perform
+	 * bandwidth, and not time, distribution with slightly unlucky
+	 * or quasi-sequential processes.
+	 */
+	if (bfqq->wr_coeff == 1 &&
+	    (slow ||
+	     (reason == BFQ_BFQQ_BUDGET_TIMEOUT &&
+	      bfq_bfqq_budget_left(bfqq) >=  entity->budget / 3)))
+		bfq_bfqq_charge_time(bfqd, bfqq, delta);
+
+	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+
+	if (reason == BFQ_BFQQ_TOO_IDLE &&
+	    entity->service <= 2 * entity->budget / 10)
+		bfq_clear_bfqq_IO_bound(bfqq);
+
+	if (bfqd->low_latency && bfqq->wr_coeff == 1)
+		bfqq->last_wr_start_finish = jiffies;
+
+	if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 &&
+	    RB_EMPTY_ROOT(&bfqq->sort_list)) {
+		/*
+		 * If we get here, and there are no outstanding
+		 * requests, then the request pattern is isochronous
+		 * (see the comments on the function
+		 * bfq_bfqq_softrt_next_start()). Thus we can compute
+		 * soft_rt_next_start. If, instead, the queue still
+		 * has outstanding requests, then we have to wait for
+		 * the completion of all the outstanding requests to
+		 * discover whether the request pattern is actually
+		 * isochronous.
+		 */
+		BUG_ON(bfqd->busy_queues < 1);
+		if (bfqq->dispatched == 0) {
+			bfqq->soft_rt_next_start =
+				bfq_bfqq_softrt_next_start(bfqd, bfqq);
+			bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu",
+				     bfqq->soft_rt_next_start);
+		} else {
+			/*
+			 * The application is still waiting for the
+			 * completion of one or more requests:
+			 * prevent it from possibly being incorrectly
+			 * deemed as soft real-time by setting its
+			 * soft_rt_next_start to infinity. In fact,
+			 * without this assignment, the application
+			 * would be incorrectly deemed as soft
+			 * real-time if:
+			 * 1) it issued a new request before the
+			 *    completion of all its in-flight
+			 *    requests, and
+			 * 2) at that time, its soft_rt_next_start
+			 *    happened to be in the past.
+			 */
+			bfqq->soft_rt_next_start =
+				bfq_greatest_from_now();
+			/*
+			 * Schedule an update of soft_rt_next_start to when
+			 * the task may be discovered to be isochronous.
+			 */
+			bfq_mark_bfqq_softrt_update(bfqq);
+		}
+	}
+
+	bfq_log_bfqq(bfqd, bfqq,
+		"expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)",
+		     reason, slow, bfqq->dispatched,
+		     bfq_bfqq_idle_window(bfqq), entity->weight);
+
+	/*
+	 * Increase, decrease or leave budget unchanged according to
+	 * reason.
+	 */
+	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+	__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
+	BUG_ON(bfqq->next_rq == NULL &&
+	       bfqq->entity.budget < bfqq->entity.service);
+	__bfq_bfqq_expire(bfqd, bfqq);
+
+	BUG_ON(!bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED &&
+		!bfq_class_idle(bfqq));
+
+	if (!bfq_bfqq_busy(bfqq) &&
+	    reason != BFQ_BFQQ_BUDGET_TIMEOUT &&
+	    reason != BFQ_BFQQ_BUDGET_EXHAUSTED)
+		bfq_mark_bfqq_non_blocking_wait_rq(bfqq);
+}
+
+/*
+ * Budget timeout is not implemented through a dedicated timer, but
+ * just checked on request arrivals and completions, as well as on
+ * idle timer expirations.
+ */
+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq)
+{
+	return time_is_before_eq_jiffies(bfqq->budget_timeout);
+}
+
+/*
+ * If we expire a queue that is actively waiting (i.e., with the
+ * device idled) for the arrival of a new request, then we may incur
+ * the timestamp misalignment problem described in the body of the
+ * function __bfq_activate_entity. Hence we return true only if this
+ * condition does not hold, or if the queue is slow enough to deserve
+ * only to be kicked off for preserving a high throughput.
+ */
+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq)
+{
+	bfq_log_bfqq(bfqq->bfqd, bfqq,
+		"may_budget_timeout: wait_request %d left %d timeout %d",
+		bfq_bfqq_wait_request(bfqq),
+			bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3,
+		bfq_bfqq_budget_timeout(bfqq));
+
+	return (!bfq_bfqq_wait_request(bfqq) ||
+		bfq_bfqq_budget_left(bfqq) >=  bfqq->entity.budget / 3)
+		&&
+		bfq_bfqq_budget_timeout(bfqq);
+}
+
+/*
+ * For a queue that becomes empty, device idling is allowed only if
+ * this function returns true for that queue. As a consequence, since
+ * device idling plays a critical role for both throughput boosting
+ * and service guarantees, the return value of this function plays a
+ * critical role as well.
+ *
+ * In a nutshell, this function returns true only if idling is
+ * beneficial for throughput or, even if detrimental for throughput,
+ * idling is however necessary to preserve service guarantees (low
+ * latency, desired throughput distribution, ...). In particular, on
+ * NCQ-capable devices, this function tries to return false, so as to
+ * help keep the drives' internal queues full, whenever this helps the
+ * device boost the throughput without causing any service-guarantee
+ * issue.
+ *
+ * In more detail, the return value of this function is obtained by,
+ * first, computing a number of boolean variables that take into
+ * account throughput and service-guarantee issues, and, then,
+ * combining these variables in a logical expression. Most of the
+ * issues taken into account are not trivial. We discuss these issues
+ * while introducing the variables.
+ */
+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq)
+{
+	struct bfq_data *bfqd = bfqq->bfqd;
+	bool idling_boosts_thr, idling_boosts_thr_without_issues,
+		idling_needed_for_service_guarantees,
+		asymmetric_scenario;
+
+	if (bfqd->strict_guarantees)
+		return true;
+
+	/*
+	 * The next variable takes into account the cases where idling
+	 * boosts the throughput.
+	 *
+	 * The value of the variable is computed considering, first, that
+	 * idling is virtually always beneficial for the throughput if:
+	 * (a) the device is not NCQ-capable, or
+	 * (b) regardless of the presence of NCQ, the device is rotational
+	 *     and the request pattern for bfqq is I/O-bound and sequential.
+	 *
+	 * Secondly, and in contrast to the above item (b), idling an
+	 * NCQ-capable flash-based device would not boost the
+	 * throughput even with sequential I/O; rather it would lower
+	 * the throughput in proportion to how fast the device
+	 * is. Accordingly, the next variable is true if any of the
+	 * above conditions (a) and (b) is true, and, in particular,
+	 * happens to be false if bfqd is an NCQ-capable flash-based
+	 * device.
+	 */
+	idling_boosts_thr = !bfqd->hw_tag ||
+		(!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) &&
+		 bfq_bfqq_idle_window(bfqq));
+
+	/*
+	 * The value of the next variable,
+	 * idling_boosts_thr_without_issues, is equal to that of
+	 * idling_boosts_thr, unless a special case holds. In this
+	 * special case, described below, idling may cause problems to
+	 * weight-raised queues.
+	 *
+	 * When the request pool is saturated (e.g., in the presence
+	 * of write hogs), if the processes associated with
+	 * non-weight-raised queues ask for requests at a lower rate,
+	 * then processes associated with weight-raised queues have a
+	 * higher probability to get a request from the pool
+	 * immediately (or at least soon) when they need one. Thus
+	 * they have a higher probability to actually get a fraction
+	 * of the device throughput proportional to their high
+	 * weight. This is especially true with NCQ-capable drives,
+	 * which enqueue several requests in advance, and further
+	 * reorder internally-queued requests.
+	 *
+	 * For this reason, we force to false the value of
+	 * idling_boosts_thr_without_issues if there are weight-raised
+	 * busy queues. In this case, and if bfqq is not weight-raised,
+	 * this guarantees that the device is not idled for bfqq (if,
+	 * instead, bfqq is weight-raised, then idling will be
+	 * guaranteed by another variable, see below). Combined with
+	 * the timestamping rules of BFQ (see [1] for details), this
+	 * behavior causes bfqq, and hence any sync non-weight-raised
+	 * queue, to get a lower number of requests served, and thus
+	 * to ask for a lower number of requests from the request
+	 * pool, before the busy weight-raised queues get served
+	 * again. This often mitigates starvation problems in the
+	 * presence of heavy write workloads and NCQ, thereby
+	 * guaranteeing a higher application and system responsiveness
+	 * in these hostile scenarios.
+	 */
+	idling_boosts_thr_without_issues = idling_boosts_thr &&
+		bfqd->wr_busy_queues == 0;
+
+	/*
+	 * There is then a case where idling must be performed not
+	 * for throughput concerns, but to preserve service
+	 * guarantees.
+	 *
+	 * To introduce this case, we can note that allowing the drive
+	 * to enqueue more than one request at a time, and hence
+	 * delegating de facto final scheduling decisions to the
+	 * drive's internal scheduler, entails loss of control on the
+	 * actual request service order. In particular, the critical
+	 * situation is when requests from different processes happen
+	 * to be present, at the same time, in the internal queue(s)
+	 * of the drive. In such a situation, the drive, by deciding
+	 * the service order of the internally-queued requests, does
+	 * determine also the actual throughput distribution among
+	 * these processes. But the drive typically has no notion or
+	 * concern about per-process throughput distribution, and
+	 * makes its decisions only on a per-request basis. Therefore,
+	 * the service distribution enforced by the drive's internal
+	 * scheduler is likely to coincide with the desired
+	 * device-throughput distribution only in a completely
+	 * symmetric scenario where:
+	 * (i)  each of these processes must get the same throughput as
+	 *      the others;
+	 * (ii) all these processes have the same I/O pattern
+	 *      (either sequential or random).
+	 * In fact, in such a scenario, the drive will tend to treat
+	 * the requests of each of these processes in about the same
+	 * way as the requests of the others, and thus to provide
+	 * each of these processes with about the same throughput
+	 * (which is exactly the desired throughput distribution). In
+	 * contrast, in any asymmetric scenario, device idling is
+	 * certainly needed to guarantee that bfqq receives its
+	 * assigned fraction of the device throughput (see [1] for
+	 * details).
+	 *
+	 * We address this issue by controlling, actually, only the
+	 * symmetry sub-condition (i), i.e., provided that
+	 * sub-condition (i) holds, idling is not performed,
+	 * regardless of whether sub-condition (ii) holds. In other
+	 * words, only if sub-condition (i) holds, then idling is
+	 * allowed, and the device tends to be prevented from queueing
+	 * many requests, possibly of several processes. The reason
+	 * for not controlling also sub-condition (ii) is that we
+	 * exploit preemption to preserve guarantees in case of
+	 * symmetric scenarios, even if (ii) does not hold, as
+	 * explained in the next two paragraphs.
+	 *
+	 * Even if a queue, say Q, is expired when it remains idle, Q
+	 * can still preempt the new in-service queue if the next
+	 * request of Q arrives soon (see the comments on
+	 * bfq_bfqq_update_budg_for_activation). If all queues and
+	 * groups have the same weight, this form of preemption,
+	 * combined with the hole-recovery heuristic described in the
+	 * comments on function bfq_bfqq_update_budg_for_activation,
+	 * are enough to preserve a correct bandwidth distribution in
+	 * the mid term, even without idling. In fact, even if not
+	 * idling allows the internal queues of the device to contain
+	 * many requests, and thus to reorder requests, we can rather
+	 * safely assume that the internal scheduler still preserves a
+	 * minimum of mid-term fairness. The motivation for using
+	 * preemption instead of idling is that, by not idling,
+	 * service guarantees are preserved without sacrificing
+	 * throughput even minimally. In other words, both a high
+	 * throughput and its desired distribution are obtained.
+	 *
+	 * More precisely, this preemption-based, idleless approach
+	 * provides fairness in terms of IOPS, and not sectors per
+	 * second. This can be seen with a simple example. Suppose
+	 * that there are two queues with the same weight, but that
+	 * the first queue receives requests of 8 sectors, while the
+	 * second queue receives requests of 1024 sectors. In
+	 * addition, suppose that each of the two queues contains at
+	 * most one request at a time, which implies that each queue
+	 * always remains idle after it is served. Finally, after
+	 * remaining idle, each queue receives very quickly a new
+	 * request. It follows that the two queues are served
+	 * alternatively, preempting each other if needed. This
+	 * implies that, although both queues have the same weight,
+	 * the queue with large requests receives a service that is
+	 * 1024/8 times as high as the service received by the other
+	 * queue.
+	 *
+	 * On the other hand, device idling is performed, and thus
+	 * pure sector-domain guarantees are provided, for the
+	 * following queues, which are likely to need stronger
+	 * throughput guarantees: weight-raised queues, and queues
+	 * with a higher weight than other queues. When such queues
+	 * are active, sub-condition (i) is false, which triggers
+	 * device idling.
+	 *
+	 * According to the above considerations, the next variable is
+	 * true (only) if sub-condition (i) holds. To compute the
+	 * value of this variable, we not only use the return value of
+	 * the function bfq_symmetric_scenario(), but also check
+	 * whether bfqq is being weight-raised, because
+	 * bfq_symmetric_scenario() does not take into account also
+	 * weight-raised queues (see comments on
+	 * bfq_weights_tree_add()).
+	 *
+	 * As a side note, it is worth considering that the above
+	 * device-idling countermeasures may however fail in the
+	 * following unlucky scenario: if idling is (correctly)
+	 * disabled in a time period during which all symmetry
+	 * sub-conditions hold, and hence the device is allowed to
+	 * enqueue many requests, but at some later point in time some
+	 * sub-condition ceases to hold, then it may become impossible
+	 * to let requests be served in the desired order until all
+	 * the requests already queued in the device have been served.
+	 */
+	asymmetric_scenario = bfqq->wr_coeff > 1 ||
+		!bfq_symmetric_scenario(bfqd);
+
+	/*
+	 * Finally, there is a case where maximizing throughput is the
+	 * best choice even if it may cause unfairness toward
+	 * bfqq. Such a case is when bfqq became active in a burst of
+	 * queue activations. Queues that became active during a large
+	 * burst benefit only from throughput, as discussed in the
+	 * comments on bfq_handle_burst. Thus, if bfqq became active
+	 * in a burst and not idling the device maximizes throughput,
+	 * then the device must not be idled, because not idling the
+	 * device provides bfqq and all other queues in the burst with
+	 * maximum benefit. Combining this and the above case, we can
+	 * now establish when idling is actually needed to preserve
+	 * service guarantees.
+	 */
+	idling_needed_for_service_guarantees =
+		asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq);
+
+	/*
+	 * We have now all the components we need to compute the return
+	 * value of the function, which is true only if both the following
+	 * conditions hold:
+	 * 1) bfqq is sync, because idling makes sense only for sync queues;
+	 * 2) idling either boosts the throughput (without issues), or
+	 *    is necessary to preserve service guarantees.
+	 */
+	bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d",
+		     bfq_bfqq_sync(bfqq), idling_boosts_thr);
+
+	bfq_log_bfqq(bfqd, bfqq,
+		     "may_idle: wr_busy %d boosts %d IO-bound %d guar %d",
+		     bfqd->wr_busy_queues,
+		     idling_boosts_thr_without_issues,
+		     bfq_bfqq_IO_bound(bfqq),
+		     idling_needed_for_service_guarantees);
+
+	return bfq_bfqq_sync(bfqq) &&
+		(idling_boosts_thr_without_issues ||
+		 idling_needed_for_service_guarantees);
+}
+
+/*
+ * If the in-service queue is empty but the function bfq_bfqq_may_idle
+ * returns true, then:
+ * 1) the queue must remain in service and cannot be expired, and
+ * 2) the device must be idled to wait for the possible arrival of a new
+ *    request for the queue.
+ * See the comments on the function bfq_bfqq_may_idle for the reasons
+ * why performing device idling is the best choice to boost the throughput
+ * and preserve service guarantees when bfq_bfqq_may_idle itself
+ * returns true.
+ */
+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq)
+{
+	struct bfq_data *bfqd = bfqq->bfqd;
+
+	return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 &&
+	       bfq_bfqq_may_idle(bfqq);
+}
+
+/*
+ * Select a queue for service.  If we have a current queue in service,
+ * check whether to continue servicing it, or retrieve and set a new one.
+ */
+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq;
+	struct request *next_rq;
+	enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT;
+
+	bfqq = bfqd->in_service_queue;
+	if (!bfqq)
+		goto new_queue;
+
+	bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");
+
+	if (bfq_may_expire_for_budg_timeout(bfqq) &&
+	    !hrtimer_active(&bfqd->idle_slice_timer) &&
+	    !bfq_bfqq_must_idle(bfqq))
+		goto expire;
+
+	next_rq = bfqq->next_rq;
+	/*
+	 * If bfqq has requests queued and it has enough budget left to
+	 * serve them, keep the queue, otherwise expire it.
+	 */
+	if (next_rq) {
+		if (bfq_serv_to_charge(next_rq, bfqq) >
+			bfq_bfqq_budget_left(bfqq)) {
+			reason = BFQ_BFQQ_BUDGET_EXHAUSTED;
+			goto expire;
+		} else {
+			/*
+			 * The idle timer may be pending because we may
+			 * not disable disk idling even when a new request
+			 * arrives.
+			 */
+			if (bfq_bfqq_wait_request(bfqq)) {
+				BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer));
+				/*
+				 * If we get here: 1) at least one new request
+				 * has arrived but we have not disabled the
+				 * timer because the request was too small,
+				 * 2) then the block layer has unplugged
+				 * the device, causing the dispatch to be
+				 * invoked.
+				 *
+				 * Since the device is unplugged, now the
+				 * requests are probably large enough to
+				 * provide a reasonable throughput.
+				 * So we disable idling.
+				 */
+				bfq_clear_bfqq_wait_request(bfqq);
+				hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+				bfqg_stats_update_idle_time(bfqq_group(bfqq));
+			}
+			goto keep_queue;
+		}
+	}
+
+	/*
+	 * No requests pending. However, if the in-service queue is idling
+	 * for a new request, or has requests waiting for a completion and
+	 * may idle after their completion, then keep it anyway.
+	 */
+	if (hrtimer_active(&bfqd->idle_slice_timer) ||
+	    (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
+		bfqq = NULL;
+		goto keep_queue;
+	}
+
+	reason = BFQ_BFQQ_NO_MORE_REQUESTS;
+expire:
+	bfq_bfqq_expire(bfqd, bfqq, false, reason);
+new_queue:
+	bfqq = bfq_set_in_service_queue(bfqd);
+	bfq_log(bfqd, "select_queue: new queue %d returned",
+		bfqq ? bfqq->pid : 0);
+keep_queue:
+	return bfqq;
+}
+
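+/*
+ * Update the weight-raising state of bfqq: end weight raising if bfqq
+ * was activated in a large burst or if the current raising period has
+ * elapsed, possibly switch back from soft real-time to interactive
+ * weight raising, and finally sync the entity weight with the
+ * resulting state.
+ */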
+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */
+		BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
+		       time_is_after_jiffies(bfqq->last_wr_start_finish));
+
+		bfq_log_bfqq(bfqd, bfqq,
+			"raising period dur %u/%u msec, old coeff %u, w %d(%d)",
+			jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish),
+			jiffies_to_msecs(bfqq->wr_cur_max_time),
+			bfqq->wr_coeff,
+			bfqq->entity.weight, bfqq->entity.orig_weight);
+
+		BUG_ON(bfqq != bfqd->in_service_queue && entity->weight !=
+		       entity->orig_weight * bfqq->wr_coeff);
+		if (entity->prio_changed)
+			bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change");
+
+		/*
+		 * If the queue was activated in a burst, or too much
+		 * time has elapsed from the beginning of this
+		 * weight-raising period, then end weight raising.
+		 */
+		if (bfq_bfqq_in_large_burst(bfqq))
+			bfq_bfqq_end_wr(bfqq);
+		else if (time_is_before_jiffies(bfqq->last_wr_start_finish +
+					   bfqq->wr_cur_max_time)) {
+			if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
+			time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
+					bfq_wr_duration(bfqd)))
+				bfq_bfqq_end_wr(bfqq);
+			else {
+				/* switch back to interactive wr */
+				bfqq->wr_coeff = bfqd->bfq_wr_coeff;
+				bfqq->wr_cur_max_time = bfq_wr_duration(bfqd);
+				bfqq->last_wr_start_finish =
+					bfqq->wr_start_at_switch_to_srt;
+				BUG_ON(time_is_after_jiffies(
+					       bfqq->last_wr_start_finish));
+				bfqq->entity.prio_changed = 1;
+				bfq_log_bfqq(bfqd, bfqq,
+					"back to interactive wr");
+			}
+		}
+	}
+	/* Update weight both if it must be raised and if it must be lowered */
+	if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1))
+		__bfq_entity_update_weight_prio(
+			bfq_entity_service_tree(entity),
+			entity);
+}
+
+/*
+ * Dispatch one request from bfqq, moving it to the request queue
+ * dispatch list.
+ */
+static int bfq_dispatch_request(struct bfq_data *bfqd,
+				struct bfq_queue *bfqq)
+{
+	int dispatched = 0;
+	struct request *rq;
+	unsigned long service_to_charge;
+
+	BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list));
+
+	/* Follow expired path, else get first next available. */
+	rq = bfq_check_fifo(bfqq);
+	if (!rq)
+		rq = bfqq->next_rq;
+	service_to_charge = bfq_serv_to_charge(rq, bfqq);
+
+	if (service_to_charge > bfq_bfqq_budget_left(bfqq)) {
+		/*
+		 * This may happen if the next rq is chosen in fifo order
+		 * instead of sector order. The budget is properly
+		 * dimensioned to be always sufficient to serve the next
+		 * request only if it is chosen in sector order. The reason
+		 * is that it would be quite inefficient, and of little
+		 * use, to always make sure that the budget is large
+		 * enough to serve even the possible next rq in fifo
+		 * order. In fact, requests are seldom served in fifo
+		 * order.
+		 *
+		 * Expire the queue for budget exhaustion, and make sure
+		 * that the next act_budget is enough to serve the next
+		 * request, even if it comes from the fifo expired path.
+		 */
+		bfqq->next_rq = rq;
+		/*
+		 * Since this dispatch has failed, make sure that
+		 * a new one will be performed.
+		 */
+		if (!bfqd->rq_in_driver)
+			bfq_schedule_dispatch(bfqd);
+		BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+		goto expire;
+	}
+
+	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+	/* Finally, insert request into driver dispatch list. */
+	bfq_bfqq_served(bfqq, service_to_charge);
+
+	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+
+	bfq_dispatch_insert(bfqd->queue, rq);
+
+	/*
+	 * If weight raising has to terminate for bfqq, then next
+	 * function causes an immediate update of bfqq's weight,
+	 * without waiting for next activation. As a consequence, on
+	 * expiration, bfqq will be timestamped as if it had never been
+	 * weight-raised during this service slot, even if it has
+	 * received part or even most of the service as a
+	 * weight-raised queue. This inflates bfqq's timestamps, which
+	 * is beneficial, as bfqq is then more willing to leave the
+	 * device immediately to possible other weight-raised queues.
+	 */
+	bfq_update_wr_data(bfqd, bfqq);
+
+	bfq_log_bfqq(bfqd, bfqq,
+			"dispatched %u sec req (%llu), budg left %d",
+			blk_rq_sectors(rq),
+			(unsigned long long) blk_rq_pos(rq),
+			bfq_bfqq_budget_left(bfqq));
+
+	dispatched++;
+
+	if (!bfqd->in_service_bic) {
+		atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
+		bfqd->in_service_bic = RQ_BIC(rq);
+	}
+
+	if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
+		goto expire;
+
+	return dispatched;
+
+expire:
+	bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);
+	return dispatched;
+}
+
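+/*
+ * Move all the queued requests of bfqq straight to the dispatch list,
+ * without charging any budget; used only by bfq_forced_dispatch().
+ */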
+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
+{
+	int dispatched = 0;
+
+	while (bfqq->next_rq) {
+		bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
+		dispatched++;
+	}
+
+	BUG_ON(!list_empty(&bfqq->fifo));
+	return dispatched;
+}
+
+/*
+ * Drain our current requests.
+ * Used for barriers and when switching io schedulers on-the-fly.
+ */
+static int bfq_forced_dispatch(struct bfq_data *bfqd)
+{
+	struct bfq_queue *bfqq, *n;
+	struct bfq_service_tree *st;
+	int dispatched = 0;
+
+	bfqq = bfqd->in_service_queue;
+	if (bfqq)
+		__bfq_bfqq_expire(bfqd, bfqq);
+
+	/*
+	 * Loop through classes, and be careful to leave the scheduler
+	 * in a consistent state, as feedback mechanisms and vtime
+	 * updates cannot be disabled during the process.
+	 */
+	list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
+		st = bfq_entity_service_tree(&bfqq->entity);
+
+		dispatched += __bfq_forced_dispatch_bfqq(bfqq);
+
+		bfqq->max_budget = bfq_max_budget(bfqd);
+		bfq_forget_idle(st);
+	}
+
+	BUG_ON(bfqd->busy_queues != 0);
+
+	return dispatched;
+}
+
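+/*
+ * Elevator dispatch hook: select the queue to serve and dispatch one
+ * of its requests, or drain all queues if a forced dispatch is
+ * requested.
+ */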
+static int bfq_dispatch_requests(struct request_queue *q, int force)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_queue *bfqq;
+
+	bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);
+
+	if (bfqd->busy_queues == 0)
+		return 0;
+
+	if (unlikely(force))
+		return bfq_forced_dispatch(bfqd);
+
+	/*
+	 * Force device to serve one request at a time if
+	 * strict_guarantees is true. Forcing this service scheme is
+	 * currently the ONLY way to guarantee that the request
+	 * service order enforced by the scheduler is respected by a
+	 * queueing device. Otherwise the device is free even to make
+	 * some unlucky request wait for as long as the device
+	 * wishes.
+	 *
+	 * Of course, serving one request at a time may cause loss of
+	 * throughput.
+	 */
+	if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
+		return 0;
+
+	bfqq = bfq_select_queue(bfqd);
+	if (!bfqq)
+		return 0;
+
+	BUG_ON(bfqq->entity.budget < bfqq->entity.service);
+
+	BUG_ON(bfq_bfqq_wait_request(bfqq));
+
+	if (!bfq_dispatch_request(bfqd, bfqq))
+		return 0;
+
+	bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
+			bfq_bfqq_sync(bfqq) ? "sync" : "async");
+
+	BUG_ON(bfqq->next_rq == NULL &&
+	       bfqq->entity.budget < bfqq->entity.service);
+	return 1;
+}
+
+/*
+ * Task holds one reference to the queue, dropped when task exits.  Each rq
+ * in-flight on this queue also holds a reference, dropped when rq is freed.
+ *
+ * Queue lock must be held here.
+ */
+static void bfq_put_queue(struct bfq_queue *bfqq)
+{
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_group *bfqg = bfqq_group(bfqq);
+#endif
+
+	BUG_ON(bfqq->ref <= 0);
+
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
+	bfqq->ref--;
+	if (bfqq->ref)
+		return;
+
+	BUG_ON(rb_first(&bfqq->sort_list));
+	BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
+	BUG_ON(bfqq->entity.tree);
+	BUG_ON(bfq_bfqq_busy(bfqq));
+	BUG_ON(bfqq->bfqd->in_service_queue == bfqq);
+
+	if (bfq_bfqq_sync(bfqq))
+		/*
+		 * The fact that this queue is being destroyed does not
+		 * invalidate the fact that this queue may have been
+		 * activated during the current burst. As a consequence,
+		 * although the queue does not exist anymore, and hence
+		 * needs to be removed from the burst list if it is there,
+		 * the burst size must not be decremented.
+		 */
+		hlist_del_init(&bfqq->burst_list_node);
+
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
+
+	kmem_cache_free(bfq_pool, bfqq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_put(bfqg);
+#endif
+}
+
+static void bfq_put_cooperator(struct bfq_queue *bfqq)
+{
+	struct bfq_queue *__bfqq, *next;
+
+	/*
+	 * If this queue was scheduled to merge with another queue, be
+	 * sure to drop the reference taken on that queue (and others in
+	 * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs.
+	 */
+	__bfqq = bfqq->new_bfqq;
+	while (__bfqq) {
+		if (__bfqq == bfqq)
+			break;
+		next = __bfqq->new_bfqq;
+		bfq_put_queue(__bfqq);
+		__bfqq = next;
+	}
+}
+
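+/*
+ * Release bfqq on behalf of an exiting io_context: expire it if it is
+ * in service, drop the references taken for scheduled queue merges,
+ * and put the queue itself.
+ */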
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	if (bfqq == bfqd->in_service_queue) {
+		__bfq_bfqq_expire(bfqd, bfqq);
+		bfq_schedule_dispatch(bfqd);
+	}
+
+	bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref);
+
+	bfq_put_cooperator(bfqq);
+
+	bfq_put_queue(bfqq);
+}
+
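+/*
+ * Initialize the think-time state of a new bfq_io_cq: pretend that the
+ * last request completed far in the past, so that the first think-time
+ * sample taken in bfq_update_io_thinktime() is large.
+ */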
+static void bfq_init_icq(struct io_cq *icq)
+{
+	icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32);
+}
+
+static void bfq_exit_icq(struct io_cq *icq)
+{
+	struct bfq_io_cq *bic = icq_to_bic(icq);
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+
+	if (bic_to_bfqq(bic, false)) {
+		bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false));
+		bic_set_bfqq(bic, NULL, false);
+	}
+
+	if (bic_to_bfqq(bic, true)) {
+		/*
+		 * If the bic is using a shared queue, put the reference
+		 * taken on the io_context when the bic started using a
+		 * shared bfq_queue.
+		 */
+		if (bfq_bfqq_coop(bic_to_bfqq(bic, true)))
+			put_io_context(icq->ioc);
+		bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true));
+		bic_set_bfqq(bic, NULL, true);
+	}
+}
+
+/*
+ * Update the entity prio values; note that the new values will not
+ * be used until the next (re)activation.
+ */
+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq,
+				     struct bfq_io_cq *bic)
+{
+	struct task_struct *tsk = current;
+	int ioprio_class;
+
+	ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
+	switch (ioprio_class) {
+	default:
+		dev_err(bfqq->bfqd->queue->backing_dev_info.dev,
+			"bfq: bad prio class %d\n", ioprio_class);
+	case IOPRIO_CLASS_NONE:
+		/*
+		 * No prio set, inherit CPU scheduling settings.
+		 */
+		bfqq->new_ioprio = task_nice_ioprio(tsk);
+		bfqq->new_ioprio_class = task_nice_ioclass(tsk);
+		break;
+	case IOPRIO_CLASS_RT:
+		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+		bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
+		break;
+	case IOPRIO_CLASS_BE:
+		bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+		bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
+		break;
+	case IOPRIO_CLASS_IDLE:
+		bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
+		bfqq->new_ioprio = 7;
+		bfq_clear_bfqq_idle_window(bfqq);
+		break;
+	}
+
+	if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
+		pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
+			bfqq->new_ioprio);
+		BUG();
+	}
+
+	bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
+	bfqq->entity.prio_changed = 1;
+	bfq_log_bfqq(bfqq->bfqd, bfqq,
+		     "set_next_ioprio_data: bic_class %d prio %d class %d",
+		     ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class);
+}
+
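+/*
+ * React to a change of the ioprio of the io_context: replace the async
+ * queue associated with the old ioprio with one matching the new
+ * ioprio, and update the prio data of the sync queue.
+ */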
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
+{
+	struct bfq_data *bfqd = bic_to_bfqd(bic);
+	struct bfq_queue *bfqq;
+	unsigned long uninitialized_var(flags);
+	int ioprio = bic->icq.ioc->ioprio;
+
+	/*
+	 * This condition may trigger on a newly created bic; be sure to
+	 * drop the lock before returning.
+	 */
+	if (unlikely(!bfqd) || likely(bic->ioprio == ioprio))
+		return;
+
+	bic->ioprio = ioprio;
+
+	bfqq = bic_to_bfqq(bic, false);
+	if (bfqq) {
+		bfq_put_queue(bfqq);
+		bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
+		bic_set_bfqq(bic, bfqq, false);
+		bfq_log_bfqq(bfqd, bfqq,
+			     "check_ioprio_change: bfqq %p %d",
+			     bfqq, bfqq->ref);
+	}
+
+	bfqq = bic_to_bfqq(bic, true);
+	if (bfqq)
+		bfq_set_next_ioprio_data(bfqq, bic);
+}
+
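+/* Initialize the fields of a newly allocated (or of the oom) bfq_queue. */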
+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			  struct bfq_io_cq *bic, pid_t pid, int is_sync)
+{
+	RB_CLEAR_NODE(&bfqq->entity.rb_node);
+	INIT_LIST_HEAD(&bfqq->fifo);
+	INIT_HLIST_NODE(&bfqq->burst_list_node);
+	BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));
+
+	bfqq->ref = 0;
+	bfqq->bfqd = bfqd;
+
+	if (bic)
+		bfq_set_next_ioprio_data(bfqq, bic);
+
+	if (is_sync) {
+		if (!bfq_class_idle(bfqq))
+			bfq_mark_bfqq_idle_window(bfqq);
+		bfq_mark_bfqq_sync(bfqq);
+		bfq_mark_bfqq_just_created(bfqq);
+	} else
+		bfq_clear_bfqq_sync(bfqq);
+	bfq_mark_bfqq_IO_bound(bfqq);
+
+	/* Tentative initial value to trade off throughput and latency */
+	bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3;
+	bfqq->pid = pid;
+
+	bfqq->wr_coeff = 1;
+	bfqq->last_wr_start_finish = jiffies;
+	bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now();
+	bfqq->budget_timeout = bfq_smallest_from_now();
+	bfqq->split_time = bfq_smallest_from_now();
+
+	/*
+	 * Set to the value for which bfqq will not be deemed as
+	 * soft rt when it becomes backlogged.
+	 */
+	bfqq->soft_rt_next_start = bfq_greatest_from_now();
+
+	/* first request is almost certainly seeky */
+	bfqq->seek_history = 1;
+}
+
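+/*
+ * Return a pointer to the slot of bfqg that stores the async queue for
+ * the given ioprio class and ioprio value.
+ */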
+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
+					       struct bfq_group *bfqg,
+					       int ioprio_class, int ioprio)
+{
+	switch (ioprio_class) {
+	case IOPRIO_CLASS_RT:
+		return &bfqg->async_bfqq[0][ioprio];
+	case IOPRIO_CLASS_NONE:
+		ioprio = IOPRIO_NORM;
+		/* fall through */
+	case IOPRIO_CLASS_BE:
+		return &bfqg->async_bfqq[1][ioprio];
+	case IOPRIO_CLASS_IDLE:
+		return &bfqg->async_idle_bfqq;
+	default:
+		BUG();
+	}
+}
+
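+/*
+ * Return a reference to the bfq_queue to use for the requests of the
+ * process associated with bic: for async requests this is the
+ * per-group queue of the matching class and prio, allocated on demand;
+ * for sync requests a dedicated queue is allocated, with oom_bfqq as
+ * fallback.
+ */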
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+				       struct bio *bio, bool is_sync,
+				       struct bfq_io_cq *bic)
+{
+	const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+	const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
+	struct bfq_queue **async_bfqq = NULL;
+	struct bfq_queue *bfqq;
+	struct bfq_group *bfqg;
+
+	rcu_read_lock();
+
+	bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio));
+	if (!bfqg) {
+		bfqq = &bfqd->oom_bfqq;
+		goto out;
+	}
+
+	if (!is_sync) {
+		async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
+						  ioprio);
+		bfqq = *async_bfqq;
+		if (bfqq)
+			goto out;
+	}
+
+	bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO,
+				     bfqd->queue->node);
+
+	if (bfqq) {
+		bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
+			      is_sync);
+		bfq_init_entity(&bfqq->entity, bfqg);
+		bfq_log_bfqq(bfqd, bfqq, "allocated");
+	} else {
+		bfqq = &bfqd->oom_bfqq;
+		bfq_log_bfqq(bfqd, bfqq, "using oom bfqq");
+		goto out;
+	}
+
+	/*
+	 * Pin the queue now that it's allocated, scheduler exit will
+	 * prune it.
+	 */
+	if (async_bfqq) {
+		bfqq->ref++;
+		bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d",
+			     bfqq, bfqq->ref);
+		*async_bfqq = bfqq;
+	}
+
+out:
+	bfqq->ref++;
+	bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
+	rcu_read_unlock();
+	return bfqq;
+}
+
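+/*
+ * Update the exponentially weighted average of the think time of the
+ * process associated with bic, i.e., of the time elapsed between the
+ * completion of its last request and the arrival of the next one.
+ */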
+static void bfq_update_io_thinktime(struct bfq_data *bfqd,
+				    struct bfq_io_cq *bic)
+{
+	struct bfq_ttime *ttime = &bic->ttime;
+	u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request;
+
+	elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle);
+
+	ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8;
+	ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed,  8);
+	ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
+				     ttime->ttime_samples);
+}
+
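+/*
+ * Push a new sample into the seek history of bfqq: the new bit is set
+ * if the distance from the previous request exceeds BFQQ_SEEK_THR.
+ */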
+static void
+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+		       struct request *rq)
+{
+	bfqq->seek_history <<= 1;
+	bfqq->seek_history |=
+		get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR;
+}
+
+/*
+ * Disable idle window if the process thinks too long or seeks so much that
+ * it doesn't matter.
+ */
+static void bfq_update_idle_window(struct bfq_data *bfqd,
+				   struct bfq_queue *bfqq,
+				   struct bfq_io_cq *bic)
+{
+	int enable_idle;
+
+	/* Don't idle for async or idle io prio class. */
+	if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq))
+		return;
+
+	/* Idle window just restored, statistics are meaningless. */
+	if (time_is_after_eq_jiffies(bfqq->split_time +
+				     bfqd->bfq_wr_min_idle_time))
+		return;
+
+	enable_idle = bfq_bfqq_idle_window(bfqq);
+
+	if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
+	    bfqd->bfq_slice_idle == 0 ||
+		(bfqd->hw_tag && BFQQ_SEEKY(bfqq) &&
+			bfqq->wr_coeff == 1))
+		enable_idle = 0;
+	else if (bfq_sample_valid(bic->ttime.ttime_samples)) {
+		if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle &&
+			bfqq->wr_coeff == 1)
+			enable_idle = 0;
+		else
+			enable_idle = 1;
+	}
+	bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d",
+		enable_idle);
+
+	if (enable_idle)
+		bfq_mark_bfqq_idle_window(bfqq);
+	else
+		bfq_clear_bfqq_idle_window(bfqq);
+}
+
+/*
+ * Called when a new fs request (rq) is added to bfqq.  Check if there's
+ * something we should do about it.
+ */
+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			    struct request *rq)
+{
+	struct bfq_io_cq *bic = RQ_BIC(rq);
+
+	if (rq->cmd_flags & REQ_META)
+		bfqq->meta_pending++;
+
+	bfq_update_io_thinktime(bfqd, bic);
+	bfq_update_io_seektime(bfqd, bfqq, rq);
+	if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 ||
+	    !BFQQ_SEEKY(bfqq))
+		bfq_update_idle_window(bfqd, bfqq, bic);
+
+	bfq_log_bfqq(bfqd, bfqq,
+		     "rq_enqueued: idle_window=%d (seeky %d)",
+		     bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq));
+
+	bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
+
+	if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
+		bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 &&
+				 blk_rq_sectors(rq) < 32;
+		bool budget_timeout = bfq_bfqq_budget_timeout(bfqq);
+
+		/*
+		 * There is just this request queued: if the request
+		 * is small and the queue is not to be expired, then
+		 * just exit.
+		 *
+		 * In this way, if the device is being idled to wait
+		 * for a new request from the in-service queue, we
+		 * avoid unplugging the device and committing the
+		 * device to serve just a small request. On the
+		 * contrary, we wait for the block layer to decide
+		 * when to unplug the device: hopefully, new requests
+		 * will be merged with this one quickly, then the device
+		 * will be unplugged and larger requests will be
+		 * dispatched.
+		 */
+		if (small_req && !budget_timeout)
+			return;
+
+		/*
+		 * A large enough request arrived, or the queue is to
+		 * be expired: in both cases disk idling is to be
+		 * stopped, so clear the wait_request flag and cancel
+		 * the timer.
+		 */
+		bfq_clear_bfqq_wait_request(bfqq);
+		hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+		bfqg_stats_update_idle_time(bfqq_group(bfqq));
+
+		/*
+		 * The queue is not empty, because a new request just
+		 * arrived. Hence we can safely expire the queue, in
+		 * case of budget timeout, without risking that the
+		 * timestamps of the queue are not updated correctly.
+		 * See [1] for more details.
+		 */
+		if (budget_timeout)
+			bfq_bfqq_expire(bfqd, bfqq, false,
+					BFQ_BFQQ_BUDGET_TIMEOUT);
+
+		/*
+		 * Let the request rip immediately, or let a new queue be
+		 * selected if bfqq has just been expired.
+		 */
+		__blk_run_queue(bfqd->queue);
+	}
+}
+
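+/*
+ * Elevator add_request hook: possibly merge bfqq with a cooperating
+ * queue, then add rq to the internal structures and to the fifo of the
+ * (possibly new) queue.
+ */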
+static void bfq_insert_request(struct request_queue *q, struct request *rq)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;
+
+	assert_spin_locked(bfqd->queue->queue_lock);
+
+	/*
+	 * An unplug may trigger a requeue of a request from the device
+	 * driver: make sure we are in process context while trying to
+	 * merge two bfq_queues.
+	 */
+	if (!in_interrupt()) {
+		new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
+		if (new_bfqq) {
+			if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq)
+				new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1);
+			/*
+			 * Release the request's reference to the old bfqq
+			 * and make sure one is taken to the shared queue.
+			 */
+			new_bfqq->allocated[rq_data_dir(rq)]++;
+			bfqq->allocated[rq_data_dir(rq)]--;
+			new_bfqq->ref++;
+			bfq_clear_bfqq_just_created(bfqq);
+			bfq_put_queue(bfqq);
+			if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
+				bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
+						bfqq, new_bfqq);
+			rq->elv.priv[1] = new_bfqq;
+			bfqq = new_bfqq;
+		}
+	}
+
+	bfq_add_request(rq);
+
+	rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
+	list_add_tail(&rq->queuelist, &bfqq->fifo);
+
+	bfq_rq_enqueued(bfqd, bfqq, rq);
+}
+
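+/*
+ * Estimate whether the device supports internal command queueing, by
+ * sampling the maximum number of requests in flight at the same time.
+ */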
+static void bfq_update_hw_tag(struct bfq_data *bfqd)
+{
+	bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
+				       bfqd->rq_in_driver);
+
+	if (bfqd->hw_tag == 1)
+		return;
+
+	/*
+	 * This sample is valid if the number of outstanding requests
+	 * is large enough to allow a queueing behavior.  Note that the
+	 * sum is not exact, as it does not take deactivated requests
+	 * into account.
+	 */
+	if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD)
+		return;
+
+	if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
+		return;
+
+	bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD;
+	bfqd->max_rq_in_driver = 0;
+	bfqd->hw_tag_samples = 0;
+}
+
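+/*
+ * Elevator completed_request hook: update queueing and peak-rate
+ * statistics, and decide whether the in-service queue has to be
+ * expired or the device idled.
+ */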
+static void bfq_completed_request(struct request_queue *q, struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+	struct bfq_data *bfqd = bfqq->bfqd;
+	u64 now_ns;
+	u32 delta_us;
+
+	bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left",
+		     blk_rq_sectors(rq));
+
+	assert_spin_locked(bfqd->queue->queue_lock);
+	bfq_update_hw_tag(bfqd);
+
+	BUG_ON(!bfqd->rq_in_driver);
+	BUG_ON(!bfqq->dispatched);
+	bfqd->rq_in_driver--;
+	bfqq->dispatched--;
+	bfqg_stats_update_completion(bfqq_group(bfqq),
+				     rq_start_time_ns(rq),
+				     rq_io_start_time_ns(rq), req_op(rq),
+				     rq->cmd_flags);
+
+	if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
+		BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
+		/*
+		 * Set budget_timeout (which we overload to store the
+		 * time at which the queue is left with no backlog and
+		 * no outstanding requests; used by the weight-raising
+		 * mechanism).
+		 */
+		bfqq->budget_timeout = jiffies;
+
+		bfq_weights_tree_remove(bfqd, &bfqq->entity,
+					&bfqd->queue_weights_tree);
+	}
+
+	now_ns = ktime_get_ns();
+
+	RQ_BIC(rq)->ttime.last_end_request = now_ns;
+
+	/*
+	 * Use us instead of ns, to get a reasonable precision when
+	 * computing the rate in the next check.
+	 */
+	delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);
+
+	bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
+		delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
+		(USEC_PER_SEC*
+		(u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
+			>>BFQ_RATE_SHIFT,
+		(USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT);
+
+	/*
+	 * If the request took rather long to complete, and, according
+	 * to the maximum request size recorded, this completion latency
+	 * implies that the request was certainly served at a very low
+	 * rate (less than 1M sectors/sec), then the whole observation
+	 * interval that lasts up to this time instant cannot be a
+	 * valid time interval for computing a new peak rate.  Invoke
+	 * bfq_update_rate_reset to have the following three steps
+	 * taken:
+	 * - close the observation interval at the last (previous)
+	 *   request dispatch or completion
+	 * - compute rate, if possible, for that observation interval
+	 * - reset to zero samples, which will trigger a proper
+	 *   re-initialization of the observation interval on next
+	 *   dispatch
+	 */
+	if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC &&
+	   (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us <
+			1UL<<(BFQ_RATE_SHIFT - 10))
+		bfq_update_rate_reset(bfqd, NULL);
+	bfqd->last_completion = now_ns;
+
+	/*
+	 * If we are waiting to discover whether the request pattern
+	 * of the task associated with the queue is actually
+	 * isochronous, and both requisites for this condition to hold
+	 * are now satisfied, then compute soft_rt_next_start (see the
+	 * comments on the function bfq_bfqq_softrt_next_start()). We
+	 * schedule this delayed check when bfqq expires, if it still
+	 * has in-flight requests.
+	 */
+	if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 &&
+	    RB_EMPTY_ROOT(&bfqq->sort_list))
+		bfqq->soft_rt_next_start =
+			bfq_bfqq_softrt_next_start(bfqd, bfqq);
+
+	/*
+	 * If this is the in-service queue, check if it needs to be expired,
+	 * or if we want to idle in case it has no pending requests.
+	 */
+	if (bfqd->in_service_queue == bfqq) {
+		if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
+			bfq_arm_slice_timer(bfqd);
+			goto out;
+		} else if (bfq_may_expire_for_budg_timeout(bfqq))
+			bfq_bfqq_expire(bfqd, bfqq, false,
+					BFQ_BFQQ_BUDGET_TIMEOUT);
+		else if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
+			 (bfqq->dispatched == 0 ||
+			  !bfq_bfqq_may_idle(bfqq)))
+			bfq_bfqq_expire(bfqd, bfqq, false,
+					BFQ_BFQQ_NO_MORE_REQUESTS);
+	}
+
+	if (!bfqd->rq_in_driver)
+		bfq_schedule_dispatch(bfqd);
+
+out:
+	return;
+}
+
+static int __bfq_may_queue(struct bfq_queue *bfqq)
+{
+	if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
+		bfq_clear_bfqq_must_alloc(bfqq);
+		return ELV_MQUEUE_MUST;
+	}
+
+	return ELV_MQUEUE_MAY;
+}
+
+static int bfq_may_queue(struct request_queue *q, int op, int op_flags)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct task_struct *tsk = current;
+	struct bfq_io_cq *bic;
+	struct bfq_queue *bfqq;
+
+	/*
+	 * Don't force setup of a queue from here, as a call to may_queue
+	 * does not necessarily imply that a request actually will be
+	 * queued. So just lookup a possibly existing queue, or return
+	 * 'may queue' if that fails.
+	 */
+	bic = bfq_bic_lookup(bfqd, tsk->io_context);
+	if (!bic)
+		return ELV_MQUEUE_MAY;
+
+	bfqq = bic_to_bfqq(bic, rw_is_sync(op, op_flags));
+	if (bfqq)
+		return __bfq_may_queue(bfqq);
+
+	return ELV_MQUEUE_MAY;
+}
+
+/*
+ * Queue lock held here.
+ */
+static void bfq_put_request(struct request *rq)
+{
+	struct bfq_queue *bfqq = RQ_BFQQ(rq);
+
+	if (bfqq) {
+		const int rw = rq_data_dir(rq);
+
+		BUG_ON(!bfqq->allocated[rw]);
+		bfqq->allocated[rw]--;
+
+		rq->elv.priv[0] = NULL;
+		rq->elv.priv[1] = NULL;
+
+		bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
+			     bfqq, bfqq->ref);
+		bfq_put_queue(bfqq);
+	}
+}
+
+/*
+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this
+ * was the last process referring to that bfqq.
+ */
+static struct bfq_queue *
+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
+{
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue");
+
+	put_io_context(bic->icq.ioc);
+
+	if (bfqq_process_refs(bfqq) == 1) {
+		bfqq->pid = current->pid;
+		bfq_clear_bfqq_coop(bfqq);
+		bfq_clear_bfqq_split_coop(bfqq);
+		return bfqq;
+	}
+
+	bic_set_bfqq(bic, NULL, 1);
+
+	bfq_put_cooperator(bfqq);
+
+	bfq_put_queue(bfqq);
+	return NULL;
+}
+
+/*
+ * Allocate bfq data structures associated with this request.
+ */
+static int bfq_set_request(struct request_queue *q, struct request *rq,
+			   struct bio *bio, gfp_t gfp_mask)
+{
+	struct bfq_data *bfqd = q->elevator->elevator_data;
+	struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
+	const int rw = rq_data_dir(rq);
+	const int is_sync = rq_is_sync(rq);
+	struct bfq_queue *bfqq;
+	unsigned long flags;
+	bool split = false;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	bfq_check_ioprio_change(bic, bio);
+
+	if (!bic)
+		goto queue_fail;
+
+	bfq_bic_update_cgroup(bic, bio);
+
+new_queue:
+	bfqq = bic_to_bfqq(bic, is_sync);
+	if (!bfqq || bfqq == &bfqd->oom_bfqq) {
+		if (bfqq)
+			bfq_put_queue(bfqq);
+		bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
+		BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));
+
+		bic_set_bfqq(bic, bfqq, is_sync);
+		if (split && is_sync) {
+			bfq_log_bfqq(bfqd, bfqq,
+				     "set_request: was_in_list %d "
+				     "was_in_large_burst %d "
+				     "large burst in progress %d",
+				     bic->was_in_burst_list,
+				     bic->saved_in_large_burst,
+				     bfqd->large_burst);
+
+			if ((bic->was_in_burst_list && bfqd->large_burst) ||
+			    bic->saved_in_large_burst) {
+				bfq_log_bfqq(bfqd, bfqq,
+					     "set_request: marking in "
+					     "large burst");
+				bfq_mark_bfqq_in_large_burst(bfqq);
+			} else {
+				bfq_log_bfqq(bfqd, bfqq,
+					     "set_request: clearing in "
+					     "large burst");
+				bfq_clear_bfqq_in_large_burst(bfqq);
+				if (bic->was_in_burst_list)
+					hlist_add_head(&bfqq->burst_list_node,
+						       &bfqd->burst_list);
+			}
+			bfqq->split_time = jiffies;
+		}
+	} else {
+		/* If the queue was seeky for too long, break it apart. */
+		if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
+			bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+
+			/* Update bic before losing reference to bfqq */
+			if (bfq_bfqq_in_large_burst(bfqq))
+				bic->saved_in_large_burst = true;
+
+			bfqq = bfq_split_bfqq(bic, bfqq);
+			split = true;
+			if (!bfqq)
+				goto new_queue;
+		}
+	}
+
+	bfqq->allocated[rw]++;
+	bfqq->ref++;
+	bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref);
+
+	rq->elv.priv[0] = bic;
+	rq->elv.priv[1] = bfqq;
+
+	/*
+	 * If a bfq_queue has only one process reference, it is owned
+	 * by only one bfq_io_cq: we can set the bic field of the
+	 * bfq_queue to the address of that structure. Also, if the
+	 * queue has just been split, mark a flag so that the
+	 * information is available to the other scheduler hooks.
+	 */
+	if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) {
+		bfqq->bic = bic;
+		if (split) {
+			/*
+			 * If the queue has just been split from a shared
+			 * queue, restore the idle window and the possible
+			 * weight raising period.
+			 */
+			bfq_bfqq_resume_state(bfqq, bic);
+		}
+	}
+
+	if (unlikely(bfq_bfqq_just_created(bfqq)))
+		bfq_handle_burst(bfqd, bfqq);
+
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return 0;
+
+queue_fail:
+	bfq_schedule_dispatch(bfqd);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return 1;
+}
+
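+/* Work handler for bfqd->unplug_work: rerun the request queue. */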
+static void bfq_kick_queue(struct work_struct *work)
+{
+	struct bfq_data *bfqd =
+		container_of(work, struct bfq_data, unplug_work);
+	struct request_queue *q = bfqd->queue;
+
+	spin_lock_irq(q->queue_lock);
+	__blk_run_queue(q);
+	spin_unlock_irq(q->queue_lock);
+}
+
+/*
+ * Handler of the expiration of the timer running if the in-service queue
+ * is idling inside its time slice.
+ */
+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
+{
+	struct bfq_data *bfqd = container_of(timer, struct bfq_data,
+					     idle_slice_timer);
+	struct bfq_queue *bfqq;
+	unsigned long flags;
+	enum bfqq_expiration reason;
+
+	spin_lock_irqsave(bfqd->queue->queue_lock, flags);
+
+	bfqq = bfqd->in_service_queue;
+	/*
+	 * Theoretical race here: the in-service queue can be NULL or
+	 * different from the queue that was idling if the timer handler
+	 * spins on the queue_lock and a new request arrives for the
+	 * current queue and there is a full dispatch cycle that changes
+	 * the in-service queue.  This can hardly happen, but in the worst
+	 * case we just expire a queue too early.
+	 */
+	if (bfqq) {
+		bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
+		bfq_clear_bfqq_wait_request(bfqq);
+
+		if (bfq_bfqq_budget_timeout(bfqq))
+			/*
+			 * Also here the queue can be safely expired
+			 * for budget timeout without wasting
+			 * guarantees
+			 */
+			reason = BFQ_BFQQ_BUDGET_TIMEOUT;
+		else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
+			/*
+			 * The queue may not be empty upon timer expiration,
+			 * because we may not disable the timer when the
+			 * first request of the in-service queue arrives
+			 * during disk idling.
+			 */
+			reason = BFQ_BFQQ_TOO_IDLE;
+		else
+			goto schedule_dispatch;
+
+		bfq_bfqq_expire(bfqd, bfqq, true, reason);
+	}
+
+schedule_dispatch:
+	bfq_schedule_dispatch(bfqd);
+
+	spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
+	return HRTIMER_NORESTART;
+}
+
+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
+{
+	hrtimer_cancel(&bfqd->idle_slice_timer);
+	cancel_work_sync(&bfqd->unplug_work);
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
+					struct bfq_queue **bfqq_ptr)
+{
+	struct bfq_group *root_group = bfqd->root_group;
+	struct bfq_queue *bfqq = *bfqq_ptr;
+
+	bfq_log(bfqd, "put_async_bfqq: %p", bfqq);
+	if (bfqq) {
+		bfq_bfqq_move(bfqd, bfqq, root_group);
+		bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d",
+			     bfqq, bfqq->ref);
+		bfq_put_queue(bfqq);
+		*bfqq_ptr = NULL;
+	}
+}
+
+/*
+ * Release all the bfqg references to its async queues.  If we are
+ * deallocating the group, these queues may still contain requests, so
+ * we reparent them to the root cgroup (i.e., the only one that will
+ * exist for sure until all the requests on a device are gone).
+ */
+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
+{
+	int i, j;
+
+	for (i = 0; i < 2; i++)
+		for (j = 0; j < IOPRIO_BE_NR; j++)
+			__bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
+
+	__bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
+}
+#endif
+
+static void bfq_exit_queue(struct elevator_queue *e)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	struct request_queue *q = bfqd->queue;
+	struct bfq_queue *bfqq, *n;
+
+	bfq_shutdown_timer_wq(bfqd);
+
+	spin_lock_irq(q->queue_lock);
+
+	BUG_ON(bfqd->in_service_queue);
+	list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
+		bfq_deactivate_bfqq(bfqd, bfqq, 0);
+
+	spin_unlock_irq(q->queue_lock);
+
+	bfq_shutdown_timer_wq(bfqd);
+
+	BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	blkcg_deactivate_policy(q, &blkcg_policy_bfq);
+#else
+	kfree(bfqd->root_group);
+#endif
+
+	kfree(bfqd);
+}
+
+static void bfq_init_root_group(struct bfq_group *root_group,
+				struct bfq_data *bfqd)
+{
+	int i;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	root_group->entity.parent = NULL;
+	root_group->my_entity = NULL;
+	root_group->bfqd = bfqd;
+#endif
+	root_group->rq_pos_tree = RB_ROOT;
+	for (i = 0; i < BFQ_IOPRIO_CLASSES; i++)
+		root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT;
+}
+
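+/*
+ * Elevator init hook: allocate and initialize the bfq_data of the
+ * request queue, together with the root group and the oom_bfqq
+ * fallback queue.
+ */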
+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
+{
+	struct bfq_data *bfqd;
+	struct elevator_queue *eq;
+
+	eq = elevator_alloc(q, e);
+	if (!eq)
+		return -ENOMEM;
+
+	bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node);
+	if (!bfqd) {
+		kobject_put(&eq->kobj);
+		return -ENOMEM;
+	}
+	eq->elevator_data = bfqd;
+
+	/*
+	 * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
+	 * Grab a permanent reference to it, so that the normal code flow
+	 * will not attempt to free it.
+	 */
+	bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
+	bfqd->oom_bfqq.ref++;
+	bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
+	bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
+	bfqd->oom_bfqq.entity.new_weight =
+		bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio);
+
+	/* oom_bfqq does not participate in bursts */
+	bfq_clear_bfqq_just_created(&bfqd->oom_bfqq);
+	/*
+	 * Trigger weight initialization, according to ioprio, at the
+	 * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio
+	 * class won't be changed any more.
+	 */
+	bfqd->oom_bfqq.entity.prio_changed = 1;
+
+	bfqd->queue = q;
+
+	spin_lock_irq(q->queue_lock);
+	q->elevator = eq;
+	spin_unlock_irq(q->queue_lock);
+
+	bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
+	if (!bfqd->root_group)
+		goto out_free;
+	bfq_init_root_group(bfqd->root_group, bfqd);
+	bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
+
+	hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_REL);
+	bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
+
+	bfqd->queue_weights_tree = RB_ROOT;
+	bfqd->group_weights_tree = RB_ROOT;
+
+	INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
+
+	INIT_LIST_HEAD(&bfqd->active_list);
+	INIT_LIST_HEAD(&bfqd->idle_list);
+	INIT_HLIST_HEAD(&bfqd->burst_list);
+
+	bfqd->hw_tag = -1;
+
+	bfqd->bfq_max_budget = bfq_default_max_budget;
+
+	bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0];
+	bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1];
+	bfqd->bfq_back_max = bfq_back_max;
+	bfqd->bfq_back_penalty = bfq_back_penalty;
+	bfqd->bfq_slice_idle = bfq_slice_idle;
+	bfqd->bfq_class_idle_last_service = 0;
+	bfqd->bfq_timeout = bfq_timeout;
+
+	bfqd->bfq_requests_within_timer = 120;
+
+	bfqd->bfq_large_burst_thresh = 8;
+	bfqd->bfq_burst_interval = msecs_to_jiffies(180);
+
+	bfqd->low_latency = true;
+
+	/*
+	 * Trade-off between responsiveness and fairness.
+	 */
+	bfqd->bfq_wr_coeff = 30;
+	bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
+	bfqd->bfq_wr_max_time = 0;
+	bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
+	bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
+	bfqd->bfq_wr_max_softrt_rate = 7000; /*
+					      * Approximate rate required
+					      * to playback or record a
+					      * high-definition compressed
+					      * video.
+					      */
+	bfqd->wr_busy_queues = 0;
+
+	/*
+	 * Begin by assuming, optimistically, that the device is a
+	 * high-speed one, and that its peak rate is equal to 2/3 of
+	 * the highest reference rate.
+	 */
+	bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] *
+			T_fast[blk_queue_nonrot(bfqd->queue)];
+	bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
+	bfqd->device_speed = BFQ_BFQD_FAST;
+
+	return 0;
+
+out_free:
+	kfree(bfqd);
+	kobject_put(&eq->kobj);
+	return -ENOMEM;
+}
+
+static void bfq_slab_kill(void)
+{
+	kmem_cache_destroy(bfq_pool);
+}
+
+static int __init bfq_slab_setup(void)
+{
+	bfq_pool = KMEM_CACHE(bfq_queue, 0);
+	if (!bfq_pool)
+		return -ENOMEM;
+	return 0;
+}
+
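+/* Helpers and macros backing the sysfs tunables exported by bfq. */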
+static ssize_t bfq_var_show(unsigned int var, char *page)
+{
+	return sprintf(page, "%u\n", var);
+}
+
+static ssize_t bfq_var_store(unsigned long *var, const char *page,
+			     size_t count)
+{
+	unsigned long new_val;
+	int ret = kstrtoul(page, 10, &new_val);
+
+	if (ret == 0)
+		*var = new_val;
+
+	return count;
+}
+
+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+
+	return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ?
+		       jiffies_to_msecs(bfqd->bfq_wr_max_time) :
+		       jiffies_to_msecs(bfq_wr_duration(bfqd)));
+}
+
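+/* Dump the weight and weight-raising state of active and idle queues. */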
+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
+{
+	struct bfq_queue *bfqq;
+	struct bfq_data *bfqd = e->elevator_data;
+	ssize_t num_char = 0;
+
+	num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
+			    bfqd->queued);
+
+	spin_lock_irq(bfqd->queue->queue_lock);
+
+	num_char += sprintf(page + num_char, "Active:\n");
+	list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
+		num_char += sprintf(page + num_char,
+				    "pid%d: weight %hu, nr_queued %d %d, ",
+				    bfqq->pid,
+				    bfqq->entity.weight,
+				    bfqq->queued[0],
+				    bfqq->queued[1]);
+		num_char += sprintf(page + num_char,
+				    "dur %d/%u\n",
+				    jiffies_to_msecs(
+					    jiffies -
+					    bfqq->last_wr_start_finish),
+				    jiffies_to_msecs(bfqq->wr_cur_max_time));
+	}
+
+	num_char += sprintf(page + num_char, "Idle:\n");
+	list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) {
+		num_char += sprintf(page + num_char,
+				    "pid%d: weight %hu, dur %d/%u\n",
+				    bfqq->pid,
+				    bfqq->entity.weight,
+				    jiffies_to_msecs(jiffies -
+						     bfqq->last_wr_start_finish),
+				    jiffies_to_msecs(bfqq->wr_cur_max_time));
+	}
+
+	spin_unlock_irq(bfqd->queue->queue_lock);
+
+	return num_char;
+}
+
+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	u64 __data = __VAR;						\
+	if (__CONV == 1)						\
+		__data = jiffies_to_msecs(__data);			\
+	else if (__CONV == 2)						\
+		__data = div_u64(__data, NSEC_PER_MSEC);		\
+	return bfq_var_show(__data, (page));				\
+}
+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2);
+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2);
+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0);
+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0);
+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2);
+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0);
+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1);
+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0);
+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0);
+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0);
+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1);
+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1);
+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async,
+	1);
+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0);
+#undef SHOW_FUNCTION
+
+#define USEC_SHOW_FUNCTION(__FUNC, __VAR)				\
+static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	u64 __data = __VAR;						\
+	__data = div_u64(__data, NSEC_PER_USEC);			\
+	return bfq_var_show(__data, (page));				\
+}
+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle);
+#undef USEC_SHOW_FUNCTION
+
+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
+static ssize_t								\
+__FUNC(struct elevator_queue *e, const char *page, size_t count)	\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	unsigned long uninitialized_var(__data);			\
+	int ret = bfq_var_store(&__data, (page), count);		\
+	if (__data < (MIN))						\
+		__data = (MIN);						\
+	else if (__data > (MAX))					\
+		__data = (MAX);						\
+	if (__CONV == 1)						\
+		*(__PTR) = msecs_to_jiffies(__data);			\
+	else if (__CONV == 2)						\
+		*(__PTR) = (u64)__data * NSEC_PER_MSEC;			\
+	else								\
+		*(__PTR) = __data;					\
+	return ret;							\
+}
+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1,
+		INT_MAX, 2);
+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1,
+		INT_MAX, 2);
+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0);
+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1,
+		INT_MAX, 0);
+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2);
+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0);
+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX,
+		1);
+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0,
+		INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store,
+		&bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1);
+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0,
+		INT_MAX, 0);
+#undef STORE_FUNCTION
+
+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX)			\
+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\
+{									\
+	struct bfq_data *bfqd = e->elevator_data;			\
+	unsigned long __data;						\
+	int ret = bfq_var_store(&__data, (page), count);		\
+	if (__data < (MIN))						\
+		__data = (MIN);						\
+	else if (__data > (MAX))					\
+		__data = (MAX);						\
+	*(__PTR) = (u64)__data * NSEC_PER_USEC;				\
+	return ret;							\
+}
+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0,
+		    UINT_MAX);
+#undef USEC_STORE_FUNCTION
+
+/* do nothing for the moment */
+static ssize_t bfq_weights_store(struct elevator_queue *e,
+				    const char *page, size_t count)
+{
+	return count;
+}
+
+static ssize_t bfq_max_budget_store(struct elevator_queue *e,
+				    const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long uninitialized_var(__data);
+	int ret = bfq_var_store(&__data, (page), count);
+
+	if (__data == 0)
+		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
+	else {
+		if (__data > INT_MAX)
+			__data = INT_MAX;
+		bfqd->bfq_max_budget = __data;
+	}
+
+	bfqd->bfq_user_max_budget = __data;
+
+	return ret;
+}
+
+/*
+ * The name is kept for compatibility with cfq's parameter names, but
+ * this timeout is used for both sync and async requests.
+ */
+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e,
+				      const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long uninitialized_var(__data);
+	int ret = bfq_var_store(&__data, (page), count);
+
+	if (__data < 1)
+		__data = 1;
+	else if (__data > INT_MAX)
+		__data = INT_MAX;
+
+	bfqd->bfq_timeout = msecs_to_jiffies(__data);
+	if (bfqd->bfq_user_max_budget == 0)
+		bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd);
+
+	return ret;
+}
+
+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e,
+				     const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long uninitialized_var(__data);
+	int ret = bfq_var_store(&__data, (page), count);
+
+	if (__data > 1)
+		__data = 1;
+	if (!bfqd->strict_guarantees && __data == 1
+	    && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC)
+		bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC;
+
+	bfqd->strict_guarantees = __data;
+
+	return ret;
+}
+
+static ssize_t bfq_low_latency_store(struct elevator_queue *e,
+				     const char *page, size_t count)
+{
+	struct bfq_data *bfqd = e->elevator_data;
+	unsigned long uninitialized_var(__data);
+	int ret = bfq_var_store(&__data, (page), count);
+
+	if (__data > 1)
+		__data = 1;
+	if (__data == 0 && bfqd->low_latency != 0)
+		bfq_end_wr(bfqd);
+	bfqd->low_latency = __data;
+
+	return ret;
+}
+
+#define BFQ_ATTR(name) \
+	__ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store)
+
+static struct elv_fs_entry bfq_attrs[] = {
+	BFQ_ATTR(fifo_expire_sync),
+	BFQ_ATTR(fifo_expire_async),
+	BFQ_ATTR(back_seek_max),
+	BFQ_ATTR(back_seek_penalty),
+	BFQ_ATTR(slice_idle),
+	BFQ_ATTR(slice_idle_us),
+	BFQ_ATTR(max_budget),
+	BFQ_ATTR(timeout_sync),
+	BFQ_ATTR(strict_guarantees),
+	BFQ_ATTR(low_latency),
+	BFQ_ATTR(wr_coeff),
+	BFQ_ATTR(wr_max_time),
+	BFQ_ATTR(wr_rt_max_time),
+	BFQ_ATTR(wr_min_idle_time),
+	BFQ_ATTR(wr_min_inter_arr_async),
+	BFQ_ATTR(wr_max_softrt_rate),
+	BFQ_ATTR(weights),
+	__ATTR_NULL
+};
+
+static struct elevator_type iosched_bfq = {
+	.ops = {
+		.elevator_merge_fn =		bfq_merge,
+		.elevator_merged_fn =		bfq_merged_request,
+		.elevator_merge_req_fn =	bfq_merged_requests,
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		.elevator_bio_merged_fn =	bfq_bio_merged,
+#endif
+		.elevator_allow_bio_merge_fn =	bfq_allow_bio_merge,
+		.elevator_allow_rq_merge_fn =	bfq_allow_rq_merge,
+		.elevator_dispatch_fn =		bfq_dispatch_requests,
+		.elevator_add_req_fn =		bfq_insert_request,
+		.elevator_activate_req_fn =	bfq_activate_request,
+		.elevator_deactivate_req_fn =	bfq_deactivate_request,
+		.elevator_completed_req_fn =	bfq_completed_request,
+		.elevator_former_req_fn =	elv_rb_former_request,
+		.elevator_latter_req_fn =	elv_rb_latter_request,
+		.elevator_init_icq_fn =		bfq_init_icq,
+		.elevator_exit_icq_fn =		bfq_exit_icq,
+		.elevator_set_req_fn =		bfq_set_request,
+		.elevator_put_req_fn =		bfq_put_request,
+		.elevator_may_queue_fn =	bfq_may_queue,
+		.elevator_init_fn =		bfq_init_queue,
+		.elevator_exit_fn =		bfq_exit_queue,
+	},
+	.icq_size =		sizeof(struct bfq_io_cq),
+	.icq_align =		__alignof__(struct bfq_io_cq),
+	.elevator_attrs =	bfq_attrs,
+	.elevator_name =	"bfq",
+	.elevator_owner =	THIS_MODULE,
+};
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static struct blkcg_policy blkcg_policy_bfq = {
+	.dfl_cftypes		= bfq_blkg_files,
+	.legacy_cftypes		= bfq_blkcg_legacy_files,
+
+	.cpd_alloc_fn		= bfq_cpd_alloc,
+	.cpd_init_fn		= bfq_cpd_init,
+	.cpd_bind_fn	        = bfq_cpd_init,
+	.cpd_free_fn		= bfq_cpd_free,
+
+	.pd_alloc_fn		= bfq_pd_alloc,
+	.pd_init_fn		= bfq_pd_init,
+	.pd_offline_fn		= bfq_pd_offline,
+	.pd_free_fn		= bfq_pd_free,
+	.pd_reset_stats_fn	= bfq_pd_reset_stats,
+};
+#endif
+
+static int __init bfq_init(void)
+{
+	int ret;
+	char msg[50] = "BFQ I/O-scheduler: v8r5";
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	ret = blkcg_policy_register(&blkcg_policy_bfq);
+	if (ret)
+		return ret;
+#endif
+
+	ret = -ENOMEM;
+	if (bfq_slab_setup())
+		goto err_pol_unreg;
+
+	/*
+	 * Times to load large popular applications for the typical
+	 * systems installed on the reference devices (see the
+	 * comments before the definitions of the next two
+	 * arrays). Actually, we use slightly slower values, as the
+	 * estimated peak rate tends to be smaller than the actual
+	 * peak rate.  The reason for this last fact is that estimates
+	 * are computed over much shorter time intervals than the long
+	 * intervals typically used for benchmarking. Why? First, to
+	 * adapt more quickly to variations. Second, because an I/O
+	 * scheduler cannot rely on a peak-rate-evaluation workload to
+	 * be run for a long time.
+	 */
+	T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */
+	T_slow[1] = msecs_to_jiffies(1000); /* actually 1.5 sec */
+	T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */
+	T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */
+
+	/*
+	 * Thresholds that determine the switch between speed classes
+	 * (see the comments before the definition of the array
+	 * device_speed_thresh). These thresholds are biased towards
+	 * transitions to the fast class. This is safer than the
+	 * opposite bias. In fact, a wrong transition to the slow
+	 * class results in short weight-raising periods, because the
+	 * speed of the device then tends to be higher than the
+	 * reference peak rate. On the opposite end, a wrong
+	 * transition to the fast class tends to increase
+	 * weight-raising periods, because of the opposite reason.
+	 */
+	device_speed_thresh[0] = (4 * R_slow[0]) / 3;
+	device_speed_thresh[1] = (4 * R_slow[1]) / 3;
+
+	ret = elv_register(&iosched_bfq);
+	if (ret)
+		goto err_pol_unreg;
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	strcat(msg, " (with cgroups support)");
+#endif
+	pr_info("%s", msg);
+
+	return 0;
+
+err_pol_unreg:
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
+	return ret;
+}
+
+static void __exit bfq_exit(void)
+{
+	elv_unregister(&iosched_bfq);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	blkcg_policy_unregister(&blkcg_policy_bfq);
+#endif
+	bfq_slab_kill();
+}
+
+module_init(bfq_init);
+module_exit(bfq_exit);
+
+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente");
+MODULE_LICENSE("GPL");
diff --git b/block/bfq-sched.c b/block/bfq-sched.c
new file mode 100644
index 0000000..45d63d3
--- /dev/null
+++ b/block/bfq-sched.c
@@ -0,0 +1,1501 @@
+/*
+ * BFQ: Hierarchical B-WF2Q+ scheduler.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
+ */
+
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+#define for_each_entity(entity)	\
+	for (; entity ; entity = entity->parent)
+
+#define for_each_entity_safe(entity, parent) \
+	for (; entity && ({ parent = entity->parent; 1; }); entity = parent)
+
+
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
+						 int extract,
+						 struct bfq_data *bfqd);
+
+static void bfq_update_budget(struct bfq_entity *next_in_service)
+{
+	struct bfq_entity *bfqg_entity;
+	struct bfq_group *bfqg;
+	struct bfq_sched_data *group_sd;
+
+	BUG_ON(!next_in_service);
+
+	group_sd = next_in_service->sched_data;
+
+	bfqg = container_of(group_sd, struct bfq_group, sched_data);
+	/*
+	 * bfq_group's my_entity field is not NULL only if the group
+	 * is not the root group. We must not touch the root entity
+	 * as it must never become an in-service entity.
+	 */
+	bfqg_entity = bfqg->my_entity;
+	if (bfqg_entity)
+		bfqg_entity->budget = next_in_service->budget;
+}
+
+static int bfq_update_next_in_service(struct bfq_sched_data *sd)
+{
+	struct bfq_entity *next_in_service;
+	struct bfq_queue *bfqq;
+
+	if (sd->in_service_entity)
+		/* will update/requeue at the end of service */
+		return 0;
+
+	/*
+	 * NOTE: this can be improved in many ways, such as returning
+	 * 1 (and thus propagating upwards the update) only when the
+	 * budget changes, or caching the bfqq that will be scheduled
+	 * next from this subtree.  For now we worry more about
+	 * correctness than about performance...
+	 */
+	next_in_service = bfq_lookup_next_entity(sd, 0, NULL);
+	sd->next_in_service = next_in_service;
+
+	if (next_in_service)
+		bfq_update_budget(next_in_service);
+	else
+		goto exit;
+
+	bfqq = bfq_entity_to_bfqq(next_in_service);
+	if (bfqq)
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			     "update_next_in_service: chosen this queue");
+	else {
+		struct bfq_group *bfqg =
+			container_of(next_in_service,
+				     struct bfq_group, entity);
+
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			     "update_next_in_service: chosen this entity");
+	}
+exit:
+	return 1;
+}
+
+static void bfq_check_next_in_service(struct bfq_sched_data *sd,
+				      struct bfq_entity *entity)
+{
+	WARN_ON(sd->next_in_service != entity);
+}
+#else
+#define for_each_entity(entity)	\
+	for (; entity ; entity = NULL)
+
+#define for_each_entity_safe(entity, parent) \
+	for (parent = NULL; entity ; entity = parent)
+
+static int bfq_update_next_in_service(struct bfq_sched_data *sd)
+{
+	return 0;
+}
+
+static void bfq_check_next_in_service(struct bfq_sched_data *sd,
+				      struct bfq_entity *entity)
+{
+}
+
+static void bfq_update_budget(struct bfq_entity *next_in_service)
+{
+}
+#endif
+
+/*
+ * Shift for timestamp calculations.  This actually limits the maximum
+ * service allowed in one timestamp delta (small shift values increase it),
+ * the maximum total weight that can be used for the queues in the system
+ * (big shift values increase it), and the period of virtual time
+ * wraparounds.
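+ *
+ * The shift is applied in bfq_delta() below: the service is scaled by
+ * 1 << WFQ_SERVICE_SHIFT before being divided by the weight, so larger
+ * shifts leave less headroom for the service and more for the weights.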
+ */
+#define WFQ_SERVICE_SHIFT	22
+
+/**
+ * bfq_gt - compare two timestamps.
+ * @a: first ts.
+ * @b: second ts.
+ *
+ * Return @a > @b, dealing with wrapping correctly.
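+ * For instance, right after a 64-bit wraparound, a = 10 is still
+ * considered later than b = U64_MAX - 10: the unsigned difference
+ * a - b wraps to 21, and (s64)21 is positive, so bfq_gt(a, b) is true.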
+ */
+static int bfq_gt(u64 a, u64 b)
+{
+	return (s64)(a - b) > 0;
+}
+
+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = NULL;
+
+	BUG_ON(!entity);
+
+	if (!entity->my_sched_data)
+		bfqq = container_of(entity, struct bfq_queue, entity);
+
+	return bfqq;
+}
+
+
+/**
+ * bfq_delta - map service into the virtual time domain.
+ * @service: amount of service.
+ * @weight: scale factor (weight of an entity or weight sum).
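+ *
+ * The returned delta is (@service << WFQ_SERVICE_SHIFT) / @weight.
+ * For instance, 8 sectors of service at weight 100 map to
+ * (8 << 22) / 100 = 335544 units of virtual time, and doubling the
+ * weight halves that delta.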
+ */
+static u64 bfq_delta(unsigned long service, unsigned long weight)
+{
+	u64 d = (u64)service << WFQ_SERVICE_SHIFT;
+
+	do_div(d, weight);
+	return d;
+}
+
+/**
+ * bfq_calc_finish - assign the finish time to an entity.
+ * @entity: the entity to act upon.
+ * @service: the service to be charged to the entity.
+ */
+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	unsigned long long start, finish, delta;
+
+	BUG_ON(entity->weight == 0);
+
+	entity->finish = entity->start +
+		bfq_delta(service, entity->weight);
+
+	start = ((entity->start>>10)*1000)>>12;
+	finish = ((entity->finish>>10)*1000)>>12;
+	delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12;
+
+	if (bfqq) {
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			"calc_finish: serv %lu, w %d",
+			service, entity->weight);
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			"calc_finish: start %llu, finish %llu, delta %llu",
+			start, finish, delta);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	} else {
+		struct bfq_group *bfqg =
+			container_of(entity, struct bfq_group, entity);
+
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			"calc_finish group: serv %lu, w %d",
+			     service, entity->weight);
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			"calc_finish group: start %llu, finish %llu, delta %llu",
+			start, finish, delta);
+#endif
+	}
+}
+
+/**
+ * bfq_entity_of - get an entity from a node.
+ * @node: the node field of the entity.
+ *
+ * Convert a node pointer to the relative entity.  This is used only
+ * to simplify the logic of some functions and not as the generic
+ * conversion mechanism because, e.g., in the tree walking functions,
+ * the check for a %NULL value would be redundant.
+ */
+static struct bfq_entity *bfq_entity_of(struct rb_node *node)
+{
+	struct bfq_entity *entity = NULL;
+
+	if (node)
+		entity = rb_entry(node, struct bfq_entity, rb_node);
+
+	return entity;
+}
+
+/**
+ * bfq_extract - remove an entity from a tree.
+ * @root: the tree root.
+ * @entity: the entity to remove.
+ */
+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity)
+{
+	BUG_ON(entity->tree != root);
+
+	entity->tree = NULL;
+	rb_erase(&entity->rb_node, root);
+}
+
+/**
+ * bfq_idle_extract - extract an entity from the idle tree.
+ * @st: the service tree of the owning @entity.
+ * @entity: the entity being removed.
+ */
+static void bfq_idle_extract(struct bfq_service_tree *st,
+			     struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *next;
+
+	BUG_ON(entity->tree != &st->idle);
+
+	if (entity == st->first_idle) {
+		next = rb_next(&entity->rb_node);
+		st->first_idle = bfq_entity_of(next);
+	}
+
+	if (entity == st->last_idle) {
+		next = rb_prev(&entity->rb_node);
+		st->last_idle = bfq_entity_of(next);
+	}
+
+	bfq_extract(&st->idle, entity);
+
+	if (bfqq)
+		list_del(&bfqq->bfqq_list);
+}
+
+/**
+ * bfq_insert - generic tree insertion.
+ * @root: tree root.
+ * @entity: entity to insert.
+ *
+ * This is used for the idle and the active tree, since they are both
+ * ordered by finish time.
+ */
+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity)
+{
+	struct bfq_entity *entry;
+	struct rb_node **node = &root->rb_node;
+	struct rb_node *parent = NULL;
+
+	BUG_ON(entity->tree);
+
+	while (*node) {
+		parent = *node;
+		entry = rb_entry(parent, struct bfq_entity, rb_node);
+
+		if (bfq_gt(entry->finish, entity->finish))
+			node = &parent->rb_left;
+		else
+			node = &parent->rb_right;
+	}
+
+	rb_link_node(&entity->rb_node, parent, node);
+	rb_insert_color(&entity->rb_node, root);
+
+	entity->tree = root;
+}
+
+/**
+ * bfq_update_min - update the min_start field of an entity.
+ * @entity: the entity to update.
+ * @node: one of its children.
+ *
+ * This function is called when @entity may store an invalid value for
+ * min_start due to updates to the active tree.  The function  assumes
+ * that the subtree rooted at @node (which may be its left or its right
+ * child) has a valid min_start value.
+ */
+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node)
+{
+	struct bfq_entity *child;
+
+	if (node) {
+		child = rb_entry(node, struct bfq_entity, rb_node);
+		if (bfq_gt(entity->min_start, child->min_start))
+			entity->min_start = child->min_start;
+	}
+}
+
+/**
+ * bfq_update_active_node - recalculate min_start.
+ * @node: the node to update.
+ *
+ * @node may have changed position or one of its children may have moved,
+ * this function updates its min_start value.  The left and right subtrees
+ * are assumed to hold a correct min_start value.
+ */
+static void bfq_update_active_node(struct rb_node *node)
+{
+	struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node);
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	entity->min_start = entity->start;
+	bfq_update_min(entity, node->rb_right);
+	bfq_update_min(entity, node->rb_left);
+
+	if (bfqq) {
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			     "update_active_node: new min_start %llu",
+			     ((entity->min_start>>10)*1000)>>12);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	} else {
+		struct bfq_group *bfqg =
+			container_of(entity, struct bfq_group, entity);
+
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			     "update_active_node: new min_start %llu",
+			     ((entity->min_start>>10)*1000)>>12);
+#endif
+	}
+}
+
+/**
+ * bfq_update_active_tree - update min_start for the whole active tree.
+ * @node: the starting node.
+ *
+ * @node must be the deepest modified node after an update.  This function
+ * updates its min_start using the values held by its children, assuming
+ * that they did not change, and then updates all the nodes that may have
+ * changed in the path to the root.  The only nodes that may have changed
+ * are the ones in the path or their siblings.
+ */
+static void bfq_update_active_tree(struct rb_node *node)
+{
+	struct rb_node *parent;
+
+up:
+	bfq_update_active_node(node);
+
+	parent = rb_parent(node);
+	if (!parent)
+		return;
+
+	if (node == parent->rb_left && parent->rb_right)
+		bfq_update_active_node(parent->rb_right);
+	else if (parent->rb_left)
+		bfq_update_active_node(parent->rb_left);
+
+	node = parent;
+	goto up;
+}
+
+static void bfq_weights_tree_add(struct bfq_data *bfqd,
+				 struct bfq_entity *entity,
+				 struct rb_root *root);
+
+static void bfq_weights_tree_remove(struct bfq_data *bfqd,
+				    struct bfq_entity *entity,
+				    struct rb_root *root);
+
+
+/**
+ * bfq_active_insert - insert an entity in the active tree of its
+ *                     group/device.
+ * @st: the service tree of the entity.
+ * @entity: the entity being inserted.
+ *
+ * The active tree is ordered by finish time, but an extra key is kept
+ * in each node, containing the minimum value for the start times of
+ * its children (and the node itself), so it's possible to search for
+ * the eligible node with the lowest finish time in logarithmic time.
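+ *
+ * bfq_first_active_entity() relies on this extra key: it descends into
+ * a subtree only if that subtree's min_start does not exceed the
+ * current vtime, i.e., only if the subtree may contain an eligible
+ * entity.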
+ */
+static void bfq_active_insert(struct bfq_service_tree *st,
+			      struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *node = &entity->rb_node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_sched_data *sd = NULL;
+	struct bfq_group *bfqg = NULL;
+	struct bfq_data *bfqd = NULL;
+#endif
+
+	bfq_insert(&st->active, entity);
+
+	if (node->rb_left)
+		node = node->rb_left;
+	else if (node->rb_right)
+		node = node->rb_right;
+
+	bfq_update_active_tree(node);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	sd = entity->sched_data;
+	bfqg = container_of(sd, struct bfq_group, sched_data);
+	BUG_ON(!bfqg);
+	bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+	if (bfqq)
+		list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	else { /* bfq_group */
+		BUG_ON(!bfqd);
+		bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree);
+	}
+	if (bfqg != bfqd->root_group) {
+		BUG_ON(!bfqg);
+		BUG_ON(!bfqd);
+		bfqg->active_entities++;
+	}
+#endif
+}
+
+/**
+ * bfq_ioprio_to_weight - calc a weight from an ioprio.
+ * @ioprio: the ioprio value to convert.
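+ *
+ * With IOPRIO_BE_NR equal to 8 and BFQ_WEIGHT_CONVERSION_COEFF equal
+ * to 10, ioprio 0 maps to weight 80, the default ioprio 4 to weight 40,
+ * and ioprio 7 to weight 10.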
+ */
+static unsigned short bfq_ioprio_to_weight(int ioprio)
+{
+	BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR);
+	return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
+}
+
+/**
+ * bfq_weight_to_ioprio - calc an ioprio from a weight.
+ * @weight: the weight value to convert.
+ *
+ * To preserve as much as possible the old only-ioprio user interface,
+ * 0 is used as an escape ioprio value for weights (numerically) equal or
+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
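+ * For instance, with IOPRIO_BE_NR equal to 8 and
+ * BFQ_WEIGHT_CONVERSION_COEFF equal to 10, any weight of 80 or more is
+ * mapped to ioprio 0.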
+ */
+static unsigned short bfq_weight_to_ioprio(int weight)
+{
+	BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT);
+	return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ?
+		0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight;
+}
+
+static void bfq_get_entity(struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+	if (bfqq) {
+		bfqq->ref++;
+		bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
+			     bfqq, bfqq->ref);
+	}
+}
+
+/**
+ * bfq_find_deepest - find the deepest node that an extraction can modify.
+ * @node: the node being removed.
+ *
+ * Do the first step of an extraction in an rb tree, looking for the
+ * node that will replace @node, and returning the deepest node that
+ * the following modifications to the tree can touch.  If @node is the
+ * last node in the tree return %NULL.
+ */
+static struct rb_node *bfq_find_deepest(struct rb_node *node)
+{
+	struct rb_node *deepest;
+
+	if (!node->rb_right && !node->rb_left)
+		deepest = rb_parent(node);
+	else if (!node->rb_right)
+		deepest = node->rb_left;
+	else if (!node->rb_left)
+		deepest = node->rb_right;
+	else {
+		deepest = rb_next(node);
+		if (deepest->rb_right)
+			deepest = deepest->rb_right;
+		else if (rb_parent(deepest) != node)
+			deepest = rb_parent(deepest);
+	}
+
+	return deepest;
+}
+
+/**
+ * bfq_active_extract - remove an entity from the active tree.
+ * @st: the service_tree containing the tree.
+ * @entity: the entity being removed.
+ */
+static void bfq_active_extract(struct bfq_service_tree *st,
+			       struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct rb_node *node;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	struct bfq_sched_data *sd = NULL;
+	struct bfq_group *bfqg = NULL;
+	struct bfq_data *bfqd = NULL;
+#endif
+
+	node = bfq_find_deepest(&entity->rb_node);
+	bfq_extract(&st->active, entity);
+
+	if (node)
+		bfq_update_active_tree(node);
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	sd = entity->sched_data;
+	bfqg = container_of(sd, struct bfq_group, sched_data);
+	BUG_ON(!bfqg);
+	bfqd = (struct bfq_data *)bfqg->bfqd;
+#endif
+	if (bfqq)
+		list_del(&bfqq->bfqq_list);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	else { /* bfq_group */
+		BUG_ON(!bfqd);
+		bfq_weights_tree_remove(bfqd, entity,
+					&bfqd->group_weights_tree);
+	}
+	if (bfqg != bfqd->root_group) {
+		BUG_ON(!bfqg);
+		BUG_ON(!bfqd);
+		BUG_ON(!bfqg->active_entities);
+		bfqg->active_entities--;
+	}
+#endif
+}
+
+/**
+ * bfq_idle_insert - insert an entity into the idle tree.
+ * @st: the service tree containing the tree.
+ * @entity: the entity to insert.
+ */
+static void bfq_idle_insert(struct bfq_service_tree *st,
+			    struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct bfq_entity *first_idle = st->first_idle;
+	struct bfq_entity *last_idle = st->last_idle;
+
+	if (!first_idle || bfq_gt(first_idle->finish, entity->finish))
+		st->first_idle = entity;
+	if (!last_idle || bfq_gt(entity->finish, last_idle->finish))
+		st->last_idle = entity;
+
+	bfq_insert(&st->idle, entity);
+
+	if (bfqq)
+		list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list);
+}
+
+/**
+ * bfq_forget_entity - remove an entity from the wfq trees.
+ * @st: the service tree.
+ * @entity: the entity being removed.
+ *
+ * Update the device status and forget everything about @entity, putting
+ * the device reference to it, if it is a queue.  Entities belonging to
+ * groups are not refcounted.
+ */
+static void bfq_forget_entity(struct bfq_service_tree *st,
+			      struct bfq_entity *entity)
+{
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	struct bfq_sched_data *sd;
+
+	BUG_ON(!entity->on_st);
+
+	entity->on_st = 0;
+	st->wsum -= entity->weight;
+	if (bfqq) {
+		sd = entity->sched_data;
+		bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d",
+			     bfqq, bfqq->ref);
+		bfq_put_queue(bfqq);
+	}
+}
+
+/**
+ * bfq_put_idle_entity - release the idle tree ref of an entity.
+ * @st: service tree for the entity.
+ * @entity: the entity being released.
+ */
+static void bfq_put_idle_entity(struct bfq_service_tree *st,
+				struct bfq_entity *entity)
+{
+	bfq_idle_extract(st, entity);
+	bfq_forget_entity(st, entity);
+}
+
+/**
+ * bfq_forget_idle - update the idle tree if necessary.
+ * @st: the service tree to act upon.
+ *
+ * To preserve the global O(log N) complexity we only remove one entry here;
+ * as the idle tree will not grow indefinitely this can be done safely.
+ */
+static void bfq_forget_idle(struct bfq_service_tree *st)
+{
+	struct bfq_entity *first_idle = st->first_idle;
+	struct bfq_entity *last_idle = st->last_idle;
+
+	if (RB_EMPTY_ROOT(&st->active) && last_idle &&
+	    !bfq_gt(last_idle->finish, st->vtime)) {
+		/*
+		 * Forget the whole idle tree, increasing the vtime past
+		 * the last finish time of idle entities.
+		 */
+		st->vtime = last_idle->finish;
+	}
+
+	if (first_idle && !bfq_gt(first_idle->finish, st->vtime))
+		bfq_put_idle_entity(st, first_idle);
+}
+
+static struct bfq_service_tree *
+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
+			 struct bfq_entity *entity)
+{
+	struct bfq_service_tree *new_st = old_st;
+
+	if (entity->prio_changed) {
+		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+		unsigned int prev_weight, new_weight;
+		struct bfq_data *bfqd = NULL;
+		struct rb_root *root;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		struct bfq_sched_data *sd;
+		struct bfq_group *bfqg;
+#endif
+
+		if (bfqq)
+			bfqd = bfqq->bfqd;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		else {
+			sd = entity->my_sched_data;
+			bfqg = container_of(sd, struct bfq_group, sched_data);
+			BUG_ON(!bfqg);
+			bfqd = (struct bfq_data *)bfqg->bfqd;
+			BUG_ON(!bfqd);
+		}
+#endif
+
+		BUG_ON(old_st->wsum < entity->weight);
+		old_st->wsum -= entity->weight;
+
+		if (entity->new_weight != entity->orig_weight) {
+			if (entity->new_weight < BFQ_MIN_WEIGHT ||
+			    entity->new_weight > BFQ_MAX_WEIGHT) {
+				pr_crit("update_weight_prio: new_weight %d\n",
+					entity->new_weight);
+				if (entity->new_weight < BFQ_MIN_WEIGHT)
+					entity->new_weight = BFQ_MIN_WEIGHT;
+				else
+					entity->new_weight = BFQ_MAX_WEIGHT;
+			}
+			entity->orig_weight = entity->new_weight;
+			if (bfqq)
+				bfqq->ioprio =
+				  bfq_weight_to_ioprio(entity->orig_weight);
+		}
+
+		if (bfqq)
+			bfqq->ioprio_class = bfqq->new_ioprio_class;
+		entity->prio_changed = 0;
+
+		/*
+		 * NOTE: here we may be changing the weight too early,
+		 * this will cause unfairness.  The correct approach
+		 * would have required additional complexity to defer
+		 * weight changes to the proper time instants (i.e.,
+		 * when entity->finish <= old_st->vtime).
+		 */
+		new_st = bfq_entity_service_tree(entity);
+
+		prev_weight = entity->weight;
+		new_weight = entity->orig_weight *
+			     (bfqq ? bfqq->wr_coeff : 1);
+		/*
+		 * If the weight of the entity changes, remove the entity
+		 * from its old weight counter (if there is a counter
+		 * associated with the entity), and add it to the counter
+		 * associated with its new weight.
+		 */
+		if (prev_weight != new_weight) {
+			if (bfqq)
+				bfq_log_bfqq(bfqq->bfqd, bfqq,
+					     "weight changed %d %d(%d %d)",
+					     prev_weight, new_weight,
+					     entity->orig_weight,
+					     bfqq->wr_coeff);
+
+			root = bfqq ? &bfqd->queue_weights_tree :
+				      &bfqd->group_weights_tree;
+			bfq_weights_tree_remove(bfqd, entity, root);
+		}
+		entity->weight = new_weight;
+		/*
+		 * Add the entity to its weights tree only if it is
+		 * not associated with a weight-raised queue.
+		 */
+		if (prev_weight != new_weight &&
+		    (bfqq ? bfqq->wr_coeff == 1 : 1))
+			/* If we get here, root has been initialized. */
+			bfq_weights_tree_add(bfqd, entity, root);
+
+		new_st->wsum += entity->weight;
+
+		if (new_st != old_st)
+			entity->start = new_st->vtime;
+	}
+
+	return new_st;
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
+#endif
+
+/**
+ * bfq_bfqq_served - update the scheduler status after selection for
+ *                   service.
+ * @bfqq: the queue being served.
+ * @served: bytes to transfer.
+ *
+ * NOTE: this can be optimized, as the timestamps of upper level entities
+ * are synchronized every time a new bfqq is selected for service.  For now,
+ * we keep it to better check consistency.
+ */
+static void bfq_bfqq_served(struct bfq_queue *bfqq, int served)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	struct bfq_service_tree *st;
+
+	for_each_entity(entity) {
+		st = bfq_entity_service_tree(entity);
+
+		entity->service += served;
+
+		BUG_ON(st->wsum == 0);
+
+		st->vtime += bfq_delta(served, st->wsum);
+		bfq_forget_idle(st);
+	}
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	bfqg_stats_set_start_empty_time(bfqq_group(bfqq));
+#endif
+	st = bfq_entity_service_tree(&bfqq->entity);
+	bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p",
+		     served,  ((st->vtime>>10)*1000)>>12, st);
+}
+
+/**
+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length
+ *			  of the time interval during which bfqq has been in
+ *			  service.
+ * @bfqd: the device
+ * @bfqq: the queue that needs a service update.
+ * @time_ms: the amount of time during which the queue has received service
+ *
+ * If a queue does not consume its budget fast enough, then providing
+ * the queue with service fairness may impair throughput, more or less
+ * severely. For this reason, queues that consume their budget slowly
+ * are provided with time fairness instead of service fairness. This
+ * goal is achieved through the BFQ scheduling engine, even if such an
+ * engine works in the service, and not in the time domain. The trick
+ * is charging these queues with an inflated amount of service, equal
+ * to the amount of service that they would have received during their
+ * service slot if they had been fast, i.e., if their requests had
+ * been dispatched at a rate equal to the estimated peak rate.
+ *
+ * It is worth noting that time fairness can cause important
+ * distortions in terms of bandwidth distribution, on devices with
+ * internal queueing. The reason is that I/O requests dispatched
+ * during the service slot of a queue may be served after that service
+ * slot is finished, and may have a total processing time loosely
+ * correlated with the duration of the service slot. This is
+ * especially true for short service slots.
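+ *
+ * For instance, a queue that stays in service for half of the budget
+ * timeout without consuming its budget is charged half of
+ * bfqd->bfq_max_budget, unless it has already received more service
+ * than that.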
+ */
+static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+				 unsigned long time_ms)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+	int tot_serv_to_charge = entity->service;
+	unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout);
+
+	if (time_ms > 0 && time_ms < timeout_ms)
+		tot_serv_to_charge =
+			(bfqd->bfq_max_budget * time_ms) / timeout_ms;
+
+	if (tot_serv_to_charge < entity->service)
+		tot_serv_to_charge = entity->service;
+
+	bfq_log_bfqq(bfqq->bfqd, bfqq,
+		     "charge_time: %lu/%u ms, %d/%d/%d sectors",
+		     time_ms, timeout_ms, entity->service,
+		     tot_serv_to_charge, entity->budget);
+
+	/* Increase budget to avoid inconsistencies */
+	if (tot_serv_to_charge > entity->budget)
+		entity->budget = tot_serv_to_charge;
+
+	bfq_bfqq_served(bfqq,
+			max_t(int, 0, tot_serv_to_charge - entity->service));
+}
+
+/**
+ * __bfq_activate_entity - activate an entity.
+ * @entity: the entity being activated.
+ * @non_blocking_wait_rq: true if this entity was waiting for a request
+ *
+ * Called whenever an entity is activated, i.e., it is not active and one
+ * of its children receives a new request, or has to be reactivated due to
+ * budget exhaustion.  It uses the current budget of the entity (and
+ * the service it has received, if @entity is in service) to calculate
+ * its timestamps.
+ */
+static void __bfq_activate_entity(struct bfq_entity *entity,
+				  bool non_blocking_wait_rq)
+{
+	struct bfq_sched_data *sd = entity->sched_data;
+	struct bfq_service_tree *st = bfq_entity_service_tree(entity);
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	bool backshifted = false;
+
+	BUG_ON(!sd);
+	BUG_ON(!st);
+	if (entity == sd->in_service_entity) {
+		BUG_ON(entity->tree);
+		/*
+		 * If we are requeueing the current entity we have
+		 * to take care of not charging to it service it has
+		 * not received.
+		 */
+		bfq_calc_finish(entity, entity->service);
+		entity->start = entity->finish;
+		sd->in_service_entity = NULL;
+	} else if (entity->tree == &st->active) {
+		/*
+		 * Requeueing an entity due to a change of some
+		 * next_in_service entity below it.  We reuse the
+		 * old start time.
+		 */
+		bfq_active_extract(st, entity);
+	} else {
+		unsigned long long min_vstart;
+
+		/* See comments on bfq_bfqq_update_budg_for_activation */
+		if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) {
+			backshifted = true;
+			min_vstart = entity->finish;
+		} else
+			min_vstart = st->vtime;
+
+		if (entity->tree == &st->idle) {
+			/*
+			 * Must be on the idle tree, bfq_idle_extract() will
+			 * check for that.
+			 */
+			bfq_idle_extract(st, entity);
+			entity->start = bfq_gt(min_vstart, entity->finish) ?
+				min_vstart : entity->finish;
+		} else {
+			/*
+			 * The finish time of the entity may be invalid, and
+			 * it is in the past for sure, otherwise the queue
+			 * would have been on the idle tree.
+			 */
+			entity->start = min_vstart;
+			st->wsum += entity->weight;
+			bfq_get_entity(entity);
+
+			BUG_ON(entity->on_st);
+			entity->on_st = 1;
+		}
+	}
+
+	st = __bfq_entity_update_weight_prio(st, entity);
+	bfq_calc_finish(entity, entity->budget);
+
+	/*
+	 * If some queues enjoy backshifting for a while, then their
+	 * (virtual) finish timestamps may happen to become lower and
+	 * lower than the system virtual time.  In particular, if
+	 * these queues often happen to be idle for short time
+	 * periods, and during such time periods other queues with
+	 * higher timestamps happen to be busy, then the backshifted
+	 * timestamps of the former queues can become much lower than
+	 * the system virtual time. In fact, to serve the queues with
+	 * higher timestamps while the ones with lower timestamps are
+	 * idle, the system virtual time may be pushed-up to much
+	 * higher values than the finish timestamps of the idle
+	 * queues. As a consequence, the finish timestamps of all new
+	 * or newly activated queues may end up being much larger than
+	 * those of lucky queues with backshifted timestamps. The
+	 * latter queues may then monopolize the device for a lot of
+	 * time. This would simply break service guarantees.
+	 *
+	 * To reduce this problem, push up a little bit the
+	 * backshifted timestamps of the queue associated with this
+	 * entity (only a queue can happen to have the backshifted
+	 * flag set): just enough to let the finish timestamp of the
+	 * queue be equal to the current value of the system virtual
+	 * time. This may introduce a little unfairness among queues
+	 * with backshifted timestamps, but it does not break
+	 * worst-case fairness guarantees.
+	 *
+	 * As a special case, if bfqq is weight-raised, push up
+	 * timestamps much less, to keep very low the probability that
+	 * this push up causes the backshifted finish timestamps of
+	 * weight-raised queues to become higher than the backshifted
+	 * finish timestamps of non weight-raised queues.
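+	 *
+	 * For instance, with a weight-raising coefficient of, say, 30,
+	 * the push up applied below to a weight-raised queue is only
+	 * 1/30 of the push up applied to a non weight-raised one.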
+	 */
+	if (backshifted && bfq_gt(st->vtime, entity->finish)) {
+		unsigned long delta = st->vtime - entity->finish;
+
+		if (bfqq)
+			delta /= bfqq->wr_coeff;
+
+		entity->start += delta;
+		entity->finish += delta;
+
+		if (bfqq) {
+			bfq_log_bfqq(bfqq->bfqd, bfqq,
+				     "__activate_entity: new queue finish %llu",
+				     ((entity->finish>>10)*1000)>>12);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		} else {
+			struct bfq_group *bfqg =
+				container_of(entity, struct bfq_group, entity);
+
+			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+				     "__activate_entity: new group finish %llu",
+				     ((entity->finish>>10)*1000)>>12);
+#endif
+		}
+	}
+
+	bfq_active_insert(st, entity);
+
+	if (bfqq) {
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			"__activate_entity: queue %seligible in st %p",
+			     entity->start <= st->vtime ? "" : "non ", st);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	} else {
+		struct bfq_group *bfqg =
+			container_of(entity, struct bfq_group, entity);
+
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			"__activate_entity: group %seligible in st %p",
+			     entity->start <= st->vtime ? "" : "non ", st);
+#endif
+	}
+}
+
+/**
+ * bfq_activate_entity - activate an entity and its ancestors if necessary.
+ * @entity: the entity to activate.
+ * @non_blocking_wait_rq: true if this entity was waiting for a request
+ *
+ * Activate @entity and all the entities on the path from it to the root.
+ */
+static void bfq_activate_entity(struct bfq_entity *entity,
+				bool non_blocking_wait_rq)
+{
+	struct bfq_sched_data *sd;
+
+	for_each_entity(entity) {
+		BUG_ON(!entity);
+		__bfq_activate_entity(entity, non_blocking_wait_rq);
+
+		sd = entity->sched_data;
+		if (!bfq_update_next_in_service(sd))
+			/*
+			 * No need to propagate the activation to the
+			 * upper entities, as they will be updated when
+			 * the in-service entity is rescheduled.
+			 */
+			break;
+	}
+}
+
+/**
+ * __bfq_deactivate_entity - deactivate an entity from its service tree.
+ * @entity: the entity to deactivate.
+ * @requeue: if false, the entity will not be put into the idle tree.
+ *
+ * Deactivate an entity, independently from its previous state.  If the
+ * entity was not on a service tree just return, otherwise if it is on
+ * any scheduler tree, extract it from that tree, and if necessary
+ * and if the caller specified @requeue, put it on the idle tree.
+ *
+ * Return %1 if the caller should update the entity hierarchy, i.e.,
+ * if the entity was in service or if it was the next_in_service for
+ * its sched_data; return %0 otherwise.
+ */
+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
+{
+	struct bfq_sched_data *sd = entity->sched_data;
+	struct bfq_service_tree *st;
+	int was_in_service;
+	int ret = 0;
+
+	if (sd == NULL || !entity->on_st) /* never activated, or inactive */
+		return 0;
+
+	st = bfq_entity_service_tree(entity);
+	was_in_service = entity == sd->in_service_entity;
+
+	BUG_ON(was_in_service && entity->tree);
+
+	if (was_in_service) {
+		bfq_calc_finish(entity, entity->service);
+		sd->in_service_entity = NULL;
+	} else if (entity->tree == &st->active)
+		bfq_active_extract(st, entity);
+	else if (entity->tree == &st->idle)
+		bfq_idle_extract(st, entity);
+	else if (entity->tree)
+		BUG();
+
+	if (was_in_service || sd->next_in_service == entity)
+		ret = bfq_update_next_in_service(sd);
+
+	if (!requeue || !bfq_gt(entity->finish, st->vtime))
+		bfq_forget_entity(st, entity);
+	else
+		bfq_idle_insert(st, entity);
+
+	BUG_ON(sd->in_service_entity == entity);
+	BUG_ON(sd->next_in_service == entity);
+
+	return ret;
+}
+
+/**
+ * bfq_deactivate_entity - deactivate an entity.
+ * @entity: the entity to deactivate.
+ * @requeue: true if the entity can be put on the idle tree
+ */
+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue)
+{
+	struct bfq_sched_data *sd;
+	struct bfq_entity *parent;
+
+	for_each_entity_safe(entity, parent) {
+		sd = entity->sched_data;
+
+		if (!__bfq_deactivate_entity(entity, requeue))
+			/*
+			 * next_in_service has not been changed, so
+			 * no upwards update is needed
+			 */
+			break;
+
+		if (sd->next_in_service)
+			/*
+			 * The parent entity is still backlogged,
+			 * because next_in_service is not NULL, and
+			 * next_in_service has been updated (see
+			 * comment on the body of the above if):
+			 * upwards update of the schedule is needed.
+			 */
+			goto update;
+
+		/*
+		 * If we get here, then the parent is no more backlogged and
+		 * we want to propagate the deactivation upwards.
+		 */
+		requeue = 1;
+	}
+
+	return;
+
+update:
+	entity = parent;
+	for_each_entity(entity) {
+		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+		__bfq_activate_entity(entity, false);
+
+		sd = entity->sched_data;
+		if (bfqq)
+			bfq_log_bfqq(bfqq->bfqd, bfqq,
+				     "invoking update_next for this queue");
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		else {
+			struct bfq_group *bfqg =
+				container_of(entity,
+					     struct bfq_group, entity);
+
+			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+				     "invoking update_next for this entity");
+		}
+#endif
+		if (!bfq_update_next_in_service(sd))
+			break;
+	}
+}
+
+/**
+ * bfq_update_vtime - update vtime if necessary.
+ * @st: the service tree to act upon.
+ *
+ * If necessary update the service tree vtime to have at least one
+ * eligible entity, skipping to its start time.  Assumes that the
+ * active tree of the device is not empty.
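+ * For instance, if st->vtime is V and the smallest start time among
+ * the active entities is S > V, then the vtime jumps forward to S, so
+ * that the corresponding entity becomes eligible.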
+ *
+ * NOTE: this hierarchical implementation updates vtimes quite often,
+ * we may end up with reactivated processes getting timestamps after a
+ * vtime skip done because we needed a ->first_active entity on some
+ * intermediate node.
+ */
+static void bfq_update_vtime(struct bfq_service_tree *st)
+{
+	struct bfq_entity *entry;
+	struct rb_node *node = st->active.rb_node;
+
+	entry = rb_entry(node, struct bfq_entity, rb_node);
+	if (bfq_gt(entry->min_start, st->vtime)) {
+		struct bfq_queue *bfqq = bfq_entity_to_bfqq(entry);
+		st->vtime = entry->min_start;
+
+		if (bfqq)
+			bfq_log_bfqq(bfqq->bfqd, bfqq,
+				     "update_vtime: new vtime %llu %p",
+				     ((st->vtime>>10)*1000)>>12, st);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		else {
+			struct bfq_group *bfqg =
+				container_of(entry, struct bfq_group, entity);
+
+			bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+				     "update_vtime: new vtime %llu %p",
+				     ((st->vtime>>10)*1000)>>12, st);
+		}
+#endif
+		bfq_forget_idle(st);
+	}
+}
+
+/**
+ * bfq_first_active_entity - find the eligible entity with
+ *                           the smallest finish time
+ * @st: the service tree to select from.
+ *
+ * This function searches the first schedulable entity, starting from the
+ * root of the tree and going on the left every time on this side there is
+ * a subtree with at least one eligible (start >= vtime) entity. The path on
+ * the right is followed only if a) the left subtree contains no eligible
+ * entities and b) no eligible entity has been found yet.
+ */
+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st)
+{
+	struct bfq_entity *entry, *first = NULL;
+	struct rb_node *node = st->active.rb_node;
+
+	while (node) {
+		entry = rb_entry(node, struct bfq_entity, rb_node);
+left:
+		if (!bfq_gt(entry->start, st->vtime))
+			first = entry;
+
+		BUG_ON(bfq_gt(entry->min_start, st->vtime));
+
+		if (node->rb_left) {
+			entry = rb_entry(node->rb_left,
+					 struct bfq_entity, rb_node);
+			if (!bfq_gt(entry->min_start, st->vtime)) {
+				node = node->rb_left;
+				goto left;
+			}
+		}
+		if (first)
+			break;
+		node = node->rb_right;
+	}
+
+	BUG_ON(!first && !RB_EMPTY_ROOT(&st->active));
+	return first;
+}
+
+/**
+ * __bfq_lookup_next_entity - return the first eligible entity in @st.
+ * @st: the service tree.
+ *
+ * Update the virtual time in @st and return the first eligible entity
+ * it contains.
+ */
+static struct bfq_entity *
+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool force)
+{
+	struct bfq_entity *entity, *new_next_in_service = NULL;
+	struct bfq_queue *bfqq;
+
+	if (RB_EMPTY_ROOT(&st->active))
+		return NULL;
+
+	bfq_update_vtime(st);
+	entity = bfq_first_active_entity(st);
+	BUG_ON(bfq_gt(entity->start, st->vtime));
+
+	bfqq = bfq_entity_to_bfqq(entity);
+	if (bfqq)
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			     "__lookup_next: start %llu vtime %llu st %p",
+			     ((entity->start>>10)*1000)>>12,
+			     ((st->vtime>>10)*1000)>>12, st);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	else {
+		struct bfq_group *bfqg =
+			container_of(entity, struct bfq_group, entity);
+
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			     "__lookup_next: start %llu vtime %llu st %p",
+			     ((entity->start>>10)*1000)>>12,
+			     ((st->vtime>>10)*1000)>>12, st);
+	}
+#endif
+
+	/*
+	 * If the chosen entity does not match the sched_data's
+	 * next_in_service and we are forcibly serving the IDLE priority
+	 * class tree, bubble up the budget update.
+	 */
+	if (unlikely(force && entity != entity->sched_data->next_in_service)) {
+		new_next_in_service = entity;
+		for_each_entity(new_next_in_service)
+			bfq_update_budget(new_next_in_service);
+	}
+
+	return entity;
+}
+
+/**
+ * bfq_lookup_next_entity - return the first eligible entity in @sd.
+ * @sd: the sched_data.
+ * @extract: if true the returned entity will be also extracted from @sd.
+ *
+ * NOTE: since we cache the next_in_service entity at each level of the
+ * hierarchy, the complexity of the lookup can be decreased with
+ * absolutely no effort by just returning the cached next_in_service
+ * value; we prefer to do full lookups to test the consistency of the
+ * data structures.
+ */
+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
+						 int extract,
+						 struct bfq_data *bfqd)
+{
+	struct bfq_service_tree *st = sd->service_tree;
+	struct bfq_entity *entity;
+	int i = 0;
+
+	BUG_ON(sd->in_service_entity);
+
+	/*
+	 * Choose from idle class, if needed to guarantee a minimum
+	 * bandwidth to this class. This should also mitigate
+	 * priority-inversion problems in case a low priority task is
+	 * holding file system resources.
+	 */
+	if (bfqd &&
+	    jiffies - bfqd->bfq_class_idle_last_service >
+	    BFQ_CL_IDLE_TIMEOUT) {
+		entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1,
+						  true);
+		if (entity) {
+			struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+
+			if (bfqq)
+				bfq_log_bfqq(bfqd, bfqq,
+					     "idle chosen from st %p %d",
+					     st + BFQ_IOPRIO_CLASSES - 1,
+					BFQ_IOPRIO_CLASSES - 1);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+			else {
+				struct bfq_group *bfqg =
+				container_of(entity, struct bfq_group, entity);
+
+				bfq_log_bfqg(bfqd, bfqg,
+					     "idle chosen from st %p %d",
+					     st + BFQ_IOPRIO_CLASSES - 1,
+					BFQ_IOPRIO_CLASSES - 1);
+			}
+#endif
+			i = BFQ_IOPRIO_CLASSES - 1;
+			bfqd->bfq_class_idle_last_service = jiffies;
+			sd->next_in_service = entity;
+		}
+	}
+	for (; i < BFQ_IOPRIO_CLASSES; i++) {
+		entity = __bfq_lookup_next_entity(st + i, false);
+		if (entity) {
+			if (bfqd != NULL) {
+				struct bfq_queue *bfqq =
+					bfq_entity_to_bfqq(entity);
+
+				if (bfqq)
+					bfq_log_bfqq(bfqd, bfqq,
+						     "chosen from st %p %d",
+						     st + i, i);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+				else {
+					struct bfq_group *bfqg =
+						container_of(entity,
+							     struct bfq_group,
+							     entity);
+
+					bfq_log_bfqg(bfqd, bfqg,
+						     "chosen from st %p %d",
+						     st + i, i);
+				}
+#endif
+			}
+
+			if (extract) {
+				bfq_check_next_in_service(sd, entity);
+				bfq_active_extract(st + i, entity);
+				sd->in_service_entity = entity;
+				sd->next_in_service = NULL;
+			}
+			break;
+		}
+	}
+
+	return entity;
+}
+
+static bool next_queue_may_preempt(struct bfq_data *bfqd)
+{
+	struct bfq_sched_data *sd = &bfqd->root_group->sched_data;
+
+	return sd->next_in_service != sd->in_service_entity;
+}
+
+/*
+ * Get next queue for service.
+ */
+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd)
+{
+	struct bfq_entity *entity = NULL;
+	struct bfq_sched_data *sd;
+	struct bfq_queue *bfqq;
+
+	BUG_ON(bfqd->in_service_queue);
+
+	if (bfqd->busy_queues == 0)
+		return NULL;
+
+	sd = &bfqd->root_group->sched_data;
+	for (; sd ; sd = entity->my_sched_data) {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		if (entity) {
+			struct bfq_group *bfqg =
+				container_of(entity, struct bfq_group, entity);
+
+			bfq_log_bfqg(bfqd, bfqg,
+				     "get_next_queue: lookup in this group");
+		} else
+			bfq_log_bfqg(bfqd, bfqd->root_group,
+				     "get_next_queue: lookup in root group");
+#endif
+
+		entity = bfq_lookup_next_entity(sd, 1, bfqd);
+
+		bfqq = bfq_entity_to_bfqq(entity);
+		if (bfqq)
+			bfq_log_bfqq(bfqd, bfqq,
+			     "get_next_queue: this queue, finish %llu",
+				(((entity->finish>>10)*1000)>>10)>>2);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+		else {
+			struct bfq_group *bfqg =
+				container_of(entity, struct bfq_group, entity);
+
+			bfq_log_bfqg(bfqd, bfqg,
+			     "get_next_queue: this entity, finish %llu",
+				(((entity->finish>>10)*1000)>>10)>>2);
+		}
+#endif
+
+		BUG_ON(!entity);
+		entity->service = 0;
+	}
+
+	bfqq = bfq_entity_to_bfqq(entity);
+	BUG_ON(!bfqq);
+
+	return bfqq;
+}
+
+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
+{
+	if (bfqd->in_service_bic) {
+		put_io_context(bfqd->in_service_bic->icq.ioc);
+		bfqd->in_service_bic = NULL;
+	}
+
+	bfq_clear_bfqq_wait_request(bfqd->in_service_queue);
+	hrtimer_try_to_cancel(&bfqd->idle_slice_timer);
+	bfqd->in_service_queue = NULL;
+}
+
+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+				int requeue)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	BUG_ON(bfqq == bfqd->in_service_queue);
+	bfq_deactivate_entity(entity, requeue);
+}
+
+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	struct bfq_entity *entity = &bfqq->entity;
+
+	bfq_activate_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq));
+	bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
+}
+
+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
+
+/*
+ * Called when the bfqq no longer has requests pending, remove it from
+ * the service tree.
+ */
+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+			      int requeue)
+{
+	BUG_ON(!bfq_bfqq_busy(bfqq));
+	BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
+	BUG_ON(bfqq == bfqd->in_service_queue);
+
+	bfq_log_bfqq(bfqd, bfqq, "del from busy");
+
+	bfq_clear_bfqq_busy(bfqq);
+
+	BUG_ON(bfqd->busy_queues == 0);
+	bfqd->busy_queues--;
+
+	if (!bfqq->dispatched)
+		bfq_weights_tree_remove(bfqd, &bfqq->entity,
+					&bfqd->queue_weights_tree);
+
+	if (bfqq->wr_coeff > 1)
+		bfqd->wr_busy_queues--;
+
+	bfqg_stats_update_dequeue(bfqq_group(bfqq));
+
+	BUG_ON(bfqq->entity.budget < 0);
+
+	bfq_deactivate_bfqq(bfqd, bfqq, requeue);
+
+	BUG_ON(bfqq->entity.budget < 0);
+}
+
+/*
+ * Called when an inactive queue receives a new request.
+ */
+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+	BUG_ON(bfq_bfqq_busy(bfqq));
+	BUG_ON(bfqq == bfqd->in_service_queue);
+
+	bfq_log_bfqq(bfqd, bfqq, "add to busy");
+
+	bfq_activate_bfqq(bfqd, bfqq);
+
+	bfq_mark_bfqq_busy(bfqq);
+	bfqd->busy_queues++;
+
+	if (!bfqq->dispatched)
+		if (bfqq->wr_coeff == 1)
+			bfq_weights_tree_add(bfqd, &bfqq->entity,
+					     &bfqd->queue_weights_tree);
+
+	if (bfqq->wr_coeff > 1)
+		bfqd->wr_busy_queues++;
+}
diff --git b/block/bfq.h b/block/bfq.h
new file mode 100644
index 0000000..b80abe0
--- /dev/null
+++ b/block/bfq.h
@@ -0,0 +1,886 @@
+/*
+ * BFQ-v8r5 for 4.8.0: data structures and common function prototypes.
+ *
+ * Based on ideas and code from CFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ *		      Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org>
+ */
+
+#ifndef _BFQ_H
+#define _BFQ_H
+
+#include <linux/blktrace_api.h>
+#include <linux/hrtimer.h>
+#include <linux/ioprio.h>
+#include <linux/rbtree.h>
+#include <linux/blk-cgroup.h>
+
+#define BFQ_IOPRIO_CLASSES	3
+#define BFQ_CL_IDLE_TIMEOUT	(HZ/5)
+
+#define BFQ_MIN_WEIGHT			1
+#define BFQ_MAX_WEIGHT			1000
+#define BFQ_WEIGHT_CONVERSION_COEFF	10
+
+#define BFQ_DEFAULT_QUEUE_IOPRIO	4
+
+#define BFQ_WEIGHT_LEGACY_DFL	100
+#define BFQ_DEFAULT_GRP_IOPRIO	0
+#define BFQ_DEFAULT_GRP_CLASS	IOPRIO_CLASS_BE
+
+/*
+ * Soft real-time applications are far more latency-sensitive than
+ * interactive ones. Over-raise the weight of the former to privilege
+ * them over the latter.
+ */
+#define BFQ_SOFTRT_WEIGHT_FACTOR	100
+
+struct bfq_entity;
+
+/**
+ * struct bfq_service_tree - per ioprio_class service tree.
+ *
+ * Each service tree represents a B-WF2Q+ scheduler on its own.  Each
+ * ioprio_class has its own independent scheduler, and so its own
+ * bfq_service_tree.  All the fields are protected by the queue lock
+ * of the containing bfqd.
+ */
+struct bfq_service_tree {
+	/* tree for active entities (i.e., those backlogged) */
+	struct rb_root active;
+	/* tree for idle entities (i.e., not backlogged, with V <= F_i)*/
+	struct rb_root idle;
+
+	struct bfq_entity *first_idle;	/* idle entity with minimum F_i */
+	struct bfq_entity *last_idle;	/* idle entity with maximum F_i */
+
+	u64 vtime; /* scheduler virtual time */
+	/* scheduler weight sum; active and idle entities contribute to it */
+	unsigned long wsum;
+};
+
+/**
+ * struct bfq_sched_data - multi-class scheduler.
+ *
+ * bfq_sched_data is the basic scheduler queue.  It supports three
+ * ioprio_classes, and can be used either as a toplevel queue or as an
+ * intermediate queue on a hierarchical setup.  @next_in_service
+ * points to the active entity of the sched_data service trees that
+ * will be scheduled next. It is used to reduce the number of steps
+ * needed for each hierarchical-schedule update.
+ *
+ * The supported ioprio_classes are the same as in CFQ, in descending
+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE.
+ * Requests from higher priority queues are served before all the
+ * requests from lower priority queues; entities within the same class
+ * are scheduled according to B-WF2Q+.
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_sched_data {
+	struct bfq_entity *in_service_entity;  /* entity in service */
+	/* head-of-the-line entity in the scheduler (see comments above) */
+	struct bfq_entity *next_in_service;
+	/* array of service trees, one per ioprio_class */
+	struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES];
+};
+
+/**
+ * struct bfq_weight_counter - counter of the number of all active entities
+ *                             with a given weight.
+ */
+struct bfq_weight_counter {
+	unsigned int weight; /* weight of the entities this counter refers to */
+	unsigned int num_active; /* nr of active entities with this weight */
+	/*
+	 * Weights tree member (see bfq_data's @queue_weights_tree and
+	 * @group_weights_tree)
+	 */
+	struct rb_node weights_node;
+};
+
+/**
+ * struct bfq_entity - schedulable entity.
+ *
+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the
+ * cgroup hierarchy) or a bfq_group into the upper level scheduler.  Each
+ * entity belongs to the sched_data of the parent group in the cgroup
+ * hierarchy.  Non-leaf entities have also their own sched_data, stored
+ * in @my_sched_data.
+ *
+ * Each entity stores independently its priority values; this would
+ * allow different weights on different devices, but this
+ * functionality is not exported to userspace by now.  Priorities and
+ * weights are updated lazily, first storing the new values into the
+ * new_* fields, then setting the @prio_changed flag.  As soon as
+ * there is a transition in the entity state that allows the priority
+ * update to take place the effective and the requested priority
+ * values are synchronized.
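+ * For instance, a weight change is first recorded in @new_weight with
+ * @prio_changed set, and __bfq_entity_update_weight_prio() applies it
+ * the next time the entity goes through __bfq_activate_entity().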
+ *
+ * Unless cgroups are used, the weight value is calculated from the
+ * ioprio to export the same interface as CFQ.  When dealing with
+ * ``well-behaved'' queues (i.e., queues that do not spend too much
+ * time to consume their budget and have true sequential behavior, and
+ * when there are no external factors breaking anticipation) the
+ * relative weights at each level of the cgroups hierarchy should be
+ * guaranteed.  All the fields are protected by the queue lock of the
+ * containing bfqd.
+ */
+struct bfq_entity {
+	struct rb_node rb_node; /* service_tree member */
+	/* pointer to the weight counter associated with this entity */
+	struct bfq_weight_counter *weight_counter;
+
+	/*
+	 * flag, true if the entity is on a tree (either the active or
+	 * the idle one of its service_tree).
+	 */
+	int on_st;
+
+	u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */
+	u64 start;  /* B-WF2Q+ start timestamp (aka S_i) */
+
+	/* tree the entity is enqueued into; %NULL if not on a tree */
+	struct rb_root *tree;
+
+	/*
+	 * minimum start time of the (active) subtree rooted at this
+	 * entity; used for O(log N) lookups into active trees
+	 */
+	u64 min_start;
+
+	/* amount of service received during the last service slot */
+	int service;
+
+	/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
+	int budget;
+
+	unsigned int weight;	 /* weight of the queue */
+	unsigned int new_weight; /* next weight if a change is in progress */
+
+	/* original weight, used to implement weight boosting */
+	unsigned int orig_weight;
+
+	/* parent entity, for hierarchical scheduling */
+	struct bfq_entity *parent;
+
+	/*
+	 * For non-leaf nodes in the hierarchy, the associated
+	 * scheduler queue, %NULL on leaf nodes.
+	 */
+	struct bfq_sched_data *my_sched_data;
+	/* the scheduler queue this entity belongs to */
+	struct bfq_sched_data *sched_data;
+
+	/* flag, set to request a weight, ioprio or ioprio_class change  */
+	int prio_changed;
+};
+
+struct bfq_group;
+
+/**
+ * struct bfq_queue - leaf schedulable entity.
+ *
+ * A bfq_queue is a leaf request queue; it can be associated with one
+ * or more io_contexts, if it is async or shared between cooperating
+ * processes. @cgroup holds a reference to the cgroup, to be sure that it
+ * does not disappear while a bfqq still references it (mostly to avoid
+ * races between request issuing and task migration followed by cgroup
+ * destruction).
+ * All the fields are protected by the queue lock of the containing bfqd.
+ */
+struct bfq_queue {
+	/* reference counter */
+	int ref;
+	/* parent bfq_data */
+	struct bfq_data *bfqd;
+
+	/* current ioprio and ioprio class */
+	unsigned short ioprio, ioprio_class;
+	/* next ioprio and ioprio class if a change is in progress */
+	unsigned short new_ioprio, new_ioprio_class;
+
+	/*
+	 * Shared bfq_queue if queue is cooperating with one or more
+	 * other queues.
+	 */
+	struct bfq_queue *new_bfqq;
+	/* request-position tree member (see bfq_group's @rq_pos_tree) */
+	struct rb_node pos_node;
+	/* request-position tree root (see bfq_group's @rq_pos_tree) */
+	struct rb_root *pos_root;
+
+	/* sorted list of pending requests */
+	struct rb_root sort_list;
+	/* if fifo isn't expired, next request to serve */
+	struct request *next_rq;
+	/* number of sync and async requests queued */
+	int queued[2];
+	/* number of sync and async requests currently allocated */
+	int allocated[2];
+	/* number of pending metadata requests */
+	int meta_pending;
+	/* fifo list of requests in sort_list */
+	struct list_head fifo;
+
+	/* entity representing this queue in the scheduler */
+	struct bfq_entity entity;
+
+	/* maximum budget allowed from the feedback mechanism */
+	int max_budget;
+	/* budget expiration (in jiffies) */
+	unsigned long budget_timeout;
+
+	/* number of requests on the dispatch list or inside driver */
+	int dispatched;
+
+	unsigned int flags; /* status flags.*/
+
+	/* node for active/idle bfqq list inside parent bfqd */
+	struct list_head bfqq_list;
+
+	/* bit vector: a 1 for each seeky requests in history */
+	u32 seek_history;
+
+	/* node for the device's burst list */
+	struct hlist_node burst_list_node;
+
+	/* position of the last request enqueued */
+	sector_t last_request_pos;
+
+	/* Number of consecutive pairs of request completion and
+	 * arrival, such that the queue becomes idle after the
+	 * completion, but the next request arrives within an idle
+	 * time slice; used only if the queue's IO_bound flag has been
+	 * cleared.
+	 */
+	unsigned int requests_within_timer;
+
+	/* pid of the process owning the queue, used for logging purposes */
+	pid_t pid;
+
+	/*
+	 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL
+	 * if the queue is shared.
+	 */
+	struct bfq_io_cq *bic;
+
+	/* current maximum weight-raising time for this queue */
+	unsigned long wr_cur_max_time;
+	/*
+	 * Minimum time instant such that, only if a new request is
+	 * enqueued after this time instant in an idle @bfq_queue with
+	 * no outstanding requests, then the task associated with the
+	 * queue is deemed as soft real-time (see the comments on
+	 * the function bfq_bfqq_softrt_next_start())
+	 */
+	unsigned long soft_rt_next_start;
+	/*
+	 * Start time of the current weight-raising period if
+	 * the @bfq-queue is being weight-raised, otherwise
+	 * finish time of the last weight-raising period.
+	 */
+	unsigned long last_wr_start_finish;
+	/* factor by which the weight of this queue is multiplied */
+	unsigned int wr_coeff;
+	/*
+	 * Time of the last transition of the @bfq_queue from idle to
+	 * backlogged.
+	 */
+	unsigned long last_idle_bklogged;
+	/*
+	 * Cumulative service received from the @bfq_queue since the
+	 * last transition from idle to backlogged.
+	 */
+	unsigned long service_from_backlogged;
+	/*
+	 * Value of wr start time when switching to soft rt
+	 */
+	unsigned long wr_start_at_switch_to_srt;
+
+	unsigned long split_time; /* time of last split */
+};
+
+/**
+ * struct bfq_ttime - per process thinktime stats.
+ */
+struct bfq_ttime {
+	u64 last_end_request; /* completion time of last request */
+
+	u64 ttime_total; /* total process thinktime */
+	unsigned long ttime_samples; /* number of thinktime samples */
+	u64 ttime_mean; /* average process thinktime */
+
+};
+
+/**
+ * struct bfq_io_cq - per (request_queue, io_context) structure.
+ */
+struct bfq_io_cq {
+	/* associated io_cq structure */
+	struct io_cq icq; /* must be the first member */
+	/* array of two process queues, the sync and the async */
+	struct bfq_queue *bfqq[2];
+	/* associated @bfq_ttime struct */
+	struct bfq_ttime ttime;
+	/* per (request_queue, blkcg) ioprio */
+	int ioprio;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	uint64_t blkcg_serial_nr; /* the current blkcg serial */
+#endif
+
+	/*
+	 * Snapshot of the idle window before merging; taken to
+	 * remember this value while the queue is merged, so as to be
+	 * able to restore it in case of split.
+	 */
+	bool saved_idle_window;
+	/*
+	 * Same purpose as the previous field, for the I/O bound
+	 * classification of a queue.
+	 */
+	bool saved_IO_bound;
+
+	/*
+	 * Same purpose as the previous fields, for the flag recording
+	 * whether the queue belongs to a large burst.
+	 */
+	bool saved_in_large_burst;
+	/*
+	 * True if the queue belonged to a burst list before its merge
+	 * with another cooperating queue.
+	 */
+	bool was_in_burst_list;
+
+	/*
+	 * Similar to previous fields: save wr information.
+	 */
+	unsigned long saved_wr_coeff;
+	unsigned long saved_last_wr_start_finish;
+	unsigned long saved_wr_start_at_switch_to_srt;
+};
+
+enum bfq_device_speed {
+	BFQ_BFQD_FAST,
+	BFQ_BFQD_SLOW,
+};
+
+/**
+ * struct bfq_data - per-device data structure.
+ *
+ * All the fields are protected by the @queue lock.
+ */
+struct bfq_data {
+	/* request queue for the device */
+	struct request_queue *queue;
+
+	/* root bfq_group for the device */
+	struct bfq_group *root_group;
+
+	/*
+	 * rbtree of weight counters of @bfq_queues, sorted by
+	 * weight. Used to keep track of whether all @bfq_queues have
+	 * the same weight. The tree contains one counter for each
+	 * distinct weight associated to some active and not
+	 * weight-raised @bfq_queue (see the comments to the functions
+	 * bfq_weights_tree_[add|remove] for further details).
+	 */
+	struct rb_root queue_weights_tree;
+	/*
+	 * rbtree of non-queue @bfq_entity weight counters, sorted by
+	 * weight. Used to keep track of whether all @bfq_groups have
+	 * the same weight. The tree contains one counter for each
+	 * distinct weight associated to some active @bfq_group (see
+	 * the comments to the functions bfq_weights_tree_[add|remove]
+	 * for further details).
+	 */
+	struct rb_root group_weights_tree;
+
+	/*
+	 * Number of bfq_queues containing requests (including the
+	 * queue in service, even if it is idling).
+	 */
+	int busy_queues;
+	/* number of weight-raised busy @bfq_queues */
+	int wr_busy_queues;
+	/* number of queued requests */
+	int queued;
+	/* number of requests dispatched and waiting for completion */
+	int rq_in_driver;
+
+	/*
+	 * Maximum number of requests in driver in the last
+	 * @hw_tag_samples completed requests.
+	 */
+	int max_rq_in_driver;
+	/* number of samples used to calculate hw_tag */
+	int hw_tag_samples;
+	/* flag set to one if the driver is showing a queueing behavior */
+	int hw_tag;
+
+	/* number of budgets assigned */
+	int budgets_assigned;
+
+	/*
+	 * Timer set when idling (waiting) for the next request from
+	 * the queue in service.
+	 */
+	struct hrtimer idle_slice_timer;
+	/* delayed work to restart dispatching on the request queue */
+	struct work_struct unplug_work;
+
+	/* bfq_queue in service */
+	struct bfq_queue *in_service_queue;
+	/* bfq_io_cq (bic) associated with the @in_service_queue */
+	struct bfq_io_cq *in_service_bic;
+
+	/* on-disk position of the last served request */
+	sector_t last_position;
+
+	/* time of last request completion (ns) */
+	u64 last_completion;
+
+	/* time of first rq dispatch in current observation interval (ns) */
+	u64 first_dispatch;
+	/* time of last rq dispatch in current observation interval (ns) */
+	u64 last_dispatch;
+
+	/* beginning of the last budget */
+	ktime_t last_budget_start;
+	/* beginning of the last idle slice */
+	ktime_t last_idling_start;
+
+	/* number of samples in current observation interval */
+	int peak_rate_samples;
+	/* num of samples of seq dispatches in current observation interval */
+	u32 sequential_samples;
+	/* total num of sectors transferred in current observation interval */
+	u64 tot_sectors_dispatched;
+	/* max rq size seen during current observation interval (sectors) */
+	u32 last_rq_max_size;
+	/* time elapsed from first dispatch in current observ. interval (us) */
+	u64 delta_from_first;
+	/* current estimate of device peak rate */
+	u32 peak_rate;
+
+	/* maximum budget allotted to a bfq_queue before rescheduling */
+	int bfq_max_budget;
+
+	/* list of all the bfq_queues active on the device */
+	struct list_head active_list;
+	/* list of all the bfq_queues idle on the device */
+	struct list_head idle_list;
+
+	/*
+	 * Timeout for async/sync requests; when it fires, requests
+	 * are served in fifo order.
+	 */
+	u64 bfq_fifo_expire[2];
+	/* weight of backward seeks wrt forward ones */
+	unsigned int bfq_back_penalty;
+	/* maximum allowed backward seek */
+	unsigned int bfq_back_max;
+	/* maximum idling time */
+	u32 bfq_slice_idle;
+	/* last time CLASS_IDLE was served */
+	u64 bfq_class_idle_last_service;
+
+	/* user-configured max budget value (0 for auto-tuning) */
+	int bfq_user_max_budget;
+	/*
+	 * Timeout for bfq_queues to consume their budget; used to
+	 * prevent seeky queues from imposing long latencies to
+	 * sequential or quasi-sequential ones (this also implies that
+	 * seeky queues cannot receive guarantees in the service
+	 * domain; after a timeout they are charged for the time they
+	 * have been in service, to preserve fairness among them, but
+	 * without service-domain guarantees).
+	 */
+	unsigned int bfq_timeout;
+
+	/*
+	 * Number of consecutive requests that must be issued within
+	 * the idle time slice to re-enable idling for a queue that
+	 * was marked as non-I/O-bound (see the definition of the
+	 * IO_bound flag for further details).
+	 */
+	unsigned int bfq_requests_within_timer;
+
+	/*
+	 * Force device idling whenever needed to provide accurate
+	 * service guarantees, without caring about throughput
+	 * issues. CAVEAT: this may even increase latencies, in case
+	 * of useless idling for processes that did stop doing I/O.
+	 */
+	bool strict_guarantees;
+
+	/*
+	 * Last time at which a queue entered the current burst of
+	 * queues being activated shortly after each other; for more
+	 * details about this and the following parameters related to
+	 * a burst of activations, see the comments on the function
+	 * bfq_handle_burst.
+	 */
+	unsigned long last_ins_in_burst;
+	/*
+	 * Reference time interval used to decide whether a queue has
+	 * been activated shortly after @last_ins_in_burst.
+	 */
+	unsigned long bfq_burst_interval;
+	/* number of queues in the current burst of queue activations */
+	int burst_size;
+
+	/* common parent entity for the queues in the burst */
+	struct bfq_entity *burst_parent_entity;
+	/* Maximum burst size above which the current queue-activation
+	 * burst is deemed as 'large'.
+	 */
+	unsigned long bfq_large_burst_thresh;
+	/* true if a large queue-activation burst is in progress */
+	bool large_burst;
+	/*
+	 * Head of the burst list (as for the above fields, more
+	 * details in the comments on the function bfq_handle_burst).
+	 */
+	struct hlist_head burst_list;
+
+	/* if set to true, low-latency heuristics are enabled */
+	bool low_latency;
+	/*
+	 * Maximum factor by which the weight of a weight-raised queue
+	 * is multiplied.
+	 */
+	unsigned int bfq_wr_coeff;
+	/* maximum duration of a weight-raising period (jiffies) */
+	unsigned int bfq_wr_max_time;
+
+	/* Maximum weight-raising duration for soft real-time processes */
+	unsigned int bfq_wr_rt_max_time;
+	/*
+	 * Minimum idle period after which weight-raising may be
+	 * reactivated for a queue (in jiffies).
+	 */
+	unsigned int bfq_wr_min_idle_time;
+	/*
+	 * Minimum period between request arrivals after which
+	 * weight-raising may be reactivated for an already busy async
+	 * queue (in jiffies).
+	 */
+	unsigned long bfq_wr_min_inter_arr_async;
+
+	/* Max service-rate for a soft real-time queue, in sectors/sec */
+	unsigned int bfq_wr_max_softrt_rate;
+	/*
+	 * Cached value of the product R*T, used for computing the
+	 * maximum duration of weight raising automatically.
+	 */
+	u64 RT_prod;
+	/* device-speed class for the low-latency heuristic */
+	enum bfq_device_speed device_speed;
+
+	/* fallback dummy bfqq for extreme OOM conditions */
+	struct bfq_queue oom_bfqq;
+};
+
+enum bfqq_state_flags {
+	BFQ_BFQQ_FLAG_just_created = 0,	/* queue just allocated */
+	BFQ_BFQQ_FLAG_busy,		/* has requests or is in service */
+	BFQ_BFQQ_FLAG_wait_request,	/* waiting for a request */
+	BFQ_BFQQ_FLAG_non_blocking_wait_rq, /*
+					     * waiting for a request
+					     * without idling the device
+					     */
+	BFQ_BFQQ_FLAG_must_alloc,	/* must be allowed rq alloc */
+	BFQ_BFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
+	BFQ_BFQQ_FLAG_idle_window,	/* slice idling enabled */
+	BFQ_BFQQ_FLAG_sync,		/* synchronous queue */
+	BFQ_BFQQ_FLAG_IO_bound,		/*
+					 * bfqq has timed-out at least once
+					 * having consumed at most 2/10 of
+					 * its budget
+					 */
+	BFQ_BFQQ_FLAG_in_large_burst,	/*
+					 * bfqq activated in a large burst,
+					 * see comments to bfq_handle_burst.
+					 */
+	BFQ_BFQQ_FLAG_softrt_update,	/*
+					 * may need softrt-next-start
+					 * update
+					 */
+	BFQ_BFQQ_FLAG_coop,		/* bfqq is shared */
+	BFQ_BFQQ_FLAG_split_coop	/* shared bfqq will be split */
+};
+
+#define BFQ_BFQQ_FNS(name)						\
+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq)		\
+{									\
+	(bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name);			\
+}									\
+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq)		\
+{									\
+	(bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name);			\
+}									\
+static int bfq_bfqq_##name(const struct bfq_queue *bfqq)		\
+{									\
+	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0;	\
+}
+
+BFQ_BFQQ_FNS(just_created);
+BFQ_BFQQ_FNS(busy);
+BFQ_BFQQ_FNS(wait_request);
+BFQ_BFQQ_FNS(non_blocking_wait_rq);
+BFQ_BFQQ_FNS(must_alloc);
+BFQ_BFQQ_FNS(fifo_expire);
+BFQ_BFQQ_FNS(idle_window);
+BFQ_BFQQ_FNS(sync);
+BFQ_BFQQ_FNS(IO_bound);
+BFQ_BFQQ_FNS(in_large_burst);
+BFQ_BFQQ_FNS(coop);
+BFQ_BFQQ_FNS(split_coop);
+BFQ_BFQQ_FNS(softrt_update);
+#undef BFQ_BFQQ_FNS
+
+/* Logging facilities. */
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	assert_spin_locked((bfqd)->queue->queue_lock);			\
+	blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
+	blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \
+			  (bfqq)->pid,			  \
+			  bfq_bfqq_sync((bfqq)) ? 'S' : 'A',	\
+			  __pbuf, ##args);				\
+} while (0)
+
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)	do {			\
+	char __pbuf[128];						\
+									\
+	blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf));		\
+	blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args);	\
+} while (0)
+
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...)	\
+	blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid,	\
+			bfq_bfqq_sync((bfqq)) ? 'S' : 'A',		\
+				##args)
+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...)		do {} while (0)
+
+#endif /* CONFIG_BFQ_GROUP_IOSCHED */
+
+#define bfq_log(bfqd, fmt, args...) \
+	blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args)
+
+/* Expiration reasons. */
+enum bfqq_expiration {
+	BFQ_BFQQ_TOO_IDLE = 0,		/*
+					 * queue has been idling for
+					 * too long
+					 */
+	BFQ_BFQQ_BUDGET_TIMEOUT,	/* budget took too long to be used */
+	BFQ_BFQQ_BUDGET_EXHAUSTED,	/* budget consumed */
+	BFQ_BFQQ_NO_MORE_REQUESTS,	/* the queue has no more requests */
+	BFQ_BFQQ_PREEMPTED		/* preemption in progress */
+};
+
+
+struct bfqg_stats {
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	/* number of ios merged */
+	struct blkg_rwstat		merged;
+	/* total time spent on device in ns, may not be accurate w/ queueing */
+	struct blkg_rwstat		service_time;
+	/* total time spent waiting in scheduler queue in ns */
+	struct blkg_rwstat		wait_time;
+	/* number of IOs queued up */
+	struct blkg_rwstat		queued;
+	/* total disk time and nr sectors dispatched by this group */
+	struct blkg_stat		time;
+	/* sum of number of ios queued across all samples */
+	struct blkg_stat		avg_queue_size_sum;
+	/* count of samples taken for average */
+	struct blkg_stat		avg_queue_size_samples;
+	/* how many times this group has been removed from service tree */
+	struct blkg_stat		dequeue;
+	/* total time spent waiting for it to be assigned a timeslice. */
+	struct blkg_stat		group_wait_time;
+	/* time spent idling for this blkcg_gq */
+	struct blkg_stat		idle_time;
+	/* total time with empty current active q with other requests queued */
+	struct blkg_stat		empty_time;
+	/* fields after this shouldn't be cleared on stat reset */
+	uint64_t			start_group_wait_time;
+	uint64_t			start_idle_time;
+	uint64_t			start_empty_time;
+	uint16_t			flags;
+#endif
+};
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+/*
+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem.
+ *
+ * @pd: the struct blkcg_policy_data that this structure embeds
+ * @weight: weight of the bfq_group
+ */
+struct bfq_group_data {
+	/* must be the first member */
+	struct blkcg_policy_data pd;
+
+	unsigned int weight;
+};
+
+/**
+ * struct bfq_group - per (device, cgroup) data structure.
+ * @entity: schedulable entity to insert into the parent group sched_data.
+ * @sched_data: own sched_data, to contain child entities (they may be
+ *              both bfq_queues and bfq_groups).
+ * @bfqd: the bfq_data for the device this group acts upon.
+ * @async_bfqq: array of async queues for all the tasks belonging to
+ *              the group, one queue per ioprio value per ioprio_class,
+ *              except for the idle class that has only one queue.
+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored).
+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used
+ *             to avoid too many special cases during group creation/
+ *             migration.
+ * @active_entities: number of active entities belonging to the group;
+ *                   unused for the root group. Used to know whether there
+ *                   are groups with more than one active @bfq_entity
+ *                   (see the comments to the function
+ *                   bfq_bfqq_may_idle()).
+ * @rq_pos_tree: rbtree sorted by next_request position, used when
+ *               determining if two or more queues have interleaving
+ *               requests (see bfq_find_close_cooperator()).
+ *
+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup
+ * there is a set of bfq_groups, each one collecting the lower-level
+ * entities belonging to the group that are acting on the same device.
+ *
+ * Locking works as follows:
+ *    o @bfqd is protected by the queue lock, RCU is used to access it
+ *      from the readers.
+ *    o All the other fields are protected by the @bfqd queue lock.
+ */
+struct bfq_group {
+	/* must be the first member */
+	struct blkg_policy_data pd;
+
+	struct bfq_entity entity;
+	struct bfq_sched_data sched_data;
+
+	void *bfqd;
+
+	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+	struct bfq_queue *async_idle_bfqq;
+
+	struct bfq_entity *my_entity;
+
+	int active_entities;
+
+	struct rb_root rq_pos_tree;
+
+	struct bfqg_stats stats;
+};
+
+#else
+struct bfq_group {
+	struct bfq_sched_data sched_data;
+
+	struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
+	struct bfq_queue *async_idle_bfqq;
+
+	struct rb_root rq_pos_tree;
+};
+#endif
+
+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
+
+static struct bfq_service_tree *
+bfq_entity_service_tree(struct bfq_entity *entity)
+{
+	struct bfq_sched_data *sched_data = entity->sched_data;
+	struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+	unsigned int idx = bfqq ? bfqq->ioprio_class - 1 :
+				  BFQ_DEFAULT_GRP_CLASS - 1;
+
+	BUG_ON(idx >= BFQ_IOPRIO_CLASSES);
+	BUG_ON(sched_data == NULL);
+
+	if (bfqq)
+		bfq_log_bfqq(bfqq->bfqd, bfqq,
+			     "entity_service_tree %p %d",
+			     sched_data->service_tree + idx, idx);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+	else {
+		struct bfq_group *bfqg =
+			container_of(entity, struct bfq_group, entity);
+
+		bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg,
+			     "entity_service_tree %p %d",
+			     sched_data->service_tree + idx, idx);
+	}
+#endif
+	return sched_data->service_tree + idx;
+}
+
+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
+{
+	return bic->bfqq[is_sync];
+}
+
+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq,
+			 bool is_sync)
+{
+	bic->bfqq[is_sync] = bfqq;
+}
+
+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
+{
+	return bic->icq.q->elevator->elevator_data;
+}
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+
+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+	struct bfq_entity *group_entity = bfqq->entity.parent;
+
+	if (!group_entity)
+		group_entity = &bfqq->bfqd->root_group->entity;
+
+	return container_of(group_entity, struct bfq_group, entity);
+}
+
+#else
+
+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+{
+	return bfqq->bfqd->root_group;
+}
+
+#endif
+
+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
+static void bfq_put_queue(struct bfq_queue *bfqq);
+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
+				       struct bio *bio, bool is_sync,
+				       struct bfq_io_cq *bic);
+static void bfq_end_wr_async_queues(struct bfq_data *bfqd,
+				    struct bfq_group *bfqg);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+#endif
+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+
+#endif /* _BFQ_H */
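
The BFQ_BFQQ_FNS() block above stamps out a mark/clear/test helper triple for each per-queue flag. A standalone sketch of the same pattern (plain userspace C with a hypothetical demo_queue type, not part of the patch) shows what one expansion produces:

#include <stdio.h>

/* Hypothetical stand-in for struct bfq_queue, just to show the expansion. */
struct demo_queue {
	unsigned int flags;
};

enum demo_state_flags {
	DEMO_FLAG_busy = 0,
	DEMO_FLAG_sync,
};

/* Same pattern as BFQ_BFQQ_FNS(): one macro emits mark/clear/test helpers. */
#define DEMO_FNS(name)							\
static void demo_mark_##name(struct demo_queue *q)			\
{									\
	q->flags |= (1 << DEMO_FLAG_##name);				\
}									\
static void demo_clear_##name(struct demo_queue *q)			\
{									\
	q->flags &= ~(1 << DEMO_FLAG_##name);				\
}									\
static int demo_##name(const struct demo_queue *q)			\
{									\
	return (q->flags & (1 << DEMO_FLAG_##name)) != 0;		\
}

DEMO_FNS(busy)
DEMO_FNS(sync)
#undef DEMO_FNS

int main(void)
{
	struct demo_queue q = { .flags = 0 };

	demo_mark_busy(&q);
	demo_mark_sync(&q);
	printf("busy=%d sync=%d\n", demo_busy(&q), demo_sync(&q));	/* 1 1 */
	demo_clear_busy(&q);
	demo_clear_sync(&q);
	printf("busy=%d sync=%d\n", demo_busy(&q), demo_sync(&q));	/* 0 0 */
	return 0;
}
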
diff --git a/block/blk-core.c b/block/blk-core.c
index 36c7ac3..cdcb188 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -33,6 +33,7 @@
 #include <linux/ratelimit.h>
 #include <linux/pm_runtime.h>
 #include <linux/blk-cgroup.h>
+#include <linux/wbt.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -882,6 +883,8 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
 
 fail:
 	blk_free_flush_queue(q->fq);
+	wbt_exit(q->rq_wb);
+	q->rq_wb = NULL;
 	return NULL;
 }
 EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1346,6 +1349,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
 	trace_block_rq_requeue(q, rq);
+	wbt_requeue(q->rq_wb, &rq->wb_stat);
 
 	if (rq->cmd_flags & REQ_QUEUED)
 		blk_queue_end_tag(q, rq);
@@ -1436,6 +1440,8 @@ void __blk_put_request(struct request_queue *q, struct request *req)
 	/* this is a bio leak */
 	WARN_ON(req->bio != NULL);
 
+	wbt_done(q->rq_wb, &req->wb_stat);
+
 	/*
 	 * Request may not have originated from ll_rw_blk. if not,
 	 * it didn't come out of our reserved rq pools
@@ -1667,6 +1673,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 	int el_ret, rw_flags = 0, where = ELEVATOR_INSERT_SORT;
 	struct request *req;
 	unsigned int request_count = 0;
+	unsigned int wb_acct;
 
 	/*
 	 * low level driver can indicate that it wants pages above a
@@ -1719,6 +1726,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
 	}
 
 get_rq:
+	wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, q->queue_lock);
+
 	/*
 	 * This sync check and mask will be re-done in init_request_from_bio(),
 	 * but we need to set it earlier to expose the sync flag to the
@@ -1738,11 +1747,14 @@ get_rq:
 	 */
 	req = get_request(q, bio_data_dir(bio), rw_flags, bio, GFP_NOIO);
 	if (IS_ERR(req)) {
+		__wbt_done(q->rq_wb, wb_acct);
 		bio->bi_error = PTR_ERR(req);
 		bio_endio(bio);
 		goto out_unlock;
 	}
 
+	wbt_track(&req->wb_stat, wb_acct);
+
 	/*
 	 * After dropping the lock and possibly sleeping here, our request
 	 * may now be mergeable after it had proven unmergeable (above).
@@ -2475,6 +2487,8 @@ void blk_start_request(struct request *req)
 {
 	blk_dequeue_request(req);
 
+	wbt_issue(req->q->rq_wb, &req->wb_stat);
+
 	/*
 	 * We are now handing the request to the hardware, initialize
 	 * resid_len to full count and add the timeout handler.
@@ -2542,6 +2556,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 
 	trace_block_rq_complete(req->q, req, nr_bytes);
 
+	blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req);
+
 	if (!req->bio)
 		return false;
 
@@ -2709,9 +2725,10 @@ void blk_finish_request(struct request *req, int error)
 
 	blk_account_io_done(req);
 
-	if (req->end_io)
+	if (req->end_io) {
+		wbt_done(req->q->rq_wb, &req->wb_stat);
 		req->end_io(req, error);
-	else {
+	} else {
 		if (blk_bidi_rq(req))
 			__blk_put_request(req->next_rq->q, req->next_rq);
 
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index fe822aa..b66bbf1 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -247,6 +247,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
 	return ret;
 }
 
+static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
+{
+	struct blk_mq_ctx *ctx;
+	unsigned int i;
+
+	hctx_for_each_ctx(hctx, ctx, i) {
+		blk_stat_init(&ctx->stat[0]);
+		blk_stat_init(&ctx->stat[1]);
+	}
+}
+
+static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
+					  const char *page, size_t count)
+{
+	blk_mq_stat_clear(hctx);
+	return count;
+}
+
+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
+{
+	return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
+			pre, (long long) stat->nr_samples,
+			(long long) stat->mean, (long long) stat->min,
+			(long long) stat->max);
+}
+
+static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+	struct blk_rq_stat stat[2];
+	ssize_t ret;
+
+	blk_stat_init(&stat[0]);
+	blk_stat_init(&stat[1]);
+
+	blk_hctx_stat_get(hctx, stat);
+
+	ret = print_stat(page, &stat[0], "read :");
+	ret += print_stat(page + ret, &stat[1], "write:");
+	return ret;
+}
+
 static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
 	.attr = {.name = "dispatched", .mode = S_IRUGO },
 	.show = blk_mq_sysfs_dispatched_show,
@@ -304,6 +345,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
 	.attr = {.name = "io_poll", .mode = S_IRUGO },
 	.show = blk_mq_hw_sysfs_poll_show,
 };
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
+	.attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
+	.show = blk_mq_hw_sysfs_stat_show,
+	.store = blk_mq_hw_sysfs_stat_store,
+};
 
 static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_queued.attr,
@@ -314,6 +360,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
 	&blk_mq_hw_sysfs_cpus.attr,
 	&blk_mq_hw_sysfs_active.attr,
 	&blk_mq_hw_sysfs_poll.attr,
+	&blk_mq_hw_sysfs_stat.attr,
 	NULL,
 };
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c207fa9..815e2ac 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -22,6 +22,7 @@
 #include <linux/sched/sysctl.h>
 #include <linux/delay.h>
 #include <linux/crash_dump.h>
+#include <linux/wbt.h>
 
 #include <trace/events/block.h>
 
@@ -29,6 +30,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-tag.h"
+#include "blk-stat.h"
 
 static DEFINE_MUTEX(all_q_mutex);
 static LIST_HEAD(all_q_list);
@@ -330,6 +332,8 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
 
 	if (rq->cmd_flags & REQ_MQ_INFLIGHT)
 		atomic_dec(&hctx->nr_active);
+
+	wbt_done(q->rq_wb, &rq->wb_stat);
 	rq->cmd_flags = 0;
 
 	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
@@ -362,6 +366,7 @@ inline void __blk_mq_end_request(struct request *rq, int error)
 	blk_account_io_done(rq);
 
 	if (rq->end_io) {
+		wbt_done(rq->q->rq_wb, &rq->wb_stat);
 		rq->end_io(rq, error);
 	} else {
 		if (unlikely(blk_bidi_rq(rq)))
@@ -412,10 +417,19 @@ static void blk_mq_ipi_complete_request(struct request *rq)
 	put_cpu();
 }
 
+static void blk_mq_stat_add(struct request *rq)
+{
+	struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)];
+
+	blk_stat_add(stat, rq);
+}
+
 static void __blk_mq_complete_request(struct request *rq)
 {
 	struct request_queue *q = rq->q;
 
+	blk_mq_stat_add(rq);
+
 	if (!q->softirq_done_fn)
 		blk_mq_end_request(rq, rq->errors);
 	else
@@ -459,6 +473,8 @@ void blk_mq_start_request(struct request *rq)
 	if (unlikely(blk_bidi_rq(rq)))
 		rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
 
+	wbt_issue(q->rq_wb, &rq->wb_stat);
+
 	blk_add_timer(rq);
 
 	/*
@@ -494,6 +510,7 @@ static void __blk_mq_requeue_request(struct request *rq)
 	struct request_queue *q = rq->q;
 
 	trace_block_rq_requeue(q, rq);
+	wbt_requeue(q->rq_wb, &rq->wb_stat);
 
 	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
 		if (q->dma_drain_size && blk_rq_bytes(rq))
@@ -1312,6 +1329,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	struct blk_plug *plug;
 	struct request *same_queue_rq = NULL;
 	blk_qc_t cookie;
+	unsigned int wb_acct;
 
 	blk_queue_bounce(q, &bio);
 
@@ -1326,9 +1344,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
 	    blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq))
 		return BLK_QC_T_NONE;
 
+	wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, NULL);
+
 	rq = blk_mq_map_request(q, bio, &data);
-	if (unlikely(!rq))
+	if (unlikely(!rq)) {
+		__wbt_done(q->rq_wb, wb_acct);
 		return BLK_QC_T_NONE;
+	}
+
+	wbt_track(&rq->wb_stat, wb_acct);
 
 	cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
 
@@ -1405,6 +1429,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	struct blk_map_ctx data;
 	struct request *rq;
 	blk_qc_t cookie;
+	unsigned int wb_acct;
 
 	blk_queue_bounce(q, &bio);
 
@@ -1421,9 +1446,15 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
 	} else
 		request_count = blk_plug_queued_count(q);
 
+	wb_acct = wbt_wait(q->rq_wb, bio->bi_opf, NULL);
+
 	rq = blk_mq_map_request(q, bio, &data);
-	if (unlikely(!rq))
+	if (unlikely(!rq)) {
+		__wbt_done(q->rq_wb, wb_acct);
 		return BLK_QC_T_NONE;
+	}
+
+	wbt_track(&rq->wb_stat, wb_acct);
 
 	cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);
 
@@ -1807,6 +1838,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
 		spin_lock_init(&__ctx->lock);
 		INIT_LIST_HEAD(&__ctx->rq_list);
 		__ctx->queue = q;
+		blk_stat_init(&__ctx->stat[0]);
+		blk_stat_init(&__ctx->stat[1]);
 
 		/* If the cpu isn't online, the cpu is mapped to first hctx */
 		if (!cpu_online(i))
@@ -2145,6 +2178,9 @@ void blk_mq_free_queue(struct request_queue *q)
 	list_del_init(&q->all_q_node);
 	mutex_unlock(&all_q_mutex);
 
+	wbt_exit(q->rq_wb);
+	q->rq_wb = NULL;
+
 	blk_mq_del_queue_tag_set(q);
 
 	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
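
The wbt hooks threaded through blk-core.c and blk-mq.c above follow one accounting lifecycle per request: wbt_wait() throttles before allocation and returns a token, wbt_track() attaches that token to the request (or __wbt_done() releases it when allocation fails), wbt_issue()/wbt_requeue() bracket dispatch, and wbt_done() drops the accounting at completion. The following userspace sketch uses stand-in demo_* types and names, not the kernel wbt API, and only illustrates the allocate/track/complete ordering:

#include <stdio.h>

/* Stand-ins, not the kernel's struct rq_wb / wbt_* API. */
struct demo_wb {
	int inflight;			/* writes currently accounted */
};

struct demo_request {
	unsigned int wb_acct;		/* token from demo_wbt_wait() */
};

/* Throttle point before request allocation (kernel: wbt_wait()). */
static unsigned int demo_wbt_wait(struct demo_wb *wb)
{
	wb->inflight++;
	return 1;			/* nonzero token: this I/O is tracked */
}

/* Release a bare token when no request was allocated (kernel: __wbt_done()). */
static void demo_wbt_done_token(struct demo_wb *wb, unsigned int acct)
{
	if (acct)
		wb->inflight--;
}

/* Attach the token to the request (kernel: wbt_track()). */
static void demo_wbt_track(struct demo_request *rq, unsigned int acct)
{
	rq->wb_acct = acct;
}

/* Completion releases whatever the request carried (kernel: wbt_done()). */
static void demo_wbt_done(struct demo_wb *wb, struct demo_request *rq)
{
	demo_wbt_done_token(wb, rq->wb_acct);
	rq->wb_acct = 0;
}

int main(void)
{
	struct demo_wb wb = { 0 };
	struct demo_request rq = { 0 };
	unsigned int acct;
	int alloc_fails = 0;		/* flip to 1 to exercise the error path */

	acct = demo_wbt_wait(&wb);
	if (alloc_fails) {
		demo_wbt_done_token(&wb, acct);	/* mirrors the IS_ERR(req) path */
	} else {
		demo_wbt_track(&rq, acct);
		demo_wbt_done(&wb, &rq);	/* normal completion */
	}
	printf("inflight after completion: %d\n", wb.inflight);
	return 0;
}
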
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9087b11..e107f70 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -1,6 +1,8 @@
 #ifndef INT_BLK_MQ_H
 #define INT_BLK_MQ_H
 
+#include "blk-stat.h"
+
 struct blk_mq_tag_set;
 
 struct blk_mq_ctx {
@@ -20,6 +22,7 @@ struct blk_mq_ctx {
 
 	/* incremented at completion time */
 	unsigned long		____cacheline_aligned_in_smp rq_completed[2];
+	struct blk_rq_stat	stat[2];
 
 	struct request_queue	*queue;
 	struct kobject		kobj;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index f679ae1..746dc9f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -832,6 +832,19 @@ void blk_queue_flush_queueable(struct request_queue *q, bool queueable)
 EXPORT_SYMBOL_GPL(blk_queue_flush_queueable);
 
 /**
+ * blk_set_queue_depth - tell the block layer about the device queue depth
+ * @q:		the request queue for the device
+ * @depth:		queue depth
+ *
+ */
+void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
+{
+	q->queue_depth = depth;
+	wbt_set_queue_depth(q->rq_wb, depth);
+}
+EXPORT_SYMBOL(blk_set_queue_depth);
+
+/**
  * blk_queue_write_cache - configure queue's write cache
  * @q:		the request queue for the device
  * @wc:		write back cache on or off
@@ -851,6 +864,8 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
 	else
 		queue_flag_clear(QUEUE_FLAG_FUA, q);
 	spin_unlock_irq(q->queue_lock);
+
+	wbt_set_write_cache(q->rq_wb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
 }
 EXPORT_SYMBOL_GPL(blk_queue_write_cache);
 
diff --git b/block/blk-stat.c b/block/blk-stat.c
new file mode 100644
index 0000000..bdb16d8
--- /dev/null
+++ b/block/blk-stat.c
@@ -0,0 +1,221 @@
+/*
+ * Block stat tracking code
+ *
+ * Copyright (C) 2016 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/blk-mq.h>
+
+#include "blk-stat.h"
+#include "blk-mq.h"
+
+static void blk_stat_flush_batch(struct blk_rq_stat *stat)
+{
+	if (!stat->nr_batch)
+		return;
+	if (!stat->nr_samples)
+		stat->mean = div64_s64(stat->batch, stat->nr_batch);
+	else {
+		stat->mean = div64_s64((stat->mean * stat->nr_samples) +
+					stat->batch,
+					stat->nr_samples + stat->nr_batch);
+	}
+
+	stat->nr_samples += stat->nr_batch;
+	stat->nr_batch = stat->batch = 0;
+}
+
+void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
+{
+	if (!src->nr_samples)
+		return;
+
+	blk_stat_flush_batch(src);
+
+	dst->min = min(dst->min, src->min);
+	dst->max = max(dst->max, src->max);
+
+	if (!dst->nr_samples)
+		dst->mean = src->mean;
+	else {
+		dst->mean = div64_s64((src->mean * src->nr_samples) +
+					(dst->mean * dst->nr_samples),
+					dst->nr_samples + src->nr_samples);
+	}
+	dst->nr_samples += src->nr_samples;
+}
+
+static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+{
+	struct blk_mq_hw_ctx *hctx;
+	struct blk_mq_ctx *ctx;
+	uint64_t latest = 0;
+	int i, j, nr;
+
+	blk_stat_init(&dst[0]);
+	blk_stat_init(&dst[1]);
+
+	nr = 0;
+	do {
+		uint64_t newest = 0;
+
+		queue_for_each_hw_ctx(q, hctx, i) {
+			hctx_for_each_ctx(hctx, ctx, j) {
+				if (!ctx->stat[0].nr_samples &&
+				    !ctx->stat[1].nr_samples)
+					continue;
+				if (ctx->stat[0].time > newest)
+					newest = ctx->stat[0].time;
+				if (ctx->stat[1].time > newest)
+					newest = ctx->stat[1].time;
+			}
+		}
+
+		/*
+		 * No samples
+		 */
+		if (!newest)
+			break;
+
+		if (newest > latest)
+			latest = newest;
+
+		queue_for_each_hw_ctx(q, hctx, i) {
+			hctx_for_each_ctx(hctx, ctx, j) {
+				if (ctx->stat[0].time == newest) {
+					blk_stat_sum(&dst[0], &ctx->stat[0]);
+					nr++;
+				}
+				if (ctx->stat[1].time == newest) {
+					blk_stat_sum(&dst[1], &ctx->stat[1]);
+					nr++;
+				}
+			}
+		}
+		/*
+		 * If we race on finding an entry, just loop back again.
+		 * Should be very rare.
+		 */
+	} while (!nr);
+
+	dst[0].time = dst[1].time = latest;
+}
+
+void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+{
+	if (q->mq_ops)
+		blk_mq_stat_get(q, dst);
+	else {
+		memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat));
+		memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat));
+	}
+}
+
+void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
+{
+	struct blk_mq_ctx *ctx;
+	unsigned int i, nr;
+
+	nr = 0;
+	do {
+		uint64_t newest = 0;
+
+		hctx_for_each_ctx(hctx, ctx, i) {
+			if (!ctx->stat[0].nr_samples &&
+			    !ctx->stat[1].nr_samples)
+				continue;
+
+			if (ctx->stat[0].time > newest)
+				newest = ctx->stat[0].time;
+			if (ctx->stat[1].time > newest)
+				newest = ctx->stat[1].time;
+		}
+
+		if (!newest)
+			break;
+
+		hctx_for_each_ctx(hctx, ctx, i) {
+			if (ctx->stat[0].time == newest) {
+				blk_stat_sum(&dst[0], &ctx->stat[0]);
+				nr++;
+			}
+			if (ctx->stat[1].time == newest) {
+				blk_stat_sum(&dst[1], &ctx->stat[1]);
+				nr++;
+			}
+		}
+		/*
+		 * If we race on finding an entry, just loop back again.
+		 * Should be very rare, as the window is only updated
+		 * occasionally
+		 */
+	} while (!nr);
+}
+
+static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
+{
+	stat->min = -1ULL;
+	stat->max = stat->nr_samples = stat->mean = 0;
+	stat->batch = stat->nr_batch = 0;
+	stat->time = time_now & BLK_STAT_MASK;
+}
+
+void blk_stat_init(struct blk_rq_stat *stat)
+{
+	__blk_stat_init(stat, ktime_to_ns(ktime_get()));
+}
+
+static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
+{
+	return (now & BLK_STAT_MASK) == (stat->time & BLK_STAT_MASK);
+}
+
+bool blk_stat_is_current(struct blk_rq_stat *stat)
+{
+	return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
+}
+
+void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
+{
+	s64 now, value;
+	u64 rq_time = wbt_issue_stat_get_time(&rq->wb_stat);
+
+	now = ktime_to_ns(ktime_get());
+	if (now < rq_time)
+		return;
+
+	if (!__blk_stat_is_current(stat, now))
+		__blk_stat_init(stat, now);
+
+	value = now - rq_time;
+	if (value > stat->max)
+		stat->max = value;
+	if (value < stat->min)
+		stat->min = value;
+
+	if (stat->batch + value < stat->batch ||
+	    stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
+		blk_stat_flush_batch(stat);
+
+	stat->batch += value;
+	stat->nr_batch++;
+}
+
+void blk_stat_clear(struct request_queue *q)
+{
+	if (q->mq_ops) {
+		struct blk_mq_hw_ctx *hctx;
+		struct blk_mq_ctx *ctx;
+		int i, j;
+
+		queue_for_each_hw_ctx(q, hctx, i) {
+			hctx_for_each_ctx(hctx, ctx, j) {
+				blk_stat_init(&ctx->stat[0]);
+				blk_stat_init(&ctx->stat[1]);
+			}
+		}
+	} else {
+		blk_stat_init(&q->rq_stats[0]);
+		blk_stat_init(&q->rq_stats[1]);
+	}
+}
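
blk_stat_sum() above merges two stat windows by recomputing a sample-weighted mean rather than averaging the two means directly. A small standalone check of that arithmetic (plain C, outside the kernel; the sample values are made up):

#include <stdio.h>
#include <stdint.h>

struct demo_stat {
	int64_t mean;
	int64_t nr_samples;
	int64_t min, max;
};

/* Same weighted-mean merge as blk_stat_sum(), minus the batching. */
static void demo_stat_sum(struct demo_stat *dst, const struct demo_stat *src)
{
	if (!src->nr_samples)
		return;

	dst->min = dst->min < src->min ? dst->min : src->min;
	dst->max = dst->max > src->max ? dst->max : src->max;

	if (!dst->nr_samples)
		dst->mean = src->mean;
	else
		dst->mean = (src->mean * src->nr_samples +
			     dst->mean * dst->nr_samples) /
			    (dst->nr_samples + src->nr_samples);
	dst->nr_samples += src->nr_samples;
}

int main(void)
{
	/* 100 samples averaging 2000ns merged with 300 samples at 4000ns. */
	struct demo_stat a = { .mean = 2000, .nr_samples = 100, .min = 500, .max = 9000 };
	struct demo_stat b = { .mean = 4000, .nr_samples = 300, .min = 800, .max = 20000 };

	demo_stat_sum(&a, &b);
	/* Weighted mean is 3500, not the naive midpoint 3000. */
	printf("mean=%lld samples=%lld min=%lld max=%lld\n",
	       (long long)a.mean, (long long)a.nr_samples,
	       (long long)a.min, (long long)a.max);
	return 0;
}
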
diff --git b/block/blk-stat.h b/block/blk-stat.h
new file mode 100644
index 0000000..376a6cc
--- /dev/null
+++ b/block/blk-stat.h
@@ -0,0 +1,18 @@
+#ifndef BLK_STAT_H
+#define BLK_STAT_H
+
+/*
+ * ~0.13s window as a power-of-2 (2^27 nsecs)
+ */
+#define BLK_STAT_NSEC	134217728ULL
+#define BLK_STAT_MASK	~(BLK_STAT_NSEC - 1)
+
+void blk_stat_add(struct blk_rq_stat *, struct request *);
+void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
+void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
+void blk_stat_clear(struct request_queue *q);
+void blk_stat_init(struct blk_rq_stat *);
+void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
+bool blk_stat_is_current(struct blk_rq_stat *);
+
+#endif
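
BLK_STAT_NSEC/BLK_STAT_MASK above bucket completion timestamps into ~134 ms power-of-two windows, and blk_stat_is_current() compares the masked values. A standalone illustration of that masking (timestamps here are arbitrary, not taken from a real trace):

#include <stdio.h>
#include <stdint.h>

#define DEMO_STAT_NSEC	134217728ULL		/* 2^27 ns, ~0.134 s window */
#define DEMO_STAT_MASK	(~(DEMO_STAT_NSEC - 1))

/* Two timestamps belong to the same window iff their masked values match. */
static int demo_same_window(uint64_t a, uint64_t b)
{
	return (a & DEMO_STAT_MASK) == (b & DEMO_STAT_MASK);
}

int main(void)
{
	uint64_t t0 = 1000000000ULL;		/* 1.0 s, arbitrary */
	uint64_t t1 = t0 + 50000000ULL;		/* +50 ms: same window here */
	uint64_t t2 = t0 + 300000000ULL;	/* +300 ms: different window */

	printf("t0/t1 same window: %d\n", demo_same_window(t0, t1));	/* 1 */
	printf("t0/t2 same window: %d\n", demo_same_window(t0, t2));	/* 0 */
	return 0;
}
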
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index f87a7e7..85c3dc2 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -10,6 +10,7 @@
 #include <linux/blktrace_api.h>
 #include <linux/blk-mq.h>
 #include <linux/blk-cgroup.h>
+#include <linux/wbt.h>
 
 #include "blk.h"
 #include "blk-mq.h"
@@ -41,6 +42,19 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
 	return count;
 }
 
+static ssize_t queue_var_store64(u64 *var, const char *page)
+{
+	int err;
+	u64 v;
+
+	err = kstrtou64(page, 10, &v);
+	if (err < 0)
+		return err;
+
+	*var = v;
+	return 0;
+}
+
 static ssize_t queue_requests_show(struct request_queue *q, char *page)
 {
 	return queue_var_show(q->nr_requests, (page));
@@ -347,6 +361,58 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
 	return ret;
 }
 
+static ssize_t queue_wb_win_show(struct request_queue *q, char *page)
+{
+	if (!q->rq_wb)
+		return -EINVAL;
+
+	return sprintf(page, "%llu\n", div_u64(q->rq_wb->win_nsec, 1000));
+}
+
+static ssize_t queue_wb_win_store(struct request_queue *q, const char *page,
+				  size_t count)
+{
+	ssize_t ret;
+	u64 val;
+
+	if (!q->rq_wb)
+		return -EINVAL;
+
+	ret = queue_var_store64(&val, page);
+	if (ret < 0)
+		return ret;
+
+	q->rq_wb->win_nsec = val * 1000ULL;
+	wbt_update_limits(q->rq_wb);
+	return count;
+}
+
+static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
+{
+	if (!q->rq_wb)
+		return -EINVAL;
+
+	return sprintf(page, "%llu\n", div_u64(q->rq_wb->min_lat_nsec, 1000));
+}
+
+static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
+				  size_t count)
+{
+	ssize_t ret;
+	u64 val;
+
+	if (!q->rq_wb)
+		return -EINVAL;
+
+	ret = queue_var_store64(&val, page);
+	if (ret < 0)
+		return ret;
+
+	q->rq_wb->min_lat_nsec = val * 1000ULL;
+	wbt_update_limits(q->rq_wb);
+	return count;
+}
+
 static ssize_t queue_wc_show(struct request_queue *q, char *page)
 {
 	if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
@@ -384,6 +450,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
 	return queue_var_show(blk_queue_dax(q), page);
 }
 
+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
+{
+	return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
+			pre, (long long) stat->nr_samples,
+			(long long) stat->mean, (long long) stat->min,
+			(long long) stat->max);
+}
+
+static ssize_t queue_stats_show(struct request_queue *q, char *page)
+{
+	struct blk_rq_stat stat[2];
+	ssize_t ret;
+
+	blk_queue_stat_get(q, stat);
+
+	ret = print_stat(page, &stat[0], "read :");
+	ret += print_stat(page + ret, &stat[1], "write:");
+	return ret;
+}
+
 static struct queue_sysfs_entry queue_requests_entry = {
 	.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_requests_show,
@@ -526,6 +612,23 @@ static struct queue_sysfs_entry queue_dax_entry = {
 	.show = queue_dax_show,
 };
 
+static struct queue_sysfs_entry queue_stats_entry = {
+	.attr = {.name = "stats", .mode = S_IRUGO },
+	.show = queue_stats_show,
+};
+
+static struct queue_sysfs_entry queue_wb_lat_entry = {
+	.attr = {.name = "wbt_lat_usec", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_wb_lat_show,
+	.store = queue_wb_lat_store,
+};
+
+static struct queue_sysfs_entry queue_wb_win_entry = {
+	.attr = {.name = "wbt_window_usec", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_wb_win_show,
+	.store = queue_wb_win_store,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -553,6 +656,9 @@ static struct attribute *default_attrs[] = {
 	&queue_poll_entry.attr,
 	&queue_wc_entry.attr,
 	&queue_dax_entry.attr,
+	&queue_stats_entry.attr,
+	&queue_wb_lat_entry.attr,
+	&queue_wb_win_entry.attr,
 	NULL,
 };
 
@@ -667,6 +773,49 @@ struct kobj_type blk_queue_ktype = {
 	.release	= blk_release_queue,
 };
 
+static void blk_wb_stat_get(void *data, struct blk_rq_stat *stat)
+{
+	blk_queue_stat_get(data, stat);
+}
+
+static void blk_wb_stat_clear(void *data)
+{
+	blk_stat_clear(data);
+}
+
+static bool blk_wb_stat_is_current(struct blk_rq_stat *stat)
+{
+	return blk_stat_is_current(stat);
+}
+
+static struct wb_stat_ops wb_stat_ops = {
+	.get		= blk_wb_stat_get,
+	.is_current	= blk_wb_stat_is_current,
+	.clear		= blk_wb_stat_clear,
+};
+
+static void blk_wb_init(struct request_queue *q)
+{
+	struct rq_wb *rwb;
+
+	rwb = wbt_init(&q->backing_dev_info, &wb_stat_ops, q);
+
+	/*
+	 * If this fails, we don't get throttling
+	 */
+	if (IS_ERR(rwb))
+		return;
+
+	if (blk_queue_nonrot(q))
+		rwb->min_lat_nsec = 2000000ULL;
+	else
+		rwb->min_lat_nsec = 75000000ULL;
+
+	wbt_set_queue_depth(rwb, blk_queue_depth(q));
+	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+	q->rq_wb = rwb;
+}
+
 int blk_register_queue(struct gendisk *disk)
 {
 	int ret;
@@ -706,6 +855,8 @@ int blk_register_queue(struct gendisk *disk)
 	if (q->mq_ops)
 		blk_mq_register_disk(disk);
 
+	blk_wb_init(q);
+
 	if (!q->request_fn)
 		return 0;
 
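
queue_wb_lat_store()/queue_wb_lat_show() above expose min_lat_nsec in microseconds, multiplying by 1000 on write and dividing on read, and blk_wb_init() defaults the target to 2 ms on non-rotational queues and 75 ms otherwise. A tiny standalone sketch of that round trip (values only, not the sysfs plumbing):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* sysfs store: user writes microseconds, the kernel keeps nanoseconds. */
static uint64_t demo_wb_lat_store(const char *page)
{
	return strtoull(page, NULL, 10) * 1000ULL;
}

/* sysfs show: convert back to microseconds for display. */
static uint64_t demo_wb_lat_show(uint64_t min_lat_nsec)
{
	return min_lat_nsec / 1000ULL;
}

int main(void)
{
	/* Defaults picked by blk_wb_init(). */
	uint64_t nonrot_nsec = 2000000ULL;	/* 2 ms, non-rotational */
	uint64_t rot_nsec = 75000000ULL;	/* 75 ms, rotational */

	printf("nonrot default: %llu usec\n",
	       (unsigned long long)demo_wb_lat_show(nonrot_nsec));
	printf("rot default:    %llu usec\n",
	       (unsigned long long)demo_wb_lat_show(rot_nsec));
	printf("writing \"10000\" stores %llu nsec\n",
	       (unsigned long long)demo_wb_lat_store("10000"));
	return 0;
}
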
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 5e24d88..f336dcb 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3771,9 +3771,11 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *cfqq;
 	uint64_t serial_nr;
+	bool nonroot_cg;
 
 	rcu_read_lock();
 	serial_nr = bio_blkcg(bio)->css.serial_nr;
+	nonroot_cg = bio_blkcg(bio) != &blkcg_root;
 	rcu_read_unlock();
 
 	/*
@@ -3784,6 +3786,17 @@ static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 		return;
 
 	/*
+	 * If we have a non-root cgroup, we can depend on that to
+	 * do proper throttling of writes. Turn off wbt for that
+	 * case.
+	 */
+	if (nonroot_cg) {
+		struct request_queue *q = cfqd->queue;
+
+		wbt_disable(q->rq_wb);
+	}
+
+	/*
 	 * Drop reference to queues.  New queues will be assigned in new
 	 * group upon arrival of fresh requests.
 	 */
diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c
index 2b38c1b..7a2e4d4 100644
--- a/drivers/acpi/sleep.c
+++ b/drivers/acpi/sleep.c
@@ -47,32 +47,15 @@ static void acpi_sleep_tts_switch(u32 acpi_state)
 	}
 }
 
-static void acpi_sleep_pts_switch(u32 acpi_state)
-{
-	acpi_status status;
-
-	status = acpi_execute_simple_method(NULL, "\\_PTS", acpi_state);
-	if (ACPI_FAILURE(status) && status != AE_NOT_FOUND) {
-		/*
-		 * OS can't evaluate the _PTS object correctly. Some warning
-		 * message will be printed. But it won't break anything.
-		 */
-		printk(KERN_NOTICE "Failure in evaluating _PTS object\n");
-	}
-}
-
-static int sleep_notify_reboot(struct notifier_block *this,
+static int tts_notify_reboot(struct notifier_block *this,
 			unsigned long code, void *x)
 {
 	acpi_sleep_tts_switch(ACPI_STATE_S5);
-
-	acpi_sleep_pts_switch(ACPI_STATE_S5);
-
 	return NOTIFY_DONE;
 }
 
-static struct notifier_block sleep_notifier = {
-	.notifier_call	= sleep_notify_reboot,
+static struct notifier_block tts_notifier = {
+	.notifier_call	= tts_notify_reboot,
 	.next		= NULL,
 	.priority	= 0,
 };
@@ -916,9 +899,9 @@ int __init acpi_sleep_init(void)
 	pr_info(PREFIX "(supports%s)\n", supported);
 
 	/*
-	 * Register the sleep_notifier to reboot notifier list so that the _TTS
-	 * and _PTS object can also be evaluated when the system enters S5.
+	 * Register the tts_notifier to reboot notifier list so that the _TTS
+	 * object can also be evaluated when the system enters S5.
 	 */
-	register_reboot_notifier(&sleep_notifier);
+	register_reboot_notifier(&tts_notifier);
 	return 0;
 }
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index c9f2107..005e292 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -701,6 +701,24 @@ static inline int is_loop_device(struct file *file)
 	return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
 }
 
+/*
+ * for AUFS
+ * no get/put for file.
+ */
+struct file *loop_backing_file(struct super_block *sb)
+{
+	struct file *ret;
+	struct loop_device *l;
+
+	ret = NULL;
+	if (MAJOR(sb->s_dev) == LOOP_MAJOR) {
+		l = sb->s_bdev->bd_disk->private_data;
+		ret = l->lo_backing_file;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(loop_backing_file);
+
 /* loop sysfs attributes */
 
 static ssize_t loop_attr_show(struct device *dev, char *page,
diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c
index 3a1f49f..79dc7cf 100644
--- a/drivers/cpufreq/cpufreq_ondemand.c
+++ b/drivers/cpufreq/cpufreq_ondemand.c
@@ -21,7 +21,11 @@
 
 /* On-demand governor macros */
 #define DEF_FREQUENCY_UP_THRESHOLD		(80)
+#ifdef CONFIG_PCK_INTERACTIVE
+#define DEF_SAMPLING_DOWN_FACTOR		(10)
+#else
 #define DEF_SAMPLING_DOWN_FACTOR		(1)
+#endif
 #define MAX_SAMPLING_DOWN_FACTOR		(100000)
 #define MICRO_FREQUENCY_UP_THRESHOLD		(95)
 #define MICRO_FREQUENCY_MIN_SAMPLE_RATE		(10000)
diff --git a/drivers/input/joydev.c b/drivers/input/joydev.c
index f3135ae..b30ab7e 100644
--- a/drivers/input/joydev.c
+++ b/drivers/input/joydev.c
@@ -28,15 +28,21 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/cdev.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
 
 MODULE_AUTHOR("Vojtech Pavlik <vojtech@ucw.cz>");
 MODULE_DESCRIPTION("Joystick device interfaces");
 MODULE_SUPPORTED_DEVICE("input/js");
 MODULE_LICENSE("GPL");
-
 #define JOYDEV_MINOR_BASE	0
 #define JOYDEV_MINORS		16
 #define JOYDEV_BUFFER_SIZE	64
+#define MAX_REMAP_SIZE          10
+
+static int remap_array[MAX_REMAP_SIZE];
+static int remap_count = 0;
+static int free_buttons[MAX_REMAP_SIZE];
 
 struct joydev {
 	int open;
@@ -71,6 +77,9 @@ struct joydev_client {
 	struct list_head node;
 };
 
+module_param_array(remap_array, int, &remap_count, 0);
+MODULE_PARM_DESC(remap_array, "remap axes to buttons");
+
 static int joydev_correct(int value, struct js_corr *corr)
 {
 	switch (corr->type) {
@@ -121,6 +130,17 @@ static void joydev_event(struct input_handle *handle,
 	struct joydev *joydev = handle->private;
 	struct joydev_client *client;
 	struct js_event event;
+	int i;
+
+	if (remap_count > 0 && remap_count < MAX_REMAP_SIZE) {
+		for (i = 0; i < remap_count; i++)
+			if (code == remap_array[i]) {
+				type = EV_KEY;
+				code = free_buttons[i];
+				if (value == 255)
+					value = 1;
+			}
+	}
 
 	switch (type) {
 
@@ -816,7 +836,7 @@ static int joydev_connect(struct input_handler *handler, struct input_dev *dev,
 			  const struct input_device_id *id)
 {
 	struct joydev *joydev;
-	int i, j, t, minor, dev_no;
+	int i, j = 0, t, minor, dev_no;
 	int error;
 
 	minor = input_get_new_minor(JOYDEV_MINOR_BASE, JOYDEV_MINORS, true);
@@ -860,15 +880,24 @@ static int joydev_connect(struct input_handler *handler, struct input_dev *dev,
 			joydev->keymap[i] = joydev->nkey;
 			joydev->keypam[joydev->nkey] = i + BTN_MISC;
 			joydev->nkey++;
-		}
+			j = i;
+		}
+	if (remap_count > 0 && remap_count < MAX_REMAP_SIZE) {
+		printk("[joydev] axis remapping enabled\n");
+		for (i = 0; i < remap_count; i++) {
+			joydev->keymap[j + i + 1] = joydev->nkey;
+			joydev->keypam[joydev->nkey] = i + j + 1 + BTN_MISC;
+			free_buttons[i] = j + i + 1 + BTN_MISC;
+			joydev->nkey++;
 
+		}
 	for (i = 0; i < BTN_JOYSTICK - BTN_MISC; i++)
 		if (test_bit(i + BTN_MISC, dev->keybit)) {
 			joydev->keymap[i] = joydev->nkey;
 			joydev->keypam[joydev->nkey] = i + BTN_MISC;
 			joydev->nkey++;
 		}
-
+	}
 	for (i = 0; i < joydev->nabs; i++) {
 		j = joydev->abspam[i];
 		if (input_abs_get_max(dev, j) == input_abs_get_min(dev, j)) {
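
The joydev hunks above translate configured axis event codes into synthetic button presses: any incoming event whose code appears in remap_array[] is rewritten to EV_KEY with the matching free_buttons[] code before it is queued. A userspace sketch of that lookup (stand-in arrays and constants, not the module's real state):

#include <stdio.h>

#define DEMO_EV_KEY	1
#define DEMO_EV_ABS	3
#define MAX_REMAP	10

static int remap_array[MAX_REMAP] = { 2, 5 };		/* axis codes to remap */
static int free_buttons[MAX_REMAP] = { 0x130, 0x131 };	/* target button codes */
static int remap_count = 2;

/* Same rewrite joydev_event() performs before queuing the event. */
static void demo_remap(unsigned int *type, unsigned int *code, int *value)
{
	int i;

	for (i = 0; i < remap_count; i++) {
		if (*code == (unsigned int)remap_array[i]) {
			*type = DEMO_EV_KEY;
			*code = free_buttons[i];
			if (*value == 255)	/* full deflection -> pressed */
				*value = 1;
		}
	}
}

int main(void)
{
	unsigned int type = DEMO_EV_ABS, code = 5;
	int value = 255;

	demo_remap(&type, &code, &value);
	printf("type=%u code=0x%x value=%d\n", type, code, value);
	return 0;
}
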
diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c
index a41d832..a5c2eb2 100644
--- a/drivers/input/mouse/synaptics.c
+++ b/drivers/input/mouse/synaptics.c
@@ -1251,7 +1251,9 @@ static void set_input_params(struct psmouse *psmouse,
 		/* Clickpads report only left button */
 		__clear_bit(BTN_RIGHT, dev->keybit);
 		__clear_bit(BTN_MIDDLE, dev->keybit);
-	}
+	} else if (SYN_CAP_CLICKPAD2BTN(priv->ext_cap_0c) ||
+		   SYN_CAP_CLICKPAD2BTN2(priv->ext_cap_0c))
+		__set_bit(INPUT_PROP_BUTTONPAD, dev->propbit);
 }
 
 static ssize_t synaptics_show_disable_gesture(struct psmouse *psmouse,
diff --git a/drivers/input/mouse/synaptics.h b/drivers/input/mouse/synaptics.h
index 56faa7e..c20f32c 100644
--- a/drivers/input/mouse/synaptics.h
+++ b/drivers/input/mouse/synaptics.h
@@ -85,6 +85,7 @@
  */
 #define SYN_CAP_CLICKPAD(ex0c)		((ex0c) & 0x100000) /* 1-button ClickPad */
 #define SYN_CAP_CLICKPAD2BTN(ex0c)	((ex0c) & 0x000100) /* 2-button ClickPad */
+#define SYN_CAP_CLICKPAD2BTN2(ex0c)	((ex0c) & 0x200000) /* 2-button ClickPad */
 #define SYN_CAP_MAX_DIMENSIONS(ex0c)	((ex0c) & 0x020000)
 #define SYN_CAP_MIN_DIMENSIONS(ex0c)	((ex0c) & 0x002000)
 #define SYN_CAP_ADV_GESTURE(ex0c)	((ex0c) & 0x080000)
diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c
index 29e1b49..13bda57 100644
--- a/drivers/input/touchscreen/atmel_mxt_ts.c
+++ b/drivers/input/touchscreen/atmel_mxt_ts.c
@@ -1973,7 +1973,7 @@ static int mxt_initialize(struct mxt_data *data)
 	if (error)
 		goto err_free_object_table;
 
-	error = reject_firmware_nowait(THIS_MODULE, true, MXT_CFG_NAME,
+	error = request_firmware_nowait(THIS_MODULE, true, MXT_CFG_NAME,
 					&client->dev, GFP_KERNEL, data,
 					mxt_config_cb);
 	if (error) {
diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig
index d28690f..61fbb64 100644
--- a/drivers/macintosh/Kconfig
+++ b/drivers/macintosh/Kconfig
@@ -170,6 +170,13 @@ config INPUT_ADBHID
 
 	  If unsure, say Y.
 
+config ADB_TRACKPAD_ABSOLUTE
+	bool "Enable absolute mode for adb trackpads"
+	depends on INPUT_ADBHID
+	help
+	  Enable absolute mode on ADB-based trackpads. This feature adds
+	  compatibility with the Synaptics X.Org/XFree86 drivers.
+
 config MAC_EMUMOUSEBTN
 	tristate "Support for mouse button 2+3 emulation"
 	depends on SYSCTL && INPUT
diff --git a/drivers/macintosh/adbhid.c b/drivers/macintosh/adbhid.c
index 09d72bb..8d23b27 100644
--- a/drivers/macintosh/adbhid.c
+++ b/drivers/macintosh/adbhid.c
@@ -261,6 +261,15 @@ static struct adb_ids buttons_ids;
 #define ADBMOUSE_MS_A3		8	/* Mouse systems A3 trackball (handler 3) */
 #define ADBMOUSE_MACALLY2	9	/* MacAlly 2-button mouse */
 
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+#define	ABS_XMIN	310
+#define	ABS_XMAX	1700
+#define	ABS_YMIN	200
+#define	ABS_YMAX	1000
+#define	ABS_ZMIN	0
+#define	ABS_ZMAX	55
+#endif
+
 static void
 adbhid_keyboard_input(unsigned char *data, int nb, int apoll)
 {
@@ -405,6 +414,9 @@ static void
 adbhid_mouse_input(unsigned char *data, int nb, int autopoll)
 {
 	int id = (data[0] >> 4) & 0x0f;
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+	int btn = 0, x_axis = 0, y_axis = 0, z_axis = 0;
+#endif
 
 	if (!adbhid[id]) {
 		printk(KERN_ERR "ADB HID on ID %d not yet registered\n", id);
@@ -436,6 +448,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll)
 	      high bits of y-axis motion.  XY is additional
 	      high bits of x-axis motion.
 
+    For ADB Absolute motion protocol the data array will contain the
+    following values:
+
+		BITS    COMMENTS
+    data[0] = dddd 1100 ADB command: Talk, register 0, for device dddd.
+    data[1] = byyy yyyy Left button and y-axis motion.
+    data[2] = bxxx xxxx Second button and x-axis motion.
+    data[3] = 1yyy 1xxx Half bits of y-axis and x-axis motion.
+    data[4] = 1yyy 1xxx Higher bits of y-axis and x-axis motion.
+    data[5] = 1zzz 1zzz Higher and lower bits of z-pressure.
+
     MacAlly 2-button mouse protocol.
 
     For MacAlly 2-button mouse protocol the data array will contain the
@@ -458,8 +481,17 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll)
 	switch (adbhid[id]->mouse_kind)
 	{
 	    case ADBMOUSE_TRACKPAD:
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+		x_axis = (data[2] & 0x7f) | ((data[3] & 0x07) << 7) |
+			((data[4] & 0x07) << 10);
+		y_axis = (data[1] & 0x7f) | ((data[3] & 0x70) << 3) |
+			((data[4] & 0x70) << 6);
+		z_axis = (data[5] & 0x07) | ((data[5] & 0x70) >> 1);
+		btn = (!(data[1] >> 7)) & 1;
+#else
 		data[1] = (data[1] & 0x7f) | ((data[1] & data[2]) & 0x80);
 		data[2] = data[2] | 0x80;
+#endif
 		break;
 	    case ADBMOUSE_MICROSPEED:
 		data[1] = (data[1] & 0x7f) | ((data[3] & 0x01) << 7);
@@ -485,17 +517,39 @@ adbhid_mouse_input(unsigned char *data, int nb, int autopoll)
                 break;
 	}
 
-	input_report_key(adbhid[id]->input, BTN_LEFT,   !((data[1] >> 7) & 1));
-	input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1));
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+	if (adbhid[id]->mouse_kind == ADBMOUSE_TRACKPAD) {
 
-	if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD)
-		input_report_key(adbhid[id]->input, BTN_RIGHT,  !((data[3] >> 7) & 1));
+		if (z_axis > 30) input_report_key(adbhid[id]->input, BTN_TOUCH, 1);
+		if (z_axis < 25) input_report_key(adbhid[id]->input, BTN_TOUCH, 0);
 
-	input_report_rel(adbhid[id]->input, REL_X,
-			 ((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 ));
-	input_report_rel(adbhid[id]->input, REL_Y,
-			 ((data[1]&0x7f) < 64 ? (data[1]&0x7f) : (data[1]&0x7f)-128 ));
+		if (z_axis > 0) {
+			input_report_abs(adbhid[id]->input, ABS_X, x_axis);
+			input_report_abs(adbhid[id]->input, ABS_Y, y_axis);
+			input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 1);
+			input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 5);
+		} else {
+			input_report_key(adbhid[id]->input, BTN_TOOL_FINGER, 0);
+			input_report_key(adbhid[id]->input, ABS_TOOL_WIDTH, 0);
+		}
+
+		input_report_abs(adbhid[id]->input, ABS_PRESSURE, z_axis);
+		input_report_key(adbhid[id]->input, BTN_LEFT, btn);
+	} else {
+#endif
+		input_report_key(adbhid[id]->input, BTN_LEFT,   !((data[1] >> 7) & 1));
+		input_report_key(adbhid[id]->input, BTN_MIDDLE, !((data[2] >> 7) & 1));
+
+		if (nb >= 4 && adbhid[id]->mouse_kind != ADBMOUSE_TRACKPAD)
+			input_report_key(adbhid[id]->input, BTN_RIGHT,  !((data[3] >> 7) & 1));
 
+		input_report_rel(adbhid[id]->input, REL_X,
+				((data[2]&0x7f) < 64 ? (data[2]&0x7f) : (data[2]&0x7f)-128 ));
+		input_report_rel(adbhid[id]->input, REL_Y,
+				((data[1]&0x7f) < 64 ? (data[1]&0x7f) : (data[1]&0x7f)-128 ));
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+	}
+#endif
 	input_sync(adbhid[id]->input);
 }
 
@@ -849,6 +903,15 @@ adbhid_input_register(int id, int default_id, int original_handler_id,
 		input_dev->keybit[BIT_WORD(BTN_MOUSE)] = BIT_MASK(BTN_LEFT) |
 			BIT_MASK(BTN_MIDDLE) | BIT_MASK(BTN_RIGHT);
 		input_dev->relbit[0] = BIT_MASK(REL_X) | BIT_MASK(REL_Y);
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+		set_bit(EV_ABS, input_dev->evbit);
+		input_set_abs_params(input_dev, ABS_X, ABS_XMIN, ABS_XMAX, 0, 0);
+		input_set_abs_params(input_dev, ABS_Y, ABS_YMIN, ABS_YMAX, 0, 0);
+		input_set_abs_params(input_dev, ABS_PRESSURE, ABS_ZMIN, ABS_ZMAX, 0, 0);
+		set_bit(BTN_TOUCH, input_dev->keybit);
+		set_bit(BTN_TOOL_FINGER, input_dev->keybit);
+		set_bit(ABS_TOOL_WIDTH, input_dev->absbit);
+#endif
 		break;
 
 	case ADB_MISC:
@@ -1132,7 +1195,11 @@ init_trackpad(int id)
 	            r1_buffer[3],
 	            r1_buffer[4],
 	            r1_buffer[5],
+#ifdef CONFIG_ADB_TRACKPAD_ABSOLUTE
+		    0x00, /* Enable absolute mode */
+#else
 	            0x03, /*r1_buffer[6],*/
+#endif
 	            r1_buffer[7]);
 
 	    /* Without this flush, the trackpad may be locked up */
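
The absolute-mode branch above reassembles 13-bit coordinates and a 6-bit pressure value from the 6-byte ADB Talk register 0 packet, following the bit layout documented in the comment block. A standalone decoder using the same shifts (the sample bytes below are made up, not captured from hardware):

#include <stdio.h>

/*
 * Decode one 6-byte ADB absolute-mode trackpad packet, using the same
 * bit shuffling as the ADBMOUSE_TRACKPAD branch of adbhid_mouse_input().
 */
static void demo_decode(const unsigned char data[6],
			int *x, int *y, int *z, int *btn)
{
	*x = (data[2] & 0x7f) | ((data[3] & 0x07) << 7) |
	     ((data[4] & 0x07) << 10);
	*y = (data[1] & 0x7f) | ((data[3] & 0x70) << 3) |
	     ((data[4] & 0x70) << 6);
	*z = (data[5] & 0x07) | ((data[5] & 0x70) >> 1);
	*btn = (!(data[1] >> 7)) & 1;
}

int main(void)
{
	/* Made-up sample packet: button released, light touch. */
	const unsigned char data[6] = { 0x3c, 0xb2, 0x40, 0x11, 0x00, 0x23 };
	int x, y, z, btn;

	demo_decode(data, &x, &y, &z, &btn);
	printf("x=%d y=%d z=%d button=%d\n", x, y, z, btn);	/* 192 178 19 0 */
	return 0;
}
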
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 81b8dcc..7435fa5 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -502,9 +502,28 @@ config THINKPAD_ACPI_HOTKEY_POLL
 	  If you are not sure, say Y here.  The driver enables polling only if
 	  it is strictly necessary to do so.
 
+config THINKPAD_EC
+	tristate
+	---help---
+	  This is a low-level driver for accessing the ThinkPad H8S embedded
+	  controller over the LPC bus (not to be confused with the ACPI Embedded
+	  Controller interface).
+
+config TP_SMAPI
+	tristate "ThinkPad SMAPI Support"
+	select THINKPAD_EC
+	default n
+	help
+	  This adds SMAPI support on Lenovo/IBM ThinkPads, for features such
+	  as battery charging control. For more information about this driver
+	  see <http://www.thinkwiki.org/wiki/tp_smapi>.
+
+	  If you have a Lenovo/IBM ThinkPad laptop, say Y or M here.
+
 config SENSORS_HDAPS
 	tristate "Thinkpad Hard Drive Active Protection System (hdaps)"
 	depends on INPUT
+	select THINKPAD_EC
 	select INPUT_POLLDEV
 	default n
 	help
diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile
index 2efa86d..ae14f8e 100644
--- a/drivers/platform/x86/Makefile
+++ b/drivers/platform/x86/Makefile
@@ -27,6 +27,8 @@ obj-$(CONFIG_TC1100_WMI)	+= tc1100-wmi.o
 obj-$(CONFIG_SONY_LAPTOP)	+= sony-laptop.o
 obj-$(CONFIG_IDEAPAD_LAPTOP)	+= ideapad-laptop.o
 obj-$(CONFIG_THINKPAD_ACPI)	+= thinkpad_acpi.o
+obj-$(CONFIG_THINKPAD_EC)	+= thinkpad_ec.o
+obj-$(CONFIG_TP_SMAPI)		+= tp_smapi.o
 obj-$(CONFIG_SENSORS_HDAPS)	+= hdaps.o
 obj-$(CONFIG_FUJITSU_LAPTOP)	+= fujitsu-laptop.o
 obj-$(CONFIG_FUJITSU_TABLET)	+= fujitsu-tablet.o
diff --git a/drivers/platform/x86/hdaps.c b/drivers/platform/x86/hdaps.c
index 458e6c9..dadfb99 100644
--- a/drivers/platform/x86/hdaps.c
+++ b/drivers/platform/x86/hdaps.c
@@ -30,266 +30,384 @@
 
 #include <linux/delay.h>
 #include <linux/platform_device.h>
-#include <linux/input-polldev.h>
+#include <linux/input.h>
 #include <linux/kernel.h>
-#include <linux/mutex.h>
 #include <linux/module.h>
 #include <linux/timer.h>
 #include <linux/dmi.h>
 #include <linux/jiffies.h>
-#include <linux/io.h>
-
-#define HDAPS_LOW_PORT		0x1600	/* first port used by hdaps */
-#define HDAPS_NR_PORTS		0x30	/* number of ports: 0x1600 - 0x162f */
-
-#define HDAPS_PORT_STATE	0x1611	/* device state */
-#define HDAPS_PORT_YPOS		0x1612	/* y-axis position */
-#define	HDAPS_PORT_XPOS		0x1614	/* x-axis position */
-#define HDAPS_PORT_TEMP1	0x1616	/* device temperature, in Celsius */
-#define HDAPS_PORT_YVAR		0x1617	/* y-axis variance (what is this?) */
-#define HDAPS_PORT_XVAR		0x1619	/* x-axis variance (what is this?) */
-#define HDAPS_PORT_TEMP2	0x161b	/* device temperature (again?) */
-#define HDAPS_PORT_UNKNOWN	0x161c	/* what is this? */
-#define HDAPS_PORT_KMACT	0x161d	/* keyboard or mouse activity */
-
-#define STATE_FRESH		0x50	/* accelerometer data is fresh */
+#include <linux/thinkpad_ec.h>
+#include <linux/pci_ids.h>
+#include <linux/version.h>
+
+/* Embedded controller accelerometer read command and its result: */
+static const struct thinkpad_ec_row ec_accel_args =
+	{ .mask = 0x0001, .val = {0x11} };
+#define EC_ACCEL_IDX_READOUTS	0x1	/* readouts included in this read */
+					/* First readout, if READOUTS>=1: */
+#define EC_ACCEL_IDX_YPOS1	0x2	/*   y-axis position word */
+#define EC_ACCEL_IDX_XPOS1	0x4	/*   x-axis position word */
+#define EC_ACCEL_IDX_TEMP1	0x6	/*   device temperature in Celsius */
+					/* Second readout, if READOUTS>=2: */
+#define EC_ACCEL_IDX_XPOS2	0x7	/*   y-axis position word */
+#define EC_ACCEL_IDX_YPOS2	0x9	/*   x-axis position word */
+#define EC_ACCEL_IDX_TEMP2	0xb	/*   device temperature in Celsius */
+#define EC_ACCEL_IDX_QUEUED	0xc	/* Number of queued readouts left */
+#define EC_ACCEL_IDX_KMACT	0xd	/* keyboard or mouse activity */
+#define EC_ACCEL_IDX_RETVAL	0xf	/* command return value, good=0x00 */
 
 #define KEYBD_MASK		0x20	/* set if keyboard activity */
 #define MOUSE_MASK		0x40	/* set if mouse activity */
-#define KEYBD_ISSET(n)		(!! (n & KEYBD_MASK))	/* keyboard used? */
-#define MOUSE_ISSET(n)		(!! (n & MOUSE_MASK))	/* mouse used? */
 
-#define INIT_TIMEOUT_MSECS	4000	/* wait up to 4s for device init ... */
-#define INIT_WAIT_MSECS		200	/* ... in 200ms increments */
+#define READ_TIMEOUT_MSECS	100	/* wait this long for device read */
+#define RETRY_MSECS		3	/* retry delay */
 
-#define HDAPS_POLL_INTERVAL	50	/* poll for input every 1/20s (50 ms)*/
 #define HDAPS_INPUT_FUZZ	4	/* input event threshold */
 #define HDAPS_INPUT_FLAT	4
-
-#define HDAPS_X_AXIS		(1 << 0)
-#define HDAPS_Y_AXIS		(1 << 1)
-#define HDAPS_BOTH_AXES		(HDAPS_X_AXIS | HDAPS_Y_AXIS)
-
+#define KMACT_REMEMBER_PERIOD   (HZ/10) /* keyboard/mouse persistence */
+
+/* Input IDs */
+#define HDAPS_INPUT_VENDOR	PCI_VENDOR_ID_IBM
+#define HDAPS_INPUT_PRODUCT	0x5054 /* "TP", shared with thinkpad_acpi */
+#define HDAPS_INPUT_JS_VERSION	0x6801 /* Joystick emulation input device */
+#define HDAPS_INPUT_RAW_VERSION	0x4801 /* Raw accelerometer input device */
+
+/* Axis orientation. */
+/* The unnatural bit-representation of inversions is for backward
+ * compatibility with the "invert=1" module parameter.            */
+#define HDAPS_ORIENT_INVERT_XY  0x01   /* Invert both X and Y axes.       */
+#define HDAPS_ORIENT_INVERT_X   0x02   /* Invert the X axis (uninvert if
+					* already inverted by INVERT_XY). */
+#define HDAPS_ORIENT_SWAP       0x04   /* Swap the axes. The swap occurs
+					* before inverting X or Y.        */
+#define HDAPS_ORIENT_MAX        0x07
+#define HDAPS_ORIENT_UNDEFINED  0xFF   /* Placeholder during initialization */
+#define HDAPS_ORIENT_INVERT_Y   (HDAPS_ORIENT_INVERT_XY | HDAPS_ORIENT_INVERT_X)
+
+static struct timer_list hdaps_timer;
 static struct platform_device *pdev;
-static struct input_polled_dev *hdaps_idev;
-static unsigned int hdaps_invert;
-static u8 km_activity;
-static int rest_x;
-static int rest_y;
-
-static DEFINE_MUTEX(hdaps_mtx);
-
-/*
- * __get_latch - Get the value from a given port.  Callers must hold hdaps_mtx.
- */
-static inline u8 __get_latch(u16 port)
+static struct input_dev *hdaps_idev;     /* joystick-like device with fuzz */
+static struct input_dev *hdaps_idev_raw; /* raw hdaps sensor readouts */
+static unsigned int hdaps_invert = HDAPS_ORIENT_UNDEFINED;
+static int needs_calibration;
+
+/* Configuration: */
+static int sampling_rate = 50;       /* Sampling rate  */
+static int oversampling_ratio = 5;   /* Ratio between our sampling rate and
+				      * EC accelerometer sampling rate      */
+static int running_avg_filter_order = 2; /* EC running average filter order */
+
+/* Latest state readout: */
+static int pos_x, pos_y;      /* position */
+static int temperature;       /* temperature */
+static int stale_readout = 1; /* last read invalid */
+static int rest_x, rest_y;    /* calibrated rest position */
+
+/* Last time we saw keyboard and mouse activity: */
+static u64 last_keyboard_jiffies = INITIAL_JIFFIES;
+static u64 last_mouse_jiffies = INITIAL_JIFFIES;
+static u64 last_update_jiffies = INITIAL_JIFFIES;
+
+/* input device use count */
+static int hdaps_users;
+static DEFINE_MUTEX(hdaps_users_mtx);
+
+/* Some models require an axis transformation to the standard representation */
+static void transform_axes(int *x, int *y)
 {
-	return inb(port) & 0xff;
+	if (hdaps_invert & HDAPS_ORIENT_SWAP) {
+		int z;
+		z = *x;
+		*x = *y;
+		*y = z;
+	}
+	if (hdaps_invert & HDAPS_ORIENT_INVERT_XY) {
+		*x = -*x;
+		*y = -*y;
+	}
+	if (hdaps_invert & HDAPS_ORIENT_INVERT_X)
+		*x = -*x;
 }
 
-/*
- * __check_latch - Check a port latch for a given value.  Returns zero if the
- * port contains the given value.  Callers must hold hdaps_mtx.
+/**
+ * __hdaps_update - query current state, with locks already acquired
+ * @fast: if nonzero, do one quick attempt without retries.
+ *
+ * Query current accelerometer state and update global state variables.
+ * Also prefetches the next query. Caller must hold controller lock.
  */
-static inline int __check_latch(u16 port, u8 val)
+static int __hdaps_update(int fast)
 {
-	if (__get_latch(port) == val)
-		return 0;
-	return -EINVAL;
-}
+	/* Read data: */
+	struct thinkpad_ec_row data;
+	int ret;
 
-/*
- * __wait_latch - Wait up to 100us for a port latch to get a certain value,
- * returning zero if the value is obtained.  Callers must hold hdaps_mtx.
- */
-static int __wait_latch(u16 port, u8 val)
-{
-	unsigned int i;
+	data.mask = (1 << EC_ACCEL_IDX_READOUTS) | (1 << EC_ACCEL_IDX_KMACT) |
+		    (3 << EC_ACCEL_IDX_YPOS1)    | (3 << EC_ACCEL_IDX_XPOS1) |
+		    (1 << EC_ACCEL_IDX_TEMP1)    | (1 << EC_ACCEL_IDX_RETVAL);
+	if (fast)
+		ret = thinkpad_ec_try_read_row(&ec_accel_args, &data);
+	else
+		ret = thinkpad_ec_read_row(&ec_accel_args, &data);
+	thinkpad_ec_prefetch_row(&ec_accel_args); /* Prefetch even if error */
+	if (ret)
+		return ret;
 
-	for (i = 0; i < 20; i++) {
-		if (!__check_latch(port, val))
-			return 0;
-		udelay(5);
+	/* Check status: */
+	if (data.val[EC_ACCEL_IDX_RETVAL] != 0x00) {
+		pr_warn("read RETVAL=0x%02x\n",
+		       data.val[EC_ACCEL_IDX_RETVAL]);
+		return -EIO;
+	}
+
+	if (data.val[EC_ACCEL_IDX_READOUTS] < 1)
+		return -EBUSY; /* no pending readout, try again later */
+
+	/* Parse position data: */
+	pos_x = *(s16 *)(data.val+EC_ACCEL_IDX_XPOS1);
+	pos_y = *(s16 *)(data.val+EC_ACCEL_IDX_YPOS1);
+	transform_axes(&pos_x, &pos_y);
+
+	/* Keyboard and mouse activity status is cleared as soon as it's read,
+	 * so applications will eat each other's events. Thus we remember any
+	 * event for KMACT_REMEMBER_PERIOD jiffies.
+	 */
+	if (data.val[EC_ACCEL_IDX_KMACT] & KEYBD_MASK)
+		last_keyboard_jiffies = get_jiffies_64();
+	if (data.val[EC_ACCEL_IDX_KMACT] & MOUSE_MASK)
+		last_mouse_jiffies = get_jiffies_64();
+
+	temperature = data.val[EC_ACCEL_IDX_TEMP1];
+
+	last_update_jiffies = get_jiffies_64();
+	stale_readout = 0;
+	if (needs_calibration) {
+		rest_x = pos_x;
+		rest_y = pos_y;
+		needs_calibration = 0;
 	}
 
-	return -EIO;
+	return 0;
 }
 
-/*
- * __device_refresh - request a refresh from the accelerometer.  Does not wait
- * for refresh to complete.  Callers must hold hdaps_mtx.
+/**
+ * hdaps_update - acquire locks and query current state
+ *
+ * Query current accelerometer state and update global state variables.
+ * Also prefetches the next query.
+ * Retries until timeout if the accelerometer is not in ready status (common).
+ * Does its own locking.
  */
-static void __device_refresh(void)
+static int hdaps_update(void)
 {
-	udelay(200);
-	if (inb(0x1604) != STATE_FRESH) {
-		outb(0x11, 0x1610);
-		outb(0x01, 0x161f);
+	u64 age = get_jiffies_64() - last_update_jiffies;
+	int total, ret;
+
+	if (!stale_readout && age < (9*HZ)/(10*sampling_rate))
+		return 0; /* already updated recently */
+	for (total = 0; total < READ_TIMEOUT_MSECS; total += RETRY_MSECS) {
+		ret = thinkpad_ec_lock();
+		if (ret)
+			return ret;
+		ret = __hdaps_update(0);
+		thinkpad_ec_unlock();
+
+		if (!ret)
+			return 0;
+		if (ret != -EBUSY)
+			break;
+		msleep(RETRY_MSECS);
 	}
+	return ret;
 }
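+
+/*
+ * Note on the staleness check in hdaps_update(): a readout is reused as long
+ * as it is younger than 9/10 of one sampling period. For example, with
+ * HZ=1000 and the default sampling_rate=50 (a 20 ms period, i.e. 20 jiffies),
+ * (9*HZ)/(10*sampling_rate) == 18, so readouts up to 18 jiffies old are
+ * served from the cached pos_x/pos_y/temperature instead of querying the EC.
+ */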
 
-/*
- * __device_refresh_sync - request a synchronous refresh from the
- * accelerometer.  We wait for the refresh to complete.  Returns zero if
- * successful and nonzero on error.  Callers must hold hdaps_mtx.
+/**
+ * hdaps_set_power - enable or disable power to the accelerometer.
+ * Returns zero on success and negative error code on failure.  Can sleep.
  */
-static int __device_refresh_sync(void)
+static int hdaps_set_power(int on)
 {
-	__device_refresh();
-	return __wait_latch(0x1604, STATE_FRESH);
+	struct thinkpad_ec_row args =
+		{ .mask = 0x0003, .val = {0x14, on?0x01:0x00} };
+	struct thinkpad_ec_row data = { .mask = 0x8000 };
+	int ret = thinkpad_ec_read_row(&args, &data);
+	if (ret)
+		return ret;
+	if (data.val[0xF] != 0x00)
+		return -EIO;
+	return 0;
 }
 
-/*
- * __device_complete - indicate to the accelerometer that we are done reading
- * data, and then initiate an async refresh.  Callers must hold hdaps_mtx.
+/**
+ * hdaps_set_ec_config - set accelerometer parameters.
+ * @ec_rate: embedded controller sampling rate
+ * @order: embedded controller running average filter order
+ * (Normally we have @ec_rate = sampling_rate * oversampling_ratio.)
+ * Returns zero on success and negative error code on failure.  Can sleep.
  */
-static inline void __device_complete(void)
+static int hdaps_set_ec_config(int ec_rate, int order)
 {
-	inb(0x161f);
-	inb(0x1604);
-	__device_refresh();
+	struct thinkpad_ec_row args = { .mask = 0x000F,
+		.val = {0x10, (u8)ec_rate, (u8)(ec_rate>>8), order} };
+	struct thinkpad_ec_row data = { .mask = 0x8000 };
+	int ret = thinkpad_ec_read_row(&args, &data);
+	pr_debug("setting ec_rate=%d, filter_order=%d\n", ec_rate, order);
+	if (ret)
+		return ret;
+	if (data.val[0xF] == 0x03) {
+		pr_warn("config param out of range\n");
+		return -EINVAL;
+	}
+	if (data.val[0xF] == 0x06) {
+		pr_warn("config change already pending\n");
+		return -EBUSY;
+	}
+	if (data.val[0xF] != 0x00) {
+		pr_warn("config change error, ret=%d\n",
+		      data.val[0xF]);
+		return -EIO;
+	}
+	return 0;
 }
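+
+/*
+ * Example of the command row built by hdaps_set_ec_config(): with the default
+ * sampling_rate=50 and oversampling_ratio=5, ec_rate is 250, so the row sent
+ * is {0x10, 0xFA, 0x00, 0x02}: function code 0x10, the 16-bit rate 250
+ * (0x00FA) in little-endian byte order, then the filter order 2. The EC's
+ * status byte comes back in val[0xF] of the reply row.
+ */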
 
-/*
- * hdaps_readb_one - reads a byte from a single I/O port, placing the value in
- * the given pointer.  Returns zero on success or a negative error on failure.
- * Can sleep.
+/**
+ * hdaps_get_ec_config - get accelerometer parameters.
+ * @ec_rate: embedded controller sampling rate
+ * @order: embedded controller running average filter order
+ * Returns zero on success and negative error code on failure.  Can sleep.
  */
-static int hdaps_readb_one(unsigned int port, u8 *val)
+static int hdaps_get_ec_config(int *ec_rate, int *order)
 {
-	int ret;
-
-	mutex_lock(&hdaps_mtx);
-
-	/* do a sync refresh -- we need to be sure that we read fresh data */
-	ret = __device_refresh_sync();
+	const struct thinkpad_ec_row args =
+		{ .mask = 0x0003, .val = {0x17, 0x82} };
+	struct thinkpad_ec_row data = { .mask = 0x801F };
+	int ret = thinkpad_ec_read_row(&args, &data);
 	if (ret)
-		goto out;
-
-	*val = inb(port);
-	__device_complete();
-
-out:
-	mutex_unlock(&hdaps_mtx);
-	return ret;
+		return ret;
+	if (data.val[0xF] != 0x00)
+		return -EIO;
+	if (!(data.val[0x1] & 0x01))
+		return -ENXIO; /* accelerometer polling not enabled */
+	if (data.val[0x1] & 0x02)
+		return -EBUSY; /* config change in progress, retry later */
+	*ec_rate = data.val[0x2] | ((int)(data.val[0x3]) << 8);
+	*order = data.val[0x4];
+	return 0;
 }
 
-/* __hdaps_read_pair - internal lockless helper for hdaps_read_pair(). */
-static int __hdaps_read_pair(unsigned int port1, unsigned int port2,
-			     int *x, int *y)
+/**
+ * hdaps_get_ec_mode - get EC accelerometer mode
+ * Returns zero on success and negative error code on failure.  Can sleep.
+ */
+static int hdaps_get_ec_mode(u8 *mode)
 {
-	/* do a sync refresh -- we need to be sure that we read fresh data */
-	if (__device_refresh_sync())
+	const struct thinkpad_ec_row args =
+		{ .mask = 0x0001, .val = {0x13} };
+	struct thinkpad_ec_row data = { .mask = 0x8002 };
+	int ret = thinkpad_ec_read_row(&args, &data);
+	if (ret)
+		return ret;
+	if (data.val[0xF] != 0x00) {
+		pr_warn("accelerometer not implemented (0x%02x)\n",
+		       data.val[0xF]);
 		return -EIO;
-
-	*y = inw(port2);
-	*x = inw(port1);
-	km_activity = inb(HDAPS_PORT_KMACT);
-	__device_complete();
-
-	/* hdaps_invert is a bitvector to negate the axes */
-	if (hdaps_invert & HDAPS_X_AXIS)
-		*x = -*x;
-	if (hdaps_invert & HDAPS_Y_AXIS)
-		*y = -*y;
-
+	}
+	*mode = data.val[0x1];
 	return 0;
 }
 
-/*
- * hdaps_read_pair - reads the values from a pair of ports, placing the values
- * in the given pointers.  Returns zero on success.  Can sleep.
+/**
+ * hdaps_check_ec - checks something about the EC.
+ * Follows the clean-room spec for HDAPS; we don't know what it means.
+ * Returns zero on success and negative error code on failure.  Can sleep.
  */
-static int hdaps_read_pair(unsigned int port1, unsigned int port2,
-			   int *val1, int *val2)
+static int hdaps_check_ec(void)
 {
-	int ret;
-
-	mutex_lock(&hdaps_mtx);
-	ret = __hdaps_read_pair(port1, port2, val1, val2);
-	mutex_unlock(&hdaps_mtx);
-
-	return ret;
+	const struct thinkpad_ec_row args =
+		{ .mask = 0x0003, .val = {0x17, 0x81} };
+	struct thinkpad_ec_row data = { .mask = 0x800E };
+	int ret = thinkpad_ec_read_row(&args, &data);
+	if (ret)
+		return  ret;
+	if (!((data.val[0x1] == 0x00 && data.val[0x2] == 0x60) || /* cleanroom spec */
+	      (data.val[0x1] == 0x01 && data.val[0x2] == 0x00)) || /* seen on T61 */
+	    data.val[0x3] != 0x00 || data.val[0xF] != 0x00) {
+		pr_warn("hdaps_check_ec: bad response (0x%x,0x%x,0x%x,0x%x)\n",
+		       data.val[0x1], data.val[0x2],
+		       data.val[0x3], data.val[0xF]);
+		return -EIO;
+	}
+	return 0;
 }
 
-/*
- * hdaps_device_init - initialize the accelerometer.  Returns zero on success
- * and negative error code on failure.  Can sleep.
+/**
+ * hdaps_device_init - initialize the accelerometer.
+ *
+ * Call several embedded controller functions to test and initialize the
+ * accelerometer.
+ * Returns zero on success and negative error code on failure. Can sleep.
  */
+#define FAILED_INIT(msg) pr_err("init failed at: %s\n", msg)
 static int hdaps_device_init(void)
 {
-	int total, ret = -ENXIO;
+	int ret;
+	u8 mode;
 
-	mutex_lock(&hdaps_mtx);
+	ret = thinkpad_ec_lock();
+	if (ret)
+		return ret;
 
-	outb(0x13, 0x1610);
-	outb(0x01, 0x161f);
-	if (__wait_latch(0x161f, 0x00))
-		goto out;
+	if (hdaps_get_ec_mode(&mode))
+		{ FAILED_INIT("hdaps_get_ec_mode failed"); goto bad; }
 
-	/*
-	 * Most ThinkPads return 0x01.
-	 *
-	 * Others--namely the R50p, T41p, and T42p--return 0x03.  These laptops
-	 * have "inverted" axises.
-	 *
-	 * The 0x02 value occurs when the chip has been previously initialized.
-	 */
-	if (__check_latch(0x1611, 0x03) &&
-		     __check_latch(0x1611, 0x02) &&
-		     __check_latch(0x1611, 0x01))
-		goto out;
-
-	printk(KERN_DEBUG "hdaps: initial latch check good (0x%02x)\n",
-	       __get_latch(0x1611));
+	pr_debug("initial mode latch is 0x%02x\n", mode);
+	if (mode == 0x00)
+		{ FAILED_INIT("accelerometer not available"); goto bad; }
 
-	outb(0x17, 0x1610);
-	outb(0x81, 0x1611);
-	outb(0x01, 0x161f);
-	if (__wait_latch(0x161f, 0x00))
-		goto out;
-	if (__wait_latch(0x1611, 0x00))
-		goto out;
-	if (__wait_latch(0x1612, 0x60))
-		goto out;
-	if (__wait_latch(0x1613, 0x00))
-		goto out;
-	outb(0x14, 0x1610);
-	outb(0x01, 0x1611);
-	outb(0x01, 0x161f);
-	if (__wait_latch(0x161f, 0x00))
-		goto out;
-	outb(0x10, 0x1610);
-	outb(0xc8, 0x1611);
-	outb(0x00, 0x1612);
-	outb(0x02, 0x1613);
-	outb(0x01, 0x161f);
-	if (__wait_latch(0x161f, 0x00))
-		goto out;
-	if (__device_refresh_sync())
-		goto out;
-	if (__wait_latch(0x1611, 0x00))
-		goto out;
+	if (hdaps_check_ec())
+		{ FAILED_INIT("hdaps_check_ec failed"); goto bad; }
 
-	/* we have done our dance, now let's wait for the applause */
-	for (total = INIT_TIMEOUT_MSECS; total > 0; total -= INIT_WAIT_MSECS) {
-		int x, y;
+	if (hdaps_set_power(1))
+		{ FAILED_INIT("hdaps_set_power failed"); goto bad; }
 
-		/* a read of the device helps push it into action */
-		__hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y);
-		if (!__wait_latch(0x1611, 0x02)) {
-			ret = 0;
-			break;
-		}
+	if (hdaps_set_ec_config(sampling_rate*oversampling_ratio,
+				running_avg_filter_order))
+		{ FAILED_INIT("hdaps_set_ec_config failed"); goto bad; }
 
-		msleep(INIT_WAIT_MSECS);
-	}
+	thinkpad_ec_invalidate();
+	udelay(200);
 
-out:
-	mutex_unlock(&hdaps_mtx);
+	/* Just prefetch instead of reading, to avoid ~1sec delay on load */
+	ret = thinkpad_ec_prefetch_row(&ec_accel_args);
+	if (ret)
+		{ FAILED_INIT("initial prefetch failed"); goto bad; }
+	goto good;
+bad:
+	thinkpad_ec_invalidate();
+	ret = -ENXIO;
+good:
+	stale_readout = 1;
+	thinkpad_ec_unlock();
 	return ret;
 }
 
+/**
+ * hdaps_device_shutdown - power off the accelerometer
+ * Returns nonzero on failure. Can sleep.
+ */
+static int hdaps_device_shutdown(void)
+{
+	int ret;
+	ret = hdaps_set_power(0);
+	if (ret) {
+		pr_warn("cannot power off\n");
+		return ret;
+	}
+	ret = hdaps_set_ec_config(0, 1);
+	if (ret)
+		pr_warn("cannot stop EC sampling\n");
+	return ret;
+}
 
 /* Device model stuff */
 
@@ -306,13 +424,29 @@ static int hdaps_probe(struct platform_device *dev)
 }
 
 #ifdef CONFIG_PM_SLEEP
+static int hdaps_suspend(struct device *dev)
+{
+	/* Don't do hdaps polls until resume re-initializes the sensor. */
+	del_timer_sync(&hdaps_timer);
+	hdaps_device_shutdown(); /* ignore errors, effect is negligible */
+	return 0;
+}
+
 static int hdaps_resume(struct device *dev)
 {
-	return hdaps_device_init();
+	int ret = hdaps_device_init();
+	if (ret)
+		return ret;
+
+	mutex_lock(&hdaps_users_mtx);
+	if (hdaps_users)
+		mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate);
+	mutex_unlock(&hdaps_users_mtx);
+	return 0;
 }
 #endif
 
-static SIMPLE_DEV_PM_OPS(hdaps_pm, NULL, hdaps_resume);
+static SIMPLE_DEV_PM_OPS(hdaps_pm, hdaps_suspend, hdaps_resume);
 
 static struct platform_driver hdaps_driver = {
 	.probe = hdaps_probe,
@@ -322,30 +456,47 @@ static struct platform_driver hdaps_driver = {
 	},
 };
 
-/*
- * hdaps_calibrate - Set our "resting" values.  Callers must hold hdaps_mtx.
+/**
+ * hdaps_calibrate - set our "resting" values.
+ * Does its own locking.
  */
 static void hdaps_calibrate(void)
 {
-	__hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &rest_x, &rest_y);
+	needs_calibration = 1;
+	hdaps_update();
+	/* If that fails, the mousedev poll will take care of things later. */
 }
 
-static void hdaps_mousedev_poll(struct input_polled_dev *dev)
+/* Timer handler for updating the input device. Runs in softirq context,
+ * so avoid lengthy or blocking operations.
+ */
+static void hdaps_mousedev_poll(unsigned long unused)
 {
-	struct input_dev *input_dev = dev->input;
-	int x, y;
+	int ret;
 
-	mutex_lock(&hdaps_mtx);
+	stale_readout = 1;
 
-	if (__hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y))
-		goto out;
+	/* Cannot sleep.  Try nonblockingly.  If we fail, try again later. */
+	if (thinkpad_ec_try_lock())
+		goto keep_active;
 
-	input_report_abs(input_dev, ABS_X, x - rest_x);
-	input_report_abs(input_dev, ABS_Y, y - rest_y);
-	input_sync(input_dev);
+	ret = __hdaps_update(1); /* fast update, we're in softirq context */
+	thinkpad_ec_unlock();
+	/* Any of "successful", "not yet ready" and "not prefetched"? */
+	if (ret != 0 && ret != -EBUSY && ret != -ENODATA) {
+		pr_err("poll failed, disabling updates\n");
+		return;
+	}
 
-out:
-	mutex_unlock(&hdaps_mtx);
+keep_active:
+	/* Even if we failed now, pos_x,y may have been updated earlier: */
+	input_report_abs(hdaps_idev, ABS_X, pos_x - rest_x);
+	input_report_abs(hdaps_idev, ABS_Y, pos_y - rest_y);
+	input_sync(hdaps_idev);
+	input_report_abs(hdaps_idev_raw, ABS_X, pos_x);
+	input_report_abs(hdaps_idev_raw, ABS_Y, pos_y);
+	input_sync(hdaps_idev_raw);
+	mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate);
 }
 
 
@@ -354,65 +505,41 @@ out:
 static ssize_t hdaps_position_show(struct device *dev,
 				   struct device_attribute *attr, char *buf)
 {
-	int ret, x, y;
-
-	ret = hdaps_read_pair(HDAPS_PORT_XPOS, HDAPS_PORT_YPOS, &x, &y);
-	if (ret)
-		return ret;
-
-	return sprintf(buf, "(%d,%d)\n", x, y);
-}
-
-static ssize_t hdaps_variance_show(struct device *dev,
-				   struct device_attribute *attr, char *buf)
-{
-	int ret, x, y;
-
-	ret = hdaps_read_pair(HDAPS_PORT_XVAR, HDAPS_PORT_YVAR, &x, &y);
+	int ret = hdaps_update();
 	if (ret)
 		return ret;
-
-	return sprintf(buf, "(%d,%d)\n", x, y);
+	return sprintf(buf, "(%d,%d)\n", pos_x, pos_y);
 }
 
 static ssize_t hdaps_temp1_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
-	u8 uninitialized_var(temp);
-	int ret;
-
-	ret = hdaps_readb_one(HDAPS_PORT_TEMP1, &temp);
-	if (ret)
-		return ret;
-
-	return sprintf(buf, "%u\n", temp);
-}
-
-static ssize_t hdaps_temp2_show(struct device *dev,
-				struct device_attribute *attr, char *buf)
-{
-	u8 uninitialized_var(temp);
-	int ret;
-
-	ret = hdaps_readb_one(HDAPS_PORT_TEMP2, &temp);
+	int ret = hdaps_update();
 	if (ret)
 		return ret;
-
-	return sprintf(buf, "%u\n", temp);
+	return sprintf(buf, "%d\n", temperature);
 }
 
 static ssize_t hdaps_keyboard_activity_show(struct device *dev,
 					    struct device_attribute *attr,
 					    char *buf)
 {
-	return sprintf(buf, "%u\n", KEYBD_ISSET(km_activity));
+	int ret = hdaps_update();
+	if (ret)
+		return ret;
+	return sprintf(buf, "%u\n",
+	   get_jiffies_64() < last_keyboard_jiffies + KMACT_REMEMBER_PERIOD);
 }
 
 static ssize_t hdaps_mouse_activity_show(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
 {
-	return sprintf(buf, "%u\n", MOUSE_ISSET(km_activity));
+	int ret = hdaps_update();
+	if (ret)
+		return ret;
+	return sprintf(buf, "%u\n",
+	   get_jiffies_64() < last_mouse_jiffies + KMACT_REMEMBER_PERIOD);
 }
 
 static ssize_t hdaps_calibrate_show(struct device *dev,
@@ -425,10 +552,7 @@ static ssize_t hdaps_calibrate_store(struct device *dev,
 				     struct device_attribute *attr,
 				     const char *buf, size_t count)
 {
-	mutex_lock(&hdaps_mtx);
 	hdaps_calibrate();
-	mutex_unlock(&hdaps_mtx);
-
 	return count;
 }
 
@@ -445,7 +569,7 @@ static ssize_t hdaps_invert_store(struct device *dev,
 	int invert;
 
 	if (sscanf(buf, "%d", &invert) != 1 ||
-	    invert < 0 || invert > HDAPS_BOTH_AXES)
+	    invert < 0 || invert > HDAPS_ORIENT_MAX)
 		return -EINVAL;
 
 	hdaps_invert = invert;
@@ -454,24 +578,128 @@ static ssize_t hdaps_invert_store(struct device *dev,
 	return count;
 }
 
+static ssize_t hdaps_sampling_rate_show(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", sampling_rate);
+}
+
+static ssize_t hdaps_sampling_rate_store(
+	struct device *dev, struct device_attribute *attr,
+	const char *buf, size_t count)
+{
+	int rate, ret;
+	if (sscanf(buf, "%d", &rate) != 1 || rate > HZ || rate <= 0) {
+		pr_warn("must have 0<input_sampling_rate<=HZ=%d\n", HZ);
+		return -EINVAL;
+	}
+	ret = hdaps_set_ec_config(rate*oversampling_ratio,
+				  running_avg_filter_order);
+	if (ret)
+		return ret;
+	sampling_rate = rate;
+	return count;
+}
+
+static ssize_t hdaps_oversampling_ratio_show(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	int ec_rate, order;
+	int ret = hdaps_get_ec_config(&ec_rate, &order);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%u\n", ec_rate / sampling_rate);
+}
+
+static ssize_t hdaps_oversampling_ratio_store(
+	struct device *dev, struct device_attribute *attr,
+	const char *buf, size_t count)
+{
+	int ratio, ret;
+	if (sscanf(buf, "%d", &ratio) != 1 || ratio < 1)
+		return -EINVAL;
+	ret = hdaps_set_ec_config(sampling_rate*ratio,
+				  running_avg_filter_order);
+	if (ret)
+		return ret;
+	oversampling_ratio = ratio;
+	return count;
+}
+
+static ssize_t hdaps_running_avg_filter_order_show(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	int rate, order;
+	int ret = hdaps_get_ec_config(&rate, &order);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%u\n", order);
+}
+
+static ssize_t hdaps_running_avg_filter_order_store(
+	struct device *dev, struct device_attribute *attr,
+	const char *buf, size_t count)
+{
+	int order, ret;
+	if (sscanf(buf, "%d", &order) != 1)
+		return -EINVAL;
+	ret = hdaps_set_ec_config(sampling_rate*oversampling_ratio, order);
+	if (ret)
+		return ret;
+	running_avg_filter_order = order;
+	return count;
+}
+
+static int hdaps_mousedev_open(struct input_dev *dev)
+{
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	mutex_lock(&hdaps_users_mtx);
+	if (hdaps_users++ == 0) /* first input user */
+		mod_timer(&hdaps_timer, jiffies + HZ/sampling_rate);
+	mutex_unlock(&hdaps_users_mtx);
+	return 0;
+}
+
+static void hdaps_mousedev_close(struct input_dev *dev)
+{
+	mutex_lock(&hdaps_users_mtx);
+	if (--hdaps_users == 0) /* no input users left */
+		del_timer_sync(&hdaps_timer);
+	mutex_unlock(&hdaps_users_mtx);
+
+	module_put(THIS_MODULE);
+}
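+
+/*
+ * Input device lifecycle: the poll timer runs only while at least one of the
+ * two input devices is open. The first open arms hdaps_timer to fire every
+ * HZ/sampling_rate jiffies and the last close (or a suspend) disarms it, so
+ * an idle system does not keep polling the EC.
+ */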
+
 static DEVICE_ATTR(position, 0444, hdaps_position_show, NULL);
-static DEVICE_ATTR(variance, 0444, hdaps_variance_show, NULL);
 static DEVICE_ATTR(temp1, 0444, hdaps_temp1_show, NULL);
-static DEVICE_ATTR(temp2, 0444, hdaps_temp2_show, NULL);
-static DEVICE_ATTR(keyboard_activity, 0444, hdaps_keyboard_activity_show, NULL);
+  /* "temp1" instead of "temperature" is hwmon convention */
+static DEVICE_ATTR(keyboard_activity, 0444,
+		   hdaps_keyboard_activity_show, NULL);
 static DEVICE_ATTR(mouse_activity, 0444, hdaps_mouse_activity_show, NULL);
-static DEVICE_ATTR(calibrate, 0644, hdaps_calibrate_show,hdaps_calibrate_store);
+static DEVICE_ATTR(calibrate, 0644,
+		   hdaps_calibrate_show, hdaps_calibrate_store);
 static DEVICE_ATTR(invert, 0644, hdaps_invert_show, hdaps_invert_store);
+static DEVICE_ATTR(sampling_rate, 0644,
+		   hdaps_sampling_rate_show, hdaps_sampling_rate_store);
+static DEVICE_ATTR(oversampling_ratio, 0644,
+		   hdaps_oversampling_ratio_show,
+		   hdaps_oversampling_ratio_store);
+static DEVICE_ATTR(running_avg_filter_order, 0644,
+		   hdaps_running_avg_filter_order_show,
+		   hdaps_running_avg_filter_order_store);
 
 static struct attribute *hdaps_attributes[] = {
 	&dev_attr_position.attr,
-	&dev_attr_variance.attr,
 	&dev_attr_temp1.attr,
-	&dev_attr_temp2.attr,
 	&dev_attr_keyboard_activity.attr,
 	&dev_attr_mouse_activity.attr,
 	&dev_attr_calibrate.attr,
 	&dev_attr_invert.attr,
+	&dev_attr_sampling_rate.attr,
+	&dev_attr_oversampling_ratio.attr,
+	&dev_attr_running_avg_filter_order.attr,
 	NULL,
 };
 
@@ -482,84 +710,78 @@ static struct attribute_group hdaps_attribute_group = {
 
 /* Module stuff */
 
-/* hdaps_dmi_match - found a match.  return one, short-circuiting the hunt. */
-static int __init hdaps_dmi_match(const struct dmi_system_id *id)
-{
-	pr_info("%s detected\n", id->ident);
-	return 1;
-}
-
 /* hdaps_dmi_match_invert - found an inverted match. */
 static int __init hdaps_dmi_match_invert(const struct dmi_system_id *id)
 {
-	hdaps_invert = (unsigned long)id->driver_data;
-	pr_info("inverting axis (%u) readings\n", hdaps_invert);
-	return hdaps_dmi_match(id);
+	unsigned int orient = (kernel_ulong_t) id->driver_data;
+	hdaps_invert = orient;
+	pr_info("%s detected, setting orientation %u\n", id->ident, orient);
+	return 1; /* stop enumeration */
 }
 
-#define HDAPS_DMI_MATCH_INVERT(vendor, model, axes) {	\
+#define HDAPS_DMI_MATCH_INVERT(vendor, model, orient) { \
 	.ident = vendor " " model,			\
 	.callback = hdaps_dmi_match_invert,		\
-	.driver_data = (void *)axes,			\
+	.driver_data = (void *)(orient),		\
 	.matches = {					\
 		DMI_MATCH(DMI_BOARD_VENDOR, vendor),	\
 		DMI_MATCH(DMI_PRODUCT_VERSION, model)	\
 	}						\
 }
 
-#define HDAPS_DMI_MATCH_NORMAL(vendor, model)		\
-	HDAPS_DMI_MATCH_INVERT(vendor, model, 0)
-
-/* Note that HDAPS_DMI_MATCH_NORMAL("ThinkPad T42") would match
-   "ThinkPad T42p", so the order of the entries matters.
-   If your ThinkPad is not recognized, please update to latest
-   BIOS. This is especially the case for some R52 ThinkPads. */
-static struct dmi_system_id __initdata hdaps_whitelist[] = {
-	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R50"),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R51"),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad R52"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61i", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T41"),
-	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T42"),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad T43"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61p", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad X40"),
-	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_Y_AXIS),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61s", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_NORMAL("IBM", "ThinkPad Z60m"),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61m", HDAPS_BOTH_AXES),
-	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad Z61p", HDAPS_BOTH_AXES),
+/* List of models with abnormal axis configuration.
+   Note that an entry matching "ThinkPad T42" would also match
+   "ThinkPad T42p", and enumeration stops after the first match,
+   so the order of the entries matters. */
+static struct dmi_system_id __initdata hdaps_whitelist[] = {
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R50p", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T41p", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad T42p", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X40", HDAPS_ORIENT_INVERT_Y),
+	HDAPS_DMI_MATCH_INVERT("IBM", "ThinkPad X41", HDAPS_ORIENT_INVERT_Y),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R60", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R61", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R400", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad R500", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T60", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T61", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60 Tablet", HDAPS_ORIENT_INVERT_Y),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60s", HDAPS_ORIENT_INVERT_Y),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X60", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X61", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400s", HDAPS_ORIENT_INVERT_X),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T400", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410s", HDAPS_ORIENT_SWAP),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T410", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T500", HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad T510", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | HDAPS_ORIENT_INVERT_Y),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad W510", HDAPS_ORIENT_MAX),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad W520", HDAPS_ORIENT_MAX),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X200", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X | HDAPS_ORIENT_INVERT_Y),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201 Tablet", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201s", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_XY),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X201", HDAPS_ORIENT_SWAP | HDAPS_ORIENT_INVERT_X),
+	HDAPS_DMI_MATCH_INVERT("LENOVO", "ThinkPad X220", HDAPS_ORIENT_SWAP),
 	{ .ident = NULL }
 };
 
 static int __init hdaps_init(void)
 {
-	struct input_dev *idev;
 	int ret;
 
-	if (!dmi_check_system(hdaps_whitelist)) {
-		pr_warn("supported laptop not found!\n");
-		ret = -ENODEV;
-		goto out;
-	}
-
-	if (!request_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS, "hdaps")) {
-		ret = -ENXIO;
-		goto out;
-	}
+	/* Determine axis orientation */
+	if (hdaps_invert == HDAPS_ORIENT_UNDEFINED) /* set by module param? */
+		if (dmi_check_system(hdaps_whitelist) < 1) /* in whitelist? */
+			hdaps_invert = 0; /* default */
 
+	/* Init timer before platform_driver_register, in case of suspend */
+	init_timer(&hdaps_timer);
+	hdaps_timer.function = hdaps_mousedev_poll;
 	ret = platform_driver_register(&hdaps_driver);
 	if (ret)
-		goto out_region;
+		goto out;
 
 	pdev = platform_device_register_simple("hdaps", -1, NULL, 0);
 	if (IS_ERR(pdev)) {
@@ -571,47 +793,79 @@ static int __init hdaps_init(void)
 	if (ret)
 		goto out_device;
 
-	hdaps_idev = input_allocate_polled_device();
+	hdaps_idev = input_allocate_device();
 	if (!hdaps_idev) {
 		ret = -ENOMEM;
 		goto out_group;
 	}
 
-	hdaps_idev->poll = hdaps_mousedev_poll;
-	hdaps_idev->poll_interval = HDAPS_POLL_INTERVAL;
-
-	/* initial calibrate for the input device */
-	hdaps_calibrate();
+	hdaps_idev_raw = input_allocate_device();
+	if (!hdaps_idev_raw) {
+		ret = -ENOMEM;
+		goto out_idev_first;
+	}
 
-	/* initialize the input class */
-	idev = hdaps_idev->input;
-	idev->name = "hdaps";
-	idev->phys = "isa1600/input0";
-	idev->id.bustype = BUS_ISA;
-	idev->dev.parent = &pdev->dev;
-	idev->evbit[0] = BIT_MASK(EV_ABS);
-	input_set_abs_params(idev, ABS_X,
+	/* calibration for the input device (deferred to avoid delay) */
+	needs_calibration = 1;
+
+	/* initialize the joystick-like fuzzed input device */
+	hdaps_idev->name = "ThinkPad HDAPS joystick emulation";
+	hdaps_idev->phys = "hdaps/input0";
+	hdaps_idev->id.bustype = BUS_HOST;
+	hdaps_idev->id.vendor  = HDAPS_INPUT_VENDOR;
+	hdaps_idev->id.product = HDAPS_INPUT_PRODUCT;
+	hdaps_idev->id.version = HDAPS_INPUT_JS_VERSION;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+	hdaps_idev->cdev.dev = &pdev->dev;
+#endif
+	hdaps_idev->evbit[0] = BIT(EV_ABS);
+	hdaps_idev->open = hdaps_mousedev_open;
+	hdaps_idev->close = hdaps_mousedev_close;
+	input_set_abs_params(hdaps_idev, ABS_X,
 			-256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT);
-	input_set_abs_params(idev, ABS_Y,
+	input_set_abs_params(hdaps_idev, ABS_Y,
 			-256, 256, HDAPS_INPUT_FUZZ, HDAPS_INPUT_FLAT);
 
-	ret = input_register_polled_device(hdaps_idev);
+	ret = input_register_device(hdaps_idev);
 	if (ret)
 		goto out_idev;
 
-	pr_info("driver successfully loaded\n");
+	/* initialize the raw data input device */
+	hdaps_idev_raw->name = "ThinkPad HDAPS accelerometer data";
+	hdaps_idev_raw->phys = "hdaps/input1";
+	hdaps_idev_raw->id.bustype = BUS_HOST;
+	hdaps_idev_raw->id.vendor  = HDAPS_INPUT_VENDOR;
+	hdaps_idev_raw->id.product = HDAPS_INPUT_PRODUCT;
+	hdaps_idev_raw->id.version = HDAPS_INPUT_RAW_VERSION;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,25)
+	hdaps_idev_raw->cdev.dev = &pdev->dev;
+#endif
+	hdaps_idev_raw->evbit[0] = BIT(EV_ABS);
+	hdaps_idev_raw->open = hdaps_mousedev_open;
+	hdaps_idev_raw->close = hdaps_mousedev_close;
+	input_set_abs_params(hdaps_idev_raw, ABS_X, -32768, 32767, 0, 0);
+	input_set_abs_params(hdaps_idev_raw, ABS_Y, -32768, 32767, 0, 0);
+
+	ret = input_register_device(hdaps_idev_raw);
+	if (ret)
+		goto out_idev_reg_first;
+
+	pr_info("driver successfully loaded.\n");
 	return 0;
 
+out_idev_reg_first:
+	input_unregister_device(hdaps_idev);
 out_idev:
-	input_free_polled_device(hdaps_idev);
+	input_free_device(hdaps_idev_raw);
+out_idev_first:
+	input_free_device(hdaps_idev);
 out_group:
 	sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group);
 out_device:
 	platform_device_unregister(pdev);
 out_driver:
 	platform_driver_unregister(&hdaps_driver);
-out_region:
-	release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS);
+	hdaps_device_shutdown();
 out:
 	pr_warn("driver init failed (ret=%d)!\n", ret);
 	return ret;
@@ -619,12 +873,12 @@ out:
 
 static void __exit hdaps_exit(void)
 {
-	input_unregister_polled_device(hdaps_idev);
-	input_free_polled_device(hdaps_idev);
+	input_unregister_device(hdaps_idev_raw);
+	input_unregister_device(hdaps_idev);
+	hdaps_device_shutdown(); /* ignore errors, effect is negligible */
 	sysfs_remove_group(&pdev->dev.kobj, &hdaps_attribute_group);
 	platform_device_unregister(pdev);
 	platform_driver_unregister(&hdaps_driver);
-	release_region(HDAPS_LOW_PORT, HDAPS_NR_PORTS);
 
 	pr_info("driver unloaded\n");
 }
@@ -632,9 +886,8 @@ static void __exit hdaps_exit(void)
 module_init(hdaps_init);
 module_exit(hdaps_exit);
 
-module_param_named(invert, hdaps_invert, int, 0);
-MODULE_PARM_DESC(invert, "invert data along each axis. 1 invert x-axis, "
-		 "2 invert y-axis, 3 invert both axes.");
+module_param_named(invert, hdaps_invert, uint, 0);
+MODULE_PARM_DESC(invert, "axis orientation code");
 
 MODULE_AUTHOR("Robert Love");
 MODULE_DESCRIPTION("IBM Hard Drive Active Protection System (HDAPS) driver");
diff --git b/drivers/platform/x86/thinkpad_ec.c b/drivers/platform/x86/thinkpad_ec.c
new file mode 100644
index 0000000..597614b
--- /dev/null
+++ b/drivers/platform/x86/thinkpad_ec.c
@@ -0,0 +1,513 @@
+/*
+ *  thinkpad_ec.c - ThinkPad embedded controller LPC3 functions
+ *
+ *  The embedded controller on ThinkPad laptops has a non-standard interface,
+ *  where LPC channel 3 of the H8S EC chip is hooked up to IO ports
+ *  0x1600-0x161F and implements (a special case of) the H8S LPC protocol.
+ *  The EC LPC interface provides various system management services (currently
+ *  known: battery information and accelerometer readouts). This driver
+ *  provides access and mutual exclusion for the EC interface.
+ *
+ *  The LPC protocol and terminology are documented here:
+ *  "H8S/2104B Group Hardware Manual",
+ *  http://documentation.renesas.com/eng/products/mpumcu/rej09b0300_2140bhm.pdf
+ *
+ *  Copyright (C) 2006-2007 Shem Multinymous <multinymous@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/dmi.h>
+#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/thinkpad_ec.h>
+#include <linux/jiffies.h>
+#include <asm/io.h>
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
+	#include <asm/semaphore.h>
+#else
+	#include <linux/semaphore.h>
+#endif
+
+#define TP_VERSION "0.42"
+
+MODULE_AUTHOR("Shem Multinymous");
+MODULE_DESCRIPTION("ThinkPad embedded controller hardware access");
+MODULE_VERSION(TP_VERSION);
+MODULE_LICENSE("GPL");
+
+/* IO ports used by embedded controller LPC channel 3: */
+#define TPC_BASE_PORT 0x1600
+#define TPC_NUM_PORTS 0x20
+#define TPC_STR3_PORT 0x1604  /* Reads H8S EC register STR3 */
+#define TPC_TWR0_PORT  0x1610 /* Mapped to H8S EC register TWR0MW/SW  */
+#define TPC_TWR15_PORT 0x161F /* Mapped to H8S EC register TWR15. */
+  /* (and port TPC_TWR0_PORT+i is mapped to H8S reg TWRi for 0<i<16) */
+
+/* H8S STR3 status flags (see "H8S/2104B Group Hardware Manual" p.549) */
+#define H8S_STR3_IBF3B 0x80  /* Bidi. Data Register Input Buffer Full */
+#define H8S_STR3_OBF3B 0x40  /* Bidi. Data Register Output Buffer Full */
+#define H8S_STR3_MWMF  0x20  /* Master Write Mode Flag */
+#define H8S_STR3_SWMF  0x10  /* Slave Write Mode Flag */
+#define H8S_STR3_MASK  0xF0  /* All bits we care about in STR3 */
+
+/* Timeouts and retries */
+#define TPC_READ_RETRIES     150
+#define TPC_READ_NDELAY      500
+#define TPC_REQUEST_RETRIES 1000
+#define TPC_REQUEST_NDELAY    10
+#define TPC_PREFETCH_TIMEOUT   (HZ/10)  /* invalidate prefetch after 0.1sec */
+
+/* A few macros for printk()ing: */
+#define MSG_FMT(fmt, args...) \
+  "thinkpad_ec: %s: " fmt "\n", __func__, ## args
+#define REQ_FMT(msg, code) \
+  MSG_FMT("%s: (0x%02x:0x%02x)->0x%02x", \
+	  msg, args->val[0x0], args->val[0xF], code)
+
+/* State of request prefetching: */
+static u8 prefetch_arg0, prefetch_argF;           /* Args of last prefetch */
+static u64 prefetch_jiffies;                      /* time of prefetch, or: */
+#define TPC_PREFETCH_NONE   INITIAL_JIFFIES       /*   No prefetch */
+#define TPC_PREFETCH_JUNK   (INITIAL_JIFFIES+1)   /*   Ignore prefetch */
+
+/* Locking: */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+static DECLARE_MUTEX(thinkpad_ec_mutex);
+#else
+static DEFINE_SEMAPHORE(thinkpad_ec_mutex);
+#endif
+
+/* Kludge in case the ACPI DSDT reserves the ports we need. */
+static bool force_io;    /* Willing to do IO to ports we couldn't reserve? */
+static int reserved_io; /* Successfully reserved the ports? */
+module_param_named(force_io, force_io, bool, 0600);
+MODULE_PARM_DESC(force_io, "Force IO even if region already reserved (0=off, 1=on)");
+
+/**
+ * thinkpad_ec_lock - get lock on the ThinkPad EC
+ *
+ * Get exclusive lock for accessing the ThinkPad embedded controller LPC3
+ * interface. Returns 0 iff lock acquired.
+ */
+int thinkpad_ec_lock(void)
+{
+	int ret;
+	ret = down_interruptible(&thinkpad_ec_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_lock);
+
+/**
+ * thinkpad_ec_try_lock - try getting lock on the ThinkPad EC
+ *
+ * Try getting an exclusive lock for accessing the ThinkPad embedded
+ * controller LPC3. Returns immediately if lock is not available; neither
+ * blocks nor sleeps. Returns 0 iff lock acquired.
+ */
+int thinkpad_ec_try_lock(void)
+{
+	return down_trylock(&thinkpad_ec_mutex);
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_try_lock);
+
+/**
+ * thinkpad_ec_unlock - release lock on ThinkPad EC
+ *
+ * Release a previously acquired exclusive lock on the ThinkPad embedded
+ * controller LPC3 interface.
+ */
+void thinkpad_ec_unlock(void)
+{
+	up(&thinkpad_ec_mutex);
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_unlock);
+
+/**
+ * thinkpad_ec_request_row - tell embedded controller to prepare a row
+ * @args: Input register arguments
+ *
+ * Requests a data row by writing to H8S LPC registers TWR0 through TWR15 (or
+ * a subset thereof) following the protocol prescribed by the "H8S/2104B Group
+ * Hardware Manual". Does sanity checks via status register STR3.
+ */
+static int thinkpad_ec_request_row(const struct thinkpad_ec_row *args)
+{
+	u8 str3;
+	int i;
+
+	/* EC protocol requires write to TWR0 (function code): */
+	if (!(args->mask & 0x0001)) {
+		printk(KERN_ERR MSG_FMT("bad args->mask=0x%02x", args->mask));
+		return -EINVAL;
+	}
+
+	/* Check initial STR3 status: */
+	str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK;
+	if (str3 & H8S_STR3_OBF3B) { /* data already pending */
+		inb(TPC_TWR15_PORT); /* marks end of previous transaction */
+		if (prefetch_jiffies == TPC_PREFETCH_NONE)
+			printk(KERN_WARNING REQ_FMT(
+			       "EC has result from unrequested transaction",
+			       str3));
+		return -EBUSY; /* EC will be ready in a few usecs */
+	} else if (str3 == H8S_STR3_SWMF) { /* busy with previous request */
+		if (prefetch_jiffies == TPC_PREFETCH_NONE)
+			printk(KERN_WARNING REQ_FMT(
+			       "EC is busy with unrequested transaction",
+			       str3));
+		return -EBUSY; /* data will be pending in a few usecs */
+	} else if (str3 != 0x00) { /* unexpected status? */
+		printk(KERN_WARNING REQ_FMT("unexpected initial STR3", str3));
+		return -EIO;
+	}
+
+	/* Send TWR0MW: */
+	outb(args->val[0], TPC_TWR0_PORT);
+	str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK;
+	if (str3 != H8S_STR3_MWMF) { /* not accepted? */
+		printk(KERN_WARNING REQ_FMT("arg0 rejected", str3));
+		return -EIO;
+	}
+
+	/* Send TWR1 through TWR14: */
+	for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++)
+		if ((args->mask>>i)&1)
+			outb(args->val[i], TPC_TWR0_PORT+i);
+
+	/* Send TWR15 (default to 0x01). This marks end of command. */
+	outb((args->mask & 0x8000) ? args->val[0xF] : 0x01, TPC_TWR15_PORT);
+
+	/* Wait until EC starts writing its reply (~60ns on average).
+	 * Releasing locks before this happens may cause an EC hang
+	 * due to firmware bug!
+	 */
+	for (i = 0; i < TPC_REQUEST_RETRIES; i++) {
+		str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK;
+		if (str3 & H8S_STR3_SWMF) /* EC started replying */
+			return 0;
+		else if (!(str3 & ~(H8S_STR3_IBF3B|H8S_STR3_MWMF)))
+			/* Normal progress (the EC hasn't seen the request
+			 * yet, or is processing it). Wait it out. */
+			ndelay(TPC_REQUEST_NDELAY);
+		else { /* weird EC status */
+			printk(KERN_WARNING
+			       REQ_FMT("bad end STR3", str3));
+			return -EIO;
+		}
+	}
+	printk(KERN_WARNING REQ_FMT("EC is mysteriously silent", str3));
+	return -EIO;
+}
+
+/**
+ * thinkpad_ec_read_data - read pre-requested row-data from EC
+ * @args: Input register arguments of pre-requested rows
+ * @data: Output register values
+ *
+ * Reads current row data from the controller, assuming it's already
+ * requested. Follows the H8S spec for register access and status checks.
+ */
+static int thinkpad_ec_read_data(const struct thinkpad_ec_row *args,
+				 struct thinkpad_ec_row *data)
+{
+	int i;
+	u8 str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK;
+	/* Once we make a request, STR3 assumes the sequence of values listed
+	 * in the following 'if' as it reads the request and writes its data.
+	 * It takes a few dozen nanoseconds in total, with very high variance.
+	 */
+	if (str3 == (H8S_STR3_IBF3B|H8S_STR3_MWMF) ||
+	    str3 == 0x00 ||  /* the 0x00 is indistinguishable from idle EC! */
+	    str3 == H8S_STR3_SWMF)
+		return -EBUSY; /* not ready yet */
+	/* Finally, the EC signals output buffer full: */
+	if (str3 != (H8S_STR3_OBF3B|H8S_STR3_SWMF)) {
+		printk(KERN_WARNING
+		       REQ_FMT("bad initial STR3", str3));
+		return -EIO;
+	}
+
+	/* Read first byte (signals start of read transactions): */
+	data->val[0] = inb(TPC_TWR0_PORT);
+	/* Optionally read 14 more bytes: */
+	for (i = 1; i < TP_CONTROLLER_ROW_LEN-1; i++)
+		if ((data->mask >> i)&1)
+			data->val[i] = inb(TPC_TWR0_PORT+i);
+	/* Read last byte from 0x161F (signals end of read transaction): */
+	data->val[0xF] = inb(TPC_TWR15_PORT);
+
+	/* Readout still pending? */
+	str3 = inb(TPC_STR3_PORT) & H8S_STR3_MASK;
+	if (str3 & H8S_STR3_OBF3B)
+		printk(KERN_WARNING
+		       REQ_FMT("OBF3B=1 after read", str3));
+	/* If port 0x161F returns 0x80 too often, the EC may lock up. Warn: */
+	if (data->val[0xF] == 0x80)
+		printk(KERN_WARNING
+		       REQ_FMT("0x161F reports error", data->val[0xF]));
+	return 0;
+}
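+
+/*
+ * Summary of the STR3 handshake implemented by the two functions above: idle
+ * is 0x00; writing TWR0MW raises MWMF; while the EC consumes the request,
+ * STR3 shows some combination of IBF3B and MWMF (possibly 0x00 again); once
+ * the EC starts writing its reply it raises SWMF; and OBF3B|SWMF means the
+ * reply is ready, which the final read of TWR15 (port 0x161F) acknowledges.
+ */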
+
+/**
+ * thinkpad_ec_is_row_fetched - is the given row currently prefetched?
+ *
+ * To keep things simple we compare only the first and last args;
+ * this suffices for all known cases.
+ */
+static int thinkpad_ec_is_row_fetched(const struct thinkpad_ec_row *args)
+{
+	return (prefetch_jiffies != TPC_PREFETCH_NONE) &&
+	       (prefetch_jiffies != TPC_PREFETCH_JUNK) &&
+	       (prefetch_arg0 == args->val[0]) &&
+	       (prefetch_argF == args->val[0xF]) &&
+	       (get_jiffies_64() < prefetch_jiffies + TPC_PREFETCH_TIMEOUT);
+}
+
+/**
+ * thinkpad_ec_read_row - request and read data from ThinkPad EC
+ * @args: Input register arguments
+ * @data: Output register values
+ *
+ * Read a data row from the ThinkPad embedded controller LPC3 interface.
+ * Does fetching and retrying if needed. The row is specified by an
+ * array of 16 bytes, some of which may be undefined (but the first is
+ * mandatory). These bytes are given in @args->val[], where @args->val[i] is
+ * used iff ((@args->mask>>i)&1). The resulting row data is stored in
+ * @data->val[], but is only guaranteed to be valid for indices corresponding
+ * to set bits in @data->mask. That is, if @data->mask&(1<<i)==0 then
+ * @data->val[i] is undefined.
+ *
+ * Returns -EBUSY on transient error and -EIO on abnormal condition.
+ * Caller must hold controller lock.
+ */
+int thinkpad_ec_read_row(const struct thinkpad_ec_row *args,
+			 struct thinkpad_ec_row *data)
+{
+	int retries, ret;
+
+	if (thinkpad_ec_is_row_fetched(args))
+		goto read_row; /* already requested */
+
+	/* Request the row */
+	for (retries = 0; retries < TPC_READ_RETRIES; ++retries) {
+		ret = thinkpad_ec_request_row(args);
+		if (!ret)
+			goto read_row;
+		if (ret != -EBUSY)
+			break;
+		ndelay(TPC_READ_NDELAY);
+	}
+	printk(KERN_ERR REQ_FMT("failed requesting row", ret));
+	goto out;
+
+read_row:
+	/* Read the row's data */
+	for (retries = 0; retries < TPC_READ_RETRIES; ++retries) {
+		ret = thinkpad_ec_read_data(args, data);
+		if (!ret)
+			goto out;
+		if (ret != -EBUSY)
+			break;
+		ndelay(TPC_READ_NDELAY);
+	}
+
+	printk(KERN_ERR REQ_FMT("failed waiting for data", ret));
+
+out:
+	prefetch_jiffies = TPC_PREFETCH_JUNK;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_read_row);
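+
+/*
+ * Typical usage of the row interface (this is the pattern hdaps.c follows,
+ * e.g. in hdaps_get_ec_mode()): set bit i of .mask for every .val[i] passed
+ * in or wanted back.
+ *
+ *	struct thinkpad_ec_row args = { .mask = 0x0001, .val = {0x13} };
+ *	struct thinkpad_ec_row data = { .mask = 0x8002 };
+ *	if (!thinkpad_ec_lock()) {
+ *		int err = thinkpad_ec_read_row(&args, &data);
+ *		thinkpad_ec_unlock();
+ *		if (!err)
+ *			... data.val[0x1] and data.val[0xF] are now valid ...
+ *	}
+ */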
+
+/**
+ * thinkpad_ec_try_read_row - try reading prefetched data from ThinkPad EC
+ * @args: Input register arguments
+ * @data: Output register values
+ *
+ * Try reading a data row from the ThinkPad embedded controller LPC3
+ * interface, if this row was recently prefetched using
+ * thinkpad_ec_prefetch_row(). Does not fetch, retry or block.
+ * The parameters have the same meaning as in thinkpad_ec_read_row().
+ *
+ * Returns -EBUSY if data is not ready and -ENODATA if row not prefetched.
+ * Caller must hold controller lock.
+ */
+int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args,
+			     struct thinkpad_ec_row *data)
+{
+	int ret;
+	if (!thinkpad_ec_is_row_fetched(args)) {
+		ret = -ENODATA;
+	} else {
+		ret = thinkpad_ec_read_data(args, data);
+		if (!ret)
+			prefetch_jiffies = TPC_PREFETCH_NONE; /* eaten up */
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_try_read_row);
+
+/**
+ * thinkpad_ec_prefetch_row - prefetch data from ThinkPad EC
+ * @args: Input register arguments
+ *
+ * Prefetch a data row from the ThinkPad embedded controller LPC3
+ * interface. A subsequent call to thinkpad_ec_read_row() with the
+ * same arguments will be faster, and a subsequent call to
+ * thinkpad_ec_try_read_row() stands a good chance of succeeding if
+ * done neither too soon nor too late. See
+ * thinkpad_ec_read_row() for the meaning of @args.
+ *
+ * Returns -EBUSY on transient error and -EIO on abnormal condition.
+ * Caller must hold controller lock.
+ */
+int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args)
+{
+	int ret;
+	ret = thinkpad_ec_request_row(args);
+	if (ret) {
+		prefetch_jiffies = TPC_PREFETCH_JUNK;
+	} else {
+		prefetch_jiffies = get_jiffies_64();
+		prefetch_arg0 = args->val[0x0];
+		prefetch_argF = args->val[0xF];
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_prefetch_row);
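+
+/*
+ * The prefetch/try_read pair exists for atomic contexts: hdaps.c prefetches
+ * the accelerometer row at the end of each update and, on the next timer
+ * tick, calls thinkpad_ec_try_read_row() under thinkpad_ec_try_lock(), so the
+ * softirq poll path never has to sleep waiting for the EC.
+ */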
+
+/**
+ * thinkpad_ec_invalidate - invalidate prefetched ThinkPad EC data
+ *
+ * Invalidate the data prefetched via thinkpad_ec_prefetch_row() from the
+ * ThinkPad embedded controller LPC3 interface.
+ * Must be called before unlocking by any code that accesses the controller
+ * ports directly.
+ */
+void thinkpad_ec_invalidate(void)
+{
+	prefetch_jiffies = TPC_PREFETCH_JUNK;
+}
+EXPORT_SYMBOL_GPL(thinkpad_ec_invalidate);
+
+
+/*** Checking for EC hardware ***/
+
+/**
+ * thinkpad_ec_test - verify the EC is present and follows protocol
+ *
+ * Ensure the EC LPC3 channel really works on this machine by making
+ * an EC request and seeing if the EC follows the documented H8S protocol.
+ * The requested row just reads battery status, so it should be harmless to
+ * access it (on a correct EC).
+ * This test writes to IO ports, so execute only after checking DMI.
+ */
+static int __init thinkpad_ec_test(void)
+{
+	int ret;
+	const struct thinkpad_ec_row args = /* battery 0 basic status */
+	  { .mask = 0x8001, .val = {0x01,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0x00} };
+	struct thinkpad_ec_row data = { .mask = 0x0000 };
+	ret = thinkpad_ec_lock();
+	if (ret)
+		return ret;
+	ret = thinkpad_ec_read_row(&args, &data);
+	thinkpad_ec_unlock();
+	return ret;
+}
+
+/* Search all DMI device names of a given type for a substring */
+static int __init dmi_find_substring(int type, const char *substr)
+{
+	const struct dmi_device *dev = NULL;
+	while ((dev = dmi_find_device(type, NULL, dev))) {
+		if (strstr(dev->name, substr))
+			return 1;
+	}
+	return 0;
+}
+
+#define TP_DMI_MATCH(vendor,model)	{		\
+	.ident = vendor " " model,			\
+	.matches = {					\
+		DMI_MATCH(DMI_BOARD_VENDOR, vendor),	\
+		DMI_MATCH(DMI_PRODUCT_VERSION, model)	\
+	}						\
+}
+
+/* Check DMI for existence of ThinkPad embedded controller */
+static int __init check_dmi_for_ec(void)
+{
+	/* A few old models that have a good EC but don't report it in DMI */
+	struct dmi_system_id tp_whitelist[] = {
+		TP_DMI_MATCH("IBM", "ThinkPad A30"),
+		TP_DMI_MATCH("IBM", "ThinkPad T23"),
+		TP_DMI_MATCH("IBM", "ThinkPad X24"),
+		TP_DMI_MATCH("LENOVO", "ThinkPad"),
+		{ .ident = NULL }
+	};
+	return dmi_find_substring(DMI_DEV_TYPE_OEM_STRING,
+				  "IBM ThinkPad Embedded Controller") ||
+	       dmi_check_system(tp_whitelist);
+}
+
+/*** Init and cleanup ***/
+
+static int __init thinkpad_ec_init(void)
+{
+	if (!check_dmi_for_ec()) {
+		printk(KERN_WARNING
+		       "thinkpad_ec: no ThinkPad embedded controller!\n");
+		return -ENODEV;
+	}
+
+	if (request_region(TPC_BASE_PORT, TPC_NUM_PORTS, "thinkpad_ec")) {
+		reserved_io = 1;
+	} else {
+		printk(KERN_ERR "thinkpad_ec: cannot claim IO ports %#x-%#x... ",
+		       TPC_BASE_PORT,
+		       TPC_BASE_PORT + TPC_NUM_PORTS - 1);
+		if (force_io) {
+			printk("forcing use of unreserved IO ports.\n");
+		} else {
+			printk("consider using force_io=1.\n");
+			return -ENXIO;
+		}
+	}
+	prefetch_jiffies = TPC_PREFETCH_JUNK;
+	if (thinkpad_ec_test()) {
+		printk(KERN_ERR "thinkpad_ec: initial ec test failed\n");
+		if (reserved_io)
+			release_region(TPC_BASE_PORT, TPC_NUM_PORTS);
+		return -ENXIO;
+	}
+	printk(KERN_INFO "thinkpad_ec: thinkpad_ec " TP_VERSION " loaded.\n");
+	return 0;
+}
+
+static void __exit thinkpad_ec_exit(void)
+{
+	if (reserved_io)
+		release_region(TPC_BASE_PORT, TPC_NUM_PORTS);
+	printk(KERN_INFO "thinkpad_ec: unloaded.\n");
+}
+
+module_init(thinkpad_ec_init);
+module_exit(thinkpad_ec_exit);
diff --git b/drivers/platform/x86/tp_smapi.c b/drivers/platform/x86/tp_smapi.c
new file mode 100644
index 0000000..209cb64
--- /dev/null
+++ b/drivers/platform/x86/tp_smapi.c
@@ -0,0 +1,1493 @@
+/*
+ *  tp_smapi.c - ThinkPad SMAPI support
+ *
+ *  This driver exposes some features of the System Management Application
+ *  Program Interface (SMAPI) BIOS found on ThinkPad laptops. It works on
+ *  models in which the SMAPI BIOS runs in SMM and is invoked by writing
+ *  to the APM control port 0xB2.
+ *  It also exposes battery status information, obtained from the ThinkPad
+ *  embedded controller (via the thinkpad_ec module).
+ *  Ancient ThinkPad models use a different interface, supported by the
+ *  "thinkpad" module from "tpctl".
+ *
+ *  Many of the battery status values obtained from the EC simply mirror
+ *  values provided by the battery's Smart Battery System (SBS) interface, so
+ *  their meaning is defined by the Smart Battery Data Specification (see
+ *  http://sbs-forum.org/specs/sbdat110.pdf). References to this SBS spec
+ *  are given in the code where relevant.
+ *
+ *  Copyright (C) 2006 Shem Multinymous <multinymous@gmail.com>.
+ *  SMAPI access code based on the mwave driver by Mike Sullivan.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/proc_fs.h>
+#include <linux/mc146818rtc.h>	/* CMOS defines */
+#include <linux/delay.h>
+#include <linux/version.h>
+#include <linux/thinkpad_ec.h>
+#include <linux/platform_device.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+
+#define TP_VERSION "0.42"
+#define TP_DESC "ThinkPad SMAPI Support"
+#define TP_DIR "smapi"
+
+MODULE_AUTHOR("Shem Multinymous");
+MODULE_DESCRIPTION(TP_DESC);
+MODULE_VERSION(TP_VERSION);
+MODULE_LICENSE("GPL");
+
+static struct platform_device *pdev;
+
+static int tp_debug;
+module_param_named(debug, tp_debug, int, 0600);
+MODULE_PARM_DESC(debug, "Debug level (0=off, 1=on)");
+
+/* A few macros for printk()ing: */
+#define TPRINTK(level, fmt, args...) \
+  dev_printk(level, &(pdev->dev), "%s: " fmt "\n", __func__, ## args)
+#define DPRINTK(fmt, args...) \
+  do { if (tp_debug) TPRINTK(KERN_DEBUG, fmt, ## args); } while (0)
+
+/*********************************************************************
+ * SMAPI interface
+ */
+
+/* SMAPI functions (register BX when making the SMM call). */
+#define SMAPI_GET_INHIBIT_CHARGE                0x2114
+#define SMAPI_SET_INHIBIT_CHARGE                0x2115
+#define SMAPI_GET_THRESH_START                  0x2116
+#define SMAPI_SET_THRESH_START                  0x2117
+#define SMAPI_GET_FORCE_DISCHARGE               0x2118
+#define SMAPI_SET_FORCE_DISCHARGE               0x2119
+#define SMAPI_GET_THRESH_STOP                   0x211a
+#define SMAPI_SET_THRESH_STOP                   0x211b
+
+/* SMAPI error codes (see ThinkPad 770 Technical Reference Manual p.83 at
+ http://www-307.ibm.com/pc/support/site.wss/document.do?lndocid=PFAN-3TUQQD */
+#define SMAPI_RETCODE_EOF 0xff
+static struct { u8 rc; char *msg; int ret; } smapi_retcode[] =
+{
+	{0x00, "OK", 0},
+	{0x53, "SMAPI function is not available", -ENXIO},
+	{0x81, "Invalid parameter", -EINVAL},
+	{0x86, "Function is not supported by SMAPI BIOS", -EOPNOTSUPP},
+	{0x90, "System error", -EIO},
+	{0x91, "System is invalid", -EIO},
+	{0x92, "System is busy", -EBUSY},
+	{0xa0, "Device error (disk read error)", -EIO},
+	{0xa1, "Device is busy", -EBUSY},
+	{0xa2, "Device is not attached", -ENXIO},
+	{0xa3, "Device is disabled", -EIO},
+	{0xa4, "Request parameter is out of range", -EINVAL},
+	{0xa5, "Request parameter is not accepted", -EINVAL},
+	{0xa6, "Transient error", -EBUSY}, /* ? */
+	{SMAPI_RETCODE_EOF, "Unknown error code", -EIO}
+};
+
+
+#define SMAPI_MAX_RETRIES 10
+#define SMAPI_PORT2 0x4F           /* fixed port, meaning unclear */
+static unsigned short smapi_port;  /* APM control port, normally 0xB2 */
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+static DECLARE_MUTEX(smapi_mutex);
+#else
+static DEFINE_SEMAPHORE(smapi_mutex);
+#endif
+
+/**
+ * find_smapi_port - read SMAPI port from NVRAM
+ */
+static int __init find_smapi_port(void)
+{
+	u16 smapi_id = 0;
+	unsigned short port = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rtc_lock, flags);
+	smapi_id = CMOS_READ(0x7C);
+	smapi_id |= (CMOS_READ(0x7D) << 8);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
+	if (smapi_id != 0x5349) {
+		printk(KERN_ERR "SMAPI not supported (ID=0x%x)\n", smapi_id);
+		return -ENXIO;
+	}
+	spin_lock_irqsave(&rtc_lock, flags);
+	port = CMOS_READ(0x7E);
+	port |= (CMOS_READ(0x7F) << 8);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+	if (port == 0) {
+		printk(KERN_ERR "unable to read SMAPI port number\n");
+		return -ENXIO;
+	}
+	return port;
+}
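+
+/*
+ * Note: the signature 0x5349 checked above corresponds to the ASCII bytes
+ * 'I' (0x49) at CMOS offset 0x7C and 'S' (0x53) at 0x7D, presumably the
+ * SMAPI signature; the port number itself follows at 0x7E/0x7F.
+ */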
+
+/**
+ * smapi_request - make a SMAPI call
+ * @inEBX, @inECX, @inEDI, @inESI: input registers
+ * @outEBX, @outECX, @outEDX, @outEDI, @outESI: output registers
+ * @msg: textual error message
+ * Invokes the SMAPI SMBIOS with the given input and output args.
+ * All outputs are optional (can be %NULL).
+ * Returns 0 when successful, and a negative errno constant
+ * (see smapi_retcode above) upon failure.
+ */
+static int smapi_request(u32 inEBX, u32 inECX,
+			 u32 inEDI, u32 inESI,
+			 u32 *outEBX, u32 *outECX, u32 *outEDX,
+			 u32 *outEDI, u32 *outESI, const char **msg)
+{
+	int ret = 0;
+	int i;
+	int retries;
+	u8 rc;
+	/* Must use local vars for output regs, due to reg pressure. */
+	u32 tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI;
+
+	for (retries = 0; retries < SMAPI_MAX_RETRIES; ++retries) {
+		DPRINTK("req_in: BX=%x CX=%x DI=%x SI=%x",
+			inEBX, inECX, inEDI, inESI);
+
+		/* SMAPI's SMBIOS call and thinkpad_ec end up using
+		 * different interfaces to the same chip, so play it safe. */
+		ret = thinkpad_ec_lock();
+		if (ret)
+			return ret;
+
+		__asm__ __volatile__(
+			"movl  $0x00005380,%%eax\n\t"
+			"movl  %6,%%ebx\n\t"
+			"movl  %7,%%ecx\n\t"
+			"movl  %8,%%edi\n\t"
+			"movl  %9,%%esi\n\t"
+			"xorl  %%edx,%%edx\n\t"
+			"movw  %10,%%dx\n\t"
+			"out   %%al,%%dx\n\t"  /* trigger SMI to SMBIOS */
+			"out   %%al,$0x4F\n\t"
+			"movl  %%eax,%0\n\t"
+			"movl  %%ebx,%1\n\t"
+			"movl  %%ecx,%2\n\t"
+			"movl  %%edx,%3\n\t"
+			"movl  %%edi,%4\n\t"
+			"movl  %%esi,%5\n\t"
+			:"=m"(tmpEAX),
+			 "=m"(tmpEBX),
+			 "=m"(tmpECX),
+			 "=m"(tmpEDX),
+			 "=m"(tmpEDI),
+			 "=m"(tmpESI)
+			:"m"(inEBX), "m"(inECX), "m"(inEDI), "m"(inESI),
+			 "m"((u16)smapi_port)
+			:"%eax", "%ebx", "%ecx", "%edx", "%edi",
+			 "%esi");
+
+		thinkpad_ec_invalidate();
+		thinkpad_ec_unlock();
+
+		/* Don't let the next SMAPI access happen too quickly,
+		 * as that may cause problems. (We're holding smapi_mutex.) */
+		msleep(50);
+
+		if (outEBX) *outEBX = tmpEBX;
+		if (outECX) *outECX = tmpECX;
+		if (outEDX) *outEDX = tmpEDX;
+		if (outESI) *outESI = tmpESI;
+		if (outEDI) *outEDI = tmpEDI;
+
+		/* Look up error code */
+		rc = (tmpEAX>>8)&0xFF;
+		for (i = 0; smapi_retcode[i].rc != SMAPI_RETCODE_EOF &&
+			    smapi_retcode[i].rc != rc; ++i) {}
+		ret = smapi_retcode[i].ret;
+		if (msg)
+			*msg = smapi_retcode[i].msg;
+
+		DPRINTK("req_out: AX=%x BX=%x CX=%x DX=%x DI=%x SI=%x r=%d",
+			 tmpEAX, tmpEBX, tmpECX, tmpEDX, tmpEDI, tmpESI, ret);
+		if (ret)
+			TPRINTK(KERN_NOTICE, "SMAPI error: %s (func=%x)",
+				smapi_retcode[i].msg, inEBX);
+
+		if (ret != -EBUSY)
+			return ret;
+	}
+	return ret;
+}
+
+/* Convenience wrapper: discard output arguments */
+static int smapi_write(u32 inEBX, u32 inECX,
+		       u32 inEDI, u32 inESI, const char **msg)
+{
+	return smapi_request(inEBX, inECX, inEDI, inESI,
+			     NULL, NULL, NULL, NULL, NULL, msg);
+}
+
+
+/*********************************************************************
+ * Specific SMAPI services
+ * All of these functions return 0 upon success, and a negative errno
+ * constant (see smapi_retcode) on failure.
+ */
+
+enum thresh_type {
+	THRESH_STOP  = 0, /* the code assumes this is 0 for brevity */
+	THRESH_START
+};
+#define THRESH_NAME(which) ((which == THRESH_START) ? "start" : "stop")
+
+/**
+ * __get_real_thresh - read battery charge start/stop threshold from SMAPI
+ * @bat:    battery number (0 or 1)
+ * @which:  THRESH_START or THRESH_STOP
+ * @thresh: 1..99, 0=default (passes as-is to SMAPI)
+ * @outEDI: some additional state that needs to be preserved, meaning unknown
+ * @outESI: some additional state that needs to be preserved, meaning unknown
+ */
+static int __get_real_thresh(int bat, enum thresh_type which, int *thresh,
+			     u32 *outEDI, u32 *outESI)
+{
+	u32 ebx = (which == THRESH_START) ? SMAPI_GET_THRESH_START
+					  : SMAPI_GET_THRESH_STOP;
+	u32 ecx = (bat+1)<<8;
+	const char *msg;
+	int ret = smapi_request(ebx, ecx, 0, 0, NULL,
+				&ecx, NULL, outEDI, outESI, &msg);
+	if (ret) {
+		TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: %s",
+			THRESH_NAME(which), bat, msg);
+		return ret;
+	}
+	if (!(ecx&0x00000100)) {
+		TPRINTK(KERN_NOTICE, "cannot get %s_thresh of bat=%d: ecx=0x%x",
+			THRESH_NAME(which), bat, ecx);
+		return -EIO;
+	}
+	if (thresh)
+		*thresh = ecx&0xFF;
+	return 0;
+}
+
+/**
+ * get_real_thresh - read battery charge start/stop threshold from SMAPI
+ * @bat:    battery number (0 or 1)
+ * @which:  THRESH_START or THRESH_STOP
+ * @thresh: 1..99, 0=default (passes as-is to SMAPI)
+ */
+static int get_real_thresh(int bat, enum thresh_type which, int *thresh)
+{
+	return __get_real_thresh(bat, which, thresh, NULL, NULL);
+}
+
+/**
+ * set_real_thresh - write battery start/stop charge threshold to SMAPI
+ * @bat:    battery number (0 or 1)
+ * @which:  THRESH_START or THRESH_STOP
+ * @thresh: 1..99, 0=default (passes as-is to SMAPI)
+ */
+static int set_real_thresh(int bat, enum thresh_type which, int thresh)
+{
+	u32 ebx = (which == THRESH_START) ? SMAPI_SET_THRESH_START
+					  : SMAPI_SET_THRESH_STOP;
+	u32 ecx = ((bat+1)<<8) + thresh;
+	u32 getDI, getSI;
+	const char *msg;
+	int ret;
+
+	/* verify read before writing */
+	ret = __get_real_thresh(bat, which, NULL, &getDI, &getSI);
+	if (ret)
+		return ret;
+
+	ret = smapi_write(ebx, ecx, getDI, getSI, &msg);
+	if (ret)
+		TPRINTK(KERN_NOTICE, "set %s to %d for bat=%d failed: %s",
+			THRESH_NAME(which), thresh, bat, msg);
+	else
+		TPRINTK(KERN_INFO, "set %s to %d for bat=%d",
+			THRESH_NAME(which), thresh, bat);
+	return ret;
+}
+
+/**
+ * __get_inhibit_charge_minutes - get inhibit charge period from SMAPI
+ * @bat:     battery number (0 or 1)
+ * @minutes: period in minutes (1..65535 minutes, 0=disabled)
+ * @outECX: some additional state that needs to be preserved, meaning unknown
+ * Note that @minutes is the originally set value, it does not count down.
+ */
+static int __get_inhibit_charge_minutes(int bat, int *minutes, u32 *outECX)
+{
+	u32 ecx = (bat+1)<<8;
+	u32 esi;
+	const char *msg;
+	int ret = smapi_request(SMAPI_GET_INHIBIT_CHARGE, ecx, 0, 0,
+				NULL, &ecx, NULL, NULL, &esi, &msg);
+	if (ret) {
+		TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg);
+		return ret;
+	}
+	if (!(ecx&0x0100)) {
+		TPRINTK(KERN_NOTICE, "bad ecx=0x%x for bat=%d", ecx, bat);
+		return -EIO;
+	}
+	if (minutes)
+		*minutes = (ecx&0x0001)?esi:0;
+	if (outECX)
+		*outECX = ecx;
+	return 0;
+}
+
+/**
+ * get_inhibit_charge_minutes - get inhibit charge period from SMAPI
+ * @bat:     battery number (0 or 1)
+ * @minutes: period in minutes (1..65535 minutes, 0=disabled)
+ * Note that @minutes is the originally set value, it does not count down.
+ */
+static int get_inhibit_charge_minutes(int bat, int *minutes)
+{
+	return __get_inhibit_charge_minutes(bat, minutes, NULL);
+}
+
+/**
+ * set_inhibit_charge_minutes - write inhibit charge period to SMAPI
+ * @bat:     battery number (0 or 1)
+ * @minutes: period in minutes (1..65535 minutes, 0=disabled)
+ */
+static int set_inhibit_charge_minutes(int bat, int minutes)
+{
+	u32 ecx;
+	const char *msg;
+	int ret;
+
+	/* verify read before writing */
+	ret = __get_inhibit_charge_minutes(bat, NULL, &ecx);
+	if (ret)
+		return ret;
+
+	ecx = ((bat+1)<<8) | (ecx&0x00FE) | (minutes > 0 ? 0x0001 : 0x0000);
+	if (minutes > 0xFFFF)
+		minutes = 0xFFFF;
+	ret = smapi_write(SMAPI_SET_INHIBIT_CHARGE, ecx, 0, minutes, &msg);
+	if (ret)
+		TPRINTK(KERN_NOTICE,
+			"set to %d failed for bat=%d: %s", minutes, bat, msg);
+	else
+		TPRINTK(KERN_INFO, "set to %d for bat=%d\n", minutes, bat);
+	return ret;
+}
+
+
+/**
+ * get_force_discharge - get status of forced discharging from SMAPI
+ * @bat:     battery number (0 or 1)
+ * @enabled: 1 if forced discharge is enabled, 0 if not
+ */
+static int get_force_discharge(int bat, int *enabled)
+{
+	u32 ecx = (bat+1)<<8;
+	const char *msg;
+	int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0,
+				NULL, &ecx, NULL, NULL, NULL, &msg);
+	if (ret) {
+		TPRINTK(KERN_NOTICE, "failed for bat=%d: %s", bat, msg);
+		return ret;
+	}
+	*enabled = (!(ecx&0x00000100) && (ecx&0x00000001))?1:0;
+	return 0;
+}
+
+/**
+ * set_force_discharge - write status of forced discharging to SMAPI
+ * @bat:     battery number (0 or 1)
+ * @enabled: 1 if forced discharge is enabled, 0 if not
+ */
+static int set_force_discharge(int bat, int enabled)
+{
+	u32 ecx = (bat+1)<<8;
+	const char *msg;
+	int ret = smapi_request(SMAPI_GET_FORCE_DISCHARGE, ecx, 0, 0,
+				NULL, &ecx, NULL, NULL, NULL, &msg);
+	if (ret) {
+		TPRINTK(KERN_NOTICE, "get failed for bat=%d: %s", bat, msg);
+		return ret;
+	}
+	if (ecx&0x00000100) {
+		TPRINTK(KERN_NOTICE, "cannot force discharge bat=%d", bat);
+		return -EIO;
+	}
+
+	ecx = ((bat+1)<<8) | (ecx&0x000000FA) | (enabled?0x00000001:0);
+	ret = smapi_write(SMAPI_SET_FORCE_DISCHARGE, ecx, 0, 0, &msg);
+	if (ret)
+		TPRINTK(KERN_NOTICE, "set to %d failed for bat=%d: %s",
+			enabled, bat, msg);
+	else
+		TPRINTK(KERN_INFO, "set to %d for bat=%d", enabled, bat);
+	return ret;
+}
+
+
+/*********************************************************************
+ * Wrappers to threshold-related SMAPI functions, which handle default
+ * thresholds and related quirks.
+ */
+
+/* Minimum, default and minimum difference for battery charging thresholds: */
+#define MIN_THRESH_DELTA      4  /* Min delta between start and stop thresh */
+#define MIN_THRESH_START      2
+#define MAX_THRESH_START      (100-MIN_THRESH_DELTA)
+#define MIN_THRESH_STOP       (MIN_THRESH_START + MIN_THRESH_DELTA)
+#define MAX_THRESH_STOP       100
+#define DEFAULT_THRESH_START  MAX_THRESH_START
+#define DEFAULT_THRESH_STOP   MAX_THRESH_STOP
+
+/* The GUI of IBM's Battery Maximizer seems to show a start threshold that
+ * is 1 more than the value we set/get via SMAPI. Since the threshold is
+ * maintained across reboot, this can be confusing. So we kludge our
+ * interface for interoperability: */
+#define BATMAX_FIX   1
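+/* Illustrative example: writing 76 as a start threshold via sysfs stores
+ * 75 (76 - BATMAX_FIX) in SMAPI; reading it back yields 75 + 1 = 76,
+ * matching what Battery Maximizer displays. */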
+
+/* Get charge start/stop threshold (1..100),
+ * substituting default values if needed and applying BATMAX_FIX. */
+static int get_thresh(int bat, enum thresh_type which, int *thresh)
+{
+	int ret = get_real_thresh(bat, which, thresh);
+	if (ret)
+		return ret;
+	if (*thresh == 0)
+		*thresh = (which == THRESH_START) ? DEFAULT_THRESH_START
+						  : DEFAULT_THRESH_STOP;
+	else if (which == THRESH_START)
+		*thresh += BATMAX_FIX;
+	return 0;
+}
+
+
+/* Set charge start/stop threshold (1..100),
+ * substituting default values if needed and applying BATMAX_FIX. */
+static int set_thresh(int bat, enum thresh_type which, int thresh)
+{
+	if (which == THRESH_STOP && thresh == DEFAULT_THRESH_STOP)
+		thresh = 0; /* 100 is out of range, but default means 100 */
+	if (which == THRESH_START)
+		thresh -= BATMAX_FIX;
+	return set_real_thresh(bat, which, thresh);
+}
+
+/*********************************************************************
+ * ThinkPad embedded controller readout and basic functions
+ */
+
+/**
+ * read_tp_ec_row - read data row from the ThinkPad embedded controller
+ * @arg0: EC command code
+ * @bat: battery number, 0 or 1
+ * @j: the byte value to be used for "junk" (unused) input/outputs
+ * @dataval: result vector
+ */
+static int read_tp_ec_row(u8 arg0, int bat, u8 j, u8 *dataval)
+{
+	int ret;
+	const struct thinkpad_ec_row args = { .mask = 0xFFFF,
+		.val = {arg0, j,j,j,j,j,j,j,j,j,j,j,j,j,j, (u8)bat} };
+	struct thinkpad_ec_row data = { .mask = 0xFFFF };
+
+	ret = thinkpad_ec_lock();
+	if (ret)
+		return ret;
+	ret = thinkpad_ec_read_row(&args, &data);
+	thinkpad_ec_unlock();
+	memcpy(dataval, &data.val, TP_CONTROLLER_ROW_LEN);
+	return ret;
+}
+
+/**
+ * power_device_present - check for presence of battery or AC power
+ * @bat: 0 for battery 0, 1 for battery 1, otherwise AC power
+ * Returns 1 if present, 0 if not present, negative if error.
+ */
+static int power_device_present(int bat)
+{
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	u8 test;
+	int ret = read_tp_ec_row(1, bat, 0, row);
+	if (ret)
+		return ret;
+	switch (bat) {
+	case 0:  test = 0x40; break; /* battery 0 */
+	case 1:  test = 0x20; break; /* battery 1 */
+	default: test = 0x80;        /* AC power */
+	}
+	return (row[0] & test) ? 1 : 0;
+}
+
+/**
+ * bat_has_status - check if battery can report detailed status
+ * @bat: 0 for battery 0, 1 for battery 1
+ * Returns 1 if yes, 0 if no, negative if error.
+ */
+static int bat_has_status(int bat)
+{
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	int ret = read_tp_ec_row(1, bat, 0, row);
+	if (ret)
+		return ret;
+	if ((row[0] & (bat?0x20:0x40)) == 0) /* no battery */
+		return 0;
+	if ((row[1] & (0x60)) == 0) /* no status */
+		return 0;
+	return 1;
+}
+
+/**
+ * get_tp_ec_bat_16 - read a 16-bit value from EC battery status data
+ * @arg0: first argument to EC
+ * @offset: offset in row returned from EC
+ * @bat: battery (0 or 1)
+ * @val: the 16-bit value obtained
+ * Returns 0 on success and a negative errno on error.
+ */
+static int get_tp_ec_bat_16(u8 arg0, int offset, int bat, u16 *val)
+{
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	int ret;
+	if (bat_has_status(bat) != 1)
+		return -ENXIO;
+	ret = read_tp_ec_row(arg0, bat, 0, row);
+	if (ret)
+		return ret;
+	*val = *(u16 *)(row+offset);
+	return 0;
+}
+
+/*********************************************************************
+ * sysfs attributes for batteries -
+ * definitions and helper functions
+ */
+
+/* A custom device attribute struct which holds a battery number */
+struct bat_device_attribute {
+	struct device_attribute dev_attr;
+	int bat;
+};
+
+/**
+ * attr_get_bat - get the battery to which the attribute belongs
+ */
+static int attr_get_bat(struct device_attribute *attr)
+{
+	return container_of(attr, struct bat_device_attribute, dev_attr)->bat;
+}
+
+/**
+ * show_tp_ec_bat_u16 - show an unsigned 16-bit battery attribute
+ * @arg0: first argument for the raw EC read
+ * @offset: byte offset in EC raw data
+ * @mul: correction factor to multiply by
+ * @na_msg: string to output if the value is not available (0xFFFF)
+ * @attr: battery attribute
+ * @buf: output buffer
+ * The 16-bit value is read from the EC, treated as unsigned,
+ * transformed as x->mul*x, and printed to the buffer.
+ * If the value is 0xFFFF and na_msg!=%NULL, na_msg is printed instead.
+ */
+static ssize_t show_tp_ec_bat_u16(u8 arg0, int offset, int mul,
+			      const char *na_msg,
+			      struct device_attribute *attr, char *buf)
+{
+	u16 val;
+	int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val);
+	if (ret)
+		return ret;
+	if (na_msg && val == 0xFFFF)
+		return sprintf(buf, "%s\n", na_msg);
+	else
+		return sprintf(buf, "%u\n", mul*(unsigned int)val);
+}
+
+/**
+ * show_tp_ec_bat_s16 - show a signed 16-bit battery attribute
+ * @arg0: first argument for the raw EC read
+ * @offset: byte offset in EC raw data
+ * @mul: correction factor to multiply by
+ * @add: correction term to add after multiplication
+ * @attr: battery attribute
+ * @buf: output buffer
+ * The 16-bit value is read from the EC, treated as signed,
+ * transformed as x->mul*x+add, and printed to the buffer.
+ */
+static ssize_t show_tp_ec_bat_s16(u8 arg0, int offset, int mul, int add,
+			      struct device_attribute *attr, char *buf)
+{
+	u16 val;
+	int ret = get_tp_ec_bat_16(arg0, offset, attr_get_bat(attr), &val);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%d\n", mul*(s16)val+add);
+}
+
+/**
+ * show_tp_ec_bat_str - show a string from EC battery status data
+ * @arg0: first argument for the raw EC read
+ * @offset: byte offset in EC raw data
+ * @maxlen: maximum string length
+ * @attr: battery attribute
+ * @buf: output buffer
+ */
+static ssize_t show_tp_ec_bat_str(u8 arg0, int offset, int maxlen,
+			      struct device_attribute *attr, char *buf)
+{
+	int bat = attr_get_bat(attr);
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	int ret;
+	if (bat_has_status(bat) != 1)
+		return -ENXIO;
+	ret = read_tp_ec_row(arg0, bat, 0, row);
+	if (ret)
+		return ret;
+	strncpy(buf, (char *)row+offset, maxlen);
+	buf[maxlen] = 0;
+	strcat(buf, "\n");
+	return strlen(buf);
+}
+
+/**
+ * show_tp_ec_bat_power - show a power readout from EC battery status data
+ * @arg0: first argument for the raw EC read
+ * @offV: byte offset of voltage in EC raw data
+ * @offI: byte offset of current in EC raw data
+ * @attr: battery attribute
+ * @buf: output buffer
+ * Computes the power as current*voltage from the two given readout offsets.
+ */
+static ssize_t show_tp_ec_bat_power(u8 arg0, int offV, int offI,
+				struct device_attribute *attr, char *buf)
+{
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	int milliamp, millivolt, ret;
+	int bat = attr_get_bat(attr);
+	if (bat_has_status(bat) != 1)
+		return -ENXIO;
+	ret = read_tp_ec_row(1, bat, 0, row);
+	if (ret)
+		return ret;
+	millivolt = *(u16 *)(row+offV);
+	milliamp = *(s16 *)(row+offI);
+	return sprintf(buf, "%d\n", milliamp*millivolt/1000); /* units: mW */
+}
+
+/**
+ * show_tp_ec_bat_date - decode and show a date from EC battery status data
+ * @arg0: first argument for the raw EC read
+ * @offset: byte offset in EC raw data
+ * @attr: battery attribute
+ * @buf: output buffer
+ */
+static ssize_t show_tp_ec_bat_date(u8 arg0, int offset,
+			       struct device_attribute *attr, char *buf)
+{
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	u16 v;
+	int ret;
+	int day, month, year;
+	int bat = attr_get_bat(attr);
+	if (bat_has_status(bat) != 1)
+		return -ENXIO;
+	ret = read_tp_ec_row(arg0, bat, 0, row);
+	if (ret)
+		return ret;
+
+	/* Decode bit-packed: v = day | (month<<5) | ((year-1980)<<9) */
+	v = *(u16 *)(row+offset);
+	day = v & 0x1F;
+	month = (v >> 5) & 0xF;
+	year = (v >> 9) + 1980;
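+	/* E.g. (illustrative value): v = 0x34A3 decodes to 2006-05-03. */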
+
+	return sprintf(buf, "%04d-%02d-%02d\n", year, month, day);
+}
+
+
+/*********************************************************************
+ * sysfs attribute I/O for batteries -
+ * the actual attribute show/store functions
+ */
+
+static ssize_t show_battery_start_charge_thresh(struct device *dev,
+	struct device_attribute *attr, char *buf)
+{
+	int thresh;
+	int bat = attr_get_bat(attr);
+	int ret = get_thresh(bat, THRESH_START, &thresh);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%d\n", thresh);  /* units: percent */
+}
+
+static ssize_t show_battery_stop_charge_thresh(struct device *dev,
+	struct device_attribute *attr, char *buf)
+{
+	int thresh;
+	int bat = attr_get_bat(attr);
+	int ret = get_thresh(bat, THRESH_STOP, &thresh);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%d\n", thresh);  /* units: percent */
+}
+
+/**
+ * store_battery_start_charge_thresh - store battery_start_charge_thresh attr
+ * Since this is a kernel<->user interface, we ensure a valid state for
+ * the hardware. We do this by clamping the requested threshold to the
+ * valid range and, if necessary, moving the other threshold so that
+ * it's MIN_THRESH_DELTA away from this one.
+ */
+static ssize_t store_battery_start_charge_thresh(struct device *dev,
+	struct device_attribute *attr, const char *buf, size_t count)
+{
+	int thresh, other_thresh, ret;
+	int bat = attr_get_bat(attr);
+
+	if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100)
+		return -EINVAL;
+
+	if (thresh < MIN_THRESH_START) /* clamp up to MIN_THRESH_START */
+		thresh = MIN_THRESH_START;
+	if (thresh > MAX_THRESH_START) /* clamp down to MAX_THRESH_START */
+		thresh = MAX_THRESH_START;
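+	/* Illustrative example: a request of 98 is clamped to 96
+	 * (MAX_THRESH_START); if the stop threshold was 97 (< 96 + 4), it is
+	 * raised to 100 below before the start threshold is written. */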
+
+	down(&smapi_mutex);
+	ret = get_thresh(bat, THRESH_STOP, &other_thresh);
+	if (ret != -EOPNOTSUPP && ret != -ENXIO) {
+		if (ret) /* error reading the other threshold */
+			goto out;
+		ret = get_real_thresh(bat, THRESH_START, NULL);
+		if (ret) /* error reading this threshold */
+			goto out;
+		if (other_thresh < thresh+MIN_THRESH_DELTA) {
+			/* move other thresh to keep it above this one */
+			ret = set_thresh(bat, THRESH_STOP,
+					 thresh+MIN_THRESH_DELTA);
+			if (ret)
+				goto out;
+		}
+	}
+	ret = set_thresh(bat, THRESH_START, thresh);
+out:
+	up(&smapi_mutex);
+	return count;
+}
+
+/**
+ * store_battery_stop_charge_thresh - store battery_stop_charge_thresh attr
+ * Since this is a kernel<->user interface, we ensure a valid state for
+ * the hardware. We do this by clamping the requested threshold to the
+ * valid range and, if necessary, moving the other threshold so that
+ * it's MIN_THRESH_DELTA away from this one.
+ */
+static ssize_t store_battery_stop_charge_thresh(struct device *dev,
+	struct device_attribute *attr, const char *buf, size_t count)
+{
+	int thresh, other_thresh, ret;
+	int bat = attr_get_bat(attr);
+
+	if (sscanf(buf, "%d", &thresh) != 1 || thresh < 1 || thresh > 100)
+		return -EINVAL;
+
+	if (thresh < MIN_THRESH_STOP) /* clamp up to MIN_THRESH_STOP */
+		thresh = MIN_THRESH_STOP;
+
+	down(&smapi_mutex);
+	ret = get_thresh(bat, THRESH_START, &other_thresh);
+	if (ret != -EOPNOTSUPP && ret != -ENXIO) { /* other threshold exists? */
+		if (ret)
+			goto out;
+		/* this threshold exists? */
+		ret = get_real_thresh(bat, THRESH_STOP, NULL);
+		if (ret)
+			goto out;
+		if (other_thresh >= thresh-MIN_THRESH_DELTA) {
+			 /* move other thresh to be below this one */
+			ret = set_thresh(bat, THRESH_START,
+					 thresh-MIN_THRESH_DELTA);
+			if (ret)
+				goto out;
+		}
+	}
+	ret = set_thresh(bat, THRESH_STOP, thresh);
+out:
+	up(&smapi_mutex);
+	return count;
+}
+
+static ssize_t show_battery_inhibit_charge_minutes(struct device *dev,
+	struct device_attribute *attr, char *buf)
+{
+	int minutes;
+	int bat = attr_get_bat(attr);
+	int ret = get_inhibit_charge_minutes(bat, &minutes);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%d\n", minutes);  /* units: minutes */
+}
+
+static ssize_t store_battery_inhibit_charge_minutes(struct device *dev,
+				struct device_attribute *attr,
+				const char *buf, size_t count)
+{
+	int ret;
+	int minutes;
+	int bat = attr_get_bat(attr);
+	if (sscanf(buf, "%d", &minutes) != 1 || minutes < 0) {
+		TPRINTK(KERN_ERR, "inhibit_charge_minutes: "
+			      "must be a non-negative integer");
+		return -EINVAL;
+	}
+	ret = set_inhibit_charge_minutes(bat, minutes);
+	if (ret)
+		return ret;
+	return count;
+}
+
+static ssize_t show_battery_force_discharge(struct device *dev,
+	struct device_attribute *attr, char *buf)
+{
+	int enabled;
+	int bat = attr_get_bat(attr);
+	int ret = get_force_discharge(bat, &enabled);
+	if (ret)
+		return ret;
+	return sprintf(buf, "%d\n", enabled);  /* type: boolean */
+}
+
+static ssize_t store_battery_force_discharge(struct device *dev,
+	struct device_attribute *attr, const char *buf, size_t count)
+{
+	int ret;
+	int enabled;
+	int bat = attr_get_bat(attr);
+	if (sscanf(buf, "%d", &enabled) != 1 || enabled < 0 || enabled > 1)
+		return -EINVAL;
+	ret = set_force_discharge(bat, enabled);
+	if (ret)
+		return ret;
+	return count;
+}
+
+static ssize_t show_battery_installed(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	int bat = attr_get_bat(attr);
+	int ret = power_device_present(bat);
+	if (ret < 0)
+		return ret;
+	return sprintf(buf, "%d\n", ret); /* type: boolean */
+}
+
+static ssize_t show_battery_state(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	u8 row[TP_CONTROLLER_ROW_LEN];
+	const char *txt;
+	int ret;
+	int bat = attr_get_bat(attr);
+	if (bat_has_status(bat) != 1)
+		return sprintf(buf, "none\n");
+	ret = read_tp_ec_row(1, bat, 0, row);
+	if (ret)
+		return ret;
+	switch (row[1] & 0xf0) {
+	case 0xc0: txt = "idle"; break;
+	case 0xd0: txt = "discharging"; break;
+	case 0xe0: txt = "charging"; break;
+	default:   return sprintf(buf, "unknown (0x%x)\n", row[1]);
+	}
+	return sprintf(buf, "%s\n", txt);  /* type: string from fixed set */
+}
+
+static ssize_t show_battery_manufacturer(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: string. SBS spec v1.1 p34: ManufacturerName() */
+	return show_tp_ec_bat_str(4, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf);
+}
+
+static ssize_t show_battery_model(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: string. SBS spec v1.1 p34: DeviceName() */
+	return show_tp_ec_bat_str(5, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf);
+}
+
+static ssize_t show_battery_barcoding(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: string */
+	return show_tp_ec_bat_str(7, 2, TP_CONTROLLER_ROW_LEN-2, attr, buf);
+}
+
+static ssize_t show_battery_chemistry(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: string. SBS spec v1.1 p34-35: DeviceChemistry() */
+	return show_tp_ec_bat_str(6, 2, 5, attr, buf);
+}
+
+static ssize_t show_battery_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV. SBS spec v1.1 p24: Voltage() */
+	return show_tp_ec_bat_u16(1, 6, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_design_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV. SBS spec v1.1 p32: DesignVoltage() */
+	return show_tp_ec_bat_u16(3, 4, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_charging_max_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV. SBS spec v1.1 p37,39: ChargingVoltage() */
+	return show_tp_ec_bat_u16(9, 8, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_group0_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV */
+	return show_tp_ec_bat_u16(0xA, 12, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_group1_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV */
+	return show_tp_ec_bat_u16(0xA, 10, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_group2_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV */
+	return show_tp_ec_bat_u16(0xA, 8, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_group3_voltage(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mV */
+	return show_tp_ec_bat_u16(0xA, 6, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_current_now(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mA. SBS spec v1.1 p24: Current() */
+	return show_tp_ec_bat_s16(1, 8, 1, 0, attr, buf);
+}
+
+static ssize_t show_battery_current_avg(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mA. SBS spec v1.1 p24: AverageCurrent() */
+	return show_tp_ec_bat_s16(1, 10, 1, 0, attr, buf);
+}
+
+static ssize_t show_battery_charging_max_current(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mA. SBS spec v1.1 p36,38: ChargingCurrent() */
+	return show_tp_ec_bat_s16(9, 6, 1, 0, attr, buf);
+}
+
+static ssize_t show_battery_power_now(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mW. SBS spec v1.1: Voltage()*Current() */
+	return show_tp_ec_bat_power(1, 6, 8, attr, buf);
+}
+
+static ssize_t show_battery_power_avg(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mW. SBS spec v1.1: Voltage()*AverageCurrent() */
+	return show_tp_ec_bat_power(1, 6, 10, attr, buf);
+}
+
+static ssize_t show_battery_remaining_percent(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: percent. SBS spec v1.1 p25: RelativeStateOfCharge() */
+	return show_tp_ec_bat_u16(1, 12, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_remaining_percent_error(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: percent. SBS spec v1.1 p25: MaxError() */
+	return show_tp_ec_bat_u16(9, 4, 1, NULL, attr, buf);
+}
+
+static ssize_t show_battery_remaining_charging_time(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: minutes. SBS spec v1.1 p27: AverageTimeToFull() */
+	return show_tp_ec_bat_u16(2, 8, 1, "not_charging", attr, buf);
+}
+
+static ssize_t show_battery_remaining_running_time(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */
+	return show_tp_ec_bat_u16(2, 6, 1, "not_discharging", attr, buf);
+}
+
+static ssize_t show_battery_remaining_running_time_now(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: minutes. SBS spec v1.1 p27: RunTimeToEmpty() */
+	return show_tp_ec_bat_u16(2, 4, 1, "not_discharging", attr, buf);
+}
+
+static ssize_t show_battery_remaining_capacity(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mWh. SBS spec v1.1 p26. */
+	return show_tp_ec_bat_u16(1, 14, 10, "", attr, buf);
+}
+
+static ssize_t show_battery_last_full_capacity(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mWh. SBS spec v1.1 p26: FullChargeCapacity() */
+	return show_tp_ec_bat_u16(2, 2, 10, "", attr, buf);
+}
+
+static ssize_t show_battery_design_capacity(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: mWh. SBS spec v1.1 p32: DesignCapacity() */
+	return show_tp_ec_bat_u16(3, 2, 10, "", attr, buf);
+}
+
+static ssize_t show_battery_cycle_count(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: ordinal. SBS spec v1.1 p32: CycleCount() */
+	return show_tp_ec_bat_u16(2, 12, 1, "", attr, buf);
+}
+
+static ssize_t show_battery_temperature(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* units: millicelsius. SBS spec v1.1: Temperature()*10 */
+	return show_tp_ec_bat_s16(1, 4, 100, -273100, attr, buf);
+}
+
+static ssize_t show_battery_serial(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: int. SBS spec v1.1 p34: SerialNumber() */
+	return show_tp_ec_bat_u16(3, 10, 1, "", attr, buf);
+}
+
+static ssize_t show_battery_manufacture_date(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: YYYY-MM-DD. SBS spec v1.1 p34: ManufactureDate() */
+	return show_tp_ec_bat_date(3, 8, attr, buf);
+}
+
+static ssize_t show_battery_first_use_date(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	/* type: YYYY-MM-DD */
+	return show_tp_ec_bat_date(8, 2, attr, buf);
+}
+
+/**
+ * show_battery_dump - show the battery's dump attribute
+ * The dump attribute gives a hex dump of all EC readouts related to a
+ * battery. Some of the enumerated values don't really exist (i.e., the
+ * EC function just leaves them untouched); we use a kludge to detect and
+ * denote these.
+ */
+#define MIN_DUMP_ARG0 0x00
+#define MAX_DUMP_ARG0 0x0a /* 0x0b is useful too but hangs old EC firmware */
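+/* Output format: one line per EC argument 0x00..0x0a, each holding
+ * TP_CONTROLLER_ROW_LEN bytes printed as "xx ", with "-- " marking bytes the
+ * EC left untouched. */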
+static ssize_t show_battery_dump(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	int i;
+	char *p = buf;
+	int bat = attr_get_bat(attr);
+	u8 arg0; /* first argument to EC */
+	u8 rowa[TP_CONTROLLER_ROW_LEN],
+	   rowb[TP_CONTROLLER_ROW_LEN];
+	const u8 junka = 0xAA,
+		 junkb = 0x55; /* junk values for testing changes */
+	int ret;
+
+	for (arg0 = MIN_DUMP_ARG0; arg0 <= MAX_DUMP_ARG0; ++arg0) {
+		if ((p-buf) > PAGE_SIZE-TP_CONTROLLER_ROW_LEN*5)
+			return -ENOMEM; /* don't overflow sysfs buf */
+		/* Read raw twice with different junk values,
+		 * to detect unused output bytes which are left unchanged: */
+		ret = read_tp_ec_row(arg0, bat, junka, rowa);
+		if (ret)
+			return ret;
+		ret = read_tp_ec_row(arg0, bat, junkb, rowb);
+		if (ret)
+			return ret;
+		for (i = 0; i < TP_CONTROLLER_ROW_LEN; i++) {
+			if (rowa[i] == junka && rowb[i] == junkb)
+				p += sprintf(p, "-- "); /* unused by EC */
+			else
+				p += sprintf(p, "%02x ", rowa[i]);
+		}
+		p += sprintf(p, "\n");
+	}
+	return p-buf;
+}
+
+
+/*********************************************************************
+ * sysfs attribute I/O, other than batteries
+ */
+
+static ssize_t show_ac_connected(
+	struct device *dev, struct device_attribute *attr, char *buf)
+{
+	int ret = power_device_present(0xFF);
+	if (ret < 0)
+		return ret;
+	return sprintf(buf, "%d\n", ret);  /* type: boolean */
+}
+
+/*********************************************************************
+ * The "smapi_request" sysfs attribute executes a raw SMAPI call.
+ * You write to make a request and read to get the result. The state
+ * is saved globally rather than per fd (sysfs limitation), so
+ * simultaneous requests may get each other's results! So this is for
+ * development and debugging only.
+ */
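+/* Illustrative example: writing "2116 100 0 0" issues SMAPI_GET_THRESH_START
+ * for battery 0; the next read returns the five output registers, the errno
+ * value and the textual error message. */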
+#define MAX_SMAPI_ATTR_ANSWER_LEN   128
+static char smapi_attr_answer[MAX_SMAPI_ATTR_ANSWER_LEN] = "";
+
+static ssize_t show_smapi_request(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	int ret = snprintf(buf, PAGE_SIZE, "%s", smapi_attr_answer);
+	smapi_attr_answer[0] = '\0';
+	return ret;
+}
+
+static ssize_t store_smapi_request(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t count)
+{
+	unsigned int inEBX, inECX, inEDI, inESI;
+	u32 outEBX, outECX, outEDX, outEDI, outESI;
+	const char *msg;
+	int ret;
+	if (sscanf(buf, "%x %x %x %x", &inEBX, &inECX, &inEDI, &inESI) != 4) {
+		smapi_attr_answer[0] = '\0';
+		return -EINVAL;
+	}
+	ret = smapi_request(
+		   inEBX, inECX, inEDI, inESI,
+		   &outEBX, &outECX, &outEDX, &outEDI, &outESI, &msg);
+	snprintf(smapi_attr_answer, MAX_SMAPI_ATTR_ANSWER_LEN,
+		 "%x %x %x %x %x %d '%s'\n",
+		 (unsigned int)outEBX, (unsigned int)outECX,
+		 (unsigned int)outEDX, (unsigned int)outEDI,
+		 (unsigned int)outESI, ret, msg);
+	if (ret)
+		return ret;
+	else
+		return count;
+}
+
+/*********************************************************************
+ * Power management: the embedded controller forgets the battery
+ * thresholds when the system is suspended to disk and unplugged from
+ * AC and battery, so we restore it upon resume.
+ */
+
+static int saved_threshs[4] = {-1, -1, -1, -1};  /* -1 = don't know */
+
+static int tp_suspend(struct platform_device *dev, pm_message_t state)
+{
+	int restore = (state.event == PM_EVENT_HIBERNATE ||
+	               state.event == PM_EVENT_FREEZE);
+	if (!restore || get_real_thresh(0, THRESH_STOP , &saved_threshs[0]))
+		saved_threshs[0] = -1;
+	if (!restore || get_real_thresh(0, THRESH_START, &saved_threshs[1]))
+		saved_threshs[1] = -1;
+	if (!restore || get_real_thresh(1, THRESH_STOP , &saved_threshs[2]))
+		saved_threshs[2] = -1;
+	if (!restore || get_real_thresh(1, THRESH_START, &saved_threshs[3]))
+		saved_threshs[3] = -1;
+	DPRINTK("suspend saved: %d %d %d %d", saved_threshs[0],
+		saved_threshs[1], saved_threshs[2], saved_threshs[3]);
+	return 0;
+}
+
+static int tp_resume(struct platform_device *dev)
+{
+	DPRINTK("resume restoring: %d %d %d %d", saved_threshs[0],
+		saved_threshs[1], saved_threshs[2], saved_threshs[3]);
+	if (saved_threshs[0] >= 0)
+		set_real_thresh(0, THRESH_STOP , saved_threshs[0]);
+	if (saved_threshs[1] >= 0)
+		set_real_thresh(0, THRESH_START, saved_threshs[1]);
+	if (saved_threshs[2] >= 0)
+		set_real_thresh(1, THRESH_STOP , saved_threshs[2]);
+	if (saved_threshs[3] >= 0)
+		set_real_thresh(1, THRESH_START, saved_threshs[3]);
+	return 0;
+}
+
+
+/*********************************************************************
+ * Driver model
+ */
+
+static struct platform_driver tp_driver = {
+	.suspend = tp_suspend,
+	.resume = tp_resume,
+	.driver = {
+		.name = "smapi",
+		.owner = THIS_MODULE
+	},
+};
+
+
+/*********************************************************************
+ * Sysfs device model
+ */
+
+/* Attributes in /sys/devices/platform/smapi/ */
+
+static DEVICE_ATTR(ac_connected, 0444, show_ac_connected, NULL);
+static DEVICE_ATTR(smapi_request, 0600, show_smapi_request,
+					store_smapi_request);
+
+static struct attribute *tp_root_attributes[] = {
+	&dev_attr_ac_connected.attr,
+	&dev_attr_smapi_request.attr,
+	NULL
+};
+static struct attribute_group tp_root_attribute_group = {
+	.attrs = tp_root_attributes
+};
+
+/* Attributes under /sys/devices/platform/smapi/BAT{0,1}/ :
+ * Every attribute needs to be defined (i.e., statically allocated) for
+ * each battery, and then referenced in the attribute list of each battery.
+ * We use preprocessor voodoo to avoid duplicating the list of attributes 4
+ * times. The preprocessor output is just normal sysfs attributes code.
+ */
+
+/**
+ * FOREACH_BAT_ATTR - invoke the given macros on all our battery attributes
+ * @_BAT:     battery number (0 or 1)
+ * @_ATTR_RW: macro to invoke for each read/write attribute
+ * @_ATTR_R:  macro to invoke for each read-only  attribute
+ */
+#define FOREACH_BAT_ATTR(_BAT, _ATTR_RW, _ATTR_R) \
+	_ATTR_RW(_BAT, start_charge_thresh) \
+	_ATTR_RW(_BAT, stop_charge_thresh) \
+	_ATTR_RW(_BAT, inhibit_charge_minutes) \
+	_ATTR_RW(_BAT, force_discharge) \
+	_ATTR_R(_BAT, installed) \
+	_ATTR_R(_BAT, state) \
+	_ATTR_R(_BAT, manufacturer) \
+	_ATTR_R(_BAT, model) \
+	_ATTR_R(_BAT, barcoding) \
+	_ATTR_R(_BAT, chemistry) \
+	_ATTR_R(_BAT, voltage) \
+	_ATTR_R(_BAT, group0_voltage) \
+	_ATTR_R(_BAT, group1_voltage) \
+	_ATTR_R(_BAT, group2_voltage) \
+	_ATTR_R(_BAT, group3_voltage) \
+	_ATTR_R(_BAT, current_now) \
+	_ATTR_R(_BAT, current_avg) \
+	_ATTR_R(_BAT, charging_max_current) \
+	_ATTR_R(_BAT, power_now) \
+	_ATTR_R(_BAT, power_avg) \
+	_ATTR_R(_BAT, remaining_percent) \
+	_ATTR_R(_BAT, remaining_percent_error) \
+	_ATTR_R(_BAT, remaining_charging_time) \
+	_ATTR_R(_BAT, remaining_running_time) \
+	_ATTR_R(_BAT, remaining_running_time_now) \
+	_ATTR_R(_BAT, remaining_capacity) \
+	_ATTR_R(_BAT, last_full_capacity) \
+	_ATTR_R(_BAT, design_voltage) \
+	_ATTR_R(_BAT, charging_max_voltage) \
+	_ATTR_R(_BAT, design_capacity) \
+	_ATTR_R(_BAT, cycle_count) \
+	_ATTR_R(_BAT, temperature) \
+	_ATTR_R(_BAT, serial) \
+	_ATTR_R(_BAT, manufacture_date) \
+	_ATTR_R(_BAT, first_use_date) \
+	_ATTR_R(_BAT, dump)
+
+/* Define several macros we will feed into FOREACH_BAT_ATTR: */
+
+#define DEFINE_BAT_ATTR_RW(_BAT,_NAME) \
+	static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = {  \
+		.dev_attr = __ATTR(_NAME, 0644, show_battery_##_NAME,   \
+						store_battery_##_NAME), \
+		.bat = _BAT \
+	};
+
+#define DEFINE_BAT_ATTR_R(_BAT,_NAME) \
+	static struct bat_device_attribute dev_attr_##_NAME##_##_BAT = {    \
+		.dev_attr = __ATTR(_NAME, 0444, show_battery_##_NAME, NULL), \
+		.bat = _BAT \
+	};
+
+#define REF_BAT_ATTR(_BAT,_NAME) \
+	&dev_attr_##_NAME##_##_BAT.dev_attr.attr,
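+
+/* For example, DEFINE_BAT_ATTR_RW(0, start_charge_thresh) expands to a
+ * struct bat_device_attribute named dev_attr_start_charge_thresh_0 bound to
+ * show_battery_start_charge_thresh/store_battery_start_charge_thresh, and
+ * REF_BAT_ATTR(0, start_charge_thresh) references its embedded attribute. */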
+
+/* This provides all attributes for one battery: */
+
+#define PROVIDE_BAT_ATTRS(_BAT) \
+	FOREACH_BAT_ATTR(_BAT, DEFINE_BAT_ATTR_RW, DEFINE_BAT_ATTR_R) \
+	static struct attribute *tp_bat##_BAT##_attributes[] = { \
+		FOREACH_BAT_ATTR(_BAT, REF_BAT_ATTR, REF_BAT_ATTR) \
+		NULL \
+	}; \
+	static struct attribute_group tp_bat##_BAT##_attribute_group = { \
+		.name  = "BAT" #_BAT, \
+		.attrs = tp_bat##_BAT##_attributes \
+	};
+
+/* Finally, generate the attributes: */
+
+PROVIDE_BAT_ATTRS(0)
+PROVIDE_BAT_ATTRS(1)
+
+/* List of attribute groups */
+
+static struct attribute_group *attr_groups[] = {
+	&tp_root_attribute_group,
+	&tp_bat0_attribute_group,
+	&tp_bat1_attribute_group,
+	NULL
+};
+
+
+/*********************************************************************
+ * Init and cleanup
+ */
+
+static struct attribute_group **next_attr_group; /* next to register */
+
+static int __init tp_init(void)
+{
+	int ret;
+	printk(KERN_INFO "tp_smapi " TP_VERSION " loading...\n");
+
+	ret = find_smapi_port();
+	if (ret < 0)
+		goto err;
+	else
+		smapi_port = ret;
+
+	if (!request_region(smapi_port, 1, "smapi")) {
+		printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n",
+		       smapi_port);
+		ret = -ENXIO;
+		goto err;
+	}
+
+	if (!request_region(SMAPI_PORT2, 1, "smapi")) {
+		printk(KERN_ERR "tp_smapi cannot claim port 0x%x\n",
+		       SMAPI_PORT2);
+		ret = -ENXIO;
+		goto err_port1;
+	}
+
+	ret = platform_driver_register(&tp_driver);
+	if (ret)
+		goto err_port2;
+
+	pdev = platform_device_alloc("smapi", -1);
+	if (!pdev) {
+		ret = -ENOMEM;
+		goto err_driver;
+	}
+
+	ret = platform_device_add(pdev);
+	if (ret)
+		goto err_device_free;
+
+	for (next_attr_group = attr_groups; *next_attr_group;
+	     ++next_attr_group) {
+		ret = sysfs_create_group(&pdev->dev.kobj, *next_attr_group);
+		if (ret)
+			goto err_attr;
+	}
+
+	printk(KERN_INFO "tp_smapi successfully loaded (smapi_port=0x%x).\n",
+	       smapi_port);
+	return 0;
+
+err_attr:
+	while (--next_attr_group >= attr_groups)
+		sysfs_remove_group(&pdev->dev.kobj, *next_attr_group);
+	platform_device_unregister(pdev);
+err_device_free:
+	platform_device_put(pdev);
+err_driver:
+	platform_driver_unregister(&tp_driver);
+err_port2:
+	release_region(SMAPI_PORT2, 1);
+err_port1:
+	release_region(smapi_port, 1);
+err:
+	printk(KERN_ERR "tp_smapi init failed (ret=%d)!\n", ret);
+	return ret;
+}
+
+static void __exit tp_exit(void)
+{
+	while (next_attr_group && --next_attr_group >= attr_groups)
+		sysfs_remove_group(&pdev->dev.kobj, *next_attr_group);
+	platform_device_unregister(pdev);
+	platform_driver_unregister(&tp_driver);
+	release_region(SMAPI_PORT2, 1);
+	if (smapi_port)
+		release_region(smapi_port, 1);
+
+	printk(KERN_INFO "tp_smapi unloaded.\n");
+}
+
+module_init(tp_init);
+module_exit(tp_exit);
diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c
index 1deb6ad..75455d4 100644
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -621,6 +621,9 @@ int scsi_change_queue_depth(struct scsi_device *sdev, int depth)
 		wmb();
 	}
 
+	if (sdev->request_queue)
+		blk_set_queue_depth(sdev->request_queue, depth);
+
 	return sdev->queue_depth;
 }
 EXPORT_SYMBOL(scsi_change_queue_depth);
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index af94764..c498444 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -104,4 +104,6 @@ source "drivers/staging/i4l/Kconfig"
 
 source "drivers/staging/ks7010/Kconfig"
 
+source "drivers/staging/vhba/Kconfig"
+
 endif # STAGING
diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile
index 9f6009d..ab0b46b 100644
--- a/drivers/staging/Makefile
+++ b/drivers/staging/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_VME_BUS)		+= vme/
 obj-$(CONFIG_IIO)		+= iio/
 obj-$(CONFIG_FB_SM750)		+= sm750fb/
 obj-$(CONFIG_FB_XGI)		+= xgifb/
+obj-$(CONFIG_VHBA)		+= vhba/
 obj-$(CONFIG_USB_EMXX)		+= emxx_udc/
 obj-$(CONFIG_SPEAKUP)		+= speakup/
 obj-$(CONFIG_MFD_NVEC)		+= nvec/
diff --git b/drivers/staging/vhba/Kconfig b/drivers/staging/vhba/Kconfig
new file mode 100644
index 0000000..7ccb7d8
--- /dev/null
+++ b/drivers/staging/vhba/Kconfig
@@ -0,0 +1,9 @@
+config VHBA
+	tristate "Virtual (SCSI) Host Bus Adapter"
+	depends on SCSI
+	---help---
+	  This is the in-kernel part of CDEmu, a CD/DVD-ROM device
+	  emulator.
+
+	  This driver can also be built as a module. If so, the module
+	  will be called vhba.
diff --git b/drivers/staging/vhba/Makefile b/drivers/staging/vhba/Makefile
new file mode 100644
index 0000000..60b9e26
--- /dev/null
+++ b/drivers/staging/vhba/Makefile
@@ -0,0 +1,4 @@
+VHBA_VERSION := 20140928
+
+obj-$(CONFIG_VHBA)		+= vhba.o
+ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror
diff --git b/drivers/staging/vhba/vhba.c b/drivers/staging/vhba/vhba.c
new file mode 100644
index 0000000..4a14a10
--- /dev/null
+++ b/drivers/staging/vhba/vhba.c
@@ -0,0 +1,1071 @@
+/*
+ * vhba.c
+ *
+ * Copyright (C) 2007-2012 Chia-I Wu <b90201047 AT ntu DOT edu DOT tw>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/highmem.h>
+#include <linux/fs.h>
+#include <linux/sched.h>
+#include <linux/platform_device.h>
+#include <linux/miscdevice.h>
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/version.h>
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+#endif
+#include <asm/uaccess.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+
+/* scatterlist.page_link and sg_page() were introduced in 2.6.24 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 24)
+#define USE_SG_PAGE
+#include <linux/scatterlist.h>
+#endif
+
+MODULE_AUTHOR("Chia-I Wu");
+MODULE_VERSION(VHBA_VERSION);
+MODULE_DESCRIPTION("Virtual SCSI HBA");
+MODULE_LICENSE("GPL");
+
+#ifdef DEBUG
+#define DPRINTK(fmt, args...) printk(KERN_DEBUG "%s: " fmt, __FUNCTION__, ## args)
+#else
+#define DPRINTK(fmt, args...)
+#endif
+
+/* scmd_dbg was introduced in 3.15 */
+#ifndef scmd_dbg
+#define scmd_dbg(scmd, fmt, a...)       \
+    dev_dbg(&(scmd)->device->sdev_gendev, fmt, ##a)
+#endif
+
+#ifndef scmd_warn
+#define scmd_warn(scmd, fmt, a...)      \
+    dev_warn(&(scmd)->device->sdev_gendev, fmt, ##a)
+#endif
+
+#define VHBA_MAX_SECTORS_PER_IO 256
+#define VHBA_MAX_ID 32
+#define VHBA_CAN_QUEUE 32
+#define VHBA_INVALID_ID VHBA_MAX_ID
+
+#define DATA_TO_DEVICE(dir) ((dir) == DMA_TO_DEVICE || (dir) == DMA_BIDIRECTIONAL)
+#define DATA_FROM_DEVICE(dir) ((dir) == DMA_FROM_DEVICE || (dir) == DMA_BIDIRECTIONAL)
+
+
+/* SCSI macros were introduced in 2.6.23 */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 23)
+#define scsi_sg_count(cmd) ((cmd)->use_sg)
+#define scsi_sglist(cmd) ((cmd)->request_buffer)
+#define scsi_bufflen(cmd) ((cmd)->request_bufflen)
+#define scsi_set_resid(cmd, to_read) {(cmd)->resid = (to_read);}
+#endif
+
+/* 1-argument form of k[un]map_atomic was introduced in 2.6.37-rc1;
+   2-argument form was deprecated in 3.4-rc1 */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2, 6, 37)
+#define vhba_kmap_atomic kmap_atomic
+#define vhba_kunmap_atomic kunmap_atomic
+#else
+#define vhba_kmap_atomic(page) kmap_atomic(page, KM_USER0)
+#define vhba_kunmap_atomic(page) kunmap_atomic(page, KM_USER0)
+#endif
+
+
+enum vhba_req_state {
+    VHBA_REQ_FREE,
+    VHBA_REQ_PENDING,
+    VHBA_REQ_READING,
+    VHBA_REQ_SENT,
+    VHBA_REQ_WRITING,
+};
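+
+/* Command lifecycle (sketch): FREE -> PENDING when queued by the SCSI layer;
+ * the userspace emulator then moves it through READING -> SENT -> WRITING via
+ * the control device's read/write handlers (later in this file), and it
+ * returns to FREE on completion. */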
+
+struct vhba_command {
+    struct scsi_cmnd *cmd;
+    int status;
+    struct list_head entry;
+};
+
+struct vhba_device {
+    uint id;
+    spinlock_t cmd_lock;
+    struct list_head cmd_list;
+    wait_queue_head_t cmd_wq;
+    atomic_t refcnt;
+};
+
+struct vhba_host {
+    struct Scsi_Host *shost;
+    spinlock_t cmd_lock;
+    int cmd_next;
+    struct vhba_command commands[VHBA_CAN_QUEUE];
+    spinlock_t dev_lock;
+    struct vhba_device *devices[VHBA_MAX_ID];
+    int num_devices;
+    DECLARE_BITMAP(chgmap, VHBA_MAX_ID);
+    int chgtype[VHBA_MAX_ID];
+    struct work_struct scan_devices;
+};
+
+#define MAX_COMMAND_SIZE 16
+
+struct vhba_request {
+    __u32 tag;
+    __u32 lun;
+    __u8 cdb[MAX_COMMAND_SIZE];
+    __u8 cdb_len;
+    __u32 data_len;
+};
+
+struct vhba_response {
+    __u32 tag;
+    __u32 status;
+    __u32 data_len;
+};
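+
+/* Wire format (sketch, based on do_request()/do_response() below): a read on
+ * the vhba control device yields a struct vhba_request, followed by the
+ * outgoing data for write-type commands; userspace replies with a struct
+ * vhba_response followed by read data or sense bytes. */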
+
+static struct vhba_command *vhba_alloc_command (void);
+static void vhba_free_command (struct vhba_command *vcmd);
+
+static struct platform_device vhba_platform_device;
+
+static struct vhba_device *vhba_device_alloc (void)
+{
+    struct vhba_device *vdev;
+
+    vdev = kzalloc(sizeof(struct vhba_device), GFP_KERNEL);
+    if (!vdev) {
+        return NULL;
+    }
+
+    vdev->id = VHBA_INVALID_ID;
+    spin_lock_init(&vdev->cmd_lock);
+    INIT_LIST_HEAD(&vdev->cmd_list);
+    init_waitqueue_head(&vdev->cmd_wq);
+    atomic_set(&vdev->refcnt, 1);
+
+    return vdev;
+}
+
+static void vhba_device_put (struct vhba_device *vdev)
+{
+    if (atomic_dec_and_test(&vdev->refcnt)) {
+        kfree(vdev);
+    }
+}
+
+static struct vhba_device *vhba_device_get (struct vhba_device *vdev)
+{
+    atomic_inc(&vdev->refcnt);
+
+    return vdev;
+}
+
+static int vhba_device_queue (struct vhba_device *vdev, struct scsi_cmnd *cmd)
+{
+    struct vhba_command *vcmd;
+    unsigned long flags;
+
+    vcmd = vhba_alloc_command();
+    if (!vcmd) {
+        return SCSI_MLQUEUE_HOST_BUSY;
+    }
+
+    vcmd->cmd = cmd;
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    list_add_tail(&vcmd->entry, &vdev->cmd_list);
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    wake_up_interruptible(&vdev->cmd_wq);
+
+    return 0;
+}
+
+static int vhba_device_dequeue (struct vhba_device *vdev, struct scsi_cmnd *cmd)
+{
+    struct vhba_command *vcmd;
+    int retval;
+    unsigned long flags;
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    list_for_each_entry(vcmd, &vdev->cmd_list, entry) {
+        if (vcmd->cmd == cmd) {
+            list_del_init(&vcmd->entry);
+            break;
+        }
+    }
+
+    /* command not found */
+    if (&vcmd->entry == &vdev->cmd_list) {
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+        return SUCCESS;
+    }
+
+    while (vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING) {
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+        scmd_dbg(cmd, "wait for I/O before aborting\n");
+        schedule_timeout(1);
+        spin_lock_irqsave(&vdev->cmd_lock, flags);
+    }
+
+    retval = (vcmd->status == VHBA_REQ_SENT) ? FAILED : SUCCESS;
+
+    vhba_free_command(vcmd);
+
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    return retval;
+}
+
+static inline void vhba_scan_devices_add (struct vhba_host *vhost, int id)
+{
+    struct scsi_device *sdev;
+
+    sdev = scsi_device_lookup(vhost->shost, 0, id, 0);
+    if (!sdev) {
+        scsi_add_device(vhost->shost, 0, id, 0);
+    } else {
+        dev_warn(&vhost->shost->shost_gendev, "tried to add an already-existing device 0:%d:0!\n", id);
+        scsi_device_put(sdev);
+    }
+}
+
+static inline void vhba_scan_devices_remove (struct vhba_host *vhost, int id)
+{
+    struct scsi_device *sdev;
+
+    sdev = scsi_device_lookup(vhost->shost, 0, id, 0);
+    if (sdev) {
+        scsi_remove_device(sdev);
+        scsi_device_put(sdev);
+    } else {
+        dev_warn(&vhost->shost->shost_gendev, "tried to remove non-existing device 0:%d:0!\n", id);
+    }
+}
+
+static void vhba_scan_devices (struct work_struct *work)
+{
+    struct vhba_host *vhost = container_of(work, struct vhba_host, scan_devices);
+    unsigned long flags;
+    int id, change, exists;
+
+    while (1) {
+        spin_lock_irqsave(&vhost->dev_lock, flags);
+
+        id = find_first_bit(vhost->chgmap, vhost->shost->max_id);
+        if (id >= vhost->shost->max_id) {
+            spin_unlock_irqrestore(&vhost->dev_lock, flags);
+            break;
+        }
+        change = vhost->chgtype[id];
+        exists = vhost->devices[id] != NULL;
+
+        vhost->chgtype[id] = 0;
+        clear_bit(id, vhost->chgmap);
+
+        spin_unlock_irqrestore(&vhost->dev_lock, flags);
+
+        if (change < 0) {
+            dev_dbg(&vhost->shost->shost_gendev, "trying to remove target 0:%d:0\n", id);
+            vhba_scan_devices_remove(vhost, id);
+        } else if (change > 0) {
+            dev_dbg(&vhost->shost->shost_gendev, "trying to add target 0:%d:0\n", id);
+            vhba_scan_devices_add(vhost, id);
+        } else {
+            /* quick sequence of add/remove or remove/add; we determine
+               which one it was by checking whether the device structure exists */
+            if (exists) {
+                /* remove followed by add: remove and (re)add */
+                dev_dbg(&vhost->shost->shost_gendev, "trying to (re)add target 0:%d:0\n", id);
+                vhba_scan_devices_remove(vhost, id);
+                vhba_scan_devices_add(vhost, id);
+            } else {
+                /* add followed by remove: no-op */
+                dev_dbg(&vhost->shost->shost_gendev, "no-op for target 0:%d:0\n", id);
+            }
+        }
+    }
+}
+
+static int vhba_add_device (struct vhba_device *vdev)
+{
+    struct vhba_host *vhost;
+    int i;
+    unsigned long flags;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    vhba_device_get(vdev);
+
+    spin_lock_irqsave(&vhost->dev_lock, flags);
+    if (vhost->num_devices >= vhost->shost->max_id) {
+        spin_unlock_irqrestore(&vhost->dev_lock, flags);
+        vhba_device_put(vdev);
+        return -EBUSY;
+    }
+
+    for (i = 0; i < vhost->shost->max_id; i++) {
+        if (vhost->devices[i] == NULL) {
+            vdev->id = i;
+            vhost->devices[i] = vdev;
+            vhost->num_devices++;
+            set_bit(vdev->id, vhost->chgmap);
+            vhost->chgtype[vdev->id]++;
+            break;
+        }
+    }
+    spin_unlock_irqrestore(&vhost->dev_lock, flags);
+
+    schedule_work(&vhost->scan_devices);
+
+    return 0;
+}
+
+static int vhba_remove_device (struct vhba_device *vdev)
+{
+    struct vhba_host *vhost;
+    unsigned long flags;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    spin_lock_irqsave(&vhost->dev_lock, flags);
+    set_bit(vdev->id, vhost->chgmap);
+    vhost->chgtype[vdev->id]--;
+    vhost->devices[vdev->id] = NULL;
+    vhost->num_devices--;
+    vdev->id = VHBA_INVALID_ID;
+    spin_unlock_irqrestore(&vhost->dev_lock, flags);
+
+    vhba_device_put(vdev);
+
+    schedule_work(&vhost->scan_devices);
+
+    return 0;
+}
+
+static struct vhba_device *vhba_lookup_device (int id)
+{
+    struct vhba_host *vhost;
+    struct vhba_device *vdev = NULL;
+    unsigned long flags;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    if (likely(id < vhost->shost->max_id)) {
+        spin_lock_irqsave(&vhost->dev_lock, flags);
+        vdev = vhost->devices[id];
+        if (vdev) {
+            vdev = vhba_device_get(vdev);
+        }
+
+        spin_unlock_irqrestore(&vhost->dev_lock, flags);
+    }
+
+    return vdev;
+}
+
+static struct vhba_command *vhba_alloc_command (void)
+{
+    struct vhba_host *vhost;
+    struct vhba_command *vcmd;
+    unsigned long flags;
+    int i;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    spin_lock_irqsave(&vhost->cmd_lock, flags);
+
+    vcmd = vhost->commands + vhost->cmd_next++;
+    if (vcmd->status != VHBA_REQ_FREE) {
+        for (i = 0; i < vhost->shost->can_queue; i++) {
+            vcmd = vhost->commands + i;
+
+            if (vcmd->status == VHBA_REQ_FREE) {
+                vhost->cmd_next = i + 1;
+                break;
+            }
+        }
+
+        if (i == vhost->shost->can_queue) {
+            vcmd = NULL;
+        }
+    }
+
+    if (vcmd) {
+        vcmd->status = VHBA_REQ_PENDING;
+    }
+
+    vhost->cmd_next %= vhost->shost->can_queue;
+
+    spin_unlock_irqrestore(&vhost->cmd_lock, flags);
+
+    return vcmd;
+}
+
+static void vhba_free_command (struct vhba_command *vcmd)
+{
+    struct vhba_host *vhost;
+    unsigned long flags;
+
+    vhost = platform_get_drvdata(&vhba_platform_device);
+
+    spin_lock_irqsave(&vhost->cmd_lock, flags);
+    vcmd->status = VHBA_REQ_FREE;
+    spin_unlock_irqrestore(&vhost->cmd_lock, flags);
+}
+
+static int vhba_queuecommand_lck (struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *))
+{
+    struct vhba_device *vdev;
+    int retval;
+
+    scmd_dbg(cmd, "queue %lu\n", cmd->serial_number);
+
+    vdev = vhba_lookup_device(cmd->device->id);
+    if (!vdev) {
+        scmd_dbg(cmd, "no such device\n");
+
+        cmd->result = DID_NO_CONNECT << 16;
+        done(cmd);
+
+        return 0;
+    }
+
+    cmd->scsi_done = done;
+    retval = vhba_device_queue(vdev, cmd);
+
+    vhba_device_put(vdev);
+
+    return retval;
+}
+
+#ifdef DEF_SCSI_QCMD
+DEF_SCSI_QCMD(vhba_queuecommand)
+#else
+#define vhba_queuecommand vhba_queuecommand_lck
+#endif
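+
+/* On kernels >= 2.6.37, DEF_SCSI_QCMD generates vhba_queuecommand() as a
+ * wrapper that invokes vhba_queuecommand_lck() with the host lock held;
+ * older kernels use the _lck variant directly. */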
+
+static int vhba_abort (struct scsi_cmnd *cmd)
+{
+    struct vhba_device *vdev;
+    int retval = SUCCESS;
+
+    scmd_warn(cmd, "abort %lu\n", cmd->serial_number);
+
+    vdev = vhba_lookup_device(cmd->device->id);
+    if (vdev) {
+        retval = vhba_device_dequeue(vdev, cmd);
+        vhba_device_put(vdev);
+    } else {
+        cmd->result = DID_NO_CONNECT << 16;
+    }
+
+    return retval;
+}
+
+static struct scsi_host_template vhba_template = {
+    .module = THIS_MODULE,
+    .name = "vhba",
+    .proc_name = "vhba",
+    .queuecommand = vhba_queuecommand,
+    .eh_abort_handler = vhba_abort,
+    .can_queue = VHBA_CAN_QUEUE,
+    .this_id = -1,
+    .cmd_per_lun = 1,
+    .max_sectors = VHBA_MAX_SECTORS_PER_IO,
+    .sg_tablesize = 256,
+};
+
+static ssize_t do_request (struct scsi_cmnd *cmd, char __user *buf, size_t buf_len)
+{
+    struct vhba_request vreq;
+    ssize_t ret;
+
+    scmd_dbg(cmd, "request %lu, cdb 0x%x, bufflen %d, use_sg %d\n",
+        cmd->serial_number, cmd->cmnd[0], scsi_bufflen(cmd), scsi_sg_count(cmd));
+
+    ret = sizeof(vreq);
+    if (DATA_TO_DEVICE(cmd->sc_data_direction)) {
+        ret += scsi_bufflen(cmd);
+    }
+
+    if (ret > buf_len) {
+        scmd_warn(cmd, "buffer too small (%zu < %zd) for a request\n", buf_len, ret);
+        return -EIO;
+    }
+
+    vreq.tag = cmd->serial_number;
+    vreq.lun = cmd->device->lun;
+    memcpy(vreq.cdb, cmd->cmnd, MAX_COMMAND_SIZE);
+    vreq.cdb_len = cmd->cmd_len;
+    vreq.data_len = scsi_bufflen(cmd);
+
+    if (copy_to_user(buf, &vreq, sizeof(vreq))) {
+        return -EFAULT;
+    }
+
+    if (DATA_TO_DEVICE(cmd->sc_data_direction) && vreq.data_len) {
+        buf += sizeof(vreq);
+
+        if (scsi_sg_count(cmd)) {
+            unsigned char buf_stack[64];
+            unsigned char *kaddr, *uaddr, *kbuf;
+            struct scatterlist *sg = scsi_sglist(cmd);
+            int i;
+
+            uaddr = (unsigned char *) buf;
+
+            if (vreq.data_len > 64) {
+                kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+                if (!kbuf) {
+                    return -ENOMEM;
+                }
+            } else {
+                kbuf = buf_stack;
+            }
+
+            for (i = 0; i < scsi_sg_count(cmd); i++) {
+                size_t len = sg[i].length;
+
+#ifdef USE_SG_PAGE
+                kaddr = vhba_kmap_atomic(sg_page(&sg[i]));
+#else
+                kaddr = vhba_kmap_atomic(sg[i].page);
+#endif
+                memcpy(kbuf, kaddr + sg[i].offset, len);
+                vhba_kunmap_atomic(kaddr);
+
+                if (copy_to_user(uaddr, kbuf, len)) {
+                    if (kbuf != buf_stack) {
+                        kfree(kbuf);
+                    }
+                    return -EFAULT;
+                }
+                uaddr += len;
+            }
+
+            if (kbuf != buf_stack) {
+                kfree(kbuf);
+            }
+        } else {
+            if (copy_to_user(buf, scsi_sglist(cmd), vreq.data_len)) {
+                return -EFAULT;
+            }
+        }
+    }
+
+    return ret;
+}
+
+static ssize_t do_response (struct scsi_cmnd *cmd, const char __user *buf, size_t buf_len, struct vhba_response *res)
+{
+    ssize_t ret = 0;
+
+    scmd_dbg(cmd, "response %lu, status %x, data len %d, use_sg %d\n",
+         cmd->serial_number, res->status, res->data_len, scsi_sg_count(cmd));
+
+    if (res->status) {
+        unsigned char sense_stack[SCSI_SENSE_BUFFERSIZE];
+
+        if (res->data_len > SCSI_SENSE_BUFFERSIZE) {
+            scmd_warn(cmd, "truncate sense (%d < %d)\n", SCSI_SENSE_BUFFERSIZE, res->data_len);
+            res->data_len = SCSI_SENSE_BUFFERSIZE;
+        }
+
+        /* Copy via temporary buffer on stack in order to avoid problems
+           with PAX on grsecurity-enabled kernels */
+        if (copy_from_user(sense_stack, buf, res->data_len)) {
+            return -EFAULT;
+        }
+        memcpy(cmd->sense_buffer, sense_stack, res->data_len);
+
+        cmd->result = res->status;
+
+        ret += res->data_len;
+    } else if (DATA_FROM_DEVICE(cmd->sc_data_direction) && scsi_bufflen(cmd)) {
+        size_t to_read;
+
+        if (res->data_len > scsi_bufflen(cmd)) {
+            scmd_warn(cmd, "truncate data (%d < %d)\n", scsi_bufflen(cmd), res->data_len);
+            res->data_len = scsi_bufflen(cmd);
+        }
+
+        to_read = res->data_len;
+
+        if (scsi_sg_count(cmd)) {
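+            /* reverse of do_request(): pull data from user space into the
+               scatter-gather list via the same bounce-buffer scheme */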
+            unsigned char buf_stack[64];
+            unsigned char *kaddr, *uaddr, *kbuf;
+            struct scatterlist *sg = scsi_sglist(cmd);
+            int i;
+
+            uaddr = (unsigned char *)buf;
+
+            if (res->data_len > 64) {
+                kbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+                if (!kbuf) {
+                    return -ENOMEM;
+                }
+            } else {
+                kbuf = buf_stack;
+            }
+
+            for (i = 0; i < scsi_sg_count(cmd); i++) {
+                size_t len = (sg[i].length < to_read) ? sg[i].length : to_read;
+
+                if (copy_from_user(kbuf, uaddr, len)) {
+                    if (kbuf != buf_stack) {
+                        kfree(kbuf);
+                    }
+                    return -EFAULT;
+                }
+                uaddr += len;
+
+#ifdef USE_SG_PAGE
+                kaddr = vhba_kmap_atomic(sg_page(&sg[i]));
+#else
+                kaddr = vhba_kmap_atomic(sg[i].page);
+#endif
+                memcpy(kaddr + sg[i].offset, kbuf, len);
+                vhba_kunmap_atomic(kaddr);
+
+                to_read -= len;
+                if (to_read == 0) {
+                    break;
+                }
+            }
+
+            if (kbuf != buf_stack) {
+                kfree(kbuf);
+            }
+        } else {
+            if (copy_from_user(scsi_sglist(cmd), buf, res->data_len)) {
+                return -EFAULT;
+            }
+
+            to_read -= res->data_len;
+        }
+
+        scsi_set_resid(cmd, to_read);
+
+        ret += res->data_len - to_read;
+    }
+
+    return ret;
+}
+
+static inline struct vhba_command *next_command (struct vhba_device *vdev)
+{
+    struct vhba_command *vcmd;
+
+    list_for_each_entry(vcmd, &vdev->cmd_list, entry) {
+        if (vcmd->status == VHBA_REQ_PENDING) {
+            break;
+        }
+    }
+
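+    /* the loop falls through to the list head when no pending command is found */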
+    if (&vcmd->entry == &vdev->cmd_list) {
+        vcmd = NULL;
+    }
+
+    return vcmd;
+}
+
+static inline struct vhba_command *match_command (struct vhba_device *vdev, u32 tag)
+{
+    struct vhba_command *vcmd;
+
+    list_for_each_entry(vcmd, &vdev->cmd_list, entry) {
+        if (vcmd->cmd->serial_number == tag) {
+            break;
+        }
+    }
+
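+    /* as above: hitting the list head means no command with a matching tag */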
+    if (&vcmd->entry == &vdev->cmd_list) {
+        vcmd = NULL;
+    }
+
+    return vcmd;
+}
+
+static struct vhba_command *wait_command (struct vhba_device *vdev, unsigned long flags)
+{
+    struct vhba_command *vcmd;
+    DEFINE_WAIT(wait);
+
+    while (!(vcmd = next_command(vdev))) {
+        if (signal_pending(current)) {
+            break;
+        }
+
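+        /* sleep with cmd_lock dropped so the queueing path can add a
+           command and wake us; re-acquire before re-checking the list */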
+        prepare_to_wait(&vdev->cmd_wq, &wait, TASK_INTERRUPTIBLE);
+
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+        schedule();
+
+        spin_lock_irqsave(&vdev->cmd_lock, flags);
+    }
+
+    finish_wait(&vdev->cmd_wq, &wait);
+    if (vcmd) {
+        vcmd->status = VHBA_REQ_READING;
+    }
+
+    return vcmd;
+}
+
+static ssize_t vhba_ctl_read (struct file *file, char __user *buf, size_t buf_len, loff_t *offset)
+{
+    struct vhba_device *vdev;
+    struct vhba_command *vcmd;
+    ssize_t ret;
+    unsigned long flags;
+
+    vdev = file->private_data;
+
+    /* Get next command */
+    if (file->f_flags & O_NONBLOCK) {
+        /* Non-blocking variant */
+        spin_lock_irqsave(&vdev->cmd_lock, flags);
+        vcmd = next_command(vdev);
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+        if (!vcmd) {
+            return -EWOULDBLOCK;
+        }
+    } else {
+        /* Blocking variant */
+        spin_lock_irqsave(&vdev->cmd_lock, flags);
+        vcmd = wait_command(vdev, flags);
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+        if (!vcmd) {
+            return -ERESTARTSYS;
+        }
+    }
+
+    ret = do_request(vcmd->cmd, buf, buf_len);
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    if (ret >= 0) {
+        vcmd->status = VHBA_REQ_SENT;
+        *offset += ret;
+    } else {
+        vcmd->status = VHBA_REQ_PENDING;
+    }
+
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    return ret;
+}
+
+static ssize_t vhba_ctl_write (struct file *file, const char __user *buf, size_t buf_len, loff_t *offset)
+{
+    struct vhba_device *vdev;
+    struct vhba_command *vcmd;
+    struct vhba_response res;
+    ssize_t ret;
+    unsigned long flags;
+
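+    /* a response is a struct vhba_response header optionally followed by sense or read data */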
+    if (buf_len < sizeof(res)) {
+        return -EIO;
+    }
+
+    if (copy_from_user(&res, buf, sizeof(res))) {
+        return -EFAULT;
+    }
+
+    vdev = file->private_data;
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    vcmd = match_command(vdev, res.tag);
+    if (!vcmd || vcmd->status != VHBA_REQ_SENT) {
+        spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+        DPRINTK("not expecting response\n");
+        return -EIO;
+    }
+    vcmd->status = VHBA_REQ_WRITING;
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    ret = do_response(vcmd->cmd, buf + sizeof(res), buf_len - sizeof(res), &res);
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    if (ret >= 0) {
+        vcmd->cmd->scsi_done(vcmd->cmd);
+        ret += sizeof(res);
+
+        /* don't compete with vhba_device_dequeue */
+        if (!list_empty(&vcmd->entry)) {
+            list_del_init(&vcmd->entry);
+            vhba_free_command(vcmd);
+        }
+    } else {
+        vcmd->status = VHBA_REQ_SENT;
+    }
+
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    return ret;
+}
+
+static long vhba_ctl_ioctl (struct file *file, unsigned int cmd, unsigned long arg)
+{
+    struct vhba_device *vdev = file->private_data;
+    struct vhba_host *vhost;
+    struct scsi_device *sdev;
+
+    switch (cmd) {
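+        /* 0xBEEF001: report this device's SCSI host/channel/id/lun to user space */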
+        case 0xBEEF001: {
+            vhost = platform_get_drvdata(&vhba_platform_device);
+            sdev = scsi_device_lookup(vhost->shost, 0, vdev->id, 0);
+
+            if (sdev) {
+                int id[4] = {
+                    sdev->host->host_no,
+                    sdev->channel,
+                    sdev->id,
+                    sdev->lun
+                };
+
+                scsi_device_put(sdev);
+
+                if (copy_to_user((void __user *) arg, id, sizeof(id))) {
+                    return -EFAULT;
+                }
+
+                return 0;
+            } else {
+                return -ENODEV;
+            }
+        }
+    }
+
+    return -ENOTTY;
+}
+
+#ifdef CONFIG_COMPAT
+static long vhba_ctl_compat_ioctl (struct file *file, unsigned int cmd, unsigned long arg)
+{
+    unsigned long compat_arg = (unsigned long)compat_ptr(arg);
+    return vhba_ctl_ioctl(file, cmd, compat_arg);
+}
+#endif
+
+static unsigned int vhba_ctl_poll (struct file *file, poll_table *wait)
+{
+    struct vhba_device *vdev = file->private_data;
+    unsigned int mask = 0;
+    unsigned long flags;
+
+    poll_wait(file, &vdev->cmd_wq, wait);
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    if (next_command(vdev)) {
+        mask |= POLLIN | POLLRDNORM;
+    }
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    return mask;
+}
+
+static int vhba_ctl_open (struct inode *inode, struct file *file)
+{
+    struct vhba_device *vdev;
+    int retval;
+
+    DPRINTK("open\n");
+
+    /* check if vhba is probed */
+    if (!platform_get_drvdata(&vhba_platform_device)) {
+        return -ENODEV;
+    }
+
+    vdev = vhba_device_alloc();
+    if (!vdev) {
+        return -ENOMEM;
+    }
+
+    if (!(retval = vhba_add_device(vdev))) {
+        file->private_data = vdev;
+    }
+
+    vhba_device_put(vdev);
+
+    return retval;
+}
+
+static int vhba_ctl_release (struct inode *inode, struct file *file)
+{
+    struct vhba_device *vdev;
+    struct vhba_command *vcmd;
+    unsigned long flags;
+
+    DPRINTK("release\n");
+
+    vdev = file->private_data;
+
+    vhba_device_get(vdev);
+    vhba_remove_device(vdev);
+
+    spin_lock_irqsave(&vdev->cmd_lock, flags);
+    list_for_each_entry(vcmd, &vdev->cmd_list, entry) {
+        WARN_ON(vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING);
+
+        scmd_warn(vcmd->cmd, "device released with command %lu\n", vcmd->cmd->serial_number);
+        vcmd->cmd->result = DID_NO_CONNECT << 16;
+        vcmd->cmd->scsi_done(vcmd->cmd);
+
+        vhba_free_command(vcmd);
+    }
+    INIT_LIST_HEAD(&vdev->cmd_list);
+    spin_unlock_irqrestore(&vdev->cmd_lock, flags);
+
+    vhba_device_put(vdev);
+
+    return 0;
+}
+
+static struct file_operations vhba_ctl_fops = {
+    .owner = THIS_MODULE,
+    .open = vhba_ctl_open,
+    .release = vhba_ctl_release,
+    .read = vhba_ctl_read,
+    .write = vhba_ctl_write,
+    .poll = vhba_ctl_poll,
+    .unlocked_ioctl = vhba_ctl_ioctl,
+#ifdef CONFIG_COMPAT
+    .compat_ioctl = vhba_ctl_compat_ioctl,
+#endif
+};
+
+static struct miscdevice vhba_miscdev = {
+    .minor = MISC_DYNAMIC_MINOR,
+    .name = "vhba_ctl",
+    .fops = &vhba_ctl_fops,
+};
+
+static int vhba_probe (struct platform_device *pdev)
+{
+    struct Scsi_Host *shost;
+    struct vhba_host *vhost;
+    int i;
+
+    shost = scsi_host_alloc(&vhba_template, sizeof(struct vhba_host));
+    if (!shost) {
+        return -ENOMEM;
+    }
+
+    shost->max_id = VHBA_MAX_ID;
+    /* we don't support lun > 0 */
+    shost->max_lun = 1;
+    shost->max_cmd_len = MAX_COMMAND_SIZE;
+
+    vhost = (struct vhba_host *)shost->hostdata;
+    memset(vhost, 0, sizeof(*vhost));
+
+    vhost->shost = shost;
+    vhost->num_devices = 0;
+    spin_lock_init(&vhost->dev_lock);
+    spin_lock_init(&vhost->cmd_lock);
+    INIT_WORK(&vhost->scan_devices, vhba_scan_devices);
+    vhost->cmd_next = 0;
+    for (i = 0; i < vhost->shost->can_queue; i++) {
+        vhost->commands[i].status = VHBA_REQ_FREE;
+    }
+
+    platform_set_drvdata(pdev, vhost);
+
+    if (scsi_add_host(shost, &pdev->dev)) {
+        scsi_host_put(shost);
+        return -ENOMEM;
+    }
+
+    return 0;
+}
+
+static int vhba_remove (struct platform_device *pdev)
+{
+    struct vhba_host *vhost;
+    struct Scsi_Host *shost;
+
+    vhost = platform_get_drvdata(pdev);
+    shost = vhost->shost;
+
+    scsi_remove_host(shost);
+    scsi_host_put(shost);
+
+    return 0;
+}
+
+static void vhba_release (struct device * dev)
+{
+    return;
+}
+
+static struct platform_device vhba_platform_device = {
+    .name = "vhba",
+    .id = -1,
+    .dev = {
+        .release = vhba_release,
+    },
+};
+
+static struct platform_driver vhba_platform_driver = {
+    .driver = {
+        .owner = THIS_MODULE,
+        .name = "vhba",
+    },
+    .probe = vhba_probe,
+    .remove = vhba_remove,
+};
+
+static int __init vhba_init (void)
+{
+    int ret;
+
+    ret = platform_device_register(&vhba_platform_device);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = platform_driver_register(&vhba_platform_driver);
+    if (ret < 0) {
+        platform_device_unregister(&vhba_platform_device);
+        return ret;
+    }
+
+    ret = misc_register(&vhba_miscdev);
+    if (ret < 0) {
+        platform_driver_unregister(&vhba_platform_driver);
+        platform_device_unregister(&vhba_platform_device);
+        return ret;
+    }
+
+    return 0;
+}
+
+static void __exit vhba_exit(void)
+{
+    misc_deregister(&vhba_miscdev);
+    platform_driver_unregister(&vhba_platform_driver);
+    platform_device_unregister(&vhba_platform_device);
+}
+
+module_init(vhba_init);
+module_exit(vhba_exit);
+
diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig
index 9510305..c5af5f0 100644
--- a/drivers/tty/Kconfig
+++ b/drivers/tty/Kconfig
@@ -75,6 +75,19 @@ config VT_CONSOLE_SLEEP
 	def_bool y
 	depends on VT_CONSOLE && PM_SLEEP
 
+config NR_TTY_DEVICES
+        int "Maximum tty device number"
+        depends on VT
+        range 12 63
+        default 63
+        ---help---
+          This option is used to change the number of tty devices in /dev.
+          The default value is 63. The lowest number you can set is 12;
+          63 is also the upper limit, so that we do not overrun the
+          serial consoles.
+
+          If unsure, say 63.
+
 config HW_CONSOLE
 	bool
 	depends on VT && !UML
diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c
index e0cd1e4..a91fe64 100644
--- a/drivers/usb/gadget/function/u_serial.c
+++ b/drivers/usb/gadget/function/u_serial.c
@@ -1518,8 +1518,13 @@ void gserial_disconnect(struct gserial *gser)
 	gser->ioport = NULL;
 	if (port->port.count > 0 || port->openclose) {
 		wake_up_interruptible(&port->drain_wait);
+#if 0
 		if (port->port.tty)
 			tty_hangup(port->port.tty);
+#else
+		if (port->port.tty)
+			stop_tty(port->port.tty);
+#endif
 	}
 	spin_unlock_irqrestore(&port->port_lock, flags);
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 2bc7ad7..301c76f 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -124,6 +124,7 @@ if BLOCK
 menu "DOS/FAT/NT Filesystems"
 
 source "fs/fat/Kconfig"
+source "fs/exfat/Kconfig"
 source "fs/ntfs/Kconfig"
 
 endmenu
@@ -245,6 +246,7 @@ source "fs/pstore/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
+source "fs/aufs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Makefile b/fs/Makefile
index ed2b632..f3c25e7 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -78,6 +78,7 @@ obj-$(CONFIG_HUGETLBFS)		+= hugetlbfs/
 obj-$(CONFIG_CODA_FS)		+= coda/
 obj-$(CONFIG_MINIX_FS)		+= minix/
 obj-$(CONFIG_FAT_FS)		+= fat/
+obj-$(CONFIG_EXFAT_FS)		+= exfat/
 obj-$(CONFIG_BFS_FS)		+= bfs/
 obj-$(CONFIG_ISO9660_FS)	+= isofs/
 obj-$(CONFIG_HFSPLUS_FS)	+= hfsplus/ # Before hfs to find wrapped HFS+
@@ -129,3 +130,4 @@ obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
+obj-$(CONFIG_AUFS_FS)           += aufs/
diff --git b/fs/aufs/Kconfig b/fs/aufs/Kconfig
new file mode 100644
index 0000000..63560ce
--- /dev/null
+++ b/fs/aufs/Kconfig
@@ -0,0 +1,185 @@
+config AUFS_FS
+	tristate "Aufs (Advanced multi layered unification filesystem) support"
+	help
+	Aufs is a stackable unification filesystem such as Unionfs,
+	which unifies several directories and provides a merged single
+	directory.
+	In the early days, aufs was a complete re-design and
+	re-implementation of the Unionfs Version 1.x series. By introducing
+	many original ideas, approaches and improvements, it has become
+	totally different from Unionfs while keeping the basic features.
+
+if AUFS_FS
+choice
+	prompt "Maximum number of branches"
+	default AUFS_BRANCH_MAX_127
+	help
+	Specifies the maximum number of branches (or member directories)
+	in a single aufs. A larger value consumes more system
+	resources and has a minor impact on performance.
+config AUFS_BRANCH_MAX_127
+	bool "127"
+	help
+	Specifies the maximum number of branches (or member directories)
+	in a single aufs. A larger value consumes more system
+	resources and has a minor impact on performance.
+config AUFS_BRANCH_MAX_511
+	bool "511"
+	help
+	Specifies the maximum number of branches (or member directories)
+	in a single aufs. A larger value consumes more system
+	resources and has a minor impact on performance.
+config AUFS_BRANCH_MAX_1023
+	bool "1023"
+	help
+	Specifies the maximum number of branches (or member directories)
+	in a single aufs. A larger value consumes more system
+	resources and has a minor impact on performance.
+config AUFS_BRANCH_MAX_32767
+	bool "32767"
+	help
+	Specifies the maximum number of branches (or member directories)
+	in a single aufs. A larger value consumes more system
+	resources and has a minor impact on performance.
+endchoice
+
+config AUFS_SBILIST
+	bool
+	depends on AUFS_MAGIC_SYSRQ || PROC_FS
+	default y
+	help
+	Automatic configuration for internal use.
+	When aufs supports Magic SysRq or /proc, it is enabled automatically.
+
+config AUFS_HNOTIFY
+	bool "Detect direct branch access (bypassing aufs)"
+	help
+	If you want to modify files on branches directly, i.e. bypassing
+	aufs, and want aufs to detect those changes fully, then enable
+	this option and use the 'udba=notify' mount option.
+	Currently there is only one available configuration, "fsnotify".
+	It will have a negative impact on performance.
+	See detail in aufs.5.
+
+choice
+	prompt "method" if AUFS_HNOTIFY
+	default AUFS_HFSNOTIFY
+config AUFS_HFSNOTIFY
+	bool "fsnotify"
+	select FSNOTIFY
+endchoice
+
+config AUFS_EXPORT
+	bool "NFS-exportable aufs"
+	depends on EXPORTFS
+	help
+	If you want to export your mounted aufs via NFS, then enable this
+	option. There are several requirements for this configuration.
+	See detail in aufs.5.
+
+config AUFS_INO_T_64
+	bool
+	depends on AUFS_EXPORT
+	depends on 64BIT && !(ALPHA || S390)
+	default y
+	help
+	Automatic configuration for internal use.
+	/* typedef unsigned long/int __kernel_ino_t */
+	/* alpha and s390x are int */
+
+config AUFS_XATTR
+	bool "support for XATTR/EA (including Security Labels)"
+	help
+	If your branch fs supports XATTR/EA and you want to make them
+	available in aufs too, then enable this option and specify the
+	branch attributes for EA.
+	See detail in aufs.5.
+
+config AUFS_FHSM
+	bool "File-based Hierarchical Storage Management"
+	help
+	Hierarchical Storage Management (or HSM) is a well-known feature
+	in the storage world. Aufs provides this feature as file-based,
+	using multiple branches.
+	These branches are prioritized, i.e. the topmost one
+	should be the fastest drive and be used heavily.
+
+config AUFS_RDU
+	bool "Readdir in userspace"
+	help
+	Aufs has two methods to provide a merged view for a directory,
+	by a user-space library and by kernel-space natively. The latter
+	is always enabled but sometimes large and slow.
+	If you enable this option, install the library from the aufs2-util
+	package, and set some environment variables for your readdir(3);
+	the work will then be handled in user-space, which generally
+	shows better performance.
+	See detail in aufs.5.
+
+config AUFS_SHWH
+	bool "Show whiteouts"
+	help
+	If you want to make the whiteouts in aufs visible, then enable
+	this option and specify the 'shwh' mount option. Although it may
+	sound like philosophy, technically it simply shows the name of
+	a whiteout while keeping its behaviour.
+
+config AUFS_BR_RAMFS
+	bool "Ramfs (initramfs/rootfs) as an aufs branch"
+	help
+	If you want to use ramfs as an aufs branch fs, then enable this
+	option. Generally tmpfs is recommended.
+	Aufs prohibits them from being a branch fs by default, because
+	initramfs generally becomes unusable after switch_root or
+	similar. If you set initramfs as an aufs branch and boot your
+	system via switch_root, you will easily run into problems since
+	the files in initramfs may become inaccessible.
+	Unless you are going to use ramfs as an aufs branch fs without
+	switch_root or similar, leave it N.
+
+config AUFS_BR_FUSE
+	bool "Fuse fs as an aufs branch"
+	depends on FUSE_FS
+	select AUFS_POLL
+	help
+	If you want to use a FUSE-based userspace filesystem as an aufs
+	branch fs, then enable this option.
+	It implements the internal poll(2) operation, which (currently)
+	only fuse provides.
+
+config AUFS_POLL
+	bool
+	help
+	Automatic configuration for internal use.
+
+config AUFS_BR_HFSPLUS
+	bool "Hfsplus as an aufs branch"
+	depends on HFSPLUS_FS
+	default y
+	help
+	If you want to use hfsplus fs as an aufs branch fs, then enable
+	this option. It introduces a small overhead when
+	copying-up a file on hfsplus.
+
+config AUFS_BDEV_LOOP
+	bool
+	depends on BLK_DEV_LOOP
+	default y
+	help
+	Automatic configuration for internal use.
+	Convert =[ym] into =y.
+
+config AUFS_DEBUG
+	bool "Debug aufs"
+	help
+	Enable this to compile aufs internal debug code.
+	It will have a negative impact on performance.
+
+config AUFS_MAGIC_SYSRQ
+	bool
+	depends on AUFS_DEBUG && MAGIC_SYSRQ
+	default y
+	help
+	Automatic configuration for internal use.
+	When aufs supports Magic SysRq, it is enabled automatically.
+endif
diff --git b/fs/aufs/Makefile b/fs/aufs/Makefile
new file mode 100644
index 0000000..c7efb62
--- /dev/null
+++ b/fs/aufs/Makefile
@@ -0,0 +1,36 @@
+
+include ${srctree}/${src}/magic.mk
+
+# cf. include/linux/kernel.h
+# enable pr_debug
+ccflags-y += -DDEBUG
+# sparse requires the full pathname
+ccflags-y += -include ${srctree}/include/uapi/linux/aufs_type.h
+
+obj-$(CONFIG_AUFS_FS) += aufs.o
+aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \
+	wkq.o vfsub.o dcsub.o \
+	cpup.o whout.o wbr_policy.o \
+	dinfo.o dentry.o \
+	dynop.o \
+	finfo.o file.o f_op.o \
+	dir.o vdir.o \
+	iinfo.o inode.o i_op.o i_op_add.o i_op_del.o i_op_ren.o \
+	mvdown.o ioctl.o
+
+# all are boolean
+aufs-$(CONFIG_PROC_FS) += procfs.o plink.o
+aufs-$(CONFIG_SYSFS) += sysfs.o
+aufs-$(CONFIG_DEBUG_FS) += dbgaufs.o
+aufs-$(CONFIG_AUFS_BDEV_LOOP) += loop.o
+aufs-$(CONFIG_AUFS_HNOTIFY) += hnotify.o
+aufs-$(CONFIG_AUFS_HFSNOTIFY) += hfsnotify.o
+aufs-$(CONFIG_AUFS_EXPORT) += export.o
+aufs-$(CONFIG_AUFS_XATTR) += xattr.o
+aufs-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
+aufs-$(CONFIG_AUFS_FHSM) += fhsm.o
+aufs-$(CONFIG_AUFS_POLL) += poll.o
+aufs-$(CONFIG_AUFS_RDU) += rdu.o
+aufs-$(CONFIG_AUFS_BR_HFSPLUS) += hfsplus.o
+aufs-$(CONFIG_AUFS_DEBUG) += debug.o
+aufs-$(CONFIG_AUFS_MAGIC_SYSRQ) += sysrq.o
diff --git b/fs/aufs/aufs.h b/fs/aufs/aufs.h
new file mode 100644
index 0000000..49f43b4
--- /dev/null
+++ b/fs/aufs/aufs.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * all header files
+ */
+
+#ifndef __AUFS_H__
+#define __AUFS_H__
+
+#ifdef __KERNEL__
+
+#define AuStub(type, name, body, ...) \
+	static inline type name(__VA_ARGS__) { body; }
+
+#define AuStubVoid(name, ...) \
+	AuStub(void, name, , __VA_ARGS__)
+#define AuStubInt0(name, ...) \
+	AuStub(int, name, return 0, __VA_ARGS__)
+
+#include "debug.h"
+
+#include "branch.h"
+#include "cpup.h"
+#include "dcsub.h"
+#include "dbgaufs.h"
+#include "dentry.h"
+#include "dir.h"
+#include "dynop.h"
+#include "file.h"
+#include "fstype.h"
+#include "inode.h"
+#include "loop.h"
+#include "module.h"
+#include "opts.h"
+#include "rwsem.h"
+#include "spl.h"
+#include "super.h"
+#include "sysaufs.h"
+#include "vfsub.h"
+#include "whout.h"
+#include "wkq.h"
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_H__ */
diff --git b/fs/aufs/branch.c b/fs/aufs/branch.c
new file mode 100644
index 0000000..3bfbe5b
--- /dev/null
+++ b/fs/aufs/branch.c
@@ -0,0 +1,1399 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * branch management
+ */
+
+#include <linux/compat.h>
+#include <linux/statfs.h>
+#include "aufs.h"
+
+/*
+ * free a single branch
+ */
+static void au_br_do_free(struct au_branch *br)
+{
+	int i;
+	struct au_wbr *wbr;
+	struct au_dykey **key;
+
+	au_hnotify_fin_br(br);
+
+	if (br->br_xino.xi_file)
+		fput(br->br_xino.xi_file);
+	mutex_destroy(&br->br_xino.xi_nondir_mtx);
+
+	AuDebugOn(au_br_count(br));
+	au_br_count_fin(br);
+
+	wbr = br->br_wbr;
+	if (wbr) {
+		for (i = 0; i < AuBrWh_Last; i++)
+			dput(wbr->wbr_wh[i]);
+		AuDebugOn(atomic_read(&wbr->wbr_wh_running));
+		AuRwDestroy(&wbr->wbr_wh_rwsem);
+	}
+
+	if (br->br_fhsm) {
+		au_br_fhsm_fin(br->br_fhsm);
+		au_delayed_kfree(br->br_fhsm);
+	}
+
+	key = br->br_dykey;
+	for (i = 0; i < AuBrDynOp; i++, key++)
+		if (*key)
+			au_dy_put(*key);
+		else
+			break;
+
+	/* recursive lock, s_umount of branch's */
+	lockdep_off();
+	path_put(&br->br_path);
+	lockdep_on();
+	if (wbr)
+		au_delayed_kfree(wbr);
+	au_delayed_kfree(br);
+}
+
+/*
+ * frees all branches
+ */
+void au_br_free(struct au_sbinfo *sbinfo)
+{
+	aufs_bindex_t bmax;
+	struct au_branch **br;
+
+	AuRwMustWriteLock(&sbinfo->si_rwsem);
+
+	bmax = sbinfo->si_bbot + 1;
+	br = sbinfo->si_branch;
+	while (bmax--)
+		au_br_do_free(*br++);
+}
+
+/*
+ * find the index of a branch which is specified by @br_id.
+ */
+int au_br_index(struct super_block *sb, aufs_bindex_t br_id)
+{
+	aufs_bindex_t bindex, bbot;
+
+	bbot = au_sbbot(sb);
+	for (bindex = 0; bindex <= bbot; bindex++)
+		if (au_sbr_id(sb, bindex) == br_id)
+			return bindex;
+	return -1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * add a branch
+ */
+
+static int test_overlap(struct super_block *sb, struct dentry *h_adding,
+			struct dentry *h_root)
+{
+	if (unlikely(h_adding == h_root
+		     || au_test_loopback_overlap(sb, h_adding)))
+		return 1;
+	if (h_adding->d_sb != h_root->d_sb)
+		return 0;
+	return au_test_subdir(h_adding, h_root)
+		|| au_test_subdir(h_root, h_adding);
+}
+
+/*
+ * returns a newly allocated branch. @new_nbranch is a number of branches
+ * after adding a branch.
+ */
+static struct au_branch *au_br_alloc(struct super_block *sb, int new_nbranch,
+				     int perm)
+{
+	struct au_branch *add_branch;
+	struct dentry *root;
+	struct inode *inode;
+	int err;
+
+	err = -ENOMEM;
+	root = sb->s_root;
+	add_branch = kzalloc(sizeof(*add_branch), GFP_NOFS);
+	if (unlikely(!add_branch))
+		goto out;
+
+	err = au_hnotify_init_br(add_branch, perm);
+	if (unlikely(err))
+		goto out_br;
+
+	if (au_br_writable(perm)) {
+		/* may be freed separately at changing the branch permission */
+		add_branch->br_wbr = kzalloc(sizeof(*add_branch->br_wbr),
+					     GFP_NOFS);
+		if (unlikely(!add_branch->br_wbr))
+			goto out_hnotify;
+	}
+
+	if (au_br_fhsm(perm)) {
+		err = au_fhsm_br_alloc(add_branch);
+		if (unlikely(err))
+			goto out_wbr;
+	}
+
+	err = au_sbr_realloc(au_sbi(sb), new_nbranch, /*may_shrink*/0);
+	if (!err)
+		err = au_di_realloc(au_di(root), new_nbranch, /*may_shrink*/0);
+	if (!err) {
+		inode = d_inode(root);
+		err = au_hinode_realloc(au_ii(inode), new_nbranch, /*may_shrink*/0);
+	}
+	if (!err)
+		return add_branch; /* success */
+
+out_wbr:
+	if (add_branch->br_wbr)
+		au_delayed_kfree(add_branch->br_wbr);
+out_hnotify:
+	au_hnotify_fin_br(add_branch);
+out_br:
+	au_delayed_kfree(add_branch);
+out:
+	return ERR_PTR(err);
+}
+
+/*
+ * test if the branch permission is legal or not.
+ */
+static int test_br(struct inode *inode, int brperm, char *path)
+{
+	int err;
+
+	err = (au_br_writable(brperm) && IS_RDONLY(inode));
+	if (!err)
+		goto out;
+
+	err = -EINVAL;
+	pr_err("write permission for readonly mount or inode, %s\n", path);
+
+out:
+	return err;
+}
+
+/*
+ * returns:
+ * 0: success, the caller will add it
+ * plus: success, it is already unified, the caller should ignore it
+ * minus: error
+ */
+static int test_add(struct super_block *sb, struct au_opt_add *add, int remount)
+{
+	int err;
+	aufs_bindex_t bbot, bindex;
+	struct dentry *root, *h_dentry;
+	struct inode *inode, *h_inode;
+
+	root = sb->s_root;
+	bbot = au_sbbot(sb);
+	if (unlikely(bbot >= 0
+		     && au_find_dbindex(root, add->path.dentry) >= 0)) {
+		err = 1;
+		if (!remount) {
+			err = -EINVAL;
+			pr_err("%s duplicated\n", add->pathname);
+		}
+		goto out;
+	}
+
+	err = -ENOSPC; /* -E2BIG; */
+	if (unlikely(AUFS_BRANCH_MAX <= add->bindex
+		     || AUFS_BRANCH_MAX - 1 <= bbot)) {
+		pr_err("number of branches exceeded %s\n", add->pathname);
+		goto out;
+	}
+
+	err = -EDOM;
+	if (unlikely(add->bindex < 0 || bbot + 1 < add->bindex)) {
+		pr_err("bad index %d\n", add->bindex);
+		goto out;
+	}
+
+	inode = d_inode(add->path.dentry);
+	err = -ENOENT;
+	if (unlikely(!inode->i_nlink)) {
+		pr_err("no existence %s\n", add->pathname);
+		goto out;
+	}
+
+	err = -EINVAL;
+	if (unlikely(inode->i_sb == sb)) {
+		pr_err("%s must be outside\n", add->pathname);
+		goto out;
+	}
+
+	if (unlikely(au_test_fs_unsuppoted(inode->i_sb))) {
+		pr_err("unsupported filesystem, %s (%s)\n",
+		       add->pathname, au_sbtype(inode->i_sb));
+		goto out;
+	}
+
+	if (unlikely(inode->i_sb->s_stack_depth)) {
+		pr_err("already stacked, %s (%s)\n",
+		       add->pathname, au_sbtype(inode->i_sb));
+		goto out;
+	}
+
+	err = test_br(d_inode(add->path.dentry), add->perm, add->pathname);
+	if (unlikely(err))
+		goto out;
+
+	if (bbot < 0)
+		return 0; /* success */
+
+	err = -EINVAL;
+	for (bindex = 0; bindex <= bbot; bindex++)
+		if (unlikely(test_overlap(sb, add->path.dentry,
+					  au_h_dptr(root, bindex)))) {
+			pr_err("%s is overlapped\n", add->pathname);
+			goto out;
+		}
+
+	err = 0;
+	if (au_opt_test(au_mntflags(sb), WARN_PERM)) {
+		h_dentry = au_h_dptr(root, 0);
+		h_inode = d_inode(h_dentry);
+		if ((h_inode->i_mode & S_IALLUGO) != (inode->i_mode & S_IALLUGO)
+		    || !uid_eq(h_inode->i_uid, inode->i_uid)
+		    || !gid_eq(h_inode->i_gid, inode->i_gid))
+			pr_warn("uid/gid/perm %s %u/%u/0%o, %u/%u/0%o\n",
+				add->pathname,
+				i_uid_read(inode), i_gid_read(inode),
+				(inode->i_mode & S_IALLUGO),
+				i_uid_read(h_inode), i_gid_read(h_inode),
+				(h_inode->i_mode & S_IALLUGO));
+	}
+
+out:
+	return err;
+}
+
+/*
+ * initialize or clean the whiteouts for an adding branch
+ */
+static int au_br_init_wh(struct super_block *sb, struct au_branch *br,
+			 int new_perm)
+{
+	int err, old_perm;
+	aufs_bindex_t bindex;
+	struct inode *h_inode;
+	struct au_wbr *wbr;
+	struct au_hinode *hdir;
+	struct dentry *h_dentry;
+
+	err = vfsub_mnt_want_write(au_br_mnt(br));
+	if (unlikely(err))
+		goto out;
+
+	wbr = br->br_wbr;
+	old_perm = br->br_perm;
+	br->br_perm = new_perm;
+	hdir = NULL;
+	h_inode = NULL;
+	bindex = au_br_index(sb, br->br_id);
+	if (0 <= bindex) {
+		hdir = au_hi(d_inode(sb->s_root), bindex);
+		au_hn_inode_lock_nested(hdir, AuLsc_I_PARENT);
+	} else {
+		h_dentry = au_br_dentry(br);
+		h_inode = d_inode(h_dentry);
+		inode_lock_nested(h_inode, AuLsc_I_PARENT);
+	}
+	if (!wbr)
+		err = au_wh_init(br, sb);
+	else {
+		wbr_wh_write_lock(wbr);
+		err = au_wh_init(br, sb);
+		wbr_wh_write_unlock(wbr);
+	}
+	if (hdir)
+		au_hn_inode_unlock(hdir);
+	else
+		inode_unlock(h_inode);
+	vfsub_mnt_drop_write(au_br_mnt(br));
+	br->br_perm = old_perm;
+
+	if (!err && wbr && !au_br_writable(new_perm)) {
+		au_delayed_kfree(wbr);
+		br->br_wbr = NULL;
+	}
+
+out:
+	return err;
+}
+
+static int au_wbr_init(struct au_branch *br, struct super_block *sb,
+		       int perm)
+{
+	int err;
+	struct kstatfs kst;
+	struct au_wbr *wbr;
+
+	wbr = br->br_wbr;
+	au_rw_init(&wbr->wbr_wh_rwsem);
+	atomic_set(&wbr->wbr_wh_running, 0);
+
+	/*
+	 * a limit for rmdir/rename a dir
+	 * cf. AUFS_MAX_NAMELEN in include/uapi/linux/aufs_type.h
+	 */
+	err = vfs_statfs(&br->br_path, &kst);
+	if (unlikely(err))
+		goto out;
+	err = -EINVAL;
+	if (kst.f_namelen >= NAME_MAX)
+		err = au_br_init_wh(sb, br, perm);
+	else
+		pr_err("%pd(%s), unsupported namelen %ld\n",
+		       au_br_dentry(br),
+		       au_sbtype(au_br_dentry(br)->d_sb), kst.f_namelen);
+
+out:
+	return err;
+}
+
+/* initialize a new branch */
+static int au_br_init(struct au_branch *br, struct super_block *sb,
+		      struct au_opt_add *add)
+{
+	int err;
+	struct inode *h_inode;
+
+	err = 0;
+	mutex_init(&br->br_xino.xi_nondir_mtx);
+	br->br_perm = add->perm;
+	br->br_path = add->path; /* set first, path_get() later */
+	spin_lock_init(&br->br_dykey_lock);
+	au_br_count_init(br);
+	atomic_set(&br->br_xino_running, 0);
+	br->br_id = au_new_br_id(sb);
+	AuDebugOn(br->br_id < 0);
+
+	if (au_br_writable(add->perm)) {
+		err = au_wbr_init(br, sb, add->perm);
+		if (unlikely(err))
+			goto out_err;
+	}
+
+	if (au_opt_test(au_mntflags(sb), XINO)) {
+		h_inode = d_inode(add->path.dentry);
+		err = au_xino_br(sb, br, h_inode->i_ino,
+				 au_sbr(sb, 0)->br_xino.xi_file, /*do_test*/1);
+		if (unlikely(err)) {
+			AuDebugOn(br->br_xino.xi_file);
+			goto out_err;
+		}
+	}
+
+	sysaufs_br_init(br);
+	path_get(&br->br_path);
+	goto out; /* success */
+
+out_err:
+	memset(&br->br_path, 0, sizeof(br->br_path));
+out:
+	return err;
+}
+
+static void au_br_do_add_brp(struct au_sbinfo *sbinfo, aufs_bindex_t bindex,
+			     struct au_branch *br, aufs_bindex_t bbot,
+			     aufs_bindex_t amount)
+{
+	struct au_branch **brp;
+
+	AuRwMustWriteLock(&sbinfo->si_rwsem);
+
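+	/* make room at @bindex: shift the following entries one slot towards the bottom */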
+	brp = sbinfo->si_branch + bindex;
+	memmove(brp + 1, brp, sizeof(*brp) * amount);
+	*brp = br;
+	sbinfo->si_bbot++;
+	if (unlikely(bbot < 0))
+		sbinfo->si_bbot = 0;
+}
+
+static void au_br_do_add_hdp(struct au_dinfo *dinfo, aufs_bindex_t bindex,
+			     aufs_bindex_t bbot, aufs_bindex_t amount)
+{
+	struct au_hdentry *hdp;
+
+	AuRwMustWriteLock(&dinfo->di_rwsem);
+
+	hdp = au_hdentry(dinfo, bindex);
+	memmove(hdp + 1, hdp, sizeof(*hdp) * amount);
+	au_h_dentry_init(hdp);
+	dinfo->di_bbot++;
+	if (unlikely(bbot < 0))
+		dinfo->di_btop = 0;
+}
+
+static void au_br_do_add_hip(struct au_iinfo *iinfo, aufs_bindex_t bindex,
+			     aufs_bindex_t bbot, aufs_bindex_t amount)
+{
+	struct au_hinode *hip;
+
+	AuRwMustWriteLock(&iinfo->ii_rwsem);
+
+	hip = au_hinode(iinfo, bindex);
+	memmove(hip + 1, hip, sizeof(*hip) * amount);
+	au_hinode_init(hip);
+	iinfo->ii_bbot++;
+	if (unlikely(bbot < 0))
+		iinfo->ii_btop = 0;
+}
+
+static void au_br_do_add(struct super_block *sb, struct au_branch *br,
+			 aufs_bindex_t bindex)
+{
+	struct dentry *root, *h_dentry;
+	struct inode *root_inode, *h_inode;
+	aufs_bindex_t bbot, amount;
+
+	root = sb->s_root;
+	root_inode = d_inode(root);
+	bbot = au_sbbot(sb);
+	amount = bbot + 1 - bindex;
+	h_dentry = au_br_dentry(br);
+	au_sbilist_lock();
+	au_br_do_add_brp(au_sbi(sb), bindex, br, bbot, amount);
+	au_br_do_add_hdp(au_di(root), bindex, bbot, amount);
+	au_br_do_add_hip(au_ii(root_inode), bindex, bbot, amount);
+	au_set_h_dptr(root, bindex, dget(h_dentry));
+	h_inode = d_inode(h_dentry);
+	au_set_h_iptr(root_inode, bindex, au_igrab(h_inode), /*flags*/0);
+	au_sbilist_unlock();
+}
+
+int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount)
+{
+	int err;
+	aufs_bindex_t bbot, add_bindex;
+	struct dentry *root, *h_dentry;
+	struct inode *root_inode;
+	struct au_branch *add_branch;
+
+	root = sb->s_root;
+	root_inode = d_inode(root);
+	IMustLock(root_inode);
+	IiMustWriteLock(root_inode);
+	err = test_add(sb, add, remount);
+	if (unlikely(err < 0))
+		goto out;
+	if (err) {
+		err = 0;
+		goto out; /* success */
+	}
+
+	bbot = au_sbbot(sb);
+	add_branch = au_br_alloc(sb, bbot + 2, add->perm);
+	err = PTR_ERR(add_branch);
+	if (IS_ERR(add_branch))
+		goto out;
+
+	err = au_br_init(add_branch, sb, add);
+	if (unlikely(err)) {
+		au_br_do_free(add_branch);
+		goto out;
+	}
+
+	add_bindex = add->bindex;
+	if (!remount)
+		au_br_do_add(sb, add_branch, add_bindex);
+	else {
+		sysaufs_brs_del(sb, add_bindex);
+		au_br_do_add(sb, add_branch, add_bindex);
+		sysaufs_brs_add(sb, add_bindex);
+	}
+
+	h_dentry = add->path.dentry;
+	if (!add_bindex) {
+		au_cpup_attr_all(root_inode, /*force*/1);
+		sb->s_maxbytes = h_dentry->d_sb->s_maxbytes;
+	} else
+		au_add_nlink(root_inode, d_inode(h_dentry));
+
+	/*
+	 * this test/set prevents aufs from handling unnecessary notify events
+	 * of xino files, in case of re-adding a writable branch which was
+	 * once detached from aufs.
+	 */
+	if (au_xino_brid(sb) < 0
+	    && au_br_writable(add_branch->br_perm)
+	    && !au_test_fs_bad_xino(h_dentry->d_sb)
+	    && add_branch->br_xino.xi_file
+	    && add_branch->br_xino.xi_file->f_path.dentry->d_parent == h_dentry)
+		au_xino_brid_set(sb, add_branch->br_id);
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static unsigned long long au_farray_cb(struct super_block *sb, void *a,
+				       unsigned long long max __maybe_unused,
+				       void *arg)
+{
+	unsigned long long n;
+	struct file **p, *f;
+	struct au_sphlhead *files;
+	struct au_finfo *finfo;
+
+	n = 0;
+	p = a;
+	files = &au_sbi(sb)->si_files;
+	spin_lock(&files->spin);
+	hlist_for_each_entry(finfo, &files->head, fi_hlist) {
+		f = finfo->fi_file;
+		if (file_count(f)
+		    && !special_file(file_inode(f)->i_mode)) {
+			get_file(f);
+			*p++ = f;
+			n++;
+			AuDebugOn(n > max);
+		}
+	}
+	spin_unlock(&files->spin);
+
+	return n;
+}
+
+static struct file **au_farray_alloc(struct super_block *sb,
+				     unsigned long long *max)
+{
+	*max = au_nfiles(sb);
+	return au_array_alloc(max, au_farray_cb, sb, /*arg*/NULL);
+}
+
+static void au_farray_free(struct file **a, unsigned long long max)
+{
+	unsigned long long ull;
+
+	for (ull = 0; ull < max; ull++)
+		if (a[ull])
+			fput(a[ull]);
+	kvfree(a);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * delete a branch
+ */
+
+/* to show the line number, do not make it inlined function */
+#define AuVerbose(do_info, fmt, ...) do { \
+	if (do_info) \
+		pr_info(fmt, ##__VA_ARGS__); \
+} while (0)
+
+static int au_test_ibusy(struct inode *inode, aufs_bindex_t btop,
+			 aufs_bindex_t bbot)
+{
+	return (inode && !S_ISDIR(inode->i_mode)) || btop == bbot;
+}
+
+static int au_test_dbusy(struct dentry *dentry, aufs_bindex_t btop,
+			 aufs_bindex_t bbot)
+{
+	return au_test_ibusy(d_inode(dentry), btop, bbot);
+}
+
+/*
+ * test if the branch is deletable or not.
+ */
+static int test_dentry_busy(struct dentry *root, aufs_bindex_t bindex,
+			    unsigned int sigen, const unsigned int verbose)
+{
+	int err, i, j, ndentry;
+	aufs_bindex_t btop, bbot;
+	struct au_dcsub_pages dpages;
+	struct au_dpage *dpage;
+	struct dentry *d;
+
+	err = au_dpages_init(&dpages, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+	err = au_dcsub_pages(&dpages, root, NULL, NULL);
+	if (unlikely(err))
+		goto out_dpages;
+
+	for (i = 0; !err && i < dpages.ndpage; i++) {
+		dpage = dpages.dpages + i;
+		ndentry = dpage->ndentry;
+		for (j = 0; !err && j < ndentry; j++) {
+			d = dpage->dentries[j];
+			AuDebugOn(au_dcount(d) <= 0);
+			if (!au_digen_test(d, sigen)) {
+				di_read_lock_child(d, AuLock_IR);
+				if (unlikely(au_dbrange_test(d))) {
+					di_read_unlock(d, AuLock_IR);
+					continue;
+				}
+			} else {
+				di_write_lock_child(d);
+				if (unlikely(au_dbrange_test(d))) {
+					di_write_unlock(d);
+					continue;
+				}
+				err = au_reval_dpath(d, sigen);
+				if (!err)
+					di_downgrade_lock(d, AuLock_IR);
+				else {
+					di_write_unlock(d);
+					break;
+				}
+			}
+
+			/* AuDbgDentry(d); */
+			btop = au_dbtop(d);
+			bbot = au_dbbot(d);
+			if (btop <= bindex
+			    && bindex <= bbot
+			    && au_h_dptr(d, bindex)
+			    && au_test_dbusy(d, btop, bbot)) {
+				err = -EBUSY;
+				AuVerbose(verbose, "busy %pd\n", d);
+				AuDbgDentry(d);
+			}
+			di_read_unlock(d, AuLock_IR);
+		}
+	}
+
+out_dpages:
+	au_dpages_free(&dpages);
+out:
+	return err;
+}
+
+static int test_inode_busy(struct super_block *sb, aufs_bindex_t bindex,
+			   unsigned int sigen, const unsigned int verbose)
+{
+	int err;
+	unsigned long long max, ull;
+	struct inode *i, **array;
+	aufs_bindex_t btop, bbot;
+
+	array = au_iarray_alloc(sb, &max);
+	err = PTR_ERR(array);
+	if (IS_ERR(array))
+		goto out;
+
+	err = 0;
+	AuDbg("b%d\n", bindex);
+	for (ull = 0; !err && ull < max; ull++) {
+		i = array[ull];
+		if (unlikely(!i))
+			break;
+		if (i->i_ino == AUFS_ROOT_INO)
+			continue;
+
+		/* AuDbgInode(i); */
+		if (au_iigen(i, NULL) == sigen)
+			ii_read_lock_child(i);
+		else {
+			ii_write_lock_child(i);
+			err = au_refresh_hinode_self(i);
+			au_iigen_dec(i);
+			if (!err)
+				ii_downgrade_lock(i);
+			else {
+				ii_write_unlock(i);
+				break;
+			}
+		}
+
+		btop = au_ibtop(i);
+		bbot = au_ibbot(i);
+		if (btop <= bindex
+		    && bindex <= bbot
+		    && au_h_iptr(i, bindex)
+		    && au_test_ibusy(i, btop, bbot)) {
+			err = -EBUSY;
+			AuVerbose(verbose, "busy i%lu\n", i->i_ino);
+			AuDbgInode(i);
+		}
+		ii_read_unlock(i);
+	}
+	au_iarray_free(array, max);
+
+out:
+	return err;
+}
+
+static int test_children_busy(struct dentry *root, aufs_bindex_t bindex,
+			      const unsigned int verbose)
+{
+	int err;
+	unsigned int sigen;
+
+	sigen = au_sigen(root->d_sb);
+	DiMustNoWaiters(root);
+	IiMustNoWaiters(d_inode(root));
+	di_write_unlock(root);
+	err = test_dentry_busy(root, bindex, sigen, verbose);
+	if (!err)
+		err = test_inode_busy(root->d_sb, bindex, sigen, verbose);
+	di_write_lock_child(root); /* aufs_write_lock() calls ..._child() */
+
+	return err;
+}
+
+static int test_dir_busy(struct file *file, aufs_bindex_t br_id,
+			 struct file **to_free, int *idx)
+{
+	int err;
+	unsigned char matched, root;
+	aufs_bindex_t bindex, bbot;
+	struct au_fidir *fidir;
+	struct au_hfile *hfile;
+
+	err = 0;
+	root = IS_ROOT(file->f_path.dentry);
+	if (root) {
+		get_file(file);
+		to_free[*idx] = file;
+		(*idx)++;
+		goto out;
+	}
+
+	matched = 0;
+	fidir = au_fi(file)->fi_hdir;
+	AuDebugOn(!fidir);
+	bbot = au_fbbot_dir(file);
+	for (bindex = au_fbtop(file); bindex <= bbot; bindex++) {
+		hfile = fidir->fd_hfile + bindex;
+		if (!hfile->hf_file)
+			continue;
+
+		if (hfile->hf_br->br_id == br_id) {
+			matched = 1;
+			break;
+		}
+	}
+	if (matched)
+		err = -EBUSY;
+
+out:
+	return err;
+}
+
+static int test_file_busy(struct super_block *sb, aufs_bindex_t br_id,
+			  struct file **to_free, int opened)
+{
+	int err, idx;
+	unsigned long long ull, max;
+	aufs_bindex_t btop;
+	struct file *file, **array;
+	struct dentry *root;
+	struct au_hfile *hfile;
+
+	array = au_farray_alloc(sb, &max);
+	err = PTR_ERR(array);
+	if (IS_ERR(array))
+		goto out;
+
+	err = 0;
+	idx = 0;
+	root = sb->s_root;
+	di_write_unlock(root);
+	for (ull = 0; ull < max; ull++) {
+		file = array[ull];
+		if (unlikely(!file))
+			break;
+
+		/* AuDbg("%pD\n", file); */
+		fi_read_lock(file);
+		btop = au_fbtop(file);
+		if (!d_is_dir(file->f_path.dentry)) {
+			hfile = &au_fi(file)->fi_htop;
+			if (hfile->hf_br->br_id == br_id)
+				err = -EBUSY;
+		} else
+			err = test_dir_busy(file, br_id, to_free, &idx);
+		fi_read_unlock(file);
+		if (unlikely(err))
+			break;
+	}
+	di_write_lock_child(root);
+	au_farray_free(array, max);
+	AuDebugOn(idx > opened);
+
+out:
+	return err;
+}
+
+static void br_del_file(struct file **to_free, unsigned long long opened,
+			  aufs_bindex_t br_id)
+{
+	unsigned long long ull;
+	aufs_bindex_t bindex, btop, bbot, bfound;
+	struct file *file;
+	struct au_fidir *fidir;
+	struct au_hfile *hfile;
+
+	for (ull = 0; ull < opened; ull++) {
+		file = to_free[ull];
+		if (unlikely(!file))
+			break;
+
+		/* AuDbg("%pD\n", file); */
+		AuDebugOn(!d_is_dir(file->f_path.dentry));
+		bfound = -1;
+		fidir = au_fi(file)->fi_hdir;
+		AuDebugOn(!fidir);
+		fi_write_lock(file);
+		btop = au_fbtop(file);
+		bbot = au_fbbot_dir(file);
+		for (bindex = btop; bindex <= bbot; bindex++) {
+			hfile = fidir->fd_hfile + bindex;
+			if (!hfile->hf_file)
+				continue;
+
+			if (hfile->hf_br->br_id == br_id) {
+				bfound = bindex;
+				break;
+			}
+		}
+		AuDebugOn(bfound < 0);
+		au_set_h_fptr(file, bfound, NULL);
+		if (bfound == btop) {
+			for (btop++; btop <= bbot; btop++)
+				if (au_hf_dir(file, btop)) {
+					au_set_fbtop(file, btop);
+					break;
+				}
+		}
+		fi_write_unlock(file);
+	}
+}
+
+static void au_br_do_del_brp(struct au_sbinfo *sbinfo,
+			     const aufs_bindex_t bindex,
+			     const aufs_bindex_t bbot)
+{
+	struct au_branch **brp, **p;
+
+	AuRwMustWriteLock(&sbinfo->si_rwsem);
+
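+	/* close the gap left by the removed branch, then try to shrink the array */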
+	brp = sbinfo->si_branch + bindex;
+	if (bindex < bbot)
+		memmove(brp, brp + 1, sizeof(*brp) * (bbot - bindex));
+	sbinfo->si_branch[0 + bbot] = NULL;
+	sbinfo->si_bbot--;
+
+	p = au_krealloc(sbinfo->si_branch, sizeof(*p) * bbot, AuGFP_SBILIST,
+			/*may_shrink*/1);
+	if (p)
+		sbinfo->si_branch = p;
+	/* harmless error */
+}
+
+static void au_br_do_del_hdp(struct au_dinfo *dinfo, const aufs_bindex_t bindex,
+			     const aufs_bindex_t bbot)
+{
+	struct au_hdentry *hdp, *p;
+
+	AuRwMustWriteLock(&dinfo->di_rwsem);
+
+	hdp = au_hdentry(dinfo, bindex);
+	if (bindex < bbot)
+		memmove(hdp, hdp + 1, sizeof(*hdp) * (bbot - bindex));
+	/* au_h_dentry_init(au_hdentry(dinfo, bbot); */
+	dinfo->di_bbot--;
+
+	p = au_krealloc(dinfo->di_hdentry, sizeof(*p) * bbot, AuGFP_SBILIST,
+			/*may_shrink*/1);
+	if (p)
+		dinfo->di_hdentry = p;
+	/* harmless error */
+}
+
+static void au_br_do_del_hip(struct au_iinfo *iinfo, const aufs_bindex_t bindex,
+			     const aufs_bindex_t bbot)
+{
+	struct au_hinode *hip, *p;
+
+	AuRwMustWriteLock(&iinfo->ii_rwsem);
+
+	hip = au_hinode(iinfo, bindex);
+	if (bindex < bbot)
+		memmove(hip, hip + 1, sizeof(*hip) * (bbot - bindex));
+	/* au_hinode_init(au_hinode(iinfo, bbot)); */
+	iinfo->ii_bbot--;
+
+	p = au_krealloc(iinfo->ii_hinode, sizeof(*p) * bbot, AuGFP_SBILIST,
+			/*may_shrink*/1);
+	if (p)
+		iinfo->ii_hinode = p;
+	/* harmless error */
+}
+
+static void au_br_do_del(struct super_block *sb, aufs_bindex_t bindex,
+			 struct au_branch *br)
+{
+	aufs_bindex_t bbot;
+	struct au_sbinfo *sbinfo;
+	struct dentry *root, *h_root;
+	struct inode *inode, *h_inode;
+	struct au_hinode *hinode;
+
+	SiMustWriteLock(sb);
+
+	root = sb->s_root;
+	inode = d_inode(root);
+	sbinfo = au_sbi(sb);
+	bbot = sbinfo->si_bbot;
+
+	h_root = au_h_dptr(root, bindex);
+	hinode = au_hi(inode, bindex);
+	h_inode = au_igrab(hinode->hi_inode);
+	au_hiput(hinode);
+
+	au_sbilist_lock();
+	au_br_do_del_brp(sbinfo, bindex, bbot);
+	au_br_do_del_hdp(au_di(root), bindex, bbot);
+	au_br_do_del_hip(au_ii(inode), bindex, bbot);
+	au_sbilist_unlock();
+
+	dput(h_root);
+	iput(h_inode);
+	au_br_do_free(br);
+}
+
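+/* callback for au_array_alloc(): nothing to fill in, only the array size matters */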
+static unsigned long long empty_cb(struct super_block *sb, void *array,
+				   unsigned long long max, void *arg)
+{
+	return max;
+}
+
+int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount)
+{
+	int err, rerr, i;
+	unsigned long long opened;
+	unsigned int mnt_flags;
+	aufs_bindex_t bindex, bbot, br_id;
+	unsigned char do_wh, verbose;
+	struct au_branch *br;
+	struct au_wbr *wbr;
+	struct dentry *root;
+	struct file **to_free;
+
+	err = 0;
+	opened = 0;
+	to_free = NULL;
+	root = sb->s_root;
+	bindex = au_find_dbindex(root, del->h_path.dentry);
+	if (bindex < 0) {
+		if (remount)
+			goto out; /* success */
+		err = -ENOENT;
+		pr_err("%s no such branch\n", del->pathname);
+		goto out;
+	}
+	AuDbg("bindex b%d\n", bindex);
+
+	err = -EBUSY;
+	mnt_flags = au_mntflags(sb);
+	verbose = !!au_opt_test(mnt_flags, VERBOSE);
+	bbot = au_sbbot(sb);
+	if (unlikely(!bbot)) {
+		AuVerbose(verbose, "no more branches left\n");
+		goto out;
+	}
+	br = au_sbr(sb, bindex);
+	AuDebugOn(!path_equal(&br->br_path, &del->h_path));
+
+	br_id = br->br_id;
+	opened = au_br_count(br);
+	if (unlikely(opened)) {
+		to_free = au_array_alloc(&opened, empty_cb, sb, NULL);
+		err = PTR_ERR(to_free);
+		if (IS_ERR(to_free))
+			goto out;
+
+		err = test_file_busy(sb, br_id, to_free, opened);
+		if (unlikely(err)) {
+			AuVerbose(verbose, "%llu file(s) opened\n", opened);
+			goto out;
+		}
+	}
+
+	wbr = br->br_wbr;
+	do_wh = wbr && (wbr->wbr_whbase || wbr->wbr_plink || wbr->wbr_orph);
+	if (do_wh) {
+		/* instead of WbrWhMustWriteLock(wbr) */
+		SiMustWriteLock(sb);
+		for (i = 0; i < AuBrWh_Last; i++) {
+			dput(wbr->wbr_wh[i]);
+			wbr->wbr_wh[i] = NULL;
+		}
+	}
+
+	err = test_children_busy(root, bindex, verbose);
+	if (unlikely(err)) {
+		if (do_wh)
+			goto out_wh;
+		goto out;
+	}
+
+	err = 0;
+	if (to_free) {
+		/*
+		 * now we confirmed the branch is deletable.
+		 * let's free the remaining opened dirs on the branch.
+		 */
+		di_write_unlock(root);
+		br_del_file(to_free, opened, br_id);
+		di_write_lock_child(root);
+	}
+
+	if (!remount)
+		au_br_do_del(sb, bindex, br);
+	else {
+		sysaufs_brs_del(sb, bindex);
+		au_br_do_del(sb, bindex, br);
+		sysaufs_brs_add(sb, bindex);
+	}
+
+	if (!bindex) {
+		au_cpup_attr_all(d_inode(root), /*force*/1);
+		sb->s_maxbytes = au_sbr_sb(sb, 0)->s_maxbytes;
+	} else
+		au_sub_nlink(d_inode(root), d_inode(del->h_path.dentry));
+	if (au_opt_test(mnt_flags, PLINK))
+		au_plink_half_refresh(sb, br_id);
+
+	if (au_xino_brid(sb) == br_id)
+		au_xino_brid_set(sb, -1);
+	goto out; /* success */
+
+out_wh:
+	/* revert */
+	rerr = au_br_init_wh(sb, br, br->br_perm);
+	if (rerr)
+		pr_warn("failed re-creating base whiteout, %s. (%d)\n",
+			del->pathname, rerr);
+out:
+	if (to_free)
+		au_farray_free(to_free, opened);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_ibusy(struct super_block *sb, struct aufs_ibusy __user *arg)
+{
+	int err;
+	aufs_bindex_t btop, bbot;
+	struct aufs_ibusy ibusy;
+	struct inode *inode, *h_inode;
+
+	err = -EPERM;
+	if (unlikely(!capable(CAP_SYS_ADMIN)))
+		goto out;
+
+	err = copy_from_user(&ibusy, arg, sizeof(ibusy));
+	if (!err)
+		err = !access_ok(VERIFY_WRITE, &arg->h_ino, sizeof(arg->h_ino));
+	if (unlikely(err)) {
+		err = -EFAULT;
+		AuTraceErr(err);
+		goto out;
+	}
+
+	err = -EINVAL;
+	si_read_lock(sb, AuLock_FLUSH);
+	if (unlikely(ibusy.bindex < 0 || ibusy.bindex > au_sbbot(sb)))
+		goto out_unlock;
+
+	err = 0;
+	ibusy.h_ino = 0; /* invalid */
+	inode = ilookup(sb, ibusy.ino);
+	if (!inode
+	    || inode->i_ino == AUFS_ROOT_INO
+	    || au_is_bad_inode(inode))
+		goto out_unlock;
+
+	ii_read_lock_child(inode);
+	btop = au_ibtop(inode);
+	bbot = au_ibbot(inode);
+	if (btop <= ibusy.bindex && ibusy.bindex <= bbot) {
+		h_inode = au_h_iptr(inode, ibusy.bindex);
+		if (h_inode && au_test_ibusy(inode, btop, bbot))
+			ibusy.h_ino = h_inode->i_ino;
+	}
+	ii_read_unlock(inode);
+	iput(inode);
+
+out_unlock:
+	si_read_unlock(sb);
+	if (!err) {
+		err = __put_user(ibusy.h_ino, &arg->h_ino);
+		if (unlikely(err)) {
+			err = -EFAULT;
+			AuTraceErr(err);
+		}
+	}
+out:
+	return err;
+}
+
+long au_ibusy_ioctl(struct file *file, unsigned long arg)
+{
+	return au_ibusy(file->f_path.dentry->d_sb, (void __user *)arg);
+}
+
+#ifdef CONFIG_COMPAT
+long au_ibusy_compat_ioctl(struct file *file, unsigned long arg)
+{
+	return au_ibusy(file->f_path.dentry->d_sb, compat_ptr(arg));
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * change a branch permission
+ */
+
+static void au_warn_ima(void)
+{
+#ifdef CONFIG_IMA
+	/* since it doesn't support mark_files_ro() */
+	AuWarn1("RW -> RO makes IMA to produce wrong message\n");
+#endif
+}
+
+static int do_need_sigen_inc(int a, int b)
+{
+	return au_br_whable(a) && !au_br_whable(b);
+}
+
+static int need_sigen_inc(int old, int new)
+{
+	return do_need_sigen_inc(old, new)
+		|| do_need_sigen_inc(new, old);
+}
+
+static int au_br_mod_files_ro(struct super_block *sb, aufs_bindex_t bindex)
+{
+	int err, do_warn;
+	unsigned int mnt_flags;
+	unsigned long long ull, max;
+	aufs_bindex_t br_id;
+	unsigned char verbose, writer;
+	struct file *file, *hf, **array;
+	struct au_hfile *hfile;
+
+	mnt_flags = au_mntflags(sb);
+	verbose = !!au_opt_test(mnt_flags, VERBOSE);
+
+	array = au_farray_alloc(sb, &max);
+	err = PTR_ERR(array);
+	if (IS_ERR(array))
+		goto out;
+
+	do_warn = 0;
+	br_id = au_sbr_id(sb, bindex);
+	for (ull = 0; ull < max; ull++) {
+		file = array[ull];
+		if (unlikely(!file))
+			break;
+
+		/* AuDbg("%pD\n", file); */
+		fi_read_lock(file);
+		if (unlikely(au_test_mmapped(file))) {
+			err = -EBUSY;
+			AuVerbose(verbose, "mmapped %pD\n", file);
+			AuDbgFile(file);
+			FiMustNoWaiters(file);
+			fi_read_unlock(file);
+			goto out_array;
+		}
+
+		hfile = &au_fi(file)->fi_htop;
+		hf = hfile->hf_file;
+		if (!d_is_reg(file->f_path.dentry)
+		    || !(file->f_mode & FMODE_WRITE)
+		    || hfile->hf_br->br_id != br_id
+		    || !(hf->f_mode & FMODE_WRITE))
+			array[ull] = NULL;
+		else {
+			do_warn = 1;
+			get_file(file);
+		}
+
+		FiMustNoWaiters(file);
+		fi_read_unlock(file);
+		fput(file);
+	}
+
+	err = 0;
+	if (do_warn)
+		au_warn_ima();
+
+	for (ull = 0; ull < max; ull++) {
+		file = array[ull];
+		if (!file)
+			continue;
+
+		/* todo: already flushed? */
+		/*
+		 * fs/super.c:mark_files_ro() is gone, but aufs keeps its
+		 * approach which resets f_mode and calls mnt_drop_write() and
+		 * file_release_write() for each file, because the branch
+		 * attribute in aufs world is totally different from the native
+		 * fs rw/ro mode.
+		*/
+		/* fi_read_lock(file); */
+		hfile = &au_fi(file)->fi_htop;
+		hf = hfile->hf_file;
+		/* fi_read_unlock(file); */
+		spin_lock(&hf->f_lock);
+		writer = !!(hf->f_mode & FMODE_WRITER);
+		hf->f_mode &= ~(FMODE_WRITE | FMODE_WRITER);
+		spin_unlock(&hf->f_lock);
+		if (writer) {
+			put_write_access(file_inode(hf));
+			__mnt_drop_write(hf->f_path.mnt);
+		}
+	}
+
+out_array:
+	au_farray_free(array, max);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount,
+	      int *do_refresh)
+{
+	int err, rerr;
+	aufs_bindex_t bindex;
+	struct dentry *root;
+	struct au_branch *br;
+	struct au_br_fhsm *bf;
+
+	root = sb->s_root;
+	bindex = au_find_dbindex(root, mod->h_root);
+	if (bindex < 0) {
+		if (remount)
+			return 0; /* success */
+		err = -ENOENT;
+		pr_err("%s no such branch\n", mod->path);
+		goto out;
+	}
+	AuDbg("bindex b%d\n", bindex);
+
+	err = test_br(d_inode(mod->h_root), mod->perm, mod->path);
+	if (unlikely(err))
+		goto out;
+
+	br = au_sbr(sb, bindex);
+	AuDebugOn(mod->h_root != au_br_dentry(br));
+	if (br->br_perm == mod->perm)
+		return 0; /* success */
+
+	/* pre-allocate for non-fhsm --> fhsm */
+	bf = NULL;
+	if (!au_br_fhsm(br->br_perm) && au_br_fhsm(mod->perm)) {
+		err = au_fhsm_br_alloc(br);
+		if (unlikely(err))
+			goto out;
+		bf = br->br_fhsm;
+		br->br_fhsm = NULL;
+	}
+
+	if (au_br_writable(br->br_perm)) {
+		/* remove whiteout base */
+		err = au_br_init_wh(sb, br, mod->perm);
+		if (unlikely(err))
+			goto out_bf;
+
+		if (!au_br_writable(mod->perm)) {
+			/* rw --> ro, file might be mmapped */
+			DiMustNoWaiters(root);
+			IiMustNoWaiters(d_inode(root));
+			di_write_unlock(root);
+			err = au_br_mod_files_ro(sb, bindex);
+			/* aufs_write_lock() calls ..._child() */
+			di_write_lock_child(root);
+
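+			/*
+			 * the RW->RO conversion failed; re-create br_wbr,
+			 * which was dropped above, to keep the branch writable
+			 */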
+			if (unlikely(err)) {
+				rerr = -ENOMEM;
+				br->br_wbr = kzalloc(sizeof(*br->br_wbr),
+						     GFP_NOFS);
+				if (br->br_wbr)
+					rerr = au_wbr_init(br, sb, br->br_perm);
+				if (unlikely(rerr)) {
+					AuIOErr("nested error %d (%d)\n",
+						rerr, err);
+					br->br_perm = mod->perm;
+				}
+			}
+		}
+	} else if (au_br_writable(mod->perm)) {
+		/* ro --> rw */
+		err = -ENOMEM;
+		br->br_wbr = kzalloc(sizeof(*br->br_wbr), GFP_NOFS);
+		if (br->br_wbr) {
+			err = au_wbr_init(br, sb, mod->perm);
+			if (unlikely(err)) {
+				au_delayed_kfree(br->br_wbr);
+				br->br_wbr = NULL;
+			}
+		}
+	}
+	if (unlikely(err))
+		goto out_bf;
+
+	if (au_br_fhsm(br->br_perm)) {
+		if (!au_br_fhsm(mod->perm)) {
+			/* fhsm --> non-fhsm */
+			au_br_fhsm_fin(br->br_fhsm);
+			au_delayed_kfree(br->br_fhsm);
+			br->br_fhsm = NULL;
+		}
+	} else if (au_br_fhsm(mod->perm))
+		/* non-fhsm --> fhsm */
+		br->br_fhsm = bf;
+
+	*do_refresh |= need_sigen_inc(br->br_perm, mod->perm);
+	br->br_perm = mod->perm;
+	goto out; /* success */
+
+out_bf:
+	if (bf)
+		au_delayed_kfree(bf);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs)
+{
+	int err;
+	struct kstatfs kstfs;
+
+	err = vfs_statfs(&br->br_path, &kstfs);
+	if (!err) {
+		stfs->f_blocks = kstfs.f_blocks;
+		stfs->f_bavail = kstfs.f_bavail;
+		stfs->f_files = kstfs.f_files;
+		stfs->f_ffree = kstfs.f_ffree;
+	}
+
+	return err;
+}
diff --git b/fs/aufs/branch.h b/fs/aufs/branch.h
new file mode 100644
index 0000000..32a4d8f
--- /dev/null
+++ b/fs/aufs/branch.h
@@ -0,0 +1,296 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * branch filesystems and xino for them
+ */
+
+#ifndef __AUFS_BRANCH_H__
+#define __AUFS_BRANCH_H__
+
+#ifdef __KERNEL__
+
+#include <linux/mount.h>
+#include "dynop.h"
+#include "rwsem.h"
+#include "super.h"
+
+/* ---------------------------------------------------------------------- */
+
+/* a xino file */
+struct au_xino_file {
+	struct file		*xi_file;
+	struct mutex		xi_nondir_mtx;
+
+	/* todo: make xino files an array to support huge inode number */
+
+#ifdef CONFIG_DEBUG_FS
+	struct dentry		 *xi_dbgaufs;
+#endif
+};
+
+/* File-based Hierarchical Storage Management */
+struct au_br_fhsm {
+#ifdef CONFIG_AUFS_FHSM
+	struct mutex		bf_lock;
+	unsigned long		bf_jiffy;
+	struct aufs_stfs	bf_stfs;
+	int			bf_readable;
+#endif
+};
+
+/* members for writable branch only */
+enum {AuBrWh_BASE, AuBrWh_PLINK, AuBrWh_ORPH, AuBrWh_Last};
+struct au_wbr {
+	struct au_rwsem		wbr_wh_rwsem;
+	struct dentry		*wbr_wh[AuBrWh_Last];
+	atomic_t		wbr_wh_running;
+#define wbr_whbase		wbr_wh[AuBrWh_BASE]	/* whiteout base */
+#define wbr_plink		wbr_wh[AuBrWh_PLINK]	/* pseudo-link dir */
+#define wbr_orph		wbr_wh[AuBrWh_ORPH]	/* dir for orphans */
+
+	/* mfs mode */
+	unsigned long long	wbr_bytes;
+};
+
+/* ext2 has at least 3 types of operations, ext3 has 4 */
+#define AuBrDynOp (AuDyLast * 4)
+
+#ifdef CONFIG_AUFS_HFSNOTIFY
+/* support for asynchronous destruction */
+struct au_br_hfsnotify {
+	struct fsnotify_group	*hfsn_group;
+};
+#endif
+
+/* sysfs entries */
+struct au_brsysfs {
+	char			name[16];
+	struct attribute	attr;
+};
+
+enum {
+	AuBrSysfs_BR,
+	AuBrSysfs_BRID,
+	AuBrSysfs_Last
+};
+
+/* protected by superblock rwsem */
+struct au_branch {
+	struct au_xino_file	br_xino;
+
+	aufs_bindex_t		br_id;
+
+	int			br_perm;
+	struct path		br_path;
+	spinlock_t		br_dykey_lock;
+	struct au_dykey		*br_dykey[AuBrDynOp];
+	struct percpu_counter	br_count;
+
+	struct au_wbr		*br_wbr;
+	struct au_br_fhsm	*br_fhsm;
+
+	/* xino truncation */
+	atomic_t		br_xino_running;
+
+#ifdef CONFIG_AUFS_HFSNOTIFY
+	struct au_br_hfsnotify	*br_hfsn;
+#endif
+
+#ifdef CONFIG_SYSFS
+	/* entries under sysfs per mount-point */
+	struct au_brsysfs	br_sysfs[AuBrSysfs_Last];
+#endif
+};
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct vfsmount *au_br_mnt(struct au_branch *br)
+{
+	return br->br_path.mnt;
+}
+
+static inline struct dentry *au_br_dentry(struct au_branch *br)
+{
+	return br->br_path.dentry;
+}
+
+static inline struct super_block *au_br_sb(struct au_branch *br)
+{
+	return au_br_mnt(br)->mnt_sb;
+}
+
+static inline void au_br_get(struct au_branch *br)
+{
+	percpu_counter_inc(&br->br_count);
+}
+
+static inline void au_br_put(struct au_branch *br)
+{
+	percpu_counter_dec(&br->br_count);
+}
+
+static inline s64 au_br_count(struct au_branch *br)
+{
+	return percpu_counter_sum(&br->br_count);
+}
+
+static inline void au_br_count_init(struct au_branch *br)
+{
+	percpu_counter_init(&br->br_count, 0, GFP_NOFS);
+}
+
+static inline void au_br_count_fin(struct au_branch *br)
+{
+	percpu_counter_destroy(&br->br_count);
+}
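+
+/*
+ * The expected usage is roughly:
+ *
+ *	au_br_get(br);
+ *	... access br->br_path, br->br_perm, etc. ...
+ *	au_br_put(br);
+ *
+ * and au_br_count() sums the per-CPU counters when an exact value is
+ * needed, e.g. to test whether the branch is still busy.
+ */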
+
+static inline int au_br_rdonly(struct au_branch *br)
+{
+	return ((au_br_sb(br)->s_flags & MS_RDONLY)
+		|| !au_br_writable(br->br_perm))
+		? -EROFS : 0;
+}
+
+static inline int au_br_hnotifyable(int brperm __maybe_unused)
+{
+#ifdef CONFIG_AUFS_HNOTIFY
+	return !(brperm & AuBrPerm_RR);
+#else
+	return 0;
+#endif
+}
+
+static inline int au_br_test_oflag(int oflag, struct au_branch *br)
+{
+	int err, exec_flag;
+
+	err = 0;
+	exec_flag = oflag & __FMODE_EXEC;
+	if (unlikely(exec_flag && (au_br_mnt(br)->mnt_flags & MNT_NOEXEC)))
+		err = -EACCES;
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* branch.c */
+struct au_sbinfo;
+void au_br_free(struct au_sbinfo *sinfo);
+int au_br_index(struct super_block *sb, aufs_bindex_t br_id);
+struct au_opt_add;
+int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount);
+struct au_opt_del;
+int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount);
+long au_ibusy_ioctl(struct file *file, unsigned long arg);
+#ifdef CONFIG_COMPAT
+long au_ibusy_compat_ioctl(struct file *file, unsigned long arg);
+#endif
+struct au_opt_mod;
+int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount,
+	      int *do_refresh);
+struct aufs_stfs;
+int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs);
+
+/* xino.c */
+static const loff_t au_loff_max = LLONG_MAX;
+
+int au_xib_trunc(struct super_block *sb);
+ssize_t xino_fread(vfs_readf_t func, struct file *file, void *buf, size_t size,
+		   loff_t *pos);
+ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf,
+		    size_t size, loff_t *pos);
+struct file *au_xino_create2(struct file *base_file, struct file *copy_src);
+struct file *au_xino_create(struct super_block *sb, char *fname, int silent);
+ino_t au_xino_new_ino(struct super_block *sb);
+void au_xino_delete_inode(struct inode *inode, const int unlinked);
+int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
+		  ino_t ino);
+int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
+		 ino_t *ino);
+int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t hino,
+	       struct file *base_file, int do_test);
+int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex);
+
+struct au_opt_xino;
+int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount);
+void au_xino_clr(struct super_block *sb);
+struct file *au_xino_def(struct super_block *sb);
+int au_xino_path(struct seq_file *seq, struct file *file);
+
+/* ---------------------------------------------------------------------- */
+
+/* Superblock to branch */
+static inline
+aufs_bindex_t au_sbr_id(struct super_block *sb, aufs_bindex_t bindex)
+{
+	return au_sbr(sb, bindex)->br_id;
+}
+
+static inline
+struct vfsmount *au_sbr_mnt(struct super_block *sb, aufs_bindex_t bindex)
+{
+	return au_br_mnt(au_sbr(sb, bindex));
+}
+
+static inline
+struct super_block *au_sbr_sb(struct super_block *sb, aufs_bindex_t bindex)
+{
+	return au_br_sb(au_sbr(sb, bindex));
+}
+
+static inline void au_sbr_get(struct super_block *sb, aufs_bindex_t bindex)
+{
+	au_br_get(au_sbr(sb, bindex));
+}
+
+static inline void au_sbr_put(struct super_block *sb, aufs_bindex_t bindex)
+{
+	au_br_put(au_sbr(sb, bindex));
+}
+
+static inline int au_sbr_perm(struct super_block *sb, aufs_bindex_t bindex)
+{
+	return au_sbr(sb, bindex)->br_perm;
+}
+
+static inline int au_sbr_whable(struct super_block *sb, aufs_bindex_t bindex)
+{
+	return au_br_whable(au_sbr_perm(sb, bindex));
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * wbr_wh_read_lock, wbr_wh_write_lock
+ * wbr_wh_read_unlock, wbr_wh_write_unlock, wbr_wh_downgrade_lock
+ */
+AuSimpleRwsemFuncs(wbr_wh, struct au_wbr *wbr, &wbr->wbr_wh_rwsem);
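+
+/*
+ * Typical usage of the generated helpers, roughly:
+ *
+ *	wbr_wh_write_lock(wbr);
+ *	... create or replace wbr->wbr_whbase ...
+ *	wbr_wh_write_unlock(wbr);
+ *
+ *	wbr_wh_read_lock(wbr);
+ *	... dereference wbr->wbr_whbase ...
+ *	wbr_wh_read_unlock(wbr);
+ */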
+
+#define WbrWhMustNoWaiters(wbr)	AuRwMustNoWaiters(&wbr->wbr_wh_rwsem)
+#define WbrWhMustAnyLock(wbr)	AuRwMustAnyLock(&wbr->wbr_wh_rwsem)
+#define WbrWhMustWriteLock(wbr)	AuRwMustWriteLock(&wbr->wbr_wh_rwsem)
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_FHSM
+static inline void au_br_fhsm_init(struct au_br_fhsm *brfhsm)
+{
+	mutex_init(&brfhsm->bf_lock);
+	brfhsm->bf_jiffy = 0;
+	brfhsm->bf_readable = 0;
+}
+
+static inline void au_br_fhsm_fin(struct au_br_fhsm *brfhsm)
+{
+	mutex_destroy(&brfhsm->bf_lock);
+}
+#else
+AuStubVoid(au_br_fhsm_init, struct au_br_fhsm *brfhsm)
+AuStubVoid(au_br_fhsm_fin, struct au_br_fhsm *brfhsm)
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_BRANCH_H__ */
diff --git b/fs/aufs/cpup.c b/fs/aufs/cpup.c
new file mode 100644
index 0000000..598a4d6
--- /dev/null
+++ b/fs/aufs/cpup.c
@@ -0,0 +1,1378 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * copy-up functions, see wbr_policy.c for copy-down
+ */
+
+#include <linux/fs_stack.h>
+#include <linux/mm.h>
+#include <linux/task_work.h>
+#include "aufs.h"
+
+void au_cpup_attr_flags(struct inode *dst, unsigned int iflags)
+{
+	const unsigned int mask = S_DEAD | S_SWAPFILE | S_PRIVATE
+		| S_NOATIME | S_NOCMTIME | S_AUTOMOUNT;
+
+	BUILD_BUG_ON(sizeof(iflags) != sizeof(dst->i_flags));
+
+	dst->i_flags |= iflags & ~mask;
+	if (au_test_fs_notime(dst->i_sb))
+		dst->i_flags |= S_NOATIME | S_NOCMTIME;
+}
+
+void au_cpup_attr_timesizes(struct inode *inode)
+{
+	struct inode *h_inode;
+
+	h_inode = au_h_iptr(inode, au_ibtop(inode));
+	fsstack_copy_attr_times(inode, h_inode);
+	fsstack_copy_inode_size(inode, h_inode);
+}
+
+void au_cpup_attr_nlink(struct inode *inode, int force)
+{
+	struct inode *h_inode;
+	struct super_block *sb;
+	aufs_bindex_t bindex, bbot;
+
+	sb = inode->i_sb;
+	bindex = au_ibtop(inode);
+	h_inode = au_h_iptr(inode, bindex);
+	if (!force
+	    && !S_ISDIR(h_inode->i_mode)
+	    && au_opt_test(au_mntflags(sb), PLINK)
+	    && au_plink_test(inode))
+		return;
+
+	/*
+	 * 0 can happen while revalidating.
+	 * h_inode->i_mutex may not be held here, but it is harmless since once
+	 * i_nlink reaches 0, it will never become positive except in the
+	 * O_TMPFILE case.
+	 * todo: O_TMPFILE+linkat(AT_SYMLINK_FOLLOW) bypassing aufs may cause
+	 *	 an incorrect link count.
+	 */
+	set_nlink(inode, h_inode->i_nlink);
+
+	/*
+	 * fewer nlink makes find(1) noisy, but a larger nlink doesn't.
+	 * it may include the whplink directory.
+	 */
+	if (S_ISDIR(h_inode->i_mode)) {
+		bbot = au_ibbot(inode);
+		for (bindex++; bindex <= bbot; bindex++) {
+			h_inode = au_h_iptr(inode, bindex);
+			if (h_inode)
+				au_add_nlink(inode, h_inode);
+		}
+	}
+}
+
+void au_cpup_attr_changeable(struct inode *inode)
+{
+	struct inode *h_inode;
+
+	h_inode = au_h_iptr(inode, au_ibtop(inode));
+	inode->i_mode = h_inode->i_mode;
+	inode->i_uid = h_inode->i_uid;
+	inode->i_gid = h_inode->i_gid;
+	au_cpup_attr_timesizes(inode);
+	au_cpup_attr_flags(inode, h_inode->i_flags);
+}
+
+void au_cpup_igen(struct inode *inode, struct inode *h_inode)
+{
+	struct au_iinfo *iinfo = au_ii(inode);
+
+	IiMustWriteLock(inode);
+
+	iinfo->ii_higen = h_inode->i_generation;
+	iinfo->ii_hsb1 = h_inode->i_sb;
+}
+
+void au_cpup_attr_all(struct inode *inode, int force)
+{
+	struct inode *h_inode;
+
+	h_inode = au_h_iptr(inode, au_ibtop(inode));
+	au_cpup_attr_changeable(inode);
+	if (inode->i_nlink > 0)
+		au_cpup_attr_nlink(inode, force);
+	inode->i_rdev = h_inode->i_rdev;
+	inode->i_blkbits = h_inode->i_blkbits;
+	au_cpup_igen(inode, h_inode);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* Note: dt_dentry and dt_h_path.dentry are not dget/dput-ed */
+
+/* keep the timestamps of the parent dir during copy-up */
+void au_dtime_store(struct au_dtime *dt, struct dentry *dentry,
+		    struct path *h_path)
+{
+	struct inode *h_inode;
+
+	dt->dt_dentry = dentry;
+	dt->dt_h_path = *h_path;
+	h_inode = d_inode(h_path->dentry);
+	dt->dt_atime = h_inode->i_atime;
+	dt->dt_mtime = h_inode->i_mtime;
+	/* smp_mb(); */
+}
+
+void au_dtime_revert(struct au_dtime *dt)
+{
+	struct iattr attr;
+	int err;
+
+	attr.ia_atime = dt->dt_atime;
+	attr.ia_mtime = dt->dt_mtime;
+	attr.ia_valid = ATTR_FORCE | ATTR_MTIME | ATTR_MTIME_SET
+		| ATTR_ATIME | ATTR_ATIME_SET;
+
+	/* no delegation since this is a directory */
+	err = vfsub_notify_change(&dt->dt_h_path, &attr, /*delegated*/NULL);
+	if (unlikely(err))
+		pr_warn("restoring timestamps failed(%d). ignored\n", err);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* internal use only */
+struct au_cpup_reg_attr {
+	int		valid;
+	struct kstat	st;
+	unsigned int	iflags; /* inode->i_flags */
+};
+
+static noinline_for_stack
+int cpup_iattr(struct dentry *dst, aufs_bindex_t bindex, struct dentry *h_src,
+	       struct au_cpup_reg_attr *h_src_attr)
+{
+	int err, sbits, icex;
+	unsigned int mnt_flags;
+	unsigned char verbose;
+	struct iattr ia;
+	struct path h_path;
+	struct inode *h_isrc, *h_idst;
+	struct kstat *h_st;
+	struct au_branch *br;
+
+	h_path.dentry = au_h_dptr(dst, bindex);
+	h_idst = d_inode(h_path.dentry);
+	br = au_sbr(dst->d_sb, bindex);
+	h_path.mnt = au_br_mnt(br);
+	h_isrc = d_inode(h_src);
+	ia.ia_valid = ATTR_FORCE | ATTR_UID | ATTR_GID
+		| ATTR_ATIME | ATTR_MTIME
+		| ATTR_ATIME_SET | ATTR_MTIME_SET;
+	if (h_src_attr && h_src_attr->valid) {
+		h_st = &h_src_attr->st;
+		ia.ia_uid = h_st->uid;
+		ia.ia_gid = h_st->gid;
+		ia.ia_atime = h_st->atime;
+		ia.ia_mtime = h_st->mtime;
+		if (h_idst->i_mode != h_st->mode
+		    && !S_ISLNK(h_idst->i_mode)) {
+			ia.ia_valid |= ATTR_MODE;
+			ia.ia_mode = h_st->mode;
+		}
+		sbits = !!(h_st->mode & (S_ISUID | S_ISGID));
+		au_cpup_attr_flags(h_idst, h_src_attr->iflags);
+	} else {
+		ia.ia_uid = h_isrc->i_uid;
+		ia.ia_gid = h_isrc->i_gid;
+		ia.ia_atime = h_isrc->i_atime;
+		ia.ia_mtime = h_isrc->i_mtime;
+		if (h_idst->i_mode != h_isrc->i_mode
+		    && !S_ISLNK(h_idst->i_mode)) {
+			ia.ia_valid |= ATTR_MODE;
+			ia.ia_mode = h_isrc->i_mode;
+		}
+		sbits = !!(h_isrc->i_mode & (S_ISUID | S_ISGID));
+		au_cpup_attr_flags(h_idst, h_isrc->i_flags);
+	}
+	/* no delegation since it is just created */
+	err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL);
+
+	/* is this nfs only? */
+	if (!err && sbits && au_test_nfs(h_path.dentry->d_sb)) {
+		ia.ia_valid = ATTR_FORCE | ATTR_MODE;
+		ia.ia_mode = h_isrc->i_mode;
+		err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL);
+	}
+
+	icex = br->br_perm & AuBrAttr_ICEX;
+	if (!err) {
+		mnt_flags = au_mntflags(dst->d_sb);
+		verbose = !!au_opt_test(mnt_flags, VERBOSE);
+		err = au_cpup_xattr(h_path.dentry, h_src, icex, verbose);
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_do_copy_file(struct file *dst, struct file *src, loff_t len,
+			   char *buf, unsigned long blksize)
+{
+	int err;
+	size_t sz, rbytes, wbytes;
+	unsigned char all_zero;
+	char *p, *zp;
+	struct inode *h_inode;
+	/* reduce stack usage */
+	struct iattr *ia;
+
+	zp = page_address(ZERO_PAGE(0));
+	if (unlikely(!zp))
+		return -ENOMEM; /* possible? */
+
+	err = 0;
+	all_zero = 0;
+	while (len) {
+		AuDbg("len %lld\n", len);
+		sz = blksize;
+		if (len < blksize)
+			sz = len;
+
+		rbytes = 0;
+		/* todo: signal_pending? */
+		while (!rbytes || err == -EAGAIN || err == -EINTR) {
+			rbytes = vfsub_read_k(src, buf, sz, &src->f_pos);
+			err = rbytes;
+		}
+		if (unlikely(err < 0))
+			break;
+
+		all_zero = 0;
+		if (len >= rbytes && rbytes == blksize)
+			all_zero = !memcmp(buf, zp, rbytes);
+		if (!all_zero) {
+			wbytes = rbytes;
+			p = buf;
+			while (wbytes) {
+				size_t b;
+
+				b = vfsub_write_k(dst, p, wbytes, &dst->f_pos);
+				err = b;
+				/* todo: signal_pending? */
+				if (unlikely(err == -EAGAIN || err == -EINTR))
+					continue;
+				if (unlikely(err < 0))
+					break;
+				wbytes -= b;
+				p += b;
+			}
+			if (unlikely(err < 0))
+				break;
+		} else {
+			loff_t res;
+
+			AuLabel(hole);
+			res = vfsub_llseek(dst, rbytes, SEEK_CUR);
+			err = res;
+			if (unlikely(res < 0))
+				break;
+		}
+		len -= rbytes;
+		err = 0;
+	}
+
+	/* the last block may be a hole */
+	if (!err && all_zero) {
+		AuLabel(last hole);
+
+		err = 1;
+		if (au_test_nfs(dst->f_path.dentry->d_sb)) {
+			/* nfs requires this step to make last hole */
+			/* is this only nfs? */
+			do {
+				/* todo: signal_pending? */
+				err = vfsub_write_k(dst, "\0", 1, &dst->f_pos);
+			} while (err == -EAGAIN || err == -EINTR);
+			if (err == 1)
+				dst->f_pos--;
+		}
+
+		if (err == 1) {
+			ia = (void *)buf;
+			ia->ia_size = dst->f_pos;
+			ia->ia_valid = ATTR_SIZE | ATTR_FILE;
+			ia->ia_file = dst;
+			h_inode = file_inode(dst);
+			inode_lock_nested(h_inode, AuLsc_I_CHILD2);
+			/* no delegation since it is just created */
+			err = vfsub_notify_change(&dst->f_path, ia,
+						  /*delegated*/NULL);
+			inode_unlock(h_inode);
+		}
+	}
+
+	return err;
+}
+
+int au_copy_file(struct file *dst, struct file *src, loff_t len)
+{
+	int err;
+	unsigned long blksize;
+	unsigned char do_kfree;
+	char *buf;
+
+	err = -ENOMEM;
+	blksize = dst->f_path.dentry->d_sb->s_blocksize;
+	if (!blksize || PAGE_SIZE < blksize)
+		blksize = PAGE_SIZE;
+	AuDbg("blksize %lu\n", blksize);
+	do_kfree = (blksize != PAGE_SIZE && blksize >= sizeof(struct iattr *));
+	if (do_kfree)
+		buf = kmalloc(blksize, GFP_NOFS);
+	else
+		buf = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!buf))
+		goto out;
+
+	if (len > (1 << 22))
+		AuDbg("copying a large file %lld\n", (long long)len);
+
+	src->f_pos = 0;
+	dst->f_pos = 0;
+	err = au_do_copy_file(dst, src, len, buf, blksize);
+	if (do_kfree)
+		au_delayed_kfree(buf);
+	else
+		au_delayed_free_page((unsigned long)buf);
+
+out:
+	return err;
+}
+
+/*
+ * to support a sparse file which is opened with O_APPEND,
+ * we need to close the file.
+ */
+static int au_cp_regular(struct au_cp_generic *cpg)
+{
+	int err, i;
+	enum { SRC, DST };
+	struct {
+		aufs_bindex_t bindex;
+		unsigned int flags;
+		struct dentry *dentry;
+		int force_wr;
+		struct file *file;
+		void *label;
+	} *f, file[] = {
+		{
+			.bindex = cpg->bsrc,
+			.flags = O_RDONLY | O_NOATIME | O_LARGEFILE,
+			.label = &&out
+		},
+		{
+			.bindex = cpg->bdst,
+			.flags = O_WRONLY | O_NOATIME | O_LARGEFILE,
+			.force_wr = !!au_ftest_cpup(cpg->flags, RWDST),
+			.label = &&out_src
+		}
+	};
+	struct super_block *sb;
+	struct inode *h_src_inode;
+	struct task_struct *tsk = current;
+
+	/* bsrc branch can be ro/rw. */
+	sb = cpg->dentry->d_sb;
+	f = file;
+	for (i = 0; i < 2; i++, f++) {
+		f->dentry = au_h_dptr(cpg->dentry, f->bindex);
+		f->file = au_h_open(cpg->dentry, f->bindex, f->flags,
+				    /*file*/NULL, f->force_wr);
+		err = PTR_ERR(f->file);
+		if (IS_ERR(f->file))
+			goto *f->label;
+	}
+
+	/* try to prevent updates while we copy up */
+	h_src_inode = d_inode(file[SRC].dentry);
+	if (!au_test_nfs(h_src_inode->i_sb))
+		IMustLock(h_src_inode);
+	err = au_copy_file(file[DST].file, file[SRC].file, cpg->len);
+
+	/* I wonder if we had an O_NO_DELAY_FPUT flag */
+	if (tsk->flags & PF_KTHREAD)
+		__fput_sync(file[DST].file);
+	else {
+		WARN(1, "%pD\nPlease report this warning to aufs-users ML",
+		     file[DST].file);
+		fput(file[DST].file);
+		/*
+		 * too bad.
+		 * we have to call both since we don't know which place the file
+		 * was added to.
+		 */
+		task_work_run();
+		flush_delayed_fput();
+	}
+	au_sbr_put(sb, file[DST].bindex);
+
+out_src:
+	fput(file[SRC].file);
+	au_sbr_put(sb, file[SRC].bindex);
+out:
+	return err;
+}
+
+static int au_do_cpup_regular(struct au_cp_generic *cpg,
+			      struct au_cpup_reg_attr *h_src_attr)
+{
+	int err, rerr;
+	loff_t l;
+	struct path h_path;
+	struct inode *h_src_inode, *h_dst_inode;
+
+	err = 0;
+	h_src_inode = au_h_iptr(d_inode(cpg->dentry), cpg->bsrc);
+	l = i_size_read(h_src_inode);
+	if (cpg->len == -1 || l < cpg->len)
+		cpg->len = l;
+	if (cpg->len) {
+		/* try to prevent updates while we are referencing */
+		inode_lock_nested(h_src_inode, AuLsc_I_CHILD);
+		au_pin_hdir_unlock(cpg->pin);
+
+		h_path.dentry = au_h_dptr(cpg->dentry, cpg->bsrc);
+		h_path.mnt = au_sbr_mnt(cpg->dentry->d_sb, cpg->bsrc);
+		h_src_attr->iflags = h_src_inode->i_flags;
+		if (!au_test_nfs(h_src_inode->i_sb))
+			err = vfs_getattr(&h_path, &h_src_attr->st);
+		else {
+			inode_unlock(h_src_inode);
+			err = vfs_getattr(&h_path, &h_src_attr->st);
+			inode_lock_nested(h_src_inode, AuLsc_I_CHILD);
+		}
+		if (unlikely(err)) {
+			inode_unlock(h_src_inode);
+			goto out;
+		}
+		h_src_attr->valid = 1;
+		if (!au_test_nfs(h_src_inode->i_sb)) {
+			err = au_cp_regular(cpg);
+			inode_unlock(h_src_inode);
+		} else {
+			inode_unlock(h_src_inode);
+			err = au_cp_regular(cpg);
+		}
+		rerr = au_pin_hdir_relock(cpg->pin);
+		if (!err && rerr)
+			err = rerr;
+	}
+	if (!err && (h_src_inode->i_state & I_LINKABLE)) {
+		h_path.dentry = au_h_dptr(cpg->dentry, cpg->bdst);
+		h_dst_inode = d_inode(h_path.dentry);
+		spin_lock(&h_dst_inode->i_lock);
+		h_dst_inode->i_state |= I_LINKABLE;
+		spin_unlock(&h_dst_inode->i_lock);
+	}
+
+out:
+	return err;
+}
+
+static int au_do_cpup_symlink(struct path *h_path, struct dentry *h_src,
+			      struct inode *h_dir)
+{
+	int err, symlen;
+	mm_segment_t old_fs;
+	union {
+		char *k;
+		char __user *u;
+	} sym;
+	struct inode *h_inode = d_inode(h_src);
+	const struct inode_operations *h_iop = h_inode->i_op;
+
+	err = -ENOSYS;
+	if (unlikely(!h_iop->readlink))
+		goto out;
+
+	err = -ENOMEM;
+	sym.k = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!sym.k))
+		goto out;
+
+	/* unnecessary to support mmap_sem since symlink is not mmap-able */
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	symlen = h_iop->readlink(h_src, sym.u, PATH_MAX);
+	err = symlen;
+	set_fs(old_fs);
+
+	if (symlen > 0) {
+		sym.k[symlen] = 0;
+		err = vfsub_symlink(h_dir, h_path, sym.k);
+	}
+	au_delayed_free_page((unsigned long)sym.k);
+
+out:
+	return err;
+}
+
+/*
+ * regardless of the 'acl' option, reset all ACLs.
+ * All ACLs will be copied up later from the original entry on the lower branch.
+ */
+static int au_reset_acl(struct inode *h_dir, struct path *h_path, umode_t mode)
+{
+	int err;
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+
+	h_dentry = h_path->dentry;
+	h_inode = d_inode(h_dentry);
+	/* forget_all_cached_acls(h_inode)); */
+	err = vfsub_removexattr(h_dentry, XATTR_NAME_POSIX_ACL_ACCESS);
+	AuTraceErr(err);
+	if (err == -EOPNOTSUPP)
+		err = 0;
+	if (!err)
+		err = vfsub_acl_chmod(h_inode, mode);
+
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_do_cpup_dir(struct au_cp_generic *cpg, struct dentry *dst_parent,
+			  struct inode *h_dir, struct path *h_path)
+{
+	int err;
+	struct inode *dir, *inode;
+
+	err = vfsub_removexattr(h_path->dentry, XATTR_NAME_POSIX_ACL_DEFAULT);
+	AuTraceErr(err);
+	if (err == -EOPNOTSUPP)
+		err = 0;
+	if (unlikely(err))
+		goto out;
+
+	/*
+	 * strange behaviour from the user's point of view,
+	 * particularly in the setattr case
+	 */
+	dir = d_inode(dst_parent);
+	if (au_ibtop(dir) == cpg->bdst)
+		au_cpup_attr_nlink(dir, /*force*/1);
+	inode = d_inode(cpg->dentry);
+	au_cpup_attr_nlink(inode, /*force*/1);
+
+out:
+	return err;
+}
+
+static noinline_for_stack
+int cpup_entry(struct au_cp_generic *cpg, struct dentry *dst_parent,
+	       struct au_cpup_reg_attr *h_src_attr)
+{
+	int err;
+	umode_t mode;
+	unsigned int mnt_flags;
+	unsigned char isdir, isreg, force;
+	const unsigned char do_dt = !!au_ftest_cpup(cpg->flags, DTIME);
+	struct au_dtime dt;
+	struct path h_path;
+	struct dentry *h_src, *h_dst, *h_parent;
+	struct inode *h_inode, *h_dir;
+	struct super_block *sb;
+
+	/* bsrc branch can be ro/rw. */
+	h_src = au_h_dptr(cpg->dentry, cpg->bsrc);
+	h_inode = d_inode(h_src);
+	AuDebugOn(h_inode != au_h_iptr(d_inode(cpg->dentry), cpg->bsrc));
+
+	/* try to keep this entry from being referenced while we create it */
+	h_dst = au_h_dptr(cpg->dentry, cpg->bdst);
+	if (au_ftest_cpup(cpg->flags, RENAME))
+		AuDebugOn(strncmp(h_dst->d_name.name, AUFS_WH_PFX,
+				  AUFS_WH_PFX_LEN));
+	h_parent = h_dst->d_parent; /* dir inode is locked */
+	h_dir = d_inode(h_parent);
+	IMustLock(h_dir);
+	AuDebugOn(h_parent != h_dst->d_parent);
+
+	sb = cpg->dentry->d_sb;
+	h_path.mnt = au_sbr_mnt(sb, cpg->bdst);
+	if (do_dt) {
+		h_path.dentry = h_parent;
+		au_dtime_store(&dt, dst_parent, &h_path);
+	}
+	h_path.dentry = h_dst;
+
+	isreg = 0;
+	isdir = 0;
+	mode = h_inode->i_mode;
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		isreg = 1;
+		err = vfsub_create(h_dir, &h_path, S_IRUSR | S_IWUSR,
+				   /*want_excl*/true);
+		if (!err)
+			err = au_do_cpup_regular(cpg, h_src_attr);
+		break;
+	case S_IFDIR:
+		isdir = 1;
+		err = vfsub_mkdir(h_dir, &h_path, mode);
+		if (!err)
+			err = au_do_cpup_dir(cpg, dst_parent, h_dir, &h_path);
+		break;
+	case S_IFLNK:
+		err = au_do_cpup_symlink(&h_path, h_src, h_dir);
+		break;
+	case S_IFCHR:
+	case S_IFBLK:
+		AuDebugOn(!capable(CAP_MKNOD));
+		/*FALLTHROUGH*/
+	case S_IFIFO:
+	case S_IFSOCK:
+		err = vfsub_mknod(h_dir, &h_path, mode, h_inode->i_rdev);
+		break;
+	default:
+		AuIOErr("Unknown inode type 0%o\n", mode);
+		err = -EIO;
+	}
+	if (!err)
+		err = au_reset_acl(h_dir, &h_path, mode);
+
+	mnt_flags = au_mntflags(sb);
+	if (!au_opt_test(mnt_flags, UDBA_NONE)
+	    && !isdir
+	    && au_opt_test(mnt_flags, XINO)
+	    && (h_inode->i_nlink == 1
+		|| (h_inode->i_state & I_LINKABLE))
+	    /* todo: unnecessary? */
+	    /* && d_inode(cpg->dentry)->i_nlink == 1 */
+	    && cpg->bdst < cpg->bsrc
+	    && !au_ftest_cpup(cpg->flags, KEEPLINO))
+		au_xino_write(sb, cpg->bsrc, h_inode->i_ino, /*ino*/0);
+		/* ignore this error */
+
+	if (!err) {
+		force = 0;
+		if (isreg) {
+			force = !!cpg->len;
+			if (cpg->len == -1)
+				force = !!i_size_read(h_inode);
+		}
+		au_fhsm_wrote(sb, cpg->bdst, force);
+	}
+
+	if (do_dt)
+		au_dtime_revert(&dt);
+	return err;
+}
+
+static int au_do_ren_after_cpup(struct au_cp_generic *cpg, struct path *h_path)
+{
+	int err;
+	struct dentry *dentry, *h_dentry, *h_parent, *parent;
+	struct inode *h_dir;
+	aufs_bindex_t bdst;
+
+	dentry = cpg->dentry;
+	bdst = cpg->bdst;
+	h_dentry = au_h_dptr(dentry, bdst);
+	if (!au_ftest_cpup(cpg->flags, OVERWRITE)) {
+		dget(h_dentry);
+		au_set_h_dptr(dentry, bdst, NULL);
+		err = au_lkup_neg(dentry, bdst, /*wh*/0);
+		if (!err)
+			h_path->dentry = dget(au_h_dptr(dentry, bdst));
+		au_set_h_dptr(dentry, bdst, h_dentry);
+	} else {
+		err = 0;
+		parent = dget_parent(dentry);
+		h_parent = au_h_dptr(parent, bdst);
+		dput(parent);
+		h_path->dentry = vfsub_lkup_one(&dentry->d_name, h_parent);
+		if (IS_ERR(h_path->dentry))
+			err = PTR_ERR(h_path->dentry);
+	}
+	if (unlikely(err))
+		goto out;
+
+	h_parent = h_dentry->d_parent; /* dir inode is locked */
+	h_dir = d_inode(h_parent);
+	IMustLock(h_dir);
+	AuDbg("%pd %pd\n", h_dentry, h_path->dentry);
+	/* no delegation since it is just created */
+	err = vfsub_rename(h_dir, h_dentry, h_dir, h_path, /*delegated*/NULL);
+	dput(h_path->dentry);
+
+out:
+	return err;
+}
+
+/*
+ * copyup the @dentry from @bsrc to @bdst.
+ * the caller must set both of the lower dentries.
+ * @len is for truncating; when it is -1, copyup the entire file.
+ * in link/rename cases, @dst_parent may be different from the real one.
+ * cpg->bsrc can be larger than cpg->bdst.
+ */
+static int au_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent)
+{
+	int err, rerr;
+	aufs_bindex_t old_ibtop;
+	unsigned char isdir, plink;
+	struct dentry *h_src, *h_dst, *h_parent;
+	struct inode *dst_inode, *h_dir, *inode, *delegated, *src_inode;
+	struct super_block *sb;
+	struct au_branch *br;
+	/* to reduce stack size */
+	struct {
+		struct au_dtime dt;
+		struct path h_path;
+		struct au_cpup_reg_attr h_src_attr;
+	} *a;
+
+	err = -ENOMEM;
+	a = kmalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+	a->h_src_attr.valid = 0;
+
+	sb = cpg->dentry->d_sb;
+	br = au_sbr(sb, cpg->bdst);
+	a->h_path.mnt = au_br_mnt(br);
+	h_dst = au_h_dptr(cpg->dentry, cpg->bdst);
+	h_parent = h_dst->d_parent; /* dir inode is locked */
+	h_dir = d_inode(h_parent);
+	IMustLock(h_dir);
+
+	h_src = au_h_dptr(cpg->dentry, cpg->bsrc);
+	inode = d_inode(cpg->dentry);
+
+	if (!dst_parent)
+		dst_parent = dget_parent(cpg->dentry);
+	else
+		dget(dst_parent);
+
+	plink = !!au_opt_test(au_mntflags(sb), PLINK);
+	dst_inode = au_h_iptr(inode, cpg->bdst);
+	if (dst_inode) {
+		if (unlikely(!plink)) {
+			err = -EIO;
+			AuIOErr("hi%lu(i%lu) exists on b%d "
+				"but plink is disabled\n",
+				dst_inode->i_ino, inode->i_ino, cpg->bdst);
+			goto out_parent;
+		}
+
+		if (dst_inode->i_nlink) {
+			const int do_dt = au_ftest_cpup(cpg->flags, DTIME);
+
+			h_src = au_plink_lkup(inode, cpg->bdst);
+			err = PTR_ERR(h_src);
+			if (IS_ERR(h_src))
+				goto out_parent;
+			if (unlikely(d_is_negative(h_src))) {
+				err = -EIO;
+				AuIOErr("i%lu exists on b%d "
+					"but not pseudo-linked\n",
+					inode->i_ino, cpg->bdst);
+				dput(h_src);
+				goto out_parent;
+			}
+
+			if (do_dt) {
+				a->h_path.dentry = h_parent;
+				au_dtime_store(&a->dt, dst_parent, &a->h_path);
+			}
+
+			a->h_path.dentry = h_dst;
+			delegated = NULL;
+			err = vfsub_link(h_src, h_dir, &a->h_path, &delegated);
+			if (!err && au_ftest_cpup(cpg->flags, RENAME))
+				err = au_do_ren_after_cpup(cpg, &a->h_path);
+			if (do_dt)
+				au_dtime_revert(&a->dt);
+			if (unlikely(err == -EWOULDBLOCK)) {
+				pr_warn("cannot retry for NFSv4 delegation"
+					" for an internal link\n");
+				iput(delegated);
+			}
+			dput(h_src);
+			goto out_parent;
+		} else
+			/* todo: cpup_wh_file? */
+			/* udba work */
+			au_update_ibrange(inode, /*do_put_zero*/1);
+	}
+
+	isdir = S_ISDIR(inode->i_mode);
+	old_ibtop = au_ibtop(inode);
+	err = cpup_entry(cpg, dst_parent, &a->h_src_attr);
+	if (unlikely(err))
+		goto out_rev;
+	dst_inode = d_inode(h_dst);
+	inode_lock_nested(dst_inode, AuLsc_I_CHILD2);
+	/* todo: necessary? */
+	/* au_pin_hdir_unlock(cpg->pin); */
+
+	err = cpup_iattr(cpg->dentry, cpg->bdst, h_src, &a->h_src_attr);
+	if (unlikely(err)) {
+		/* todo: necessary? */
+		/* au_pin_hdir_relock(cpg->pin); */ /* ignore an error */
+		inode_unlock(dst_inode);
+		goto out_rev;
+	}
+
+	if (cpg->bdst < old_ibtop) {
+		if (S_ISREG(inode->i_mode)) {
+			err = au_dy_iaop(inode, cpg->bdst, dst_inode);
+			if (unlikely(err)) {
+				/* ignore an error */
+				/* au_pin_hdir_relock(cpg->pin); */
+				inode_unlock(dst_inode);
+				goto out_rev;
+			}
+		}
+		au_set_ibtop(inode, cpg->bdst);
+	} else
+		au_set_ibbot(inode, cpg->bdst);
+	au_set_h_iptr(inode, cpg->bdst, au_igrab(dst_inode),
+		      au_hi_flags(inode, isdir));
+
+	/* todo: necessary? */
+	/* err = au_pin_hdir_relock(cpg->pin); */
+	inode_unlock(dst_inode);
+	if (unlikely(err))
+		goto out_rev;
+
+	src_inode = d_inode(h_src);
+	if (!isdir
+	    && (src_inode->i_nlink > 1
+		|| src_inode->i_state & I_LINKABLE)
+	    && plink)
+		au_plink_append(inode, cpg->bdst, h_dst);
+
+	if (au_ftest_cpup(cpg->flags, RENAME)) {
+		a->h_path.dentry = h_dst;
+		err = au_do_ren_after_cpup(cpg, &a->h_path);
+	}
+	if (!err)
+		goto out_parent; /* success */
+
+	/* revert */
+out_rev:
+	a->h_path.dentry = h_parent;
+	au_dtime_store(&a->dt, dst_parent, &a->h_path);
+	a->h_path.dentry = h_dst;
+	rerr = 0;
+	if (d_is_positive(h_dst)) {
+		if (!isdir) {
+			/* no delegation since it is just created */
+			rerr = vfsub_unlink(h_dir, &a->h_path,
+					    /*delegated*/NULL, /*force*/0);
+		} else
+			rerr = vfsub_rmdir(h_dir, &a->h_path);
+	}
+	au_dtime_revert(&a->dt);
+	if (rerr) {
+		AuIOErr("failed removing broken entry(%d, %d)\n", err, rerr);
+		err = -EIO;
+	}
+out_parent:
+	dput(dst_parent);
+	au_delayed_kfree(a);
+out:
+	return err;
+}
+
+#if 0 /* reserved */
+struct au_cpup_single_args {
+	int *errp;
+	struct au_cp_generic *cpg;
+	struct dentry *dst_parent;
+};
+
+static void au_call_cpup_single(void *args)
+{
+	struct au_cpup_single_args *a = args;
+
+	au_pin_hdir_acquire_nest(a->cpg->pin);
+	*a->errp = au_cpup_single(a->cpg, a->dst_parent);
+	au_pin_hdir_release(a->cpg->pin);
+}
+#endif
+
+/*
+ * prevent SIGXFSZ in copy-up.
+ * testing CAP_MKNOD is for generic fs,
+ * but CAP_FSETID is for xfs only, currently.
+ */
+static int au_cpup_sio_test(struct au_pin *pin, umode_t mode)
+{
+	int do_sio;
+	struct super_block *sb;
+	struct inode *h_dir;
+
+	do_sio = 0;
+	sb = au_pinned_parent(pin)->d_sb;
+	if (!au_wkq_test()
+	    && (!au_sbi(sb)->si_plink_maint_pid
+		|| au_plink_maint(sb, AuLock_NOPLM))) {
+		switch (mode & S_IFMT) {
+		case S_IFREG:
+			/* no condition about RLIMIT_FSIZE and the file size */
+			do_sio = 1;
+			break;
+		case S_IFCHR:
+		case S_IFBLK:
+			do_sio = !capable(CAP_MKNOD);
+			break;
+		}
+		if (!do_sio)
+			do_sio = ((mode & (S_ISUID | S_ISGID))
+				  && !capable(CAP_FSETID));
+		/* this workaround may be removed in the future */
+		if (!do_sio) {
+			h_dir = au_pinned_h_dir(pin);
+			do_sio = h_dir->i_mode & S_ISVTX;
+		}
+	}
+
+	return do_sio;
+}
+
+#if 0 /* reserved */
+int au_sio_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent)
+{
+	int err, wkq_err;
+	struct dentry *h_dentry;
+
+	h_dentry = au_h_dptr(cpg->dentry, cpg->bsrc);
+	if (!au_cpup_sio_test(pin, d_inode(h_dentry)->i_mode))
+		err = au_cpup_single(cpg, dst_parent);
+	else {
+		struct au_cpup_single_args args = {
+			.errp		= &err,
+			.cpg		= cpg,
+			.dst_parent	= dst_parent
+		};
+		wkq_err = au_wkq_wait(au_call_cpup_single, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+
+	return err;
+}
+#endif
+
+/*
+ * copyup the @dentry from the first active lower branch to @bdst,
+ * using au_cpup_single().
+ */
+static int au_cpup_simple(struct au_cp_generic *cpg)
+{
+	int err;
+	unsigned int flags_orig;
+	struct dentry *dentry;
+
+	AuDebugOn(cpg->bsrc < 0);
+
+	dentry = cpg->dentry;
+	DiMustWriteLock(dentry);
+
+	err = au_lkup_neg(dentry, cpg->bdst, /*wh*/1);
+	if (!err) {
+		flags_orig = cpg->flags;
+		au_fset_cpup(cpg->flags, RENAME);
+		err = au_cpup_single(cpg, NULL);
+		cpg->flags = flags_orig;
+		if (!err)
+			return 0; /* success */
+
+		/* revert */
+		au_set_h_dptr(dentry, cpg->bdst, NULL);
+		au_set_dbtop(dentry, cpg->bsrc);
+	}
+
+	return err;
+}
+
+struct au_cpup_simple_args {
+	int *errp;
+	struct au_cp_generic *cpg;
+};
+
+static void au_call_cpup_simple(void *args)
+{
+	struct au_cpup_simple_args *a = args;
+
+	au_pin_hdir_acquire_nest(a->cpg->pin);
+	*a->errp = au_cpup_simple(a->cpg);
+	au_pin_hdir_release(a->cpg->pin);
+}
+
+static int au_do_sio_cpup_simple(struct au_cp_generic *cpg)
+{
+	int err, wkq_err;
+	struct dentry *dentry, *parent;
+	struct file *h_file;
+	struct inode *h_dir;
+
+	dentry = cpg->dentry;
+	h_file = NULL;
+	if (au_ftest_cpup(cpg->flags, HOPEN)) {
+		AuDebugOn(cpg->bsrc < 0);
+		h_file = au_h_open_pre(dentry, cpg->bsrc, /*force_wr*/0);
+		err = PTR_ERR(h_file);
+		if (IS_ERR(h_file))
+			goto out;
+	}
+
+	parent = dget_parent(dentry);
+	h_dir = au_h_iptr(d_inode(parent), cpg->bdst);
+	if (!au_test_h_perm_sio(h_dir, MAY_EXEC | MAY_WRITE)
+	    && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode))
+		err = au_cpup_simple(cpg);
+	else {
+		struct au_cpup_simple_args args = {
+			.errp		= &err,
+			.cpg		= cpg
+		};
+		wkq_err = au_wkq_wait(au_call_cpup_simple, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+
+	dput(parent);
+	if (h_file)
+		au_h_open_post(dentry, cpg->bsrc, h_file);
+
+out:
+	return err;
+}
+
+int au_sio_cpup_simple(struct au_cp_generic *cpg)
+{
+	aufs_bindex_t bsrc, bbot;
+	struct dentry *dentry, *h_dentry;
+
+	if (cpg->bsrc < 0) {
+		dentry = cpg->dentry;
+		bbot = au_dbbot(dentry);
+		for (bsrc = cpg->bdst + 1; bsrc <= bbot; bsrc++) {
+			h_dentry = au_h_dptr(dentry, bsrc);
+			if (h_dentry) {
+				AuDebugOn(d_is_negative(h_dentry));
+				break;
+			}
+		}
+		AuDebugOn(bsrc > bbot);
+		cpg->bsrc = bsrc;
+	}
+	AuDebugOn(cpg->bsrc <= cpg->bdst);
+	return au_do_sio_cpup_simple(cpg);
+}
+
+int au_sio_cpdown_simple(struct au_cp_generic *cpg)
+{
+	AuDebugOn(cpg->bdst <= cpg->bsrc);
+	return au_do_sio_cpup_simple(cpg);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * copyup the deleted file for writing.
+ */
+static int au_do_cpup_wh(struct au_cp_generic *cpg, struct dentry *wh_dentry,
+			 struct file *file)
+{
+	int err;
+	unsigned int flags_orig;
+	aufs_bindex_t bsrc_orig;
+	struct au_dinfo *dinfo;
+	struct {
+		struct au_hdentry *hd;
+		struct dentry *h_dentry;
+	} hdst, hsrc;
+
+	dinfo = au_di(cpg->dentry);
+	AuRwMustWriteLock(&dinfo->di_rwsem);
+
+	bsrc_orig = cpg->bsrc;
+	cpg->bsrc = dinfo->di_btop;
+	hdst.hd = au_hdentry(dinfo, cpg->bdst);
+	hdst.h_dentry = hdst.hd->hd_dentry;
+	hdst.hd->hd_dentry = wh_dentry;
+	dinfo->di_btop = cpg->bdst;
+
+	hsrc.h_dentry = NULL;
+	if (file) {
+		hsrc.hd = au_hdentry(dinfo, cpg->bsrc);
+		hsrc.h_dentry = hsrc.hd->hd_dentry;
+		hsrc.hd->hd_dentry = au_hf_top(file)->f_path.dentry;
+	}
+	flags_orig = cpg->flags;
+	cpg->flags = !AuCpup_DTIME;
+	err = au_cpup_single(cpg, /*h_parent*/NULL);
+	cpg->flags = flags_orig;
+	if (file) {
+		if (!err)
+			err = au_reopen_nondir(file);
+		hsrc.hd->hd_dentry = hsrc.h_dentry;
+	}
+	hdst.hd->hd_dentry = hdst.h_dentry;
+	dinfo->di_btop = cpg->bsrc;
+	cpg->bsrc = bsrc_orig;
+
+	return err;
+}
+
+static int au_cpup_wh(struct au_cp_generic *cpg, struct file *file)
+{
+	int err;
+	aufs_bindex_t bdst;
+	struct au_dtime dt;
+	struct dentry *dentry, *parent, *h_parent, *wh_dentry;
+	struct au_branch *br;
+	struct path h_path;
+
+	dentry = cpg->dentry;
+	bdst = cpg->bdst;
+	br = au_sbr(dentry->d_sb, bdst);
+	parent = dget_parent(dentry);
+	h_parent = au_h_dptr(parent, bdst);
+	wh_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry))
+		goto out;
+
+	h_path.dentry = h_parent;
+	h_path.mnt = au_br_mnt(br);
+	au_dtime_store(&dt, parent, &h_path);
+	err = au_do_cpup_wh(cpg, wh_dentry, file);
+	if (unlikely(err))
+		goto out_wh;
+
+	dget(wh_dentry);
+	h_path.dentry = wh_dentry;
+	if (!d_is_dir(wh_dentry)) {
+		/* no delegation since it is just created */
+		err = vfsub_unlink(d_inode(h_parent), &h_path,
+				   /*delegated*/NULL, /*force*/0);
+	} else
+		err = vfsub_rmdir(d_inode(h_parent), &h_path);
+	if (unlikely(err)) {
+		AuIOErr("failed remove copied-up tmp file %pd(%d)\n",
+			wh_dentry, err);
+		err = -EIO;
+	}
+	au_dtime_revert(&dt);
+	au_set_hi_wh(d_inode(dentry), bdst, wh_dentry);
+
+out_wh:
+	dput(wh_dentry);
+out:
+	dput(parent);
+	return err;
+}
+
+struct au_cpup_wh_args {
+	int *errp;
+	struct au_cp_generic *cpg;
+	struct file *file;
+};
+
+static void au_call_cpup_wh(void *args)
+{
+	struct au_cpup_wh_args *a = args;
+
+	au_pin_hdir_acquire_nest(a->cpg->pin);
+	*a->errp = au_cpup_wh(a->cpg, a->file);
+	au_pin_hdir_release(a->cpg->pin);
+}
+
+int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file)
+{
+	int err, wkq_err;
+	aufs_bindex_t bdst;
+	struct dentry *dentry, *parent, *h_orph, *h_parent;
+	struct inode *dir, *h_dir, *h_tmpdir;
+	struct au_wbr *wbr;
+	struct au_pin wh_pin, *pin_orig;
+
+	dentry = cpg->dentry;
+	bdst = cpg->bdst;
+	parent = dget_parent(dentry);
+	dir = d_inode(parent);
+	h_orph = NULL;
+	h_parent = NULL;
+	h_dir = au_igrab(au_h_iptr(dir, bdst));
+	h_tmpdir = h_dir;
+	pin_orig = NULL;
+	if (!h_dir->i_nlink) {
+		wbr = au_sbr(dentry->d_sb, bdst)->br_wbr;
+		h_orph = wbr->wbr_orph;
+
+		h_parent = dget(au_h_dptr(parent, bdst));
+		au_set_h_dptr(parent, bdst, dget(h_orph));
+		h_tmpdir = d_inode(h_orph);
+		au_set_h_iptr(dir, bdst, au_igrab(h_tmpdir), /*flags*/0);
+
+		inode_lock_nested(h_tmpdir, AuLsc_I_PARENT3);
+		/* todo: au_h_open_pre()? */
+
+		pin_orig = cpg->pin;
+		au_pin_init(&wh_pin, dentry, bdst, AuLsc_DI_PARENT,
+			    AuLsc_I_PARENT3, cpg->pin->udba, AuPin_DI_LOCKED);
+		cpg->pin = &wh_pin;
+	}
+
+	if (!au_test_h_perm_sio(h_tmpdir, MAY_EXEC | MAY_WRITE)
+	    && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode))
+		err = au_cpup_wh(cpg, file);
+	else {
+		struct au_cpup_wh_args args = {
+			.errp	= &err,
+			.cpg	= cpg,
+			.file	= file
+		};
+		wkq_err = au_wkq_wait(au_call_cpup_wh, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+
+	if (h_orph) {
+		inode_unlock(h_tmpdir);
+		/* todo: au_h_open_post()? */
+		au_set_h_iptr(dir, bdst, au_igrab(h_dir), /*flags*/0);
+		au_set_h_dptr(parent, bdst, h_parent);
+		AuDebugOn(!pin_orig);
+		cpg->pin = pin_orig;
+	}
+	iput(h_dir);
+	dput(parent);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * generic routine for both copy-up and copy-down.
+ */
+/* cf. revalidate function in file.c */
+int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst,
+	       int (*cp)(struct dentry *dentry, aufs_bindex_t bdst,
+			 struct au_pin *pin,
+			 struct dentry *h_parent, void *arg),
+	       void *arg)
+{
+	int err;
+	struct au_pin pin;
+	struct dentry *d, *parent, *h_parent, *real_parent, *h_dentry;
+
+	err = 0;
+	parent = dget_parent(dentry);
+	if (IS_ROOT(parent))
+		goto out;
+
+	au_pin_init(&pin, dentry, bdst, AuLsc_DI_PARENT2, AuLsc_I_PARENT2,
+		    au_opt_udba(dentry->d_sb), AuPin_MNT_WRITE);
+
+	/* do not use au_dpage */
+	real_parent = parent;
+	while (1) {
+		dput(parent);
+		parent = dget_parent(dentry);
+		h_parent = au_h_dptr(parent, bdst);
+		if (h_parent)
+			goto out; /* success */
+
+		/* find top dir which is necessary to cpup */
+		do {
+			d = parent;
+			dput(parent);
+			parent = dget_parent(d);
+			di_read_lock_parent3(parent, !AuLock_IR);
+			h_parent = au_h_dptr(parent, bdst);
+			di_read_unlock(parent, !AuLock_IR);
+		} while (!h_parent);
+
+		if (d != real_parent)
+			di_write_lock_child3(d);
+
+		/* somebody else might create while we were sleeping */
+		h_dentry = au_h_dptr(d, bdst);
+		if (!h_dentry || d_is_negative(h_dentry)) {
+			if (h_dentry)
+				au_update_dbtop(d);
+
+			au_pin_set_dentry(&pin, d);
+			err = au_do_pin(&pin);
+			if (!err) {
+				err = cp(d, bdst, &pin, h_parent, arg);
+				au_unpin(&pin);
+			}
+		}
+
+		if (d != real_parent)
+			di_write_unlock(d);
+		if (unlikely(err))
+			break;
+	}
+
+out:
+	dput(parent);
+	return err;
+}
+
+static int au_cpup_dir(struct dentry *dentry, aufs_bindex_t bdst,
+		       struct au_pin *pin,
+		       struct dentry *h_parent __maybe_unused,
+		       void *arg __maybe_unused)
+{
+	struct au_cp_generic cpg = {
+		.dentry	= dentry,
+		.bdst	= bdst,
+		.bsrc	= -1,
+		.len	= 0,
+		.pin	= pin,
+		.flags	= AuCpup_DTIME
+	};
+	return au_sio_cpup_simple(&cpg);
+}
+
+int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst)
+{
+	return au_cp_dirs(dentry, bdst, au_cpup_dir, NULL);
+}
+
+int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst)
+{
+	int err;
+	struct dentry *parent;
+	struct inode *dir;
+
+	parent = dget_parent(dentry);
+	dir = d_inode(parent);
+	err = 0;
+	if (au_h_iptr(dir, bdst))
+		goto out;
+
+	di_read_unlock(parent, AuLock_IR);
+	di_write_lock_parent(parent);
+	/* someone else might change our inode while we were sleeping */
+	if (!au_h_iptr(dir, bdst))
+		err = au_cpup_dirs(dentry, bdst);
+	di_downgrade_lock(parent, AuLock_IR);
+
+out:
+	dput(parent);
+	return err;
+}
diff --git b/fs/aufs/cpup.h b/fs/aufs/cpup.h
new file mode 100644
index 0000000..ccba2c4
--- /dev/null
+++ b/fs/aufs/cpup.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * copy-up/down functions
+ */
+
+#ifndef __AUFS_CPUP_H__
+#define __AUFS_CPUP_H__
+
+#ifdef __KERNEL__
+
+#include <linux/path.h>
+
+struct inode;
+struct file;
+struct au_pin;
+
+void au_cpup_attr_flags(struct inode *dst, unsigned int iflags);
+void au_cpup_attr_timesizes(struct inode *inode);
+void au_cpup_attr_nlink(struct inode *inode, int force);
+void au_cpup_attr_changeable(struct inode *inode);
+void au_cpup_igen(struct inode *inode, struct inode *h_inode);
+void au_cpup_attr_all(struct inode *inode, int force);
+
+/* ---------------------------------------------------------------------- */
+
+struct au_cp_generic {
+	struct dentry	*dentry;
+	aufs_bindex_t	bdst, bsrc;
+	loff_t		len;
+	struct au_pin	*pin;
+	unsigned int	flags;
+};
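+
+/*
+ * A typical initialization (cf. au_cpup_dir() in cpup.c) looks roughly
+ * like this, where bsrc == -1 lets au_sio_cpup_simple() pick the first
+ * positive lower branch and len == 0 means no data copy (-1 copies the
+ * whole file):
+ *
+ *	struct au_cp_generic cpg = {
+ *		.dentry	= dentry,
+ *		.bdst	= bdst,
+ *		.bsrc	= -1,
+ *		.len	= 0,
+ *		.pin	= pin,
+ *		.flags	= AuCpup_DTIME
+ *	};
+ *	err = au_sio_cpup_simple(&cpg);
+ */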
+
+/* cpup flags */
+#define AuCpup_DTIME		1		/* do dtime_store/revert */
+#define AuCpup_KEEPLINO		(1 << 1)	/* do not clear the lower xino,
+						   for link(2) */
+#define AuCpup_RENAME		(1 << 2)	/* rename after cpup */
+#define AuCpup_HOPEN		(1 << 3)	/* call h_open_pre/post() in
+						   cpup */
+#define AuCpup_OVERWRITE	(1 << 4)	/* allow overwriting the
+						   existing entry */
+#define AuCpup_RWDST		(1 << 5)	/* force write target even if
+						   the branch is marked as RO */
+
+#define au_ftest_cpup(flags, name)	((flags) & AuCpup_##name)
+#define au_fset_cpup(flags, name) \
+	do { (flags) |= AuCpup_##name; } while (0)
+#define au_fclr_cpup(flags, name) \
+	do { (flags) &= ~AuCpup_##name; } while (0)
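+
+/*
+ * Callers toggle individual bits for a single call and restore the word
+ * afterwards, roughly as au_cpup_simple() does:
+ *
+ *	flags_orig = cpg->flags;
+ *	au_fset_cpup(cpg->flags, RENAME);
+ *	err = au_cpup_single(cpg, NULL);
+ *	cpg->flags = flags_orig;
+ */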
+
+int au_copy_file(struct file *dst, struct file *src, loff_t len);
+int au_sio_cpup_simple(struct au_cp_generic *cpg);
+int au_sio_cpdown_simple(struct au_cp_generic *cpg);
+int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file);
+
+int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst,
+	       int (*cp)(struct dentry *dentry, aufs_bindex_t bdst,
+			 struct au_pin *pin,
+			 struct dentry *h_parent, void *arg),
+	       void *arg);
+int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst);
+int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst);
+
+/* ---------------------------------------------------------------------- */
+
+/* keep timestamps during copy-up */
+struct au_dtime {
+	struct dentry *dt_dentry;
+	struct path dt_h_path;
+	struct timespec dt_atime, dt_mtime;
+};
+void au_dtime_store(struct au_dtime *dt, struct dentry *dentry,
+		    struct path *h_path);
+void au_dtime_revert(struct au_dtime *dt);
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_CPUP_H__ */
diff --git b/fs/aufs/dbgaufs.c b/fs/aufs/dbgaufs.c
new file mode 100644
index 0000000..4db01e8
--- /dev/null
+++ b/fs/aufs/dbgaufs.c
@@ -0,0 +1,425 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * debugfs interface
+ */
+
+#include <linux/debugfs.h>
+#include "aufs.h"
+
+#ifndef CONFIG_SYSFS
+#error DEBUG_FS depends upon SYSFS
+#endif
+
+static struct dentry *dbgaufs;
+static const mode_t dbgaufs_mode = S_IRUSR | S_IRGRP | S_IROTH;
+
+/* 20 is the maximum number of decimal digits of an unsigned 64-bit integer */
+struct dbgaufs_arg {
+	int n;
+	char a[20 * 4];
+};
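+
+/*
+ * The "%ld, %llux%lu %lld\n" format below prints at most four numbers of
+ * up to 20 decimal digits each (ULLONG_MAX is 18446744073709551615),
+ * hence the 20 * 4 bytes; the separators eat into that budget, which the
+ * AuDebugOn() in dbgaufs_xi_open() checks.
+ */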
+
+/*
+ * common function for all XINO files
+ */
+static int dbgaufs_xi_release(struct inode *inode __maybe_unused,
+			      struct file *file)
+{
+	au_delayed_kfree(file->private_data);
+	return 0;
+}
+
+static int dbgaufs_xi_open(struct file *xf, struct file *file, int do_fcnt)
+{
+	int err;
+	struct kstat st;
+	struct dbgaufs_arg *p;
+
+	err = -ENOMEM;
+	p = kmalloc(sizeof(*p), GFP_NOFS);
+	if (unlikely(!p))
+		goto out;
+
+	err = 0;
+	p->n = 0;
+	file->private_data = p;
+	if (!xf)
+		goto out;
+
+	err = vfs_getattr(&xf->f_path, &st);
+	if (!err) {
+		if (do_fcnt)
+			p->n = snprintf
+				(p->a, sizeof(p->a), "%ld, %llux%lu %lld\n",
+				 (long)file_count(xf), st.blocks, st.blksize,
+				 (long long)st.size);
+		else
+			p->n = snprintf(p->a, sizeof(p->a), "%llux%lu %lld\n",
+					st.blocks, st.blksize,
+					(long long)st.size);
+		AuDebugOn(p->n >= sizeof(p->a));
+	} else {
+		p->n = snprintf(p->a, sizeof(p->a), "err %d\n", err);
+		err = 0;
+	}
+
+out:
+	return err;
+}
+
+static ssize_t dbgaufs_xi_read(struct file *file, char __user *buf,
+			       size_t count, loff_t *ppos)
+{
+	struct dbgaufs_arg *p;
+
+	p = file->private_data;
+	return simple_read_from_buffer(buf, count, ppos, p->a, p->n);
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct dbgaufs_plink_arg {
+	int n;
+	char a[];
+};
+
+static int dbgaufs_plink_release(struct inode *inode __maybe_unused,
+				 struct file *file)
+{
+	au_delayed_free_page((unsigned long)file->private_data);
+	return 0;
+}
+
+static int dbgaufs_plink_open(struct inode *inode, struct file *file)
+{
+	int err, i, limit;
+	unsigned long n, sum;
+	struct dbgaufs_plink_arg *p;
+	struct au_sbinfo *sbinfo;
+	struct super_block *sb;
+	struct au_sphlhead *sphl;
+
+	err = -ENOMEM;
+	p = (void *)get_zeroed_page(GFP_NOFS);
+	if (unlikely(!p))
+		goto out;
+
+	err = -EFBIG;
+	sbinfo = inode->i_private;
+	sb = sbinfo->si_sb;
+	si_noflush_read_lock(sb);
+	if (au_opt_test(au_mntflags(sb), PLINK)) {
+		limit = PAGE_SIZE - sizeof(p->n);
+
+		/* the number of buckets */
+		n = snprintf(p->a + p->n, limit, "%d\n", AuPlink_NHASH);
+		p->n += n;
+		limit -= n;
+
+		sum = 0;
+		for (i = 0, sphl = sbinfo->si_plink;
+		     i < AuPlink_NHASH;
+		     i++, sphl++) {
+			n = au_sphl_count(sphl);
+			sum += n;
+
+			n = snprintf(p->a + p->n, limit, "%lu ", n);
+			p->n += n;
+			limit -= n;
+			if (unlikely(limit <= 0))
+				goto out_free;
+		}
+		p->a[p->n - 1] = '\n';
+
+		/* the sum of plinks */
+		n = snprintf(p->a + p->n, limit, "%lu\n", sum);
+		p->n += n;
+		limit -= n;
+		if (unlikely(limit <= 0))
+			goto out_free;
+	} else {
+#define str "1\n0\n0\n"
+		p->n = sizeof(str) - 1;
+		strcpy(p->a, str);
+#undef str
+	}
+	si_read_unlock(sb);
+
+	err = 0;
+	file->private_data = p;
+	goto out; /* success */
+
+out_free:
+	au_delayed_free_page((unsigned long)p);
+out:
+	return err;
+}
+
+static ssize_t dbgaufs_plink_read(struct file *file, char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	struct dbgaufs_plink_arg *p;
+
+	p = file->private_data;
+	return simple_read_from_buffer(buf, count, ppos, p->a, p->n);
+}
+
+static const struct file_operations dbgaufs_plink_fop = {
+	.owner		= THIS_MODULE,
+	.open		= dbgaufs_plink_open,
+	.release	= dbgaufs_plink_release,
+	.read		= dbgaufs_plink_read
+};
+
+/* ---------------------------------------------------------------------- */
+
+static int dbgaufs_xib_open(struct inode *inode, struct file *file)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+	struct super_block *sb;
+
+	sbinfo = inode->i_private;
+	sb = sbinfo->si_sb;
+	si_noflush_read_lock(sb);
+	err = dbgaufs_xi_open(sbinfo->si_xib, file, /*do_fcnt*/0);
+	si_read_unlock(sb);
+	return err;
+}
+
+static const struct file_operations dbgaufs_xib_fop = {
+	.owner		= THIS_MODULE,
+	.open		= dbgaufs_xib_open,
+	.release	= dbgaufs_xi_release,
+	.read		= dbgaufs_xi_read
+};
+
+/* ---------------------------------------------------------------------- */
+
+#define DbgaufsXi_PREFIX "xi"
+
+static int dbgaufs_xino_open(struct inode *inode, struct file *file)
+{
+	int err;
+	long l;
+	struct au_sbinfo *sbinfo;
+	struct super_block *sb;
+	struct file *xf;
+	struct qstr *name;
+
+	err = -ENOENT;
+	xf = NULL;
+	name = &file->f_path.dentry->d_name;
+	if (unlikely(name->len < sizeof(DbgaufsXi_PREFIX)
+		     || memcmp(name->name, DbgaufsXi_PREFIX,
+			       sizeof(DbgaufsXi_PREFIX) - 1)))
+		goto out;
+	err = kstrtol(name->name + sizeof(DbgaufsXi_PREFIX) - 1, 10, &l);
+	if (unlikely(err))
+		goto out;
+
+	sbinfo = inode->i_private;
+	sb = sbinfo->si_sb;
+	si_noflush_read_lock(sb);
+	if (l <= au_sbbot(sb)) {
+		xf = au_sbr(sb, (aufs_bindex_t)l)->br_xino.xi_file;
+		err = dbgaufs_xi_open(xf, file, /*do_fcnt*/1);
+	} else
+		err = -ENOENT;
+	si_read_unlock(sb);
+
+out:
+	return err;
+}
+
+static const struct file_operations dbgaufs_xino_fop = {
+	.owner		= THIS_MODULE,
+	.open		= dbgaufs_xino_open,
+	.release	= dbgaufs_xi_release,
+	.read		= dbgaufs_xi_read
+};
+
+void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex)
+{
+	aufs_bindex_t bbot;
+	struct au_branch *br;
+	struct au_xino_file *xi;
+
+	if (!au_sbi(sb)->si_dbgaufs)
+		return;
+
+	bbot = au_sbbot(sb);
+	for (; bindex <= bbot; bindex++) {
+		br = au_sbr(sb, bindex);
+		xi = &br->br_xino;
+		/* debugfs acquires the parent i_mutex */
+		lockdep_off();
+		debugfs_remove(xi->xi_dbgaufs);
+		lockdep_on();
+		xi->xi_dbgaufs = NULL;
+	}
+}
+
+void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex)
+{
+	struct au_sbinfo *sbinfo;
+	struct dentry *parent;
+	struct au_branch *br;
+	struct au_xino_file *xi;
+	aufs_bindex_t bbot;
+	char name[sizeof(DbgaufsXi_PREFIX) + 5]; /* "xi" + bindex + NUL */
+
+	sbinfo = au_sbi(sb);
+	parent = sbinfo->si_dbgaufs;
+	if (!parent)
+		return;
+
+	bbot = au_sbbot(sb);
+	for (; bindex <= bbot; bindex++) {
+		snprintf(name, sizeof(name), DbgaufsXi_PREFIX "%d", bindex);
+		br = au_sbr(sb, bindex);
+		xi = &br->br_xino;
+		AuDebugOn(xi->xi_dbgaufs);
+		/* debugfs acquires the parent i_mutex */
+		lockdep_off();
+		xi->xi_dbgaufs = debugfs_create_file(name, dbgaufs_mode, parent,
+						     sbinfo, &dbgaufs_xino_fop);
+		lockdep_on();
+		/* ignore an error */
+		if (unlikely(!xi->xi_dbgaufs))
+			AuWarn1("failed %s under debugfs\n", name);
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_EXPORT
+static int dbgaufs_xigen_open(struct inode *inode, struct file *file)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+	struct super_block *sb;
+
+	sbinfo = inode->i_private;
+	sb = sbinfo->si_sb;
+	si_noflush_read_lock(sb);
+	err = dbgaufs_xi_open(sbinfo->si_xigen, file, /*do_fcnt*/0);
+	si_read_unlock(sb);
+	return err;
+}
+
+static const struct file_operations dbgaufs_xigen_fop = {
+	.owner		= THIS_MODULE,
+	.open		= dbgaufs_xigen_open,
+	.release	= dbgaufs_xi_release,
+	.read		= dbgaufs_xi_read
+};
+
+static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo)
+{
+	int err;
+
+	/*
+	 * This function is a dynamic '__init' function actually,
+	 * so the tiny check for si_rwsem is unnecessary.
+	 */
+	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
+
+	err = -EIO;
+	sbinfo->si_dbgaufs_xigen = debugfs_create_file
+		("xigen", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo,
+		 &dbgaufs_xigen_fop);
+	if (sbinfo->si_dbgaufs_xigen)
+		err = 0;
+
+	return err;
+}
+#else
+static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo)
+{
+	return 0;
+}
+#endif /* CONFIG_AUFS_EXPORT */
+
+/* ---------------------------------------------------------------------- */
+
+void dbgaufs_si_fin(struct au_sbinfo *sbinfo)
+{
+	/*
+	 * This function is a dynamic '__fin' function actually,
+	 * so the tiny check for si_rwsem is unnecessary.
+	 */
+	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
+
+	debugfs_remove_recursive(sbinfo->si_dbgaufs);
+	sbinfo->si_dbgaufs = NULL;
+	kobject_put(&sbinfo->si_kobj);
+}
+
+int dbgaufs_si_init(struct au_sbinfo *sbinfo)
+{
+	int err;
+	char name[SysaufsSiNameLen];
+
+	/*
+	 * This function is a dynamic '__init' function actually,
+	 * so the tiny check for si_rwsem is unnecessary.
+	 */
+	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
+
+	err = -ENOENT;
+	if (!dbgaufs) {
+		AuErr1("/debug/aufs is uninitialized\n");
+		goto out;
+	}
+
+	err = -EIO;
+	sysaufs_name(sbinfo, name);
+	sbinfo->si_dbgaufs = debugfs_create_dir(name, dbgaufs);
+	if (unlikely(!sbinfo->si_dbgaufs))
+		goto out;
+	kobject_get(&sbinfo->si_kobj);
+
+	sbinfo->si_dbgaufs_xib = debugfs_create_file
+		("xib", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo,
+		 &dbgaufs_xib_fop);
+	if (unlikely(!sbinfo->si_dbgaufs_xib))
+		goto out_dir;
+
+	sbinfo->si_dbgaufs_plink = debugfs_create_file
+		("plink", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo,
+		 &dbgaufs_plink_fop);
+	if (unlikely(!sbinfo->si_dbgaufs_plink))
+		goto out_dir;
+
+	err = dbgaufs_xigen_init(sbinfo);
+	if (!err)
+		goto out; /* success */
+
+out_dir:
+	dbgaufs_si_fin(sbinfo);
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void dbgaufs_fin(void)
+{
+	debugfs_remove(dbgaufs);
+}
+
+int __init dbgaufs_init(void)
+{
+	int err;
+
+	err = -EIO;
+	dbgaufs = debugfs_create_dir(AUFS_NAME, NULL);
+	if (dbgaufs)
+		err = 0;
+	return err;
+}
diff --git b/fs/aufs/dbgaufs.h b/fs/aufs/dbgaufs.h
new file mode 100644
index 0000000..81f272e
--- /dev/null
+++ b/fs/aufs/dbgaufs.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * debugfs interface
+ */
+
+#ifndef __DBGAUFS_H__
+#define __DBGAUFS_H__
+
+#ifdef __KERNEL__
+
+struct super_block;
+struct au_sbinfo;
+
+#ifdef CONFIG_DEBUG_FS
+/* dbgaufs.c */
+void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex);
+void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex);
+void dbgaufs_si_fin(struct au_sbinfo *sbinfo);
+int dbgaufs_si_init(struct au_sbinfo *sbinfo);
+void dbgaufs_fin(void);
+int __init dbgaufs_init(void);
+#else
+AuStubVoid(dbgaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex)
+AuStubVoid(dbgaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex)
+AuStubVoid(dbgaufs_si_fin, struct au_sbinfo *sbinfo)
+AuStubInt0(dbgaufs_si_init, struct au_sbinfo *sbinfo)
+AuStubVoid(dbgaufs_fin, void)
+AuStubInt0(__init dbgaufs_init, void)
+#endif /* CONFIG_DEBUG_FS */
+
+#endif /* __KERNEL__ */
+#endif /* __DBGAUFS_H__ */
diff --git b/fs/aufs/dcsub.c b/fs/aufs/dcsub.c
new file mode 100644
index 0000000..5d46398
--- /dev/null
+++ b/fs/aufs/dcsub.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sub-routines for dentry cache
+ */
+
+#include "aufs.h"
+
+static void au_dpage_free(struct au_dpage *dpage)
+{
+	int i;
+	struct dentry **p;
+
+	p = dpage->dentries;
+	for (i = 0; i < dpage->ndentry; i++)
+		dput(*p++);
+	au_delayed_free_page((unsigned long)dpage->dentries);
+}
+
+int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp)
+{
+	int err;
+	void *p;
+
+	err = -ENOMEM;
+	dpages->dpages = kmalloc(sizeof(*dpages->dpages), gfp);
+	if (unlikely(!dpages->dpages))
+		goto out;
+
+	p = (void *)__get_free_page(gfp);
+	if (unlikely(!p))
+		goto out_dpages;
+
+	dpages->dpages[0].ndentry = 0;
+	dpages->dpages[0].dentries = p;
+	dpages->ndpage = 1;
+	return 0; /* success */
+
+out_dpages:
+	au_delayed_kfree(dpages->dpages);
+out:
+	return err;
+}
+
+void au_dpages_free(struct au_dcsub_pages *dpages)
+{
+	int i;
+	struct au_dpage *p;
+
+	p = dpages->dpages;
+	for (i = 0; i < dpages->ndpage; i++)
+		au_dpage_free(p++);
+	au_delayed_kfree(dpages->dpages);
+}
+
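+/* append @dentry to the last dpage; allocate a new dpage when the current one is full */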
+static int au_dpages_append(struct au_dcsub_pages *dpages,
+			    struct dentry *dentry, gfp_t gfp)
+{
+	int err, sz;
+	struct au_dpage *dpage;
+	void *p;
+
+	dpage = dpages->dpages + dpages->ndpage - 1;
+	sz = PAGE_SIZE / sizeof(dentry);
+	if (unlikely(dpage->ndentry >= sz)) {
+		AuLabel(new dpage);
+		err = -ENOMEM;
+		sz = dpages->ndpage * sizeof(*dpages->dpages);
+		p = au_kzrealloc(dpages->dpages, sz,
+				 sz + sizeof(*dpages->dpages), gfp,
+				 /*may_shrink*/0);
+		if (unlikely(!p))
+			goto out;
+
+		dpages->dpages = p;
+		dpage = dpages->dpages + dpages->ndpage;
+		p = (void *)__get_free_page(gfp);
+		if (unlikely(!p))
+			goto out;
+
+		dpage->ndentry = 0;
+		dpage->dentries = p;
+		dpages->ndpage++;
+	}
+
+	AuDebugOn(au_dcount(dentry) <= 0);
+	dpage->dentries[dpage->ndentry++] = dget_dlock(dentry);
+	return 0; /* success */
+
+out:
+	return err;
+}
+
+/* todo: BAD approach */
+/* copied from linux/fs/dcache.c */
+enum d_walk_ret {
+	D_WALK_CONTINUE,
+	D_WALK_QUIT,
+	D_WALK_NORETRY,
+	D_WALK_SKIP,
+};
+
+extern void d_walk(struct dentry *parent, void *data,
+		   enum d_walk_ret (*enter)(void *, struct dentry *),
+		   void (*finish)(void *));
+
+struct ac_dpages_arg {
+	int err;
+	struct au_dcsub_pages *dpages;
+	struct super_block *sb;
+	au_dpages_test test;
+	void *arg;
+};
+
+static enum d_walk_ret au_call_dpages_append(void *_arg, struct dentry *dentry)
+{
+	enum d_walk_ret ret;
+	struct ac_dpages_arg *arg = _arg;
+
+	ret = D_WALK_CONTINUE;
+	if (dentry->d_sb == arg->sb
+	    && !IS_ROOT(dentry)
+	    && au_dcount(dentry) > 0
+	    && au_di(dentry)
+	    && (!arg->test || arg->test(dentry, arg->arg))) {
+		arg->err = au_dpages_append(arg->dpages, dentry, GFP_ATOMIC);
+		if (unlikely(arg->err))
+			ret = D_WALK_QUIT;
+	}
+
+	return ret;
+}
+
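+/* collect the non-root aufs dentries under @root which pass @test, via d_walk() */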
+int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root,
+		   au_dpages_test test, void *arg)
+{
+	struct ac_dpages_arg args = {
+		.err	= 0,
+		.dpages	= dpages,
+		.sb	= root->d_sb,
+		.test	= test,
+		.arg	= arg
+	};
+
+	d_walk(root, &args, au_call_dpages_append, NULL);
+
+	return args.err;
+}
+
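+/* collect @dentry (when @do_include) and its ancestors up to the root, under rename_lock */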
+int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry,
+		       int do_include, au_dpages_test test, void *arg)
+{
+	int err;
+
+	err = 0;
+	write_seqlock(&rename_lock);
+	spin_lock(&dentry->d_lock);
+	if (do_include
+	    && au_dcount(dentry) > 0
+	    && (!test || test(dentry, arg)))
+		err = au_dpages_append(dpages, dentry, GFP_ATOMIC);
+	spin_unlock(&dentry->d_lock);
+	if (unlikely(err))
+		goto out;
+
+	/*
+	 * RCU for vfsmount is unnecessary since this is a traverse in a single
+	 * mount
+	 */
+	while (!IS_ROOT(dentry)) {
+		dentry = dentry->d_parent; /* rename_lock is locked */
+		spin_lock(&dentry->d_lock);
+		if (au_dcount(dentry) > 0
+		    && (!test || test(dentry, arg)))
+			err = au_dpages_append(dpages, dentry, GFP_ATOMIC);
+		spin_unlock(&dentry->d_lock);
+		if (unlikely(err))
+			break;
+	}
+
+out:
+	write_sequnlock(&rename_lock);
+	return err;
+}
+
+static inline int au_dcsub_dpages_aufs(struct dentry *dentry, void *arg)
+{
+	return au_di(dentry) && dentry->d_sb == arg;
+}
+
+int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages,
+			    struct dentry *dentry, int do_include)
+{
+	return au_dcsub_pages_rev(dpages, dentry, do_include,
+				  au_dcsub_dpages_aufs, dentry->d_sb);
+}
+
+int au_test_subdir(struct dentry *d1, struct dentry *d2)
+{
+	struct path path[2] = {
+		{
+			.dentry = d1
+		},
+		{
+			.dentry = d2
+		}
+	};
+
+	return path_is_under(path + 0, path + 1);
+}
diff --git b/fs/aufs/dcsub.h b/fs/aufs/dcsub.h
new file mode 100644
index 0000000..5d2cf66
--- /dev/null
+++ b/fs/aufs/dcsub.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sub-routines for dentry cache
+ */
+
+#ifndef __AUFS_DCSUB_H__
+#define __AUFS_DCSUB_H__
+
+#ifdef __KERNEL__
+
+#include <linux/dcache.h>
+#include <linux/fs.h>
+
+struct au_dpage {
+	int ndentry;
+	struct dentry **dentries;
+};
+
+struct au_dcsub_pages {
+	int ndpage;
+	struct au_dpage *dpages;
+};
+
+/* ---------------------------------------------------------------------- */
+
+/* dcsub.c */
+int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp);
+void au_dpages_free(struct au_dcsub_pages *dpages);
+typedef int (*au_dpages_test)(struct dentry *dentry, void *arg);
+int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root,
+		   au_dpages_test test, void *arg);
+int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry,
+		       int do_include, au_dpages_test test, void *arg);
+int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages,
+			    struct dentry *dentry, int do_include);
+int au_test_subdir(struct dentry *d1, struct dentry *d2);
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * todo: in linux-3.13, several similar (but faster) helpers are added to
+ * include/linux/dcache.h. Try them (in the future).
+ */
+
+static inline int au_d_hashed_positive(struct dentry *d)
+{
+	int err;
+	struct inode *inode = d_inode(d);
+
+	err = 0;
+	if (unlikely(d_unhashed(d)
+		     || d_is_negative(d)
+		     || !inode->i_nlink))
+		err = -ENOENT;
+	return err;
+}
+
+static inline int au_d_linkable(struct dentry *d)
+{
+	int err;
+	struct inode *inode = d_inode(d);
+
+	err = au_d_hashed_positive(d);
+	if (err
+	    && d_is_positive(d)
+	    && (inode->i_state & I_LINKABLE))
+		err = 0;
+	return err;
+}
+
+static inline int au_d_alive(struct dentry *d)
+{
+	int err;
+	struct inode *inode;
+
+	err = 0;
+	if (!IS_ROOT(d))
+		err = au_d_hashed_positive(d);
+	else {
+		inode = d_inode(d);
+		if (unlikely(d_unlinked(d)
+			     || d_is_negative(d)
+			     || !inode->i_nlink))
+			err = -ENOENT;
+	}
+	return err;
+}
+
+static inline int au_alive_dir(struct dentry *d)
+{
+	int err;
+
+	err = au_d_alive(d);
+	if (unlikely(err || IS_DEADDIR(d_inode(d))))
+		err = -ENOENT;
+	return err;
+}
+
+static inline int au_qstreq(struct qstr *a, struct qstr *b)
+{
+	return a->len == b->len
+		&& !memcmp(a->name, b->name, a->len);
+}
+
+/*
+ * by the commit
+ * 360f547 2015-01-25 dcache: let the dentry count go down to zero without
+ *			taking d_lock
+ * the type of d_lockref.count became int, but the inlined function d_count()
+ * still returns unsigned int.
+ * I don't know why. Maybe it is for the existing d_count() users?
+ * Anyway au_dcount() lives on.
+ */
+static inline int au_dcount(struct dentry *d)
+{
+	return (int)d_count(d);
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_DCSUB_H__ */
diff --git b/fs/aufs/debug.c b/fs/aufs/debug.c
new file mode 100644
index 0000000..443fee3
--- /dev/null
+++ b/fs/aufs/debug.c
@@ -0,0 +1,427 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * debug print functions
+ */
+
+#include "aufs.h"
+
+/* Returns 0, or -errno.  arg is in kp->arg. */
+static int param_atomic_t_set(const char *val, const struct kernel_param *kp)
+{
+	int err, n;
+
+	err = kstrtoint(val, 0, &n);
+	if (!err) {
+		if (n > 0)
+			au_debug_on();
+		else
+			au_debug_off();
+	}
+	return err;
+}
+
+/* Returns length written or -errno.  Buffer is 4k (ie. be short!) */
+static int param_atomic_t_get(char *buffer, const struct kernel_param *kp)
+{
+	atomic_t *a;
+
+	a = kp->arg;
+	return sprintf(buffer, "%d", atomic_read(a));
+}
+
+static struct kernel_param_ops param_ops_atomic_t = {
+	.set = param_atomic_t_set,
+	.get = param_atomic_t_get
+	/* void (*free)(void *arg) */
+};
+
+atomic_t aufs_debug = ATOMIC_INIT(0);
+MODULE_PARM_DESC(debug, "debug print");
+module_param_named(debug, aufs_debug, atomic_t, S_IRUGO | S_IWUSR | S_IWGRP);
+
+DEFINE_MUTEX(au_dbg_mtx);	/* just to serialize the dbg msgs */
+char *au_plevel = KERN_DEBUG;
+#define dpri(fmt, ...) do {					\
+	if ((au_plevel						\
+	     && strcmp(au_plevel, KERN_DEBUG))			\
+	    || au_debug_test())					\
+		printk("%s" fmt, au_plevel, ##__VA_ARGS__);	\
+} while (0)
+
+/* ---------------------------------------------------------------------- */
+
+void au_dpri_whlist(struct au_nhash *whlist)
+{
+	unsigned long ul, n;
+	struct hlist_head *head;
+	struct au_vdir_wh *pos;
+
+	n = whlist->nh_num;
+	head = whlist->nh_head;
+	for (ul = 0; ul < n; ul++) {
+		hlist_for_each_entry(pos, head, wh_hash)
+			dpri("b%d, %.*s, %d\n",
+			     pos->wh_bindex,
+			     pos->wh_str.len, pos->wh_str.name,
+			     pos->wh_str.len);
+		head++;
+	}
+}
+
+void au_dpri_vdir(struct au_vdir *vdir)
+{
+	unsigned long ul;
+	union au_vdir_deblk_p p;
+	unsigned char *o;
+
+	if (!vdir || IS_ERR(vdir)) {
+		dpri("err %ld\n", PTR_ERR(vdir));
+		return;
+	}
+
+	dpri("deblk %u, nblk %lu, deblk %p, last{%lu, %p}, ver %lu\n",
+	     vdir->vd_deblk_sz, vdir->vd_nblk, vdir->vd_deblk,
+	     vdir->vd_last.ul, vdir->vd_last.p.deblk, vdir->vd_version);
+	for (ul = 0; ul < vdir->vd_nblk; ul++) {
+		p.deblk = vdir->vd_deblk[ul];
+		o = p.deblk;
+		dpri("[%lu]: %p\n", ul, o);
+	}
+}
+
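+/* print a single (branch) inode; returns -1 for a NULL or error pointer */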
+static int do_pri_inode(aufs_bindex_t bindex, struct inode *inode, int hn,
+			struct dentry *wh)
+{
+	char *n = NULL;
+	int l = 0;
+
+	if (!inode || IS_ERR(inode)) {
+		dpri("i%d: err %ld\n", bindex, PTR_ERR(inode));
+		return -1;
+	}
+
+	/* the type of i_blocks depends upon CONFIG_LBDAF */
+	BUILD_BUG_ON(sizeof(inode->i_blocks) != sizeof(unsigned long)
+		     && sizeof(inode->i_blocks) != sizeof(u64));
+	if (wh) {
+		n = (void *)wh->d_name.name;
+		l = wh->d_name.len;
+	}
+
+	dpri("i%d: %p, i%lu, %s, cnt %d, nl %u, 0%o, sz %llu, blk %llu,"
+	     " hn %d, ct %lld, np %lu, st 0x%lx, f 0x%x, v %llu, g %x%s%.*s\n",
+	     bindex, inode,
+	     inode->i_ino, inode->i_sb ? au_sbtype(inode->i_sb) : "??",
+	     atomic_read(&inode->i_count), inode->i_nlink, inode->i_mode,
+	     i_size_read(inode), (unsigned long long)inode->i_blocks,
+	     hn, (long long)timespec_to_ns(&inode->i_ctime) & 0x0ffff,
+	     inode->i_mapping ? inode->i_mapping->nrpages : 0,
+	     inode->i_state, inode->i_flags, inode->i_version,
+	     inode->i_generation,
+	     l ? ", wh " : "", l, n);
+	return 0;
+}
+
+void au_dpri_inode(struct inode *inode)
+{
+	struct au_iinfo *iinfo;
+	struct au_hinode *hi;
+	aufs_bindex_t bindex;
+	int err, hn;
+
+	err = do_pri_inode(-1, inode, -1, NULL);
+	if (err || !au_test_aufs(inode->i_sb) || au_is_bad_inode(inode))
+		return;
+
+	iinfo = au_ii(inode);
+	dpri("i-1: btop %d, bbot %d, gen %d\n",
+	     iinfo->ii_btop, iinfo->ii_bbot, au_iigen(inode, NULL));
+	if (iinfo->ii_btop < 0)
+		return;
+	hn = 0;
+	for (bindex = iinfo->ii_btop; bindex <= iinfo->ii_bbot; bindex++) {
+		hi = au_hinode(iinfo, bindex);
+		hn = !!au_hn(hi);
+		do_pri_inode(bindex, hi->hi_inode, hn, hi->hi_whdentry);
+	}
+}
+
+void au_dpri_dalias(struct inode *inode)
+{
+	struct dentry *d;
+
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias)
+		au_dpri_dentry(d);
+	spin_unlock(&inode->i_lock);
+}
+
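+/* print a single dentry and its inode; returns -1 for a NULL or error pointer */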
+static int do_pri_dentry(aufs_bindex_t bindex, struct dentry *dentry)
+{
+	struct dentry *wh = NULL;
+	int hn;
+	struct inode *inode;
+	struct au_iinfo *iinfo;
+	struct au_hinode *hi;
+
+	if (!dentry || IS_ERR(dentry)) {
+		dpri("d%d: err %ld\n", bindex, PTR_ERR(dentry));
+		return -1;
+	}
+	/* do not call dget_parent() here */
+	/* note: access d_xxx without d_lock */
+	dpri("d%d: %p, %pd2?, %s, cnt %d, flags 0x%x, %shashed\n",
+	     bindex, dentry, dentry,
+	     dentry->d_sb ? au_sbtype(dentry->d_sb) : "??",
+	     au_dcount(dentry), dentry->d_flags,
+	     d_unhashed(dentry) ? "un" : "");
+	hn = -1;
+	inode = NULL;
+	if (d_is_positive(dentry))
+		inode = d_inode(dentry);
+	if (inode
+	    && au_test_aufs(dentry->d_sb)
+	    && bindex >= 0
+	    && !au_is_bad_inode(inode)) {
+		iinfo = au_ii(inode);
+		hi = au_hinode(iinfo, bindex);
+		hn = !!au_hn(hi);
+		wh = hi->hi_whdentry;
+	}
+	do_pri_inode(bindex, inode, hn, wh);
+	return 0;
+}
+
+void au_dpri_dentry(struct dentry *dentry)
+{
+	struct au_dinfo *dinfo;
+	aufs_bindex_t bindex;
+	int err;
+
+	err = do_pri_dentry(-1, dentry);
+	if (err || !au_test_aufs(dentry->d_sb))
+		return;
+
+	dinfo = au_di(dentry);
+	if (!dinfo)
+		return;
+	dpri("d-1: btop %d, bbot %d, bwh %d, bdiropq %d, gen %d, tmp %d\n",
+	     dinfo->di_btop, dinfo->di_bbot,
+	     dinfo->di_bwh, dinfo->di_bdiropq, au_digen(dentry),
+	     dinfo->di_tmpfile);
+	if (dinfo->di_btop < 0)
+		return;
+	for (bindex = dinfo->di_btop; bindex <= dinfo->di_bbot; bindex++)
+		do_pri_dentry(bindex, au_hdentry(dinfo, bindex)->hd_dentry);
+}
+
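+/* print a single file and its dentry; returns -1 for a NULL or error pointer */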
+static int do_pri_file(aufs_bindex_t bindex, struct file *file)
+{
+	char a[32];
+
+	if (!file || IS_ERR(file)) {
+		dpri("f%d: err %ld\n", bindex, PTR_ERR(file));
+		return -1;
+	}
+	a[0] = 0;
+	if (bindex < 0
+	    && !IS_ERR_OR_NULL(file->f_path.dentry)
+	    && au_test_aufs(file->f_path.dentry->d_sb)
+	    && au_fi(file))
+		snprintf(a, sizeof(a), ", gen %d, mmapped %d",
+			 au_figen(file), atomic_read(&au_fi(file)->fi_mmapped));
+	dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, v %llu, pos %llu%s\n",
+	     bindex, file->f_mode, file->f_flags, (long)file_count(file),
+	     file->f_version, file->f_pos, a);
+	if (!IS_ERR_OR_NULL(file->f_path.dentry))
+		do_pri_dentry(bindex, file->f_path.dentry);
+	return 0;
+}
+
+void au_dpri_file(struct file *file)
+{
+	struct au_finfo *finfo;
+	struct au_fidir *fidir;
+	struct au_hfile *hfile;
+	aufs_bindex_t bindex;
+	int err;
+
+	err = do_pri_file(-1, file);
+	if (err
+	    || IS_ERR_OR_NULL(file->f_path.dentry)
+	    || !au_test_aufs(file->f_path.dentry->d_sb))
+		return;
+
+	finfo = au_fi(file);
+	if (!finfo)
+		return;
+	if (finfo->fi_btop < 0)
+		return;
+	fidir = finfo->fi_hdir;
+	if (!fidir)
+		do_pri_file(finfo->fi_btop, finfo->fi_htop.hf_file);
+	else
+		for (bindex = finfo->fi_btop;
+		     bindex >= 0 && bindex <= fidir->fd_bbot;
+		     bindex++) {
+			hfile = fidir->fd_hfile + bindex;
+			do_pri_file(bindex, hfile ? hfile->hf_file : NULL);
+		}
+}
+
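+/* print a single branch and its lower superblock */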
+static int do_pri_br(aufs_bindex_t bindex, struct au_branch *br)
+{
+	struct vfsmount *mnt;
+	struct super_block *sb;
+
+	if (!br || IS_ERR(br))
+		goto out;
+	mnt = au_br_mnt(br);
+	if (!mnt || IS_ERR(mnt))
+		goto out;
+	sb = mnt->mnt_sb;
+	if (!sb || IS_ERR(sb))
+		goto out;
+
+	dpri("s%d: {perm 0x%x, id %d, cnt %lld, wbr %p}, "
+	     "%s, dev 0x%02x%02x, flags 0x%lx, cnt %d, active %d, "
+	     "xino %d\n",
+	     bindex, br->br_perm, br->br_id, au_br_count(br),
+	     br->br_wbr, au_sbtype(sb), MAJOR(sb->s_dev), MINOR(sb->s_dev),
+	     sb->s_flags, sb->s_count,
+	     atomic_read(&sb->s_active), !!br->br_xino.xi_file);
+	return 0;
+
+out:
+	dpri("s%d: err %ld\n", bindex, PTR_ERR(br));
+	return -1;
+}
+
+void au_dpri_sb(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+	aufs_bindex_t bindex;
+	int err;
+	/* to reduce stack size */
+	struct {
+		struct vfsmount mnt;
+		struct au_branch fake;
+	} *a;
+
+	/* this function can be called from magic sysrq */
+	a = kzalloc(sizeof(*a), GFP_ATOMIC);
+	if (unlikely(!a)) {
+		dpri("no memory\n");
+		return;
+	}
+
+	a->mnt.mnt_sb = sb;
+	a->fake.br_path.mnt = &a->mnt;
+	au_br_count_init(&a->fake);
+	err = do_pri_br(-1, &a->fake);
+	au_br_count_fin(&a->fake);
+	au_delayed_kfree(a);
+	dpri("dev 0x%x\n", sb->s_dev);
+	if (err || !au_test_aufs(sb))
+		return;
+
+	sbinfo = au_sbi(sb);
+	if (!sbinfo)
+		return;
+	dpri("nw %d, gen %u, kobj %d\n",
+	     atomic_read(&sbinfo->si_nowait.nw_len), sbinfo->si_generation,
+	     atomic_read(&sbinfo->si_kobj.kref.refcount));
+	for (bindex = 0; bindex <= sbinfo->si_bbot; bindex++)
+		do_pri_br(bindex, sbinfo->si_branch[0 + bindex]);
+}
+
+/* ---------------------------------------------------------------------- */
+
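+/* verify that each lower dentry agrees with the lower inode recorded in the aufs inode */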
+void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line)
+{
+	struct inode *h_inode, *inode = d_inode(dentry);
+	struct dentry *h_dentry;
+	aufs_bindex_t bindex, bbot, bi;
+
+	if (!inode /* || au_di(dentry)->di_lsc == AuLsc_DI_TMP */)
+		return;
+
+	bbot = au_dbbot(dentry);
+	bi = au_ibbot(inode);
+	if (bi < bbot)
+		bbot = bi;
+	bindex = au_dbtop(dentry);
+	bi = au_ibtop(inode);
+	if (bi > bindex)
+		bindex = bi;
+
+	for (; bindex <= bbot; bindex++) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (!h_dentry)
+			continue;
+		h_inode = au_h_iptr(inode, bindex);
+		if (unlikely(h_inode != d_inode(h_dentry))) {
+			au_debug_on();
+			AuDbg("b%d, %s:%d\n", bindex, func, line);
+			AuDbgDentry(dentry);
+			AuDbgInode(inode);
+			au_debug_off();
+			BUG();
+		}
+	}
+}
+
+void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen)
+{
+	int err, i, j;
+	struct au_dcsub_pages dpages;
+	struct au_dpage *dpage;
+	struct dentry **dentries;
+
+	err = au_dpages_init(&dpages, GFP_NOFS);
+	AuDebugOn(err);
+	err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/1);
+	AuDebugOn(err);
+	for (i = dpages.ndpage - 1; !err && i >= 0; i--) {
+		dpage = dpages.dpages + i;
+		dentries = dpage->dentries;
+		for (j = dpage->ndentry - 1; !err && j >= 0; j--)
+			AuDebugOn(au_digen_test(dentries[j], sigen));
+	}
+	au_dpages_free(&dpages);
+}
+
+void au_dbg_verify_kthread(void)
+{
+	if (au_wkq_test()) {
+		au_dbg_blocked();
+		/*
+		 * It may be recursive, but udba=notify between two aufs mounts,
+		 * where a single ro branch is shared, is not a problem.
+		 */
+		/* WARN_ON(1); */
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+int __init au_debug_init(void)
+{
+	aufs_bindex_t bindex;
+	struct au_vdir_destr destr;
+
+	bindex = -1;
+	AuDebugOn(bindex >= 0);
+
+	destr.len = -1;
+	AuDebugOn(destr.len < NAME_MAX);
+
+#ifdef CONFIG_4KSTACKS
+	pr_warn("CONFIG_4KSTACKS is defined.\n");
+#endif
+
+	return 0;
+}
diff --git b/fs/aufs/debug.h b/fs/aufs/debug.h
new file mode 100644
index 0000000..0567f31
--- /dev/null
+++ b/fs/aufs/debug.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * debug print functions
+ */
+
+#ifndef __AUFS_DEBUG_H__
+#define __AUFS_DEBUG_H__
+
+#ifdef __KERNEL__
+
+#include <linux/atomic.h>
+#include <linux/module.h>
+#include <linux/kallsyms.h>
+#include <linux/sysrq.h>
+
+#ifdef CONFIG_AUFS_DEBUG
+#define AuDebugOn(a)		BUG_ON(a)
+
+/* module parameter */
+extern atomic_t aufs_debug;
+static inline void au_debug_on(void)
+{
+	atomic_inc(&aufs_debug);
+}
+static inline void au_debug_off(void)
+{
+	atomic_dec_if_positive(&aufs_debug);
+}
+
+static inline int au_debug_test(void)
+{
+	return atomic_read(&aufs_debug) > 0;
+}
+#else
+#define AuDebugOn(a)		do {} while (0)
+AuStubVoid(au_debug_on, void)
+AuStubVoid(au_debug_off, void)
+AuStubInt0(au_debug_test, void)
+#endif /* CONFIG_AUFS_DEBUG */
+
+#define param_check_atomic_t(name, p) __param_check(name, p, atomic_t)
+
+/* ---------------------------------------------------------------------- */
+
+/* debug print */
+
+#define AuDbg(fmt, ...) do { \
+	if (au_debug_test()) \
+		pr_debug("DEBUG: " fmt, ##__VA_ARGS__); \
+} while (0)
+#define AuLabel(l)		AuDbg(#l "\n")
+#define AuIOErr(fmt, ...)	pr_err("I/O Error, " fmt, ##__VA_ARGS__)
+#define AuWarn1(fmt, ...) do { \
+	static unsigned char _c; \
+	if (!_c++) \
+		pr_warn(fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define AuErr1(fmt, ...) do { \
+	static unsigned char _c; \
+	if (!_c++) \
+		pr_err(fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define AuIOErr1(fmt, ...) do { \
+	static unsigned char _c; \
+	if (!_c++) \
+		AuIOErr(fmt, ##__VA_ARGS__); \
+} while (0)
+
+#define AuUnsupportMsg	"This operation is not supported." \
+			" Please report this application to aufs-users ML."
+#define AuUnsupport(fmt, ...) do { \
+	pr_err(AuUnsupportMsg "\n" fmt, ##__VA_ARGS__); \
+	dump_stack(); \
+} while (0)
+
+#define AuTraceErr(e) do { \
+	if (unlikely((e) < 0)) \
+		AuDbg("err %d\n", (int)(e)); \
+} while (0)
+
+#define AuTraceErrPtr(p) do { \
+	if (IS_ERR(p)) \
+		AuDbg("err %ld\n", PTR_ERR(p)); \
+} while (0)
+
+/* dirty macros for debug print, use with "%.*s" and caution */
+#define AuLNPair(qstr)		(qstr)->len, (qstr)->name
+
+/* ---------------------------------------------------------------------- */
+
+struct dentry;
+#ifdef CONFIG_AUFS_DEBUG
+extern struct mutex au_dbg_mtx;
+extern char *au_plevel;
+struct au_nhash;
+void au_dpri_whlist(struct au_nhash *whlist);
+struct au_vdir;
+void au_dpri_vdir(struct au_vdir *vdir);
+struct inode;
+void au_dpri_inode(struct inode *inode);
+void au_dpri_dalias(struct inode *inode);
+void au_dpri_dentry(struct dentry *dentry);
+struct file;
+void au_dpri_file(struct file *filp);
+struct super_block;
+void au_dpri_sb(struct super_block *sb);
+
+#define au_dbg_verify_dinode(d) __au_dbg_verify_dinode(d, __func__, __LINE__)
+void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line);
+void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen);
+void au_dbg_verify_kthread(void);
+
+int __init au_debug_init(void);
+
+#define AuDbgWhlist(w) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#w "\n"); \
+	au_dpri_whlist(w); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgVdir(v) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#v "\n"); \
+	au_dpri_vdir(v); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgInode(i) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#i "\n"); \
+	au_dpri_inode(i); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgDAlias(i) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#i "\n"); \
+	au_dpri_dalias(i); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgDentry(d) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#d "\n"); \
+	au_dpri_dentry(d); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgFile(f) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#f "\n"); \
+	au_dpri_file(f); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgSb(sb) do { \
+	mutex_lock(&au_dbg_mtx); \
+	AuDbg(#sb "\n"); \
+	au_dpri_sb(sb); \
+	mutex_unlock(&au_dbg_mtx); \
+} while (0)
+
+#define AuDbgSym(addr) do {				\
+	char sym[KSYM_SYMBOL_LEN];			\
+	sprint_symbol(sym, (unsigned long)addr);	\
+	AuDbg("%s\n", sym);				\
+} while (0)
+#else
+AuStubVoid(au_dbg_verify_dinode, struct dentry *dentry)
+AuStubVoid(au_dbg_verify_gen, struct dentry *parent, unsigned int sigen)
+AuStubVoid(au_dbg_verify_kthread, void)
+AuStubInt0(__init au_debug_init, void)
+
+#define AuDbgWhlist(w)		do {} while (0)
+#define AuDbgVdir(v)		do {} while (0)
+#define AuDbgInode(i)		do {} while (0)
+#define AuDbgDAlias(i)		do {} while (0)
+#define AuDbgDentry(d)		do {} while (0)
+#define AuDbgFile(f)		do {} while (0)
+#define AuDbgSb(sb)		do {} while (0)
+#define AuDbgSym(addr)		do {} while (0)
+#endif /* CONFIG_AUFS_DEBUG */
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_MAGIC_SYSRQ
+int __init au_sysrq_init(void);
+void au_sysrq_fin(void);
+
+#ifdef CONFIG_HW_CONSOLE
+#define au_dbg_blocked() do { \
+	WARN_ON(1); \
+	handle_sysrq('w'); \
+} while (0)
+#else
+AuStubVoid(au_dbg_blocked, void)
+#endif
+
+#else
+AuStubInt0(__init au_sysrq_init, void)
+AuStubVoid(au_sysrq_fin, void)
+AuStubVoid(au_dbg_blocked, void)
+#endif /* CONFIG_AUFS_MAGIC_SYSRQ */
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_DEBUG_H__ */
diff --git b/fs/aufs/dentry.c b/fs/aufs/dentry.c
new file mode 100644
index 0000000..5eb4f36
--- /dev/null
+++ b/fs/aufs/dentry.c
@@ -0,0 +1,1117 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * lookup and dentry operations
+ */
+
+#include <linux/namei.h>
+#include "aufs.h"
+
+struct au_do_lookup_args {
+	unsigned int		flags;
+	mode_t			type;
+};
+
+/*
+ * returns positive/negative dentry, NULL or an error.
+ * NULL means whiteout-ed or not-found.
+ */
+static struct dentry*
+au_do_lookup(struct dentry *h_parent, struct dentry *dentry,
+	     aufs_bindex_t bindex, struct qstr *wh_name,
+	     struct au_do_lookup_args *args)
+{
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+	struct au_branch *br;
+	int wh_found, opq;
+	unsigned char wh_able;
+	const unsigned char allow_neg = !!au_ftest_lkup(args->flags, ALLOW_NEG);
+	const unsigned char ignore_perm = !!au_ftest_lkup(args->flags,
+							  IGNORE_PERM);
+
+	wh_found = 0;
+	br = au_sbr(dentry->d_sb, bindex);
+	wh_able = !!au_br_whable(br->br_perm);
+	if (wh_able)
+		wh_found = au_wh_test(h_parent, wh_name, ignore_perm);
+	h_dentry = ERR_PTR(wh_found);
+	if (!wh_found)
+		goto real_lookup;
+	if (unlikely(wh_found < 0))
+		goto out;
+
+	/* We found a whiteout */
+	/* au_set_dbbot(dentry, bindex); */
+	au_set_dbwh(dentry, bindex);
+	if (!allow_neg)
+		return NULL; /* success */
+
+real_lookup:
+	if (!ignore_perm)
+		h_dentry = vfsub_lkup_one(&dentry->d_name, h_parent);
+	else
+		h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent);
+	if (IS_ERR(h_dentry)) {
+		if (PTR_ERR(h_dentry) == -ENAMETOOLONG
+		    && !allow_neg)
+			h_dentry = NULL;
+		goto out;
+	}
+
+	h_inode = d_inode(h_dentry);
+	if (d_is_negative(h_dentry)) {
+		if (!allow_neg)
+			goto out_neg;
+	} else if (wh_found
+		   || (args->type && args->type != (h_inode->i_mode & S_IFMT)))
+		goto out_neg;
+
+	if (au_dbbot(dentry) <= bindex)
+		au_set_dbbot(dentry, bindex);
+	if (au_dbtop(dentry) < 0 || bindex < au_dbtop(dentry))
+		au_set_dbtop(dentry, bindex);
+	au_set_h_dptr(dentry, bindex, h_dentry);
+
+	if (!d_is_dir(h_dentry)
+	    || !wh_able
+	    || (d_really_is_positive(dentry) && !d_is_dir(dentry)))
+		goto out; /* success */
+
+	inode_lock_nested(h_inode, AuLsc_I_CHILD);
+	opq = au_diropq_test(h_dentry);
+	inode_unlock(h_inode);
+	if (opq > 0)
+		au_set_dbdiropq(dentry, bindex);
+	else if (unlikely(opq < 0)) {
+		au_set_h_dptr(dentry, bindex, NULL);
+		h_dentry = ERR_PTR(opq);
+	}
+	goto out;
+
+out_neg:
+	dput(h_dentry);
+	h_dentry = NULL;
+out:
+	return h_dentry;
+}
+
+static int au_test_shwh(struct super_block *sb, const struct qstr *name)
+{
+	if (unlikely(!au_opt_test(au_mntflags(sb), SHWH)
+		     && !strncmp(name->name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)))
+		return -EPERM;
+	return 0;
+}
+
+/*
+ * returns the number of lower positive dentries,
+ * otherwise an error.
+ * can be called at unlinking with @type is zero.
+ */
+int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t btop,
+		   unsigned int flags)
+{
+	int npositive, err;
+	aufs_bindex_t bindex, btail, bdiropq;
+	unsigned char isdir, dirperm1;
+	struct qstr whname;
+	struct au_do_lookup_args args = {
+		.flags		= flags
+	};
+	const struct qstr *name = &dentry->d_name;
+	struct dentry *parent;
+	struct super_block *sb;
+
+	sb = dentry->d_sb;
+	err = au_test_shwh(sb, name);
+	if (unlikely(err))
+		goto out;
+
+	err = au_wh_name_alloc(&whname, name);
+	if (unlikely(err))
+		goto out;
+
+	isdir = !!d_is_dir(dentry);
+	dirperm1 = !!au_opt_test(au_mntflags(sb), DIRPERM1);
+
+	npositive = 0;
+	parent = dget_parent(dentry);
+	btail = au_dbtaildir(parent);
+	for (bindex = btop; bindex <= btail; bindex++) {
+		struct dentry *h_parent, *h_dentry;
+		struct inode *h_inode, *h_dir;
+
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (h_dentry) {
+			if (d_is_positive(h_dentry))
+				npositive++;
+			break;
+		}
+		h_parent = au_h_dptr(parent, bindex);
+		if (!h_parent || !d_is_dir(h_parent))
+			continue;
+
+		h_dir = d_inode(h_parent);
+		inode_lock_nested(h_dir, AuLsc_I_PARENT);
+		h_dentry = au_do_lookup(h_parent, dentry, bindex, &whname,
+					&args);
+		inode_unlock(h_dir);
+		err = PTR_ERR(h_dentry);
+		if (IS_ERR(h_dentry))
+			goto out_parent;
+		if (h_dentry)
+			au_fclr_lkup(args.flags, ALLOW_NEG);
+		if (dirperm1)
+			au_fset_lkup(args.flags, IGNORE_PERM);
+
+		if (au_dbwh(dentry) == bindex)
+			break;
+		if (!h_dentry)
+			continue;
+		if (d_is_negative(h_dentry))
+			continue;
+		h_inode = d_inode(h_dentry);
+		npositive++;
+		if (!args.type)
+			args.type = h_inode->i_mode & S_IFMT;
+		if (args.type != S_IFDIR)
+			break;
+		else if (isdir) {
+			/* the type of lower may be different */
+			bdiropq = au_dbdiropq(dentry);
+			if (bdiropq >= 0 && bdiropq <= bindex)
+				break;
+		}
+	}
+
+	if (npositive) {
+		AuLabel(positive);
+		au_update_dbtop(dentry);
+	}
+	err = npositive;
+	if (unlikely(!au_opt_test(au_mntflags(sb), UDBA_NONE)
+		     && au_dbtop(dentry) < 0)) {
+		err = -EIO;
+		AuIOErr("both of real entry and whiteout found, %pd, err %d\n",
+			dentry, err);
+	}
+
+out_parent:
+	dput(parent);
+	au_delayed_kfree(whname.name);
+out:
+	return err;
+}
+
+struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent)
+{
+	struct dentry *dentry;
+	int wkq_err;
+
+	if (!au_test_h_perm_sio(d_inode(parent), MAY_EXEC))
+		dentry = vfsub_lkup_one(name, parent);
+	else {
+		struct vfsub_lkup_one_args args = {
+			.errp	= &dentry,
+			.name	= name,
+			.parent	= parent
+		};
+
+		wkq_err = au_wkq_wait(vfsub_call_lkup_one, &args);
+		if (unlikely(wkq_err))
+			dentry = ERR_PTR(wkq_err);
+	}
+
+	return dentry;
+}
+
+/*
+ * lookup @dentry on @bindex which should be negative.
+ */
+int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh)
+{
+	int err;
+	struct dentry *parent, *h_parent, *h_dentry;
+	struct au_branch *br;
+
+	parent = dget_parent(dentry);
+	h_parent = au_h_dptr(parent, bindex);
+	br = au_sbr(dentry->d_sb, bindex);
+	if (wh)
+		h_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name);
+	else
+		h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent);
+	err = PTR_ERR(h_dentry);
+	if (IS_ERR(h_dentry))
+		goto out;
+	if (unlikely(d_is_positive(h_dentry))) {
+		err = -EIO;
+		AuIOErr("%pd should be negative on b%d.\n", h_dentry, bindex);
+		dput(h_dentry);
+		goto out;
+	}
+
+	err = 0;
+	if (bindex < au_dbtop(dentry))
+		au_set_dbtop(dentry, bindex);
+	if (au_dbbot(dentry) < bindex)
+		au_set_dbbot(dentry, bindex);
+	au_set_h_dptr(dentry, bindex, h_dentry);
+
+out:
+	dput(parent);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* subset of struct inode */
+struct au_iattr {
+	unsigned long		i_ino;
+	/* unsigned int		i_nlink; */
+	kuid_t			i_uid;
+	kgid_t			i_gid;
+	u64			i_version;
+/*
+	loff_t			i_size;
+	blkcnt_t		i_blocks;
+*/
+	umode_t			i_mode;
+};
+
+static void au_iattr_save(struct au_iattr *ia, struct inode *h_inode)
+{
+	ia->i_ino = h_inode->i_ino;
+	/* ia->i_nlink = h_inode->i_nlink; */
+	ia->i_uid = h_inode->i_uid;
+	ia->i_gid = h_inode->i_gid;
+	ia->i_version = h_inode->i_version;
+/*
+	ia->i_size = h_inode->i_size;
+	ia->i_blocks = h_inode->i_blocks;
+*/
+	ia->i_mode = (h_inode->i_mode & S_IFMT);
+}
+
+static int au_iattr_test(struct au_iattr *ia, struct inode *h_inode)
+{
+	return ia->i_ino != h_inode->i_ino
+		/* || ia->i_nlink != h_inode->i_nlink */
+		|| !uid_eq(ia->i_uid, h_inode->i_uid)
+		|| !gid_eq(ia->i_gid, h_inode->i_gid)
+		|| ia->i_version != h_inode->i_version
+/*
+		|| ia->i_size != h_inode->i_size
+		|| ia->i_blocks != h_inode->i_blocks
+*/
+		|| ia->i_mode != (h_inode->i_mode & S_IFMT);
+}
+
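+/* re-lookup the lower dentry and compare it and its inode attributes with the cached ones */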
+static int au_h_verify_dentry(struct dentry *h_dentry, struct dentry *h_parent,
+			      struct au_branch *br)
+{
+	int err;
+	struct au_iattr ia;
+	struct inode *h_inode;
+	struct dentry *h_d;
+	struct super_block *h_sb;
+
+	err = 0;
+	memset(&ia, -1, sizeof(ia));
+	h_sb = h_dentry->d_sb;
+	h_inode = NULL;
+	if (d_is_positive(h_dentry)) {
+		h_inode = d_inode(h_dentry);
+		au_iattr_save(&ia, h_inode);
+	} else if (au_test_nfs(h_sb) || au_test_fuse(h_sb))
+		/* nfs d_revalidate may return 0 for a negative dentry */
+		/* fuse d_revalidate always returns 0 for a negative dentry */
+		goto out;
+
+	/* main purpose is namei.c:cached_lookup() and d_revalidate */
+	h_d = vfsub_lkup_one(&h_dentry->d_name, h_parent);
+	err = PTR_ERR(h_d);
+	if (IS_ERR(h_d))
+		goto out;
+
+	err = 0;
+	if (unlikely(h_d != h_dentry
+		     || d_inode(h_d) != h_inode
+		     || (h_inode && au_iattr_test(&ia, h_inode))))
+		err = au_busy_or_stale();
+	dput(h_d);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir,
+		struct dentry *h_parent, struct au_branch *br)
+{
+	int err;
+
+	err = 0;
+	if (udba == AuOpt_UDBA_REVAL
+	    && !au_test_fs_remote(h_dentry->d_sb)) {
+		IMustLock(h_dir);
+		err = (d_inode(h_dentry->d_parent) != h_dir);
+	} else if (udba != AuOpt_UDBA_NONE)
+		err = au_h_verify_dentry(h_dentry, h_parent, br);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
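+/* re-order the lower dentries to follow the new branch indices, and refresh btop/bbot/bwh/bdiropq */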
+static int au_do_refresh_hdentry(struct dentry *dentry, struct dentry *parent)
+{
+	int err;
+	aufs_bindex_t new_bindex, bindex, bbot, bwh, bdiropq;
+	struct au_hdentry tmp, *p, *q;
+	struct au_dinfo *dinfo;
+	struct super_block *sb;
+
+	DiMustWriteLock(dentry);
+
+	sb = dentry->d_sb;
+	dinfo = au_di(dentry);
+	bbot = dinfo->di_bbot;
+	bwh = dinfo->di_bwh;
+	bdiropq = dinfo->di_bdiropq;
+	bindex = dinfo->di_btop;
+	p = au_hdentry(dinfo, bindex);
+	for (; bindex <= bbot; bindex++, p++) {
+		if (!p->hd_dentry)
+			continue;
+
+		new_bindex = au_br_index(sb, p->hd_id);
+		if (new_bindex == bindex)
+			continue;
+
+		if (dinfo->di_bwh == bindex)
+			bwh = new_bindex;
+		if (dinfo->di_bdiropq == bindex)
+			bdiropq = new_bindex;
+		if (new_bindex < 0) {
+			au_hdput(p);
+			p->hd_dentry = NULL;
+			continue;
+		}
+
+		/* swap two lower dentries, and loop again */
+		q = au_hdentry(dinfo, new_bindex);
+		tmp = *q;
+		*q = *p;
+		*p = tmp;
+		if (tmp.hd_dentry) {
+			bindex--;
+			p--;
+		}
+	}
+
+	dinfo->di_bwh = -1;
+	if (bwh >= 0 && bwh <= au_sbbot(sb) && au_sbr_whable(sb, bwh))
+		dinfo->di_bwh = bwh;
+
+	dinfo->di_bdiropq = -1;
+	if (bdiropq >= 0
+	    && bdiropq <= au_sbbot(sb)
+	    && au_sbr_whable(sb, bdiropq))
+		dinfo->di_bdiropq = bdiropq;
+
+	err = -EIO;
+	dinfo->di_btop = -1;
+	dinfo->di_bbot = -1;
+	bbot = au_dbbot(parent);
+	bindex = 0;
+	p = au_hdentry(dinfo, bindex);
+	for (; bindex <= bbot; bindex++, p++)
+		if (p->hd_dentry) {
+			dinfo->di_btop = bindex;
+			break;
+		}
+
+	if (dinfo->di_btop >= 0) {
+		bindex = bbot;
+		p = au_hdentry(dinfo, bindex);
+		for (; bindex >= 0; bindex--, p--)
+			if (p->hd_dentry) {
+				dinfo->di_bbot = bindex;
+				err = 0;
+				break;
+			}
+	}
+
+	return err;
+}
+
+static void au_do_hide(struct dentry *dentry)
+{
+	struct inode *inode;
+
+	if (d_really_is_positive(dentry)) {
+		inode = d_inode(dentry);
+		if (!d_is_dir(dentry)) {
+			if (inode->i_nlink && !d_unhashed(dentry))
+				drop_nlink(inode);
+		} else {
+			clear_nlink(inode);
+			/* stop next lookup */
+			inode->i_flags |= S_DEAD;
+		}
+		smp_mb(); /* necessary? */
+	}
+	d_drop(dentry);
+}
+
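+/* hide every cached child under @parent, iterating the collected dentries in reverse order */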
+static int au_hide_children(struct dentry *parent)
+{
+	int err, i, j, ndentry;
+	struct au_dcsub_pages dpages;
+	struct au_dpage *dpage;
+	struct dentry *dentry;
+
+	err = au_dpages_init(&dpages, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+	err = au_dcsub_pages(&dpages, parent, NULL, NULL);
+	if (unlikely(err))
+		goto out_dpages;
+
+	/* in reverse order */
+	for (i = dpages.ndpage - 1; i >= 0; i--) {
+		dpage = dpages.dpages + i;
+		ndentry = dpage->ndentry;
+		for (j = ndentry - 1; j >= 0; j--) {
+			dentry = dpage->dentries[j];
+			if (dentry != parent)
+				au_do_hide(dentry);
+		}
+	}
+
+out_dpages:
+	au_dpages_free(&dpages);
+out:
+	return err;
+}
+
+static void au_hide(struct dentry *dentry)
+{
+	int err;
+
+	AuDbgDentry(dentry);
+	if (d_is_dir(dentry)) {
+		/* shrink_dcache_parent(dentry); */
+		err = au_hide_children(dentry);
+		if (unlikely(err))
+			AuIOErr("%pd, failed hiding children, ignored %d\n",
+				dentry, err);
+	}
+	au_do_hide(dentry);
+}
+
+/*
+ * By adding a dirty branch, a cached dentry may be affected in various ways.
+ *
+ * a dirty branch is added
+ * - on the top of layers
+ * - in the middle of layers
+ * - to the bottom of layers
+ *
+ * on the added branch there exists
+ * - a whiteout
+ * - a diropq
+ * - a same named entry
+ *   + exist
+ *     * negative --> positive
+ *     * positive --> positive
+ *	 - type is unchanged
+ *	 - type is changed
+ *   + doesn't exist
+ *     * negative --> negative
+ *     * positive --> negative (rejected by au_br_del() for non-dir case)
+ * - none
+ */
+static int au_refresh_by_dinfo(struct dentry *dentry, struct au_dinfo *dinfo,
+			       struct au_dinfo *tmp)
+{
+	int err;
+	aufs_bindex_t bindex, bbot;
+	struct {
+		struct dentry *dentry;
+		struct inode *inode;
+		mode_t mode;
+	} orig_h, tmp_h = {
+		.dentry = NULL
+	};
+	struct au_hdentry *hd;
+	struct inode *inode, *h_inode;
+	struct dentry *h_dentry;
+
+	err = 0;
+	AuDebugOn(dinfo->di_btop < 0);
+	orig_h.mode = 0;
+	orig_h.dentry = au_hdentry(dinfo, dinfo->di_btop)->hd_dentry;
+	orig_h.inode = NULL;
+	if (d_is_positive(orig_h.dentry)) {
+		orig_h.inode = d_inode(orig_h.dentry);
+		orig_h.mode = orig_h.inode->i_mode & S_IFMT;
+	}
+	if (tmp->di_btop >= 0) {
+		tmp_h.dentry = au_hdentry(tmp, tmp->di_btop)->hd_dentry;
+		if (d_is_positive(tmp_h.dentry)) {
+			tmp_h.inode = d_inode(tmp_h.dentry);
+			tmp_h.mode = tmp_h.inode->i_mode & S_IFMT;
+		}
+	}
+
+	inode = NULL;
+	if (d_really_is_positive(dentry))
+		inode = d_inode(dentry);
+	if (!orig_h.inode) {
+		AuDbg("negative originally\n");
+		if (inode) {
+			au_hide(dentry);
+			goto out;
+		}
+		AuDebugOn(inode);
+		AuDebugOn(dinfo->di_btop != dinfo->di_bbot);
+		AuDebugOn(dinfo->di_bdiropq != -1);
+
+		if (!tmp_h.inode) {
+			AuDbg("negative --> negative\n");
+			/* should have only one negative lower */
+			if (tmp->di_btop >= 0
+			    && tmp->di_btop < dinfo->di_btop) {
+				AuDebugOn(tmp->di_btop != tmp->di_bbot);
+				AuDebugOn(dinfo->di_btop != dinfo->di_bbot);
+				au_set_h_dptr(dentry, dinfo->di_btop, NULL);
+				au_di_cp(dinfo, tmp);
+				hd = au_hdentry(tmp, tmp->di_btop);
+				au_set_h_dptr(dentry, tmp->di_btop,
+					      dget(hd->hd_dentry));
+			}
+			au_dbg_verify_dinode(dentry);
+		} else {
+			AuDbg("negative --> positive\n");
+			/*
+			 * similar to the behaviour of creating by bypassing
+			 * aufs.
+			 * unhash it in order to force an error in the
+			 * succeeding create operation.
+			 * we should not set S_DEAD here.
+			 */
+			d_drop(dentry);
+			/* au_di_swap(tmp, dinfo); */
+			au_dbg_verify_dinode(dentry);
+		}
+	} else {
+		AuDbg("positive originally\n");
+		/* inode may be NULL */
+		AuDebugOn(inode && (inode->i_mode & S_IFMT) != orig_h.mode);
+		if (!tmp_h.inode) {
+			AuDbg("positive --> negative\n");
+			/* or bypassing aufs */
+			au_hide(dentry);
+			if (tmp->di_bwh >= 0 && tmp->di_bwh <= dinfo->di_btop)
+				dinfo->di_bwh = tmp->di_bwh;
+			if (inode)
+				err = au_refresh_hinode_self(inode);
+			au_dbg_verify_dinode(dentry);
+		} else if (orig_h.mode == tmp_h.mode) {
+			AuDbg("positive --> positive, same type\n");
+			if (!S_ISDIR(orig_h.mode)
+			    && dinfo->di_btop > tmp->di_btop) {
+				/*
+				 * similar to the behaviour of removing and
+				 * creating.
+				 */
+				au_hide(dentry);
+				if (inode)
+					err = au_refresh_hinode_self(inode);
+				au_dbg_verify_dinode(dentry);
+			} else {
+				/* fill empty slots */
+				if (dinfo->di_btop > tmp->di_btop)
+					dinfo->di_btop = tmp->di_btop;
+				if (dinfo->di_bbot < tmp->di_bbot)
+					dinfo->di_bbot = tmp->di_bbot;
+				dinfo->di_bwh = tmp->di_bwh;
+				dinfo->di_bdiropq = tmp->di_bdiropq;
+				bbot = dinfo->di_bbot;
+				bindex = tmp->di_btop;
+				hd = au_hdentry(tmp, bindex);
+				for (; bindex <= bbot; bindex++, hd++) {
+					if (au_h_dptr(dentry, bindex))
+						continue;
+					h_dentry = hd->hd_dentry;
+					if (!h_dentry)
+						continue;
+					AuDebugOn(d_is_negative(h_dentry));
+					h_inode = d_inode(h_dentry);
+					AuDebugOn(orig_h.mode
+						  != (h_inode->i_mode
+						      & S_IFMT));
+					au_set_h_dptr(dentry, bindex,
+						      dget(h_dentry));
+				}
+				if (inode)
+					err = au_refresh_hinode(inode, dentry);
+				au_dbg_verify_dinode(dentry);
+			}
+		} else {
+			AuDbg("positive --> positive, different type\n");
+			/* similar to the behaviour of removing and creating */
+			au_hide(dentry);
+			if (inode)
+				err = au_refresh_hinode_self(inode);
+			au_dbg_verify_dinode(dentry);
+		}
+	}
+
+out:
+	return err;
+}
+
+void au_refresh_dop(struct dentry *dentry, int force_reval)
+{
+	const struct dentry_operations *dop
+		= force_reval ? &aufs_dop : dentry->d_sb->s_d_op;
+	static const unsigned int mask
+		= DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE;
+
+	BUILD_BUG_ON(sizeof(mask) != sizeof(dentry->d_flags));
+
+	if (dentry->d_op == dop)
+		return;
+
+	AuDbg("%pd\n", dentry);
+	spin_lock(&dentry->d_lock);
+	if (dop == &aufs_dop)
+		dentry->d_flags |= mask;
+	else
+		dentry->d_flags &= ~mask;
+	dentry->d_op = dop;
+	spin_unlock(&dentry->d_lock);
+}
+
+int au_refresh_dentry(struct dentry *dentry, struct dentry *parent)
+{
+	int err, ebrange, nbr;
+	unsigned int sigen;
+	struct au_dinfo *dinfo, *tmp;
+	struct super_block *sb;
+	struct inode *inode;
+
+	DiMustWriteLock(dentry);
+	AuDebugOn(IS_ROOT(dentry));
+	AuDebugOn(d_really_is_negative(parent));
+
+	sb = dentry->d_sb;
+	sigen = au_sigen(sb);
+	err = au_digen_test(parent, sigen);
+	if (unlikely(err))
+		goto out;
+
+	nbr = au_sbbot(sb) + 1;
+	dinfo = au_di(dentry);
+	err = au_di_realloc(dinfo, nbr, /*may_shrink*/0);
+	if (unlikely(err))
+		goto out;
+	ebrange = au_dbrange_test(dentry);
+	if (!ebrange)
+		ebrange = au_do_refresh_hdentry(dentry, parent);
+
+	if (d_unhashed(dentry) || ebrange /* || dinfo->di_tmpfile */) {
+		AuDebugOn(au_dbtop(dentry) < 0 && au_dbbot(dentry) >= 0);
+		if (d_really_is_positive(dentry)) {
+			inode = d_inode(dentry);
+			err = au_refresh_hinode_self(inode);
+		}
+		au_dbg_verify_dinode(dentry);
+		if (!err)
+			goto out_dgen; /* success */
+		goto out;
+	}
+
+	/* temporary dinfo */
+	AuDbgDentry(dentry);
+	err = -ENOMEM;
+	tmp = au_di_alloc(sb, AuLsc_DI_TMP);
+	if (unlikely(!tmp))
+		goto out;
+	au_di_swap(tmp, dinfo);
+	/* returns the number of positive dentries */
+	/*
+	 * if the current working dir is removed, it returns an error,
+	 * but the dentry itself is still legal.
+	 */
+	err = au_lkup_dentry(dentry, /*btop*/0, AuLkup_ALLOW_NEG);
+	AuDbgDentry(dentry);
+	au_di_swap(tmp, dinfo);
+	if (err == -ENOENT)
+		err = 0;
+	if (err >= 0) {
+		/* compare/refresh by dinfo */
+		AuDbgDentry(dentry);
+		err = au_refresh_by_dinfo(dentry, dinfo, tmp);
+		au_dbg_verify_dinode(dentry);
+		AuTraceErr(err);
+	}
+	au_di_realloc(dinfo, nbr, /*may_shrink*/1); /* harmless if err */
+	au_rw_write_unlock(&tmp->di_rwsem);
+	au_di_free(tmp);
+	if (unlikely(err))
+		goto out;
+
+out_dgen:
+	au_update_digen(dentry);
+out:
+	if (unlikely(err && !(dentry->d_flags & DCACHE_NFSFS_RENAMED))) {
+		AuIOErr("failed refreshing %pd, %d\n", dentry, err);
+		AuDbgDentry(dentry);
+	}
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_do_h_d_reval(struct dentry *h_dentry, unsigned int flags,
+			   struct dentry *dentry, aufs_bindex_t bindex)
+{
+	int err, valid;
+
+	err = 0;
+	if (!(h_dentry->d_flags & DCACHE_OP_REVALIDATE))
+		goto out;
+
+	AuDbg("b%d\n", bindex);
+	/*
+	 * gave up supporting LOOKUP_CREATE/OPEN for lower fs,
+	 * due to whiteout and branch permission.
+	 */
+	flags &= ~(/*LOOKUP_PARENT |*/ LOOKUP_OPEN | LOOKUP_CREATE
+		   | LOOKUP_FOLLOW | LOOKUP_EXCL);
+	/* it may return tri-state */
+	valid = h_dentry->d_op->d_revalidate(h_dentry, flags);
+
+	if (unlikely(valid < 0))
+		err = valid;
+	else if (!valid)
+		err = -EINVAL;
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* todo: remove this */
+static int h_d_revalidate(struct dentry *dentry, struct inode *inode,
+			  unsigned int flags, int do_udba)
+{
+	int err;
+	umode_t mode, h_mode;
+	aufs_bindex_t bindex, btail, btop, ibs, ibe;
+	unsigned char plus, unhashed, is_root, h_plus, h_nfs, tmpfile;
+	struct inode *h_inode, *h_cached_inode;
+	struct dentry *h_dentry;
+	struct qstr *name, *h_name;
+
+	err = 0;
+	plus = 0;
+	mode = 0;
+	ibs = -1;
+	ibe = -1;
+	unhashed = !!d_unhashed(dentry);
+	is_root = !!IS_ROOT(dentry);
+	name = &dentry->d_name;
+	tmpfile = au_di(dentry)->di_tmpfile;
+
+	/*
+	 * Theoretically, REVAL test should be unnecessary in case of
+	 * {FS,I}NOTIFY.
+	 * But {fs,i}notify doesn't fire some necessary events,
+	 *	IN_ATTRIB for atime/nlink/pageio
+	 * Let's do REVAL test too.
+	 */
+	if (do_udba && inode) {
+		mode = (inode->i_mode & S_IFMT);
+		plus = (inode->i_nlink > 0);
+		ibs = au_ibtop(inode);
+		ibe = au_ibbot(inode);
+	}
+
+	btop = au_dbtop(dentry);
+	btail = btop;
+	if (inode && S_ISDIR(inode->i_mode))
+		btail = au_dbtaildir(dentry);
+	for (bindex = btop; bindex <= btail; bindex++) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (!h_dentry)
+			continue;
+
+		AuDbg("b%d, %pd\n", bindex, h_dentry);
+		h_nfs = !!au_test_nfs(h_dentry->d_sb);
+		spin_lock(&h_dentry->d_lock);
+		h_name = &h_dentry->d_name;
+		if (unlikely(do_udba
+			     && !is_root
+			     && ((!h_nfs
+				  && (unhashed != !!d_unhashed(h_dentry)
+				      || (!tmpfile
+					  && !au_qstreq(name, h_name))
+					  ))
+				 || (h_nfs
+				     && !(flags & LOOKUP_OPEN)
+				     && (h_dentry->d_flags
+					 & DCACHE_NFSFS_RENAMED)))
+			    )) {
+			int h_unhashed;
+
+			h_unhashed = d_unhashed(h_dentry);
+			spin_unlock(&h_dentry->d_lock);
+			AuDbg("unhash 0x%x 0x%x, %pd %pd\n",
+			      unhashed, h_unhashed, dentry, h_dentry);
+			goto err;
+		}
+		spin_unlock(&h_dentry->d_lock);
+
+		err = au_do_h_d_reval(h_dentry, flags, dentry, bindex);
+		if (unlikely(err))
+			/* do not goto err, to keep the errno */
+			break;
+
+		/* todo: plink too? */
+		if (!do_udba)
+			continue;
+
+		/* UDBA tests */
+		if (unlikely(!!inode != d_is_positive(h_dentry)))
+			goto err;
+
+		h_inode = NULL;
+		if (d_is_positive(h_dentry))
+			h_inode = d_inode(h_dentry);
+		h_plus = plus;
+		h_mode = mode;
+		h_cached_inode = h_inode;
+		if (h_inode) {
+			h_mode = (h_inode->i_mode & S_IFMT);
+			h_plus = (h_inode->i_nlink > 0);
+		}
+		if (inode && ibs <= bindex && bindex <= ibe)
+			h_cached_inode = au_h_iptr(inode, bindex);
+
+		if (!h_nfs) {
+			if (unlikely(plus != h_plus && !tmpfile))
+				goto err;
+		} else {
+			if (unlikely(!(h_dentry->d_flags & DCACHE_NFSFS_RENAMED)
+				     && !is_root
+				     && !IS_ROOT(h_dentry)
+				     && unhashed != d_unhashed(h_dentry)))
+				goto err;
+		}
+		if (unlikely(mode != h_mode
+			     || h_cached_inode != h_inode))
+			goto err;
+		continue;
+
+err:
+		err = -EINVAL;
+		break;
+	}
+
+	AuTraceErr(err);
+	return err;
+}
+
+/* todo: consolidate with do_refresh() and au_reval_for_attr() */
+static int simple_reval_dpath(struct dentry *dentry, unsigned int sigen)
+{
+	int err;
+	struct dentry *parent;
+
+	if (!au_digen_test(dentry, sigen))
+		return 0;
+
+	parent = dget_parent(dentry);
+	di_read_lock_parent(parent, AuLock_IR);
+	AuDebugOn(au_digen_test(parent, sigen));
+	au_dbg_verify_gen(parent, sigen);
+	err = au_refresh_dentry(dentry, parent);
+	di_read_unlock(parent, AuLock_IR);
+	dput(parent);
+	AuTraceErr(err);
+	return err;
+}
+
+int au_reval_dpath(struct dentry *dentry, unsigned int sigen)
+{
+	int err;
+	struct dentry *d, *parent;
+
+	if (!au_ftest_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR))
+		return simple_reval_dpath(dentry, sigen);
+
+	/* slow loop, keep it simple and stupid */
+	/* cf: au_cpup_dirs() */
+	err = 0;
+	parent = NULL;
+	while (au_digen_test(dentry, sigen)) {
+		d = dentry;
+		while (1) {
+			dput(parent);
+			parent = dget_parent(d);
+			if (!au_digen_test(parent, sigen))
+				break;
+			d = parent;
+		}
+
+		if (d != dentry)
+			di_write_lock_child2(d);
+
+		/* someone might update our dentry while we were sleeping */
+		if (au_digen_test(d, sigen)) {
+			/*
+			 * todo: consolidate with simple_reval_dpath(),
+			 * do_refresh() and au_reval_for_attr().
+			 */
+			di_read_lock_parent(parent, AuLock_IR);
+			err = au_refresh_dentry(d, parent);
+			di_read_unlock(parent, AuLock_IR);
+		}
+
+		if (d != dentry)
+			di_write_unlock(d);
+		dput(parent);
+		if (unlikely(err))
+			break;
+	}
+
+	return err;
+}
+
+/*
+ * if valid returns 1, otherwise 0.
+ */
+static int aufs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	int valid, err;
+	unsigned int sigen;
+	unsigned char do_udba;
+	struct super_block *sb;
+	struct inode *inode;
+
+	/* todo: support rcu-walk? */
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+
+	valid = 0;
+	if (unlikely(!au_di(dentry)))
+		goto out;
+
+	valid = 1;
+	sb = dentry->d_sb;
+	/*
+	 * todo: very ugly
+	 * i_mutex of parent dir may be held,
+	 * but we should not return 'invalid' due to busy.
+	 */
+	err = aufs_read_lock(dentry, AuLock_FLUSH | AuLock_DW | AuLock_NOPLM);
+	if (unlikely(err)) {
+		valid = err;
+		AuTraceErr(err);
+		goto out;
+	}
+	inode = NULL;
+	if (d_really_is_positive(dentry))
+		inode = d_inode(dentry);
+	if (unlikely(inode && au_is_bad_inode(inode))) {
+		err = -EINVAL;
+		AuTraceErr(err);
+		goto out_dgrade;
+	}
+	if (unlikely(au_dbrange_test(dentry))) {
+		err = -EINVAL;
+		AuTraceErr(err);
+		goto out_dgrade;
+	}
+
+	sigen = au_sigen(sb);
+	if (au_digen_test(dentry, sigen)) {
+		AuDebugOn(IS_ROOT(dentry));
+		err = au_reval_dpath(dentry, sigen);
+		if (unlikely(err)) {
+			AuTraceErr(err);
+			goto out_dgrade;
+		}
+	}
+	di_downgrade_lock(dentry, AuLock_IR);
+
+	err = -EINVAL;
+	if (!(flags & (LOOKUP_OPEN | LOOKUP_EMPTY))
+	    && inode
+	    && !(inode->i_state & I_LINKABLE)
+	    && (IS_DEADDIR(inode) || !inode->i_nlink)) {
+		AuTraceErr(err);
+		goto out_inval;
+	}
+
+	do_udba = !au_opt_test(au_mntflags(sb), UDBA_NONE);
+	if (do_udba && inode) {
+		aufs_bindex_t btop = au_ibtop(inode);
+		struct inode *h_inode;
+
+		if (btop >= 0) {
+			h_inode = au_h_iptr(inode, btop);
+			if (h_inode && au_test_higen(inode, h_inode)) {
+				AuTraceErr(err);
+				goto out_inval;
+			}
+		}
+	}
+
+	err = h_d_revalidate(dentry, inode, flags, do_udba);
+	if (unlikely(!err && do_udba && au_dbtop(dentry) < 0)) {
+		err = -EIO;
+		AuDbg("both of real entry and whiteout found, %p, err %d\n",
+		      dentry, err);
+	}
+	goto out_inval;
+
+out_dgrade:
+	di_downgrade_lock(dentry, AuLock_IR);
+out_inval:
+	aufs_read_unlock(dentry, AuLock_IR);
+	AuTraceErr(err);
+	valid = !err;
+out:
+	if (!valid) {
+		AuDbg("%pd invalid, %d\n", dentry, valid);
+		d_drop(dentry);
+	}
+	return valid;
+}
+
+static void aufs_d_release(struct dentry *dentry)
+{
+	if (au_di(dentry)) {
+		au_di_fin(dentry);
+		au_hn_di_reinit(dentry);
+	}
+}
+
+const struct dentry_operations aufs_dop = {
+	.d_revalidate		= aufs_d_revalidate,
+	.d_weak_revalidate	= aufs_d_revalidate,
+	.d_release		= aufs_d_release
+};
+
+/* aufs_dop without d_revalidate */
+const struct dentry_operations aufs_dop_noreval = {
+	.d_release		= aufs_d_release
+};
diff --git b/fs/aufs/dentry.h b/fs/aufs/dentry.h
new file mode 100644
index 0000000..01211d2
--- /dev/null
+++ b/fs/aufs/dentry.h
@@ -0,0 +1,242 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * lookup and dentry operations
+ */
+
+#ifndef __AUFS_DENTRY_H__
+#define __AUFS_DENTRY_H__
+
+#ifdef __KERNEL__
+
+#include <linux/dcache.h>
+#include "rwsem.h"
+
+struct au_hdentry {
+	struct dentry		*hd_dentry;
+	aufs_bindex_t		hd_id;
+};
+
+struct au_dinfo {
+	atomic_t		di_generation;
+
+	struct au_rwsem		di_rwsem;
+	aufs_bindex_t		di_btop, di_bbot, di_bwh, di_bdiropq;
+	unsigned char		di_tmpfile; /* to allow a different name */
+	union {
+		struct au_hdentry	*di_hdentry;
+		struct llist_node	di_lnode;	/* delayed free */
+	};
+} ____cacheline_aligned_in_smp;
+
+/* ---------------------------------------------------------------------- */
+
+/* flags for au_lkup_dentry() */
+#define AuLkup_ALLOW_NEG	1
+#define AuLkup_IGNORE_PERM	(1 << 1)
+#define au_ftest_lkup(flags, name)	((flags) & AuLkup_##name)
+#define au_fset_lkup(flags, name) \
+	do { (flags) |= AuLkup_##name; } while (0)
+#define au_fclr_lkup(flags, name) \
+	do { (flags) &= ~AuLkup_##name; } while (0)
+
+/* ---------------------------------------------------------------------- */
+
+/* dentry.c */
+extern const struct dentry_operations aufs_dop, aufs_dop_noreval;
+struct au_branch;
+struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent);
+int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir,
+		struct dentry *h_parent, struct au_branch *br);
+
+int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t btop,
+		   unsigned int flags);
+int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh);
+int au_refresh_dentry(struct dentry *dentry, struct dentry *parent);
+int au_reval_dpath(struct dentry *dentry, unsigned int sigen);
+void au_refresh_dop(struct dentry *dentry, int force_reval);
+
+/* dinfo.c */
+void au_di_init_once(void *_di);
+struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc);
+void au_di_free(struct au_dinfo *dinfo);
+void au_di_swap(struct au_dinfo *a, struct au_dinfo *b);
+void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src);
+int au_di_init(struct dentry *dentry);
+void au_di_fin(struct dentry *dentry);
+int au_di_realloc(struct au_dinfo *dinfo, int nbr, int may_shrink);
+
+void di_read_lock(struct dentry *d, int flags, unsigned int lsc);
+void di_read_unlock(struct dentry *d, int flags);
+void di_downgrade_lock(struct dentry *d, int flags);
+void di_write_lock(struct dentry *d, unsigned int lsc);
+void di_write_unlock(struct dentry *d);
+void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir);
+void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir);
+void di_write_unlock2(struct dentry *d1, struct dentry *d2);
+
+struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex);
+struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex);
+aufs_bindex_t au_dbtail(struct dentry *dentry);
+aufs_bindex_t au_dbtaildir(struct dentry *dentry);
+
+void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex,
+		   struct dentry *h_dentry);
+int au_digen_test(struct dentry *dentry, unsigned int sigen);
+int au_dbrange_test(struct dentry *dentry);
+void au_update_digen(struct dentry *dentry);
+void au_update_dbrange(struct dentry *dentry, int do_put_zero);
+void au_update_dbtop(struct dentry *dentry);
+void au_update_dbbot(struct dentry *dentry);
+int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry);
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct au_dinfo *au_di(struct dentry *dentry)
+{
+	return dentry->d_fsdata;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* lock subclass for dinfo */
+enum {
+	AuLsc_DI_CHILD,		/* child first */
+	AuLsc_DI_CHILD2,	/* rename(2), link(2), and cpup at hnotify */
+	AuLsc_DI_CHILD3,	/* copyup dirs */
+	AuLsc_DI_PARENT,
+	AuLsc_DI_PARENT2,
+	AuLsc_DI_PARENT3,
+	AuLsc_DI_TMP		/* temp for replacing dinfo */
+};
+
+/*
+ * di_read_lock_child, di_write_lock_child,
+ * di_read_lock_child2, di_write_lock_child2,
+ * di_read_lock_child3, di_write_lock_child3,
+ * di_read_lock_parent, di_write_lock_parent,
+ * di_read_lock_parent2, di_write_lock_parent2,
+ * di_read_lock_parent3, di_write_lock_parent3,
+ */
+#define AuReadLockFunc(name, lsc) \
+static inline void di_read_lock_##name(struct dentry *d, int flags) \
+{ di_read_lock(d, flags, AuLsc_DI_##lsc); }
+
+#define AuWriteLockFunc(name, lsc) \
+static inline void di_write_lock_##name(struct dentry *d) \
+{ di_write_lock(d, AuLsc_DI_##lsc); }
+
+#define AuRWLockFuncs(name, lsc) \
+	AuReadLockFunc(name, lsc) \
+	AuWriteLockFunc(name, lsc)
+
+AuRWLockFuncs(child, CHILD);
+AuRWLockFuncs(child2, CHILD2);
+AuRWLockFuncs(child3, CHILD3);
+AuRWLockFuncs(parent, PARENT);
+AuRWLockFuncs(parent2, PARENT2);
+AuRWLockFuncs(parent3, PARENT3);
+
+#undef AuReadLockFunc
+#undef AuWriteLockFunc
+#undef AuRWLockFuncs
+
+#define DiMustNoWaiters(d)	AuRwMustNoWaiters(&au_di(d)->di_rwsem)
+#define DiMustAnyLock(d)	AuRwMustAnyLock(&au_di(d)->di_rwsem)
+#define DiMustWriteLock(d)	AuRwMustWriteLock(&au_di(d)->di_rwsem)
+
+/* ---------------------------------------------------------------------- */
+
+/* todo: memory barrier? */
+static inline unsigned int au_digen(struct dentry *d)
+{
+	return atomic_read(&au_di(d)->di_generation);
+}
+
+static inline void au_h_dentry_init(struct au_hdentry *hdentry)
+{
+	hdentry->hd_dentry = NULL;
+}
+
+static inline struct au_hdentry *au_hdentry(struct au_dinfo *di,
+					    aufs_bindex_t bindex)
+{
+	return di->di_hdentry + bindex;
+}
+
+static inline void au_hdput(struct au_hdentry *hd)
+{
+	if (hd)
+		dput(hd->hd_dentry);
+}
+
+static inline aufs_bindex_t au_dbtop(struct dentry *dentry)
+{
+	DiMustAnyLock(dentry);
+	return au_di(dentry)->di_btop;
+}
+
+static inline aufs_bindex_t au_dbbot(struct dentry *dentry)
+{
+	DiMustAnyLock(dentry);
+	return au_di(dentry)->di_bbot;
+}
+
+static inline aufs_bindex_t au_dbwh(struct dentry *dentry)
+{
+	DiMustAnyLock(dentry);
+	return au_di(dentry)->di_bwh;
+}
+
+static inline aufs_bindex_t au_dbdiropq(struct dentry *dentry)
+{
+	DiMustAnyLock(dentry);
+	return au_di(dentry)->di_bdiropq;
+}
+
+/* todo: hard/soft set? */
+static inline void au_set_dbtop(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	DiMustWriteLock(dentry);
+	au_di(dentry)->di_btop = bindex;
+}
+
+static inline void au_set_dbbot(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	DiMustWriteLock(dentry);
+	au_di(dentry)->di_bbot = bindex;
+}
+
+static inline void au_set_dbwh(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	DiMustWriteLock(dentry);
+	/* dbwh can be outside of btop - bbot range */
+	au_di(dentry)->di_bwh = bindex;
+}
+
+static inline void au_set_dbdiropq(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	DiMustWriteLock(dentry);
+	au_di(dentry)->di_bdiropq = bindex;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_HNOTIFY
+static inline void au_digen_dec(struct dentry *d)
+{
+	atomic_dec(&au_di(d)->di_generation);
+}
+
+static inline void au_hn_di_reinit(struct dentry *dentry)
+{
+	dentry->d_fsdata = NULL;
+}
+#else
+AuStubVoid(au_hn_di_reinit, struct dentry *dentry __maybe_unused)
+#endif /* CONFIG_AUFS_HNOTIFY */
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_DENTRY_H__ */
diff --git b/fs/aufs/dinfo.c b/fs/aufs/dinfo.c
new file mode 100644
index 0000000..2626c0c
--- /dev/null
+++ b/fs/aufs/dinfo.c
@@ -0,0 +1,540 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * dentry private data
+ */
+
+#include "aufs.h"
+
+void au_di_init_once(void *_dinfo)
+{
+	struct au_dinfo *dinfo = _dinfo;
+
+	au_rw_init(&dinfo->di_rwsem);
+}
+
+struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc)
+{
+	struct au_dinfo *dinfo;
+	int nbr, i;
+
+	dinfo = au_cache_alloc_dinfo();
+	if (unlikely(!dinfo))
+		goto out;
+
+	nbr = au_sbbot(sb) + 1;
+	if (nbr <= 0)
+		nbr = 1;
+	dinfo->di_hdentry = kcalloc(nbr, sizeof(*dinfo->di_hdentry), GFP_NOFS);
+	if (dinfo->di_hdentry) {
+		au_rw_write_lock_nested(&dinfo->di_rwsem, lsc);
+		dinfo->di_btop = -1;
+		dinfo->di_bbot = -1;
+		dinfo->di_bwh = -1;
+		dinfo->di_bdiropq = -1;
+		dinfo->di_tmpfile = 0;
+		for (i = 0; i < nbr; i++)
+			dinfo->di_hdentry[i].hd_id = -1;
+		goto out;
+	}
+
+	au_cache_dfree_dinfo(dinfo);
+	dinfo = NULL;
+
+out:
+	return dinfo;
+}
+
+void au_di_free(struct au_dinfo *dinfo)
+{
+	struct au_hdentry *p;
+	aufs_bindex_t bbot, bindex;
+
+	/* dentry may not be revalidated */
+	bindex = dinfo->di_btop;
+	if (bindex >= 0) {
+		bbot = dinfo->di_bbot;
+		p = au_hdentry(dinfo, bindex);
+		while (bindex++ <= bbot)
+			au_hdput(p++);
+	}
+	au_delayed_kfree(dinfo->di_hdentry);
+	au_cache_dfree_dinfo(dinfo);
+}
+
+void au_di_swap(struct au_dinfo *a, struct au_dinfo *b)
+{
+	struct au_hdentry *p;
+	aufs_bindex_t bi;
+
+	AuRwMustWriteLock(&a->di_rwsem);
+	AuRwMustWriteLock(&b->di_rwsem);
+
+#define DiSwap(v, name)				\
+	do {					\
+		v = a->di_##name;		\
+		a->di_##name = b->di_##name;	\
+		b->di_##name = v;		\
+	} while (0)
+
+	DiSwap(p, hdentry);
+	DiSwap(bi, btop);
+	DiSwap(bi, bbot);
+	DiSwap(bi, bwh);
+	DiSwap(bi, bdiropq);
+	/* smp_mb(); */
+
+#undef DiSwap
+}
+
+void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src)
+{
+	AuRwMustWriteLock(&dst->di_rwsem);
+	AuRwMustWriteLock(&src->di_rwsem);
+
+	dst->di_btop = src->di_btop;
+	dst->di_bbot = src->di_bbot;
+	dst->di_bwh = src->di_bwh;
+	dst->di_bdiropq = src->di_bdiropq;
+	/* smp_mb(); */
+}
+
+int au_di_init(struct dentry *dentry)
+{
+	int err;
+	struct super_block *sb;
+	struct au_dinfo *dinfo;
+
+	err = 0;
+	sb = dentry->d_sb;
+	dinfo = au_di_alloc(sb, AuLsc_DI_CHILD);
+	if (dinfo) {
+		atomic_set(&dinfo->di_generation, au_sigen(sb));
+		/* smp_mb(); */ /* atomic_set */
+		dentry->d_fsdata = dinfo;
+	} else
+		err = -ENOMEM;
+
+	return err;
+}
+
+void au_di_fin(struct dentry *dentry)
+{
+	struct au_dinfo *dinfo;
+
+	dinfo = au_di(dentry);
+	AuRwDestroy(&dinfo->di_rwsem);
+	au_di_free(dinfo);
+}
+
+int au_di_realloc(struct au_dinfo *dinfo, int nbr, int may_shrink)
+{
+	int err, sz;
+	struct au_hdentry *hdp;
+
+	AuRwMustWriteLock(&dinfo->di_rwsem);
+
+	err = -ENOMEM;
+	sz = sizeof(*hdp) * (dinfo->di_bbot + 1);
+	if (!sz)
+		sz = sizeof(*hdp);
+	hdp = au_kzrealloc(dinfo->di_hdentry, sz, sizeof(*hdp) * nbr, GFP_NOFS,
+			   may_shrink);
+	if (hdp) {
+		dinfo->di_hdentry = hdp;
+		err = 0;
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void do_ii_write_lock(struct inode *inode, unsigned int lsc)
+{
+	switch (lsc) {
+	case AuLsc_DI_CHILD:
+		ii_write_lock_child(inode);
+		break;
+	case AuLsc_DI_CHILD2:
+		ii_write_lock_child2(inode);
+		break;
+	case AuLsc_DI_CHILD3:
+		ii_write_lock_child3(inode);
+		break;
+	case AuLsc_DI_PARENT:
+		ii_write_lock_parent(inode);
+		break;
+	case AuLsc_DI_PARENT2:
+		ii_write_lock_parent2(inode);
+		break;
+	case AuLsc_DI_PARENT3:
+		ii_write_lock_parent3(inode);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void do_ii_read_lock(struct inode *inode, unsigned int lsc)
+{
+	switch (lsc) {
+	case AuLsc_DI_CHILD:
+		ii_read_lock_child(inode);
+		break;
+	case AuLsc_DI_CHILD2:
+		ii_read_lock_child2(inode);
+		break;
+	case AuLsc_DI_CHILD3:
+		ii_read_lock_child3(inode);
+		break;
+	case AuLsc_DI_PARENT:
+		ii_read_lock_parent(inode);
+		break;
+	case AuLsc_DI_PARENT2:
+		ii_read_lock_parent2(inode);
+		break;
+	case AuLsc_DI_PARENT3:
+		ii_read_lock_parent3(inode);
+		break;
+	default:
+		BUG();
+	}
+}
+
+void di_read_lock(struct dentry *d, int flags, unsigned int lsc)
+{
+	struct inode *inode;
+
+	au_rw_read_lock_nested(&au_di(d)->di_rwsem, lsc);
+	if (d_really_is_positive(d)) {
+		inode = d_inode(d);
+		if (au_ftest_lock(flags, IW))
+			do_ii_write_lock(inode, lsc);
+		else if (au_ftest_lock(flags, IR))
+			do_ii_read_lock(inode, lsc);
+	}
+}
+
+void di_read_unlock(struct dentry *d, int flags)
+{
+	struct inode *inode;
+
+	if (d_really_is_positive(d)) {
+		inode = d_inode(d);
+		if (au_ftest_lock(flags, IW)) {
+			au_dbg_verify_dinode(d);
+			ii_write_unlock(inode);
+		} else if (au_ftest_lock(flags, IR)) {
+			au_dbg_verify_dinode(d);
+			ii_read_unlock(inode);
+		}
+	}
+	au_rw_read_unlock(&au_di(d)->di_rwsem);
+}
+
+void di_downgrade_lock(struct dentry *d, int flags)
+{
+	if (d_really_is_positive(d) && au_ftest_lock(flags, IR))
+		ii_downgrade_lock(d_inode(d));
+	au_rw_dgrade_lock(&au_di(d)->di_rwsem);
+}
+
+void di_write_lock(struct dentry *d, unsigned int lsc)
+{
+	au_rw_write_lock_nested(&au_di(d)->di_rwsem, lsc);
+	if (d_really_is_positive(d))
+		do_ii_write_lock(d_inode(d), lsc);
+}
+
+void di_write_unlock(struct dentry *d)
+{
+	au_dbg_verify_dinode(d);
+	if (d_really_is_positive(d))
+		ii_write_unlock(d_inode(d));
+	au_rw_write_unlock(&au_di(d)->di_rwsem);
+}
+
+void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir)
+{
+	AuDebugOn(d1 == d2
+		  || d_inode(d1) == d_inode(d2)
+		  || d1->d_sb != d2->d_sb);
+
+	if (isdir && au_test_subdir(d1, d2)) {
+		di_write_lock_child(d1);
+		di_write_lock_child2(d2);
+	} else {
+		/* there should be no races */
+		di_write_lock_child(d2);
+		di_write_lock_child2(d1);
+	}
+}
+
+void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir)
+{
+	AuDebugOn(d1 == d2
+		  || d_inode(d1) == d_inode(d2)
+		  || d1->d_sb != d2->d_sb);
+
+	if (isdir && au_test_subdir(d1, d2)) {
+		di_write_lock_parent(d1);
+		di_write_lock_parent2(d2);
+	} else {
+		/* there should be no races */
+		di_write_lock_parent(d2);
+		di_write_lock_parent2(d1);
+	}
+}
+
+void di_write_unlock2(struct dentry *d1, struct dentry *d2)
+{
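+	/* when d1 and d2 share an inode, release its ii lock only once */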
+	di_write_unlock(d1);
+	if (d_inode(d1) == d_inode(d2))
+		au_rw_write_unlock(&au_di(d2)->di_rwsem);
+	else
+		di_write_unlock(d2);
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	struct dentry *d;
+
+	DiMustAnyLock(dentry);
+
+	if (au_dbtop(dentry) < 0 || bindex < au_dbtop(dentry))
+		return NULL;
+	AuDebugOn(bindex < 0);
+	d = au_hdentry(au_di(dentry), bindex)->hd_dentry;
+	AuDebugOn(d && au_dcount(d) <= 0);
+	return d;
+}
+
+/*
+ * extended version of au_h_dptr().
+ * returns a hashed and positive (or linkable) h_dentry in bindex, NULL, or
+ * error.
+ */
+struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	struct dentry *h_dentry;
+	struct inode *inode, *h_inode;
+
+	AuDebugOn(d_really_is_negative(dentry));
+
+	h_dentry = NULL;
+	if (au_dbtop(dentry) <= bindex
+	    && bindex <= au_dbbot(dentry))
+		h_dentry = au_h_dptr(dentry, bindex);
+	if (h_dentry && !au_d_linkable(h_dentry)) {
+		dget(h_dentry);
+		goto out; /* success */
+	}
+
+	inode = d_inode(dentry);
+	AuDebugOn(bindex < au_ibtop(inode));
+	AuDebugOn(au_ibbot(inode) < bindex);
+	h_inode = au_h_iptr(inode, bindex);
+	h_dentry = d_find_alias(h_inode);
+	if (h_dentry) {
+		if (!IS_ERR(h_dentry)) {
+			if (!au_d_linkable(h_dentry))
+				goto out; /* success */
+			dput(h_dentry);
+		} else
+			goto out;
+	}
+
+	if (au_opt_test(au_mntflags(dentry->d_sb), PLINK)) {
+		h_dentry = au_plink_lkup(inode, bindex);
+		AuDebugOn(!h_dentry);
+		if (!IS_ERR(h_dentry)) {
+			if (!au_d_hashed_positive(h_dentry))
+				goto out; /* success */
+			dput(h_dentry);
+			h_dentry = NULL;
+		}
+	}
+
+out:
+	AuDbgDentry(h_dentry);
+	return h_dentry;
+}
+
+aufs_bindex_t au_dbtail(struct dentry *dentry)
+{
+	aufs_bindex_t bbot, bwh;
+
+	bbot = au_dbbot(dentry);
+	if (0 <= bbot) {
+		bwh = au_dbwh(dentry);
+		if (!bwh)
+			return bwh;
+		if (0 < bwh && bwh < bbot)
+			return bwh - 1;
+	}
+	return bbot;
+}
+
+aufs_bindex_t au_dbtaildir(struct dentry *dentry)
+{
+	aufs_bindex_t bbot, bopq;
+
+	bbot = au_dbtail(dentry);
+	if (0 <= bbot) {
+		bopq = au_dbdiropq(dentry);
+		if (0 <= bopq && bopq < bbot)
+			bbot = bopq;
+	}
+	return bbot;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex,
+		   struct dentry *h_dentry)
+{
+	struct au_dinfo *dinfo;
+	struct au_hdentry *hd;
+	struct au_branch *br;
+
+	DiMustWriteLock(dentry);
+
+	dinfo = au_di(dentry);
+	hd = au_hdentry(dinfo, bindex);
+	au_hdput(hd);
+	hd->hd_dentry = h_dentry;
+	if (h_dentry) {
+		br = au_sbr(dentry->d_sb, bindex);
+		hd->hd_id = br->br_id;
+	}
+}
+
+int au_dbrange_test(struct dentry *dentry)
+{
+	int err;
+	aufs_bindex_t btop, bbot;
+
+	err = 0;
+	btop = au_dbtop(dentry);
+	bbot = au_dbbot(dentry);
+	if (btop >= 0)
+		AuDebugOn(bbot < 0 && btop > bbot);
+	else {
+		err = -EIO;
+		AuDebugOn(bbot >= 0);
+	}
+
+	return err;
+}
+
+int au_digen_test(struct dentry *dentry, unsigned int sigen)
+{
+	int err;
+
+	err = 0;
+	if (unlikely(au_digen(dentry) != sigen
+		     || au_iigen_test(d_inode(dentry), sigen)))
+		err = -EIO;
+
+	return err;
+}
+
+void au_update_digen(struct dentry *dentry)
+{
+	atomic_set(&au_di(dentry)->di_generation, au_sigen(dentry->d_sb));
+	/* smp_mb(); */ /* atomic_set */
+}
+
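+/*
+ * re-calculate the btop/bbot range so that it covers the remaining lower
+ * dentries. when @do_put_zero is set, negative lower dentries are dropped
+ * first.
+ */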
+void au_update_dbrange(struct dentry *dentry, int do_put_zero)
+{
+	struct au_dinfo *dinfo;
+	struct dentry *h_d;
+	struct au_hdentry *hdp;
+	aufs_bindex_t bindex, bbot;
+
+	DiMustWriteLock(dentry);
+
+	dinfo = au_di(dentry);
+	if (!dinfo || dinfo->di_btop < 0)
+		return;
+
+	if (do_put_zero) {
+		bbot = dinfo->di_bbot;
+		bindex = dinfo->di_btop;
+		hdp = au_hdentry(dinfo, bindex);
+		for (; bindex <= bbot; bindex++, hdp++) {
+			h_d = hdp->hd_dentry;
+			if (h_d && d_is_negative(h_d))
+				au_set_h_dptr(dentry, bindex, NULL);
+		}
+	}
+
+	dinfo->di_btop = 0;
+	hdp = au_hdentry(dinfo, dinfo->di_btop);
+	for (; dinfo->di_btop <= dinfo->di_bbot; dinfo->di_btop++, hdp++)
+		if (hdp->hd_dentry)
+			break;
+	if (dinfo->di_btop > dinfo->di_bbot) {
+		dinfo->di_btop = -1;
+		dinfo->di_bbot = -1;
+		return;
+	}
+
+	hdp = au_hdentry(dinfo, dinfo->di_bbot);
+	for (; dinfo->di_bbot >= 0; dinfo->di_bbot--, hdp--)
+		if (hdp->hd_dentry)
+			break;
+	AuDebugOn(dinfo->di_btop > dinfo->di_bbot || dinfo->di_bbot < 0);
+}
+
+void au_update_dbtop(struct dentry *dentry)
+{
+	aufs_bindex_t bindex, bbot;
+	struct dentry *h_dentry;
+
+	bbot = au_dbbot(dentry);
+	for (bindex = au_dbtop(dentry); bindex <= bbot; bindex++) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (!h_dentry)
+			continue;
+		if (d_is_positive(h_dentry)) {
+			au_set_dbtop(dentry, bindex);
+			return;
+		}
+		au_set_h_dptr(dentry, bindex, NULL);
+	}
+}
+
+void au_update_dbbot(struct dentry *dentry)
+{
+	aufs_bindex_t bindex, btop;
+	struct dentry *h_dentry;
+
+	btop = au_dbtop(dentry);
+	for (bindex = au_dbbot(dentry); bindex >= btop; bindex--) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (!h_dentry)
+			continue;
+		if (d_is_positive(h_dentry)) {
+			au_set_dbbot(dentry, bindex);
+			return;
+		}
+		au_set_h_dptr(dentry, bindex, NULL);
+	}
+}
+
+int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry)
+{
+	aufs_bindex_t bindex, bbot;
+
+	bbot = au_dbbot(dentry);
+	for (bindex = au_dbtop(dentry); bindex <= bbot; bindex++)
+		if (au_h_dptr(dentry, bindex) == h_dentry)
+			return bindex;
+	return -1;
+}
diff --git b/fs/aufs/dir.c b/fs/aufs/dir.c
new file mode 100644
index 0000000..395563e
--- /dev/null
+++ b/fs/aufs/dir.c
@@ -0,0 +1,749 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * directory operations
+ */
+
+#include <linux/fs_stack.h>
+#include "aufs.h"
+
+void au_add_nlink(struct inode *dir, struct inode *h_dir)
+{
+	unsigned int nlink;
+
+	AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode));
+
+	nlink = dir->i_nlink;
+	nlink += h_dir->i_nlink - 2;
+	if (h_dir->i_nlink < 2)
+		nlink += 2;
+	smp_mb(); /* for i_nlink */
+	/* 0 can happen in revalidating */
+	set_nlink(dir, nlink);
+}
+
+void au_sub_nlink(struct inode *dir, struct inode *h_dir)
+{
+	unsigned int nlink;
+
+	AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode));
+
+	nlink = dir->i_nlink;
+	nlink -= h_dir->i_nlink - 2;
+	if (h_dir->i_nlink < 2)
+		nlink -= 2;
+	smp_mb(); /* for i_nlink */
+	/* nlink == 0 means the branch-fs is broken */
+	set_nlink(dir, nlink);
+}
+
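+/*
+ * estimate the dir size by summing up the sizes of the lower dirs, rounded
+ * up to a power of two and clamped to KMALLOC_MAX_SIZE
+ */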
+loff_t au_dir_size(struct file *file, struct dentry *dentry)
+{
+	loff_t sz;
+	aufs_bindex_t bindex, bbot;
+	struct file *h_file;
+	struct dentry *h_dentry;
+
+	sz = 0;
+	if (file) {
+		AuDebugOn(!d_is_dir(file->f_path.dentry));
+
+		bbot = au_fbbot_dir(file);
+		for (bindex = au_fbtop(file);
+		     bindex <= bbot && sz < KMALLOC_MAX_SIZE;
+		     bindex++) {
+			h_file = au_hf_dir(file, bindex);
+			if (h_file && file_inode(h_file))
+				sz += vfsub_f_size_read(h_file);
+		}
+	} else {
+		AuDebugOn(!dentry);
+		AuDebugOn(!d_is_dir(dentry));
+
+		bbot = au_dbtaildir(dentry);
+		for (bindex = au_dbtop(dentry);
+		     bindex <= bbot && sz < KMALLOC_MAX_SIZE;
+		     bindex++) {
+			h_dentry = au_h_dptr(dentry, bindex);
+			if (h_dentry && d_is_positive(h_dentry))
+				sz += i_size_read(d_inode(h_dentry));
+		}
+	}
+	if (sz < KMALLOC_MAX_SIZE)
+		sz = roundup_pow_of_two(sz);
+	if (sz > KMALLOC_MAX_SIZE)
+		sz = KMALLOC_MAX_SIZE;
+	else if (sz < NAME_MAX) {
+		BUILD_BUG_ON(AUFS_RDBLK_DEF < NAME_MAX);
+		sz = AUFS_RDBLK_DEF;
+	}
+	return sz;
+}
+
+struct au_dir_ts_arg {
+	struct dentry *dentry;
+	aufs_bindex_t brid;
+};
+
+static void au_do_dir_ts(void *arg)
+{
+	struct au_dir_ts_arg *a = arg;
+	struct au_dtime dt;
+	struct path h_path;
+	struct inode *dir, *h_dir;
+	struct super_block *sb;
+	struct au_branch *br;
+	struct au_hinode *hdir;
+	int err;
+	aufs_bindex_t btop, bindex;
+
+	sb = a->dentry->d_sb;
+	if (d_really_is_negative(a->dentry))
+		goto out;
+	/* no dir->i_mutex lock */
+	aufs_read_lock(a->dentry, AuLock_DW); /* noflush */
+
+	dir = d_inode(a->dentry);
+	btop = au_ibtop(dir);
+	bindex = au_br_index(sb, a->brid);
+	if (bindex < btop)
+		goto out_unlock;
+
+	br = au_sbr(sb, bindex);
+	h_path.dentry = au_h_dptr(a->dentry, bindex);
+	if (!h_path.dentry)
+		goto out_unlock;
+	h_path.mnt = au_br_mnt(br);
+	au_dtime_store(&dt, a->dentry, &h_path);
+
+	br = au_sbr(sb, btop);
+	if (!au_br_writable(br->br_perm))
+		goto out_unlock;
+	h_path.dentry = au_h_dptr(a->dentry, btop);
+	h_path.mnt = au_br_mnt(br);
+	err = vfsub_mnt_want_write(h_path.mnt);
+	if (err)
+		goto out_unlock;
+	hdir = au_hi(dir, btop);
+	au_hn_inode_lock_nested(hdir, AuLsc_I_PARENT);
+	h_dir = au_h_iptr(dir, btop);
+	if (h_dir->i_nlink
+	    && timespec_compare(&h_dir->i_mtime, &dt.dt_mtime) < 0) {
+		dt.dt_h_path = h_path;
+		au_dtime_revert(&dt);
+	}
+	au_hn_inode_unlock(hdir);
+	vfsub_mnt_drop_write(h_path.mnt);
+	au_cpup_attr_timesizes(dir);
+
+out_unlock:
+	aufs_read_unlock(a->dentry, AuLock_DW);
+out:
+	dput(a->dentry);
+	au_nwt_done(&au_sbi(sb)->si_nowait);
+	au_delayed_kfree(arg);
+}
+
+void au_dir_ts(struct inode *dir, aufs_bindex_t bindex)
+{
+	int perm, wkq_err;
+	aufs_bindex_t btop;
+	struct au_dir_ts_arg *arg;
+	struct dentry *dentry;
+	struct super_block *sb;
+
+	IMustLock(dir);
+
+	dentry = d_find_any_alias(dir);
+	AuDebugOn(!dentry);
+	sb = dentry->d_sb;
+	btop = au_ibtop(dir);
+	if (btop == bindex) {
+		au_cpup_attr_timesizes(dir);
+		goto out;
+	}
+
+	perm = au_sbr_perm(sb, btop);
+	if (!au_br_writable(perm))
+		goto out;
+
+	arg = kmalloc(sizeof(*arg), GFP_NOFS);
+	if (!arg)
+		goto out;
+
+	arg->dentry = dget(dentry); /* will be dput-ted by au_do_dir_ts() */
+	arg->brid = au_sbr_id(sb, bindex);
+	wkq_err = au_wkq_nowait(au_do_dir_ts, arg, sb, /*flags*/0);
+	if (unlikely(wkq_err)) {
+		pr_err("wkq %d\n", wkq_err);
+		dput(dentry);
+		au_delayed_kfree(arg);
+	}
+
+out:
+	dput(dentry);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int reopen_dir(struct file *file)
+{
+	int err;
+	unsigned int flags;
+	aufs_bindex_t bindex, btail, btop;
+	struct dentry *dentry, *h_dentry;
+	struct file *h_file;
+
+	/* open all lower dirs */
+	dentry = file->f_path.dentry;
+	btop = au_dbtop(dentry);
+	for (bindex = au_fbtop(file); bindex < btop; bindex++)
+		au_set_h_fptr(file, bindex, NULL);
+	au_set_fbtop(file, btop);
+
+	btail = au_dbtaildir(dentry);
+	for (bindex = au_fbbot_dir(file); btail < bindex; bindex--)
+		au_set_h_fptr(file, bindex, NULL);
+	au_set_fbbot_dir(file, btail);
+
+	flags = vfsub_file_flags(file);
+	for (bindex = btop; bindex <= btail; bindex++) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (!h_dentry)
+			continue;
+		h_file = au_hf_dir(file, bindex);
+		if (h_file)
+			continue;
+
+		h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0);
+		err = PTR_ERR(h_file);
+		if (IS_ERR(h_file))
+			goto out; /* close all? */
+		au_set_h_fptr(file, bindex, h_file);
+	}
+	au_update_figen(file);
+	/* todo: necessary? */
+	/* file->f_ra = h_file->f_ra; */
+	err = 0;
+
+out:
+	return err;
+}
+
+static int do_open_dir(struct file *file, int flags, struct file *h_file)
+{
+	int err;
+	aufs_bindex_t bindex, btail;
+	struct dentry *dentry, *h_dentry;
+	struct vfsmount *mnt;
+
+	FiMustWriteLock(file);
+	AuDebugOn(h_file);
+
+	err = 0;
+	mnt = file->f_path.mnt;
+	dentry = file->f_path.dentry;
+	file->f_version = d_inode(dentry)->i_version;
+	bindex = au_dbtop(dentry);
+	au_set_fbtop(file, bindex);
+	btail = au_dbtaildir(dentry);
+	au_set_fbbot_dir(file, btail);
+	for (; !err && bindex <= btail; bindex++) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (!h_dentry)
+			continue;
+
+		err = vfsub_test_mntns(mnt, h_dentry->d_sb);
+		if (unlikely(err))
+			break;
+		h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0);
+		if (IS_ERR(h_file)) {
+			err = PTR_ERR(h_file);
+			break;
+		}
+		au_set_h_fptr(file, bindex, h_file);
+	}
+	au_update_figen(file);
+	/* todo: necessary? */
+	/* file->f_ra = h_file->f_ra; */
+	if (!err)
+		return 0; /* success */
+
+	/* close all */
+	for (bindex = au_fbtop(file); bindex <= btail; bindex++)
+		au_set_h_fptr(file, bindex, NULL);
+	au_set_fbtop(file, -1);
+	au_set_fbbot_dir(file, -1);
+
+	return err;
+}
+
+static int aufs_open_dir(struct inode *inode __maybe_unused,
+			 struct file *file)
+{
+	int err;
+	struct super_block *sb;
+	struct au_fidir *fidir;
+
+	err = -ENOMEM;
+	sb = file->f_path.dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH);
+	fidir = au_fidir_alloc(sb);
+	if (fidir) {
+		struct au_do_open_args args = {
+			.open	= do_open_dir,
+			.fidir	= fidir
+		};
+		err = au_do_open(file, &args);
+		if (unlikely(err))
+			au_delayed_kfree(fidir);
+	}
+	si_read_unlock(sb);
+	return err;
+}
+
+static int aufs_release_dir(struct inode *inode __maybe_unused,
+			    struct file *file)
+{
+	struct au_vdir *vdir_cache;
+	struct au_finfo *finfo;
+	struct au_fidir *fidir;
+	struct au_hfile *hf;
+	aufs_bindex_t bindex, bbot;
+	int execed, delayed;
+
+	delayed = (current->flags & PF_KTHREAD) || in_interrupt();
+	finfo = au_fi(file);
+	fidir = finfo->fi_hdir;
+	if (fidir) {
+		au_sphl_del(&finfo->fi_hlist,
+			    &au_sbi(file->f_path.dentry->d_sb)->si_files);
+		vdir_cache = fidir->fd_vdir_cache; /* lock-free */
+		if (vdir_cache)
+			au_vdir_free(vdir_cache, delayed);
+
+		bindex = finfo->fi_btop;
+		if (bindex >= 0) {
+			execed = vfsub_file_execed(file);
+			hf = fidir->fd_hfile + bindex;
+			/*
+			 * calls fput() instead of filp_close(),
+			 * since there is no dnotify or lock on the lower file.
+			 */
+			bbot = fidir->fd_bbot;
+			for (; bindex <= bbot; bindex++, hf++)
+				if (hf->hf_file)
+					au_hfput(hf, execed);
+		}
+		au_delayed_kfree(fidir);
+		finfo->fi_hdir = NULL;
+	}
+	au_finfo_fin(file, delayed);
+	return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_do_flush_dir(struct file *file, fl_owner_t id)
+{
+	int err;
+	aufs_bindex_t bindex, bbot;
+	struct file *h_file;
+
+	err = 0;
+	bbot = au_fbbot_dir(file);
+	for (bindex = au_fbtop(file); !err && bindex <= bbot; bindex++) {
+		h_file = au_hf_dir(file, bindex);
+		if (h_file)
+			err = vfsub_flush(h_file, id);
+	}
+	return err;
+}
+
+static int aufs_flush_dir(struct file *file, fl_owner_t id)
+{
+	return au_do_flush(file, id, au_do_flush_dir);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_do_fsync_dir_no_file(struct dentry *dentry, int datasync)
+{
+	int err;
+	aufs_bindex_t bbot, bindex;
+	struct inode *inode;
+	struct super_block *sb;
+
+	err = 0;
+	sb = dentry->d_sb;
+	inode = d_inode(dentry);
+	IMustLock(inode);
+	bbot = au_dbbot(dentry);
+	for (bindex = au_dbtop(dentry); !err && bindex <= bbot; bindex++) {
+		struct path h_path;
+
+		if (au_test_ro(sb, bindex, inode))
+			continue;
+		h_path.dentry = au_h_dptr(dentry, bindex);
+		if (!h_path.dentry)
+			continue;
+
+		h_path.mnt = au_sbr_mnt(sb, bindex);
+		err = vfsub_fsync(NULL, &h_path, datasync);
+	}
+
+	return err;
+}
+
+static int au_do_fsync_dir(struct file *file, int datasync)
+{
+	int err;
+	aufs_bindex_t bbot, bindex;
+	struct file *h_file;
+	struct super_block *sb;
+	struct inode *inode;
+
+	err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1);
+	if (unlikely(err))
+		goto out;
+
+	inode = file_inode(file);
+	sb = inode->i_sb;
+	bbot = au_fbbot_dir(file);
+	for (bindex = au_fbtop(file); !err && bindex <= bbot; bindex++) {
+		h_file = au_hf_dir(file, bindex);
+		if (!h_file || au_test_ro(sb, bindex, inode))
+			continue;
+
+		err = vfsub_fsync(h_file, &h_file->f_path, datasync);
+	}
+
+out:
+	return err;
+}
+
+/*
+ * @file may be NULL
+ */
+static int aufs_fsync_dir(struct file *file, loff_t start, loff_t end,
+			  int datasync)
+{
+	int err;
+	struct dentry *dentry;
+	struct inode *inode;
+	struct super_block *sb;
+
+	err = 0;
+	dentry = file->f_path.dentry;
+	inode = d_inode(dentry);
+	inode_lock(inode);
+	sb = dentry->d_sb;
+	si_noflush_read_lock(sb);
+	if (file)
+		err = au_do_fsync_dir(file, datasync);
+	else {
+		di_write_lock_child(dentry);
+		err = au_do_fsync_dir_no_file(dentry, datasync);
+	}
+	au_cpup_attr_timesizes(inode);
+	di_write_unlock(dentry);
+	if (file)
+		fi_write_unlock(file);
+
+	si_read_unlock(sb);
+	inode_unlock(inode);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int aufs_iterate_shared(struct file *file, struct dir_context *ctx)
+{
+	int err;
+	struct dentry *dentry;
+	struct inode *inode, *h_inode;
+	struct super_block *sb;
+
+	AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos);
+
+	dentry = file->f_path.dentry;
+	inode = d_inode(dentry);
+	IMustLock(inode);
+
+	sb = dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH);
+	err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1);
+	if (unlikely(err))
+		goto out;
+	err = au_alive_dir(dentry);
+	if (!err)
+		err = au_vdir_init(file);
+	di_downgrade_lock(dentry, AuLock_IR);
+	if (unlikely(err))
+		goto out_unlock;
+
+	h_inode = au_h_iptr(inode, au_ibtop(inode));
+	if (!au_test_nfsd()) {
+		err = au_vdir_fill_de(file, ctx);
+		fsstack_copy_attr_atime(inode, h_inode);
+	} else {
+		/*
+		 * nfsd filldir may call lookup_one_len(), vfs_getattr(),
+		 * encode_fh() and others.
+		 */
+		atomic_inc(&h_inode->i_count);
+		di_read_unlock(dentry, AuLock_IR);
+		si_read_unlock(sb);
+		err = au_vdir_fill_de(file, ctx);
+		fsstack_copy_attr_atime(inode, h_inode);
+		fi_write_unlock(file);
+		iput(h_inode);
+
+		AuTraceErr(err);
+		return err;
+	}
+
+out_unlock:
+	di_read_unlock(dentry, AuLock_IR);
+	fi_write_unlock(file);
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#define AuTestEmpty_WHONLY	1
+#define AuTestEmpty_CALLED	(1 << 1)
+#define AuTestEmpty_SHWH	(1 << 2)
+#define au_ftest_testempty(flags, name)	((flags) & AuTestEmpty_##name)
+#define au_fset_testempty(flags, name) \
+	do { (flags) |= AuTestEmpty_##name; } while (0)
+#define au_fclr_testempty(flags, name) \
+	do { (flags) &= ~AuTestEmpty_##name; } while (0)
+
+#ifndef CONFIG_AUFS_SHWH
+#undef AuTestEmpty_SHWH
+#define AuTestEmpty_SHWH	0
+#endif
+
+struct test_empty_arg {
+	struct dir_context ctx;
+	struct au_nhash *whlist;
+	unsigned int flags;
+	int err;
+	aufs_bindex_t bindex;
+};
+
+static int test_empty_cb(struct dir_context *ctx, const char *__name,
+			 int namelen, loff_t offset __maybe_unused, u64 ino,
+			 unsigned int d_type)
+{
+	struct test_empty_arg *arg = container_of(ctx, struct test_empty_arg,
+						  ctx);
+	char *name = (void *)__name;
+
+	arg->err = 0;
+	au_fset_testempty(arg->flags, CALLED);
+	/* smp_mb(); */
+	if (name[0] == '.'
+	    && (namelen == 1 || (name[1] == '.' && namelen == 2)))
+		goto out; /* success */
+
+	if (namelen <= AUFS_WH_PFX_LEN
+	    || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) {
+		if (au_ftest_testempty(arg->flags, WHONLY)
+		    && !au_nhash_test_known_wh(arg->whlist, name, namelen))
+			arg->err = -ENOTEMPTY;
+		goto out;
+	}
+
+	name += AUFS_WH_PFX_LEN;
+	namelen -= AUFS_WH_PFX_LEN;
+	if (!au_nhash_test_known_wh(arg->whlist, name, namelen))
+		arg->err = au_nhash_append_wh
+			(arg->whlist, name, namelen, ino, d_type, arg->bindex,
+			 au_ftest_testempty(arg->flags, SHWH));
+
+out:
+	/* smp_mb(); */
+	AuTraceErr(arg->err);
+	return arg->err;
+}
+
+static int do_test_empty(struct dentry *dentry, struct test_empty_arg *arg)
+{
+	int err;
+	struct file *h_file;
+
+	h_file = au_h_open(dentry, arg->bindex,
+			   O_RDONLY | O_NONBLOCK | O_DIRECTORY | O_LARGEFILE,
+			   /*file*/NULL, /*force_wr*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	err = 0;
+	if (!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE)
+	    && !file_inode(h_file)->i_nlink)
+		goto out_put;
+
+	do {
+		arg->err = 0;
+		au_fclr_testempty(arg->flags, CALLED);
+		/* smp_mb(); */
+		err = vfsub_iterate_dir(h_file, &arg->ctx);
+		if (err >= 0)
+			err = arg->err;
+	} while (!err && au_ftest_testempty(arg->flags, CALLED));
+
+out_put:
+	fput(h_file);
+	au_sbr_put(dentry->d_sb, arg->bindex);
+out:
+	return err;
+}
+
+struct do_test_empty_args {
+	int *errp;
+	struct dentry *dentry;
+	struct test_empty_arg *arg;
+};
+
+static void call_do_test_empty(void *args)
+{
+	struct do_test_empty_args *a = args;
+	*a->errp = do_test_empty(a->dentry, a->arg);
+}
+
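+/*
+ * when the current task is not allowed to read the lower dir, retry
+ * do_test_empty() via the aufs workqueue
+ */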
+static int sio_test_empty(struct dentry *dentry, struct test_empty_arg *arg)
+{
+	int err, wkq_err;
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+
+	h_dentry = au_h_dptr(dentry, arg->bindex);
+	h_inode = d_inode(h_dentry);
+	/* todo: i_mode changes anytime? */
+	inode_lock_nested(h_inode, AuLsc_I_CHILD);
+	err = au_test_h_perm_sio(h_inode, MAY_EXEC | MAY_READ);
+	inode_unlock(h_inode);
+	if (!err)
+		err = do_test_empty(dentry, arg);
+	else {
+		struct do_test_empty_args args = {
+			.errp	= &err,
+			.dentry	= dentry,
+			.arg	= arg
+		};
+		unsigned int flags = arg->flags;
+
+		wkq_err = au_wkq_wait(call_do_test_empty, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+		arg->flags = flags;
+	}
+
+	return err;
+}
+
+int au_test_empty_lower(struct dentry *dentry)
+{
+	int err;
+	unsigned int rdhash;
+	aufs_bindex_t bindex, btop, btail;
+	struct au_nhash whlist;
+	struct test_empty_arg arg = {
+		.ctx = {
+			.actor = test_empty_cb
+		}
+	};
+	int (*test_empty)(struct dentry *dentry, struct test_empty_arg *arg);
+
+	SiMustAnyLock(dentry->d_sb);
+
+	rdhash = au_sbi(dentry->d_sb)->si_rdhash;
+	if (!rdhash)
+		rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, dentry));
+	err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+
+	arg.flags = 0;
+	arg.whlist = &whlist;
+	btop = au_dbtop(dentry);
+	if (au_opt_test(au_mntflags(dentry->d_sb), SHWH))
+		au_fset_testempty(arg.flags, SHWH);
+	test_empty = do_test_empty;
+	if (au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1))
+		test_empty = sio_test_empty;
+	arg.bindex = btop;
+	err = test_empty(dentry, &arg);
+	if (unlikely(err))
+		goto out_whlist;
+
+	au_fset_testempty(arg.flags, WHONLY);
+	btail = au_dbtaildir(dentry);
+	for (bindex = btop + 1; !err && bindex <= btail; bindex++) {
+		struct dentry *h_dentry;
+
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (h_dentry && d_is_positive(h_dentry)) {
+			arg.bindex = bindex;
+			err = test_empty(dentry, &arg);
+		}
+	}
+
+out_whlist:
+	au_nhash_wh_free(&whlist);
+out:
+	return err;
+}
+
+int au_test_empty(struct dentry *dentry, struct au_nhash *whlist)
+{
+	int err;
+	struct test_empty_arg arg = {
+		.ctx = {
+			.actor = test_empty_cb
+		}
+	};
+	aufs_bindex_t bindex, btail;
+
+	err = 0;
+	arg.whlist = whlist;
+	arg.flags = AuTestEmpty_WHONLY;
+	if (au_opt_test(au_mntflags(dentry->d_sb), SHWH))
+		au_fset_testempty(arg.flags, SHWH);
+	btail = au_dbtaildir(dentry);
+	for (bindex = au_dbtop(dentry); !err && bindex <= btail; bindex++) {
+		struct dentry *h_dentry;
+
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (h_dentry && d_is_positive(h_dentry)) {
+			arg.bindex = bindex;
+			err = sio_test_empty(dentry, &arg);
+		}
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+const struct file_operations aufs_dir_fop = {
+	.owner		= THIS_MODULE,
+	.llseek		= default_llseek,
+	.read		= generic_read_dir,
+	.iterate_shared	= aufs_iterate_shared,
+	.unlocked_ioctl	= aufs_ioctl_dir,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= aufs_compat_ioctl_dir,
+#endif
+	.open		= aufs_open_dir,
+	.release	= aufs_release_dir,
+	.flush		= aufs_flush_dir,
+	.fsync		= aufs_fsync_dir
+};
diff --git b/fs/aufs/dir.h b/fs/aufs/dir.h
new file mode 100644
index 0000000..4f3945a
--- /dev/null
+++ b/fs/aufs/dir.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * directory operations
+ */
+
+#ifndef __AUFS_DIR_H__
+#define __AUFS_DIR_H__
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>
+
+/* ---------------------------------------------------------------------- */
+
+/* need to be faster and smaller */
+
+struct au_nhash {
+	unsigned int		nh_num;
+	struct hlist_head	*nh_head;
+};
+
+struct au_vdir_destr {
+	unsigned char	len;
+	unsigned char	name[0];
+} __packed;
+
+struct au_vdir_dehstr {
+	struct hlist_node	hash;
+	union {
+		struct au_vdir_destr	*str;
+		struct llist_node	lnode;	/* delayed free */
+	};
+} ____cacheline_aligned_in_smp;
+
+struct au_vdir_de {
+	ino_t			de_ino;
+	unsigned char		de_type;
+	/* caution: packed */
+	struct au_vdir_destr	de_str;
+} __packed;
+
+struct au_vdir_wh {
+	struct hlist_node	wh_hash;
+#ifdef CONFIG_AUFS_SHWH
+	ino_t			wh_ino;
+	aufs_bindex_t		wh_bindex;
+	unsigned char		wh_type;
+#else
+	aufs_bindex_t		wh_bindex;
+#endif
+	/* caution: packed */
+	struct au_vdir_destr	wh_str;
+} __packed;
+
+union au_vdir_deblk_p {
+	unsigned char		*deblk;
+	struct au_vdir_de	*de;
+};
+
+struct au_vdir {
+	unsigned char	**vd_deblk;
+	unsigned long	vd_nblk;
+	struct {
+		unsigned long		ul;
+		union au_vdir_deblk_p	p;
+	} vd_last;
+
+	unsigned long	vd_version;
+	unsigned int	vd_deblk_sz;
+	union {
+		unsigned long		vd_jiffy;
+		struct llist_node	vd_lnode;	/* delayed free */
+	};
+} ____cacheline_aligned_in_smp;
+
+/* ---------------------------------------------------------------------- */
+
+/* dir.c */
+extern const struct file_operations aufs_dir_fop;
+void au_add_nlink(struct inode *dir, struct inode *h_dir);
+void au_sub_nlink(struct inode *dir, struct inode *h_dir);
+loff_t au_dir_size(struct file *file, struct dentry *dentry);
+void au_dir_ts(struct inode *dir, aufs_bindex_t bsrc);
+int au_test_empty_lower(struct dentry *dentry);
+int au_test_empty(struct dentry *dentry, struct au_nhash *whlist);
+
+/* vdir.c */
+unsigned int au_rdhash_est(loff_t sz);
+int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp);
+void au_nhash_wh_free(struct au_nhash *whlist);
+int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt,
+			    int limit);
+int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen);
+int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino,
+		       unsigned int d_type, aufs_bindex_t bindex,
+		       unsigned char shwh);
+void au_vdir_free(struct au_vdir *vdir, int atonce);
+int au_vdir_init(struct file *file);
+int au_vdir_fill_de(struct file *file, struct dir_context *ctx);
+
+/* ioctl.c */
+long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg);
+
+#ifdef CONFIG_AUFS_RDU
+/* rdu.c */
+long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+#ifdef CONFIG_COMPAT
+long au_rdu_compat_ioctl(struct file *file, unsigned int cmd,
+			 unsigned long arg);
+#endif
+#else
+AuStub(long, au_rdu_ioctl, return -EINVAL, struct file *file,
+       unsigned int cmd, unsigned long arg)
+#ifdef CONFIG_COMPAT
+AuStub(long, au_rdu_compat_ioctl, return -EINVAL, struct file *file,
+       unsigned int cmd, unsigned long arg)
+#endif
+#endif
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_DIR_H__ */
diff --git b/fs/aufs/dynop.c b/fs/aufs/dynop.c
new file mode 100644
index 0000000..eb0e1a7
--- /dev/null
+++ b/fs/aufs/dynop.c
@@ -0,0 +1,358 @@
+/*
+ * Copyright (C) 2010-2016 Junjiro R. Okajima
+ */
+
+/*
+ * dynamically customizable operations for regular files
+ */
+
+#include "aufs.h"
+
+#define DyPrSym(key)	AuDbgSym(key->dk_op.dy_hop)
+
+/*
+ * How large will these lists be?
+ * Usually just a few elements, 20-30 at most for each, I guess.
+ */
+static struct au_sphlhead dynop[AuDyLast];
+
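+/* find the key whose lower op matches @h_op under RCU, and get a ref on it */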
+static struct au_dykey *dy_gfind_get(struct au_sphlhead *sphl, const void *h_op)
+{
+	struct au_dykey *key, *tmp;
+	struct hlist_head *head;
+
+	key = NULL;
+	head = &sphl->head;
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(tmp, head, dk_hnode)
+		if (tmp->dk_op.dy_hop == h_op) {
+			key = tmp;
+			kref_get(&key->dk_kref);
+			break;
+		}
+	rcu_read_unlock();
+
+	return key;
+}
+
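+/*
+ * add @key to the branch-local array. if a key for the same lower op is
+ * already there, return it instead.
+ */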
+static struct au_dykey *dy_bradd(struct au_branch *br, struct au_dykey *key)
+{
+	struct au_dykey **k, *found;
+	const void *h_op = key->dk_op.dy_hop;
+	int i;
+
+	found = NULL;
+	k = br->br_dykey;
+	for (i = 0; i < AuBrDynOp; i++)
+		if (k[i]) {
+			if (k[i]->dk_op.dy_hop == h_op) {
+				found = k[i];
+				break;
+			}
+		} else
+			break;
+	if (!found) {
+		spin_lock(&br->br_dykey_lock);
+		for (; i < AuBrDynOp; i++)
+			if (k[i]) {
+				if (k[i]->dk_op.dy_hop == h_op) {
+					found = k[i];
+					break;
+				}
+			} else {
+				k[i] = key;
+				break;
+			}
+		spin_unlock(&br->br_dykey_lock);
+		BUG_ON(i == AuBrDynOp); /* expand the array */
+	}
+
+	return found;
+}
+
+/* kref_get() if @key is already added */
+static struct au_dykey *dy_gadd(struct au_sphlhead *sphl, struct au_dykey *key)
+{
+	struct au_dykey *tmp, *found;
+	struct hlist_head *head;
+	const void *h_op = key->dk_op.dy_hop;
+
+	found = NULL;
+	head = &sphl->head;
+	spin_lock(&sphl->spin);
+	hlist_for_each_entry(tmp, head, dk_hnode)
+		if (tmp->dk_op.dy_hop == h_op) {
+			kref_get(&tmp->dk_kref);
+			found = tmp;
+			break;
+		}
+	if (!found)
+		hlist_add_head_rcu(&key->dk_hnode, head);
+	spin_unlock(&sphl->spin);
+
+	if (!found)
+		DyPrSym(key);
+	return found;
+}
+
+static void dy_free_rcu(struct rcu_head *rcu)
+{
+	struct au_dykey *key;
+
+	key = container_of(rcu, struct au_dykey, dk_rcu);
+	DyPrSym(key);
+	kfree(key);	/* not delayed */
+}
+
+static void dy_free(struct kref *kref)
+{
+	struct au_dykey *key;
+	struct au_sphlhead *sphl;
+
+	key = container_of(kref, struct au_dykey, dk_kref);
+	sphl = dynop + key->dk_op.dy_type;
+	au_sphl_del_rcu(&key->dk_hnode, sphl);
+	call_rcu(&key->dk_rcu, dy_free_rcu);
+}
+
+void au_dy_put(struct au_dykey *key)
+{
+	kref_put(&key->dk_kref, dy_free);
+}
+
+/* ---------------------------------------------------------------------- */
+
+#define DyDbgSize(cnt, op)	AuDebugOn(cnt != sizeof(op)/sizeof(void *))
+
+#ifdef CONFIG_AUFS_DEBUG
+#define DyDbgDeclare(cnt)	unsigned int cnt = 0
+#define DyDbgInc(cnt)		do { cnt++; } while (0)
+#else
+#define DyDbgDeclare(cnt)	do {} while (0)
+#define DyDbgInc(cnt)		do {} while (0)
+#endif
+
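+/*
+ * when the lower aop implements 'func', install the aufs wrapper for it (if
+ * aufs has one). DyDbgSize() verifies that every member was visited.
+ */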
+#define DySet(func, dst, src, h_op, h_sb) do {				\
+	DyDbgInc(cnt);							\
+	if (h_op->func) {						\
+		if (src.func)						\
+			dst.func = src.func;				\
+		else							\
+			AuDbg("%s %s\n", au_sbtype(h_sb), #func);	\
+	}								\
+} while (0)
+
+#define DySetForce(func, dst, src) do {		\
+	AuDebugOn(!src.func);			\
+	DyDbgInc(cnt);				\
+	dst.func = src.func;			\
+} while (0)
+
+#define DySetAop(func) \
+	DySet(func, dyaop->da_op, aufs_aop, h_aop, h_sb)
+#define DySetAopForce(func) \
+	DySetForce(func, dyaop->da_op, aufs_aop)
+
+static void dy_aop(struct au_dykey *key, const void *h_op,
+		   struct super_block *h_sb __maybe_unused)
+{
+	struct au_dyaop *dyaop = (void *)key;
+	const struct address_space_operations *h_aop = h_op;
+	DyDbgDeclare(cnt);
+
+	AuDbg("%s\n", au_sbtype(h_sb));
+
+	DySetAop(writepage);
+	DySetAopForce(readpage);	/* force */
+	DySetAop(writepages);
+	DySetAop(set_page_dirty);
+	DySetAop(readpages);
+	DySetAop(write_begin);
+	DySetAop(write_end);
+	DySetAop(bmap);
+	DySetAop(invalidatepage);
+	DySetAop(releasepage);
+	DySetAop(freepage);
+	/* this one will be changed according to an aufs mount option */
+	DySetAop(direct_IO);
+	DySetAop(migratepage);
+	DySetAop(isolate_page);
+	DySetAop(putback_page);
+	DySetAop(launder_page);
+	DySetAop(is_partially_uptodate);
+	DySetAop(is_dirty_writeback);
+	DySetAop(error_remove_page);
+	DySetAop(swap_activate);
+	DySetAop(swap_deactivate);
+
+	DyDbgSize(cnt, *h_aop);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void dy_bug(struct kref *kref)
+{
+	BUG();
+}
+
+static struct au_dykey *dy_get(struct au_dynop *op, struct au_branch *br)
+{
+	struct au_dykey *key, *old;
+	struct au_sphlhead *sphl;
+	struct op {
+		unsigned int sz;
+		void (*set)(struct au_dykey *key, const void *h_op,
+			    struct super_block *h_sb __maybe_unused);
+	};
+	static const struct op a[] = {
+		[AuDy_AOP] = {
+			.sz	= sizeof(struct au_dyaop),
+			.set	= dy_aop
+		}
+	};
+	const struct op *p;
+
+	sphl = dynop + op->dy_type;
+	key = dy_gfind_get(sphl, op->dy_hop);
+	if (key)
+		goto out_add; /* success */
+
+	p = a + op->dy_type;
+	key = kzalloc(p->sz, GFP_NOFS);
+	if (unlikely(!key)) {
+		key = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	key->dk_op.dy_hop = op->dy_hop;
+	kref_init(&key->dk_kref);
+	p->set(key, op->dy_hop, au_br_sb(br));
+	old = dy_gadd(sphl, key);
+	if (old) {
+		au_delayed_kfree(key);
+		key = old;
+	}
+
+out_add:
+	old = dy_bradd(br, key);
+	if (old)
+		/* its ref-count should never be zero here */
+		kref_put(&key->dk_kref, dy_bug);
+out:
+	return key;
+}
+
+/* ---------------------------------------------------------------------- */
+/*
+ * Aufs prohibits O_DIRECT by default even if the branch supports it.
+ * This behaviour is necessary to return an error from open(O_DIRECT) instead
+ * of the succeeding I/O. The dio mount option enables O_DIRECT and makes
+ * open(O_DIRECT) always succeed, but the succeeding I/O may return an error.
+ * See the aufs manual for details.
+ */
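+/*
+ * for example (assuming the lower branch itself supports O_DIRECT): without
+ * the 'dio' option, ->direct_IO stays NULL and open(O_RDWR | O_DIRECT)
+ * fails; with 'dio', the open succeeds and the lower branch decides whether
+ * the actual I/O works.
+ */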
+static void dy_adx(struct au_dyaop *dyaop, int do_dx)
+{
+	if (!do_dx)
+		dyaop->da_op.direct_IO = NULL;
+	else
+		dyaop->da_op.direct_IO = aufs_aop.direct_IO;
+}
+
+static struct au_dyaop *dy_aget(struct au_branch *br,
+				const struct address_space_operations *h_aop,
+				int do_dx)
+{
+	struct au_dyaop *dyaop;
+	struct au_dynop op;
+
+	op.dy_type = AuDy_AOP;
+	op.dy_haop = h_aop;
+	dyaop = (void *)dy_get(&op, br);
+	if (IS_ERR(dyaop))
+		goto out;
+	dy_adx(dyaop, do_dx);
+
+out:
+	return dyaop;
+}
+
+int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex,
+		struct inode *h_inode)
+{
+	int err, do_dx;
+	struct super_block *sb;
+	struct au_branch *br;
+	struct au_dyaop *dyaop;
+
+	AuDebugOn(!S_ISREG(h_inode->i_mode));
+	IiMustWriteLock(inode);
+
+	sb = inode->i_sb;
+	br = au_sbr(sb, bindex);
+	do_dx = !!au_opt_test(au_mntflags(sb), DIO);
+	dyaop = dy_aget(br, h_inode->i_mapping->a_ops, do_dx);
+	err = PTR_ERR(dyaop);
+	if (IS_ERR(dyaop))
+		/* unnecessary to call dy_fput() */
+		goto out;
+
+	err = 0;
+	inode->i_mapping->a_ops = &dyaop->da_op;
+
+out:
+	return err;
+}
+
+/*
+ * Is it safe to replace a_ops while the inode/file is in operation?
+ * Yes, I hope so.
+ */
+int au_dy_irefresh(struct inode *inode)
+{
+	int err;
+	aufs_bindex_t btop;
+	struct inode *h_inode;
+
+	err = 0;
+	if (S_ISREG(inode->i_mode)) {
+		btop = au_ibtop(inode);
+		h_inode = au_h_iptr(inode, btop);
+		err = au_dy_iaop(inode, btop, h_inode);
+	}
+	return err;
+}
+
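+/* re-apply the current 'dio' setting to all the registered aop keys */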
+void au_dy_arefresh(int do_dx)
+{
+	struct au_sphlhead *sphl;
+	struct hlist_head *head;
+	struct au_dykey *key;
+
+	sphl = dynop + AuDy_AOP;
+	head = &sphl->head;
+	spin_lock(&sphl->spin);
+	hlist_for_each_entry(key, head, dk_hnode)
+		dy_adx((void *)key, do_dx);
+	spin_unlock(&sphl->spin);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void __init au_dy_init(void)
+{
+	int i;
+
+	/* make sure that 'struct au_dykey *' can be any type */
+	BUILD_BUG_ON(offsetof(struct au_dyaop, da_key));
+
+	for (i = 0; i < AuDyLast; i++)
+		au_sphl_init(dynop + i);
+}
+
+void au_dy_fin(void)
+{
+	int i;
+
+	for (i = 0; i < AuDyLast; i++)
+		WARN_ON(!hlist_empty(&dynop[i].head));
+}
diff --git b/fs/aufs/dynop.h b/fs/aufs/dynop.h
new file mode 100644
index 0000000..5db2da2
--- /dev/null
+++ b/fs/aufs/dynop.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2010-2016 Junjiro R. Okajima
+ */
+
+/*
+ * dynamically customizable operations (for regular files only)
+ */
+
+#ifndef __AUFS_DYNOP_H__
+#define __AUFS_DYNOP_H__
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>
+#include <linux/kref.h>
+
+enum {AuDy_AOP, AuDyLast};
+
+struct au_dynop {
+	int						dy_type;
+	union {
+		const void				*dy_hop;
+		const struct address_space_operations	*dy_haop;
+	};
+};
+
+struct au_dykey {
+	union {
+		struct hlist_node	dk_hnode;
+		struct rcu_head		dk_rcu;
+	};
+	struct au_dynop		dk_op;
+
+	/*
+	 * while this key is in the branch-local array, its kref is held. when
+	 * the branch is removed, the kref is put.
+	 */
+	struct kref		dk_kref;
+};
+
+/* not unioned, since their sizes are very different from each other */
+struct au_dyaop {
+	struct au_dykey			da_key;
+	struct address_space_operations	da_op; /* not const */
+};
+
+/* ---------------------------------------------------------------------- */
+
+/* dynop.c */
+struct au_branch;
+void au_dy_put(struct au_dykey *key);
+int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex,
+		struct inode *h_inode);
+int au_dy_irefresh(struct inode *inode);
+void au_dy_arefresh(int do_dio);
+
+void __init au_dy_init(void);
+void au_dy_fin(void);
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_DYNOP_H__ */
diff --git b/fs/aufs/export.c b/fs/aufs/export.c
new file mode 100644
index 0000000..6b5a7be
--- /dev/null
+++ b/fs/aufs/export.c
@@ -0,0 +1,824 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * export via nfs
+ */
+
+#include <linux/exportfs.h>
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <linux/nsproxy.h>
+#include <linux/random.h>
+#include <linux/writeback.h>
+#include "../fs/mount.h"
+#include "aufs.h"
+
+union conv {
+#ifdef CONFIG_AUFS_INO_T_64
+	__u32 a[2];
+#else
+	__u32 a[1];
+#endif
+	ino_t ino;
+};
+
+static ino_t decode_ino(__u32 *a)
+{
+	union conv u;
+
+	BUILD_BUG_ON(sizeof(u.ino) != sizeof(u.a));
+	u.a[0] = a[0];
+#ifdef CONFIG_AUFS_INO_T_64
+	u.a[1] = a[1];
+#endif
+	return u.ino;
+}
+
+static void encode_ino(__u32 *a, ino_t ino)
+{
+	union conv u;
+
+	u.ino = ino;
+	a[0] = u.a[0];
+#ifdef CONFIG_AUFS_INO_T_64
+	a[1] = u.a[1];
+#endif
+}
+
+/* NFS file handle */
+enum {
+	Fh_br_id,
+	Fh_sigen,
+#ifdef CONFIG_AUFS_INO_T_64
+	/* support 64bit inode number */
+	Fh_ino1,
+	Fh_ino2,
+	Fh_dir_ino1,
+	Fh_dir_ino2,
+#else
+	Fh_ino1,
+	Fh_dir_ino1,
+#endif
+	Fh_igen,
+	Fh_h_type,
+	Fh_tail,
+
+	Fh_ino = Fh_ino1,
+	Fh_dir_ino = Fh_dir_ino1
+};
+
+static int au_test_anon(struct dentry *dentry)
+{
+	/* note: read d_flags without d_lock */
+	return !!(dentry->d_flags & DCACHE_DISCONNECTED);
+}
+
+int au_test_nfsd(void)
+{
+	int ret;
+	struct task_struct *tsk = current;
+	char comm[sizeof(tsk->comm)];
+
+	ret = 0;
+	if (tsk->flags & PF_KTHREAD) {
+		get_task_comm(comm, tsk);
+		ret = !strcmp(comm, "nfsd");
+	}
+
+	return ret;
+}
+
+/* ---------------------------------------------------------------------- */
+/* inode generation external table */
+
+void au_xigen_inc(struct inode *inode)
+{
+	loff_t pos;
+	ssize_t sz;
+	__u32 igen;
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+
+	sb = inode->i_sb;
+	AuDebugOn(!au_opt_test(au_mntflags(sb), XINO));
+
+	sbinfo = au_sbi(sb);
+	pos = inode->i_ino;
+	pos *= sizeof(igen);
+	igen = inode->i_generation + 1;
+	sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xigen, &igen,
+			 sizeof(igen), &pos);
+	if (sz == sizeof(igen))
+		return; /* success */
+
+	if (unlikely(sz >= 0))
+		AuIOErr("xigen error (%zd)\n", sz);
+}
+
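+/*
+ * set i_generation from the xigen file. when the file does not cover this
+ * inode number yet, allocate a new generation and record it.
+ */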
+int au_xigen_new(struct inode *inode)
+{
+	int err;
+	loff_t pos;
+	ssize_t sz;
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+	struct file *file;
+
+	err = 0;
+	/* todo: dirty, at mount time */
+	if (inode->i_ino == AUFS_ROOT_INO)
+		goto out;
+	sb = inode->i_sb;
+	SiMustAnyLock(sb);
+	if (unlikely(!au_opt_test(au_mntflags(sb), XINO)))
+		goto out;
+
+	err = -EFBIG;
+	pos = inode->i_ino;
+	if (unlikely(au_loff_max / sizeof(inode->i_generation) - 1 < pos)) {
+		AuIOErr1("too large i%lld\n", pos);
+		goto out;
+	}
+	pos *= sizeof(inode->i_generation);
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	file = sbinfo->si_xigen;
+	BUG_ON(!file);
+
+	if (vfsub_f_size_read(file)
+	    < pos + sizeof(inode->i_generation)) {
+		inode->i_generation = atomic_inc_return(&sbinfo->si_xigen_next);
+		sz = xino_fwrite(sbinfo->si_xwrite, file, &inode->i_generation,
+				 sizeof(inode->i_generation), &pos);
+	} else
+		sz = xino_fread(sbinfo->si_xread, file, &inode->i_generation,
+				sizeof(inode->i_generation), &pos);
+	if (sz == sizeof(inode->i_generation))
+		goto out; /* success */
+
+	err = sz;
+	if (unlikely(sz >= 0)) {
+		err = -EIO;
+		AuIOErr("xigen error (%zd)\n", sz);
+	}
+
+out:
+	return err;
+}
+
+int au_xigen_set(struct super_block *sb, struct file *base)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+	struct file *file;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	file = au_xino_create2(base, sbinfo->si_xigen);
+	err = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto out;
+	err = 0;
+	if (sbinfo->si_xigen)
+		fput(sbinfo->si_xigen);
+	sbinfo->si_xigen = file;
+
+out:
+	return err;
+}
+
+void au_xigen_clr(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	if (sbinfo->si_xigen) {
+		fput(sbinfo->si_xigen);
+		sbinfo->si_xigen = NULL;
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+static struct dentry *decode_by_ino(struct super_block *sb, ino_t ino,
+				    ino_t dir_ino)
+{
+	struct dentry *dentry, *d;
+	struct inode *inode;
+	unsigned int sigen;
+
+	dentry = NULL;
+	inode = ilookup(sb, ino);
+	if (!inode)
+		goto out;
+
+	dentry = ERR_PTR(-ESTALE);
+	sigen = au_sigen(sb);
+	if (unlikely(au_is_bad_inode(inode)
+		     || IS_DEADDIR(inode)
+		     || sigen != au_iigen(inode, NULL)))
+		goto out_iput;
+
+	dentry = NULL;
+	if (!dir_ino || S_ISDIR(inode->i_mode))
+		dentry = d_find_alias(inode);
+	else {
+		spin_lock(&inode->i_lock);
+		hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) {
+			spin_lock(&d->d_lock);
+			if (!au_test_anon(d)
+			    && d_inode(d->d_parent)->i_ino == dir_ino) {
+				dentry = dget_dlock(d);
+				spin_unlock(&d->d_lock);
+				break;
+			}
+			spin_unlock(&d->d_lock);
+		}
+		spin_unlock(&inode->i_lock);
+	}
+	if (unlikely(dentry && au_digen_test(dentry, sigen))) {
+		/* need to refresh */
+		dput(dentry);
+		dentry = NULL;
+	}
+
+out_iput:
+	iput(inode);
+out:
+	AuTraceErrPtr(dentry);
+	return dentry;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* todo: dirty? */
+/* if exportfs_decode_fh() passed vfsmount*, we could be happy */
+
+struct au_compare_mnt_args {
+	/* input */
+	struct super_block *sb;
+
+	/* output */
+	struct vfsmount *mnt;
+};
+
+static int au_compare_mnt(struct vfsmount *mnt, void *arg)
+{
+	struct au_compare_mnt_args *a = arg;
+
+	if (mnt->mnt_sb != a->sb)
+		return 0;
+	a->mnt = mntget(mnt);
+	return 1;
+}
+
+static struct vfsmount *au_mnt_get(struct super_block *sb)
+{
+	int err;
+	struct path root;
+	struct au_compare_mnt_args args = {
+		.sb = sb
+	};
+
+	get_fs_root(current->fs, &root);
+	rcu_read_lock();
+	err = iterate_mounts(au_compare_mnt, &args, root.mnt);
+	rcu_read_unlock();
+	path_put(&root);
+	AuDebugOn(!err);
+	AuDebugOn(!args.mnt);
+	return args.mnt;
+}
+
+struct au_nfsd_si_lock {
+	unsigned int sigen;
+	aufs_bindex_t bindex, br_id;
+	unsigned char force_lock;
+};
+
+static int si_nfsd_read_lock(struct super_block *sb,
+			     struct au_nfsd_si_lock *nsi_lock)
+{
+	int err;
+	aufs_bindex_t bindex;
+
+	si_read_lock(sb, AuLock_FLUSH);
+
+	/* branch id may be wrapped around */
+	err = 0;
+	bindex = au_br_index(sb, nsi_lock->br_id);
+	if (bindex >= 0 && nsi_lock->sigen + AUFS_BRANCH_MAX > au_sigen(sb))
+		goto out; /* success */
+
+	err = -ESTALE;
+	bindex = -1;
+	if (!nsi_lock->force_lock)
+		si_read_unlock(sb);
+
+out:
+	nsi_lock->bindex = bindex;
+	return err;
+}
+
+struct find_name_by_ino {
+	struct dir_context ctx;
+	int called, found;
+	ino_t ino;
+	char *name;
+	int namelen;
+};
+
+static int
+find_name_by_ino(struct dir_context *ctx, const char *name, int namelen,
+		 loff_t offset, u64 ino, unsigned int d_type)
+{
+	struct find_name_by_ino *a = container_of(ctx, struct find_name_by_ino,
+						  ctx);
+
+	a->called++;
+	if (a->ino != ino)
+		return 0;
+
+	memcpy(a->name, name, namelen);
+	a->namelen = namelen;
+	a->found = 1;
+	return 1;
+}
+
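+/*
+ * scan the dir at @path for the entry whose inode number is @ino, then look
+ * the found name up under the same parent
+ */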
+static struct dentry *au_lkup_by_ino(struct path *path, ino_t ino,
+				     struct au_nfsd_si_lock *nsi_lock)
+{
+	struct dentry *dentry, *parent;
+	struct file *file;
+	struct inode *dir;
+	struct find_name_by_ino arg = {
+		.ctx = {
+			.actor = find_name_by_ino
+		}
+	};
+	int err;
+
+	parent = path->dentry;
+	if (nsi_lock)
+		si_read_unlock(parent->d_sb);
+	file = vfsub_dentry_open(path, au_dir_roflags);
+	dentry = (void *)file;
+	if (IS_ERR(file))
+		goto out;
+
+	dentry = ERR_PTR(-ENOMEM);
+	arg.name = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!arg.name))
+		goto out_file;
+	arg.ino = ino;
+	arg.found = 0;
+	do {
+		arg.called = 0;
+		/* smp_mb(); */
+		err = vfsub_iterate_dir(file, &arg.ctx);
+	} while (!err && !arg.found && arg.called);
+	dentry = ERR_PTR(err);
+	if (unlikely(err))
+		goto out_name;
+	/* instead of ENOENT */
+	dentry = ERR_PTR(-ESTALE);
+	if (!arg.found)
+		goto out_name;
+
+	/* do not call vfsub_lkup_one() */
+	dir = d_inode(parent);
+	dentry = vfsub_lookup_one_len_unlocked(arg.name, parent, arg.namelen);
+	AuTraceErrPtr(dentry);
+	if (IS_ERR(dentry))
+		goto out_name;
+	AuDebugOn(au_test_anon(dentry));
+	if (unlikely(d_really_is_negative(dentry))) {
+		dput(dentry);
+		dentry = ERR_PTR(-ENOENT);
+	}
+
+out_name:
+	au_delayed_free_page((unsigned long)arg.name);
+out_file:
+	fput(file);
+out:
+	if (unlikely(nsi_lock
+		     && si_nfsd_read_lock(parent->d_sb, nsi_lock) < 0))
+		if (!IS_ERR(dentry)) {
+			dput(dentry);
+			dentry = ERR_PTR(-ESTALE);
+		}
+	AuTraceErrPtr(dentry);
+	return dentry;
+}
+
+static struct dentry *decode_by_dir_ino(struct super_block *sb, ino_t ino,
+					ino_t dir_ino,
+					struct au_nfsd_si_lock *nsi_lock)
+{
+	struct dentry *dentry;
+	struct path path;
+
+	if (dir_ino != AUFS_ROOT_INO) {
+		path.dentry = decode_by_ino(sb, dir_ino, 0);
+		dentry = path.dentry;
+		if (!path.dentry || IS_ERR(path.dentry))
+			goto out;
+		AuDebugOn(au_test_anon(path.dentry));
+	} else
+		path.dentry = dget(sb->s_root);
+
+	path.mnt = au_mnt_get(sb);
+	dentry = au_lkup_by_ino(&path, ino, nsi_lock);
+	path_put(&path);
+
+out:
+	AuTraceErrPtr(dentry);
+	return dentry;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int h_acceptable(void *expv, struct dentry *dentry)
+{
+	return 1;
+}
+
+static char *au_build_path(struct dentry *h_parent, struct path *h_rootpath,
+			   char *buf, int len, struct super_block *sb)
+{
+	char *p;
+	int n;
+	struct path path;
+
+	p = d_path(h_rootpath, buf, len);
+	if (IS_ERR(p))
+		goto out;
+	n = strlen(p);
+
+	path.mnt = h_rootpath->mnt;
+	path.dentry = h_parent;
+	p = d_path(&path, buf, len);
+	if (IS_ERR(p))
+		goto out;
+	if (n != 1)
+		p += n;
+
+	path.mnt = au_mnt_get(sb);
+	path.dentry = sb->s_root;
+	p = d_path(&path, buf, len - strlen(p));
+	mntput(path.mnt);
+	if (IS_ERR(p))
+		goto out;
+	if (n != 1)
+		p[strlen(p)] = '/';
+
+out:
+	AuTraceErrPtr(p);
+	return p;
+}
+
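+/*
+ * the last resort: decode the lower file handle, build the corresponding
+ * path under the aufs mount, walk it, and search the dir for @ino when the
+ * walked dentry does not match directly.
+ */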
+static
+struct dentry *decode_by_path(struct super_block *sb, ino_t ino, __u32 *fh,
+			      int fh_len, struct au_nfsd_si_lock *nsi_lock)
+{
+	struct dentry *dentry, *h_parent, *root;
+	struct super_block *h_sb;
+	char *pathname, *p;
+	struct vfsmount *h_mnt;
+	struct au_branch *br;
+	int err;
+	struct path path;
+
+	br = au_sbr(sb, nsi_lock->bindex);
+	h_mnt = au_br_mnt(br);
+	h_sb = h_mnt->mnt_sb;
+	/* todo: call lower fh_to_dentry()? fh_to_parent()? */
+	lockdep_off();
+	h_parent = exportfs_decode_fh(h_mnt, (void *)(fh + Fh_tail),
+				      fh_len - Fh_tail, fh[Fh_h_type],
+				      h_acceptable, /*context*/NULL);
+	lockdep_on();
+	dentry = h_parent;
+	if (unlikely(!h_parent || IS_ERR(h_parent))) {
+		AuWarn1("%s decode_fh failed, %ld\n",
+			au_sbtype(h_sb), PTR_ERR(h_parent));
+		goto out;
+	}
+	dentry = NULL;
+	if (unlikely(au_test_anon(h_parent))) {
+		AuWarn1("%s decode_fh returned a disconnected dentry\n",
+			au_sbtype(h_sb));
+		goto out_h_parent;
+	}
+
+	dentry = ERR_PTR(-ENOMEM);
+	pathname = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!pathname))
+		goto out_h_parent;
+
+	root = sb->s_root;
+	path.mnt = h_mnt;
+	di_read_lock_parent(root, !AuLock_IR);
+	path.dentry = au_h_dptr(root, nsi_lock->bindex);
+	di_read_unlock(root, !AuLock_IR);
+	p = au_build_path(h_parent, &path, pathname, PAGE_SIZE, sb);
+	dentry = (void *)p;
+	if (IS_ERR(p))
+		goto out_pathname;
+
+	si_read_unlock(sb);
+	err = vfsub_kern_path(p, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
+	dentry = ERR_PTR(err);
+	if (unlikely(err))
+		goto out_relock;
+
+	dentry = ERR_PTR(-ENOENT);
+	AuDebugOn(au_test_anon(path.dentry));
+	if (unlikely(d_really_is_negative(path.dentry)))
+		goto out_path;
+
+	if (ino != d_inode(path.dentry)->i_ino)
+		dentry = au_lkup_by_ino(&path, ino, /*nsi_lock*/NULL);
+	else
+		dentry = dget(path.dentry);
+
+out_path:
+	path_put(&path);
+out_relock:
+	if (unlikely(si_nfsd_read_lock(sb, nsi_lock) < 0))
+		if (!IS_ERR(dentry)) {
+			dput(dentry);
+			dentry = ERR_PTR(-ESTALE);
+		}
+out_pathname:
+	au_delayed_free_page((unsigned long)pathname);
+out_h_parent:
+	dput(h_parent);
+out:
+	AuTraceErrPtr(dentry);
+	return dentry;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static struct dentry *
+aufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
+		  int fh_type)
+{
+	struct dentry *dentry;
+	__u32 *fh = fid->raw;
+	struct au_branch *br;
+	ino_t ino, dir_ino;
+	struct au_nfsd_si_lock nsi_lock = {
+		.force_lock	= 0
+	};
+
+	dentry = ERR_PTR(-ESTALE);
+	/* it should never happen, but the file handle is unreliable */
+	if (unlikely(fh_len < Fh_tail))
+		goto out;
+	nsi_lock.sigen = fh[Fh_sigen];
+	nsi_lock.br_id = fh[Fh_br_id];
+
+	/* branch id may be wrapped around */
+	br = NULL;
+	if (unlikely(si_nfsd_read_lock(sb, &nsi_lock)))
+		goto out;
+	nsi_lock.force_lock = 1;
+
+	/* is this inode still cached? */
+	ino = decode_ino(fh + Fh_ino);
+	/* it should never happen */
+	if (unlikely(ino == AUFS_ROOT_INO))
+		goto out_unlock;
+
+	dir_ino = decode_ino(fh + Fh_dir_ino);
+	dentry = decode_by_ino(sb, ino, dir_ino);
+	if (IS_ERR(dentry))
+		goto out_unlock;
+	if (dentry)
+		goto accept;
+
+	/* is the parent dir cached? */
+	br = au_sbr(sb, nsi_lock.bindex);
+	au_br_get(br);
+	dentry = decode_by_dir_ino(sb, ino, dir_ino, &nsi_lock);
+	if (IS_ERR(dentry))
+		goto out_unlock;
+	if (dentry)
+		goto accept;
+
+	/* lookup path */
+	dentry = decode_by_path(sb, ino, fh, fh_len, &nsi_lock);
+	if (IS_ERR(dentry))
+		goto out_unlock;
+	if (unlikely(!dentry))
+		/* todo?: make it ESTALE */
+		goto out_unlock;
+
+accept:
+	if (!au_digen_test(dentry, au_sigen(sb))
+	    && d_inode(dentry)->i_generation == fh[Fh_igen])
+		goto out_unlock; /* success */
+
+	dput(dentry);
+	dentry = ERR_PTR(-ESTALE);
+out_unlock:
+	if (br)
+		au_br_put(br);
+	si_read_unlock(sb);
+out:
+	AuTraceErrPtr(dentry);
+	return dentry;
+}
+
+#if 0 /* reserved for future use */
+/* support subtreecheck option */
+static struct dentry *aufs_fh_to_parent(struct super_block *sb, struct fid *fid,
+					int fh_len, int fh_type)
+{
+	struct dentry *parent;
+	__u32 *fh = fid->raw;
+	ino_t dir_ino;
+
+	dir_ino = decode_ino(fh + Fh_dir_ino);
+	parent = decode_by_ino(sb, dir_ino, 0);
+	if (IS_ERR(parent))
+		goto out;
+	if (!parent)
+		parent = decode_by_path(sb, au_br_index(sb, fh[Fh_br_id]),
+					dir_ino, fh, fh_len);
+
+out:
+	AuTraceErrPtr(parent);
+	return parent;
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
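+/*
+ * store the branch id, the superblock generation, the inode number and
+ * generation of @inode, the inode number of its parent dir, and the lower
+ * file handle of the parent into @fh.
+ */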
+static int aufs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
+			  struct inode *dir)
+{
+	int err;
+	aufs_bindex_t bindex;
+	struct super_block *sb, *h_sb;
+	struct dentry *dentry, *parent, *h_parent;
+	struct inode *h_dir;
+	struct au_branch *br;
+
+	err = -ENOSPC;
+	if (unlikely(*max_len <= Fh_tail)) {
+		AuWarn1("NFSv2 client (max_len %d)?\n", *max_len);
+		goto out;
+	}
+
+	err = FILEID_ROOT;
+	if (inode->i_ino == AUFS_ROOT_INO) {
+		AuDebugOn(inode->i_ino != AUFS_ROOT_INO);
+		goto out;
+	}
+
+	h_parent = NULL;
+	sb = inode->i_sb;
+	err = si_read_lock(sb, AuLock_FLUSH);
+	if (unlikely(err))
+		goto out;
+
+#ifdef CONFIG_AUFS_DEBUG
+	if (unlikely(!au_opt_test(au_mntflags(sb), XINO)))
+		AuWarn1("NFS-exporting requires xino\n");
+#endif
+	err = -EIO;
+	parent = NULL;
+	ii_read_lock_child(inode);
+	bindex = au_ibtop(inode);
+	if (!dir) {
+		dentry = d_find_any_alias(inode);
+		if (unlikely(!dentry))
+			goto out_unlock;
+		AuDebugOn(au_test_anon(dentry));
+		parent = dget_parent(dentry);
+		dput(dentry);
+		if (unlikely(!parent))
+			goto out_unlock;
+		if (d_really_is_positive(parent))
+			dir = d_inode(parent);
+	}
+
+	ii_read_lock_parent(dir);
+	h_dir = au_h_iptr(dir, bindex);
+	ii_read_unlock(dir);
+	if (unlikely(!h_dir))
+		goto out_parent;
+	h_parent = d_find_any_alias(h_dir);
+	if (unlikely(!h_parent))
+		goto out_hparent;
+
+	err = -EPERM;
+	br = au_sbr(sb, bindex);
+	h_sb = au_br_sb(br);
+	if (unlikely(!h_sb->s_export_op)) {
+		AuErr1("%s branch is not exportable\n", au_sbtype(h_sb));
+		goto out_hparent;
+	}
+
+	fh[Fh_br_id] = br->br_id;
+	fh[Fh_sigen] = au_sigen(sb);
+	encode_ino(fh + Fh_ino, inode->i_ino);
+	encode_ino(fh + Fh_dir_ino, dir->i_ino);
+	fh[Fh_igen] = inode->i_generation;
+
+	*max_len -= Fh_tail;
+	fh[Fh_h_type] = exportfs_encode_fh(h_parent, (void *)(fh + Fh_tail),
+					   max_len,
+					   /*connectable or subtreecheck*/0);
+	err = fh[Fh_h_type];
+	*max_len += Fh_tail;
+	/* todo: macros? */
+	if (err != FILEID_INVALID)
+		err = 99;
+	else
+		AuWarn1("%s encode_fh failed\n", au_sbtype(h_sb));
+
+out_hparent:
+	dput(h_parent);
+out_parent:
+	dput(parent);
+out_unlock:
+	ii_read_unlock(inode);
+	si_read_unlock(sb);
+out:
+	if (unlikely(err < 0))
+		err = FILEID_INVALID;
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
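+/*
+ * pass commit_metadata down to the top branch; if the lower fs does not
+ * support it, fall back to a metadata-only sync_inode().
+ */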
+static int aufs_commit_metadata(struct inode *inode)
+{
+	int err;
+	aufs_bindex_t bindex;
+	struct super_block *sb;
+	struct inode *h_inode;
+	int (*f)(struct inode *inode);
+
+	sb = inode->i_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+	ii_write_lock_child(inode);
+	bindex = au_ibtop(inode);
+	AuDebugOn(bindex < 0);
+	h_inode = au_h_iptr(inode, bindex);
+
+	f = h_inode->i_sb->s_export_op->commit_metadata;
+	if (f)
+		err = f(h_inode);
+	else {
+		struct writeback_control wbc = {
+			.sync_mode	= WB_SYNC_ALL,
+			.nr_to_write	= 0 /* metadata only */
+		};
+
+		err = sync_inode(h_inode, &wbc);
+	}
+
+	au_cpup_attr_timesizes(inode);
+	ii_write_unlock(inode);
+	si_read_unlock(sb);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static struct export_operations aufs_export_op = {
+	.fh_to_dentry		= aufs_fh_to_dentry,
+	/* .fh_to_parent	= aufs_fh_to_parent, */
+	.encode_fh		= aufs_encode_fh,
+	.commit_metadata	= aufs_commit_metadata
+};
+
+void au_export_init(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+	__u32 u;
+
+	BUILD_BUG_ON_MSG(IS_BUILTIN(CONFIG_AUFS_FS)
+			 && IS_MODULE(CONFIG_EXPORTFS),
+			 AUFS_NAME ": unsupported configuration "
+			 "CONFIG_EXPORTFS=m and CONFIG_AUFS_FS=y");
+
+	sb->s_export_op = &aufs_export_op;
+	sbinfo = au_sbi(sb);
+	sbinfo->si_xigen = NULL;
+	get_random_bytes(&u, sizeof(u));
+	BUILD_BUG_ON(sizeof(u) != sizeof(int));
+	atomic_set(&sbinfo->si_xigen_next, u);
+}
diff --git b/fs/aufs/f_op.c b/fs/aufs/f_op.c
new file mode 100644
index 0000000..00475fb
--- /dev/null
+++ b/fs/aufs/f_op.c
@@ -0,0 +1,759 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * file and vm operations
+ */
+
+#include <linux/aio.h>
+#include <linux/fs_stack.h>
+#include <linux/mman.h>
+#include <linux/security.h>
+#include "aufs.h"
+
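+/* open the lower file on the top branch and install it into the finfo */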
+int au_do_open_nondir(struct file *file, int flags, struct file *h_file)
+{
+	int err;
+	aufs_bindex_t bindex;
+	struct dentry *dentry, *h_dentry;
+	struct au_finfo *finfo;
+	struct inode *h_inode;
+
+	FiMustWriteLock(file);
+
+	err = 0;
+	dentry = file->f_path.dentry;
+	AuDebugOn(IS_ERR_OR_NULL(dentry));
+	finfo = au_fi(file);
+	memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop));
+	atomic_set(&finfo->fi_mmapped, 0);
+	bindex = au_dbtop(dentry);
+	if (!h_file) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		err = vfsub_test_mntns(file->f_path.mnt, h_dentry->d_sb);
+		if (unlikely(err))
+			goto out;
+		h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0);
+	} else {
+		h_dentry = h_file->f_path.dentry;
+		err = vfsub_test_mntns(file->f_path.mnt, h_dentry->d_sb);
+		if (unlikely(err))
+			goto out;
+		get_file(h_file);
+	}
+	if (IS_ERR(h_file))
+		err = PTR_ERR(h_file);
+	else {
+		if ((flags & __O_TMPFILE)
+		    && !(flags & O_EXCL)) {
+			h_inode = file_inode(h_file);
+			spin_lock(&h_inode->i_lock);
+			h_inode->i_state |= I_LINKABLE;
+			spin_unlock(&h_inode->i_lock);
+		}
+		au_set_fbtop(file, bindex);
+		au_set_h_fptr(file, bindex, h_file);
+		au_update_figen(file);
+		/* todo: necessary? */
+		/* file->f_ra = h_file->f_ra; */
+	}
+
+out:
+	return err;
+}
+
+static int aufs_open_nondir(struct inode *inode __maybe_unused,
+			    struct file *file)
+{
+	int err;
+	struct super_block *sb;
+	struct au_do_open_args args = {
+		.open	= au_do_open_nondir
+	};
+
+	AuDbg("%pD, f_flags 0x%x, f_mode 0x%x\n",
+	      file, vfsub_file_flags(file), file->f_mode);
+
+	sb = file->f_path.dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH);
+	err = au_do_open(file, &args);
+	si_read_unlock(sb);
+	return err;
+}
+
+int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file)
+{
+	struct au_finfo *finfo;
+	aufs_bindex_t bindex;
+	int delayed;
+
+	finfo = au_fi(file);
+	au_sphl_del(&finfo->fi_hlist,
+		    &au_sbi(file->f_path.dentry->d_sb)->si_files);
+	bindex = finfo->fi_btop;
+	if (bindex >= 0)
+		au_set_h_fptr(file, bindex, NULL);
+
+	delayed = (current->flags & PF_KTHREAD) || in_interrupt();
+	au_finfo_fin(file, delayed);
+	return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_do_flush_nondir(struct file *file, fl_owner_t id)
+{
+	int err;
+	struct file *h_file;
+
+	err = 0;
+	h_file = au_hf_top(file);
+	if (h_file)
+		err = vfsub_flush(h_file, id);
+	return err;
+}
+
+static int aufs_flush_nondir(struct file *file, fl_owner_t id)
+{
+	return au_do_flush(file, id, au_do_flush_nondir);
+}
+
+/* ---------------------------------------------------------------------- */
+/*
+ * read and write functions acquire [fdi]_rwsem once, but release it before
+ * mmap_sem. This is to avoid a race condition with mmap(2). Releasing these
+ * aufs rwsems should be safe: since si_rwsem is kept, no branch-management
+ * and no harmful copy-up can happen. Actually a copy-up may happen in the
+ * read functions after [fdi]_rwsem are released, but it should be harmless.
+ */
+
+/* Callers should call au_read_post() or fput() in the end */
+struct file *au_read_pre(struct file *file, int keep_fi)
+{
+	struct file *h_file;
+	int err;
+
+	err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0);
+	if (!err) {
+		di_read_unlock(file->f_path.dentry, AuLock_IR);
+		h_file = au_hf_top(file);
+		get_file(h_file);
+		if (!keep_fi)
+			fi_read_unlock(file);
+	} else
+		h_file = ERR_PTR(err);
+
+	return h_file;
+}
+
+static void au_read_post(struct inode *inode, struct file *h_file)
+{
+	/* update without a lock; I do not think it is a problem */
+	fsstack_copy_attr_atime(inode, file_inode(h_file));
+	fput(h_file);
+}
+
+struct au_write_pre {
+	blkcnt_t blks;
+	aufs_bindex_t btop;
+};
+
+/*
+ * return with iinfo write-locked
+ * callers should call au_write_post() or iinfo_write_unlock() + fput() in the
+ * end
+ */
+static struct file *au_write_pre(struct file *file, int do_ready,
+				 struct au_write_pre *wpre)
+{
+	struct file *h_file;
+	struct dentry *dentry;
+	int err;
+	struct au_pin pin;
+
+	err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
+	h_file = ERR_PTR(err);
+	if (unlikely(err))
+		goto out;
+
+	dentry = file->f_path.dentry;
+	if (do_ready) {
+		err = au_ready_to_write(file, -1, &pin);
+		if (unlikely(err)) {
+			h_file = ERR_PTR(err);
+			di_write_unlock(dentry);
+			goto out_fi;
+		}
+	}
+
+	di_downgrade_lock(dentry, /*flags*/0);
+	if (wpre)
+		wpre->btop = au_fbtop(file);
+	h_file = au_hf_top(file);
+	get_file(h_file);
+	if (wpre)
+		wpre->blks = file_inode(h_file)->i_blocks;
+	if (do_ready)
+		au_unpin(&pin);
+	di_read_unlock(dentry, /*flags*/0);
+
+out_fi:
+	fi_write_unlock(file);
+out:
+	return h_file;
+}
+
+static void au_write_post(struct inode *inode, struct file *h_file,
+			  struct au_write_pre *wpre, ssize_t written)
+{
+	struct inode *h_inode;
+
+	au_cpup_attr_timesizes(inode);
+	AuDebugOn(au_ibtop(inode) != wpre->btop);
+	h_inode = file_inode(h_file);
+	inode->i_mode = h_inode->i_mode;
+	ii_write_unlock(inode);
+	fput(h_file);
+
+	/* AuDbg("blks %llu, %llu\n", (u64)blks, (u64)h_inode->i_blocks); */
+	if (written > 0)
+		au_fhsm_wrote(inode->i_sb, wpre->btop,
+			      /*force*/h_inode->i_blocks > wpre->blks);
+}
+
+static ssize_t aufs_read(struct file *file, char __user *buf, size_t count,
+			 loff_t *ppos)
+{
+	ssize_t err;
+	struct inode *inode;
+	struct file *h_file;
+	struct super_block *sb;
+
+	inode = file_inode(file);
+	sb = inode->i_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+
+	h_file = au_read_pre(file, /*keep_fi*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	/* the file data may be obsoleted by a concurrent copy-up; not a problem */
+	err = vfsub_read_u(h_file, buf, count, ppos);
+	/* todo: necessary? */
+	/* file->f_ra = h_file->f_ra; */
+	au_read_post(inode, h_file);
+
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+/*
+ * todo: very ugly
+ * it safely acquires both i_mutex and si_rwsem for read.
+ * if the plink maintenance mode continues forever (that is the real problem),
+ * this may loop forever.
+ */
+static void au_mtx_and_read_lock(struct inode *inode)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+
+	while (1) {
+		inode_lock(inode);
+		err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+		if (!err)
+			break;
+		inode_unlock(inode);
+		si_read_lock(sb, AuLock_NOPLMW);
+		si_read_unlock(sb);
+	}
+}
+
+static ssize_t aufs_write(struct file *file, const char __user *ubuf,
+			  size_t count, loff_t *ppos)
+{
+	ssize_t err;
+	struct au_write_pre wpre;
+	struct inode *inode;
+	struct file *h_file;
+	char __user *buf = (char __user *)ubuf;
+
+	inode = file_inode(file);
+	au_mtx_and_read_lock(inode);
+
+	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	err = vfsub_write_u(h_file, buf, count, ppos);
+	au_write_post(inode, h_file, &wpre, err);
+
+out:
+	si_read_unlock(inode->i_sb);
+	inode_unlock(inode);
+	return err;
+}
+
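+/*
+ * temporarily replace kio->ki_filp by the lower file and call its
+ * read_iter/write_iter directly.
+ */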
+static ssize_t au_do_iter(struct file *h_file, int rw, struct kiocb *kio,
+			  struct iov_iter *iov_iter)
+{
+	ssize_t err;
+	struct file *file;
+	ssize_t (*iter)(struct kiocb *, struct iov_iter *);
+
+	err = security_file_permission(h_file, rw);
+	if (unlikely(err))
+		goto out;
+
+	err = -ENOSYS;
+	iter = NULL;
+	if (rw == MAY_READ)
+		iter = h_file->f_op->read_iter;
+	else if (rw == MAY_WRITE)
+		iter = h_file->f_op->write_iter;
+
+	file = kio->ki_filp;
+	kio->ki_filp = h_file;
+	if (iter) {
+		lockdep_off();
+		err = iter(kio, iov_iter);
+		lockdep_on();
+	} else
+		/* currently there is no such fs */
+		WARN_ON_ONCE(1);
+	kio->ki_filp = file;
+
+out:
+	return err;
+}
+
+static ssize_t aufs_read_iter(struct kiocb *kio, struct iov_iter *iov_iter)
+{
+	ssize_t err;
+	struct file *file, *h_file;
+	struct inode *inode;
+	struct super_block *sb;
+
+	file = kio->ki_filp;
+	inode = file_inode(file);
+	sb = inode->i_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+
+	h_file = au_read_pre(file, /*keep_fi*/1);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	if (au_test_loopback_kthread()) {
+		au_warn_loopback(h_file->f_path.dentry->d_sb);
+		if (file->f_mapping != h_file->f_mapping) {
+			file->f_mapping = h_file->f_mapping;
+			smp_mb(); /* unnecessary? */
+		}
+	}
+	fi_read_unlock(file);
+
+	err = au_do_iter(h_file, MAY_READ, kio, iov_iter);
+	/* todo: necessary? */
+	/* file->f_ra = h_file->f_ra; */
+	au_read_post(inode, h_file);
+
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+static ssize_t aufs_write_iter(struct kiocb *kio, struct iov_iter *iov_iter)
+{
+	ssize_t err;
+	struct au_write_pre wpre;
+	struct inode *inode;
+	struct file *file, *h_file;
+
+	file = kio->ki_filp;
+	inode = file_inode(file);
+	au_mtx_and_read_lock(inode);
+
+	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	err = au_do_iter(h_file, MAY_WRITE, kio, iov_iter);
+	au_write_post(inode, h_file, &wpre, err);
+
+out:
+	si_read_unlock(inode->i_sb);
+	inode_unlock(inode);
+	return err;
+}
+
+static ssize_t aufs_splice_read(struct file *file, loff_t *ppos,
+				struct pipe_inode_info *pipe, size_t len,
+				unsigned int flags)
+{
+	ssize_t err;
+	struct file *h_file;
+	struct inode *inode;
+	struct super_block *sb;
+
+	inode = file_inode(file);
+	sb = inode->i_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+
+	h_file = au_read_pre(file, /*keep_fi*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	err = vfsub_splice_to(h_file, ppos, pipe, len, flags);
+	/* todo: necessary? */
+	/* file->f_ra = h_file->f_ra; */
+	au_read_post(inode, h_file);
+
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+static ssize_t
+aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos,
+		  size_t len, unsigned int flags)
+{
+	ssize_t err;
+	struct au_write_pre wpre;
+	struct inode *inode;
+	struct file *h_file;
+
+	inode = file_inode(file);
+	au_mtx_and_read_lock(inode);
+
+	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	err = vfsub_splice_from(pipe, h_file, ppos, len, flags);
+	au_write_post(inode, h_file, &wpre, err);
+
+out:
+	si_read_unlock(inode->i_sb);
+	inode_unlock(inode);
+	return err;
+}
+
+static long aufs_fallocate(struct file *file, int mode, loff_t offset,
+			   loff_t len)
+{
+	long err;
+	struct au_write_pre wpre;
+	struct inode *inode;
+	struct file *h_file;
+
+	inode = file_inode(file);
+	au_mtx_and_read_lock(inode);
+
+	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	lockdep_off();
+	err = vfs_fallocate(h_file, mode, offset, len);
+	lockdep_on();
+	au_write_post(inode, h_file, &wpre, /*written*/1);
+
+out:
+	si_read_unlock(inode->i_sb);
+	inode_unlock(inode);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * The locking order around current->mmap_sem.
+ * - in most and regular cases
+ *   file I/O syscall -- aufs_read() or something
+ *	-- si_rwsem for read -- mmap_sem
+ *	(Note that [fdi]i_rwsem are released before mmap_sem).
+ * - in mmap case
+ *   mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem
+ * This AB-BA order is definitely bad, but is not a problem since "si_rwsem for
+ * read" allows multiple processes to acquire it and [fdi]i_rwsem are not held in
+ * file I/O. Aufs needs to stop lockdep in aufs_mmap() though.
+ * It means that when aufs acquires si_rwsem for write, the process should never
+ * acquire mmap_sem.
+ *
+ * Actually aufs_iterate() holds [fdi]i_rwsem before mmap_sem, but this is not a
+ * problem either since directories cannot be mmap-ed.
+ * The same reasoning applies to aufs_readlink() too.
+ */
+
+#if 0 /* stop calling security_file_mmap() */
+/* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */
+#define AuConv_VM_PROT(f, b)	_calc_vm_trans(f, VM_##b, PROT_##b)
+
+static unsigned long au_arch_prot_conv(unsigned long flags)
+{
+	/* currently ppc64 only */
+#ifdef CONFIG_PPC64
+	/* cf. linux/arch/powerpc/include/asm/mman.h */
+	AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO);
+	return AuConv_VM_PROT(flags, SAO);
+#else
+	AuDebugOn(arch_calc_vm_prot_bits(-1));
+	return 0;
+#endif
+}
+
+static unsigned long au_prot_conv(unsigned long flags)
+{
+	return AuConv_VM_PROT(flags, READ)
+		| AuConv_VM_PROT(flags, WRITE)
+		| AuConv_VM_PROT(flags, EXEC)
+		| au_arch_prot_conv(flags);
+}
+
+/* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */
+#define AuConv_VM_MAP(f, b)	_calc_vm_trans(f, VM_##b, MAP_##b)
+
+static unsigned long au_flag_conv(unsigned long flags)
+{
+	return AuConv_VM_MAP(flags, GROWSDOWN)
+		| AuConv_VM_MAP(flags, DENYWRITE)
+		| AuConv_VM_MAP(flags, LOCKED);
+}
+#endif
+
+static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	int err;
+	const unsigned char wlock
+		= (file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED);
+	struct super_block *sb;
+	struct file *h_file;
+	struct inode *inode;
+
+	AuDbgVmRegion(file, vma);
+
+	inode = file_inode(file);
+	sb = inode->i_sb;
+	lockdep_off();
+	si_read_lock(sb, AuLock_NOPLMW);
+
+	h_file = au_write_pre(file, wlock, /*wpre*/NULL);
+	lockdep_on();
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	err = 0;
+	au_set_mmapped(file);
+	au_vm_file_reset(vma, h_file);
+	/*
+	 * we cannot call security_mmap_file() here since it may acquire
+	 * mmap_sem or i_mutex.
+	 *
+	 * err = security_mmap_file(h_file, au_prot_conv(vma->vm_flags),
+	 *			 au_flag_conv(vma->vm_flags));
+	 */
+	if (!err)
+		err = h_file->f_op->mmap(h_file, vma);
+	if (!err) {
+		au_vm_prfile_set(vma, file);
+		fsstack_copy_attr_atime(inode, file_inode(h_file));
+		goto out_fput; /* success */
+	}
+	au_unset_mmapped(file);
+	au_vm_file_reset(vma, file);
+
+out_fput:
+	lockdep_off();
+	ii_write_unlock(inode);
+	lockdep_on();
+	fput(h_file);
+out:
+	lockdep_off();
+	si_read_unlock(sb);
+	lockdep_on();
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int aufs_fsync_nondir(struct file *file, loff_t start, loff_t end,
+			     int datasync)
+{
+	int err;
+	struct au_write_pre wpre;
+	struct inode *inode;
+	struct file *h_file;
+
+	err = 0; /* -EBADF; */ /* posix? */
+	if (unlikely(!(file->f_mode & FMODE_WRITE)))
+		goto out;
+
+	inode = file_inode(file);
+	au_mtx_and_read_lock(inode);
+
+	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out_unlock;
+
+	err = vfsub_fsync(h_file, &h_file->f_path, datasync);
+	au_write_post(inode, h_file, &wpre, /*written*/0);
+
+out_unlock:
+	si_read_unlock(inode->i_sb);
+	inode_unlock(inode);
+out:
+	return err;
+}
+
+/* no one supports this operation, currently */
+#if 0
+static int aufs_aio_fsync_nondir(struct kiocb *kio, int datasync)
+{
+	int err;
+	struct au_write_pre wpre;
+	struct inode *inode, *h_inode;
+	struct file *file, *h_file;
+
+	file = kio->ki_filp;
+	err = 0; /* -EBADF; */ /* posix? */
+	if (unlikely(!(file->f_mode & FMODE_WRITE)))
+		goto out;
+
+	inode = file_inode(file);
+	au_mtx_and_read_lock(inode);
+
+	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out_unlock;
+
+	err = -ENOSYS;
+	h_file = au_hf_top(file);
+	if (h_file->f_op->aio_fsync) {
+		h_inode = file_inode(h_file);
+		if (!is_sync_kiocb(kio)) {
+			get_file(h_file);
+			fput(file);
+		}
+		kio->ki_filp = h_file;
+		err = h_file->f_op->aio_fsync(kio, datasync);
+		inode_lock_nested(h_inode, AuLsc_I_CHILD);
+		if (!err)
+			vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL);
+		/*ignore*/
+		inode_unlock(h_inode);
+	}
+	au_write_post(inode, h_file, &wpre, /*written*/0);
+
+out_unlock:
+	si_read_unlock(inode->i_sb);
+	inode_unlock(inode);
+out:
+	return err;
+}
+#endif
+
+static int aufs_fasync(int fd, struct file *file, int flag)
+{
+	int err;
+	struct file *h_file;
+	struct super_block *sb;
+
+	sb = file->f_path.dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+
+	h_file = au_read_pre(file, /*keep_fi*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	if (h_file->f_op->fasync)
+		err = h_file->f_op->fasync(fd, h_file, flag);
+	fput(h_file); /* instead of au_read_post() */
+
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+static int aufs_setfl(struct file *file, unsigned long arg)
+{
+	int err;
+	struct file *h_file;
+	struct super_block *sb;
+
+	sb = file->f_path.dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+
+	h_file = au_read_pre(file, /*keep_fi*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	arg |= vfsub_file_flags(file) & FASYNC; /* stop calling h_file->fasync */
+	err = setfl(/*unused fd*/-1, h_file, arg);
+	fput(h_file); /* instead of au_read_post() */
+
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* no one supports this operation, currently */
+#if 0
+static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset,
+			     size_t len, loff_t *pos, int more)
+{
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+const struct file_operations aufs_file_fop = {
+	.owner		= THIS_MODULE,
+
+	.llseek		= default_llseek,
+
+	.read		= aufs_read,
+	.write		= aufs_write,
+	.read_iter	= aufs_read_iter,
+	.write_iter	= aufs_write_iter,
+
+#ifdef CONFIG_AUFS_POLL
+	.poll		= aufs_poll,
+#endif
+	.unlocked_ioctl	= aufs_ioctl_nondir,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= aufs_compat_ioctl_nondir,
+#endif
+	.mmap		= aufs_mmap,
+	.open		= aufs_open_nondir,
+	.flush		= aufs_flush_nondir,
+	.release	= aufs_release_nondir,
+	.fsync		= aufs_fsync_nondir,
+	/* .aio_fsync	= aufs_aio_fsync_nondir, */
+	.fasync		= aufs_fasync,
+	/* .sendpage	= aufs_sendpage, */
+	.setfl		= aufs_setfl,
+	.splice_write	= aufs_splice_write,
+	.splice_read	= aufs_splice_read,
+#if 0
+	.aio_splice_write = aufs_aio_splice_write,
+	.aio_splice_read  = aufs_aio_splice_read,
+#endif
+	.fallocate	= aufs_fallocate
+};
diff --git b/fs/aufs/fhsm.c b/fs/aufs/fhsm.c
new file mode 100644
index 0000000..e3cb6ed
--- /dev/null
+++ b/fs/aufs/fhsm.c
@@ -0,0 +1,412 @@
+/*
+ * Copyright (C) 2011-2016 Junjiro R. Okajima
+ */
+
+/*
+ * File-based Hierarchical Storage Management
+ */
+
+#include <linux/anon_inodes.h>
+#include <linux/poll.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include "aufs.h"
+
+static aufs_bindex_t au_fhsm_bottom(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+
+	SiMustAnyLock(sb);
+
+	sbinfo = au_sbi(sb);
+	fhsm = &sbinfo->si_fhsm;
+	AuDebugOn(!fhsm);
+	return fhsm->fhsm_bottom;
+}
+
+void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex)
+{
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	fhsm = &sbinfo->si_fhsm;
+	AuDebugOn(!fhsm);
+	fhsm->fhsm_bottom = bindex;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_fhsm_test_jiffy(struct au_sbinfo *sbinfo, struct au_branch *br)
+{
+	struct au_br_fhsm *bf;
+
+	bf = br->br_fhsm;
+	MtxMustLock(&bf->bf_lock);
+
+	return !bf->bf_readable
+		|| time_after(jiffies,
+			      bf->bf_jiffy + sbinfo->si_fhsm.fhsm_expire);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void au_fhsm_notify(struct super_block *sb, int val)
+{
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+
+	SiMustAnyLock(sb);
+
+	sbinfo = au_sbi(sb);
+	fhsm = &sbinfo->si_fhsm;
+	if (au_fhsm_pid(fhsm)
+	    && atomic_read(&fhsm->fhsm_readable) != -1) {
+		atomic_set(&fhsm->fhsm_readable, val);
+		if (val)
+			wake_up(&fhsm->fhsm_wqh);
+	}
+}
+
+static int au_fhsm_stfs(struct super_block *sb, aufs_bindex_t bindex,
+			struct aufs_stfs *rstfs, int do_lock, int do_notify)
+{
+	int err;
+	struct au_branch *br;
+	struct au_br_fhsm *bf;
+
+	br = au_sbr(sb, bindex);
+	AuDebugOn(au_br_rdonly(br));
+	bf = br->br_fhsm;
+	AuDebugOn(!bf);
+
+	if (do_lock)
+		mutex_lock(&bf->bf_lock);
+	else
+		MtxMustLock(&bf->bf_lock);
+
+	/* sb->s_root for NFS is unreliable */
+	err = au_br_stfs(br, &bf->bf_stfs);
+	if (unlikely(err)) {
+		AuErr1("FHSM failed (%d), b%d, ignored.\n", err, bindex);
+		goto out;
+	}
+
+	bf->bf_jiffy = jiffies;
+	bf->bf_readable = 1;
+	if (do_notify)
+		au_fhsm_notify(sb, /*val*/1);
+	if (rstfs)
+		*rstfs = bf->bf_stfs;
+
+out:
+	if (do_lock)
+		mutex_unlock(&bf->bf_lock);
+	au_fhsm_notify(sb, /*val*/1);
+
+	return err;
+}
+
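+/*
+ * called after writing to the branch; refresh the cached statfs data and
+ * notify the FHSM daemon when forced, when the daemon is running, or when
+ * the cached data has expired.
+ */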
+void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+	struct au_branch *br;
+	struct au_br_fhsm *bf;
+
+	AuDbg("b%d, force %d\n", bindex, force);
+	SiMustAnyLock(sb);
+
+	sbinfo = au_sbi(sb);
+	fhsm = &sbinfo->si_fhsm;
+	if (!au_ftest_si(sbinfo, FHSM)
+	    || fhsm->fhsm_bottom == bindex)
+		return;
+
+	br = au_sbr(sb, bindex);
+	bf = br->br_fhsm;
+	AuDebugOn(!bf);
+	mutex_lock(&bf->bf_lock);
+	if (force
+	    || au_fhsm_pid(fhsm)
+	    || au_fhsm_test_jiffy(sbinfo, br))
+		err = au_fhsm_stfs(sb, bindex, /*rstfs*/NULL, /*do_lock*/0,
+				  /*do_notify*/1);
+	mutex_unlock(&bf->bf_lock);
+}
+
+void au_fhsm_wrote_all(struct super_block *sb, int force)
+{
+	aufs_bindex_t bindex, bbot;
+	struct au_branch *br;
+
+	/* exclude the bottom */
+	bbot = au_fhsm_bottom(sb);
+	for (bindex = 0; bindex < bbot; bindex++) {
+		br = au_sbr(sb, bindex);
+		if (au_br_fhsm(br->br_perm))
+			au_fhsm_wrote(sb, bindex, force);
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+static unsigned int au_fhsm_poll(struct file *file,
+				 struct poll_table_struct *wait)
+{
+	unsigned int mask;
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+
+	mask = 0;
+	sbinfo = file->private_data;
+	fhsm = &sbinfo->si_fhsm;
+	poll_wait(file, &fhsm->fhsm_wqh, wait);
+	if (atomic_read(&fhsm->fhsm_readable))
+		mask = POLLIN /* | POLLRDNORM */;
+
+	AuTraceErr((int)mask);
+	return mask;
+}
+
+static int au_fhsm_do_read_one(struct aufs_stbr __user *stbr,
+			      struct aufs_stfs *stfs, __s16 brid)
+{
+	int err;
+
+	err = copy_to_user(&stbr->stfs, stfs, sizeof(*stfs));
+	if (!err)
+		err = __put_user(brid, &stbr->brid);
+	if (unlikely(err))
+		err = -EFAULT;
+
+	return err;
+}
+
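+/*
+ * copy one aufs_stbr entry per readable FHSM branch (except the bottom one)
+ * to userspace, and return the number of bytes copied.
+ */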
+static ssize_t au_fhsm_do_read(struct super_block *sb,
+			       struct aufs_stbr __user *stbr, size_t count)
+{
+	ssize_t err;
+	int nstbr;
+	aufs_bindex_t bindex, bbot;
+	struct au_branch *br;
+	struct au_br_fhsm *bf;
+
+	/* except the bottom branch */
+	err = 0;
+	nstbr = 0;
+	bbot = au_fhsm_bottom(sb);
+	for (bindex = 0; !err && bindex < bbot; bindex++) {
+		br = au_sbr(sb, bindex);
+		if (!au_br_fhsm(br->br_perm))
+			continue;
+
+		bf = br->br_fhsm;
+		mutex_lock(&bf->bf_lock);
+		if (bf->bf_readable) {
+			err = -EFAULT;
+			if (count >= sizeof(*stbr))
+				err = au_fhsm_do_read_one(stbr++, &bf->bf_stfs,
+							  br->br_id);
+			if (!err) {
+				bf->bf_readable = 0;
+				count -= sizeof(*stbr);
+				nstbr++;
+			}
+		}
+		mutex_unlock(&bf->bf_lock);
+	}
+	if (!err)
+		err = sizeof(*stbr) * nstbr;
+
+	return err;
+}
+
+static ssize_t au_fhsm_read(struct file *file, char __user *buf, size_t count,
+			   loff_t *pos)
+{
+	ssize_t err;
+	int readable;
+	aufs_bindex_t nfhsm, bindex, bbot;
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+	struct au_branch *br;
+	struct super_block *sb;
+
+	err = 0;
+	sbinfo = file->private_data;
+	fhsm = &sbinfo->si_fhsm;
+need_data:
+	spin_lock_irq(&fhsm->fhsm_wqh.lock);
+	if (!atomic_read(&fhsm->fhsm_readable)) {
+		if (vfsub_file_flags(file) & O_NONBLOCK)
+			err = -EAGAIN;
+		else
+			err = wait_event_interruptible_locked_irq
+				(fhsm->fhsm_wqh,
+				 atomic_read(&fhsm->fhsm_readable));
+	}
+	spin_unlock_irq(&fhsm->fhsm_wqh.lock);
+	if (unlikely(err))
+		goto out;
+
+	/* sb may already be dead */
+	au_rw_read_lock(&sbinfo->si_rwsem);
+	readable = atomic_read(&fhsm->fhsm_readable);
+	if (readable > 0) {
+		sb = sbinfo->si_sb;
+		AuDebugOn(!sb);
+		/* exclude the bottom branch */
+		nfhsm = 0;
+		bbot = au_fhsm_bottom(sb);
+		for (bindex = 0; bindex < bbot; bindex++) {
+			br = au_sbr(sb, bindex);
+			if (au_br_fhsm(br->br_perm))
+				nfhsm++;
+		}
+		err = -EMSGSIZE;
+		if (nfhsm * sizeof(struct aufs_stbr) <= count) {
+			atomic_set(&fhsm->fhsm_readable, 0);
+			err = au_fhsm_do_read(sbinfo->si_sb, (void __user *)buf,
+					     count);
+		}
+	}
+	au_rw_read_unlock(&sbinfo->si_rwsem);
+	if (!readable)
+		goto need_data;
+
+out:
+	return err;
+}
+
+static int au_fhsm_release(struct inode *inode, struct file *file)
+{
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+
+	/* sb may already be dead */
+	sbinfo = file->private_data;
+	fhsm = &sbinfo->si_fhsm;
+	spin_lock(&fhsm->fhsm_spin);
+	fhsm->fhsm_pid = 0;
+	spin_unlock(&fhsm->fhsm_spin);
+	kobject_put(&sbinfo->si_kobj);
+
+	return 0;
+}
+
+static const struct file_operations au_fhsm_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= noop_llseek,
+	.read		= au_fhsm_read,
+	.poll		= au_fhsm_poll,
+	.release	= au_fhsm_release
+};
+
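+/*
+ * create an anonymous read-only fd for the FHSM daemon. only a single owner
+ * is allowed at a time, tracked by fhsm_pid.
+ */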
+int au_fhsm_fd(struct super_block *sb, int oflags)
+{
+	int err, fd;
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+
+	err = -EPERM;
+	if (unlikely(!capable(CAP_SYS_ADMIN)))
+		goto out;
+
+	err = -EINVAL;
+	if (unlikely(oflags & ~(O_CLOEXEC | O_NONBLOCK)))
+		goto out;
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	fhsm = &sbinfo->si_fhsm;
+	spin_lock(&fhsm->fhsm_spin);
+	if (!fhsm->fhsm_pid)
+		fhsm->fhsm_pid = current->pid;
+	else
+		err = -EBUSY;
+	spin_unlock(&fhsm->fhsm_spin);
+	if (unlikely(err))
+		goto out;
+
+	oflags |= O_RDONLY;
+	/* oflags |= FMODE_NONOTIFY; */
+	fd = anon_inode_getfd("[aufs_fhsm]", &au_fhsm_fops, sbinfo, oflags);
+	err = fd;
+	if (unlikely(fd < 0))
+		goto out_pid;
+
+	/* succeed regardless of the 'fhsm' status */
+	kobject_get(&sbinfo->si_kobj);
+	si_noflush_read_lock(sb);
+	if (au_ftest_si(sbinfo, FHSM))
+		au_fhsm_wrote_all(sb, /*force*/0);
+	si_read_unlock(sb);
+	goto out; /* success */
+
+out_pid:
+	spin_lock(&fhsm->fhsm_spin);
+	fhsm->fhsm_pid = 0;
+	spin_unlock(&fhsm->fhsm_spin);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_fhsm_br_alloc(struct au_branch *br)
+{
+	int err;
+
+	err = 0;
+	br->br_fhsm = kmalloc(sizeof(*br->br_fhsm), GFP_NOFS);
+	if (br->br_fhsm)
+		au_br_fhsm_init(br->br_fhsm);
+	else
+		err = -ENOMEM;
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_fhsm_fin(struct super_block *sb)
+{
+	au_fhsm_notify(sb, /*val*/-1);
+}
+
+void au_fhsm_init(struct au_sbinfo *sbinfo)
+{
+	struct au_fhsm *fhsm;
+
+	fhsm = &sbinfo->si_fhsm;
+	spin_lock_init(&fhsm->fhsm_spin);
+	init_waitqueue_head(&fhsm->fhsm_wqh);
+	atomic_set(&fhsm->fhsm_readable, 0);
+	fhsm->fhsm_expire
+		= msecs_to_jiffies(AUFS_FHSM_CACHE_DEF_SEC * MSEC_PER_SEC);
+	fhsm->fhsm_bottom = -1;
+}
+
+void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec)
+{
+	sbinfo->si_fhsm.fhsm_expire
+		= msecs_to_jiffies(sec * MSEC_PER_SEC);
+}
+
+void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo)
+{
+	unsigned int u;
+
+	if (!au_ftest_si(sbinfo, FHSM))
+		return;
+
+	u = jiffies_to_msecs(sbinfo->si_fhsm.fhsm_expire) / MSEC_PER_SEC;
+	if (u != AUFS_FHSM_CACHE_DEF_SEC)
+		seq_printf(seq, ",fhsm_sec=%u", u);
+}
diff --git b/fs/aufs/file.c b/fs/aufs/file.c
new file mode 100644
index 0000000..bfc1d1b
--- /dev/null
+++ b/fs/aufs/file.c
@@ -0,0 +1,844 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * handling file/dir, and address_space operations
+ */
+
+#ifdef CONFIG_AUFS_DEBUG
+#include <linux/migrate.h>
+#endif
+#include <linux/pagemap.h>
+#include "aufs.h"
+
+/* drop flags for writing */
+unsigned int au_file_roflags(unsigned int flags)
+{
+	flags &= ~(O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC);
+	flags |= O_RDONLY | O_NOATIME;
+	return flags;
+}
+
+/* common functions for regular files and dirs */
+struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags,
+		       struct file *file, int force_wr)
+{
+	struct file *h_file;
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+	struct super_block *sb;
+	struct au_branch *br;
+	struct path h_path;
+	int err;
+
+	/* a race condition can happen between open and unlink/rmdir */
+	h_file = ERR_PTR(-ENOENT);
+	h_dentry = au_h_dptr(dentry, bindex);
+	if (au_test_nfsd() && (!h_dentry || d_is_negative(h_dentry)))
+		goto out;
+	h_inode = d_inode(h_dentry);
+	spin_lock(&h_dentry->d_lock);
+	err = (!d_unhashed(dentry) && d_unlinked(h_dentry))
+		/* || !d_inode(dentry)->i_nlink */
+		;
+	spin_unlock(&h_dentry->d_lock);
+	if (unlikely(err))
+		goto out;
+
+	sb = dentry->d_sb;
+	br = au_sbr(sb, bindex);
+	err = au_br_test_oflag(flags, br);
+	h_file = ERR_PTR(err);
+	if (unlikely(err))
+		goto out;
+
+	/* drop flags for writing */
+	if (au_test_ro(sb, bindex, d_inode(dentry))) {
+		if (force_wr && !(flags & O_WRONLY))
+			force_wr = 0;
+		flags = au_file_roflags(flags);
+		if (force_wr) {
+			h_file = ERR_PTR(-EROFS);
+			flags = au_file_roflags(flags);
+			if (unlikely(vfsub_native_ro(h_inode)
+				     || IS_APPEND(h_inode)))
+				goto out;
+			flags &= ~O_ACCMODE;
+			flags |= O_WRONLY;
+		}
+	}
+	flags &= ~O_CREAT;
+	au_br_get(br);
+	h_path.dentry = h_dentry;
+	h_path.mnt = au_br_mnt(br);
+	h_file = vfsub_dentry_open(&h_path, flags);
+	if (IS_ERR(h_file))
+		goto out_br;
+
+	if (flags & __FMODE_EXEC) {
+		err = deny_write_access(h_file);
+		if (unlikely(err)) {
+			fput(h_file);
+			h_file = ERR_PTR(err);
+			goto out_br;
+		}
+	}
+	fsnotify_open(h_file);
+	goto out; /* success */
+
+out_br:
+	au_br_put(br);
+out:
+	return h_file;
+}
+
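+/*
+ * copy-up/move-up on open (cmoo/coo): when the branch of the top entry has a
+ * cmoo attribute, copy the file up to an upper writable branch before the
+ * open, and for "moo" unlink the source on the lower branch afterwards.
+ * skipped for the FHSM daemon and its children.
+ */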
+static int au_cmoo(struct dentry *dentry)
+{
+	int err, cmoo;
+	unsigned int udba;
+	struct path h_path;
+	struct au_pin pin;
+	struct au_cp_generic cpg = {
+		.dentry	= dentry,
+		.bdst	= -1,
+		.bsrc	= -1,
+		.len	= -1,
+		.pin	= &pin,
+		.flags	= AuCpup_DTIME | AuCpup_HOPEN
+	};
+	struct inode *delegated;
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+	struct au_fhsm *fhsm;
+	pid_t pid;
+	struct au_branch *br;
+	struct dentry *parent;
+	struct au_hinode *hdir;
+
+	DiMustWriteLock(dentry);
+	IiMustWriteLock(d_inode(dentry));
+
+	err = 0;
+	if (IS_ROOT(dentry))
+		goto out;
+	cpg.bsrc = au_dbtop(dentry);
+	if (!cpg.bsrc)
+		goto out;
+
+	sb = dentry->d_sb;
+	sbinfo = au_sbi(sb);
+	fhsm = &sbinfo->si_fhsm;
+	pid = au_fhsm_pid(fhsm);
+	if (pid
+	    && (current->pid == pid
+		|| current->real_parent->pid == pid))
+		goto out;
+
+	br = au_sbr(sb, cpg.bsrc);
+	cmoo = au_br_cmoo(br->br_perm);
+	if (!cmoo)
+		goto out;
+	if (!d_is_reg(dentry))
+		cmoo &= AuBrAttr_COO_ALL;
+	if (!cmoo)
+		goto out;
+
+	parent = dget_parent(dentry);
+	di_write_lock_parent(parent);
+	err = au_wbr_do_copyup_bu(dentry, cpg.bsrc - 1);
+	cpg.bdst = err;
+	if (unlikely(err < 0)) {
+		err = 0;	/* there is no upper writable branch */
+		goto out_dgrade;
+	}
+	AuDbg("bsrc %d, bdst %d\n", cpg.bsrc, cpg.bdst);
+
+	/* do not respect the coo attrib for the target branch */
+	err = au_cpup_dirs(dentry, cpg.bdst);
+	if (unlikely(err))
+		goto out_dgrade;
+
+	di_downgrade_lock(parent, AuLock_IR);
+	udba = au_opt_udba(sb);
+	err = au_pin(&pin, dentry, cpg.bdst, udba,
+		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+	if (unlikely(err))
+		goto out_parent;
+
+	err = au_sio_cpup_simple(&cpg);
+	au_unpin(&pin);
+	if (unlikely(err))
+		goto out_parent;
+	if (!(cmoo & AuBrWAttr_MOO))
+		goto out_parent; /* success */
+
+	err = au_pin(&pin, dentry, cpg.bsrc, udba,
+		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+	if (unlikely(err))
+		goto out_parent;
+
+	h_path.mnt = au_br_mnt(br);
+	h_path.dentry = au_h_dptr(dentry, cpg.bsrc);
+	hdir = au_hi(d_inode(parent), cpg.bsrc);
+	delegated = NULL;
+	err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated, /*force*/1);
+	au_unpin(&pin);
+	/* todo: keep h_dentry or not? */
+	if (unlikely(err == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal unlink\n");
+		iput(delegated);
+	}
+	if (unlikely(err)) {
+		pr_err("unlink %pd after coo failed (%d), ignored\n",
+		       dentry, err);
+		err = 0;
+	}
+	goto out_parent; /* success */
+
+out_dgrade:
+	di_downgrade_lock(parent, AuLock_IR);
+out_parent:
+	di_read_unlock(parent, AuLock_IR);
+	dput(parent);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
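+/*
+ * common part of open for both regular files and dirs: initialize the finfo,
+ * apply cmoo if necessary, call args->open(), and register the file in the
+ * per-sb list.
+ */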
+int au_do_open(struct file *file, struct au_do_open_args *args)
+{
+	int err, no_lock = args->no_lock;
+	struct dentry *dentry;
+	struct au_finfo *finfo;
+
+	if (!no_lock)
+		err = au_finfo_init(file, args->fidir);
+	else {
+		lockdep_off();
+		err = au_finfo_init(file, args->fidir);
+		lockdep_on();
+	}
+	if (unlikely(err))
+		goto out;
+
+	dentry = file->f_path.dentry;
+	AuDebugOn(IS_ERR_OR_NULL(dentry));
+	if (!no_lock) {
+		di_write_lock_child(dentry);
+		err = au_cmoo(dentry);
+		di_downgrade_lock(dentry, AuLock_IR);
+		if (!err)
+			err = args->open(file, vfsub_file_flags(file), NULL);
+		di_read_unlock(dentry, AuLock_IR);
+	} else {
+		err = au_cmoo(dentry);
+		if (!err)
+			err = args->open(file, vfsub_file_flags(file),
+					 args->h_file);
+		if (!err && au_fbtop(file) != au_dbtop(dentry))
+			/*
+			 * cmoo happens after h_file was opened.
+			 * need to refresh file later.
+			 */
+			atomic_dec(&au_fi(file)->fi_generation);
+	}
+
+	finfo = au_fi(file);
+	if (!err) {
+		finfo->fi_file = file;
+		au_sphl_add(&finfo->fi_hlist,
+			    &au_sbi(file->f_path.dentry->d_sb)->si_files);
+	}
+	if (!no_lock)
+		fi_write_unlock(file);
+	else {
+		lockdep_off();
+		fi_write_unlock(file);
+		lockdep_on();
+	}
+	if (unlikely(err)) {
+		finfo->fi_hdir = NULL;
+		au_finfo_fin(file, /*atonce*/0);
+	}
+
+out:
+	return err;
+}
+
+int au_reopen_nondir(struct file *file)
+{
+	int err;
+	aufs_bindex_t btop;
+	struct dentry *dentry;
+	struct file *h_file, *h_file_tmp;
+
+	dentry = file->f_path.dentry;
+	btop = au_dbtop(dentry);
+	h_file_tmp = NULL;
+	if (au_fbtop(file) == btop) {
+		h_file = au_hf_top(file);
+		if (file->f_mode == h_file->f_mode)
+			return 0; /* success */
+		h_file_tmp = h_file;
+		get_file(h_file_tmp);
+		au_set_h_fptr(file, btop, NULL);
+	}
+	AuDebugOn(au_fi(file)->fi_hdir);
+	/*
+	 * it can happen:
+	 * the file exists on both rw and ro branches
+	 * open --> dbtop and fbtop are both 0
+	 * prepend a new branch as rw, the old "rw" becomes ro
+	 * remove rw/file
+	 * delete the top branch, "rw" becomes rw again
+	 *	--> dbtop is 1, fbtop is still 0
+	 * write --> fbtop is 0 but dbtop is 1
+	 */
+	/* AuDebugOn(au_fbtop(file) < btop); */
+
+	h_file = au_h_open(dentry, btop, vfsub_file_flags(file) & ~O_TRUNC,
+			   file, /*force_wr*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file)) {
+		if (h_file_tmp) {
+			au_sbr_get(dentry->d_sb, btop);
+			au_set_h_fptr(file, btop, h_file_tmp);
+			h_file_tmp = NULL;
+		}
+		goto out; /* todo: close all? */
+	}
+
+	err = 0;
+	au_set_fbtop(file, btop);
+	au_set_h_fptr(file, btop, h_file);
+	au_update_figen(file);
+	/* todo: necessary? */
+	/* file->f_ra = h_file->f_ra; */
+
+out:
+	if (h_file_tmp)
+		fput(h_file_tmp);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_reopen_wh(struct file *file, aufs_bindex_t btgt,
+			struct dentry *hi_wh)
+{
+	int err;
+	aufs_bindex_t btop;
+	struct au_dinfo *dinfo;
+	struct dentry *h_dentry;
+	struct au_hdentry *hdp;
+
+	dinfo = au_di(file->f_path.dentry);
+	AuRwMustWriteLock(&dinfo->di_rwsem);
+
+	btop = dinfo->di_btop;
+	dinfo->di_btop = btgt;
+	hdp = au_hdentry(dinfo, btgt);
+	h_dentry = hdp->hd_dentry;
+	hdp->hd_dentry = hi_wh;
+	err = au_reopen_nondir(file);
+	hdp->hd_dentry = h_dentry;
+	dinfo->di_btop = btop;
+
+	return err;
+}
+
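+/*
+ * the file was unlinked on aufs; copy it up under a whiteout-ed temporary
+ * name, or just reopen it if it was already copied up after the unlink.
+ */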
+static int au_ready_to_write_wh(struct file *file, loff_t len,
+				aufs_bindex_t bcpup, struct au_pin *pin)
+{
+	int err;
+	struct inode *inode, *h_inode;
+	struct dentry *h_dentry, *hi_wh;
+	struct au_cp_generic cpg = {
+		.dentry	= file->f_path.dentry,
+		.bdst	= bcpup,
+		.bsrc	= -1,
+		.len	= len,
+		.pin	= pin
+	};
+
+	au_update_dbtop(cpg.dentry);
+	inode = d_inode(cpg.dentry);
+	h_inode = NULL;
+	if (au_dbtop(cpg.dentry) <= bcpup
+	    && au_dbbot(cpg.dentry) >= bcpup) {
+		h_dentry = au_h_dptr(cpg.dentry, bcpup);
+		if (h_dentry && d_is_positive(h_dentry))
+			h_inode = d_inode(h_dentry);
+	}
+	hi_wh = au_hi_wh(inode, bcpup);
+	if (!hi_wh && !h_inode)
+		err = au_sio_cpup_wh(&cpg, file);
+	else
+		/* already copied-up after unlink */
+		err = au_reopen_wh(file, bcpup, hi_wh);
+
+	if (!err
+	    && (inode->i_nlink > 1
+		|| (inode->i_state & I_LINKABLE))
+	    && au_opt_test(au_mntflags(cpg.dentry->d_sb), PLINK))
+		au_plink_append(inode, bcpup, au_h_dptr(cpg.dentry, bcpup));
+
+	return err;
+}
+
+/*
+ * prepare the @file for writing.
+ */
+int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin)
+{
+	int err;
+	aufs_bindex_t dbtop;
+	struct dentry *parent;
+	struct inode *inode;
+	struct super_block *sb;
+	struct file *h_file;
+	struct au_cp_generic cpg = {
+		.dentry	= file->f_path.dentry,
+		.bdst	= -1,
+		.bsrc	= -1,
+		.len	= len,
+		.pin	= pin,
+		.flags	= AuCpup_DTIME
+	};
+
+	sb = cpg.dentry->d_sb;
+	inode = d_inode(cpg.dentry);
+	cpg.bsrc = au_fbtop(file);
+	err = au_test_ro(sb, cpg.bsrc, inode);
+	if (!err && (au_hf_top(file)->f_mode & FMODE_WRITE)) {
+		err = au_pin(pin, cpg.dentry, cpg.bsrc, AuOpt_UDBA_NONE,
+			     /*flags*/0);
+		goto out;
+	}
+
+	/* need to cpup or reopen */
+	parent = dget_parent(cpg.dentry);
+	di_write_lock_parent(parent);
+	err = AuWbrCopyup(au_sbi(sb), cpg.dentry);
+	cpg.bdst = err;
+	if (unlikely(err < 0))
+		goto out_dgrade;
+	err = 0;
+
+	if (!d_unhashed(cpg.dentry) && !au_h_dptr(parent, cpg.bdst)) {
+		err = au_cpup_dirs(cpg.dentry, cpg.bdst);
+		if (unlikely(err))
+			goto out_dgrade;
+	}
+
+	err = au_pin(pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE,
+		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+	if (unlikely(err))
+		goto out_dgrade;
+
+	dbtop = au_dbtop(cpg.dentry);
+	if (dbtop <= cpg.bdst)
+		cpg.bsrc = cpg.bdst;
+
+	if (dbtop <= cpg.bdst		/* just reopen */
+	    || !d_unhashed(cpg.dentry)	/* copyup and reopen */
+		) {
+		h_file = au_h_open_pre(cpg.dentry, cpg.bsrc, /*force_wr*/0);
+		if (IS_ERR(h_file))
+			err = PTR_ERR(h_file);
+		else {
+			di_downgrade_lock(parent, AuLock_IR);
+			if (dbtop > cpg.bdst)
+				err = au_sio_cpup_simple(&cpg);
+			if (!err)
+				err = au_reopen_nondir(file);
+			au_h_open_post(cpg.dentry, cpg.bsrc, h_file);
+		}
+	} else {			/* copyup as wh and reopen */
+		/*
+		 * since writable hfsplus branch is not supported,
+		 * h_open_pre/post() are unnecessary.
+		 */
+		err = au_ready_to_write_wh(file, len, cpg.bdst, pin);
+		di_downgrade_lock(parent, AuLock_IR);
+	}
+
+	if (!err) {
+		au_pin_set_parent_lflag(pin, /*lflag*/0);
+		goto out_dput; /* success */
+	}
+	au_unpin(pin);
+	goto out_unlock;
+
+out_dgrade:
+	di_downgrade_lock(parent, AuLock_IR);
+out_unlock:
+	di_read_unlock(parent, AuLock_IR);
+out_dput:
+	dput(parent);
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_do_flush(struct file *file, fl_owner_t id,
+		int (*flush)(struct file *file, fl_owner_t id))
+{
+	int err;
+	struct super_block *sb;
+	struct inode *inode;
+
+	inode = file_inode(file);
+	sb = inode->i_sb;
+	si_noflush_read_lock(sb);
+	fi_read_lock(file);
+	ii_read_lock_child(inode);
+
+	err = flush(file, id);
+	au_cpup_attr_timesizes(inode);
+
+	ii_read_unlock(inode);
+	fi_read_unlock(file);
+	si_read_unlock(sb);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_file_refresh_by_inode(struct file *file, int *need_reopen)
+{
+	int err;
+	struct au_pin pin;
+	struct au_finfo *finfo;
+	struct dentry *parent, *hi_wh;
+	struct inode *inode;
+	struct super_block *sb;
+	struct au_cp_generic cpg = {
+		.dentry	= file->f_path.dentry,
+		.bdst	= -1,
+		.bsrc	= -1,
+		.len	= -1,
+		.pin	= &pin,
+		.flags	= AuCpup_DTIME
+	};
+
+	FiMustWriteLock(file);
+
+	err = 0;
+	finfo = au_fi(file);
+	sb = cpg.dentry->d_sb;
+	inode = d_inode(cpg.dentry);
+	cpg.bdst = au_ibtop(inode);
+	if (cpg.bdst == finfo->fi_btop || IS_ROOT(cpg.dentry))
+		goto out;
+
+	parent = dget_parent(cpg.dentry);
+	if (au_test_ro(sb, cpg.bdst, inode)) {
+		di_read_lock_parent(parent, !AuLock_IR);
+		err = AuWbrCopyup(au_sbi(sb), cpg.dentry);
+		cpg.bdst = err;
+		di_read_unlock(parent, !AuLock_IR);
+		if (unlikely(err < 0))
+			goto out_parent;
+		err = 0;
+	}
+
+	di_read_lock_parent(parent, AuLock_IR);
+	hi_wh = au_hi_wh(inode, cpg.bdst);
+	if (!S_ISDIR(inode->i_mode)
+	    && au_opt_test(au_mntflags(sb), PLINK)
+	    && au_plink_test(inode)
+	    && !d_unhashed(cpg.dentry)
+	    && cpg.bdst < au_dbtop(cpg.dentry)) {
+		err = au_test_and_cpup_dirs(cpg.dentry, cpg.bdst);
+		if (unlikely(err))
+			goto out_unlock;
+
+		/* always superio. */
+		err = au_pin(&pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE,
+			     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+		if (!err) {
+			err = au_sio_cpup_simple(&cpg);
+			au_unpin(&pin);
+		}
+	} else if (hi_wh) {
+		/* already copied-up after unlink */
+		err = au_reopen_wh(file, cpg.bdst, hi_wh);
+		*need_reopen = 0;
+	}
+
+out_unlock:
+	di_read_unlock(parent, AuLock_IR);
+out_parent:
+	dput(parent);
+out:
+	return err;
+}
+
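+/*
+ * rearrange the opened lower files to match the current branch order, and
+ * close the ones whose branch was removed.
+ */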
+static void au_do_refresh_dir(struct file *file)
+{
+	int execed;
+	aufs_bindex_t bindex, bbot, new_bindex, brid;
+	struct au_hfile *p, tmp, *q;
+	struct au_finfo *finfo;
+	struct super_block *sb;
+	struct au_fidir *fidir;
+
+	FiMustWriteLock(file);
+
+	sb = file->f_path.dentry->d_sb;
+	finfo = au_fi(file);
+	fidir = finfo->fi_hdir;
+	AuDebugOn(!fidir);
+	p = fidir->fd_hfile + finfo->fi_btop;
+	brid = p->hf_br->br_id;
+	bbot = fidir->fd_bbot;
+	for (bindex = finfo->fi_btop; bindex <= bbot; bindex++, p++) {
+		if (!p->hf_file)
+			continue;
+
+		new_bindex = au_br_index(sb, p->hf_br->br_id);
+		if (new_bindex == bindex)
+			continue;
+		if (new_bindex < 0) {
+			au_set_h_fptr(file, bindex, NULL);
+			continue;
+		}
+
+		/* swap the two lower files, and loop again */
+		q = fidir->fd_hfile + new_bindex;
+		tmp = *q;
+		*q = *p;
+		*p = tmp;
+		if (tmp.hf_file) {
+			bindex--;
+			p--;
+		}
+	}
+
+	execed = vfsub_file_execed(file);
+	p = fidir->fd_hfile;
+	if (!au_test_mmapped(file) && !d_unlinked(file->f_path.dentry)) {
+		bbot = au_sbbot(sb);
+		for (finfo->fi_btop = 0; finfo->fi_btop <= bbot;
+		     finfo->fi_btop++, p++)
+			if (p->hf_file) {
+				if (file_inode(p->hf_file))
+					break;
+				au_hfput(p, execed);
+			}
+	} else {
+		bbot = au_br_index(sb, brid);
+		for (finfo->fi_btop = 0; finfo->fi_btop < bbot;
+		     finfo->fi_btop++, p++)
+			if (p->hf_file)
+				au_hfput(p, execed);
+		bbot = au_sbbot(sb);
+	}
+
+	p = fidir->fd_hfile + bbot;
+	for (fidir->fd_bbot = bbot; fidir->fd_bbot >= finfo->fi_btop;
+	     fidir->fd_bbot--, p--)
+		if (p->hf_file) {
+			if (file_inode(p->hf_file))
+				break;
+			au_hfput(p, execed);
+		}
+	AuDebugOn(fidir->fd_bbot < finfo->fi_btop);
+}
+
+/*
+ * after branch manipulation, refresh the file.
+ */
+static int refresh_file(struct file *file, int (*reopen)(struct file *file))
+{
+	int err, need_reopen, nbr;
+	aufs_bindex_t bbot, bindex;
+	struct dentry *dentry;
+	struct super_block *sb;
+	struct au_finfo *finfo;
+	struct au_hfile *hfile;
+
+	dentry = file->f_path.dentry;
+	sb = dentry->d_sb;
+	nbr = au_sbbot(sb) + 1;
+	finfo = au_fi(file);
+	if (!finfo->fi_hdir) {
+		hfile = &finfo->fi_htop;
+		AuDebugOn(!hfile->hf_file);
+		bindex = au_br_index(sb, hfile->hf_br->br_id);
+		AuDebugOn(bindex < 0);
+		if (bindex != finfo->fi_btop)
+			au_set_fbtop(file, bindex);
+	} else {
+		err = au_fidir_realloc(finfo, nbr, /*may_shrink*/0);
+		if (unlikely(err))
+			goto out;
+		au_do_refresh_dir(file);
+	}
+
+	err = 0;
+	need_reopen = 1;
+	if (!au_test_mmapped(file))
+		err = au_file_refresh_by_inode(file, &need_reopen);
+	if (finfo->fi_hdir)
+		/* harmless if err */
+		au_fidir_realloc(finfo, nbr, /*may_shrink*/1);
+	if (!err && need_reopen && !d_unlinked(dentry))
+		err = reopen(file);
+	if (!err) {
+		au_update_figen(file);
+		goto out; /* success */
+	}
+
+	/* error, close all lower files */
+	if (finfo->fi_hdir) {
+		bbot = au_fbbot_dir(file);
+		for (bindex = au_fbtop(file); bindex <= bbot; bindex++)
+			au_set_h_fptr(file, bindex, NULL);
+	}
+
+out:
+	return err;
+}
+
+/* common function for regular files and dirs */
+int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file),
+			  int wlock)
+{
+	int err;
+	unsigned int sigen, figen;
+	aufs_bindex_t btop;
+	unsigned char pseudo_link;
+	struct dentry *dentry;
+	struct inode *inode;
+
+	err = 0;
+	dentry = file->f_path.dentry;
+	inode = d_inode(dentry);
+	sigen = au_sigen(dentry->d_sb);
+	fi_write_lock(file);
+	figen = au_figen(file);
+	di_write_lock_child(dentry);
+	btop = au_dbtop(dentry);
+	pseudo_link = (btop != au_ibtop(inode));
+	if (sigen == figen && !pseudo_link && au_fbtop(file) == btop) {
+		if (!wlock) {
+			di_downgrade_lock(dentry, AuLock_IR);
+			fi_downgrade_lock(file);
+		}
+		goto out; /* success */
+	}
+
+	AuDbg("sigen %d, figen %d\n", sigen, figen);
+	if (au_digen_test(dentry, sigen)) {
+		err = au_reval_dpath(dentry, sigen);
+		AuDebugOn(!err && au_digen_test(dentry, sigen));
+	}
+
+	if (!err)
+		err = refresh_file(file, reopen);
+	if (!err) {
+		if (!wlock) {
+			di_downgrade_lock(dentry, AuLock_IR);
+			fi_downgrade_lock(file);
+		}
+	} else {
+		di_write_unlock(dentry);
+		fi_write_unlock(file);
+	}
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* cf. aufs_nopage() */
+/* for madvise(2) */
+static int aufs_readpage(struct file *file __maybe_unused, struct page *page)
+{
+	unlock_page(page);
+	return 0;
+}
+
+/* it will never be called, but necessary to support O_DIRECT */
+static ssize_t aufs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{ BUG(); return 0; }
+
+/* they will never be called. */
+#ifdef CONFIG_AUFS_DEBUG
+static int aufs_write_begin(struct file *file, struct address_space *mapping,
+			    loff_t pos, unsigned len, unsigned flags,
+			    struct page **pagep, void **fsdata)
+{ AuUnsupport(); return 0; }
+static int aufs_write_end(struct file *file, struct address_space *mapping,
+			  loff_t pos, unsigned len, unsigned copied,
+			  struct page *page, void *fsdata)
+{ AuUnsupport(); return 0; }
+static int aufs_writepage(struct page *page, struct writeback_control *wbc)
+{ AuUnsupport(); return 0; }
+
+static int aufs_set_page_dirty(struct page *page)
+{ AuUnsupport(); return 0; }
+static void aufs_invalidatepage(struct page *page, unsigned int offset,
+				unsigned int length)
+{ AuUnsupport(); }
+static int aufs_releasepage(struct page *page, gfp_t gfp)
+{ AuUnsupport(); return 0; }
+#if 0 /* called by memory compaction regardless of the file */
+static int aufs_migratepage(struct address_space *mapping, struct page *newpage,
+			    struct page *page, enum migrate_mode mode)
+{ AuUnsupport(); return 0; }
+#endif
+static bool aufs_isolate_page(struct page *page, isolate_mode_t mode)
+{ AuUnsupport(); return true; }
+static void aufs_putback_page(struct page *page)
+{ AuUnsupport(); }
+static int aufs_launder_page(struct page *page)
+{ AuUnsupport(); return 0; }
+static int aufs_is_partially_uptodate(struct page *page,
+				      unsigned long from,
+				      unsigned long count)
+{ AuUnsupport(); return 0; }
+static void aufs_is_dirty_writeback(struct page *page, bool *dirty,
+				    bool *writeback)
+{ AuUnsupport(); }
+static int aufs_error_remove_page(struct address_space *mapping,
+				  struct page *page)
+{ AuUnsupport(); return 0; }
+static int aufs_swap_activate(struct swap_info_struct *sis, struct file *file,
+			      sector_t *span)
+{ AuUnsupport(); return 0; }
+static void aufs_swap_deactivate(struct file *file)
+{ AuUnsupport(); }
+#endif /* CONFIG_AUFS_DEBUG */
+
+const struct address_space_operations aufs_aop = {
+	.readpage		= aufs_readpage,
+	.direct_IO		= aufs_direct_IO,
+#ifdef CONFIG_AUFS_DEBUG
+	.writepage		= aufs_writepage,
+	/* no writepages, because of writepage */
+	.set_page_dirty		= aufs_set_page_dirty,
+	/* no readpages, because of readpage */
+	.write_begin		= aufs_write_begin,
+	.write_end		= aufs_write_end,
+	/* no bmap, no block device */
+	.invalidatepage		= aufs_invalidatepage,
+	.releasepage		= aufs_releasepage,
+	/* is fallback_migrate_page ok? */
+	/* .migratepage		= aufs_migratepage, */
+	.isolate_page		= aufs_isolate_page,
+	.putback_page		= aufs_putback_page,
+	.launder_page		= aufs_launder_page,
+	.is_partially_uptodate	= aufs_is_partially_uptodate,
+	.is_dirty_writeback	= aufs_is_dirty_writeback,
+	.error_remove_page	= aufs_error_remove_page,
+	.swap_activate		= aufs_swap_activate,
+	.swap_deactivate	= aufs_swap_deactivate
+#endif /* CONFIG_AUFS_DEBUG */
+};
diff --git b/fs/aufs/file.h b/fs/aufs/file.h
new file mode 100644
index 0000000..f53b381
--- /dev/null
+++ b/fs/aufs/file.h
@@ -0,0 +1,281 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * file operations
+ */
+
+#ifndef __AUFS_FILE_H__
+#define __AUFS_FILE_H__
+
+#ifdef __KERNEL__
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include "rwsem.h"
+
+struct au_branch;
+struct au_hfile {
+	struct file		*hf_file;
+	struct au_branch	*hf_br;
+};
+
+struct au_vdir;
+struct au_fidir {
+	aufs_bindex_t		fd_bbot;
+	aufs_bindex_t		fd_nent;
+	struct au_vdir		*fd_vdir_cache;
+	struct au_hfile		fd_hfile[];
+};
+
+static inline int au_fidir_sz(int nent)
+{
+	AuDebugOn(nent < 0);
+	return sizeof(struct au_fidir) + sizeof(struct au_hfile) * nent;
+}
+
+struct au_finfo {
+	atomic_t		fi_generation;
+
+	struct au_rwsem		fi_rwsem;
+	aufs_bindex_t		fi_btop;
+
+	/* do not union them */
+	struct {				/* for non-dir */
+		struct au_hfile			fi_htop;
+		atomic_t			fi_mmapped;
+	};
+	struct au_fidir		*fi_hdir;	/* for dir only */
+
+	struct hlist_node	fi_hlist;
+	union {
+		struct file		*fi_file;	/* very ugly */
+		struct llist_node	fi_lnode;	/* delayed free */
+	};
+} ____cacheline_aligned_in_smp;
+
+/* ---------------------------------------------------------------------- */
+
+/* file.c */
+extern const struct address_space_operations aufs_aop;
+unsigned int au_file_roflags(unsigned int flags);
+struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags,
+		       struct file *file, int force_wr);
+struct au_do_open_args {
+	int		no_lock;
+	int		(*open)(struct file *file, int flags,
+				struct file *h_file);
+	struct au_fidir	*fidir;
+	struct file	*h_file;
+};
+int au_do_open(struct file *file, struct au_do_open_args *args);
+int au_reopen_nondir(struct file *file);
+struct au_pin;
+int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin);
+int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file),
+			  int wlock);
+int au_do_flush(struct file *file, fl_owner_t id,
+		int (*flush)(struct file *file, fl_owner_t id));
+
+/* poll.c */
+#ifdef CONFIG_AUFS_POLL
+unsigned int aufs_poll(struct file *file, poll_table *wait);
+#endif
+
+#ifdef CONFIG_AUFS_BR_HFSPLUS
+/* hfsplus.c */
+struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex,
+			   int force_wr);
+void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex,
+		    struct file *h_file);
+#else
+AuStub(struct file *, au_h_open_pre, return NULL, struct dentry *dentry,
+       aufs_bindex_t bindex, int force_wr)
+AuStubVoid(au_h_open_post, struct dentry *dentry, aufs_bindex_t bindex,
+	   struct file *h_file);
+#endif
+
+/* f_op.c */
+extern const struct file_operations aufs_file_fop;
+int au_do_open_nondir(struct file *file, int flags, struct file *h_file);
+int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file);
+struct file *au_read_pre(struct file *file, int keep_fi);
+
+/* finfo.c */
+void au_hfput(struct au_hfile *hf, int execed);
+void au_set_h_fptr(struct file *file, aufs_bindex_t bindex,
+		   struct file *h_file);
+
+void au_update_figen(struct file *file);
+struct au_fidir *au_fidir_alloc(struct super_block *sb);
+int au_fidir_realloc(struct au_finfo *finfo, int nbr, int may_shrink);
+
+void au_fi_init_once(void *_fi);
+void au_finfo_fin(struct file *file, int atonce);
+int au_finfo_init(struct file *file, struct au_fidir *fidir);
+
+/* ioctl.c */
+long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg);
+#ifdef CONFIG_COMPAT
+long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd,
+			   unsigned long arg);
+long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd,
+			      unsigned long arg);
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct au_finfo *au_fi(struct file *file)
+{
+	return file->private_data;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * fi_read_lock, fi_write_lock,
+ * fi_read_unlock, fi_write_unlock, fi_downgrade_lock
+ */
+AuSimpleRwsemFuncs(fi, struct file *f, &au_fi(f)->fi_rwsem);
+
+#define FiMustNoWaiters(f)	AuRwMustNoWaiters(&au_fi(f)->fi_rwsem)
+#define FiMustAnyLock(f)	AuRwMustAnyLock(&au_fi(f)->fi_rwsem)
+#define FiMustWriteLock(f)	AuRwMustWriteLock(&au_fi(f)->fi_rwsem)
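+
+/*
+ * A minimal sketch (unused by aufs itself) of the locking pattern provided by
+ * the fi_* helpers generated above; "example_update_finfo" is hypothetical.
+ */
+#if 0
+static void example_update_finfo(struct file *file)
+{
+	fi_write_lock(file);
+	/* modify the au_fi(file) members while holding fi_rwsem exclusively */
+	fi_write_unlock(file);
+}
+#endif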
+
+/* ---------------------------------------------------------------------- */
+
+/* todo: hard/soft set? */
+static inline aufs_bindex_t au_fbtop(struct file *file)
+{
+	FiMustAnyLock(file);
+	return au_fi(file)->fi_btop;
+}
+
+static inline aufs_bindex_t au_fbbot_dir(struct file *file)
+{
+	FiMustAnyLock(file);
+	AuDebugOn(!au_fi(file)->fi_hdir);
+	return au_fi(file)->fi_hdir->fd_bbot;
+}
+
+static inline struct au_vdir *au_fvdir_cache(struct file *file)
+{
+	FiMustAnyLock(file);
+	AuDebugOn(!au_fi(file)->fi_hdir);
+	return au_fi(file)->fi_hdir->fd_vdir_cache;
+}
+
+static inline void au_set_fbtop(struct file *file, aufs_bindex_t bindex)
+{
+	FiMustWriteLock(file);
+	au_fi(file)->fi_btop = bindex;
+}
+
+static inline void au_set_fbbot_dir(struct file *file, aufs_bindex_t bindex)
+{
+	FiMustWriteLock(file);
+	AuDebugOn(!au_fi(file)->fi_hdir);
+	au_fi(file)->fi_hdir->fd_bbot = bindex;
+}
+
+static inline void au_set_fvdir_cache(struct file *file,
+				      struct au_vdir *vdir_cache)
+{
+	FiMustWriteLock(file);
+	AuDebugOn(!au_fi(file)->fi_hdir);
+	au_fi(file)->fi_hdir->fd_vdir_cache = vdir_cache;
+}
+
+static inline struct file *au_hf_top(struct file *file)
+{
+	FiMustAnyLock(file);
+	AuDebugOn(au_fi(file)->fi_hdir);
+	return au_fi(file)->fi_htop.hf_file;
+}
+
+static inline struct file *au_hf_dir(struct file *file, aufs_bindex_t bindex)
+{
+	FiMustAnyLock(file);
+	AuDebugOn(!au_fi(file)->fi_hdir);
+	return au_fi(file)->fi_hdir->fd_hfile[0 + bindex].hf_file;
+}
+
+/* todo: memory barrier? */
+static inline unsigned int au_figen(struct file *f)
+{
+	return atomic_read(&au_fi(f)->fi_generation);
+}
+
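+/*
+ * fi_mmapped counts the active mmaps of this file.  If the counter wraps
+ * around to zero on increment, warn and keep incrementing until it becomes
+ * non-zero again, so that au_test_mmapped() keeps reporting "mapped".
+ */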
+static inline void au_set_mmapped(struct file *f)
+{
+	if (atomic_inc_return(&au_fi(f)->fi_mmapped))
+		return;
+	pr_warn("fi_mmapped wrapped around\n");
+	while (!atomic_inc_return(&au_fi(f)->fi_mmapped))
+		;
+}
+
+static inline void au_unset_mmapped(struct file *f)
+{
+	atomic_dec(&au_fi(f)->fi_mmapped);
+}
+
+static inline int au_test_mmapped(struct file *f)
+{
+	return atomic_read(&au_fi(f)->fi_mmapped);
+}
+
+/* customize vma->vm_file */
+
+static inline void au_do_vm_file_reset(struct vm_area_struct *vma,
+				       struct file *file)
+{
+	struct file *f;
+
+	f = vma->vm_file;
+	get_file(file);
+	vma->vm_file = file;
+	fput(f);
+}
+
+#ifdef CONFIG_MMU
+#define AuDbgVmRegion(file, vma) do {} while (0)
+
+static inline void au_vm_file_reset(struct vm_area_struct *vma,
+				    struct file *file)
+{
+	au_do_vm_file_reset(vma, file);
+}
+#else
+#define AuDbgVmRegion(file, vma) \
+	AuDebugOn((vma)->vm_region && (vma)->vm_region->vm_file != (file))
+
+static inline void au_vm_file_reset(struct vm_area_struct *vma,
+				    struct file *file)
+{
+	struct file *f;
+
+	au_do_vm_file_reset(vma, file);
+	f = vma->vm_region->vm_file;
+	get_file(file);
+	vma->vm_region->vm_file = file;
+	fput(f);
+}
+#endif /* CONFIG_MMU */
+
+/* handle vma->vm_prfile */
+static inline void au_vm_prfile_set(struct vm_area_struct *vma,
+				    struct file *file)
+{
+	get_file(file);
+	vma->vm_prfile = file;
+#ifndef CONFIG_MMU
+	get_file(file);
+	vma->vm_region->vm_prfile = file;
+#endif
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_FILE_H__ */
diff --git b/fs/aufs/finfo.c b/fs/aufs/finfo.c
new file mode 100644
index 0000000..0f016ed
--- /dev/null
+++ b/fs/aufs/finfo.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * file private data
+ */
+
+#include "aufs.h"
+
+void au_hfput(struct au_hfile *hf, int execed)
+{
+	if (execed)
+		allow_write_access(hf->hf_file);
+	fput(hf->hf_file);
+	hf->hf_file = NULL;
+	au_br_put(hf->hf_br);
+	hf->hf_br = NULL;
+}
+
+void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, struct file *val)
+{
+	struct au_finfo *finfo = au_fi(file);
+	struct au_hfile *hf;
+	struct au_fidir *fidir;
+
+	fidir = finfo->fi_hdir;
+	if (!fidir) {
+		AuDebugOn(finfo->fi_btop != bindex);
+		hf = &finfo->fi_htop;
+	} else
+		hf = fidir->fd_hfile + bindex;
+
+	if (hf && hf->hf_file)
+		au_hfput(hf, vfsub_file_execed(file));
+	if (val) {
+		FiMustWriteLock(file);
+		AuDebugOn(IS_ERR_OR_NULL(file->f_path.dentry));
+		hf->hf_file = val;
+		hf->hf_br = au_sbr(file->f_path.dentry->d_sb, bindex);
+	}
+}
+
+void au_update_figen(struct file *file)
+{
+	atomic_set(&au_fi(file)->fi_generation, au_digen(file->f_path.dentry));
+	/* smp_mb(); */ /* atomic_set */
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct au_fidir *au_fidir_alloc(struct super_block *sb)
+{
+	struct au_fidir *fidir;
+	int nbr;
+
+	nbr = au_sbbot(sb) + 1;
+	if (nbr < 2)
+		nbr = 2; /* initial allocation for 2 branches */
+	fidir = kzalloc(au_fidir_sz(nbr), GFP_NOFS);
+	if (fidir) {
+		fidir->fd_bbot = -1;
+		fidir->fd_nent = nbr;
+	}
+
+	return fidir;
+}
+
+int au_fidir_realloc(struct au_finfo *finfo, int nbr, int may_shrink)
+{
+	int err;
+	struct au_fidir *fidir, *p;
+
+	AuRwMustWriteLock(&finfo->fi_rwsem);
+	fidir = finfo->fi_hdir;
+	AuDebugOn(!fidir);
+
+	err = -ENOMEM;
+	p = au_kzrealloc(fidir, au_fidir_sz(fidir->fd_nent), au_fidir_sz(nbr),
+			 GFP_NOFS, may_shrink);
+	if (p) {
+		p->fd_nent = nbr;
+		finfo->fi_hdir = p;
+		err = 0;
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_finfo_fin(struct file *file, int atonce)
+{
+	struct au_finfo *finfo;
+
+	au_nfiles_dec(file->f_path.dentry->d_sb);
+
+	finfo = au_fi(file);
+	AuDebugOn(finfo->fi_hdir);
+	AuRwDestroy(&finfo->fi_rwsem);
+	if (!atonce)
+		au_cache_dfree_finfo(finfo);
+	else
+		au_cache_free_finfo(finfo);
+}
+
+void au_fi_init_once(void *_finfo)
+{
+	struct au_finfo *finfo = _finfo;
+
+	au_rw_init(&finfo->fi_rwsem);
+}
+
+int au_finfo_init(struct file *file, struct au_fidir *fidir)
+{
+	int err;
+	struct au_finfo *finfo;
+	struct dentry *dentry;
+
+	err = -ENOMEM;
+	dentry = file->f_path.dentry;
+	finfo = au_cache_alloc_finfo();
+	if (unlikely(!finfo))
+		goto out;
+
+	err = 0;
+	au_nfiles_inc(dentry->d_sb);
+	au_rw_write_lock(&finfo->fi_rwsem);
+	finfo->fi_btop = -1;
+	finfo->fi_hdir = fidir;
+	atomic_set(&finfo->fi_generation, au_digen(dentry));
+	/* smp_mb(); */ /* atomic_set */
+
+	file->private_data = finfo;
+
+out:
+	return err;
+}
diff --git b/fs/aufs/fstype.h b/fs/aufs/fstype.h
new file mode 100644
index 0000000..95b78b9
--- /dev/null
+++ b/fs/aufs/fstype.h
@@ -0,0 +1,387 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * judging filesystem type
+ */
+
+#ifndef __AUFS_FSTYPE_H__
+#define __AUFS_FSTYPE_H__
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <linux/nfs_fs.h>
+#include <linux/romfs_fs.h>
+
+static inline int au_test_aufs(struct super_block *sb)
+{
+	return sb->s_magic == AUFS_SUPER_MAGIC;
+}
+
+static inline const char *au_sbtype(struct super_block *sb)
+{
+	return sb->s_type->name;
+}
+
+static inline int au_test_iso9660(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_ISO9660_FS)
+	return sb->s_magic == ISOFS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_romfs(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_ROMFS_FS)
+	return sb->s_magic == ROMFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_cramfs(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_CRAMFS)
+	return sb->s_magic == CRAMFS_MAGIC;
+#endif
+	return 0;
+}
+
+static inline int au_test_nfs(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_NFS_FS)
+	return sb->s_magic == NFS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_fuse(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_FUSE_FS)
+	return sb->s_magic == FUSE_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_xfs(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_XFS_FS)
+	return sb->s_magic == XFS_SB_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_tmpfs(struct super_block *sb __maybe_unused)
+{
+#ifdef CONFIG_TMPFS
+	return sb->s_magic == TMPFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_ecryptfs(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_ECRYPT_FS)
+	return !strcmp(au_sbtype(sb), "ecryptfs");
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_ramfs(struct super_block *sb)
+{
+	return sb->s_magic == RAMFS_MAGIC;
+}
+
+static inline int au_test_ubifs(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_UBIFS_FS)
+	return sb->s_magic == UBIFS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_procfs(struct super_block *sb __maybe_unused)
+{
+#ifdef CONFIG_PROC_FS
+	return sb->s_magic == PROC_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_sysfs(struct super_block *sb __maybe_unused)
+{
+#ifdef CONFIG_SYSFS
+	return sb->s_magic == SYSFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_configfs(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_CONFIGFS_FS)
+	return sb->s_magic == CONFIGFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_minix(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_MINIX_FS)
+	return sb->s_magic == MINIX3_SUPER_MAGIC
+		|| sb->s_magic == MINIX2_SUPER_MAGIC
+		|| sb->s_magic == MINIX2_SUPER_MAGIC2
+		|| sb->s_magic == MINIX_SUPER_MAGIC
+		|| sb->s_magic == MINIX_SUPER_MAGIC2;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_fat(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_FAT_FS)
+	return sb->s_magic == MSDOS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_msdos(struct super_block *sb)
+{
+	return au_test_fat(sb);
+}
+
+static inline int au_test_vfat(struct super_block *sb)
+{
+	return au_test_fat(sb);
+}
+
+static inline int au_test_securityfs(struct super_block *sb __maybe_unused)
+{
+#ifdef CONFIG_SECURITYFS
+	return sb->s_magic == SECURITYFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_squashfs(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_SQUASHFS)
+	return sb->s_magic == SQUASHFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_btrfs(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_BTRFS_FS)
+	return sb->s_magic == BTRFS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_xenfs(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_XENFS)
+	return sb->s_magic == XENFS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_debugfs(struct super_block *sb __maybe_unused)
+{
+#ifdef CONFIG_DEBUG_FS
+	return sb->s_magic == DEBUGFS_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_nilfs(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_NILFS)
+	return sb->s_magic == NILFS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+static inline int au_test_hfsplus(struct super_block *sb __maybe_unused)
+{
+#if IS_ENABLED(CONFIG_HFSPLUS_FS)
+	return sb->s_magic == HFSPLUS_SUPER_MAGIC;
+#else
+	return 0;
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+/*
+ * they can't be an aufs branch.
+ */
+static inline int au_test_fs_unsuppoted(struct super_block *sb)
+{
+	return
+#ifndef CONFIG_AUFS_BR_RAMFS
+		au_test_ramfs(sb) ||
+#endif
+		au_test_procfs(sb)
+		|| au_test_sysfs(sb)
+		|| au_test_configfs(sb)
+		|| au_test_debugfs(sb)
+		|| au_test_securityfs(sb)
+		|| au_test_xenfs(sb)
+		|| au_test_ecryptfs(sb)
+		/* || !strcmp(au_sbtype(sb), "unionfs") */
+		|| au_test_aufs(sb); /* will be supported in next version */
+}
+
+static inline int au_test_fs_remote(struct super_block *sb)
+{
+	return !au_test_tmpfs(sb)
+#ifdef CONFIG_AUFS_BR_RAMFS
+		&& !au_test_ramfs(sb)
+#endif
+		&& !(sb->s_type->fs_flags & FS_REQUIRES_DEV);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * Note: these functions (below) were created after reading ->getattr() in all
+ * filesystems under linux/fs. It means we have to do the same on every update...
+ */
+
+/*
+ * Some filesystems require getattr to refresh the inode attributes before
+ * referencing them.
+ * In most cases we can rely on the inode attributes of NFS (or any remote fs)
+ * and leave the work to d_revalidate().
+ */
+static inline int au_test_fs_refresh_iattr(struct super_block *sb)
+{
+	return au_test_nfs(sb)
+		|| au_test_fuse(sb)
+		/* || au_test_btrfs(sb) */	/* untested */
+		;
+}
+
+/*
+ * filesystems which don't maintain i_size or i_blocks.
+ */
+static inline int au_test_fs_bad_iattr_size(struct super_block *sb)
+{
+	return au_test_xfs(sb)
+		|| au_test_btrfs(sb)
+		|| au_test_ubifs(sb)
+		|| au_test_hfsplus(sb)	/* maintained, but incorrect */
+		/* || au_test_minix(sb) */	/* untested */
+		;
+}
+
+/*
+ * filesystems which don't store the correct value in some of their inode
+ * attributes.
+ */
+static inline int au_test_fs_bad_iattr(struct super_block *sb)
+{
+	return au_test_fs_bad_iattr_size(sb)
+		|| au_test_fat(sb)
+		|| au_test_msdos(sb)
+		|| au_test_vfat(sb);
+}
+
+/* they don't check i_nlink in link(2) */
+static inline int au_test_fs_no_limit_nlink(struct super_block *sb)
+{
+	return au_test_tmpfs(sb)
+#ifdef CONFIG_AUFS_BR_RAMFS
+		|| au_test_ramfs(sb)
+#endif
+		|| au_test_ubifs(sb)
+		|| au_test_hfsplus(sb);
+}
+
+/*
+ * filesystems which set S_NOATIME and S_NOCMTIME.
+ */
+static inline int au_test_fs_notime(struct super_block *sb)
+{
+	return au_test_nfs(sb)
+		|| au_test_fuse(sb)
+		|| au_test_ubifs(sb)
+		;
+}
+
+/* temporary support for inode number 1 (i#1) in cramfs */
+static inline int au_test_fs_unique_ino(struct inode *inode)
+{
+	if (au_test_cramfs(inode->i_sb))
+		return inode->i_ino != 1;
+	return 1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * the filesystem where the xino files are placed must support i/o after unlink
+ * and maintain i_size and i_blocks.
+ */
+static inline int au_test_fs_bad_xino(struct super_block *sb)
+{
+	return au_test_fs_remote(sb)
+		|| au_test_fs_bad_iattr_size(sb)
+		/* don't want unnecessary work for xino */
+		|| au_test_aufs(sb)
+		|| au_test_ecryptfs(sb)
+		|| au_test_nilfs(sb);
+}
+
+static inline int au_test_fs_trunc_xino(struct super_block *sb)
+{
+	return au_test_tmpfs(sb)
+		|| au_test_ramfs(sb);
+}
+
+/*
+ * test if the @sb is real-readonly.
+ */
+static inline int au_test_fs_rr(struct super_block *sb)
+{
+	return au_test_squashfs(sb)
+		|| au_test_iso9660(sb)
+		|| au_test_cramfs(sb)
+		|| au_test_romfs(sb);
+}
+
+/*
+ * test if the @inode is NFS with the 'noacl' option.
+ * NFS always sets MS_POSIXACL regardless of its mount option 'noacl'.
+ */
+static inline int au_test_nfs_noacl(struct inode *inode)
+{
+	return au_test_nfs(inode->i_sb)
+		/* && IS_POSIXACL(inode) */
+		&& !nfs_server_capable(inode, NFS_CAP_ACLS);
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_FSTYPE_H__ */
diff --git b/fs/aufs/hfsnotify.c b/fs/aufs/hfsnotify.c
new file mode 100644
index 0000000..110f8f5
--- /dev/null
+++ b/fs/aufs/hfsnotify.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * fsnotify for the lower directories
+ */
+
+#include "aufs.h"
+
+/* FS_IN_IGNORED is unnecessary */
+static const __u32 AuHfsnMask = (FS_MOVED_TO | FS_MOVED_FROM | FS_DELETE
+				 | FS_CREATE | FS_EVENT_ON_CHILD);
+static DECLARE_WAIT_QUEUE_HEAD(au_hfsn_wq);
+static __cacheline_aligned_in_smp atomic64_t au_hfsn_ifree = ATOMIC64_INIT(0);
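+/*
+ * au_hfsn_ifree counts the marks whose release is still pending: it is
+ * incremented in au_hfsn_free() and decremented in au_hfsn_free_mark(),
+ * which wakes au_hfsn_wq.  au_hfsn_fin() waits until it drops to zero.
+ */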
+
+static void au_hfsn_free_mark(struct fsnotify_mark *mark)
+{
+	struct au_hnotify *hn = container_of(mark, struct au_hnotify,
+					     hn_mark);
+	/* AuDbg("here\n"); */
+	au_cache_dfree_hnotify(hn);
+	smp_mb__before_atomic();
+	if (atomic64_dec_and_test(&au_hfsn_ifree))
+		wake_up(&au_hfsn_wq);
+}
+
+static int au_hfsn_alloc(struct au_hinode *hinode)
+{
+	int err;
+	struct au_hnotify *hn;
+	struct super_block *sb;
+	struct au_branch *br;
+	struct fsnotify_mark *mark;
+	aufs_bindex_t bindex;
+
+	hn = hinode->hi_notify;
+	sb = hn->hn_aufs_inode->i_sb;
+	bindex = au_br_index(sb, hinode->hi_id);
+	br = au_sbr(sb, bindex);
+	AuDebugOn(!br->br_hfsn);
+
+	mark = &hn->hn_mark;
+	fsnotify_init_mark(mark, au_hfsn_free_mark);
+	mark->mask = AuHfsnMask;
+	/*
+	 * on a rename or rmdir detected by udba, aufs assigns a new inode to
+	 * the known h_inode, so specify 1 to allow dups.
+	 */
+	lockdep_off();
+	err = fsnotify_add_mark(mark, br->br_hfsn->hfsn_group, hinode->hi_inode,
+				 /*mnt*/NULL, /*allow_dups*/1);
+	lockdep_on();
+
+	return err;
+}
+
+static int au_hfsn_free(struct au_hinode *hinode, struct au_hnotify *hn)
+{
+	struct fsnotify_mark *mark;
+	unsigned long long ull;
+	struct fsnotify_group *group;
+
+	ull = atomic64_inc_return(&au_hfsn_ifree);
+	BUG_ON(!ull);
+
+	mark = &hn->hn_mark;
+	spin_lock(&mark->lock);
+	group = mark->group;
+	fsnotify_get_group(group);
+	spin_unlock(&mark->lock);
+	lockdep_off();
+	fsnotify_destroy_mark(mark, group);
+	fsnotify_put_mark(mark);
+	fsnotify_put_group(group);
+	lockdep_on();
+
+	/* hn will be freed by au_hfsn_free_mark(); tell the caller not to */
+	return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void au_hfsn_ctl(struct au_hinode *hinode, int do_set)
+{
+	struct fsnotify_mark *mark;
+
+	mark = &hinode->hi_notify->hn_mark;
+	spin_lock(&mark->lock);
+	if (do_set) {
+		AuDebugOn(mark->mask & AuHfsnMask);
+		mark->mask |= AuHfsnMask;
+	} else {
+		AuDebugOn(!(mark->mask & AuHfsnMask));
+		mark->mask &= ~AuHfsnMask;
+	}
+	spin_unlock(&mark->lock);
+	/* fsnotify_recalc_inode_mask(hinode->hi_inode); */
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* #define AuDbgHnotify */
+#ifdef AuDbgHnotify
+static char *au_hfsn_name(u32 mask)
+{
+#ifdef CONFIG_AUFS_DEBUG
+#define test_ret(flag)				\
+	do {					\
+		if (mask & flag)		\
+			return #flag;		\
+	} while (0)
+	test_ret(FS_ACCESS);
+	test_ret(FS_MODIFY);
+	test_ret(FS_ATTRIB);
+	test_ret(FS_CLOSE_WRITE);
+	test_ret(FS_CLOSE_NOWRITE);
+	test_ret(FS_OPEN);
+	test_ret(FS_MOVED_FROM);
+	test_ret(FS_MOVED_TO);
+	test_ret(FS_CREATE);
+	test_ret(FS_DELETE);
+	test_ret(FS_DELETE_SELF);
+	test_ret(FS_MOVE_SELF);
+	test_ret(FS_UNMOUNT);
+	test_ret(FS_Q_OVERFLOW);
+	test_ret(FS_IN_IGNORED);
+	test_ret(FS_ISDIR);
+	test_ret(FS_IN_ONESHOT);
+	test_ret(FS_EVENT_ON_CHILD);
+	return "";
+#undef test_ret
+#else
+	return "??";
+#endif
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static void au_hfsn_free_group(struct fsnotify_group *group)
+{
+	struct au_br_hfsnotify *hfsn = group->private;
+
+	/* AuDbg("here\n"); */
+	au_delayed_kfree(hfsn);
+}
+
+static int au_hfsn_handle_event(struct fsnotify_group *group,
+				struct inode *inode,
+				struct fsnotify_mark *inode_mark,
+				struct fsnotify_mark *vfsmount_mark,
+				u32 mask, void *data, int data_type,
+				const unsigned char *file_name, u32 cookie)
+{
+	int err;
+	struct au_hnotify *hnotify;
+	struct inode *h_dir, *h_inode;
+	struct qstr h_child_qstr = QSTR_INIT(file_name, strlen(file_name));
+
+	AuDebugOn(data_type != FSNOTIFY_EVENT_INODE);
+
+	err = 0;
+	/* if FS_UNMOUNT happens, there must be another bug */
+	AuDebugOn(mask & FS_UNMOUNT);
+	if (mask & (FS_IN_IGNORED | FS_UNMOUNT))
+		goto out;
+
+	h_dir = inode;
+	h_inode = NULL;
+#ifdef AuDbgHnotify
+	au_debug_on();
+	if (1 || h_child_qstr.len != sizeof(AUFS_XINO_FNAME) - 1
+	    || strncmp(h_child_qstr.name, AUFS_XINO_FNAME, h_child_qstr.len)) {
+		AuDbg("i%lu, mask 0x%x %s, hcname %.*s, hi%lu\n",
+		      h_dir->i_ino, mask, au_hfsn_name(mask),
+		      AuLNPair(&h_child_qstr), h_inode ? h_inode->i_ino : 0);
+		/* WARN_ON(1); */
+	}
+	au_debug_off();
+#endif
+
+	AuDebugOn(!inode_mark);
+	hnotify = container_of(inode_mark, struct au_hnotify, hn_mark);
+	err = au_hnotify(h_dir, hnotify, mask, &h_child_qstr, h_inode);
+
+out:
+	return err;
+}
+
+static struct fsnotify_ops au_hfsn_ops = {
+	.handle_event		= au_hfsn_handle_event,
+	.free_group_priv	= au_hfsn_free_group
+};
+
+/* ---------------------------------------------------------------------- */
+
+static void au_hfsn_fin_br(struct au_branch *br)
+{
+	struct au_br_hfsnotify *hfsn;
+
+	hfsn = br->br_hfsn;
+	if (hfsn) {
+		lockdep_off();
+		fsnotify_put_group(hfsn->hfsn_group);
+		lockdep_on();
+	}
+}
+
+static int au_hfsn_init_br(struct au_branch *br, int perm)
+{
+	int err;
+	struct fsnotify_group *group;
+	struct au_br_hfsnotify *hfsn;
+
+	err = 0;
+	br->br_hfsn = NULL;
+	if (!au_br_hnotifyable(perm))
+		goto out;
+
+	err = -ENOMEM;
+	hfsn = kmalloc(sizeof(*hfsn), GFP_NOFS);
+	if (unlikely(!hfsn))
+		goto out;
+
+	err = 0;
+	group = fsnotify_alloc_group(&au_hfsn_ops);
+	if (IS_ERR(group)) {
+		err = PTR_ERR(group);
+		pr_err("fsnotify_alloc_group() failed, %d\n", err);
+		goto out_hfsn;
+	}
+
+	group->private = hfsn;
+	hfsn->hfsn_group = group;
+	br->br_hfsn = hfsn;
+	goto out; /* success */
+
+out_hfsn:
+	au_delayed_kfree(hfsn);
+out:
+	return err;
+}
+
+static int au_hfsn_reset_br(unsigned int udba, struct au_branch *br, int perm)
+{
+	int err;
+
+	err = 0;
+	if (!br->br_hfsn)
+		err = au_hfsn_init_br(br, perm);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void au_hfsn_fin(void)
+{
+	AuDbg("au_hfsn_ifree %lld\n", (long long)atomic64_read(&au_hfsn_ifree));
+	wait_event(au_hfsn_wq, !atomic64_read(&au_hfsn_ifree));
+}
+
+const struct au_hnotify_op au_hnotify_op = {
+	.ctl		= au_hfsn_ctl,
+	.alloc		= au_hfsn_alloc,
+	.free		= au_hfsn_free,
+
+	.fin		= au_hfsn_fin,
+
+	.reset_br	= au_hfsn_reset_br,
+	.fin_br		= au_hfsn_fin_br,
+	.init_br	= au_hfsn_init_br
+};
diff --git b/fs/aufs/hfsplus.c b/fs/aufs/hfsplus.c
new file mode 100644
index 0000000..145c6ac
--- /dev/null
+++ b/fs/aufs/hfsplus.c
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2010-2016 Junjiro R. Okajima
+ */
+
+/*
+ * special support for filesystems which acquire an inode mutex
+ * on the final close of a file, e.g. hfsplus.
+ *
+ * This trick is very simple and stupid: just open the file before the really
+ * necessary open, to tell hfsplus that this is not the final close.
+ * The caller should call au_h_open_pre() after acquiring the inode mutex,
+ * and au_h_open_post() after releasing it.
+ */
+
+#include "aufs.h"
+
+struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex,
+			   int force_wr)
+{
+	struct file *h_file;
+	struct dentry *h_dentry;
+
+	h_dentry = au_h_dptr(dentry, bindex);
+	AuDebugOn(!h_dentry);
+	AuDebugOn(d_is_negative(h_dentry));
+
+	h_file = NULL;
+	if (au_test_hfsplus(h_dentry->d_sb)
+	    && d_is_reg(h_dentry))
+		h_file = au_h_open(dentry, bindex,
+				   O_RDONLY | O_NOATIME | O_LARGEFILE,
+				   /*file*/NULL, force_wr);
+	return h_file;
+}
+
+void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex,
+		    struct file *h_file)
+{
+	if (h_file) {
+		fput(h_file);
+		au_sbr_put(dentry->d_sb, bindex);
+	}
+}
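+
+/*
+ * A minimal sketch (unused by aufs itself) of the calling convention described
+ * at the top of this file; "example_final_close" and its arguments are
+ * hypothetical.
+ */
+#if 0
+static void example_final_close(struct dentry *dentry, aufs_bindex_t bindex,
+				struct inode *h_inode)
+{
+	struct file *h_file;
+
+	inode_lock(h_inode);
+	h_file = au_h_open_pre(dentry, bindex, /*force_wr*/0);
+	/* ... the operation which would otherwise be the final close ... */
+	inode_unlock(h_inode);
+	au_h_open_post(dentry, bindex, h_file);
+}
+#endif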
diff --git b/fs/aufs/hnotify.c b/fs/aufs/hnotify.c
new file mode 100644
index 0000000..a05aed8
--- /dev/null
+++ b/fs/aufs/hnotify.c
@@ -0,0 +1,710 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * abstraction to notify aufs of direct changes in the lower directories
+ */
+
+#include "aufs.h"
+
+int au_hn_alloc(struct au_hinode *hinode, struct inode *inode)
+{
+	int err;
+	struct au_hnotify *hn;
+
+	err = -ENOMEM;
+	hn = au_cache_alloc_hnotify();
+	if (hn) {
+		hn->hn_aufs_inode = inode;
+		hinode->hi_notify = hn;
+		err = au_hnotify_op.alloc(hinode);
+		AuTraceErr(err);
+		if (unlikely(err)) {
+			hinode->hi_notify = NULL;
+			au_cache_dfree_hnotify(hn);
+			/*
+			 * The upper dir was removed by udba, but a dir with
+			 * the same name is left. In this case, aufs assigns a
+			 * new inode number and sets the monitor again.
+			 * For the lower dir, the old monitor is still left.
+			 */
+			if (err == -EEXIST)
+				err = 0;
+		}
+	}
+
+	AuTraceErr(err);
+	return err;
+}
+
+void au_hn_free(struct au_hinode *hinode)
+{
+	struct au_hnotify *hn;
+
+	hn = hinode->hi_notify;
+	if (hn) {
+		hinode->hi_notify = NULL;
+		if (au_hnotify_op.free(hinode, hn))
+			au_cache_dfree_hnotify(hn);
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_hn_ctl(struct au_hinode *hinode, int do_set)
+{
+	if (hinode->hi_notify)
+		au_hnotify_op.ctl(hinode, do_set);
+}
+
+void au_hn_reset(struct inode *inode, unsigned int flags)
+{
+	aufs_bindex_t bindex, bbot;
+	struct inode *hi;
+	struct dentry *iwhdentry;
+
+	bbot = au_ibbot(inode);
+	for (bindex = au_ibtop(inode); bindex <= bbot; bindex++) {
+		hi = au_h_iptr(inode, bindex);
+		if (!hi)
+			continue;
+
+		/* inode_lock_nested(hi, AuLsc_I_CHILD); */
+		iwhdentry = au_hi_wh(inode, bindex);
+		if (iwhdentry)
+			dget(iwhdentry);
+		au_igrab(hi);
+		au_set_h_iptr(inode, bindex, NULL, 0);
+		au_set_h_iptr(inode, bindex, au_igrab(hi),
+			      flags & ~AuHi_XINO);
+		iput(hi);
+		dput(iwhdentry);
+		/* inode_unlock(hi); */
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
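+/*
+ * drop the xino translations for all the lower inodes of @inode, but only
+ * when @h_inode is (still) one of them.
+ */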
+static int hn_xino(struct inode *inode, struct inode *h_inode)
+{
+	int err;
+	aufs_bindex_t bindex, bbot, bfound, btop;
+	struct inode *h_i;
+
+	err = 0;
+	if (unlikely(inode->i_ino == AUFS_ROOT_INO)) {
+		pr_warn("branch root dir was changed\n");
+		goto out;
+	}
+
+	bfound = -1;
+	bbot = au_ibbot(inode);
+	btop = au_ibtop(inode);
+#if 0 /* reserved for future use */
+	if (bindex == bbot) {
+		/* keep this ino in rename case */
+		goto out;
+	}
+#endif
+	for (bindex = btop; bindex <= bbot; bindex++)
+		if (au_h_iptr(inode, bindex) == h_inode) {
+			bfound = bindex;
+			break;
+		}
+	if (bfound < 0)
+		goto out;
+
+	for (bindex = btop; bindex <= bbot; bindex++) {
+		h_i = au_h_iptr(inode, bindex);
+		if (!h_i)
+			continue;
+
+		err = au_xino_write(inode->i_sb, bindex, h_i->i_ino, /*ino*/0);
+		/* ignore this error */
+		/* bad action? */
+	}
+
+	/* the children's inode numbers will be broken */
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static int hn_gen_tree(struct dentry *dentry)
+{
+	int err, i, j, ndentry;
+	struct au_dcsub_pages dpages;
+	struct au_dpage *dpage;
+	struct dentry **dentries;
+
+	err = au_dpages_init(&dpages, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+	err = au_dcsub_pages(&dpages, dentry, NULL, NULL);
+	if (unlikely(err))
+		goto out_dpages;
+
+	for (i = 0; i < dpages.ndpage; i++) {
+		dpage = dpages.dpages + i;
+		dentries = dpage->dentries;
+		ndentry = dpage->ndentry;
+		for (j = 0; j < ndentry; j++) {
+			struct dentry *d;
+
+			d = dentries[j];
+			if (IS_ROOT(d))
+				continue;
+
+			au_digen_dec(d);
+			if (d_really_is_positive(d))
+				/* todo: reset children xino?
+				   cached children only? */
+				au_iigen_dec(d_inode(d));
+		}
+	}
+
+out_dpages:
+	au_dpages_free(&dpages);
+
+#if 0
+	/* discard children */
+	dentry_unhash(dentry);
+	dput(dentry);
+#endif
+out:
+	return err;
+}
+
+/*
+ * return 0 if processed.
+ */
+static int hn_gen_by_inode(char *name, unsigned int nlen, struct inode *inode,
+			   const unsigned int isdir)
+{
+	int err;
+	struct dentry *d;
+	struct qstr *dname;
+
+	err = 1;
+	if (unlikely(inode->i_ino == AUFS_ROOT_INO)) {
+		pr_warn("branch root dir was changed\n");
+		err = 0;
+		goto out;
+	}
+
+	if (!isdir) {
+		AuDebugOn(!name);
+		au_iigen_dec(inode);
+		spin_lock(&inode->i_lock);
+		hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) {
+			spin_lock(&d->d_lock);
+			dname = &d->d_name;
+			if (dname->len != nlen
+			    || memcmp(dname->name, name, nlen)) {
+				spin_unlock(&d->d_lock);
+				continue;
+			}
+			err = 0;
+			au_digen_dec(d);
+			spin_unlock(&d->d_lock);
+			break;
+		}
+		spin_unlock(&inode->i_lock);
+	} else {
+		au_fset_si(au_sbi(inode->i_sb), FAILED_REFRESH_DIR);
+		d = d_find_any_alias(inode);
+		if (!d) {
+			au_iigen_dec(inode);
+			goto out;
+		}
+
+		spin_lock(&d->d_lock);
+		dname = &d->d_name;
+		if (dname->len == nlen && !memcmp(dname->name, name, nlen)) {
+			spin_unlock(&d->d_lock);
+			err = hn_gen_tree(d);
+			spin_lock(&d->d_lock);
+		}
+		spin_unlock(&d->d_lock);
+		dput(d);
+	}
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static int hn_gen_by_name(struct dentry *dentry, const unsigned int isdir)
+{
+	int err;
+
+	if (IS_ROOT(dentry)) {
+		pr_warn("branch root dir was changed\n");
+		return 0;
+	}
+
+	err = 0;
+	if (!isdir) {
+		au_digen_dec(dentry);
+		if (d_really_is_positive(dentry))
+			au_iigen_dec(d_inode(dentry));
+	} else {
+		au_fset_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR);
+		if (d_really_is_positive(dentry))
+			err = hn_gen_tree(dentry);
+	}
+
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* hnotify job flags */
+#define AuHnJob_XINO0		1
+#define AuHnJob_GEN		(1 << 1)
+#define AuHnJob_DIRENT		(1 << 2)
+#define AuHnJob_ISDIR		(1 << 3)
+#define AuHnJob_TRYXINO0	(1 << 4)
+#define AuHnJob_MNTPNT		(1 << 5)
+#define au_ftest_hnjob(flags, name)	((flags) & AuHnJob_##name)
+#define au_fset_hnjob(flags, name) \
+	do { (flags) |= AuHnJob_##name; } while (0)
+#define au_fclr_hnjob(flags, name) \
+	do { (flags) &= ~AuHnJob_##name; } while (0)
+
+enum {
+	AuHn_CHILD,
+	AuHn_PARENT,
+	AuHnLast
+};
+
+struct au_hnotify_args {
+	struct inode *h_dir, *dir, *h_child_inode;
+	u32 mask;
+	unsigned int flags[AuHnLast];
+	unsigned int h_child_nlen;
+	char h_child_name[];
+};
+
+struct hn_job_args {
+	unsigned int flags;
+	struct inode *inode, *h_inode, *dir, *h_dir;
+	struct dentry *dentry;
+	char *h_name;
+	int h_nlen;
+};
+
+static int hn_job(struct hn_job_args *a)
+{
+	const unsigned int isdir = au_ftest_hnjob(a->flags, ISDIR);
+	int e;
+
+	/* reset xino */
+	if (au_ftest_hnjob(a->flags, XINO0) && a->inode)
+		hn_xino(a->inode, a->h_inode); /* ignore this error */
+
+	if (au_ftest_hnjob(a->flags, TRYXINO0)
+	    && a->inode
+	    && a->h_inode) {
+		inode_lock_nested(a->h_inode, AuLsc_I_CHILD);
+		if (!a->h_inode->i_nlink
+		    && !(a->h_inode->i_state & I_LINKABLE))
+			hn_xino(a->inode, a->h_inode); /* ignore this error */
+		inode_unlock(a->h_inode);
+	}
+
+	/* make the generation obsolete */
+	if (au_ftest_hnjob(a->flags, GEN)) {
+		e = -1;
+		if (a->inode)
+			e = hn_gen_by_inode(a->h_name, a->h_nlen, a->inode,
+					      isdir);
+		if (e && a->dentry)
+			hn_gen_by_name(a->dentry, isdir);
+		/* ignore this error */
+	}
+
+	/* make dir entries obsolete */
+	if (au_ftest_hnjob(a->flags, DIRENT) && a->inode) {
+		struct au_vdir *vdir;
+
+		vdir = au_ivdir(a->inode);
+		if (vdir)
+			vdir->vd_jiffy = 0;
+		/* IMustLock(a->inode); */
+		/* a->inode->i_version++; */
+	}
+
+	/* can do nothing but warn */
+	if (au_ftest_hnjob(a->flags, MNTPNT)
+	    && a->dentry
+	    && d_mountpoint(a->dentry))
+		pr_warn("mount-point %pd is removed or renamed\n", a->dentry);
+
+	return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
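+/*
+ * look up the children of @dir whose name matches @name, decrement their
+ * generation, and return the first one still in use with its dinfo
+ * write-locked (or NULL).
+ */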
+static struct dentry *lookup_wlock_by_name(char *name, unsigned int nlen,
+					   struct inode *dir)
+{
+	struct dentry *dentry, *d, *parent;
+	struct qstr *dname;
+
+	parent = d_find_any_alias(dir);
+	if (!parent)
+		return NULL;
+
+	dentry = NULL;
+	spin_lock(&parent->d_lock);
+	list_for_each_entry(d, &parent->d_subdirs, d_child) {
+		/* AuDbg("%pd\n", d); */
+		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
+		dname = &d->d_name;
+		if (dname->len != nlen || memcmp(dname->name, name, nlen))
+			goto cont_unlock;
+		if (au_di(d))
+			au_digen_dec(d);
+		else
+			goto cont_unlock;
+		if (au_dcount(d) > 0) {
+			dentry = dget_dlock(d);
+			spin_unlock(&d->d_lock);
+			break;
+		}
+
+cont_unlock:
+		spin_unlock(&d->d_lock);
+	}
+	spin_unlock(&parent->d_lock);
+	dput(parent);
+
+	if (dentry)
+		di_write_lock_child(dentry);
+
+	return dentry;
+}
+
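+/*
+ * translate the lower @h_ino on branch @bindex into an aufs inode via xino,
+ * and return it ii write-locked; NULL when it is not cached or it is the
+ * aufs root.
+ */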
+static struct inode *lookup_wlock_by_ino(struct super_block *sb,
+					 aufs_bindex_t bindex, ino_t h_ino)
+{
+	struct inode *inode;
+	ino_t ino;
+	int err;
+
+	inode = NULL;
+	err = au_xino_read(sb, bindex, h_ino, &ino);
+	if (!err && ino)
+		inode = ilookup(sb, ino);
+	if (!inode)
+		goto out;
+
+	if (unlikely(inode->i_ino == AUFS_ROOT_INO)) {
+		pr_warn("wrong root branch\n");
+		iput(inode);
+		inode = NULL;
+		goto out;
+	}
+
+	ii_write_lock_child(inode);
+
+out:
+	return inode;
+}
+
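+/*
+ * the bottom half of the notification: runs on the aufs workqueue (cf.
+ * au_wkq_nowait() in au_hnotify() below) with the au_hnotify_args built
+ * there, and consumes the references and frees @_args when done.
+ */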
+static void au_hn_bh(void *_args)
+{
+	struct au_hnotify_args *a = _args;
+	struct super_block *sb;
+	aufs_bindex_t bindex, bbot, bfound;
+	unsigned char xino, try_iput;
+	int err;
+	struct inode *inode;
+	ino_t h_ino;
+	struct hn_job_args args;
+	struct dentry *dentry;
+	struct au_sbinfo *sbinfo;
+
+	AuDebugOn(!_args);
+	AuDebugOn(!a->h_dir);
+	AuDebugOn(!a->dir);
+	AuDebugOn(!a->mask);
+	AuDbg("mask 0x%x, i%lu, hi%lu, hci%lu\n",
+	      a->mask, a->dir->i_ino, a->h_dir->i_ino,
+	      a->h_child_inode ? a->h_child_inode->i_ino : 0);
+
+	inode = NULL;
+	dentry = NULL;
+	/*
+	 * do not lock a->dir->i_mutex here
+	 * because d_revalidate() may cause a deadlock.
+	 */
+	sb = a->dir->i_sb;
+	AuDebugOn(!sb);
+	sbinfo = au_sbi(sb);
+	AuDebugOn(!sbinfo);
+	si_write_lock(sb, AuLock_NOPLMW);
+
+	ii_read_lock_parent(a->dir);
+	bfound = -1;
+	bbot = au_ibbot(a->dir);
+	for (bindex = au_ibtop(a->dir); bindex <= bbot; bindex++)
+		if (au_h_iptr(a->dir, bindex) == a->h_dir) {
+			bfound = bindex;
+			break;
+		}
+	ii_read_unlock(a->dir);
+	if (unlikely(bfound < 0))
+		goto out;
+
+	xino = !!au_opt_test(au_mntflags(sb), XINO);
+	h_ino = 0;
+	if (a->h_child_inode)
+		h_ino = a->h_child_inode->i_ino;
+
+	if (a->h_child_nlen
+	    && (au_ftest_hnjob(a->flags[AuHn_CHILD], GEN)
+		|| au_ftest_hnjob(a->flags[AuHn_CHILD], MNTPNT)))
+		dentry = lookup_wlock_by_name(a->h_child_name, a->h_child_nlen,
+					      a->dir);
+	try_iput = 0;
+	if (dentry && d_really_is_positive(dentry))
+		inode = d_inode(dentry);
+	if (xino && !inode && h_ino
+	    && (au_ftest_hnjob(a->flags[AuHn_CHILD], XINO0)
+		|| au_ftest_hnjob(a->flags[AuHn_CHILD], TRYXINO0)
+		|| au_ftest_hnjob(a->flags[AuHn_CHILD], GEN))) {
+		inode = lookup_wlock_by_ino(sb, bfound, h_ino);
+		try_iput = 1;
+	}
+
+	args.flags = a->flags[AuHn_CHILD];
+	args.dentry = dentry;
+	args.inode = inode;
+	args.h_inode = a->h_child_inode;
+	args.dir = a->dir;
+	args.h_dir = a->h_dir;
+	args.h_name = a->h_child_name;
+	args.h_nlen = a->h_child_nlen;
+	err = hn_job(&args);
+	if (dentry) {
+		if (au_di(dentry))
+			di_write_unlock(dentry);
+		dput(dentry);
+	}
+	if (inode && try_iput) {
+		ii_write_unlock(inode);
+		iput(inode);
+	}
+
+	ii_write_lock_parent(a->dir);
+	args.flags = a->flags[AuHn_PARENT];
+	args.dentry = NULL;
+	args.inode = a->dir;
+	args.h_inode = a->h_dir;
+	args.dir = NULL;
+	args.h_dir = NULL;
+	args.h_name = NULL;
+	args.h_nlen = 0;
+	err = hn_job(&args);
+	ii_write_unlock(a->dir);
+
+out:
+	iput(a->h_child_inode);
+	iput(a->h_dir);
+	iput(a->dir);
+	si_write_unlock(sb);
+	au_nwt_done(&sbinfo->si_nowait);
+	au_delayed_kfree(a);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask,
+	       struct qstr *h_child_qstr, struct inode *h_child_inode)
+{
+	int err, len;
+	unsigned int flags[AuHnLast], f;
+	unsigned char isdir, isroot, wh;
+	struct inode *dir;
+	struct au_hnotify_args *args;
+	char *p, *h_child_name;
+
+	err = 0;
+	AuDebugOn(!hnotify || !hnotify->hn_aufs_inode);
+	dir = igrab(hnotify->hn_aufs_inode);
+	if (!dir)
+		goto out;
+
+	isroot = (dir->i_ino == AUFS_ROOT_INO);
+	wh = 0;
+	h_child_name = (void *)h_child_qstr->name;
+	len = h_child_qstr->len;
+	if (h_child_name) {
+		if (len > AUFS_WH_PFX_LEN
+		    && !memcmp(h_child_name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) {
+			h_child_name += AUFS_WH_PFX_LEN;
+			len -= AUFS_WH_PFX_LEN;
+			wh = 1;
+		}
+	}
+
+	isdir = 0;
+	if (h_child_inode)
+		isdir = !!S_ISDIR(h_child_inode->i_mode);
+	flags[AuHn_PARENT] = AuHnJob_ISDIR;
+	flags[AuHn_CHILD] = 0;
+	if (isdir)
+		flags[AuHn_CHILD] = AuHnJob_ISDIR;
+	au_fset_hnjob(flags[AuHn_PARENT], DIRENT);
+	au_fset_hnjob(flags[AuHn_CHILD], GEN);
+	switch (mask & FS_EVENTS_POSS_ON_CHILD) {
+	case FS_MOVED_FROM:
+	case FS_MOVED_TO:
+		au_fset_hnjob(flags[AuHn_CHILD], XINO0);
+		au_fset_hnjob(flags[AuHn_CHILD], MNTPNT);
+		/*FALLTHROUGH*/
+	case FS_CREATE:
+		AuDebugOn(!h_child_name);
+		break;
+
+	case FS_DELETE:
+		/*
+		 * aufs will never be able to get this child inode.
+		 * revalidation should be done in d_revalidate()
+		 * by checking i_nlink, i_generation or d_unhashed().
+		 */
+		AuDebugOn(!h_child_name);
+		au_fset_hnjob(flags[AuHn_CHILD], TRYXINO0);
+		au_fset_hnjob(flags[AuHn_CHILD], MNTPNT);
+		break;
+
+	default:
+		AuDebugOn(1);
+	}
+
+	if (wh)
+		h_child_inode = NULL;
+
+	err = -ENOMEM;
+	/* iput() and kfree() will be called in au_hn_bh(), or below on error */
+	args = kmalloc(sizeof(*args) + len + 1, GFP_NOFS);
+	if (unlikely(!args)) {
+		AuErr1("no memory\n");
+		iput(dir);
+		goto out;
+	}
+	args->flags[AuHn_PARENT] = flags[AuHn_PARENT];
+	args->flags[AuHn_CHILD] = flags[AuHn_CHILD];
+	args->mask = mask;
+	args->dir = dir;
+	args->h_dir = igrab(h_dir);
+	if (h_child_inode)
+		h_child_inode = igrab(h_child_inode); /* can be NULL */
+	args->h_child_inode = h_child_inode;
+	args->h_child_nlen = len;
+	if (len) {
+		p = (void *)args;
+		p += sizeof(*args);
+		memcpy(p, h_child_name, len);
+		p[len] = 0;
+	}
+
+	/* NFS fires the event for a silly-renamed file from a kworker */
+	f = 0;
+	if (!dir->i_nlink
+	    || (au_test_nfs(h_dir->i_sb) && (mask & FS_DELETE)))
+		f = AuWkq_NEST;
+	err = au_wkq_nowait(au_hn_bh, args, dir->i_sb, f);
+	if (unlikely(err)) {
+		pr_err("wkq %d\n", err);
+		iput(args->h_child_inode);
+		iput(args->h_dir);
+		iput(args->dir);
+		au_delayed_kfree(args);
+	}
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm)
+{
+	int err;
+
+	AuDebugOn(!(udba & AuOptMask_UDBA));
+
+	err = 0;
+	if (au_hnotify_op.reset_br)
+		err = au_hnotify_op.reset_br(udba, br, perm);
+
+	return err;
+}
+
+int au_hnotify_init_br(struct au_branch *br, int perm)
+{
+	int err;
+
+	err = 0;
+	if (au_hnotify_op.init_br)
+		err = au_hnotify_op.init_br(br, perm);
+
+	return err;
+}
+
+void au_hnotify_fin_br(struct au_branch *br)
+{
+	if (au_hnotify_op.fin_br)
+		au_hnotify_op.fin_br(br);
+}
+
+static void au_hn_destroy_cache(void)
+{
+	struct au_cache *cp;
+
+	flush_delayed_work(&au_dfree.dwork);
+	cp = au_dfree.cache + AuCache_HNOTIFY;
+	AuDebugOn(!llist_empty(&cp->llist));
+	kmem_cache_destroy(cp->cache);
+	cp->cache = NULL;
+}
+
+AU_CACHE_DFREE_FUNC(hnotify, HNOTIFY, hn_lnode);
+
+int __init au_hnotify_init(void)
+{
+	int err;
+	struct au_cache *cp;
+
+	err = -ENOMEM;
+	cp = au_dfree.cache + AuCache_HNOTIFY;
+	cp->cache = AuCache(au_hnotify);
+	if (cp->cache) {
+		err = 0;
+		if (au_hnotify_op.init)
+			err = au_hnotify_op.init();
+		if (unlikely(err))
+			au_hn_destroy_cache();
+	}
+	AuTraceErr(err);
+	return err;
+}
+
+void au_hnotify_fin(void)
+{
+	struct au_cache *cp;
+
+	if (au_hnotify_op.fin)
+		au_hnotify_op.fin();
+
+	/* cf. au_cache_fin() */
+	cp = au_dfree.cache + AuCache_HNOTIFY;
+	if (cp->cache)
+		au_hn_destroy_cache();
+}
diff --git b/fs/aufs/i_op.c b/fs/aufs/i_op.c
new file mode 100644
index 0000000..c730ec7
--- /dev/null
+++ b/fs/aufs/i_op.c
@@ -0,0 +1,1438 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode operations (except add/del/rename)
+ */
+
+#include <linux/device_cgroup.h>
+#include <linux/fs_stack.h>
+#include <linux/namei.h>
+#include <linux/security.h>
+#include "aufs.h"
+
+static int h_permission(struct inode *h_inode, int mask,
+			struct path *h_path, int brperm)
+{
+	int err;
+	const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND));
+
+	err = -EPERM;
+	if (write_mask && IS_IMMUTABLE(h_inode))
+		goto out;
+
+	err = -EACCES;
+	if (((mask & MAY_EXEC)
+	     && S_ISREG(h_inode->i_mode)
+	     && (path_noexec(h_path)
+		 || !(h_inode->i_mode & S_IXUGO))))
+		goto out;
+
+	/*
+	 * - skip the lower fs test in the case of a write to a ro branch.
+	 * - the nfs dir permission write check is optimized, but a policy for
+	 *   link/rename requires a real check.
+	 * - nfs always sets MS_POSIXACL regardless of its mount option 'noacl'.
+	 *   In this case, generic_permission() returns -EOPNOTSUPP.
+	 */
+	if ((write_mask && !au_br_writable(brperm))
+	    || (au_test_nfs(h_inode->i_sb) && S_ISDIR(h_inode->i_mode)
+		&& write_mask && !(mask & MAY_READ))
+	    || !h_inode->i_op->permission) {
+		/* AuLabel(generic_permission); */
+		/* AuDbg("get_acl %pf\n", h_inode->i_op->get_acl); */
+		err = generic_permission(h_inode, mask);
+		if (err == -EOPNOTSUPP && au_test_nfs_noacl(h_inode))
+			err = h_inode->i_op->permission(h_inode, mask);
+		AuTraceErr(err);
+	} else {
+		/* AuLabel(h_inode->permission); */
+		err = h_inode->i_op->permission(h_inode, mask);
+		AuTraceErr(err);
+	}
+
+	if (!err)
+		err = devcgroup_inode_permission(h_inode, mask);
+	if (!err)
+		err = security_inode_permission(h_inode, mask);
+
+#if 0
+	if (!err) {
+		/* todo: do we need to call ima_path_check()? */
+		struct path h_path = {
+			.dentry	=
+			.mnt	= h_mnt
+		};
+		err = ima_path_check(&h_path,
+				     mask & (MAY_READ | MAY_WRITE | MAY_EXEC),
+				     IMA_COUNT_LEAVE);
+	}
+#endif
+
+out:
+	return err;
+}
+
+static int aufs_permission(struct inode *inode, int mask)
+{
+	int err;
+	aufs_bindex_t bindex, bbot;
+	const unsigned char isdir = !!S_ISDIR(inode->i_mode),
+		write_mask = !!(mask & (MAY_WRITE | MAY_APPEND));
+	struct inode *h_inode;
+	struct super_block *sb;
+	struct au_branch *br;
+
+	/* todo: support rcu-walk? */
+	if (mask & MAY_NOT_BLOCK)
+		return -ECHILD;
+
+	sb = inode->i_sb;
+	si_read_lock(sb, AuLock_FLUSH);
+	ii_read_lock_child(inode);
+#if 0
+	err = au_iigen_test(inode, au_sigen(sb));
+	if (unlikely(err))
+		goto out;
+#endif
+
+	if (!isdir
+	    || write_mask
+	    || au_opt_test(au_mntflags(sb), DIRPERM1)) {
+		err = au_busy_or_stale();
+		h_inode = au_h_iptr(inode, au_ibtop(inode));
+		if (unlikely(!h_inode
+			     || (h_inode->i_mode & S_IFMT)
+			     != (inode->i_mode & S_IFMT)))
+			goto out;
+
+		err = 0;
+		bindex = au_ibtop(inode);
+		br = au_sbr(sb, bindex);
+		err = h_permission(h_inode, mask, &br->br_path, br->br_perm);
+		if (write_mask
+		    && !err
+		    && !special_file(h_inode->i_mode)) {
+			/* test whether the upper writable branch exists */
+			err = -EROFS;
+			for (; bindex >= 0; bindex--)
+				if (!au_br_rdonly(au_sbr(sb, bindex))) {
+					err = 0;
+					break;
+				}
+		}
+		goto out;
+	}
+
+	/* non-write to dir */
+	err = 0;
+	bbot = au_ibbot(inode);
+	for (bindex = au_ibtop(inode); !err && bindex <= bbot; bindex++) {
+		h_inode = au_h_iptr(inode, bindex);
+		if (h_inode) {
+			err = au_busy_or_stale();
+			if (unlikely(!S_ISDIR(h_inode->i_mode)))
+				break;
+
+			br = au_sbr(sb, bindex);
+			err = h_permission(h_inode, mask, &br->br_path,
+					   br->br_perm);
+		}
+	}
+
+out:
+	ii_read_unlock(inode);
+	si_read_unlock(sb);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry,
+				  unsigned int flags)
+{
+	struct dentry *ret, *parent;
+	struct inode *inode;
+	struct super_block *sb;
+	int err, npositive;
+
+	IMustLock(dir);
+
+	/* todo: support rcu-walk? */
+	ret = ERR_PTR(-ECHILD);
+	if (flags & LOOKUP_RCU)
+		goto out;
+
+	ret = ERR_PTR(-ENAMETOOLONG);
+	if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN))
+		goto out;
+
+	sb = dir->i_sb;
+	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	ret = ERR_PTR(err);
+	if (unlikely(err))
+		goto out;
+
+	err = au_di_init(dentry);
+	ret = ERR_PTR(err);
+	if (unlikely(err))
+		goto out_si;
+
+	inode = NULL;
+	npositive = 0; /* suppress a warning */
+	parent = dentry->d_parent; /* dir inode is locked */
+	di_read_lock_parent(parent, AuLock_IR);
+	err = au_alive_dir(parent);
+	if (!err)
+		err = au_digen_test(parent, au_sigen(sb));
+	if (!err) {
+		/* regardless of LOOKUP_CREATE, always ALLOW_NEG */
+		npositive = au_lkup_dentry(dentry, au_dbtop(parent),
+					   AuLkup_ALLOW_NEG);
+		err = npositive;
+	}
+	di_read_unlock(parent, AuLock_IR);
+	ret = ERR_PTR(err);
+	if (unlikely(err < 0))
+		goto out_unlock;
+
+	if (npositive) {
+		inode = au_new_inode(dentry, /*must_new*/0);
+		if (IS_ERR(inode)) {
+			ret = (void *)inode;
+			inode = NULL;
+			goto out_unlock;
+		}
+	}
+
+	if (inode)
+		atomic_inc(&inode->i_count);
+	ret = d_splice_alias(inode, dentry);
+#if 0
+	if (unlikely(d_need_lookup(dentry))) {
+		spin_lock(&dentry->d_lock);
+		dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
+		spin_unlock(&dentry->d_lock);
+	} else
+#endif
+	if (inode) {
+		if (!IS_ERR(ret)) {
+			iput(inode);
+			if (ret && ret != dentry)
+				ii_write_unlock(inode);
+		} else {
+			ii_write_unlock(inode);
+			iput(inode);
+			inode = NULL;
+		}
+	}
+
+out_unlock:
+	di_write_unlock(dentry);
+out_si:
+	si_read_unlock(sb);
+out:
+	return ret;
+}
+
+/* ---------------------------------------------------------------------- */
+
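+/*
+ * bridge between aufs_atomic_open() and au_do_aopen(): aufs_atomic_open()
+ * registers an aopen_node carrying the (file, h_file) pair on si_aopen
+ * before calling finish_open(), and au_do_aopen() looks the pair up again
+ * to pass the already-opened lower file to au_do_open().
+ */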
+struct aopen_node {
+	struct hlist_node hlist;
+	struct file *file, *h_file;
+};
+
+static int au_do_aopen(struct inode *inode, struct file *file)
+{
+	struct au_sphlhead *aopen;
+	struct aopen_node *node;
+	struct au_do_open_args args = {
+		.no_lock	= 1,
+		.open		= au_do_open_nondir
+	};
+
+	aopen = &au_sbi(inode->i_sb)->si_aopen;
+	spin_lock(&aopen->spin);
+	hlist_for_each_entry(node, &aopen->head, hlist)
+		if (node->file == file) {
+			args.h_file = node->h_file;
+			break;
+		}
+	spin_unlock(&aopen->spin);
+	/* AuDebugOn(!args.h_file); */
+
+	return au_do_open(file, &args);
+}
+
+static int aufs_atomic_open(struct inode *dir, struct dentry *dentry,
+			    struct file *file, unsigned int open_flag,
+			    umode_t create_mode, int *opened)
+{
+	int err, h_opened = *opened;
+	unsigned int lkup_flags;
+	struct dentry *parent, *d;
+	struct au_sphlhead *aopen;
+	struct vfsub_aopen_args args = {
+		.open_flag	= open_flag,
+		.create_mode	= create_mode,
+		.opened		= &h_opened
+	};
+	struct aopen_node aopen_node = {
+		.file	= file
+	};
+
+	IMustLock(dir);
+	AuDbg("open_flag 0%o\n", open_flag);
+	AuDbgDentry(dentry);
+
+	err = 0;
+	if (!au_di(dentry)) {
+		lkup_flags = LOOKUP_OPEN;
+		if (open_flag & O_CREAT)
+			lkup_flags |= LOOKUP_CREATE;
+		d = aufs_lookup(dir, dentry, lkup_flags);
+		if (IS_ERR(d)) {
+			err = PTR_ERR(d);
+			AuTraceErr(err);
+			goto out;
+		} else if (d) {
+			/*
+			 * an obsoleted dentry was found.
+			 * another error will be returned later.
+			 */
+			d_drop(d);
+			AuDbgDentry(d);
+			dput(d);
+		}
+		AuDbgDentry(dentry);
+	}
+
+	if (d_is_positive(dentry)
+	    || d_unhashed(dentry)
+	    || d_unlinked(dentry)
+	    || !(open_flag & O_CREAT))
+		goto out_no_open;
+
+	err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN);
+	if (unlikely(err))
+		goto out;
+
+	parent = dentry->d_parent;	/* dir is locked */
+	di_write_lock_parent(parent);
+	err = au_lkup_dentry(dentry, /*btop*/0, AuLkup_ALLOW_NEG);
+	if (unlikely(err))
+		goto out_unlock;
+
+	AuDbgDentry(dentry);
+	if (d_is_positive(dentry))
+		goto out_unlock;
+
+	args.file = get_empty_filp();
+	err = PTR_ERR(args.file);
+	if (IS_ERR(args.file))
+		goto out_unlock;
+
+	args.file->f_flags = file->f_flags;
+	err = au_aopen_or_create(dir, dentry, &args);
+	AuTraceErr(err);
+	AuDbgFile(args.file);
+	if (unlikely(err < 0)) {
+		if (h_opened & FILE_OPENED)
+			fput(args.file);
+		else
+			put_filp(args.file);
+		goto out_unlock;
+	}
+
+	/* some filesystems don't set FILE_CREATED even when they succeed? */
+	*opened |= FILE_CREATED;
+	if (h_opened & FILE_OPENED)
+		aopen_node.h_file = args.file;
+	else {
+		put_filp(args.file);
+		args.file = NULL;
+	}
+	aopen = &au_sbi(dir->i_sb)->si_aopen;
+	au_sphl_add(&aopen_node.hlist, aopen);
+	err = finish_open(file, dentry, au_do_aopen, opened);
+	au_sphl_del(&aopen_node.hlist, aopen);
+	AuTraceErr(err);
+	AuDbgFile(file);
+	if (aopen_node.h_file)
+		fput(aopen_node.h_file);
+
+out_unlock:
+	di_write_unlock(parent);
+	aufs_read_unlock(dentry, AuLock_DW);
+	AuDbgDentry(dentry);
+	if (unlikely(err < 0))
+		goto out;
+out_no_open:
+	if (err >= 0 && !(*opened & FILE_CREATED)) {
+		AuLabel(out_no_open);
+		dget(dentry);
+		err = finish_no_open(file, dentry);
+	}
+out:
+	AuDbg("%pd%s%s\n", dentry,
+	      (*opened & FILE_CREATED) ? " created" : "",
+	      (*opened & FILE_OPENED) ? " opened" : "");
+	AuTraceErr(err);
+	return err;
+}
+
+
+/* ---------------------------------------------------------------------- */
+
+static int au_wr_dir_cpup(struct dentry *dentry, struct dentry *parent,
+			  const unsigned char add_entry, aufs_bindex_t bcpup,
+			  aufs_bindex_t btop)
+{
+	int err;
+	struct dentry *h_parent;
+	struct inode *h_dir;
+
+	if (add_entry)
+		IMustLock(d_inode(parent));
+	else
+		di_write_lock_parent(parent);
+
+	err = 0;
+	if (!au_h_dptr(parent, bcpup)) {
+		if (btop > bcpup)
+			err = au_cpup_dirs(dentry, bcpup);
+		else if (btop < bcpup)
+			err = au_cpdown_dirs(dentry, bcpup);
+		else
+			BUG();
+	}
+	if (!err && add_entry && !au_ftest_wrdir(add_entry, TMPFILE)) {
+		h_parent = au_h_dptr(parent, bcpup);
+		h_dir = d_inode(h_parent);
+		inode_lock_nested(h_dir, AuLsc_I_PARENT);
+		err = au_lkup_neg(dentry, bcpup, /*wh*/0);
+		/* todo: no unlock here */
+		inode_unlock(h_dir);
+
+		AuDbg("bcpup %d\n", bcpup);
+		if (!err) {
+			if (d_really_is_negative(dentry))
+				au_set_h_dptr(dentry, btop, NULL);
+			au_update_dbrange(dentry, /*do_put_zero*/0);
+		}
+	}
+
+	if (!add_entry)
+		di_write_unlock(parent);
+	if (!err)
+		err = bcpup; /* success */
+
+	AuTraceErr(err);
+	return err;
+}
+
+/*
+ * decide the branch and the parent dir where we will create a new entry.
+ * returns the new bindex or an error.
+ * copies up the parent dir if needed.
+ */
+int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry,
+	      struct au_wr_dir_args *args)
+{
+	int err;
+	unsigned int flags;
+	aufs_bindex_t bcpup, btop, src_btop;
+	const unsigned char add_entry
+		= au_ftest_wrdir(args->flags, ADD_ENTRY)
+		| au_ftest_wrdir(args->flags, TMPFILE);
+	struct super_block *sb;
+	struct dentry *parent;
+	struct au_sbinfo *sbinfo;
+
+	sb = dentry->d_sb;
+	sbinfo = au_sbi(sb);
+	parent = dget_parent(dentry);
+	btop = au_dbtop(dentry);
+	bcpup = btop;
+	if (args->force_btgt < 0) {
+		if (src_dentry) {
+			src_btop = au_dbtop(src_dentry);
+			if (src_btop < btop)
+				bcpup = src_btop;
+		} else if (add_entry) {
+			flags = 0;
+			if (au_ftest_wrdir(args->flags, ISDIR))
+				au_fset_wbr(flags, DIR);
+			err = AuWbrCreate(sbinfo, dentry, flags);
+			bcpup = err;
+		}
+
+		if (bcpup < 0 || au_test_ro(sb, bcpup, d_inode(dentry))) {
+			if (add_entry)
+				err = AuWbrCopyup(sbinfo, dentry);
+			else {
+				if (!IS_ROOT(dentry)) {
+					di_read_lock_parent(parent, !AuLock_IR);
+					err = AuWbrCopyup(sbinfo, dentry);
+					di_read_unlock(parent, !AuLock_IR);
+				} else
+					err = AuWbrCopyup(sbinfo, dentry);
+			}
+			bcpup = err;
+			if (unlikely(err < 0))
+				goto out;
+		}
+	} else {
+		bcpup = args->force_btgt;
+		AuDebugOn(au_test_ro(sb, bcpup, d_inode(dentry)));
+	}
+
+	AuDbg("btop %d, bcpup %d\n", btop, bcpup);
+	err = bcpup;
+	if (bcpup == btop)
+		goto out; /* success */
+
+	/* copyup the new parent into the branch we process */
+	err = au_wr_dir_cpup(dentry, parent, add_entry, bcpup, btop);
+	if (err >= 0) {
+		if (d_really_is_negative(dentry)) {
+			au_set_h_dptr(dentry, btop, NULL);
+			au_set_dbtop(dentry, bcpup);
+			au_set_dbbot(dentry, bcpup);
+		}
+		AuDebugOn(add_entry
+			  && !au_ftest_wrdir(args->flags, TMPFILE)
+			  && !au_h_dptr(dentry, bcpup));
+	}
+
+out:
+	dput(parent);
+	return err;
+}
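+
+/*
+ * A minimal sketch (unused by aufs itself) of a typical caller; cf. the real
+ * use in au_pin_and_icpup() below.  "example_choose_branch" is hypothetical.
+ */
+#if 0
+static int example_choose_branch(struct dentry *dentry)
+{
+	struct au_wr_dir_args wr_dir_args = {
+		.force_btgt	= -1,	/* no forced target branch */
+		.flags		= 0
+	};
+
+	/* returns the target bindex, copying up the parent dir if needed */
+	return au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args);
+}
+#endif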
+
+/* ---------------------------------------------------------------------- */
+
+void au_pin_hdir_unlock(struct au_pin *p)
+{
+	if (p->hdir)
+		au_hn_inode_unlock(p->hdir);
+}
+
+int au_pin_hdir_lock(struct au_pin *p)
+{
+	int err;
+
+	err = 0;
+	if (!p->hdir)
+		goto out;
+
+	/* even if an error happens later, keep this lock */
+	au_hn_inode_lock_nested(p->hdir, p->lsc_hi);
+
+	err = -EBUSY;
+	if (unlikely(p->hdir->hi_inode != d_inode(p->h_parent)))
+		goto out;
+
+	err = 0;
+	if (p->h_dentry)
+		err = au_h_verify(p->h_dentry, p->udba, p->hdir->hi_inode,
+				  p->h_parent, p->br);
+
+out:
+	return err;
+}
+
+int au_pin_hdir_relock(struct au_pin *p)
+{
+	int err, i;
+	struct inode *h_i;
+	struct dentry *h_d[] = {
+		p->h_dentry,
+		p->h_parent
+	};
+
+	err = au_pin_hdir_lock(p);
+	if (unlikely(err))
+		goto out;
+
+	for (i = 0; !err && i < sizeof(h_d)/sizeof(*h_d); i++) {
+		if (!h_d[i])
+			continue;
+		if (d_is_positive(h_d[i])) {
+			h_i = d_inode(h_d[i]);
+			err = !h_i->i_nlink;
+		}
+	}
+
+out:
+	return err;
+}
+
+static void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task)
+{
+#if !defined(CONFIG_RWSEM_GENERIC_SPINLOCK) && defined(CONFIG_RWSEM_SPIN_ON_OWNER)
+	p->hdir->hi_inode->i_rwsem.owner = task;
+#endif
+}
+
+void au_pin_hdir_acquire_nest(struct au_pin *p)
+{
+	if (p->hdir) {
+		rwsem_acquire_nest(&p->hdir->hi_inode->i_rwsem.dep_map,
+				   p->lsc_hi, 0, NULL, _RET_IP_);
+		au_pin_hdir_set_owner(p, current);
+	}
+}
+
+void au_pin_hdir_release(struct au_pin *p)
+{
+	if (p->hdir) {
+		au_pin_hdir_set_owner(p, p->task);
+		rwsem_release(&p->hdir->hi_inode->i_rwsem.dep_map, 1, _RET_IP_);
+	}
+}
+
+struct dentry *au_pinned_h_parent(struct au_pin *pin)
+{
+	if (pin && pin->parent)
+		return au_h_dptr(pin->parent, pin->bindex);
+	return NULL;
+}
+
+void au_unpin(struct au_pin *p)
+{
+	if (p->hdir)
+		au_pin_hdir_unlock(p);
+	if (p->h_mnt && au_ftest_pin(p->flags, MNT_WRITE))
+		vfsub_mnt_drop_write(p->h_mnt);
+	if (!p->hdir)
+		return;
+
+	if (!au_ftest_pin(p->flags, DI_LOCKED))
+		di_read_unlock(p->parent, AuLock_IR);
+	iput(p->hdir->hi_inode);
+	dput(p->parent);
+	p->parent = NULL;
+	p->hdir = NULL;
+	p->h_mnt = NULL;
+	/* do not clear p->task */
+}
+
+int au_do_pin(struct au_pin *p)
+{
+	int err;
+	struct super_block *sb;
+	struct inode *h_dir;
+
+	err = 0;
+	sb = p->dentry->d_sb;
+	p->br = au_sbr(sb, p->bindex);
+	if (IS_ROOT(p->dentry)) {
+		if (au_ftest_pin(p->flags, MNT_WRITE)) {
+			p->h_mnt = au_br_mnt(p->br);
+			err = vfsub_mnt_want_write(p->h_mnt);
+			if (unlikely(err)) {
+				au_fclr_pin(p->flags, MNT_WRITE);
+				goto out_err;
+			}
+		}
+		goto out;
+	}
+
+	p->h_dentry = NULL;
+	if (p->bindex <= au_dbbot(p->dentry))
+		p->h_dentry = au_h_dptr(p->dentry, p->bindex);
+
+	p->parent = dget_parent(p->dentry);
+	if (!au_ftest_pin(p->flags, DI_LOCKED))
+		di_read_lock(p->parent, AuLock_IR, p->lsc_di);
+
+	h_dir = NULL;
+	p->h_parent = au_h_dptr(p->parent, p->bindex);
+	p->hdir = au_hi(d_inode(p->parent), p->bindex);
+	if (p->hdir)
+		h_dir = p->hdir->hi_inode;
+
+	/*
+	 * in the udba case, or when DI_LOCKED is not set, p->parent may
+	 * have changed and h_parent can be NULL.
+	 */
+	if (unlikely(!p->hdir || !h_dir || !p->h_parent)) {
+		err = -EBUSY;
+		if (!au_ftest_pin(p->flags, DI_LOCKED))
+			di_read_unlock(p->parent, AuLock_IR);
+		dput(p->parent);
+		p->parent = NULL;
+		goto out_err;
+	}
+
+	if (au_ftest_pin(p->flags, MNT_WRITE)) {
+		p->h_mnt = au_br_mnt(p->br);
+		err = vfsub_mnt_want_write(p->h_mnt);
+		if (unlikely(err)) {
+			au_fclr_pin(p->flags, MNT_WRITE);
+			if (!au_ftest_pin(p->flags, DI_LOCKED))
+				di_read_unlock(p->parent, AuLock_IR);
+			dput(p->parent);
+			p->parent = NULL;
+			goto out_err;
+		}
+	}
+
+	au_igrab(h_dir);
+	err = au_pin_hdir_lock(p);
+	if (!err)
+		goto out; /* success */
+
+	au_unpin(p);
+
+out_err:
+	pr_err("err %d\n", err);
+	err = au_busy_or_stale();
+out:
+	return err;
+}
+
+void au_pin_init(struct au_pin *p, struct dentry *dentry,
+		 aufs_bindex_t bindex, int lsc_di, int lsc_hi,
+		 unsigned int udba, unsigned char flags)
+{
+	p->dentry = dentry;
+	p->udba = udba;
+	p->lsc_di = lsc_di;
+	p->lsc_hi = lsc_hi;
+	p->flags = flags;
+	p->bindex = bindex;
+
+	p->parent = NULL;
+	p->hdir = NULL;
+	p->h_mnt = NULL;
+
+	p->h_dentry = NULL;
+	p->h_parent = NULL;
+	p->br = NULL;
+	p->task = current;
+}
+
+int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex,
+	   unsigned int udba, unsigned char flags)
+{
+	au_pin_init(pin, dentry, bindex, AuLsc_DI_PARENT, AuLsc_I_PARENT2,
+		    udba, flags);
+	return au_do_pin(pin);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * ->setattr() and ->getattr() are called in various cases.
+ * chmod, stat: dentry is revalidated.
+ * fchmod, fstat: file and dentry are not revalidated, additionally they may be
+ *		  unhashed.
+ * for ->setattr(), ia->ia_file is passed from ftruncate only.
+ */
+/* todo: consolidate with do_refresh() and simple_reval_dpath() */
+int au_reval_for_attr(struct dentry *dentry, unsigned int sigen)
+{
+	int err;
+	struct dentry *parent;
+
+	err = 0;
+	if (au_digen_test(dentry, sigen)) {
+		parent = dget_parent(dentry);
+		di_read_lock_parent(parent, AuLock_IR);
+		err = au_refresh_dentry(dentry, parent);
+		di_read_unlock(parent, AuLock_IR);
+		dput(parent);
+	}
+
+	AuTraceErr(err);
+	return err;
+}
+
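+/*
+ * decide and pin the writable target branch for @dentry, and copy the file up
+ * to it when the target differs from the current top branch (including the
+ * whiteout'ed copy-up for an already unlinked file). on success, a->h_inode
+ * is returned locked with AuLsc_I_CHILD.
+ */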
+int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia,
+		     struct au_icpup_args *a)
+{
+	int err;
+	loff_t sz;
+	aufs_bindex_t btop, ibtop;
+	struct dentry *hi_wh, *parent;
+	struct inode *inode;
+	struct au_wr_dir_args wr_dir_args = {
+		.force_btgt	= -1,
+		.flags		= 0
+	};
+
+	if (d_is_dir(dentry))
+		au_fset_wrdir(wr_dir_args.flags, ISDIR);
+	/* plink or hi_wh() case */
+	btop = au_dbtop(dentry);
+	inode = d_inode(dentry);
+	ibtop = au_ibtop(inode);
+	if (btop != ibtop && !au_test_ro(inode->i_sb, ibtop, inode))
+		wr_dir_args.force_btgt = ibtop;
+	err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args);
+	if (unlikely(err < 0))
+		goto out;
+	a->btgt = err;
+	if (err != btop)
+		au_fset_icpup(a->flags, DID_CPUP);
+
+	err = 0;
+	a->pin_flags = AuPin_MNT_WRITE;
+	parent = NULL;
+	if (!IS_ROOT(dentry)) {
+		au_fset_pin(a->pin_flags, DI_LOCKED);
+		parent = dget_parent(dentry);
+		di_write_lock_parent(parent);
+	}
+
+	err = au_pin(&a->pin, dentry, a->btgt, a->udba, a->pin_flags);
+	if (unlikely(err))
+		goto out_parent;
+
+	sz = -1;
+	a->h_path.dentry = au_h_dptr(dentry, btop);
+	a->h_inode = d_inode(a->h_path.dentry);
+	if (ia && (ia->ia_valid & ATTR_SIZE)) {
+		inode_lock_nested(a->h_inode, AuLsc_I_CHILD);
+		if (ia->ia_size < i_size_read(a->h_inode))
+			sz = ia->ia_size;
+		inode_unlock(a->h_inode);
+	}
+
+	hi_wh = NULL;
+	if (au_ftest_icpup(a->flags, DID_CPUP) && d_unlinked(dentry)) {
+		hi_wh = au_hi_wh(inode, a->btgt);
+		if (!hi_wh) {
+			struct au_cp_generic cpg = {
+				.dentry	= dentry,
+				.bdst	= a->btgt,
+				.bsrc	= -1,
+				.len	= sz,
+				.pin	= &a->pin
+			};
+			err = au_sio_cpup_wh(&cpg, /*file*/NULL);
+			if (unlikely(err))
+				goto out_unlock;
+			hi_wh = au_hi_wh(inode, a->btgt);
+			/* todo: revalidate hi_wh? */
+		}
+	}
+
+	if (parent) {
+		au_pin_set_parent_lflag(&a->pin, /*lflag*/0);
+		di_downgrade_lock(parent, AuLock_IR);
+		dput(parent);
+		parent = NULL;
+	}
+	if (!au_ftest_icpup(a->flags, DID_CPUP))
+		goto out; /* success */
+
+	if (!d_unhashed(dentry)) {
+		struct au_cp_generic cpg = {
+			.dentry	= dentry,
+			.bdst	= a->btgt,
+			.bsrc	= btop,
+			.len	= sz,
+			.pin	= &a->pin,
+			.flags	= AuCpup_DTIME | AuCpup_HOPEN
+		};
+		err = au_sio_cpup_simple(&cpg);
+		if (!err)
+			a->h_path.dentry = au_h_dptr(dentry, a->btgt);
+	} else if (!hi_wh)
+		a->h_path.dentry = au_h_dptr(dentry, a->btgt);
+	else
+		a->h_path.dentry = hi_wh; /* do not dget here */
+
+out_unlock:
+	a->h_inode = d_inode(a->h_path.dentry);
+	if (!err)
+		goto out; /* success */
+	au_unpin(&a->pin);
+out_parent:
+	if (parent) {
+		di_write_unlock(parent);
+		dput(parent);
+	}
+out:
+	if (!err)
+		inode_lock_nested(a->h_inode, AuLsc_I_CHILD);
+	return err;
+}
+
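+/*
+ * ->setattr(): pin (and copy up if necessary) the target, apply the change to
+ * the hidden inode, then refresh the aufs inode attributes.
+ */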
+static int aufs_setattr(struct dentry *dentry, struct iattr *ia)
+{
+	int err;
+	struct inode *inode, *delegated;
+	struct super_block *sb;
+	struct file *file;
+	struct au_icpup_args *a;
+
+	inode = d_inode(dentry);
+	IMustLock(inode);
+
+	err = -ENOMEM;
+	a = kzalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+		ia->ia_valid &= ~ATTR_MODE;
+
+	file = NULL;
+	sb = dentry->d_sb;
+	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out_kfree;
+
+	if (ia->ia_valid & ATTR_FILE) {
+		/* currently ftruncate(2) only */
+		AuDebugOn(!d_is_reg(dentry));
+		file = ia->ia_file;
+		err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
+		if (unlikely(err))
+			goto out_si;
+		ia->ia_file = au_hf_top(file);
+		a->udba = AuOpt_UDBA_NONE;
+	} else {
+		/* fchmod() doesn't pass ia_file */
+		a->udba = au_opt_udba(sb);
+		di_write_lock_child(dentry);
+		/* no d_unlinked(), to set UDBA_NONE for root */
+		if (d_unhashed(dentry))
+			a->udba = AuOpt_UDBA_NONE;
+		if (a->udba != AuOpt_UDBA_NONE) {
+			AuDebugOn(IS_ROOT(dentry));
+			err = au_reval_for_attr(dentry, au_sigen(sb));
+			if (unlikely(err))
+				goto out_dentry;
+		}
+	}
+
+	err = au_pin_and_icpup(dentry, ia, a);
+	if (unlikely(err < 0))
+		goto out_dentry;
+	if (au_ftest_icpup(a->flags, DID_CPUP)) {
+		ia->ia_file = NULL;
+		ia->ia_valid &= ~ATTR_FILE;
+	}
+
+	a->h_path.mnt = au_sbr_mnt(sb, a->btgt);
+	if ((ia->ia_valid & (ATTR_MODE | ATTR_CTIME))
+	    == (ATTR_MODE | ATTR_CTIME)) {
+		err = security_path_chmod(&a->h_path, ia->ia_mode);
+		if (unlikely(err))
+			goto out_unlock;
+	} else if ((ia->ia_valid & (ATTR_UID | ATTR_GID))
+		   && (ia->ia_valid & ATTR_CTIME)) {
+		err = security_path_chown(&a->h_path, ia->ia_uid, ia->ia_gid);
+		if (unlikely(err))
+			goto out_unlock;
+	}
+
+	if (ia->ia_valid & ATTR_SIZE) {
+		struct file *f;
+
+		if (ia->ia_size < i_size_read(inode))
+			/* unmap only */
+			truncate_setsize(inode, ia->ia_size);
+
+		f = NULL;
+		if (ia->ia_valid & ATTR_FILE)
+			f = ia->ia_file;
+		inode_unlock(a->h_inode);
+		err = vfsub_trunc(&a->h_path, ia->ia_size, ia->ia_valid, f);
+		inode_lock_nested(a->h_inode, AuLsc_I_CHILD);
+	} else {
+		delegated = NULL;
+		while (1) {
+			err = vfsub_notify_change(&a->h_path, ia, &delegated);
+			if (delegated) {
+				err = break_deleg_wait(&delegated);
+				if (!err)
+					continue;
+			}
+			break;
+		}
+	}
+	/*
+	 * regardless of the aufs 'acl' option setting.
+	 * why don't all acl-aware fs call this func from their ->setattr()?
+	 */
+	if (!err && (ia->ia_valid & ATTR_MODE))
+		err = vfsub_acl_chmod(a->h_inode, ia->ia_mode);
+	if (!err)
+		au_cpup_attr_changeable(inode);
+
+out_unlock:
+	inode_unlock(a->h_inode);
+	au_unpin(&a->pin);
+	if (unlikely(err))
+		au_update_dbtop(dentry);
+out_dentry:
+	di_write_unlock(dentry);
+	if (file) {
+		fi_write_unlock(file);
+		ia->ia_file = file;
+		ia->ia_valid |= ATTR_FILE;
+	}
+out_si:
+	si_read_unlock(sb);
+out_kfree:
+	au_delayed_kfree(a);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL)
+static int au_h_path_to_set_attr(struct dentry *dentry,
+				 struct au_icpup_args *a, struct path *h_path)
+{
+	int err;
+	struct super_block *sb;
+
+	sb = dentry->d_sb;
+	a->udba = au_opt_udba(sb);
+	/* no d_unlinked(), to set UDBA_NONE for root */
+	if (d_unhashed(dentry))
+		a->udba = AuOpt_UDBA_NONE;
+	if (a->udba != AuOpt_UDBA_NONE) {
+		AuDebugOn(IS_ROOT(dentry));
+		err = au_reval_for_attr(dentry, au_sigen(sb));
+		if (unlikely(err))
+			goto out;
+	}
+	err = au_pin_and_icpup(dentry, /*ia*/NULL, a);
+	if (unlikely(err < 0))
+		goto out;
+
+	h_path->dentry = a->h_path.dentry;
+	h_path->mnt = au_sbr_mnt(sb, a->btgt);
+
+out:
+	return err;
+}
+
+ssize_t au_srxattr(struct dentry *dentry, struct inode *inode,
+		   struct au_srxattr *arg)
+{
+	int err;
+	struct path h_path;
+	struct super_block *sb;
+	struct au_icpup_args *a;
+	struct inode *h_inode;
+
+	IMustLock(inode);
+
+	err = -ENOMEM;
+	a = kzalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	sb = dentry->d_sb;
+	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out_kfree;
+
+	h_path.dentry = NULL;	/* silence gcc */
+	di_write_lock_child(dentry);
+	err = au_h_path_to_set_attr(dentry, a, &h_path);
+	if (unlikely(err))
+		goto out_di;
+
+	inode_unlock(a->h_inode);
+	switch (arg->type) {
+	case AU_XATTR_SET:
+		AuDebugOn(d_is_negative(h_path.dentry));
+		err = vfsub_setxattr(h_path.dentry,
+				     arg->u.set.name, arg->u.set.value,
+				     arg->u.set.size, arg->u.set.flags);
+		break;
+	case AU_XATTR_REMOVE:
+		err = vfsub_removexattr(h_path.dentry, arg->u.remove.name);
+		break;
+	case AU_ACL_SET:
+		err = -EOPNOTSUPP;
+		h_inode = d_inode(h_path.dentry);
+		if (h_inode->i_op->set_acl)
+			err = h_inode->i_op->set_acl(h_inode,
+						     arg->u.acl_set.acl,
+						     arg->u.acl_set.type);
+		break;
+	}
+	if (!err)
+		au_cpup_attr_timesizes(inode);
+
+	au_unpin(&a->pin);
+	if (unlikely(err))
+		au_update_dbtop(dentry);
+
+out_di:
+	di_write_unlock(dentry);
+	si_read_unlock(sb);
+out_kfree:
+	au_delayed_kfree(a);
+out:
+	AuTraceErr(err);
+	return err;
+}
+#endif
+
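+/* reflect the branch inode's attributes in @st into the aufs inode */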
+static void au_refresh_iattr(struct inode *inode, struct kstat *st,
+			     unsigned int nlink)
+{
+	unsigned int n;
+
+	inode->i_mode = st->mode;
+	/* don't i_[ug]id_write() here */
+	inode->i_uid = st->uid;
+	inode->i_gid = st->gid;
+	inode->i_atime = st->atime;
+	inode->i_mtime = st->mtime;
+	inode->i_ctime = st->ctime;
+
+	au_cpup_attr_nlink(inode, /*force*/0);
+	if (S_ISDIR(inode->i_mode)) {
+		n = inode->i_nlink;
+		n -= nlink;
+		n += st->nlink;
+		smp_mb(); /* for i_nlink */
+		/* 0 can happen */
+		set_nlink(inode, n);
+	}
+
+	spin_lock(&inode->i_lock);
+	inode->i_blocks = st->blocks;
+	i_size_write(inode, st->size);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * common routine for aufs_getattr() and aufs_getxattr().
+ * returns zero or negative (an error).
+ * @dentry will be read-locked on success.
+ */
+int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path)
+{
+	int err;
+	unsigned int mnt_flags, sigen;
+	unsigned char udba_none;
+	aufs_bindex_t bindex;
+	struct super_block *sb, *h_sb;
+	struct inode *inode;
+
+	h_path->mnt = NULL;
+	h_path->dentry = NULL;
+
+	err = 0;
+	sb = dentry->d_sb;
+	mnt_flags = au_mntflags(sb);
+	udba_none = !!au_opt_test(mnt_flags, UDBA_NONE);
+
+	/* support fstat(2) */
+	if (!d_unlinked(dentry) && !udba_none) {
+		sigen = au_sigen(sb);
+		err = au_digen_test(dentry, sigen);
+		if (!err) {
+			di_read_lock_child(dentry, AuLock_IR);
+			err = au_dbrange_test(dentry);
+			if (unlikely(err)) {
+				di_read_unlock(dentry, AuLock_IR);
+				goto out;
+			}
+		} else {
+			AuDebugOn(IS_ROOT(dentry));
+			di_write_lock_child(dentry);
+			err = au_dbrange_test(dentry);
+			if (!err)
+				err = au_reval_for_attr(dentry, sigen);
+			if (!err)
+				di_downgrade_lock(dentry, AuLock_IR);
+			else {
+				di_write_unlock(dentry);
+				goto out;
+			}
+		}
+	} else
+		di_read_lock_child(dentry, AuLock_IR);
+
+	inode = d_inode(dentry);
+	bindex = au_ibtop(inode);
+	h_path->mnt = au_sbr_mnt(sb, bindex);
+	h_sb = h_path->mnt->mnt_sb;
+	if (!force
+	    && !au_test_fs_bad_iattr(h_sb)
+	    && udba_none)
+		goto out; /* success */
+
+	if (au_dbtop(dentry) == bindex)
+		h_path->dentry = au_h_dptr(dentry, bindex);
+	else if (au_opt_test(mnt_flags, PLINK) && au_plink_test(inode)) {
+		h_path->dentry = au_plink_lkup(inode, bindex);
+		if (IS_ERR(h_path->dentry))
+			/* pretending success */
+			h_path->dentry = NULL;
+		else
+			dput(h_path->dentry);
+	}
+
+out:
+	return err;
+}
+
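+/*
+ * ->getattr(): stat the hidden inode when it is available, otherwise fall
+ * back to the cached aufs inode attributes via generic_fillattr().
+ */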
+static int aufs_getattr(struct vfsmount *mnt __maybe_unused,
+			struct dentry *dentry, struct kstat *st)
+{
+	int err;
+	unsigned char positive;
+	struct path h_path;
+	struct inode *inode;
+	struct super_block *sb;
+
+	inode = d_inode(dentry);
+	sb = dentry->d_sb;
+	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out;
+	err = au_h_path_getattr(dentry, /*force*/0, &h_path);
+	if (unlikely(err))
+		goto out_si;
+	if (unlikely(!h_path.dentry))
+		/* illegally overlapped or something */
+		goto out_fill; /* pretending success */
+
+	positive = d_is_positive(h_path.dentry);
+	if (positive)
+		err = vfs_getattr(&h_path, st);
+	if (!err) {
+		if (positive)
+			au_refresh_iattr(inode, st,
+					 d_inode(h_path.dentry)->i_nlink);
+		goto out_fill; /* success */
+	}
+	AuTraceErr(err);
+	goto out_di;
+
+out_fill:
+	generic_fillattr(inode, st);
+out_di:
+	di_read_unlock(dentry, AuLock_IR);
+out_si:
+	si_read_unlock(sb);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
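+/* delegate to the branch fs's ->get_link() on the top branch of the inode */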
+static const char *aufs_get_link(struct dentry *dentry, struct inode *inode,
+				 struct delayed_call *done)
+{
+	const char *ret;
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+	int err;
+	aufs_bindex_t bindex;
+
+	ret = NULL; /* suppress a warning */
+	err = -ECHILD;
+	if (!dentry)
+		goto out;
+
+	err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN);
+	if (unlikely(err))
+		goto out;
+
+	err = au_d_hashed_positive(dentry);
+	if (unlikely(err))
+		goto out_unlock;
+
+	err = -EINVAL;
+	inode = d_inode(dentry);
+	bindex = au_ibtop(inode);
+	h_inode = au_h_iptr(inode, bindex);
+	if (unlikely(!h_inode->i_op->get_link))
+		goto out_unlock;
+
+	err = -EBUSY;
+	h_dentry = NULL;
+	if (au_dbtop(dentry) <= bindex) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (h_dentry)
+			dget(h_dentry);
+	}
+	if (!h_dentry) {
+		h_dentry = d_find_any_alias(h_inode);
+		if (IS_ERR(h_dentry)) {
+			err = PTR_ERR(h_dentry);
+			goto out_unlock;
+		}
+	}
+	if (unlikely(!h_dentry))
+		goto out_unlock;
+
+	err = 0;
+	AuDbg("%pf\n", h_inode->i_op->get_link);
+	AuDbgDentry(h_dentry);
+	ret = h_inode->i_op->get_link(h_dentry, h_inode, done);
+	dput(h_dentry);
+	if (IS_ERR(ret))
+		err = PTR_ERR(ret);
+
+out_unlock:
+	aufs_read_unlock(dentry, AuLock_IR);
+out:
+	if (unlikely(err))
+		ret = ERR_PTR(err);
+	AuTraceErrPtr(ret);
+	return ret;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_is_special(struct inode *inode)
+{
+	return (inode->i_mode & (S_IFBLK | S_IFCHR | S_IFIFO | S_IFSOCK));
+}
+
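+/*
+ * update the timestamps on the hidden inode of the top branch when it is
+ * writable; never copy up just for a timestamp update.
+ */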
+static int aufs_update_time(struct inode *inode, struct timespec *ts, int flags)
+{
+	int err;
+	aufs_bindex_t bindex;
+	struct super_block *sb;
+	struct inode *h_inode;
+	struct vfsmount *h_mnt;
+
+	sb = inode->i_sb;
+	WARN_ONCE((flags & S_ATIME) && !IS_NOATIME(inode),
+		  "unexpected s_flags 0x%lx", sb->s_flags);
+
+	/* mmap_sem might be acquired already, cf. aufs_mmap() */
+	lockdep_off();
+	si_read_lock(sb, AuLock_FLUSH);
+	ii_write_lock_child(inode);
+	lockdep_on();
+
+	err = 0;
+	bindex = au_ibtop(inode);
+	h_inode = au_h_iptr(inode, bindex);
+	if (!au_test_ro(sb, bindex, inode)) {
+		h_mnt = au_sbr_mnt(sb, bindex);
+		err = vfsub_mnt_want_write(h_mnt);
+		if (!err) {
+			err = vfsub_update_time(h_inode, ts, flags);
+			vfsub_mnt_drop_write(h_mnt);
+		}
+	} else if (au_is_special(h_inode)) {
+		/*
+		 * Never copy-up here.
+		 * These special files may already be opened and used for
+		 * communication. If we copied them up, the communication
+		 * would be corrupted.
+		 */
+		AuWarn1("timestamps for i%lu are ignored "
+			"since it is on readonly branch (hi%lu).\n",
+			inode->i_ino, h_inode->i_ino);
+	} else if (flags & ~S_ATIME) {
+		err = -EIO;
+		AuIOErr1("unexpected flags 0x%x\n", flags);
+		AuDebugOn(1);
+	}
+
+	lockdep_off();
+	if (!err)
+		au_cpup_attr_timesizes(inode);
+	ii_write_unlock(inode);
+	si_read_unlock(sb);
+	lockdep_on();
+
+	if (!err && (flags & S_VERSION))
+		inode_inc_iversion(inode);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* the no-getattr version will be set by module.c:aufs_init() */
+struct inode_operations aufs_iop_nogetattr[AuIop_Last],
+	aufs_iop[] = {
+	[AuIop_SYMLINK] = {
+		.permission	= aufs_permission,
+#ifdef CONFIG_FS_POSIX_ACL
+		.get_acl	= aufs_get_acl,
+		.set_acl	= aufs_set_acl, /* unsupported for symlinks? */
+#endif
+
+		.setattr	= aufs_setattr,
+		.getattr	= aufs_getattr,
+
+#ifdef CONFIG_AUFS_XATTR
+		.setxattr	= aufs_setxattr,
+		.getxattr	= aufs_getxattr,
+		.listxattr	= aufs_listxattr,
+		.removexattr	= aufs_removexattr,
+#endif
+
+		.readlink	= generic_readlink,
+		.get_link	= aufs_get_link,
+
+		/* .update_time	= aufs_update_time */
+	},
+	[AuIop_DIR] = {
+		.create		= aufs_create,
+		.lookup		= aufs_lookup,
+		.link		= aufs_link,
+		.unlink		= aufs_unlink,
+		.symlink	= aufs_symlink,
+		.mkdir		= aufs_mkdir,
+		.rmdir		= aufs_rmdir,
+		.mknod		= aufs_mknod,
+		.rename		= aufs_rename,
+
+		.permission	= aufs_permission,
+#ifdef CONFIG_FS_POSIX_ACL
+		.get_acl	= aufs_get_acl,
+		.set_acl	= aufs_set_acl,
+#endif
+
+		.setattr	= aufs_setattr,
+		.getattr	= aufs_getattr,
+
+#ifdef CONFIG_AUFS_XATTR
+		.setxattr	= aufs_setxattr,
+		.getxattr	= aufs_getxattr,
+		.listxattr	= aufs_listxattr,
+		.removexattr	= aufs_removexattr,
+#endif
+
+		.update_time	= aufs_update_time,
+		.atomic_open	= aufs_atomic_open,
+		.tmpfile	= aufs_tmpfile
+	},
+	[AuIop_OTHER] = {
+		.permission	= aufs_permission,
+#ifdef CONFIG_FS_POSIX_ACL
+		.get_acl	= aufs_get_acl,
+		.set_acl	= aufs_set_acl,
+#endif
+
+		.setattr	= aufs_setattr,
+		.getattr	= aufs_getattr,
+
+#ifdef CONFIG_AUFS_XATTR
+		.setxattr	= aufs_setxattr,
+		.getxattr	= aufs_getxattr,
+		.listxattr	= aufs_listxattr,
+		.removexattr	= aufs_removexattr,
+#endif
+
+		.update_time	= aufs_update_time
+	}
+};
diff --git b/fs/aufs/i_op_add.c b/fs/aufs/i_op_add.c
new file mode 100644
index 0000000..2a9aa09
--- /dev/null
+++ b/fs/aufs/i_op_add.c
@@ -0,0 +1,911 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode operations (add entry)
+ */
+
+#include "aufs.h"
+
+/*
+ * final procedure of adding a new entry, except link(2).
+ * remove the whiteout, instantiate, copy up the parent dir's times and size,
+ * and update its version.
+ * if it fails, re-create the removed whiteout.
+ */
+static int epilog(struct inode *dir, aufs_bindex_t bindex,
+		  struct dentry *wh_dentry, struct dentry *dentry)
+{
+	int err, rerr;
+	aufs_bindex_t bwh;
+	struct path h_path;
+	struct super_block *sb;
+	struct inode *inode, *h_dir;
+	struct dentry *wh;
+
+	bwh = -1;
+	sb = dir->i_sb;
+	if (wh_dentry) {
+		h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */
+		IMustLock(h_dir);
+		AuDebugOn(au_h_iptr(dir, bindex) != h_dir);
+		bwh = au_dbwh(dentry);
+		h_path.dentry = wh_dentry;
+		h_path.mnt = au_sbr_mnt(sb, bindex);
+		err = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path,
+					  dentry);
+		if (unlikely(err))
+			goto out;
+	}
+
+	inode = au_new_inode(dentry, /*must_new*/1);
+	if (!IS_ERR(inode)) {
+		d_instantiate(dentry, inode);
+		dir = d_inode(dentry->d_parent); /* dir inode is locked */
+		IMustLock(dir);
+		au_dir_ts(dir, bindex);
+		dir->i_version++;
+		au_fhsm_wrote(sb, bindex, /*force*/0);
+		return 0; /* success */
+	}
+
+	err = PTR_ERR(inode);
+	if (!wh_dentry)
+		goto out;
+
+	/* revert */
+	/* dir inode is locked */
+	wh = au_wh_create(dentry, bwh, wh_dentry->d_parent);
+	rerr = PTR_ERR(wh);
+	if (IS_ERR(wh)) {
+		AuIOErr("%pd reverting whiteout failed(%d, %d)\n",
+			dentry, err, rerr);
+		err = -EIO;
+	} else
+		dput(wh);
+
+out:
+	return err;
+}
+
+static int au_d_may_add(struct dentry *dentry)
+{
+	int err;
+
+	err = 0;
+	if (unlikely(d_unhashed(dentry)))
+		err = -ENOENT;
+	if (unlikely(d_really_is_positive(dentry)))
+		err = -EEXIST;
+	return err;
+}
+
+/*
+ * simple tests for the adding inode operations.
+ * following the checks in vfs, plus the parent-child relationship.
+ */
+int au_may_add(struct dentry *dentry, aufs_bindex_t bindex,
+	       struct dentry *h_parent, int isdir)
+{
+	int err;
+	umode_t h_mode;
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+
+	err = -ENAMETOOLONG;
+	if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN))
+		goto out;
+
+	h_dentry = au_h_dptr(dentry, bindex);
+	if (d_really_is_negative(dentry)) {
+		err = -EEXIST;
+		if (unlikely(d_is_positive(h_dentry)))
+			goto out;
+	} else {
+		/* rename(2) case */
+		err = -EIO;
+		if (unlikely(d_is_negative(h_dentry)))
+			goto out;
+		h_inode = d_inode(h_dentry);
+		if (unlikely(!h_inode->i_nlink))
+			goto out;
+
+		h_mode = h_inode->i_mode;
+		if (!isdir) {
+			err = -EISDIR;
+			if (unlikely(S_ISDIR(h_mode)))
+				goto out;
+		} else if (unlikely(!S_ISDIR(h_mode))) {
+			err = -ENOTDIR;
+			goto out;
+		}
+	}
+
+	err = 0;
+	/* expected parent dir is locked */
+	if (unlikely(h_parent != h_dentry->d_parent))
+		err = -EIO;
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/*
+ * initial procedure of adding a new entry.
+ * prepare the writable branch and the parent dir, lock it,
+ * and look up the whiteout for the new entry.
+ */
+static struct dentry*
+lock_hdir_lkup_wh(struct dentry *dentry, struct au_dtime *dt,
+		  struct dentry *src_dentry, struct au_pin *pin,
+		  struct au_wr_dir_args *wr_dir_args)
+{
+	struct dentry *wh_dentry, *h_parent;
+	struct super_block *sb;
+	struct au_branch *br;
+	int err;
+	unsigned int udba;
+	aufs_bindex_t bcpup;
+
+	AuDbg("%pd\n", dentry);
+
+	err = au_wr_dir(dentry, src_dentry, wr_dir_args);
+	bcpup = err;
+	wh_dentry = ERR_PTR(err);
+	if (unlikely(err < 0))
+		goto out;
+
+	sb = dentry->d_sb;
+	udba = au_opt_udba(sb);
+	err = au_pin(pin, dentry, bcpup, udba,
+		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+	wh_dentry = ERR_PTR(err);
+	if (unlikely(err))
+		goto out;
+
+	h_parent = au_pinned_h_parent(pin);
+	if (udba != AuOpt_UDBA_NONE
+	    && au_dbtop(dentry) == bcpup)
+		err = au_may_add(dentry, bcpup, h_parent,
+				 au_ftest_wrdir(wr_dir_args->flags, ISDIR));
+	else if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN))
+		err = -ENAMETOOLONG;
+	wh_dentry = ERR_PTR(err);
+	if (unlikely(err))
+		goto out_unpin;
+
+	br = au_sbr(sb, bcpup);
+	if (dt) {
+		struct path tmp = {
+			.dentry	= h_parent,
+			.mnt	= au_br_mnt(br)
+		};
+		au_dtime_store(dt, au_pinned_parent(pin), &tmp);
+	}
+
+	wh_dentry = NULL;
+	if (bcpup != au_dbwh(dentry))
+		goto out; /* success */
+
+	/*
+	 * ENAMETOOLONG here means that, if we allowed creating such a name, it
+	 * could not be removed in the future. So we do not allow such a name
+	 * here and we do not handle ENAMETOOLONG differently here.
+	 */
+	wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, br);
+
+out_unpin:
+	if (IS_ERR(wh_dentry))
+		au_unpin(pin);
+out:
+	return wh_dentry;
+}
+
+/* ---------------------------------------------------------------------- */
+
+enum { Mknod, Symlink, Creat };
+struct simple_arg {
+	int type;
+	union {
+		struct {
+			umode_t			mode;
+			bool			want_excl;
+			bool			try_aopen;
+			struct vfsub_aopen_args	*aopen;
+		} c;
+		struct {
+			const char *symname;
+		} s;
+		struct {
+			umode_t mode;
+			dev_t dev;
+		} m;
+	} u;
+};
+
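+/* common body for create(2), symlink(2) and mknod(2) */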
+static int add_simple(struct inode *dir, struct dentry *dentry,
+		      struct simple_arg *arg)
+{
+	int err, rerr;
+	aufs_bindex_t btop;
+	unsigned char created;
+	const unsigned char try_aopen
+		= (arg->type == Creat && arg->u.c.try_aopen);
+	struct dentry *wh_dentry, *parent;
+	struct inode *h_dir;
+	struct super_block *sb;
+	struct au_branch *br;
+	/* to reduce stack size */
+	struct {
+		struct au_dtime dt;
+		struct au_pin pin;
+		struct path h_path;
+		struct au_wr_dir_args wr_dir_args;
+	} *a;
+
+	AuDbg("%pd\n", dentry);
+	IMustLock(dir);
+
+	err = -ENOMEM;
+	a = kmalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+	a->wr_dir_args.force_btgt = -1;
+	a->wr_dir_args.flags = AuWrDir_ADD_ENTRY;
+
+	parent = dentry->d_parent; /* dir inode is locked */
+	if (!try_aopen) {
+		err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN);
+		if (unlikely(err))
+			goto out_free;
+	}
+	err = au_d_may_add(dentry);
+	if (unlikely(err))
+		goto out_unlock;
+	if (!try_aopen)
+		di_write_lock_parent(parent);
+	wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL,
+				      &a->pin, &a->wr_dir_args);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry))
+		goto out_parent;
+
+	btop = au_dbtop(dentry);
+	sb = dentry->d_sb;
+	br = au_sbr(sb, btop);
+	a->h_path.dentry = au_h_dptr(dentry, btop);
+	a->h_path.mnt = au_br_mnt(br);
+	h_dir = au_pinned_h_dir(&a->pin);
+	switch (arg->type) {
+	case Creat:
+		err = 0;
+		if (!try_aopen || !h_dir->i_op->atomic_open)
+			err = vfsub_create(h_dir, &a->h_path, arg->u.c.mode,
+					   arg->u.c.want_excl);
+		else
+			err = vfsub_atomic_open(h_dir, a->h_path.dentry,
+						arg->u.c.aopen, br);
+		break;
+	case Symlink:
+		err = vfsub_symlink(h_dir, &a->h_path, arg->u.s.symname);
+		break;
+	case Mknod:
+		err = vfsub_mknod(h_dir, &a->h_path, arg->u.m.mode,
+				  arg->u.m.dev);
+		break;
+	default:
+		BUG();
+	}
+	created = !err;
+	if (!err)
+		err = epilog(dir, btop, wh_dentry, dentry);
+
+	/* revert */
+	if (unlikely(created && err && d_is_positive(a->h_path.dentry))) {
+		/* no delegation since it is just created */
+		rerr = vfsub_unlink(h_dir, &a->h_path, /*delegated*/NULL,
+				    /*force*/0);
+		if (rerr) {
+			AuIOErr("%pd revert failure(%d, %d)\n",
+				dentry, err, rerr);
+			err = -EIO;
+		}
+		au_dtime_revert(&a->dt);
+	}
+
+	if (!err && try_aopen && !h_dir->i_op->atomic_open)
+		*arg->u.c.aopen->opened |= FILE_CREATED;
+
+	au_unpin(&a->pin);
+	dput(wh_dentry);
+
+out_parent:
+	if (!try_aopen)
+		di_write_unlock(parent);
+out_unlock:
+	if (unlikely(err)) {
+		au_update_dbtop(dentry);
+		d_drop(dentry);
+	}
+	if (!try_aopen)
+		aufs_read_unlock(dentry, AuLock_DW);
+out_free:
+	au_delayed_kfree(a);
+out:
+	return err;
+}
+
+int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+	       dev_t dev)
+{
+	struct simple_arg arg = {
+		.type = Mknod,
+		.u.m = {
+			.mode	= mode,
+			.dev	= dev
+		}
+	};
+	return add_simple(dir, dentry, &arg);
+}
+
+int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+{
+	struct simple_arg arg = {
+		.type = Symlink,
+		.u.s.symname = symname
+	};
+	return add_simple(dir, dentry, &arg);
+}
+
+int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		bool want_excl)
+{
+	struct simple_arg arg = {
+		.type = Creat,
+		.u.c = {
+			.mode		= mode,
+			.want_excl	= want_excl
+		}
+	};
+	return add_simple(dir, dentry, &arg);
+}
+
+int au_aopen_or_create(struct inode *dir, struct dentry *dentry,
+		       struct vfsub_aopen_args *aopen_args)
+{
+	struct simple_arg arg = {
+		.type = Creat,
+		.u.c = {
+			.mode		= aopen_args->create_mode,
+			.want_excl	= aopen_args->open_flag & O_EXCL,
+			.try_aopen	= true,
+			.aopen		= aopen_args
+		}
+	};
+	return add_simple(dir, dentry, &arg);
+}
+
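+/*
+ * create an anonymous temporary file directly on the writable branch via the
+ * branch fs's ->tmpfile(), then set up the aufs dentry and inode for it.
+ */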
+int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int err;
+	aufs_bindex_t bindex;
+	struct super_block *sb;
+	struct dentry *parent, *h_parent, *h_dentry;
+	struct inode *h_dir, *inode;
+	struct vfsmount *h_mnt;
+	struct au_wr_dir_args wr_dir_args = {
+		.force_btgt	= -1,
+		.flags		= AuWrDir_TMPFILE
+	};
+
+	/* copy-up may happen */
+	inode_lock(dir);
+
+	sb = dir->i_sb;
+	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out;
+
+	err = au_di_init(dentry);
+	if (unlikely(err))
+		goto out_si;
+
+	err = -EBUSY;
+	parent = d_find_any_alias(dir);
+	AuDebugOn(!parent);
+	di_write_lock_parent(parent);
+	if (unlikely(d_inode(parent) != dir))
+		goto out_parent;
+
+	err = au_digen_test(parent, au_sigen(sb));
+	if (unlikely(err))
+		goto out_parent;
+
+	bindex = au_dbtop(parent);
+	au_set_dbtop(dentry, bindex);
+	au_set_dbbot(dentry, bindex);
+	err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args);
+	bindex = err;
+	if (unlikely(err < 0))
+		goto out_parent;
+
+	err = -EOPNOTSUPP;
+	h_dir = au_h_iptr(dir, bindex);
+	if (unlikely(!h_dir->i_op->tmpfile))
+		goto out_parent;
+
+	h_mnt = au_sbr_mnt(sb, bindex);
+	err = vfsub_mnt_want_write(h_mnt);
+	if (unlikely(err))
+		goto out_parent;
+
+	h_parent = au_h_dptr(parent, bindex);
+	err = inode_permission(d_inode(h_parent), MAY_WRITE | MAY_EXEC);
+	if (unlikely(err))
+		goto out_mnt;
+
+	err = -ENOMEM;
+	h_dentry = d_alloc(h_parent, &dentry->d_name);
+	if (unlikely(!h_dentry))
+		goto out_mnt;
+
+	err = h_dir->i_op->tmpfile(h_dir, h_dentry, mode);
+	if (unlikely(err))
+		goto out_dentry;
+
+	au_set_dbtop(dentry, bindex);
+	au_set_dbbot(dentry, bindex);
+	au_set_h_dptr(dentry, bindex, dget(h_dentry));
+	inode = au_new_inode(dentry, /*must_new*/1);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		au_set_h_dptr(dentry, bindex, NULL);
+		au_set_dbtop(dentry, -1);
+		au_set_dbbot(dentry, -1);
+	} else {
+		if (!inode->i_nlink)
+			set_nlink(inode, 1);
+		d_tmpfile(dentry, inode);
+		au_di(dentry)->di_tmpfile = 1;
+
+		/* update without i_mutex */
+		if (au_ibtop(dir) == au_dbtop(dentry))
+			au_cpup_attr_timesizes(dir);
+	}
+
+out_dentry:
+	dput(h_dentry);
+out_mnt:
+	vfsub_mnt_drop_write(h_mnt);
+out_parent:
+	di_write_unlock(parent);
+	dput(parent);
+	di_write_unlock(dentry);
+	if (unlikely(err)) {
+		au_di_fin(dentry);
+		dentry->d_fsdata = NULL;
+	}
+out_si:
+	si_read_unlock(sb);
+out:
+	inode_unlock(dir);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct au_link_args {
+	aufs_bindex_t bdst, bsrc;
+	struct au_pin pin;
+	struct path h_path;
+	struct dentry *src_parent, *parent;
+};
+
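+/* copy up @src_dentry (and its parent dirs) to the branch we will link on */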
+static int au_cpup_before_link(struct dentry *src_dentry,
+			       struct au_link_args *a)
+{
+	int err;
+	struct dentry *h_src_dentry;
+	struct au_cp_generic cpg = {
+		.dentry	= src_dentry,
+		.bdst	= a->bdst,
+		.bsrc	= a->bsrc,
+		.len	= -1,
+		.pin	= &a->pin,
+		.flags	= AuCpup_DTIME | AuCpup_HOPEN /* | AuCpup_KEEPLINO */
+	};
+
+	di_read_lock_parent(a->src_parent, AuLock_IR);
+	err = au_test_and_cpup_dirs(src_dentry, a->bdst);
+	if (unlikely(err))
+		goto out;
+
+	h_src_dentry = au_h_dptr(src_dentry, a->bsrc);
+	err = au_pin(&a->pin, src_dentry, a->bdst,
+		     au_opt_udba(src_dentry->d_sb),
+		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+	if (unlikely(err))
+		goto out;
+
+	err = au_sio_cpup_simple(&cpg);
+	au_unpin(&a->pin);
+
+out:
+	di_read_unlock(a->src_parent, AuLock_IR);
+	return err;
+}
+
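+/*
+ * pseudo-link case: when the source inode does not exist on the target branch
+ * yet, copy it up there under the new name; otherwise hardlink to the
+ * existing hidden inode. the result is recorded as a pseudo-link unless one
+ * already exists.
+ */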
+static int au_cpup_or_link(struct dentry *src_dentry, struct dentry *dentry,
+			   struct au_link_args *a)
+{
+	int err;
+	unsigned char plink;
+	aufs_bindex_t bbot;
+	struct dentry *h_src_dentry;
+	struct inode *h_inode, *inode, *delegated;
+	struct super_block *sb;
+	struct file *h_file;
+
+	plink = 0;
+	h_inode = NULL;
+	sb = src_dentry->d_sb;
+	inode = d_inode(src_dentry);
+	if (au_ibtop(inode) <= a->bdst)
+		h_inode = au_h_iptr(inode, a->bdst);
+	if (!h_inode || !h_inode->i_nlink) {
+		/* copy up src_dentry under the name of dentry. */
+		bbot = au_dbbot(dentry);
+		if (bbot < a->bsrc)
+			au_set_dbbot(dentry, a->bsrc);
+		au_set_h_dptr(dentry, a->bsrc,
+			      dget(au_h_dptr(src_dentry, a->bsrc)));
+		dget(a->h_path.dentry);
+		au_set_h_dptr(dentry, a->bdst, NULL);
+		AuDbg("temporary d_inode...\n");
+		spin_lock(&dentry->d_lock);
+		dentry->d_inode = d_inode(src_dentry); /* tmp */
+		spin_unlock(&dentry->d_lock);
+		h_file = au_h_open_pre(dentry, a->bsrc, /*force_wr*/0);
+		if (IS_ERR(h_file))
+			err = PTR_ERR(h_file);
+		else {
+			struct au_cp_generic cpg = {
+				.dentry	= dentry,
+				.bdst	= a->bdst,
+				.bsrc	= -1,
+				.len	= -1,
+				.pin	= &a->pin,
+				.flags	= AuCpup_KEEPLINO
+			};
+			err = au_sio_cpup_simple(&cpg);
+			au_h_open_post(dentry, a->bsrc, h_file);
+			if (!err) {
+				dput(a->h_path.dentry);
+				a->h_path.dentry = au_h_dptr(dentry, a->bdst);
+			} else
+				au_set_h_dptr(dentry, a->bdst,
+					      a->h_path.dentry);
+		}
+		spin_lock(&dentry->d_lock);
+		dentry->d_inode = NULL; /* restore */
+		spin_unlock(&dentry->d_lock);
+		AuDbg("temporary d_inode...done\n");
+		au_set_h_dptr(dentry, a->bsrc, NULL);
+		au_set_dbbot(dentry, bbot);
+	} else {
+		/* the inode of src_dentry already exists on the a->bdst branch */
+		h_src_dentry = d_find_alias(h_inode);
+		if (!h_src_dentry && au_plink_test(inode)) {
+			plink = 1;
+			h_src_dentry = au_plink_lkup(inode, a->bdst);
+			err = PTR_ERR(h_src_dentry);
+			if (IS_ERR(h_src_dentry))
+				goto out;
+
+			if (unlikely(d_is_negative(h_src_dentry))) {
+				dput(h_src_dentry);
+				h_src_dentry = NULL;
+			}
+
+		}
+		if (h_src_dentry) {
+			delegated = NULL;
+			err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin),
+					 &a->h_path, &delegated);
+			if (unlikely(err == -EWOULDBLOCK)) {
+				pr_warn("cannot retry for NFSv4 delegation"
+					" for an internal link\n");
+				iput(delegated);
+			}
+			dput(h_src_dentry);
+		} else {
+			AuIOErr("no dentry found for hi%lu on b%d\n",
+				h_inode->i_ino, a->bdst);
+			err = -EIO;
+		}
+	}
+
+	if (!err && !plink)
+		au_plink_append(inode, a->bdst, a->h_path.dentry);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
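+/*
+ * link(2): create the hardlink on the branch where the new name will live,
+ * copying up the source or recording a pseudo-link when the source lives on
+ * a different branch.
+ */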
+int aufs_link(struct dentry *src_dentry, struct inode *dir,
+	      struct dentry *dentry)
+{
+	int err, rerr;
+	struct au_dtime dt;
+	struct au_link_args *a;
+	struct dentry *wh_dentry, *h_src_dentry;
+	struct inode *inode, *delegated;
+	struct super_block *sb;
+	struct au_wr_dir_args wr_dir_args = {
+		/* .force_btgt	= -1, */
+		.flags		= AuWrDir_ADD_ENTRY
+	};
+
+	IMustLock(dir);
+	inode = d_inode(src_dentry);
+	IMustLock(inode);
+
+	err = -ENOMEM;
+	a = kzalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	a->parent = dentry->d_parent; /* dir inode is locked */
+	err = aufs_read_and_write_lock2(dentry, src_dentry,
+					AuLock_NOPLM | AuLock_GEN);
+	if (unlikely(err))
+		goto out_kfree;
+	err = au_d_linkable(src_dentry);
+	if (unlikely(err))
+		goto out_unlock;
+	err = au_d_may_add(dentry);
+	if (unlikely(err))
+		goto out_unlock;
+
+	a->src_parent = dget_parent(src_dentry);
+	wr_dir_args.force_btgt = au_ibtop(inode);
+
+	di_write_lock_parent(a->parent);
+	wr_dir_args.force_btgt = au_wbr(dentry, wr_dir_args.force_btgt);
+	wh_dentry = lock_hdir_lkup_wh(dentry, &dt, src_dentry, &a->pin,
+				      &wr_dir_args);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry))
+		goto out_parent;
+
+	err = 0;
+	sb = dentry->d_sb;
+	a->bdst = au_dbtop(dentry);
+	a->h_path.dentry = au_h_dptr(dentry, a->bdst);
+	a->h_path.mnt = au_sbr_mnt(sb, a->bdst);
+	a->bsrc = au_ibtop(inode);
+	h_src_dentry = au_h_d_alias(src_dentry, a->bsrc);
+	if (!h_src_dentry && au_di(src_dentry)->di_tmpfile)
+		h_src_dentry = dget(au_hi_wh(inode, a->bsrc));
+	if (!h_src_dentry) {
+		a->bsrc = au_dbtop(src_dentry);
+		h_src_dentry = au_h_d_alias(src_dentry, a->bsrc);
+		AuDebugOn(!h_src_dentry);
+	} else if (IS_ERR(h_src_dentry)) {
+		err = PTR_ERR(h_src_dentry);
+		goto out_parent;
+	}
+
+	if (au_opt_test(au_mntflags(sb), PLINK)) {
+		if (a->bdst < a->bsrc
+		    /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */)
+			err = au_cpup_or_link(src_dentry, dentry, a);
+		else {
+			delegated = NULL;
+			err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin),
+					 &a->h_path, &delegated);
+			if (unlikely(err == -EWOULDBLOCK)) {
+				pr_warn("cannot retry for NFSv4 delegation"
+					" for an internal link\n");
+				iput(delegated);
+			}
+		}
+		dput(h_src_dentry);
+	} else {
+		/*
+		 * copy up src_dentry to the branch we are processing,
+		 * and then link(2) to it.
+		 */
+		dput(h_src_dentry);
+		if (a->bdst < a->bsrc
+		    /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) {
+			au_unpin(&a->pin);
+			di_write_unlock(a->parent);
+			err = au_cpup_before_link(src_dentry, a);
+			di_write_lock_parent(a->parent);
+			if (!err)
+				err = au_pin(&a->pin, dentry, a->bdst,
+					     au_opt_udba(sb),
+					     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+			if (unlikely(err))
+				goto out_wh;
+		}
+		if (!err) {
+			h_src_dentry = au_h_dptr(src_dentry, a->bdst);
+			err = -ENOENT;
+			if (h_src_dentry && d_is_positive(h_src_dentry)) {
+				delegated = NULL;
+				err = vfsub_link(h_src_dentry,
+						 au_pinned_h_dir(&a->pin),
+						 &a->h_path, &delegated);
+				if (unlikely(err == -EWOULDBLOCK)) {
+					pr_warn("cannot retry"
+						" for NFSv4 delegation"
+						" for an internal link\n");
+					iput(delegated);
+				}
+			}
+		}
+	}
+	if (unlikely(err))
+		goto out_unpin;
+
+	if (wh_dentry) {
+		a->h_path.dentry = wh_dentry;
+		err = au_wh_unlink_dentry(au_pinned_h_dir(&a->pin), &a->h_path,
+					  dentry);
+		if (unlikely(err))
+			goto out_revert;
+	}
+
+	au_dir_ts(dir, a->bdst);
+	dir->i_version++;
+	inc_nlink(inode);
+	inode->i_ctime = dir->i_ctime;
+	d_instantiate(dentry, au_igrab(inode));
+	if (d_unhashed(a->h_path.dentry))
+		/* some filesystems call d_drop() */
+		d_drop(dentry);
+	/* some filesystems consume an inode even for a hardlink */
+	au_fhsm_wrote(sb, a->bdst, /*force*/0);
+	goto out_unpin; /* success */
+
+out_revert:
+	/* no delegation since it is just created */
+	rerr = vfsub_unlink(au_pinned_h_dir(&a->pin), &a->h_path,
+			    /*delegated*/NULL, /*force*/0);
+	if (unlikely(rerr)) {
+		AuIOErr("%pd reverting failed(%d, %d)\n", dentry, err, rerr);
+		err = -EIO;
+	}
+	au_dtime_revert(&dt);
+out_unpin:
+	au_unpin(&a->pin);
+out_wh:
+	dput(wh_dentry);
+out_parent:
+	di_write_unlock(a->parent);
+	dput(a->src_parent);
+out_unlock:
+	if (unlikely(err)) {
+		au_update_dbtop(dentry);
+		d_drop(dentry);
+	}
+	aufs_read_and_write_unlock2(dentry, src_dentry);
+out_kfree:
+	au_delayed_kfree(a);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
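+/*
+ * mkdir(2): create the dir on the writable branch and make it opaque when a
+ * whiteout existed for the name or the ALWAYS_DIROPQ option is set.
+ */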
+int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+	int err, rerr;
+	aufs_bindex_t bindex;
+	unsigned char diropq;
+	struct path h_path;
+	struct dentry *wh_dentry, *parent, *opq_dentry;
+	struct inode *h_inode;
+	struct super_block *sb;
+	struct {
+		struct au_pin pin;
+		struct au_dtime dt;
+	} *a; /* reduce the stack usage */
+	struct au_wr_dir_args wr_dir_args = {
+		.force_btgt	= -1,
+		.flags		= AuWrDir_ADD_ENTRY | AuWrDir_ISDIR
+	};
+
+	IMustLock(dir);
+
+	err = -ENOMEM;
+	a = kmalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN);
+	if (unlikely(err))
+		goto out_free;
+	err = au_d_may_add(dentry);
+	if (unlikely(err))
+		goto out_unlock;
+
+	parent = dentry->d_parent; /* dir inode is locked */
+	di_write_lock_parent(parent);
+	wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL,
+				      &a->pin, &wr_dir_args);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry))
+		goto out_parent;
+
+	sb = dentry->d_sb;
+	bindex = au_dbtop(dentry);
+	h_path.dentry = au_h_dptr(dentry, bindex);
+	h_path.mnt = au_sbr_mnt(sb, bindex);
+	err = vfsub_mkdir(au_pinned_h_dir(&a->pin), &h_path, mode);
+	if (unlikely(err))
+		goto out_unpin;
+
+	/* make the dir opaque */
+	diropq = 0;
+	h_inode = d_inode(h_path.dentry);
+	if (wh_dentry
+	    || au_opt_test(au_mntflags(sb), ALWAYS_DIROPQ)) {
+		inode_lock_nested(h_inode, AuLsc_I_CHILD);
+		opq_dentry = au_diropq_create(dentry, bindex);
+		inode_unlock(h_inode);
+		err = PTR_ERR(opq_dentry);
+		if (IS_ERR(opq_dentry))
+			goto out_dir;
+		dput(opq_dentry);
+		diropq = 1;
+	}
+
+	err = epilog(dir, bindex, wh_dentry, dentry);
+	if (!err) {
+		inc_nlink(dir);
+		goto out_unpin; /* success */
+	}
+
+	/* revert */
+	if (diropq) {
+		AuLabel(revert opq);
+		inode_lock_nested(h_inode, AuLsc_I_CHILD);
+		rerr = au_diropq_remove(dentry, bindex);
+		inode_unlock(h_inode);
+		if (rerr) {
+			AuIOErr("%pd reverting diropq failed(%d, %d)\n",
+				dentry, err, rerr);
+			err = -EIO;
+		}
+	}
+
+out_dir:
+	AuLabel(revert dir);
+	rerr = vfsub_rmdir(au_pinned_h_dir(&a->pin), &h_path);
+	if (rerr) {
+		AuIOErr("%pd reverting dir failed(%d, %d)\n",
+			dentry, err, rerr);
+		err = -EIO;
+	}
+	au_dtime_revert(&a->dt);
+out_unpin:
+	au_unpin(&a->pin);
+	dput(wh_dentry);
+out_parent:
+	di_write_unlock(parent);
+out_unlock:
+	if (unlikely(err)) {
+		au_update_dbtop(dentry);
+		d_drop(dentry);
+	}
+	aufs_read_unlock(dentry, AuLock_DW);
+out_free:
+	au_delayed_kfree(a);
+out:
+	return err;
+}
diff --git b/fs/aufs/i_op_del.c b/fs/aufs/i_op_del.c
new file mode 100644
index 0000000..2fe8f07
--- /dev/null
+++ b/fs/aufs/i_op_del.c
@@ -0,0 +1,498 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode operations (del entry)
+ */
+
+#include "aufs.h"
+
+/*
+ * decide if a new whiteout for @dentry is necessary or not.
+ * when it is necessary, prepare the parent dir for the upper branch whose
+ * branch index is @bcpup for creation. the actual creation of the whiteout
+ * will be done by the caller.
+ * return value:
+ * 0: wh is unnecessary
+ * plus: wh is necessary
+ * minus: error
+ */
+int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup)
+{
+	int need_wh, err;
+	aufs_bindex_t btop;
+	struct super_block *sb;
+
+	sb = dentry->d_sb;
+	btop = au_dbtop(dentry);
+	if (*bcpup < 0) {
+		*bcpup = btop;
+		if (au_test_ro(sb, btop, d_inode(dentry))) {
+			err = AuWbrCopyup(au_sbi(sb), dentry);
+			*bcpup = err;
+			if (unlikely(err < 0))
+				goto out;
+		}
+	} else
+		AuDebugOn(btop < *bcpup
+			  || au_test_ro(sb, *bcpup, d_inode(dentry)));
+	AuDbg("bcpup %d, btop %d\n", *bcpup, btop);
+
+	if (*bcpup != btop) {
+		err = au_cpup_dirs(dentry, *bcpup);
+		if (unlikely(err))
+			goto out;
+		need_wh = 1;
+	} else {
+		struct au_dinfo *dinfo, *tmp;
+
+		need_wh = -ENOMEM;
+		dinfo = au_di(dentry);
+		tmp = au_di_alloc(sb, AuLsc_DI_TMP);
+		if (tmp) {
+			au_di_cp(tmp, dinfo);
+			au_di_swap(tmp, dinfo);
+			/* returns the number of positive dentries */
+			need_wh = au_lkup_dentry(dentry, btop + 1,
+						 /* AuLkup_IGNORE_PERM */ 0);
+			au_di_swap(tmp, dinfo);
+			au_rw_write_unlock(&tmp->di_rwsem);
+			au_di_free(tmp);
+		}
+	}
+	AuDbg("need_wh %d\n", need_wh);
+	err = need_wh;
+
+out:
+	return err;
+}
+
+/*
+ * simple tests for the del-entry operations.
+ * following the checks in vfs, plus the parent-child relationship.
+ */
+int au_may_del(struct dentry *dentry, aufs_bindex_t bindex,
+	       struct dentry *h_parent, int isdir)
+{
+	int err;
+	umode_t h_mode;
+	struct dentry *h_dentry, *h_latest;
+	struct inode *h_inode;
+
+	h_dentry = au_h_dptr(dentry, bindex);
+	if (d_really_is_positive(dentry)) {
+		err = -ENOENT;
+		if (unlikely(d_is_negative(h_dentry)))
+			goto out;
+		h_inode = d_inode(h_dentry);
+		if (unlikely(!h_inode->i_nlink))
+			goto out;
+
+		h_mode = h_inode->i_mode;
+		if (!isdir) {
+			err = -EISDIR;
+			if (unlikely(S_ISDIR(h_mode)))
+				goto out;
+		} else if (unlikely(!S_ISDIR(h_mode))) {
+			err = -ENOTDIR;
+			goto out;
+		}
+	} else {
+		/* rename(2) case */
+		err = -EIO;
+		if (unlikely(d_is_positive(h_dentry)))
+			goto out;
+	}
+
+	err = -ENOENT;
+	/* expected parent dir is locked */
+	if (unlikely(h_parent != h_dentry->d_parent))
+		goto out;
+	err = 0;
+
+	/*
+	 * rmdir on a dir may break consistency on some filesystems.
+	 * let's try a heavier test.
+	 */
+	err = -EACCES;
+	if (unlikely(!au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1)
+		     && au_test_h_perm(d_inode(h_parent),
+				       MAY_EXEC | MAY_WRITE)))
+		goto out;
+
+	h_latest = au_sio_lkup_one(&dentry->d_name, h_parent);
+	err = -EIO;
+	if (IS_ERR(h_latest))
+		goto out;
+	if (h_latest == h_dentry)
+		err = 0;
+	dput(h_latest);
+
+out:
+	return err;
+}
+
+/*
+ * decide the branch where we operate for @dentry. the branch index will be
+ * set to @rbcpup. after deciding it, 'pin' it and store the timestamps of
+ * the parent dir for reverting.
+ * when a new whiteout is necessary, create it.
+ */
+static struct dentry*
+lock_hdir_create_wh(struct dentry *dentry, int isdir, aufs_bindex_t *rbcpup,
+		    struct au_dtime *dt, struct au_pin *pin)
+{
+	struct dentry *wh_dentry;
+	struct super_block *sb;
+	struct path h_path;
+	int err, need_wh;
+	unsigned int udba;
+	aufs_bindex_t bcpup;
+
+	need_wh = au_wr_dir_need_wh(dentry, isdir, rbcpup);
+	wh_dentry = ERR_PTR(need_wh);
+	if (unlikely(need_wh < 0))
+		goto out;
+
+	sb = dentry->d_sb;
+	udba = au_opt_udba(sb);
+	bcpup = *rbcpup;
+	err = au_pin(pin, dentry, bcpup, udba,
+		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+	wh_dentry = ERR_PTR(err);
+	if (unlikely(err))
+		goto out;
+
+	h_path.dentry = au_pinned_h_parent(pin);
+	if (udba != AuOpt_UDBA_NONE
+	    && au_dbtop(dentry) == bcpup) {
+		err = au_may_del(dentry, bcpup, h_path.dentry, isdir);
+		wh_dentry = ERR_PTR(err);
+		if (unlikely(err))
+			goto out_unpin;
+	}
+
+	h_path.mnt = au_sbr_mnt(sb, bcpup);
+	au_dtime_store(dt, au_pinned_parent(pin), &h_path);
+	wh_dentry = NULL;
+	if (!need_wh)
+		goto out; /* success, no need to create whiteout */
+
+	wh_dentry = au_wh_create(dentry, bcpup, h_path.dentry);
+	if (IS_ERR(wh_dentry))
+		goto out_unpin;
+
+	/* returns with the parent locked and wh_dentry dget-ed */
+	goto out; /* success */
+
+out_unpin:
+	au_unpin(pin);
+out:
+	return wh_dentry;
+}
+
+/*
+ * when removing a dir, rename it to a unique temporary whiteout-ed name first
+ * in order to be revertible and save time for removing many child whiteouts
+ * under the dir.
+ * returns 1 when there are too many child whiteouts and the caller should
+ * remove them asynchronously. returns 0 when the number of children is small
+ * enough to remove now or the branch fs is a remote fs.
+ * otherwise returns an error.
+ */
+static int renwh_and_rmdir(struct dentry *dentry, aufs_bindex_t bindex,
+			   struct au_nhash *whlist, struct inode *dir)
+{
+	int rmdir_later, err, dirwh;
+	struct dentry *h_dentry;
+	struct super_block *sb;
+	struct inode *inode;
+
+	sb = dentry->d_sb;
+	SiMustAnyLock(sb);
+	h_dentry = au_h_dptr(dentry, bindex);
+	err = au_whtmp_ren(h_dentry, au_sbr(sb, bindex));
+	if (unlikely(err))
+		goto out;
+
+	/* stop monitoring */
+	inode = d_inode(dentry);
+	au_hn_free(au_hi(inode, bindex));
+
+	if (!au_test_fs_remote(h_dentry->d_sb)) {
+		dirwh = au_sbi(sb)->si_dirwh;
+		rmdir_later = (dirwh <= 1);
+		if (!rmdir_later)
+			rmdir_later = au_nhash_test_longer_wh(whlist, bindex,
+							      dirwh);
+		if (rmdir_later)
+			return rmdir_later;
+	}
+
+	err = au_whtmp_rmdir(dir, bindex, h_dentry, whlist);
+	if (unlikely(err)) {
+		AuIOErr("rmdir %pd, b%d failed, %d. ignored\n",
+			h_dentry, bindex, err);
+		err = 0;
+	}
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/*
+ * final procedure for deleting an entry.
+ * maintain dentry and iattr.
+ */
+static void epilog(struct inode *dir, struct dentry *dentry,
+		   aufs_bindex_t bindex)
+{
+	struct inode *inode;
+
+	inode = d_inode(dentry);
+	d_drop(dentry);
+	inode->i_ctime = dir->i_ctime;
+
+	au_dir_ts(dir, bindex);
+	dir->i_version++;
+}
+
+/*
+ * when an error happened, remove the created whiteout and revert everything.
+ */
+static int do_revert(int err, struct inode *dir, aufs_bindex_t bindex,
+		     aufs_bindex_t bwh, struct dentry *wh_dentry,
+		     struct dentry *dentry, struct au_dtime *dt)
+{
+	int rerr;
+	struct path h_path = {
+		.dentry	= wh_dentry,
+		.mnt	= au_sbr_mnt(dir->i_sb, bindex)
+	};
+
+	rerr = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, dentry);
+	if (!rerr) {
+		au_set_dbwh(dentry, bwh);
+		au_dtime_revert(dt);
+		return 0;
+	}
+
+	AuIOErr("%pd reverting whiteout failed(%d, %d)\n", dentry, err, rerr);
+	return -EIO;
+}
+
+/* ---------------------------------------------------------------------- */
+
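+/*
+ * unlink(2): unlink the hidden file when it lives on the whiteout's branch;
+ * when it lives on a lower (readonly) branch only, the whiteout created by
+ * lock_hdir_create_wh() is sufficient.
+ */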
+int aufs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	int err;
+	aufs_bindex_t bwh, bindex, btop;
+	struct inode *inode, *h_dir, *delegated;
+	struct dentry *parent, *wh_dentry;
+	/* to reduce stack size */
+	struct {
+		struct au_dtime dt;
+		struct au_pin pin;
+		struct path h_path;
+	} *a;
+
+	IMustLock(dir);
+
+	err = -ENOMEM;
+	a = kmalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN);
+	if (unlikely(err))
+		goto out_free;
+	err = au_d_hashed_positive(dentry);
+	if (unlikely(err))
+		goto out_unlock;
+	inode = d_inode(dentry);
+	IMustLock(inode);
+	err = -EISDIR;
+	if (unlikely(d_is_dir(dentry)))
+		goto out_unlock; /* possible? */
+
+	btop = au_dbtop(dentry);
+	bwh = au_dbwh(dentry);
+	bindex = -1;
+	parent = dentry->d_parent; /* dir inode is locked */
+	di_write_lock_parent(parent);
+	wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/0, &bindex, &a->dt,
+					&a->pin);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry))
+		goto out_parent;
+
+	a->h_path.mnt = au_sbr_mnt(dentry->d_sb, btop);
+	a->h_path.dentry = au_h_dptr(dentry, btop);
+	dget(a->h_path.dentry);
+	if (bindex == btop) {
+		h_dir = au_pinned_h_dir(&a->pin);
+		delegated = NULL;
+		err = vfsub_unlink(h_dir, &a->h_path, &delegated, /*force*/0);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal unlink\n");
+			iput(delegated);
+		}
+	} else {
+		/* dir inode is locked */
+		h_dir = d_inode(wh_dentry->d_parent);
+		IMustLock(h_dir);
+		err = 0;
+	}
+
+	if (!err) {
+		vfsub_drop_nlink(inode);
+		epilog(dir, dentry, bindex);
+
+		/* update target timestamps */
+		if (bindex == btop) {
+			vfsub_update_h_iattr(&a->h_path, /*did*/NULL);
+			/*ignore*/
+			inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime;
+		} else
+			/* todo: this timestamp may be reverted later */
+			inode->i_ctime = h_dir->i_ctime;
+		goto out_unpin; /* success */
+	}
+
+	/* revert */
+	if (wh_dentry) {
+		int rerr;
+
+		rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry,
+				 &a->dt);
+		if (rerr)
+			err = rerr;
+	}
+
+out_unpin:
+	au_unpin(&a->pin);
+	dput(wh_dentry);
+	dput(a->h_path.dentry);
+out_parent:
+	di_write_unlock(parent);
+out_unlock:
+	aufs_read_unlock(dentry, AuLock_DW);
+out_free:
+	au_delayed_kfree(a);
+out:
+	return err;
+}
+
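+/*
+ * rmdir(2): rename the hidden dir to a temporary whiteout'ed name and remove
+ * it, deferring the real removal asynchronously when it contains many child
+ * whiteouts; on a lower branch a whiteout alone is enough.
+ */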
+int aufs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	int err, rmdir_later;
+	aufs_bindex_t bwh, bindex, btop;
+	struct inode *inode;
+	struct dentry *parent, *wh_dentry, *h_dentry;
+	struct au_whtmp_rmdir *args;
+	/* to reduce stack size */
+	struct {
+		struct au_dtime dt;
+		struct au_pin pin;
+	} *a;
+
+	IMustLock(dir);
+
+	err = -ENOMEM;
+	a = kmalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN);
+	if (unlikely(err))
+		goto out_free;
+	err = au_alive_dir(dentry);
+	if (unlikely(err))
+		goto out_unlock;
+	inode = d_inode(dentry);
+	IMustLock(inode);
+	err = -ENOTDIR;
+	if (unlikely(!d_is_dir(dentry)))
+		goto out_unlock; /* possible? */
+
+	err = -ENOMEM;
+	args = au_whtmp_rmdir_alloc(dir->i_sb, GFP_NOFS);
+	if (unlikely(!args))
+		goto out_unlock;
+
+	parent = dentry->d_parent; /* dir inode is locked */
+	di_write_lock_parent(parent);
+	err = au_test_empty(dentry, &args->whlist);
+	if (unlikely(err))
+		goto out_parent;
+
+	btop = au_dbtop(dentry);
+	bwh = au_dbwh(dentry);
+	bindex = -1;
+	wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/1, &bindex, &a->dt,
+					&a->pin);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry))
+		goto out_parent;
+
+	h_dentry = au_h_dptr(dentry, btop);
+	dget(h_dentry);
+	rmdir_later = 0;
+	if (bindex == btop) {
+		err = renwh_and_rmdir(dentry, btop, &args->whlist, dir);
+		if (err > 0) {
+			rmdir_later = err;
+			err = 0;
+		}
+	} else {
+		/* stop monitoring */
+		au_hn_free(au_hi(inode, btop));
+
+		/* dir inode is locked */
+		IMustLock(d_inode(wh_dentry->d_parent));
+		err = 0;
+	}
+
+	if (!err) {
+		vfsub_dead_dir(inode);
+		au_set_dbdiropq(dentry, -1);
+		epilog(dir, dentry, bindex);
+
+		if (rmdir_later) {
+			au_whtmp_kick_rmdir(dir, btop, h_dentry, args);
+			args = NULL;
+		}
+
+		goto out_unpin; /* success */
+	}
+
+	/* revert */
+	AuLabel(revert);
+	if (wh_dentry) {
+		int rerr;
+
+		rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry,
+				 &a->dt);
+		if (rerr)
+			err = rerr;
+	}
+
+out_unpin:
+	au_unpin(&a->pin);
+	dput(wh_dentry);
+	dput(h_dentry);
+out_parent:
+	di_write_unlock(parent);
+	if (args)
+		au_whtmp_rmdir_free(args);
+out_unlock:
+	aufs_read_unlock(dentry, AuLock_DW);
+out_free:
+	au_delayed_kfree(a);
+out:
+	AuTraceErr(err);
+	return err;
+}
diff --git b/fs/aufs/i_op_ren.c b/fs/aufs/i_op_ren.c
new file mode 100644
index 0000000..43a0909
--- /dev/null
+++ b/fs/aufs/i_op_ren.c
@@ -0,0 +1,1002 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode operations (rename entry)
+ * todo: this is a crazy monster
+ */
+
+#include "aufs.h"
+
+enum { AuSRC, AuDST, AuSrcDst };
+enum { AuPARENT, AuCHILD, AuParentChild };
+
+#define AuRen_ISDIR	1
+#define AuRen_ISSAMEDIR	(1 << 1)
+#define AuRen_WHSRC	(1 << 2)
+#define AuRen_WHDST	(1 << 3)
+#define AuRen_MNT_WRITE	(1 << 4)
+#define AuRen_DT_DSTDIR	(1 << 5)
+#define AuRen_DIROPQ	(1 << 6)
+#define au_ftest_ren(flags, name)	((flags) & AuRen_##name)
+#define au_fset_ren(flags, name) \
+	do { (flags) |= AuRen_##name; } while (0)
+#define au_fclr_ren(flags, name) \
+	do { (flags) &= ~AuRen_##name; } while (0)
+
+struct au_ren_args {
+	struct {
+		struct dentry *dentry, *h_dentry, *parent, *h_parent,
+			*wh_dentry;
+		struct inode *dir, *inode;
+		struct au_hinode *hdir;
+		struct au_dtime dt[AuParentChild];
+		aufs_bindex_t btop;
+	} sd[AuSrcDst];
+
+#define src_dentry	sd[AuSRC].dentry
+#define src_dir		sd[AuSRC].dir
+#define src_inode	sd[AuSRC].inode
+#define src_h_dentry	sd[AuSRC].h_dentry
+#define src_parent	sd[AuSRC].parent
+#define src_h_parent	sd[AuSRC].h_parent
+#define src_wh_dentry	sd[AuSRC].wh_dentry
+#define src_hdir	sd[AuSRC].hdir
+#define src_h_dir	sd[AuSRC].hdir->hi_inode
+#define src_dt		sd[AuSRC].dt
+#define src_btop	sd[AuSRC].btop
+
+#define dst_dentry	sd[AuDST].dentry
+#define dst_dir		sd[AuDST].dir
+#define dst_inode	sd[AuDST].inode
+#define dst_h_dentry	sd[AuDST].h_dentry
+#define dst_parent	sd[AuDST].parent
+#define dst_h_parent	sd[AuDST].h_parent
+#define dst_wh_dentry	sd[AuDST].wh_dentry
+#define dst_hdir	sd[AuDST].hdir
+#define dst_h_dir	sd[AuDST].hdir->hi_inode
+#define dst_dt		sd[AuDST].dt
+#define dst_btop	sd[AuDST].btop
+
+	struct dentry *h_trap;
+	struct au_branch *br;
+	struct au_hinode *src_hinode;
+	struct path h_path;
+	struct au_nhash whlist;
+	aufs_bindex_t btgt, src_bwh, src_bdiropq;
+
+	unsigned int flags;
+
+	struct au_whtmp_rmdir *thargs;
+	struct dentry *h_dst;
+};
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * functions for reverting.
+ * when an error happens in a single rename syscall, we should revert
+ * everything as if nothing had happened.
+ * we don't need to revert the copied-up/down parent dirs since they are
+ * harmless.
+ */
+
+#define RevertFailure(fmt, ...) do { \
+	AuIOErr("revert failure: " fmt " (%d, %d)\n", \
+		##__VA_ARGS__, err, rerr); \
+	err = -EIO; \
+} while (0)
+
+static void au_ren_rev_diropq(int err, struct au_ren_args *a)
+{
+	int rerr;
+
+	au_hn_inode_lock_nested(a->src_hinode, AuLsc_I_CHILD);
+	rerr = au_diropq_remove(a->src_dentry, a->btgt);
+	au_hn_inode_unlock(a->src_hinode);
+	au_set_dbdiropq(a->src_dentry, a->src_bdiropq);
+	if (rerr)
+		RevertFailure("remove diropq %pd", a->src_dentry);
+}
+
+static void au_ren_rev_rename(int err, struct au_ren_args *a)
+{
+	int rerr;
+	struct inode *delegated;
+
+	a->h_path.dentry = vfsub_lkup_one(&a->src_dentry->d_name,
+					  a->src_h_parent);
+	rerr = PTR_ERR(a->h_path.dentry);
+	if (IS_ERR(a->h_path.dentry)) {
+		RevertFailure("lkup one %pd", a->src_dentry);
+		return;
+	}
+
+	delegated = NULL;
+	rerr = vfsub_rename(a->dst_h_dir,
+			    au_h_dptr(a->src_dentry, a->btgt),
+			    a->src_h_dir, &a->h_path, &delegated);
+	if (unlikely(rerr == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal rename\n");
+		iput(delegated);
+	}
+	d_drop(a->h_path.dentry);
+	dput(a->h_path.dentry);
+	/* au_set_h_dptr(a->src_dentry, a->btgt, NULL); */
+	if (rerr)
+		RevertFailure("rename %pd", a->src_dentry);
+}
+
+static void au_ren_rev_whtmp(int err, struct au_ren_args *a)
+{
+	int rerr;
+	struct inode *delegated;
+
+	a->h_path.dentry = vfsub_lkup_one(&a->dst_dentry->d_name,
+					  a->dst_h_parent);
+	rerr = PTR_ERR(a->h_path.dentry);
+	if (IS_ERR(a->h_path.dentry)) {
+		RevertFailure("lkup one %pd", a->dst_dentry);
+		return;
+	}
+	if (d_is_positive(a->h_path.dentry)) {
+		d_drop(a->h_path.dentry);
+		dput(a->h_path.dentry);
+		return;
+	}
+
+	delegated = NULL;
+	rerr = vfsub_rename(a->dst_h_dir, a->h_dst, a->dst_h_dir, &a->h_path,
+			    &delegated);
+	if (unlikely(rerr == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal rename\n");
+		iput(delegated);
+	}
+	d_drop(a->h_path.dentry);
+	dput(a->h_path.dentry);
+	if (!rerr)
+		au_set_h_dptr(a->dst_dentry, a->btgt, dget(a->h_dst));
+	else
+		RevertFailure("rename %pd", a->h_dst);
+}
+
+static void au_ren_rev_whsrc(int err, struct au_ren_args *a)
+{
+	int rerr;
+
+	a->h_path.dentry = a->src_wh_dentry;
+	rerr = au_wh_unlink_dentry(a->src_h_dir, &a->h_path, a->src_dentry);
+	au_set_dbwh(a->src_dentry, a->src_bwh);
+	if (rerr)
+		RevertFailure("unlink %pd", a->src_wh_dentry);
+}
+#undef RevertFailure
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * when we have to copy-up the entry being renamed, do it under the
+ * rename-target name to minimize the cost (the actual rename afterwards
+ * becomes unnecessary).
+ * otherwise rename it on the target branch.
+ */
+static int au_ren_or_cpup(struct au_ren_args *a)
+{
+	int err;
+	struct dentry *d;
+	struct inode *delegated;
+
+	d = a->src_dentry;
+	if (au_dbtop(d) == a->btgt) {
+		a->h_path.dentry = a->dst_h_dentry;
+		if (au_ftest_ren(a->flags, DIROPQ)
+		    && au_dbdiropq(d) == a->btgt)
+			au_fclr_ren(a->flags, DIROPQ);
+		AuDebugOn(au_dbtop(d) != a->btgt);
+		delegated = NULL;
+		err = vfsub_rename(a->src_h_dir, au_h_dptr(d, a->btgt),
+				   a->dst_h_dir, &a->h_path, &delegated);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal rename\n");
+			iput(delegated);
+		}
+	} else
+		BUG();
+
+	if (!err && a->h_dst)
+		/* it will be set to dinfo later */
+		dget(a->h_dst);
+
+	return err;
+}
+
+/* cf. aufs_rmdir() */
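+/*
+ * if the whiteout list is short enough (si_dirwh) or the branch fs is
+ * remote, remove the whtmp dir synchronously here; otherwise hand the work
+ * over to the workqueue via au_whtmp_kick_rmdir().
+ */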
+static int au_ren_del_whtmp(struct au_ren_args *a)
+{
+	int err;
+	struct inode *dir;
+
+	dir = a->dst_dir;
+	SiMustAnyLock(dir->i_sb);
+	if (!au_nhash_test_longer_wh(&a->whlist, a->btgt,
+				     au_sbi(dir->i_sb)->si_dirwh)
+	    || au_test_fs_remote(a->h_dst->d_sb)) {
+		err = au_whtmp_rmdir(dir, a->btgt, a->h_dst, &a->whlist);
+		if (unlikely(err))
+			pr_warn("failed removing whtmp dir %pd (%d), "
+				"ignored.\n", a->h_dst, err);
+	} else {
+		au_nhash_wh_free(&a->thargs->whlist);
+		a->thargs->whlist = a->whlist;
+		a->whlist.nh_num = 0;
+		au_whtmp_kick_rmdir(dir, a->btgt, a->h_dst, a->thargs);
+		dput(a->h_dst);
+		a->thargs = NULL;
+	}
+
+	return 0;
+}
+
+/* make it an 'opaque' dir. */
+static int au_ren_diropq(struct au_ren_args *a)
+{
+	int err;
+	struct dentry *diropq;
+
+	err = 0;
+	a->src_bdiropq = au_dbdiropq(a->src_dentry);
+	a->src_hinode = au_hi(a->src_inode, a->btgt);
+	au_hn_inode_lock_nested(a->src_hinode, AuLsc_I_CHILD);
+	diropq = au_diropq_create(a->src_dentry, a->btgt);
+	au_hn_inode_unlock(a->src_hinode);
+	if (IS_ERR(diropq))
+		err = PTR_ERR(diropq);
+	else
+		dput(diropq);
+
+	return err;
+}
+
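+/*
+ * the core of rename, called with all the necessary locks held.
+ * roughly: prepare the args for an asynchronous rmdir, create a whiteout
+ * for src, look up the whiteout for dst, move an existing dst dir to whtmp,
+ * rename src on the target branch, make the renamed dir opaque when
+ * necessary, update the timestamps, and finally remove the dst whiteout and
+ * the whtmp dir. on error, the au_ren_rev_*() functions above revert the
+ * completed steps.
+ */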
+static int do_rename(struct au_ren_args *a)
+{
+	int err;
+	struct dentry *d, *h_d;
+
+	/* prepare workqueue args for asynchronous rmdir */
+	h_d = a->dst_h_dentry;
+	if (au_ftest_ren(a->flags, ISDIR) && d_is_positive(h_d)) {
+		err = -ENOMEM;
+		a->thargs = au_whtmp_rmdir_alloc(a->src_dentry->d_sb, GFP_NOFS);
+		if (unlikely(!a->thargs))
+			goto out;
+		a->h_dst = dget(h_d);
+	}
+
+	/* create whiteout for src_dentry */
+	if (au_ftest_ren(a->flags, WHSRC)) {
+		a->src_bwh = au_dbwh(a->src_dentry);
+		AuDebugOn(a->src_bwh >= 0);
+		a->src_wh_dentry
+			= au_wh_create(a->src_dentry, a->btgt, a->src_h_parent);
+		err = PTR_ERR(a->src_wh_dentry);
+		if (IS_ERR(a->src_wh_dentry))
+			goto out_thargs;
+	}
+
+	/* lookup whiteout for dentry */
+	if (au_ftest_ren(a->flags, WHDST)) {
+		h_d = au_wh_lkup(a->dst_h_parent, &a->dst_dentry->d_name,
+				 a->br);
+		err = PTR_ERR(h_d);
+		if (IS_ERR(h_d))
+			goto out_whsrc;
+		if (d_is_negative(h_d))
+			dput(h_d);
+		else
+			a->dst_wh_dentry = h_d;
+	}
+
+	/* rename dentry to tmpwh */
+	if (a->thargs) {
+		err = au_whtmp_ren(a->dst_h_dentry, a->br);
+		if (unlikely(err))
+			goto out_whdst;
+
+		d = a->dst_dentry;
+		au_set_h_dptr(d, a->btgt, NULL);
+		err = au_lkup_neg(d, a->btgt, /*wh*/0);
+		if (unlikely(err))
+			goto out_whtmp;
+		a->dst_h_dentry = au_h_dptr(d, a->btgt);
+	}
+
+	BUG_ON(d_is_positive(a->dst_h_dentry) && a->src_btop != a->btgt);
+
+	/* rename by vfs_rename or cpup */
+	d = a->dst_dentry;
+	if (au_ftest_ren(a->flags, ISDIR)
+	    && (a->dst_wh_dentry
+		|| au_dbdiropq(d) == a->btgt
+		/* hide the lower to keep xino */
+		|| a->btgt < au_dbbot(d)
+		|| au_opt_test(au_mntflags(d->d_sb), ALWAYS_DIROPQ)))
+		au_fset_ren(a->flags, DIROPQ);
+	err = au_ren_or_cpup(a);
+	if (unlikely(err))
+		/* leave the copied-up one */
+		goto out_whtmp;
+
+	/* make dir opaque */
+	if (au_ftest_ren(a->flags, DIROPQ)) {
+		err = au_ren_diropq(a);
+		if (unlikely(err))
+			goto out_rename;
+	}
+
+	/* update target timestamps */
+	AuDebugOn(au_dbtop(a->src_dentry) != a->btgt);
+	a->h_path.dentry = au_h_dptr(a->src_dentry, a->btgt);
+	vfsub_update_h_iattr(&a->h_path, /*did*/NULL); /*ignore*/
+	a->src_inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime;
+
+	/* remove whiteout for dentry */
+	if (a->dst_wh_dentry) {
+		a->h_path.dentry = a->dst_wh_dentry;
+		err = au_wh_unlink_dentry(a->dst_h_dir, &a->h_path,
+					  a->dst_dentry);
+		if (unlikely(err))
+			goto out_diropq;
+	}
+
+	/* remove whtmp */
+	if (a->thargs)
+		au_ren_del_whtmp(a); /* ignore this error */
+
+	au_fhsm_wrote(a->src_dentry->d_sb, a->btgt, /*force*/0);
+	err = 0;
+	goto out_success;
+
+out_diropq:
+	if (au_ftest_ren(a->flags, DIROPQ))
+		au_ren_rev_diropq(err, a);
+out_rename:
+	au_ren_rev_rename(err, a);
+	dput(a->h_dst);
+out_whtmp:
+	if (a->thargs)
+		au_ren_rev_whtmp(err, a);
+out_whdst:
+	dput(a->dst_wh_dentry);
+	a->dst_wh_dentry = NULL;
+out_whsrc:
+	if (a->src_wh_dentry)
+		au_ren_rev_whsrc(err, a);
+out_success:
+	dput(a->src_wh_dentry);
+	dput(a->dst_wh_dentry);
+out_thargs:
+	if (a->thargs) {
+		dput(a->h_dst);
+		au_whtmp_rmdir_free(a->thargs);
+		a->thargs = NULL;
+	}
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * test if the @dentry dir can be a rename destination or not.
+ * success means it is a logically empty dir.
+ */
+static int may_rename_dstdir(struct dentry *dentry, struct au_nhash *whlist)
+{
+	return au_test_empty(dentry, whlist);
+}
+
+/*
+ * test if the @dentry dir can be a rename source or not.
+ * if it can, return 0.
+ * success means,
+ * - it is a logically empty dir,
+ * - or, it exists on the writable branch and has no children, including
+ *       whiteouts, on the lower branch.
+ */
+static int may_rename_srcdir(struct dentry *dentry, aufs_bindex_t btgt)
+{
+	int err;
+	unsigned int rdhash;
+	aufs_bindex_t btop;
+
+	btop = au_dbtop(dentry);
+	if (btop != btgt) {
+		struct au_nhash whlist;
+
+		SiMustAnyLock(dentry->d_sb);
+		rdhash = au_sbi(dentry->d_sb)->si_rdhash;
+		if (!rdhash)
+			rdhash = au_rdhash_est(au_dir_size(/*file*/NULL,
+							   dentry));
+		err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS);
+		if (unlikely(err))
+			goto out;
+		err = au_test_empty(dentry, &whlist);
+		au_nhash_wh_free(&whlist);
+		goto out;
+	}
+
+	if (btop == au_dbtaildir(dentry))
+		return 0; /* success */
+
+	err = au_test_empty_lower(dentry);
+
+out:
+	if (err == -ENOTEMPTY) {
+		AuWarn1("renaming a dir which has child(ren) on multiple"
+			" branches is not supported\n");
+		err = -EXDEV;
+	}
+	return err;
+}
+
+/* side effect: sets whlist and h_dentry */
+static int au_ren_may_dir(struct au_ren_args *a)
+{
+	int err;
+	unsigned int rdhash;
+	struct dentry *d;
+
+	d = a->dst_dentry;
+	SiMustAnyLock(d->d_sb);
+
+	err = 0;
+	if (au_ftest_ren(a->flags, ISDIR) && a->dst_inode) {
+		rdhash = au_sbi(d->d_sb)->si_rdhash;
+		if (!rdhash)
+			rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, d));
+		err = au_nhash_alloc(&a->whlist, rdhash, GFP_NOFS);
+		if (unlikely(err))
+			goto out;
+
+		au_set_dbtop(d, a->dst_btop);
+		err = may_rename_dstdir(d, &a->whlist);
+		au_set_dbtop(d, a->btgt);
+	}
+	a->dst_h_dentry = au_h_dptr(d, au_dbtop(d));
+	if (unlikely(err))
+		goto out;
+
+	d = a->src_dentry;
+	a->src_h_dentry = au_h_dptr(d, au_dbtop(d));
+	if (au_ftest_ren(a->flags, ISDIR)) {
+		err = may_rename_srcdir(d, a->btgt);
+		if (unlikely(err)) {
+			au_nhash_wh_free(&a->whlist);
+			a->whlist.nh_num = 0;
+		}
+	}
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * simple tests for rename.
+ * following the checks in vfs, plus the parent-child relationship.
+ */
+static int au_may_ren(struct au_ren_args *a)
+{
+	int err, isdir;
+	struct inode *h_inode;
+
+	if (a->src_btop == a->btgt) {
+		err = au_may_del(a->src_dentry, a->btgt, a->src_h_parent,
+				 au_ftest_ren(a->flags, ISDIR));
+		if (unlikely(err))
+			goto out;
+		err = -EINVAL;
+		if (unlikely(a->src_h_dentry == a->h_trap))
+			goto out;
+	}
+
+	err = 0;
+	if (a->dst_btop != a->btgt)
+		goto out;
+
+	err = -ENOTEMPTY;
+	if (unlikely(a->dst_h_dentry == a->h_trap))
+		goto out;
+
+	err = -EIO;
+	isdir = !!au_ftest_ren(a->flags, ISDIR);
+	if (d_really_is_negative(a->dst_dentry)) {
+		if (d_is_negative(a->dst_h_dentry))
+			err = au_may_add(a->dst_dentry, a->btgt,
+					 a->dst_h_parent, isdir);
+	} else {
+		if (unlikely(d_is_negative(a->dst_h_dentry)))
+			goto out;
+		h_inode = d_inode(a->dst_h_dentry);
+		if (h_inode->i_nlink)
+			err = au_may_del(a->dst_dentry, a->btgt,
+					 a->dst_h_parent, isdir);
+	}
+
+out:
+	if (unlikely(err == -ENOENT || err == -EEXIST))
+		err = -EIO;
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * locking order
+ * (VFS)
+ * - src_dir and dir by lock_rename()
+ * - inode if it exists
+ * (aufs)
+ * - lock all
+ *   + src_dentry and dentry by aufs_read_and_write_lock2() which calls,
+ *     + si_read_lock
+ *     + di_write_lock2_child()
+ *       + di_write_lock_child()
+ *	   + ii_write_lock_child()
+ *       + di_write_lock_child2()
+ *	   + ii_write_lock_child2()
+ *     + src_parent and parent
+ *       + di_write_lock_parent()
+ *	   + ii_write_lock_parent()
+ *       + di_write_lock_parent2()
+ *	   + ii_write_lock_parent2()
+ *   + lower src_dir and dir by vfsub_lock_rename()
+ *   + verify every relationship between child and parent. if any
+ *     of them fails, unlock all and return -EBUSY.
+ */
+static void au_ren_unlock(struct au_ren_args *a)
+{
+	vfsub_unlock_rename(a->src_h_parent, a->src_hdir,
+			    a->dst_h_parent, a->dst_hdir);
+	if (au_ftest_ren(a->flags, MNT_WRITE))
+		vfsub_mnt_drop_write(au_br_mnt(a->br));
+}
+
+static int au_ren_lock(struct au_ren_args *a)
+{
+	int err;
+	unsigned int udba;
+
+	err = 0;
+	a->src_h_parent = au_h_dptr(a->src_parent, a->btgt);
+	a->src_hdir = au_hi(a->src_dir, a->btgt);
+	a->dst_h_parent = au_h_dptr(a->dst_parent, a->btgt);
+	a->dst_hdir = au_hi(a->dst_dir, a->btgt);
+
+	err = vfsub_mnt_want_write(au_br_mnt(a->br));
+	if (unlikely(err))
+		goto out;
+	au_fset_ren(a->flags, MNT_WRITE);
+	a->h_trap = vfsub_lock_rename(a->src_h_parent, a->src_hdir,
+				      a->dst_h_parent, a->dst_hdir);
+	udba = au_opt_udba(a->src_dentry->d_sb);
+	if (unlikely(a->src_hdir->hi_inode != d_inode(a->src_h_parent)
+		     || a->dst_hdir->hi_inode != d_inode(a->dst_h_parent)))
+		err = au_busy_or_stale();
+	if (!err && au_dbtop(a->src_dentry) == a->btgt)
+		err = au_h_verify(a->src_h_dentry, udba,
+				  d_inode(a->src_h_parent), a->src_h_parent,
+				  a->br);
+	if (!err && au_dbtop(a->dst_dentry) == a->btgt)
+		err = au_h_verify(a->dst_h_dentry, udba,
+				  d_inode(a->dst_h_parent), a->dst_h_parent,
+				  a->br);
+	if (!err)
+		goto out; /* success */
+
+	err = au_busy_or_stale();
+	au_ren_unlock(a);
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
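+/*
+ * update the version, nlink and timestamps of both parent dirs after the
+ * rename (and, when a dir was renamed, refresh the moved dir itself too).
+ */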
+static void au_ren_refresh_dir(struct au_ren_args *a)
+{
+	struct inode *dir;
+
+	dir = a->dst_dir;
+	dir->i_version++;
+	if (au_ftest_ren(a->flags, ISDIR)) {
+		/* is this update defined in POSIX? */
+		au_cpup_attr_timesizes(a->src_inode);
+		au_cpup_attr_nlink(dir, /*force*/1);
+	}
+
+	au_dir_ts(dir, a->btgt);
+
+	if (au_ftest_ren(a->flags, ISSAMEDIR))
+		return;
+
+	dir = a->src_dir;
+	dir->i_version++;
+	if (au_ftest_ren(a->flags, ISDIR))
+		au_cpup_attr_nlink(dir, /*force*/1);
+	au_dir_ts(dir, a->btgt);
+}
+
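+/*
+ * refresh the renamed entry after a successful rename: drop the stale lower
+ * dentries and inodes of both src and dst, and update their top/bottom
+ * branch indices.
+ */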
+static void au_ren_refresh(struct au_ren_args *a)
+{
+	aufs_bindex_t bbot, bindex;
+	struct dentry *d, *h_d;
+	struct inode *i, *h_i;
+	struct super_block *sb;
+
+	d = a->dst_dentry;
+	d_drop(d);
+	if (a->h_dst)
+		/* already dget-ed by au_ren_or_cpup() */
+		au_set_h_dptr(d, a->btgt, a->h_dst);
+
+	i = a->dst_inode;
+	if (i) {
+		if (!au_ftest_ren(a->flags, ISDIR))
+			vfsub_drop_nlink(i);
+		else {
+			vfsub_dead_dir(i);
+			au_cpup_attr_timesizes(i);
+		}
+		au_update_dbrange(d, /*do_put_zero*/1);
+	} else {
+		bbot = a->btgt;
+		for (bindex = au_dbtop(d); bindex < bbot; bindex++)
+			au_set_h_dptr(d, bindex, NULL);
+		bbot = au_dbbot(d);
+		for (bindex = a->btgt + 1; bindex <= bbot; bindex++)
+			au_set_h_dptr(d, bindex, NULL);
+		au_update_dbrange(d, /*do_put_zero*/0);
+	}
+
+	d = a->src_dentry;
+	au_set_dbwh(d, -1);
+	bbot = au_dbbot(d);
+	for (bindex = a->btgt + 1; bindex <= bbot; bindex++) {
+		h_d = au_h_dptr(d, bindex);
+		if (h_d)
+			au_set_h_dptr(d, bindex, NULL);
+	}
+	au_set_dbbot(d, a->btgt);
+
+	sb = d->d_sb;
+	i = a->src_inode;
+	if (au_opt_test(au_mntflags(sb), PLINK) && au_plink_test(i))
+		return; /* success */
+
+	bbot = au_ibbot(i);
+	for (bindex = a->btgt + 1; bindex <= bbot; bindex++) {
+		h_i = au_h_iptr(i, bindex);
+		if (h_i) {
+			au_xino_write(sb, bindex, h_i->i_ino, /*ino*/0);
+			/* ignore this error */
+			au_set_h_iptr(i, bindex, NULL, 0);
+		}
+	}
+	au_set_ibbot(i, a->btgt);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* mainly for link(2) and rename(2) */
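+/*
+ * returns @btgt as-is when the branch is writable and the entry is not
+ * hidden by its own whiteout or by the parent's diropq on an upper branch;
+ * otherwise returns -1.
+ */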
+int au_wbr(struct dentry *dentry, aufs_bindex_t btgt)
+{
+	aufs_bindex_t bdiropq, bwh;
+	struct dentry *parent;
+	struct au_branch *br;
+
+	parent = dentry->d_parent;
+	IMustLock(d_inode(parent)); /* dir is locked */
+
+	bdiropq = au_dbdiropq(parent);
+	bwh = au_dbwh(dentry);
+	br = au_sbr(dentry->d_sb, btgt);
+	if (au_br_rdonly(br)
+	    || (0 <= bdiropq && bdiropq < btgt)
+	    || (0 <= bwh && bwh < btgt))
+		btgt = -1;
+
+	AuDbg("btgt %d\n", btgt);
+	return btgt;
+}
+
+/* sets src_btop, dst_btop and btgt */
+static int au_ren_wbr(struct au_ren_args *a)
+{
+	int err;
+	struct au_wr_dir_args wr_dir_args = {
+		/* .force_btgt	= -1, */
+		.flags		= AuWrDir_ADD_ENTRY
+	};
+
+	a->src_btop = au_dbtop(a->src_dentry);
+	a->dst_btop = au_dbtop(a->dst_dentry);
+	if (au_ftest_ren(a->flags, ISDIR))
+		au_fset_wrdir(wr_dir_args.flags, ISDIR);
+	wr_dir_args.force_btgt = a->src_btop;
+	if (a->dst_inode && a->dst_btop < a->src_btop)
+		wr_dir_args.force_btgt = a->dst_btop;
+	wr_dir_args.force_btgt = au_wbr(a->dst_dentry, wr_dir_args.force_btgt);
+	err = au_wr_dir(a->dst_dentry, a->src_dentry, &wr_dir_args);
+	a->btgt = err;
+
+	return err;
+}
+
+static void au_ren_dt(struct au_ren_args *a)
+{
+	a->h_path.dentry = a->src_h_parent;
+	au_dtime_store(a->src_dt + AuPARENT, a->src_parent, &a->h_path);
+	if (!au_ftest_ren(a->flags, ISSAMEDIR)) {
+		a->h_path.dentry = a->dst_h_parent;
+		au_dtime_store(a->dst_dt + AuPARENT, a->dst_parent, &a->h_path);
+	}
+
+	au_fclr_ren(a->flags, DT_DSTDIR);
+	if (!au_ftest_ren(a->flags, ISDIR))
+		return;
+
+	a->h_path.dentry = a->src_h_dentry;
+	au_dtime_store(a->src_dt + AuCHILD, a->src_dentry, &a->h_path);
+	if (d_is_positive(a->dst_h_dentry)) {
+		au_fset_ren(a->flags, DT_DSTDIR);
+		a->h_path.dentry = a->dst_h_dentry;
+		au_dtime_store(a->dst_dt + AuCHILD, a->dst_dentry, &a->h_path);
+	}
+}
+
+static void au_ren_rev_dt(int err, struct au_ren_args *a)
+{
+	struct dentry *h_d;
+	struct inode *h_inode;
+
+	au_dtime_revert(a->src_dt + AuPARENT);
+	if (!au_ftest_ren(a->flags, ISSAMEDIR))
+		au_dtime_revert(a->dst_dt + AuPARENT);
+
+	if (au_ftest_ren(a->flags, ISDIR) && err != -EIO) {
+		h_d = a->src_dt[AuCHILD].dt_h_path.dentry;
+		h_inode = d_inode(h_d);
+		inode_lock_nested(h_inode, AuLsc_I_CHILD);
+		au_dtime_revert(a->src_dt + AuCHILD);
+		inode_unlock(h_inode);
+
+		if (au_ftest_ren(a->flags, DT_DSTDIR)) {
+			h_d = a->dst_dt[AuCHILD].dt_h_path.dentry;
+			h_inode = d_inode(h_d);
+			inode_lock_nested(h_inode, AuLsc_I_CHILD);
+			au_dtime_revert(a->dst_dt + AuCHILD);
+			inode_unlock(h_inode);
+		}
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
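+/*
+ * the rename(2) entry point for aufs.
+ * the overall flow: decide the target branch (au_ren_wbr), test whether the
+ * dirs may be renamed (au_ren_may_dir), copy-up the parent dirs and src when
+ * they live on another branch, lock everything (au_ren_lock), perform
+ * do_rename(), and finally refresh the parent dirs and the renamed entry.
+ */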
+int aufs_rename(struct inode *_src_dir, struct dentry *_src_dentry,
+		struct inode *_dst_dir, struct dentry *_dst_dentry)
+{
+	int err, flags;
+	/* reduce stack space */
+	struct au_ren_args *a;
+
+	AuDbg("%pd, %pd\n", _src_dentry, _dst_dentry);
+	IMustLock(_src_dir);
+	IMustLock(_dst_dir);
+
+	err = -ENOMEM;
+	BUILD_BUG_ON(sizeof(*a) > PAGE_SIZE);
+	a = kzalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	a->src_dir = _src_dir;
+	a->src_dentry = _src_dentry;
+	a->src_inode = NULL;
+	if (d_really_is_positive(a->src_dentry))
+		a->src_inode = d_inode(a->src_dentry);
+	a->src_parent = a->src_dentry->d_parent; /* dir inode is locked */
+	a->dst_dir = _dst_dir;
+	a->dst_dentry = _dst_dentry;
+	a->dst_inode = NULL;
+	if (d_really_is_positive(a->dst_dentry))
+		a->dst_inode = d_inode(a->dst_dentry);
+	a->dst_parent = a->dst_dentry->d_parent; /* dir inode is locked */
+	if (a->dst_inode) {
+		IMustLock(a->dst_inode);
+		au_igrab(a->dst_inode);
+	}
+
+	err = -ENOTDIR;
+	flags = AuLock_FLUSH | AuLock_NOPLM | AuLock_GEN;
+	if (d_is_dir(a->src_dentry)) {
+		au_fset_ren(a->flags, ISDIR);
+		if (unlikely(d_really_is_positive(a->dst_dentry)
+			     && !d_is_dir(a->dst_dentry)))
+			goto out_free;
+		flags |= AuLock_DIRS;
+	}
+	err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, flags);
+	if (unlikely(err))
+		goto out_free;
+
+	err = au_d_hashed_positive(a->src_dentry);
+	if (unlikely(err))
+		goto out_unlock;
+	err = -ENOENT;
+	if (a->dst_inode) {
+		/*
+		 * If it is a dir, the VFS unhashes dst_dentry before calling
+		 * this function, so we cannot rely upon d_unhashed().
+		 */
+		if (unlikely(!a->dst_inode->i_nlink))
+			goto out_unlock;
+		if (!S_ISDIR(a->dst_inode->i_mode)) {
+			err = au_d_hashed_positive(a->dst_dentry);
+			if (unlikely(err))
+				goto out_unlock;
+		} else if (unlikely(IS_DEADDIR(a->dst_inode)))
+			goto out_unlock;
+	} else if (unlikely(d_unhashed(a->dst_dentry)))
+		goto out_unlock;
+
+	/*
+	 * is it possible?
+	 * yes, it happened (in linux-3.3-rcN) but I don't know why.
+	 * there may exist a problem somewhere else.
+	 */
+	err = -EINVAL;
+	if (unlikely(d_inode(a->dst_parent) == d_inode(a->src_dentry)))
+		goto out_unlock;
+
+	au_fset_ren(a->flags, ISSAMEDIR); /* temporary */
+	di_write_lock_parent(a->dst_parent);
+
+	/* which branch we process */
+	err = au_ren_wbr(a);
+	if (unlikely(err < 0))
+		goto out_parent;
+	a->br = au_sbr(a->dst_dentry->d_sb, a->btgt);
+	a->h_path.mnt = au_br_mnt(a->br);
+
+	/* are they available to be renamed */
+	err = au_ren_may_dir(a);
+	if (unlikely(err))
+		goto out_children;
+
+	/* prepare the writable parent dir on the same branch */
+	if (a->dst_btop == a->btgt) {
+		au_fset_ren(a->flags, WHDST);
+	} else {
+		err = au_cpup_dirs(a->dst_dentry, a->btgt);
+		if (unlikely(err))
+			goto out_children;
+	}
+
+	if (a->src_dir != a->dst_dir) {
+		/*
+		 * this temporary unlock is safe,
+		 * because the i_mutex of both dirs is held.
+		 */
+		di_write_unlock(a->dst_parent);
+		di_write_lock_parent(a->src_parent);
+		err = au_wr_dir_need_wh(a->src_dentry,
+					au_ftest_ren(a->flags, ISDIR),
+					&a->btgt);
+		di_write_unlock(a->src_parent);
+		di_write_lock2_parent(a->src_parent, a->dst_parent, /*isdir*/1);
+		au_fclr_ren(a->flags, ISSAMEDIR);
+	} else
+		err = au_wr_dir_need_wh(a->src_dentry,
+					au_ftest_ren(a->flags, ISDIR),
+					&a->btgt);
+	if (unlikely(err < 0))
+		goto out_children;
+	if (err)
+		au_fset_ren(a->flags, WHSRC);
+
+	/* cpup src */
+	if (a->src_btop != a->btgt) {
+		struct au_pin pin;
+
+		err = au_pin(&pin, a->src_dentry, a->btgt,
+			     au_opt_udba(a->src_dentry->d_sb),
+			     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
+		if (!err) {
+			struct au_cp_generic cpg = {
+				.dentry	= a->src_dentry,
+				.bdst	= a->btgt,
+				.bsrc	= a->src_btop,
+				.len	= -1,
+				.pin	= &pin,
+				.flags	= AuCpup_DTIME | AuCpup_HOPEN
+			};
+			AuDebugOn(au_dbtop(a->src_dentry) != a->src_btop);
+			err = au_sio_cpup_simple(&cpg);
+			au_unpin(&pin);
+		}
+		if (unlikely(err))
+			goto out_children;
+		a->src_btop = a->btgt;
+		a->src_h_dentry = au_h_dptr(a->src_dentry, a->btgt);
+		au_fset_ren(a->flags, WHSRC);
+	}
+
+	/* lock them all */
+	err = au_ren_lock(a);
+	if (unlikely(err))
+		/* leave the copied-up one */
+		goto out_children;
+
+	if (!au_opt_test(au_mntflags(a->dst_dir->i_sb), UDBA_NONE))
+		err = au_may_ren(a);
+	else if (unlikely(a->dst_dentry->d_name.len > AUFS_MAX_NAMELEN))
+		err = -ENAMETOOLONG;
+	if (unlikely(err))
+		goto out_hdir;
+
+	/* store timestamps to be revertible */
+	au_ren_dt(a);
+
+	/* here we go */
+	err = do_rename(a);
+	if (unlikely(err))
+		goto out_dt;
+
+	/* update dir attributes */
+	au_ren_refresh_dir(a);
+
+	/* dput/iput all lower dentries */
+	au_ren_refresh(a);
+
+	goto out_hdir; /* success */
+
+out_dt:
+	au_ren_rev_dt(err, a);
+out_hdir:
+	au_ren_unlock(a);
+out_children:
+	au_nhash_wh_free(&a->whlist);
+	if (err && a->dst_inode && a->dst_btop != a->btgt) {
+		AuDbg("btop %d, btgt %d\n", a->dst_btop, a->btgt);
+		au_set_h_dptr(a->dst_dentry, a->btgt, NULL);
+		au_set_dbtop(a->dst_dentry, a->dst_btop);
+	}
+out_parent:
+	if (!err)
+		d_move(a->src_dentry, a->dst_dentry);
+	else {
+		au_update_dbtop(a->dst_dentry);
+		if (!a->dst_inode)
+			d_drop(a->dst_dentry);
+	}
+	if (au_ftest_ren(a->flags, ISSAMEDIR))
+		di_write_unlock(a->dst_parent);
+	else
+		di_write_unlock2(a->src_parent, a->dst_parent);
+out_unlock:
+	aufs_read_and_write_unlock2(a->dst_dentry, a->src_dentry);
+out_free:
+	iput(a->dst_inode);
+	if (a->thargs)
+		au_whtmp_rmdir_free(a->thargs);
+	au_delayed_kfree(a);
+out:
+	AuTraceErr(err);
+	return err;
+}
diff --git b/fs/aufs/iinfo.c b/fs/aufs/iinfo.c
new file mode 100644
index 0000000..975cff3
--- /dev/null
+++ b/fs/aufs/iinfo.c
@@ -0,0 +1,272 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode private data
+ */
+
+#include "aufs.h"
+
+struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex)
+{
+	struct inode *h_inode;
+	struct au_hinode *hinode;
+
+	IiMustAnyLock(inode);
+
+	hinode = au_hinode(au_ii(inode), bindex);
+	h_inode = hinode->hi_inode;
+	AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0);
+	return h_inode;
+}
+
+/* todo: hard/soft set? */
+void au_hiput(struct au_hinode *hinode)
+{
+	au_hn_free(hinode);
+	dput(hinode->hi_whdentry);
+	iput(hinode->hi_inode);
+}
+
+unsigned int au_hi_flags(struct inode *inode, int isdir)
+{
+	unsigned int flags;
+	const unsigned int mnt_flags = au_mntflags(inode->i_sb);
+
+	flags = 0;
+	if (au_opt_test(mnt_flags, XINO))
+		au_fset_hi(flags, XINO);
+	if (isdir && au_opt_test(mnt_flags, UDBA_HNOTIFY))
+		au_fset_hi(flags, HNOTIFY);
+	return flags;
+}
+
+void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex,
+		   struct inode *h_inode, unsigned int flags)
+{
+	struct au_hinode *hinode;
+	struct inode *hi;
+	struct au_iinfo *iinfo = au_ii(inode);
+
+	IiMustWriteLock(inode);
+
+	hinode = au_hinode(iinfo, bindex);
+	hi = hinode->hi_inode;
+	AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0);
+
+	if (hi)
+		au_hiput(hinode);
+	hinode->hi_inode = h_inode;
+	if (h_inode) {
+		int err;
+		struct super_block *sb = inode->i_sb;
+		struct au_branch *br;
+
+		AuDebugOn(inode->i_mode
+			  && (h_inode->i_mode & S_IFMT)
+			  != (inode->i_mode & S_IFMT));
+		if (bindex == iinfo->ii_btop)
+			au_cpup_igen(inode, h_inode);
+		br = au_sbr(sb, bindex);
+		hinode->hi_id = br->br_id;
+		if (au_ftest_hi(flags, XINO)) {
+			err = au_xino_write(sb, bindex, h_inode->i_ino,
+					    inode->i_ino);
+			if (unlikely(err))
+				AuIOErr1("failed au_xino_write() %d\n", err);
+		}
+
+		if (au_ftest_hi(flags, HNOTIFY)
+		    && au_br_hnotifyable(br->br_perm)) {
+			err = au_hn_alloc(hinode, inode);
+			if (unlikely(err))
+				AuIOErr1("au_hn_alloc() %d\n", err);
+		}
+	}
+}
+
+void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex,
+		  struct dentry *h_wh)
+{
+	struct au_hinode *hinode;
+
+	IiMustWriteLock(inode);
+
+	hinode = au_hinode(au_ii(inode), bindex);
+	AuDebugOn(hinode->hi_whdentry);
+	hinode->hi_whdentry = h_wh;
+}
+
+void au_update_iigen(struct inode *inode, int half)
+{
+	struct au_iinfo *iinfo;
+	struct au_iigen *iigen;
+	unsigned int sigen;
+
+	sigen = au_sigen(inode->i_sb);
+	iinfo = au_ii(inode);
+	iigen = &iinfo->ii_generation;
+	spin_lock(&iigen->ig_spin);
+	iigen->ig_generation = sigen;
+	if (half)
+		au_ig_fset(iigen->ig_flags, HALF_REFRESHED);
+	else
+		au_ig_fclr(iigen->ig_flags, HALF_REFRESHED);
+	spin_unlock(&iigen->ig_spin);
+}
+
+/* it may be called at remount time, too */
+void au_update_ibrange(struct inode *inode, int do_put_zero)
+{
+	struct au_iinfo *iinfo;
+	aufs_bindex_t bindex, bbot;
+
+	AuDebugOn(au_is_bad_inode(inode));
+	IiMustWriteLock(inode);
+
+	iinfo = au_ii(inode);
+	if (do_put_zero && iinfo->ii_btop >= 0) {
+		for (bindex = iinfo->ii_btop; bindex <= iinfo->ii_bbot;
+		     bindex++) {
+			struct inode *h_i;
+
+			h_i = au_hinode(iinfo, bindex)->hi_inode;
+			if (h_i
+			    && !h_i->i_nlink
+			    && !(h_i->i_state & I_LINKABLE))
+				au_set_h_iptr(inode, bindex, NULL, 0);
+		}
+	}
+
+	iinfo->ii_btop = -1;
+	iinfo->ii_bbot = -1;
+	bbot = au_sbbot(inode->i_sb);
+	for (bindex = 0; bindex <= bbot; bindex++)
+		if (au_hinode(iinfo, bindex)->hi_inode) {
+			iinfo->ii_btop = bindex;
+			break;
+		}
+	if (iinfo->ii_btop >= 0)
+		for (bindex = bbot; bindex >= iinfo->ii_btop; bindex--)
+			if (au_hinode(iinfo, bindex)->hi_inode) {
+				iinfo->ii_bbot = bindex;
+				break;
+			}
+	AuDebugOn(iinfo->ii_btop > iinfo->ii_bbot);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_icntnr_init_once(void *_c)
+{
+	struct au_icntnr *c = _c;
+	struct au_iinfo *iinfo = &c->iinfo;
+
+	spin_lock_init(&iinfo->ii_generation.ig_spin);
+	au_rw_init(&iinfo->ii_rwsem);
+	inode_init_once(&c->vfs_inode);
+}
+
+void au_hinode_init(struct au_hinode *hinode)
+{
+	hinode->hi_inode = NULL;
+	hinode->hi_id = -1;
+	au_hn_init(hinode);
+	hinode->hi_whdentry = NULL;
+}
+
+int au_iinfo_init(struct inode *inode)
+{
+	struct au_iinfo *iinfo;
+	struct super_block *sb;
+	struct au_hinode *hi;
+	int nbr, i;
+
+	sb = inode->i_sb;
+	iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo);
+	nbr = au_sbbot(sb) + 1;
+	if (unlikely(nbr <= 0))
+		nbr = 1;
+	hi = kmalloc_array(nbr, sizeof(*iinfo->ii_hinode), GFP_NOFS);
+	if (hi) {
+		au_ninodes_inc(sb);
+
+		iinfo->ii_hinode = hi;
+		for (i = 0; i < nbr; i++, hi++)
+			au_hinode_init(hi);
+
+		iinfo->ii_generation.ig_generation = au_sigen(sb);
+		iinfo->ii_btop = -1;
+		iinfo->ii_bbot = -1;
+		iinfo->ii_vdir = NULL;
+		return 0;
+	}
+	return -ENOMEM;
+}
+
+int au_hinode_realloc(struct au_iinfo *iinfo, int nbr, int may_shrink)
+{
+	int err, i;
+	struct au_hinode *hip;
+
+	AuRwMustWriteLock(&iinfo->ii_rwsem);
+
+	err = -ENOMEM;
+	hip = au_krealloc(iinfo->ii_hinode, sizeof(*hip) * nbr, GFP_NOFS,
+			  may_shrink);
+	if (hip) {
+		iinfo->ii_hinode = hip;
+		i = iinfo->ii_bbot + 1;
+		hip += i;
+		for (; i < nbr; i++, hip++)
+			au_hinode_init(hip);
+		err = 0;
+	}
+
+	return err;
+}
+
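+/*
+ * the counterpart of au_iinfo_init(): delete the XINO entry, free the vdir
+ * and put all the lower inodes when the aufs inode is being destroyed.
+ */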
+void au_iinfo_fin(struct inode *inode)
+{
+	struct au_iinfo *iinfo;
+	struct au_hinode *hi;
+	struct super_block *sb;
+	aufs_bindex_t bindex, bbot;
+	const unsigned char unlinked = !inode->i_nlink;
+
+	AuDebugOn(au_is_bad_inode(inode));
+
+	sb = inode->i_sb;
+	au_ninodes_dec(sb);
+	if (si_pid_test(sb))
+		au_xino_delete_inode(inode, unlinked);
+	else {
+		/*
+		 * it is safe to hide the dependency between sbinfo and
+		 * sb->s_umount.
+		 */
+		lockdep_off();
+		si_noflush_read_lock(sb);
+		au_xino_delete_inode(inode, unlinked);
+		si_read_unlock(sb);
+		lockdep_on();
+	}
+
+	iinfo = au_ii(inode);
+	if (iinfo->ii_vdir)
+		au_vdir_free(iinfo->ii_vdir, /*atonce*/0);
+
+	bindex = iinfo->ii_btop;
+	if (bindex >= 0) {
+		hi = au_hinode(iinfo, bindex);
+		bbot = iinfo->ii_bbot;
+		while (bindex++ <= bbot) {
+			if (hi->hi_inode)
+				au_hiput(hi);
+			hi++;
+		}
+	}
+	au_delayed_kfree(iinfo->ii_hinode);
+	AuRwDestroy(&iinfo->ii_rwsem);
+}
diff --git b/fs/aufs/inode.c b/fs/aufs/inode.c
new file mode 100644
index 0000000..8f7446d
--- /dev/null
+++ b/fs/aufs/inode.c
@@ -0,0 +1,506 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode functions
+ */
+
+#include "aufs.h"
+
+struct inode *au_igrab(struct inode *inode)
+{
+	if (inode) {
+		AuDebugOn(!atomic_read(&inode->i_count));
+		ihold(inode);
+	}
+	return inode;
+}
+
+static void au_refresh_hinode_attr(struct inode *inode, int do_version)
+{
+	au_cpup_attr_all(inode, /*force*/0);
+	au_update_iigen(inode, /*half*/1);
+	if (do_version)
+		inode->i_version++;
+}
+
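+/*
+ * re-index the lower inodes after the branch configuration changed: re-map
+ * each au_hinode to its current branch index by branch id, drop the entries
+ * whose branch has been removed (setting *update), and resize the array.
+ */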
+static int au_ii_refresh(struct inode *inode, int *update)
+{
+	int err, e, nbr;
+	umode_t type;
+	aufs_bindex_t bindex, new_bindex;
+	struct super_block *sb;
+	struct au_iinfo *iinfo;
+	struct au_hinode *p, *q, tmp;
+
+	AuDebugOn(au_is_bad_inode(inode));
+	IiMustWriteLock(inode);
+
+	*update = 0;
+	sb = inode->i_sb;
+	nbr = au_sbbot(sb) + 1;
+	type = inode->i_mode & S_IFMT;
+	iinfo = au_ii(inode);
+	err = au_hinode_realloc(iinfo, nbr, /*may_shrink*/0);
+	if (unlikely(err))
+		goto out;
+
+	AuDebugOn(iinfo->ii_btop < 0);
+	p = au_hinode(iinfo, iinfo->ii_btop);
+	for (bindex = iinfo->ii_btop; bindex <= iinfo->ii_bbot;
+	     bindex++, p++) {
+		if (!p->hi_inode)
+			continue;
+
+		AuDebugOn(type != (p->hi_inode->i_mode & S_IFMT));
+		new_bindex = au_br_index(sb, p->hi_id);
+		if (new_bindex == bindex)
+			continue;
+
+		if (new_bindex < 0) {
+			*update = 1;
+			au_hiput(p);
+			p->hi_inode = NULL;
+			continue;
+		}
+
+		if (new_bindex < iinfo->ii_btop)
+			iinfo->ii_btop = new_bindex;
+		if (iinfo->ii_bbot < new_bindex)
+			iinfo->ii_bbot = new_bindex;
+		/* swap the two lower inodes, and loop again */
+		q = au_hinode(iinfo, new_bindex);
+		tmp = *q;
+		*q = *p;
+		*p = tmp;
+		if (tmp.hi_inode) {
+			bindex--;
+			p--;
+		}
+	}
+	au_update_ibrange(inode, /*do_put_zero*/0);
+	au_hinode_realloc(iinfo, nbr, /*may_shrink*/1); /* harmless if err */
+	e = au_dy_irefresh(inode);
+	if (unlikely(e && !err))
+		err = e;
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+void au_refresh_iop(struct inode *inode, int force_getattr)
+{
+	int type;
+	struct au_sbinfo *sbi = au_sbi(inode->i_sb);
+	const struct inode_operations *iop
+		= force_getattr ? aufs_iop : sbi->si_iop_array;
+
+	if (inode->i_op == iop)
+		return;
+
+	switch (inode->i_mode & S_IFMT) {
+	case S_IFDIR:
+		type = AuIop_DIR;
+		break;
+	case S_IFLNK:
+		type = AuIop_SYMLINK;
+		break;
+	default:
+		type = AuIop_OTHER;
+		break;
+	}
+
+	inode->i_op = iop + type;
+	/* unnecessary smp_wmb() */
+}
+
+int au_refresh_hinode_self(struct inode *inode)
+{
+	int err, update;
+
+	err = au_ii_refresh(inode, &update);
+	if (!err)
+		au_refresh_hinode_attr(inode, update && S_ISDIR(inode->i_mode));
+
+	AuTraceErr(err);
+	return err;
+}
+
+int au_refresh_hinode(struct inode *inode, struct dentry *dentry)
+{
+	int err, e, update;
+	unsigned int flags;
+	umode_t mode;
+	aufs_bindex_t bindex, bbot;
+	unsigned char isdir;
+	struct au_hinode *p;
+	struct au_iinfo *iinfo;
+
+	err = au_ii_refresh(inode, &update);
+	if (unlikely(err))
+		goto out;
+
+	update = 0;
+	iinfo = au_ii(inode);
+	p = au_hinode(iinfo, iinfo->ii_btop);
+	mode = (inode->i_mode & S_IFMT);
+	isdir = S_ISDIR(mode);
+	flags = au_hi_flags(inode, isdir);
+	bbot = au_dbbot(dentry);
+	for (bindex = au_dbtop(dentry); bindex <= bbot; bindex++) {
+		struct inode *h_i, *h_inode;
+		struct dentry *h_d;
+
+		h_d = au_h_dptr(dentry, bindex);
+		if (!h_d || d_is_negative(h_d))
+			continue;
+
+		h_inode = d_inode(h_d);
+		AuDebugOn(mode != (h_inode->i_mode & S_IFMT));
+		if (iinfo->ii_btop <= bindex && bindex <= iinfo->ii_bbot) {
+			h_i = au_h_iptr(inode, bindex);
+			if (h_i) {
+				if (h_i == h_inode)
+					continue;
+				err = -EIO;
+				break;
+			}
+		}
+		if (bindex < iinfo->ii_btop)
+			iinfo->ii_btop = bindex;
+		if (iinfo->ii_bbot < bindex)
+			iinfo->ii_bbot = bindex;
+		au_set_h_iptr(inode, bindex, au_igrab(h_inode), flags);
+		update = 1;
+	}
+	au_update_ibrange(inode, /*do_put_zero*/0);
+	e = au_dy_irefresh(inode);
+	if (unlikely(e && !err))
+		err = e;
+	if (!err)
+		au_refresh_hinode_attr(inode, update && isdir);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
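+/*
+ * initialize a newly allocated aufs inode from @dentry: choose i_op/i_fop by
+ * the file type of the top lower inode, then attach every positive lower
+ * inode within [btop, btail] and copy up the attributes.
+ */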
+static int set_inode(struct inode *inode, struct dentry *dentry)
+{
+	int err;
+	unsigned int flags;
+	umode_t mode;
+	aufs_bindex_t bindex, btop, btail;
+	unsigned char isdir;
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+	struct au_iinfo *iinfo;
+	struct inode_operations *iop;
+
+	IiMustWriteLock(inode);
+
+	err = 0;
+	isdir = 0;
+	iop = au_sbi(inode->i_sb)->si_iop_array;
+	btop = au_dbtop(dentry);
+	h_dentry = au_h_dptr(dentry, btop);
+	h_inode = d_inode(h_dentry);
+	mode = h_inode->i_mode;
+	switch (mode & S_IFMT) {
+	case S_IFREG:
+		btail = au_dbtail(dentry);
+		inode->i_op = iop + AuIop_OTHER;
+		inode->i_fop = &aufs_file_fop;
+		err = au_dy_iaop(inode, btop, h_inode);
+		if (unlikely(err))
+			goto out;
+		break;
+	case S_IFDIR:
+		isdir = 1;
+		btail = au_dbtaildir(dentry);
+		inode->i_op = iop + AuIop_DIR;
+		inode->i_fop = &aufs_dir_fop;
+		break;
+	case S_IFLNK:
+		btail = au_dbtail(dentry);
+		inode->i_op = iop + AuIop_SYMLINK;
+		break;
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFIFO:
+	case S_IFSOCK:
+		btail = au_dbtail(dentry);
+		inode->i_op = iop + AuIop_OTHER;
+		init_special_inode(inode, mode, h_inode->i_rdev);
+		break;
+	default:
+		AuIOErr("Unknown file type 0%o\n", mode);
+		err = -EIO;
+		goto out;
+	}
+
+	/* do not set hnotify for whiteouted dirs (SHWH mode) */
+	flags = au_hi_flags(inode, isdir);
+	if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)
+	    && au_ftest_hi(flags, HNOTIFY)
+	    && dentry->d_name.len > AUFS_WH_PFX_LEN
+	    && !memcmp(dentry->d_name.name, AUFS_WH_PFX, AUFS_WH_PFX_LEN))
+		au_fclr_hi(flags, HNOTIFY);
+	iinfo = au_ii(inode);
+	iinfo->ii_btop = btop;
+	iinfo->ii_bbot = btail;
+	for (bindex = btop; bindex <= btail; bindex++) {
+		h_dentry = au_h_dptr(dentry, bindex);
+		if (h_dentry)
+			au_set_h_iptr(inode, bindex,
+				      au_igrab(d_inode(h_dentry)), flags);
+	}
+	au_cpup_attr_all(inode, /*force*/1);
+	/*
+	 * to force calling aufs_get_acl() every time,
+	 * do not call cache_no_acl() for aufs inode.
+	 */
+
+out:
+	return err;
+}
+
+/*
+ * a successful return leaves iinfo write-locked
+ * minus: errno
+ * zero: success, matched
+ * plus: no error, but unmatched
+ */
+static int reval_inode(struct inode *inode, struct dentry *dentry)
+{
+	int err;
+	unsigned int gen, igflags;
+	aufs_bindex_t bindex, bbot;
+	struct inode *h_inode, *h_dinode;
+	struct dentry *h_dentry;
+
+	/*
+	 * before this function is called, if aufs holds any iinfo lock, it
+	 * must be only one, for the parent dir.
+	 * this can happen with UDBA and an obsoleted inode number.
+	 */
+	err = -EIO;
+	if (unlikely(inode->i_ino == parent_ino(dentry)))
+		goto out;
+
+	err = 1;
+	ii_write_lock_new_child(inode);
+	h_dentry = au_h_dptr(dentry, au_dbtop(dentry));
+	h_dinode = d_inode(h_dentry);
+	bbot = au_ibbot(inode);
+	for (bindex = au_ibtop(inode); bindex <= bbot; bindex++) {
+		h_inode = au_h_iptr(inode, bindex);
+		if (!h_inode || h_inode != h_dinode)
+			continue;
+
+		err = 0;
+		gen = au_iigen(inode, &igflags);
+		if (gen == au_digen(dentry)
+		    && !au_ig_ftest(igflags, HALF_REFRESHED))
+			break;
+
+		/* fully refresh inode using dentry */
+		err = au_refresh_hinode(inode, dentry);
+		if (!err)
+			au_update_iigen(inode, /*half*/0);
+		break;
+	}
+
+	if (unlikely(err))
+		ii_write_unlock(inode);
+out:
+	return err;
+}
+
+int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
+	   unsigned int d_type, ino_t *ino)
+{
+	int err;
+	struct mutex *mtx;
+
+	/* prevent a race condition on hardlinked inode numbers */
+	mtx = NULL;
+	if (d_type != DT_DIR) {
+		mtx = &au_sbr(sb, bindex)->br_xino.xi_nondir_mtx;
+		mutex_lock(mtx);
+	}
+	err = au_xino_read(sb, bindex, h_ino, ino);
+	if (unlikely(err))
+		goto out;
+
+	if (!*ino) {
+		err = -EIO;
+		*ino = au_xino_new_ino(sb);
+		if (unlikely(!*ino))
+			goto out;
+		err = au_xino_write(sb, bindex, h_ino, *ino);
+		if (unlikely(err))
+			goto out;
+	}
+
+out:
+	if (mtx)
+		mutex_unlock(mtx);
+	return err;
+}
+
+/* a successful return leaves iinfo write-locked */
+/* todo: return unlocked instead? */
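+/*
+ * find or create the aufs inode for @dentry via the XINO table.
+ * if the inode found by the stored number turns out to be stale (the lower
+ * inode number was reused), reset the XINO entry and retry with a newly
+ * allocated inode number.
+ */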
+struct inode *au_new_inode(struct dentry *dentry, int must_new)
+{
+	struct inode *inode, *h_inode;
+	struct dentry *h_dentry;
+	struct super_block *sb;
+	struct mutex *mtx;
+	ino_t h_ino, ino;
+	int err;
+	aufs_bindex_t btop;
+
+	sb = dentry->d_sb;
+	btop = au_dbtop(dentry);
+	h_dentry = au_h_dptr(dentry, btop);
+	h_inode = d_inode(h_dentry);
+	h_ino = h_inode->i_ino;
+
+	/*
+	 * stop the race between hardlinks under different
+	 * parents.
+	 */
+	mtx = NULL;
+	if (!d_is_dir(h_dentry))
+		mtx = &au_sbr(sb, btop)->br_xino.xi_nondir_mtx;
+
+new_ino:
+	if (mtx)
+		mutex_lock(mtx);
+	err = au_xino_read(sb, btop, h_ino, &ino);
+	inode = ERR_PTR(err);
+	if (unlikely(err))
+		goto out;
+
+	if (!ino) {
+		ino = au_xino_new_ino(sb);
+		if (unlikely(!ino)) {
+			inode = ERR_PTR(-EIO);
+			goto out;
+		}
+	}
+
+	AuDbg("i%lu\n", (unsigned long)ino);
+	inode = au_iget_locked(sb, ino);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out;
+
+	AuDbg("%lx, new %d\n", inode->i_state, !!(inode->i_state & I_NEW));
+	if (inode->i_state & I_NEW) {
+		ii_write_lock_new_child(inode);
+		err = set_inode(inode, dentry);
+		if (!err) {
+			unlock_new_inode(inode);
+			goto out; /* success */
+		}
+
+		/*
+		 * iget_failed() calls iput(), but we need to call
+		 * ii_write_unlock() after iget_failed(), so this is a dirty
+		 * hack for i_count.
+		 */
+		atomic_inc(&inode->i_count);
+		iget_failed(inode);
+		ii_write_unlock(inode);
+		au_xino_write(sb, btop, h_ino, /*ino*/0);
+		/* ignore this error */
+		goto out_iput;
+	} else if (!must_new && !IS_DEADDIR(inode) && inode->i_nlink) {
+		/*
+		 * horrible race condition between lookup, readdir and copyup
+		 * (or something).
+		 */
+		if (mtx)
+			mutex_unlock(mtx);
+		err = reval_inode(inode, dentry);
+		if (unlikely(err < 0)) {
+			mtx = NULL;
+			goto out_iput;
+		}
+
+		if (!err) {
+			mtx = NULL;
+			goto out; /* success */
+		} else if (mtx)
+			mutex_lock(mtx);
+	}
+
+	if (unlikely(au_test_fs_unique_ino(h_inode)))
+		AuWarn1("Warning: Un-notified UDBA or repeatedly renamed dir,"
+			" b%d, %s, %pd, hi%lu, i%lu.\n",
+			btop, au_sbtype(h_dentry->d_sb), dentry,
+			(unsigned long)h_ino, (unsigned long)ino);
+	ino = 0;
+	err = au_xino_write(sb, btop, h_ino, /*ino*/0);
+	if (!err) {
+		iput(inode);
+		if (mtx)
+			mutex_unlock(mtx);
+		goto new_ino;
+	}
+
+out_iput:
+	iput(inode);
+	inode = ERR_PTR(err);
+out:
+	if (mtx)
+		mutex_unlock(mtx);
+	return inode;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_test_ro(struct super_block *sb, aufs_bindex_t bindex,
+	       struct inode *inode)
+{
+	int err;
+	struct inode *hi;
+
+	err = au_br_rdonly(au_sbr(sb, bindex));
+
+	/* a pseudo-link left after a flush may be out of bounds */
+	if (!err
+	    && inode
+	    && au_ibtop(inode) <= bindex
+	    && bindex <= au_ibbot(inode)) {
+		/*
+		 * the permission check is unnecessary since the vfsub routine
+		 * will be called later
+		 */
+		hi = au_h_iptr(inode, bindex);
+		if (hi)
+			err = IS_IMMUTABLE(hi) ? -EROFS : 0;
+	}
+
+	return err;
+}
+
+int au_test_h_perm(struct inode *h_inode, int mask)
+{
+	if (uid_eq(current_fsuid(), GLOBAL_ROOT_UID))
+		return 0;
+	return inode_permission(h_inode, mask);
+}
+
+int au_test_h_perm_sio(struct inode *h_inode, int mask)
+{
+	if (au_test_nfs(h_inode->i_sb)
+	    && (mask & MAY_WRITE)
+	    && S_ISDIR(h_inode->i_mode))
+		mask |= MAY_READ; /* force permission check */
+	return au_test_h_perm(h_inode, mask);
+}
diff --git b/fs/aufs/inode.h b/fs/aufs/inode.h
new file mode 100644
index 0000000..33d9f5e
--- /dev/null
+++ b/fs/aufs/inode.h
@@ -0,0 +1,687 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * inode operations
+ */
+
+#ifndef __AUFS_INODE_H__
+#define __AUFS_INODE_H__
+
+#ifdef __KERNEL__
+
+#include <linux/fsnotify.h>
+#include "rwsem.h"
+
+struct vfsmount;
+
+struct au_hnotify {
+#ifdef CONFIG_AUFS_HNOTIFY
+#ifdef CONFIG_AUFS_HFSNOTIFY
+	/* never use fsnotify_add_vfsmount_mark() */
+	struct fsnotify_mark		hn_mark;
+#endif
+	union {
+		struct inode		*hn_aufs_inode;	/* no get/put */
+		struct llist_node	hn_lnode;	/* delayed free */
+	};
+#endif
+} ____cacheline_aligned_in_smp;
+
+struct au_hinode {
+	struct inode		*hi_inode;
+	aufs_bindex_t		hi_id;
+#ifdef CONFIG_AUFS_HNOTIFY
+	struct au_hnotify	*hi_notify;
+#endif
+
+	/* reference to the copied-up whiteout with get/put */
+	struct dentry		*hi_whdentry;
+};
+
+/* ig_flags */
+#define AuIG_HALF_REFRESHED		1
+#define au_ig_ftest(flags, name)	((flags) & AuIG_##name)
+#define au_ig_fset(flags, name) \
+	do { (flags) |= AuIG_##name; } while (0)
+#define au_ig_fclr(flags, name) \
+	do { (flags) &= ~AuIG_##name; } while (0)
+
+struct au_iigen {
+	spinlock_t	ig_spin;
+	__u32		ig_generation, ig_flags;
+};
+
+struct au_vdir;
+struct au_iinfo {
+	struct au_iigen		ii_generation;
+	struct super_block	*ii_hsb1;	/* no get/put */
+
+	struct au_rwsem		ii_rwsem;
+	aufs_bindex_t		ii_btop, ii_bbot;
+	__u32			ii_higen;
+	struct au_hinode	*ii_hinode;
+	struct au_vdir		*ii_vdir;
+};
+
+struct au_icntnr {
+	struct au_iinfo iinfo;
+	struct inode vfs_inode;
+	union {
+		struct hlist_node	plink;
+		struct llist_node	lnode;	/* delayed free */
+	};
+} ____cacheline_aligned_in_smp;
+
+/* au_pin flags */
+#define AuPin_DI_LOCKED		1
+#define AuPin_MNT_WRITE		(1 << 1)
+#define au_ftest_pin(flags, name)	((flags) & AuPin_##name)
+#define au_fset_pin(flags, name) \
+	do { (flags) |= AuPin_##name; } while (0)
+#define au_fclr_pin(flags, name) \
+	do { (flags) &= ~AuPin_##name; } while (0)
+
+struct au_pin {
+	/* input */
+	struct dentry *dentry;
+	unsigned int udba;
+	unsigned char lsc_di, lsc_hi, flags;
+	aufs_bindex_t bindex;
+
+	/* output */
+	struct dentry *parent;
+	struct au_hinode *hdir;
+	struct vfsmount *h_mnt;
+
+	/* temporary unlock/relock for copyup */
+	struct dentry *h_dentry, *h_parent;
+	struct au_branch *br;
+	struct task_struct *task;
+};
+
+void au_pin_hdir_unlock(struct au_pin *p);
+int au_pin_hdir_lock(struct au_pin *p);
+int au_pin_hdir_relock(struct au_pin *p);
+void au_pin_hdir_acquire_nest(struct au_pin *p);
+void au_pin_hdir_release(struct au_pin *p);
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct au_iinfo *au_ii(struct inode *inode)
+{
+	BUG_ON(is_bad_inode(inode));
+	return &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* inode.c */
+struct inode *au_igrab(struct inode *inode);
+void au_refresh_iop(struct inode *inode, int force_getattr);
+int au_refresh_hinode_self(struct inode *inode);
+int au_refresh_hinode(struct inode *inode, struct dentry *dentry);
+int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
+	   unsigned int d_type, ino_t *ino);
+struct inode *au_new_inode(struct dentry *dentry, int must_new);
+int au_test_ro(struct super_block *sb, aufs_bindex_t bindex,
+	       struct inode *inode);
+int au_test_h_perm(struct inode *h_inode, int mask);
+int au_test_h_perm_sio(struct inode *h_inode, int mask);
+
+static inline int au_wh_ino(struct super_block *sb, aufs_bindex_t bindex,
+			    ino_t h_ino, unsigned int d_type, ino_t *ino)
+{
+#ifdef CONFIG_AUFS_SHWH
+	return au_ino(sb, bindex, h_ino, d_type, ino);
+#else
+	return 0;
+#endif
+}
+
+/* i_op.c */
+enum {
+	AuIop_SYMLINK,
+	AuIop_DIR,
+	AuIop_OTHER,
+	AuIop_Last
+};
+extern struct inode_operations aufs_iop[AuIop_Last],
+	aufs_iop_nogetattr[AuIop_Last];
+
+/* au_wr_dir flags */
+#define AuWrDir_ADD_ENTRY	1
+#define AuWrDir_ISDIR		(1 << 1)
+#define AuWrDir_TMPFILE		(1 << 2)
+#define au_ftest_wrdir(flags, name)	((flags) & AuWrDir_##name)
+#define au_fset_wrdir(flags, name) \
+	do { (flags) |= AuWrDir_##name; } while (0)
+#define au_fclr_wrdir(flags, name) \
+	do { (flags) &= ~AuWrDir_##name; } while (0)
+
+struct au_wr_dir_args {
+	aufs_bindex_t force_btgt;
+	unsigned char flags;
+};
+int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry,
+	      struct au_wr_dir_args *args);
+
+struct dentry *au_pinned_h_parent(struct au_pin *pin);
+void au_pin_init(struct au_pin *pin, struct dentry *dentry,
+		 aufs_bindex_t bindex, int lsc_di, int lsc_hi,
+		 unsigned int udba, unsigned char flags);
+int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex,
+	   unsigned int udba, unsigned char flags) __must_check;
+int au_do_pin(struct au_pin *pin) __must_check;
+void au_unpin(struct au_pin *pin);
+int au_reval_for_attr(struct dentry *dentry, unsigned int sigen);
+
+#define AuIcpup_DID_CPUP	1
+#define au_ftest_icpup(flags, name)	((flags) & AuIcpup_##name)
+#define au_fset_icpup(flags, name) \
+	do { (flags) |= AuIcpup_##name; } while (0)
+#define au_fclr_icpup(flags, name) \
+	do { (flags) &= ~AuIcpup_##name; } while (0)
+
+struct au_icpup_args {
+	unsigned char flags;
+	unsigned char pin_flags;
+	aufs_bindex_t btgt;
+	unsigned int udba;
+	struct au_pin pin;
+	struct path h_path;
+	struct inode *h_inode;
+};
+
+int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia,
+		     struct au_icpup_args *a);
+
+int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path);
+
+/* i_op_add.c */
+int au_may_add(struct dentry *dentry, aufs_bindex_t bindex,
+	       struct dentry *h_parent, int isdir);
+int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+	       dev_t dev);
+int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname);
+int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		bool want_excl);
+struct vfsub_aopen_args;
+int au_aopen_or_create(struct inode *dir, struct dentry *dentry,
+		       struct vfsub_aopen_args *args);
+int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode);
+int aufs_link(struct dentry *src_dentry, struct inode *dir,
+	      struct dentry *dentry);
+int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
+
+/* i_op_del.c */
+int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup);
+int au_may_del(struct dentry *dentry, aufs_bindex_t bindex,
+	       struct dentry *h_parent, int isdir);
+int aufs_unlink(struct inode *dir, struct dentry *dentry);
+int aufs_rmdir(struct inode *dir, struct dentry *dentry);
+
+/* i_op_ren.c */
+int au_wbr(struct dentry *dentry, aufs_bindex_t btgt);
+int aufs_rename(struct inode *src_dir, struct dentry *src_dentry,
+		struct inode *dir, struct dentry *dentry);
+
+/* iinfo.c */
+struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex);
+void au_hiput(struct au_hinode *hinode);
+void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex,
+		  struct dentry *h_wh);
+unsigned int au_hi_flags(struct inode *inode, int isdir);
+
+/* hinode flags */
+#define AuHi_XINO	1
+#define AuHi_HNOTIFY	(1 << 1)
+#define au_ftest_hi(flags, name)	((flags) & AuHi_##name)
+#define au_fset_hi(flags, name) \
+	do { (flags) |= AuHi_##name; } while (0)
+#define au_fclr_hi(flags, name) \
+	do { (flags) &= ~AuHi_##name; } while (0)
+
+#ifndef CONFIG_AUFS_HNOTIFY
+#undef AuHi_HNOTIFY
+#define AuHi_HNOTIFY	0
+#endif
+
+void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex,
+		   struct inode *h_inode, unsigned int flags);
+
+void au_update_iigen(struct inode *inode, int half);
+void au_update_ibrange(struct inode *inode, int do_put_zero);
+
+void au_icntnr_init_once(void *_c);
+void au_hinode_init(struct au_hinode *hinode);
+int au_iinfo_init(struct inode *inode);
+void au_iinfo_fin(struct inode *inode);
+int au_hinode_realloc(struct au_iinfo *iinfo, int nbr, int may_shrink);
+
+#ifdef CONFIG_PROC_FS
+/* plink.c */
+int au_plink_maint(struct super_block *sb, int flags);
+struct au_sbinfo;
+void au_plink_maint_leave(struct au_sbinfo *sbinfo);
+int au_plink_maint_enter(struct super_block *sb);
+#ifdef CONFIG_AUFS_DEBUG
+void au_plink_list(struct super_block *sb);
+#else
+AuStubVoid(au_plink_list, struct super_block *sb)
+#endif
+int au_plink_test(struct inode *inode);
+struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex);
+void au_plink_append(struct inode *inode, aufs_bindex_t bindex,
+		     struct dentry *h_dentry);
+void au_plink_put(struct super_block *sb, int verbose);
+void au_plink_clean(struct super_block *sb, int verbose);
+void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id);
+#else
+AuStubInt0(au_plink_maint, struct super_block *sb, int flags);
+AuStubVoid(au_plink_maint_leave, struct au_sbinfo *sbinfo);
+AuStubInt0(au_plink_maint_enter, struct super_block *sb);
+AuStubVoid(au_plink_list, struct super_block *sb);
+AuStubInt0(au_plink_test, struct inode *inode);
+AuStub(struct dentry *, au_plink_lkup, return NULL,
+       struct inode *inode, aufs_bindex_t bindex);
+AuStubVoid(au_plink_append, struct inode *inode, aufs_bindex_t bindex,
+	   struct dentry *h_dentry);
+AuStubVoid(au_plink_put, struct super_block *sb, int verbose);
+AuStubVoid(au_plink_clean, struct super_block *sb, int verbose);
+AuStubVoid(au_plink_half_refresh, struct super_block *sb, aufs_bindex_t br_id);
+#endif /* CONFIG_PROC_FS */
+
+#ifdef CONFIG_AUFS_XATTR
+/* xattr.c */
+int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags,
+		  unsigned int verbose);
+ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size);
+ssize_t aufs_getxattr(struct dentry *dentry, struct inode *inode,
+		      const char *name, void *value, size_t size);
+int aufs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
+		  const void *value, size_t size, int flags);
+int aufs_removexattr(struct dentry *dentry, const char *name);
+
+/* void au_xattr_init(struct super_block *sb); */
+#else
+AuStubInt0(au_cpup_xattr, struct dentry *h_dst, struct dentry *h_src,
+	   int ignore_flags, unsigned int verbose);
+/* AuStubVoid(au_xattr_init, struct super_block *sb); */
+#endif
+
+#ifdef CONFIG_FS_POSIX_ACL
+struct posix_acl *aufs_get_acl(struct inode *inode, int type);
+int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
+#endif
+
+#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL)
+enum {
+	AU_XATTR_SET,
+	AU_XATTR_REMOVE,
+	AU_ACL_SET
+};
+
+struct au_srxattr {
+	int type;
+	union {
+		struct {
+			const char	*name;
+			const void	*value;
+			size_t		size;
+			int		flags;
+		} set;
+		struct {
+			const char	*name;
+		} remove;
+		struct {
+			struct posix_acl *acl;
+			int		type;
+		} acl_set;
+	} u;
+};
+ssize_t au_srxattr(struct dentry *dentry, struct inode *inode,
+		   struct au_srxattr *arg);
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+/* lock subclass for iinfo */
+enum {
+	AuLsc_II_CHILD,		/* child first */
+	AuLsc_II_CHILD2,	/* rename(2), link(2), and cpup at hnotify */
+	AuLsc_II_CHILD3,	/* copyup dirs */
+	AuLsc_II_PARENT,	/* see AuLsc_I_PARENT in vfsub.h */
+	AuLsc_II_PARENT2,
+	AuLsc_II_PARENT3,	/* copyup dirs */
+	AuLsc_II_NEW_CHILD
+};
+
+/*
+ * ii_read_lock_child, ii_write_lock_child,
+ * ii_read_lock_child2, ii_write_lock_child2,
+ * ii_read_lock_child3, ii_write_lock_child3,
+ * ii_read_lock_parent, ii_write_lock_parent,
+ * ii_read_lock_parent2, ii_write_lock_parent2,
+ * ii_read_lock_parent3, ii_write_lock_parent3,
+ * ii_read_lock_new_child, ii_write_lock_new_child,
+ */
+#define AuReadLockFunc(name, lsc) \
+static inline void ii_read_lock_##name(struct inode *i) \
+{ \
+	au_rw_read_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \
+}
+
+#define AuWriteLockFunc(name, lsc) \
+static inline void ii_write_lock_##name(struct inode *i) \
+{ \
+	au_rw_write_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \
+}
+
+#define AuRWLockFuncs(name, lsc) \
+	AuReadLockFunc(name, lsc) \
+	AuWriteLockFunc(name, lsc)
+
+AuRWLockFuncs(child, CHILD);
+AuRWLockFuncs(child2, CHILD2);
+AuRWLockFuncs(child3, CHILD3);
+AuRWLockFuncs(parent, PARENT);
+AuRWLockFuncs(parent2, PARENT2);
+AuRWLockFuncs(parent3, PARENT3);
+AuRWLockFuncs(new_child, NEW_CHILD);
+
+#undef AuReadLockFunc
+#undef AuWriteLockFunc
+#undef AuRWLockFuncs
+
+/*
+ * ii_read_unlock, ii_write_unlock, ii_downgrade_lock
+ */
+AuSimpleUnlockRwsemFuncs(ii, struct inode *i, &au_ii(i)->ii_rwsem);
+
+#define IiMustNoWaiters(i)	AuRwMustNoWaiters(&au_ii(i)->ii_rwsem)
+#define IiMustAnyLock(i)	AuRwMustAnyLock(&au_ii(i)->ii_rwsem)
+#define IiMustWriteLock(i)	AuRwMustWriteLock(&au_ii(i)->ii_rwsem)
+
+/* ---------------------------------------------------------------------- */
+
+static inline void au_icntnr_init(struct au_icntnr *c)
+{
+#ifdef CONFIG_AUFS_DEBUG
+	c->vfs_inode.i_mode = 0;
+#endif
+}
+
+static inline unsigned int au_iigen(struct inode *inode, unsigned int *igflags)
+{
+	unsigned int gen;
+	struct au_iinfo *iinfo;
+	struct au_iigen *iigen;
+
+	iinfo = au_ii(inode);
+	iigen = &iinfo->ii_generation;
+	spin_lock(&iigen->ig_spin);
+	if (igflags)
+		*igflags = iigen->ig_flags;
+	gen = iigen->ig_generation;
+	spin_unlock(&iigen->ig_spin);
+
+	return gen;
+}
+
+/* tiny test for inode number */
+/* tmpfs generation is too rough */
+static inline int au_test_higen(struct inode *inode, struct inode *h_inode)
+{
+	struct au_iinfo *iinfo;
+
+	iinfo = au_ii(inode);
+	AuRwMustAnyLock(&iinfo->ii_rwsem);
+	return !(iinfo->ii_hsb1 == h_inode->i_sb
+		 && iinfo->ii_higen == h_inode->i_generation);
+}
+
+static inline void au_iigen_dec(struct inode *inode)
+{
+	struct au_iinfo *iinfo;
+	struct au_iigen *iigen;
+
+	iinfo = au_ii(inode);
+	iigen = &iinfo->ii_generation;
+	spin_lock(&iigen->ig_spin);
+	iigen->ig_generation--;
+	spin_unlock(&iigen->ig_spin);
+}
+
+static inline int au_iigen_test(struct inode *inode, unsigned int sigen)
+{
+	int err;
+
+	err = 0;
+	if (unlikely(inode && au_iigen(inode, NULL) != sigen))
+		err = -EIO;
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct au_hinode *au_hinode(struct au_iinfo *iinfo,
+					  aufs_bindex_t bindex)
+{
+	return iinfo->ii_hinode + bindex;
+}
+
+static inline int au_is_bad_inode(struct inode *inode)
+{
+	return !!(is_bad_inode(inode) || !au_hinode(au_ii(inode), 0));
+}
+
+static inline aufs_bindex_t au_ii_br_id(struct inode *inode,
+					aufs_bindex_t bindex)
+{
+	IiMustAnyLock(inode);
+	return au_hinode(au_ii(inode), bindex)->hi_id;
+}
+
+static inline aufs_bindex_t au_ibtop(struct inode *inode)
+{
+	IiMustAnyLock(inode);
+	return au_ii(inode)->ii_btop;
+}
+
+static inline aufs_bindex_t au_ibbot(struct inode *inode)
+{
+	IiMustAnyLock(inode);
+	return au_ii(inode)->ii_bbot;
+}
+
+static inline struct au_vdir *au_ivdir(struct inode *inode)
+{
+	IiMustAnyLock(inode);
+	return au_ii(inode)->ii_vdir;
+}
+
+static inline struct dentry *au_hi_wh(struct inode *inode, aufs_bindex_t bindex)
+{
+	IiMustAnyLock(inode);
+	return au_hinode(au_ii(inode), bindex)->hi_whdentry;
+}
+
+static inline void au_set_ibtop(struct inode *inode, aufs_bindex_t bindex)
+{
+	IiMustWriteLock(inode);
+	au_ii(inode)->ii_btop = bindex;
+}
+
+static inline void au_set_ibbot(struct inode *inode, aufs_bindex_t bindex)
+{
+	IiMustWriteLock(inode);
+	au_ii(inode)->ii_bbot = bindex;
+}
+
+static inline void au_set_ivdir(struct inode *inode, struct au_vdir *vdir)
+{
+	IiMustWriteLock(inode);
+	au_ii(inode)->ii_vdir = vdir;
+}
+
+static inline struct au_hinode *au_hi(struct inode *inode, aufs_bindex_t bindex)
+{
+	IiMustAnyLock(inode);
+	return au_hinode(au_ii(inode), bindex);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct dentry *au_pinned_parent(struct au_pin *pin)
+{
+	if (pin)
+		return pin->parent;
+	return NULL;
+}
+
+static inline struct inode *au_pinned_h_dir(struct au_pin *pin)
+{
+	if (pin && pin->hdir)
+		return pin->hdir->hi_inode;
+	return NULL;
+}
+
+static inline struct au_hinode *au_pinned_hdir(struct au_pin *pin)
+{
+	if (pin)
+		return pin->hdir;
+	return NULL;
+}
+
+static inline void au_pin_set_dentry(struct au_pin *pin, struct dentry *dentry)
+{
+	if (pin)
+		pin->dentry = dentry;
+}
+
+static inline void au_pin_set_parent_lflag(struct au_pin *pin,
+					   unsigned char lflag)
+{
+	if (pin) {
+		if (lflag)
+			au_fset_pin(pin->flags, DI_LOCKED);
+		else
+			au_fclr_pin(pin->flags, DI_LOCKED);
+	}
+}
+
+#if 0 /* reserved */
+static inline void au_pin_set_parent(struct au_pin *pin, struct dentry *parent)
+{
+	if (pin) {
+		dput(pin->parent);
+		pin->parent = dget(parent);
+	}
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+struct au_branch;
+#ifdef CONFIG_AUFS_HNOTIFY
+struct au_hnotify_op {
+	void (*ctl)(struct au_hinode *hinode, int do_set);
+	int (*alloc)(struct au_hinode *hinode);
+
+	/*
+	 * if it returns true, the caller should free hinode->hi_notify,
+	 * otherwise ->free() frees it.
+	 */
+	int (*free)(struct au_hinode *hinode,
+		    struct au_hnotify *hn) __must_check;
+
+	void (*fin)(void);
+	int (*init)(void);
+
+	int (*reset_br)(unsigned int udba, struct au_branch *br, int perm);
+	void (*fin_br)(struct au_branch *br);
+	int (*init_br)(struct au_branch *br, int perm);
+};
+
+/* hnotify.c */
+int au_hn_alloc(struct au_hinode *hinode, struct inode *inode);
+void au_hn_free(struct au_hinode *hinode);
+void au_hn_ctl(struct au_hinode *hinode, int do_set);
+void au_hn_reset(struct inode *inode, unsigned int flags);
+int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask,
+	       struct qstr *h_child_qstr, struct inode *h_child_inode);
+int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm);
+int au_hnotify_init_br(struct au_branch *br, int perm);
+void au_hnotify_fin_br(struct au_branch *br);
+int __init au_hnotify_init(void);
+void au_hnotify_fin(void);
+
+/* hfsnotify.c */
+extern const struct au_hnotify_op au_hnotify_op;
+
+static inline
+void au_hn_init(struct au_hinode *hinode)
+{
+	hinode->hi_notify = NULL;
+}
+
+static inline struct au_hnotify *au_hn(struct au_hinode *hinode)
+{
+	return hinode->hi_notify;
+}
+
+#else
+AuStub(int, au_hn_alloc, return -EOPNOTSUPP,
+       struct au_hinode *hinode __maybe_unused,
+       struct inode *inode __maybe_unused)
+AuStub(struct au_hnotify *, au_hn, return NULL, struct au_hinode *hinode)
+AuStubVoid(au_hn_free, struct au_hinode *hinode __maybe_unused)
+AuStubVoid(au_hn_ctl, struct au_hinode *hinode __maybe_unused,
+	   int do_set __maybe_unused)
+AuStubVoid(au_hn_reset, struct inode *inode __maybe_unused,
+	   unsigned int flags __maybe_unused)
+AuStubInt0(au_hnotify_reset_br, unsigned int udba __maybe_unused,
+	   struct au_branch *br __maybe_unused,
+	   int perm __maybe_unused)
+AuStubInt0(au_hnotify_init_br, struct au_branch *br __maybe_unused,
+	   int perm __maybe_unused)
+AuStubVoid(au_hnotify_fin_br, struct au_branch *br __maybe_unused)
+AuStubInt0(__init au_hnotify_init, void)
+AuStubVoid(au_hnotify_fin, void)
+AuStubVoid(au_hn_init, struct au_hinode *hinode __maybe_unused)
+#endif /* CONFIG_AUFS_HNOTIFY */
+
+static inline void au_hn_suspend(struct au_hinode *hdir)
+{
+	au_hn_ctl(hdir, /*do_set*/0);
+}
+
+static inline void au_hn_resume(struct au_hinode *hdir)
+{
+	au_hn_ctl(hdir, /*do_set*/1);
+}
+
+static inline void au_hn_inode_lock(struct au_hinode *hdir)
+{
+	inode_lock(hdir->hi_inode);
+	au_hn_suspend(hdir);
+}
+
+static inline void au_hn_inode_lock_nested(struct au_hinode *hdir,
+					  unsigned int sc __maybe_unused)
+{
+	inode_lock_nested(hdir->hi_inode, sc);
+	au_hn_suspend(hdir);
+}
+
+static inline void au_hn_inode_unlock(struct au_hinode *hdir)
+{
+	au_hn_resume(hdir);
+	inode_unlock(hdir->hi_inode);
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_INODE_H__ */
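
For reference, a minimal sketch of what one AuRWLockFuncs() instantiation above expands to. The function bodies follow directly from the AuReadLockFunc()/AuWriteLockFunc() macros in this header; the AuLsc_II_CHILD subclass constant is assumed to be defined with the other AuLsc_II_* values elsewhere in the aufs headers.

	/* sketch: approximate expansion of AuRWLockFuncs(child, CHILD) */
	static inline void ii_read_lock_child(struct inode *i)
	{
		au_rw_read_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_CHILD);
	}

	static inline void ii_write_lock_child(struct inode *i)
	{
		au_rw_write_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_CHILD);
	}

The per-name subclasses exist so that lockdep can tell apart parent and child ii_rwsem instances held at the same time.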
diff --git b/fs/aufs/ioctl.c b/fs/aufs/ioctl.c
new file mode 100644
index 0000000..2ebfdc4
--- /dev/null
+++ b/fs/aufs/ioctl.c
@@ -0,0 +1,206 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * ioctl
+ * plink-management and readdir in userspace.
+ * assist the pathconf(3) wrapper library.
+ * move-down
+ * File-based Hierarchical Storage Management.
+ */
+
+#include <linux/compat.h>
+#include <linux/file.h>
+#include "aufs.h"
+
+static int au_wbr_fd(struct path *path, struct aufs_wbr_fd __user *arg)
+{
+	int err, fd;
+	aufs_bindex_t wbi, bindex, bbot;
+	struct file *h_file;
+	struct super_block *sb;
+	struct dentry *root;
+	struct au_branch *br;
+	struct aufs_wbr_fd wbrfd = {
+		.oflags	= au_dir_roflags,
+		.brid	= -1
+	};
+	const int valid = O_RDONLY | O_NONBLOCK | O_LARGEFILE | O_DIRECTORY
+		| O_NOATIME | O_CLOEXEC;
+
+	AuDebugOn(wbrfd.oflags & ~valid);
+
+	if (arg) {
+		err = copy_from_user(&wbrfd, arg, sizeof(wbrfd));
+		if (unlikely(err)) {
+			err = -EFAULT;
+			goto out;
+		}
+
+		err = -EINVAL;
+		AuDbg("wbrfd{0%o, %d}\n", wbrfd.oflags, wbrfd.brid);
+		wbrfd.oflags |= au_dir_roflags;
+		AuDbg("0%o\n", wbrfd.oflags);
+		if (unlikely(wbrfd.oflags & ~valid))
+			goto out;
+	}
+
+	fd = get_unused_fd_flags(0);
+	err = fd;
+	if (unlikely(fd < 0))
+		goto out;
+
+	h_file = ERR_PTR(-EINVAL);
+	wbi = 0;
+	br = NULL;
+	sb = path->dentry->d_sb;
+	root = sb->s_root;
+	aufs_read_lock(root, AuLock_IR);
+	bbot = au_sbbot(sb);
+	if (wbrfd.brid >= 0) {
+		wbi = au_br_index(sb, wbrfd.brid);
+		if (unlikely(wbi < 0 || wbi > bbot))
+			goto out_unlock;
+	}
+
+	h_file = ERR_PTR(-ENOENT);
+	br = au_sbr(sb, wbi);
+	if (!au_br_writable(br->br_perm)) {
+		if (arg)
+			goto out_unlock;
+
+		bindex = wbi + 1;
+		wbi = -1;
+		for (; bindex <= bbot; bindex++) {
+			br = au_sbr(sb, bindex);
+			if (au_br_writable(br->br_perm)) {
+				wbi = bindex;
+				br = au_sbr(sb, wbi);
+				break;
+			}
+		}
+	}
+	AuDbg("wbi %d\n", wbi);
+	if (wbi >= 0)
+		h_file = au_h_open(root, wbi, wbrfd.oflags, NULL,
+				   /*force_wr*/0);
+
+out_unlock:
+	aufs_read_unlock(root, AuLock_IR);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out_fd;
+
+	au_br_put(br); /* cf. au_h_open() */
+	fd_install(fd, h_file);
+	err = fd;
+	goto out; /* success */
+
+out_fd:
+	put_unused_fd(fd);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long err;
+	struct dentry *dentry;
+
+	switch (cmd) {
+	case AUFS_CTL_RDU:
+	case AUFS_CTL_RDU_INO:
+		err = au_rdu_ioctl(file, cmd, arg);
+		break;
+
+	case AUFS_CTL_WBR_FD:
+		err = au_wbr_fd(&file->f_path, (void __user *)arg);
+		break;
+
+	case AUFS_CTL_IBUSY:
+		err = au_ibusy_ioctl(file, arg);
+		break;
+
+	case AUFS_CTL_BRINFO:
+		err = au_brinfo_ioctl(file, arg);
+		break;
+
+	case AUFS_CTL_FHSM_FD:
+		dentry = file->f_path.dentry;
+		if (IS_ROOT(dentry))
+			err = au_fhsm_fd(dentry->d_sb, arg);
+		else
+			err = -ENOTTY;
+		break;
+
+	default:
+		/* do not call the lower */
+		AuDbg("0x%x\n", cmd);
+		err = -ENOTTY;
+	}
+
+	AuTraceErr(err);
+	return err;
+}
+
+long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long err;
+
+	switch (cmd) {
+	case AUFS_CTL_MVDOWN:
+		err = au_mvdown(file->f_path.dentry, (void __user *)arg);
+		break;
+
+	case AUFS_CTL_WBR_FD:
+		err = au_wbr_fd(&file->f_path, (void __user *)arg);
+		break;
+
+	default:
+		/* do not call the lower */
+		AuDbg("0x%x\n", cmd);
+		err = -ENOTTY;
+	}
+
+	AuTraceErr(err);
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd,
+			   unsigned long arg)
+{
+	long err;
+
+	switch (cmd) {
+	case AUFS_CTL_RDU:
+	case AUFS_CTL_RDU_INO:
+		err = au_rdu_compat_ioctl(file, cmd, arg);
+		break;
+
+	case AUFS_CTL_IBUSY:
+		err = au_ibusy_compat_ioctl(file, arg);
+		break;
+
+	case AUFS_CTL_BRINFO:
+		err = au_brinfo_compat_ioctl(file, arg);
+		break;
+
+	default:
+		err = aufs_ioctl_dir(file, cmd, arg);
+	}
+
+	AuTraceErr(err);
+	return err;
+}
+
+long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd,
+			      unsigned long arg)
+{
+	return aufs_ioctl_nondir(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
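
A hedged userspace sketch of the AUFS_CTL_WBR_FD ioctl handled by au_wbr_fd() above: on success it returns a new file descriptor opened on a writable branch root. The .oflags/.brid fields and the brid==-1 convention come from the code above; the uapi header name <linux/aufs_type.h> is an assumption and error handling is minimal.

	/* userspace sketch, not part of the patch */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/aufs_type.h>	/* assumed header providing AUFS_CTL_WBR_FD */

	int open_writable_branch(const char *aufs_dir)
	{
		struct aufs_wbr_fd wbrfd = { .oflags = O_RDONLY, .brid = -1 };
		int dirfd, brfd;

		dirfd = open(aufs_dir, O_RDONLY | O_DIRECTORY);
		if (dirfd < 0)
			return -1;
		/* on success the ioctl returns the new fd for the branch root */
		brfd = ioctl(dirfd, AUFS_CTL_WBR_FD, &wbrfd);
		if (brfd < 0)
			perror("AUFS_CTL_WBR_FD");
		close(dirfd);
		return brfd;
	}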
diff --git b/fs/aufs/loop.c b/fs/aufs/loop.c
new file mode 100644
index 0000000..c3ca50f
--- /dev/null
+++ b/fs/aufs/loop.c
@@ -0,0 +1,134 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * support for loopback block device as a branch
+ */
+
+#include "aufs.h"
+
+/* added into drivers/block/loop.c */
+static struct file *(*backing_file_func)(struct super_block *sb);
+
+/*
+ * test if two lower dentries have overlapping branches.
+ */
+int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding)
+{
+	struct super_block *h_sb;
+	struct file *backing_file;
+
+	if (unlikely(!backing_file_func)) {
+		/* don't load "loop" module here */
+		backing_file_func = symbol_get(loop_backing_file);
+		if (unlikely(!backing_file_func))
+			/* "loop" module is not loaded */
+			return 0;
+	}
+
+	h_sb = h_adding->d_sb;
+	backing_file = backing_file_func(h_sb);
+	if (!backing_file)
+		return 0;
+
+	h_adding = backing_file->f_path.dentry;
+	/*
+	 * h_adding can be local NFS.
+	 * in this case aufs cannot detect the loop.
+	 */
+	if (unlikely(h_adding->d_sb == sb))
+		return 1;
+	return !!au_test_subdir(h_adding, sb->s_root);
+}
+
+/* true if a kernel thread named 'loop[0-9].*' accesses a file */
+int au_test_loopback_kthread(void)
+{
+	int ret;
+	struct task_struct *tsk = current;
+	char c, comm[sizeof(tsk->comm)];
+
+	ret = 0;
+	if (tsk->flags & PF_KTHREAD) {
+		get_task_comm(comm, tsk);
+		c = comm[4];
+		ret = ('0' <= c && c <= '9'
+		       && !strncmp(comm, "loop", 4));
+	}
+
+	return ret;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#define au_warn_loopback_step	16
+static int au_warn_loopback_nelem = au_warn_loopback_step;
+static unsigned long *au_warn_loopback_array;
+
+void au_warn_loopback(struct super_block *h_sb)
+{
+	int i, new_nelem;
+	unsigned long *a, magic;
+	static DEFINE_SPINLOCK(spin);
+
+	magic = h_sb->s_magic;
+	spin_lock(&spin);
+	a = au_warn_loopback_array;
+	for (i = 0; i < au_warn_loopback_nelem && *a; i++)
+		if (a[i] == magic) {
+			spin_unlock(&spin);
+			return;
+		}
+
+	/* h_sb is new to us, print it */
+	if (i < au_warn_loopback_nelem) {
+		a[i] = magic;
+		goto pr;
+	}
+
+	/* expand the array */
+	new_nelem = au_warn_loopback_nelem + au_warn_loopback_step;
+	a = au_kzrealloc(au_warn_loopback_array,
+			 au_warn_loopback_nelem * sizeof(unsigned long),
+			 new_nelem * sizeof(unsigned long), GFP_ATOMIC,
+			 /*may_shrink*/0);
+	if (a) {
+		au_warn_loopback_nelem = new_nelem;
+		au_warn_loopback_array = a;
+		a[i] = magic;
+		goto pr;
+	}
+
+	spin_unlock(&spin);
+	AuWarn1("realloc failed, ignored\n");
+	return;
+
+pr:
+	spin_unlock(&spin);
+	pr_warn("you may want to try another patch for loopback file "
+		"on %s(0x%lx) branch\n", au_sbtype(h_sb), magic);
+}
+
+int au_loopback_init(void)
+{
+	int err;
+	struct super_block *sb __maybe_unused;
+
+	BUILD_BUG_ON(sizeof(sb->s_magic) != sizeof(unsigned long));
+
+	err = 0;
+	au_warn_loopback_array = kcalloc(au_warn_loopback_step,
+					 sizeof(unsigned long), GFP_NOFS);
+	if (unlikely(!au_warn_loopback_array))
+		err = -ENOMEM;
+
+	return err;
+}
+
+void au_loopback_fin(void)
+{
+	if (backing_file_func)
+		symbol_put(loop_backing_file);
+	au_delayed_kfree(au_warn_loopback_array);
+}
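
A hedged sketch of how a caller might use au_test_loopback_overlap() when a new branch is being added. The function below is purely illustrative (the real branch-add path lives elsewhere in aufs) and the error code is chosen for the example only.

	/* illustrative caller only, assuming the signature declared in loop.h */
	static int check_new_branch(struct super_block *sb, struct dentry *h_root)
	{
		/* reject a branch backed by a loop device whose file lives inside this aufs */
		if (au_test_loopback_overlap(sb, h_root))
			return -ELOOP;	/* error code chosen for illustration */
		return 0;
	}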
diff --git b/fs/aufs/loop.h b/fs/aufs/loop.h
new file mode 100644
index 0000000..48bf070
--- /dev/null
+++ b/fs/aufs/loop.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * support for loopback mount as a branch
+ */
+
+#ifndef __AUFS_LOOP_H__
+#define __AUFS_LOOP_H__
+
+#ifdef __KERNEL__
+
+struct dentry;
+struct super_block;
+
+#ifdef CONFIG_AUFS_BDEV_LOOP
+/* drivers/block/loop.c */
+struct file *loop_backing_file(struct super_block *sb);
+
+/* loop.c */
+int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding);
+int au_test_loopback_kthread(void);
+void au_warn_loopback(struct super_block *h_sb);
+
+int au_loopback_init(void);
+void au_loopback_fin(void);
+#else
+AuStubInt0(au_test_loopback_overlap, struct super_block *sb,
+	   struct dentry *h_adding)
+AuStubInt0(au_test_loopback_kthread, void)
+AuStubVoid(au_warn_loopback, struct super_block *h_sb)
+
+AuStubInt0(au_loopback_init, void)
+AuStubVoid(au_loopback_fin, void)
+#endif /* CONFIG_AUFS_BDEV_LOOP */
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_LOOP_H__ */
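
The AuStubInt0()/AuStubVoid() generators above let callers invoke these helpers without any #ifdef CONFIG_AUFS_BDEV_LOOP of their own. Their definitions live elsewhere in aufs; presumably they expand to empty inline functions roughly like this sketch (an assumption, shown only to make the pattern concrete):

	/* assumed shape of the stub generators; the real definitions are elsewhere in aufs */
	#define AuStub(type, name, action, ...) \
		static inline type name(__VA_ARGS__) { action; }
	#define AuStubInt0(name, ...)	AuStub(int, name, return 0, __VA_ARGS__)
	#define AuStubVoid(name, ...)	AuStub(void, name, , __VA_ARGS__)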
diff --git b/fs/aufs/magic.mk b/fs/aufs/magic.mk
new file mode 100644
index 0000000..4f83bdf
--- /dev/null
+++ b/fs/aufs/magic.mk
@@ -0,0 +1,30 @@
+
+# defined in ${srctree}/fs/fuse/inode.c
+# tristate
+ifdef CONFIG_FUSE_FS
+ccflags-y += -DFUSE_SUPER_MAGIC=0x65735546
+endif
+
+# defined in ${srctree}/fs/xfs/xfs_sb.h
+# tristate
+ifdef CONFIG_XFS_FS
+ccflags-y += -DXFS_SB_MAGIC=0x58465342
+endif
+
+# defined in ${srctree}/fs/configfs/mount.c
+# tristate
+ifdef CONFIG_CONFIGFS_FS
+ccflags-y += -DCONFIGFS_MAGIC=0x62656570
+endif
+
+# defined in ${srctree}/fs/ubifs/ubifs.h
+# tristate
+ifdef CONFIG_UBIFS_FS
+ccflags-y += -DUBIFS_SUPER_MAGIC=0x24051905
+endif
+
+# defined in ${srctree}/fs/hfsplus/hfsplus_raw.h
+# tristate
+ifdef CONFIG_HFSPLUS_FS
+ccflags-y += -DHFSPLUS_SUPER_MAGIC=0x482b
+endif
diff --git b/fs/aufs/module.c b/fs/aufs/module.c
new file mode 100644
index 0000000..5092b4b
--- /dev/null
+++ b/fs/aufs/module.c
@@ -0,0 +1,319 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * module global variables and operations
+ */
+
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include "aufs.h"
+
+/* shrinkable realloc */
+void *au_krealloc(void *p, unsigned int new_sz, gfp_t gfp, int may_shrink)
+{
+	size_t sz;
+	int diff;
+
+	sz = 0;
+	diff = -1;
+	if (p) {
+#if 0 /* unused */
+		if (!new_sz) {
+			au_delayed_kfree(p);
+			p = NULL;
+			goto out;
+		}
+#else
+		AuDebugOn(!new_sz);
+#endif
+		sz = ksize(p);
+		diff = au_kmidx_sub(sz, new_sz);
+	}
+	if (sz && !diff)
+		goto out;
+
+	if (sz < new_sz)
+		/* expand or SLOB */
+		p = krealloc(p, new_sz, gfp);
+	else if (new_sz < sz && may_shrink) {
+		/* shrink */
+		void *q;
+
+		q = kmalloc(new_sz, gfp);
+		if (q) {
+			if (p) {
+				memcpy(q, p, new_sz);
+				au_delayed_kfree(p);
+			}
+			p = q;
+		} else
+			p = NULL;
+	}
+
+out:
+	return p;
+}
+
+void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp,
+		   int may_shrink)
+{
+	p = au_krealloc(p, new_sz, gfp, may_shrink);
+	if (p && new_sz > nused)
+		memset(p + nused, 0, new_sz - nused);
+	return p;
+}
+
+/* ---------------------------------------------------------------------- */
+/*
+ * aufs caches
+ */
+
+struct au_dfree au_dfree;
+
+/* delayed free */
+static void au_do_dfree(struct work_struct *work __maybe_unused)
+{
+	struct llist_head *head;
+	struct llist_node *node, *next;
+
+#define AU_CACHE_DFREE_DO_BODY(name, idx, lnode) do {			\
+		head = &au_dfree.cache[AuCache_##idx].llist;		\
+		node = llist_del_all(head);				\
+		for (; node; node = next) {				\
+			struct au_##name *p				\
+				= llist_entry(node, struct au_##name,	\
+					      lnode);			\
+			next = llist_next(node);			\
+			au_cache_free_##name(p);			\
+		}							\
+	} while (0)
+
+	AU_CACHE_DFREE_DO_BODY(dinfo, DINFO, di_lnode);
+	AU_CACHE_DFREE_DO_BODY(icntnr, ICNTNR, lnode);
+	AU_CACHE_DFREE_DO_BODY(finfo, FINFO, fi_lnode);
+	AU_CACHE_DFREE_DO_BODY(vdir, VDIR, vd_lnode);
+	AU_CACHE_DFREE_DO_BODY(vdir_dehstr, DEHSTR, lnode);
+#ifdef CONFIG_AUFS_HNOTIFY
+	AU_CACHE_DFREE_DO_BODY(hnotify, HNOTIFY, hn_lnode);
+#endif
+
+#define AU_DFREE_DO_BODY(llist, func) do {		\
+		node = llist_del_all(llist);		\
+		for (; node; node = next) {		\
+			next = llist_next(node);	\
+			func(node);			\
+		}					\
+	} while (0)
+
+	AU_DFREE_DO_BODY(au_dfree.llist + AU_DFREE_KFREE, kfree);
+	AU_DFREE_DO_BODY(au_dfree.llist + AU_DFREE_FREE_PAGE, au_free_page);
+
+#undef AU_CACHE_DFREE_DO_BODY
+#undef AU_DFREE_DO_BODY
+}
+
+AU_CACHE_DFREE_FUNC(dinfo, DINFO, di_lnode);
+AU_CACHE_DFREE_FUNC(icntnr, ICNTNR, lnode);
+AU_CACHE_DFREE_FUNC(finfo, FINFO, fi_lnode);
+AU_CACHE_DFREE_FUNC(vdir, VDIR, vd_lnode);
+AU_CACHE_DFREE_FUNC(vdir_dehstr, DEHSTR, lnode);
+
+static void au_cache_fin(void)
+{
+	int i;
+	struct au_cache *cp;
+
+	/*
+	 * Make sure all delayed rcu free inodes are flushed before we
+	 * destroy the caches.
+	 */
+	rcu_barrier();
+
+	/* excluding AuCache_HNOTIFY */
+	BUILD_BUG_ON(AuCache_HNOTIFY + 1 != AuCache_Last);
+	flush_delayed_work(&au_dfree.dwork);
+	for (i = 0; i < AuCache_HNOTIFY; i++) {
+		cp = au_dfree.cache + i;
+		AuDebugOn(!llist_empty(&cp->llist));
+		kmem_cache_destroy(cp->cache);
+		cp->cache = NULL;
+	}
+}
+
+static int __init au_cache_init(void)
+{
+	struct au_cache *cp;
+
+	cp = au_dfree.cache;
+	cp[AuCache_DINFO].cache = AuCacheCtor(au_dinfo, au_di_init_once);
+	if (cp[AuCache_DINFO].cache)
+		/* SLAB_DESTROY_BY_RCU */
+		cp[AuCache_ICNTNR].cache = AuCacheCtor(au_icntnr,
+						       au_icntnr_init_once);
+	if (cp[AuCache_ICNTNR].cache)
+		cp[AuCache_FINFO].cache = AuCacheCtor(au_finfo,
+						      au_fi_init_once);
+	if (cp[AuCache_FINFO].cache)
+		cp[AuCache_VDIR].cache = AuCache(au_vdir);
+	if (cp[AuCache_VDIR].cache)
+		cp[AuCache_DEHSTR].cache = AuCache(au_vdir_dehstr);
+	if (cp[AuCache_DEHSTR].cache)
+		return 0;
+
+	au_cache_fin();
+	return -ENOMEM;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_dir_roflags;
+
+#ifdef CONFIG_AUFS_SBILIST
+/*
+ * iterate_supers_type() doesn't protect us from
+ * remounting (branch management)
+ */
+struct au_sphlhead au_sbilist;
+#endif
+
+/*
+ * functions for module interface.
+ */
+MODULE_LICENSE("GPL");
+/* MODULE_LICENSE("GPL v2"); */
+MODULE_AUTHOR("Junjiro R. Okajima <aufs-users@lists.sourceforge.net>");
+MODULE_DESCRIPTION(AUFS_NAME
+	" -- Advanced multi layered unification filesystem");
+MODULE_VERSION(AUFS_VERSION);
+
+/* this module parameter has no meaning when SYSFS is disabled */
+int sysaufs_brs = 1;
+MODULE_PARM_DESC(brs, "use <sysfs>/fs/aufs/si_*/brN");
+module_param_named(brs, sysaufs_brs, int, S_IRUGO);
+
+/* this module parameter has no meaning when USER_NS is disabled */
+bool au_userns;
+MODULE_PARM_DESC(allow_userns, "allow unprivileged to mount under userns");
+module_param_named(allow_userns, au_userns, bool, S_IRUGO);
+
+/* ---------------------------------------------------------------------- */
+
+static char au_esc_chars[0x20 + 3]; /* 0x01-0x20, backslash, del, and NULL */
+
+int au_seq_path(struct seq_file *seq, struct path *path)
+{
+	int err;
+
+	err = seq_path(seq, path, au_esc_chars);
+	if (err > 0)
+		err = 0;
+	else if (err < 0)
+		err = -ENOMEM;
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int __init aufs_init(void)
+{
+	int err, i;
+	char *p;
+	struct au_cache *cp;
+
+	p = au_esc_chars;
+	for (i = 1; i <= ' '; i++)
+		*p++ = i;
+	*p++ = '\\';
+	*p++ = '\x7f';
+	*p = 0;
+
+	au_dir_roflags = au_file_roflags(O_DIRECTORY | O_LARGEFILE);
+
+	memcpy(aufs_iop_nogetattr, aufs_iop, sizeof(aufs_iop));
+	for (i = 0; i < AuIop_Last; i++)
+		aufs_iop_nogetattr[i].getattr = NULL;
+
+	/* First, initialize au_dfree */
+	for (i = 0; i < AuCache_Last; i++) {	/* including hnotify */
+		cp = au_dfree.cache + i;
+		cp->cache = NULL;
+		init_llist_head(&cp->llist);
+	}
+	for (i = 0; i < AU_DFREE_Last; i++)
+		init_llist_head(au_dfree.llist + i);
+	INIT_DELAYED_WORK(&au_dfree.dwork, au_do_dfree);
+
+	au_sbilist_init();
+	sysaufs_brs_init();
+	au_debug_init();
+	au_dy_init();
+	err = sysaufs_init();
+	if (unlikely(err))
+		goto out;
+	err = au_procfs_init();
+	if (unlikely(err))
+		goto out_sysaufs;
+	err = au_wkq_init();
+	if (unlikely(err))
+		goto out_procfs;
+	err = au_loopback_init();
+	if (unlikely(err))
+		goto out_wkq;
+	err = au_hnotify_init();
+	if (unlikely(err))
+		goto out_loopback;
+	err = au_sysrq_init();
+	if (unlikely(err))
+		goto out_hin;
+	err = au_cache_init();
+	if (unlikely(err))
+		goto out_sysrq;
+
+	aufs_fs_type.fs_flags |= au_userns ? FS_USERNS_MOUNT : 0;
+	err = register_filesystem(&aufs_fs_type);
+	if (unlikely(err))
+		goto out_cache;
+
+	/* since we define pr_fmt, call printk directly */
+	printk(KERN_INFO AUFS_NAME " " AUFS_VERSION "\n");
+	goto out; /* success */
+
+out_cache:
+	au_cache_fin();
+out_sysrq:
+	au_sysrq_fin();
+out_hin:
+	au_hnotify_fin();
+out_loopback:
+	au_loopback_fin();
+out_wkq:
+	au_wkq_fin();
+out_procfs:
+	au_procfs_fin();
+out_sysaufs:
+	sysaufs_fin();
+	au_dy_fin();
+	flush_delayed_work(&au_dfree.dwork);
+out:
+	return err;
+}
+
+static void __exit aufs_exit(void)
+{
+	unregister_filesystem(&aufs_fs_type);
+	au_cache_fin();
+	au_sysrq_fin();
+	au_hnotify_fin();
+	au_loopback_fin();
+	au_wkq_fin();
+	au_procfs_fin();
+	sysaufs_fin();
+	au_dy_fin();
+	flush_delayed_work(&au_dfree.dwork);
+}
+
+module_init(aufs_init);
+module_exit(aufs_exit);
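
A short usage sketch of au_kzrealloc() as defined above: it grows (or, with may_shrink, shrinks) an allocation and zero-fills the bytes beyond the part already in use, which is the pattern au_warn_loopback() relies on. The wrapper below is illustrative only.

	/* illustrative au_kzrealloc() usage: grow an array and zero the new tail */
	static unsigned long *grow_ulong_array(unsigned long *a, int nused, int new_nelem)
	{
		return au_kzrealloc(a,
				    nused * sizeof(*a),		/* bytes currently in use */
				    new_nelem * sizeof(*a),	/* requested total size */
				    GFP_NOFS, /*may_shrink*/0);
	}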
diff --git b/fs/aufs/module.h b/fs/aufs/module.h
new file mode 100644
index 0000000..6f968ba
--- /dev/null
+++ b/fs/aufs/module.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * module initialization and module-global variables
+ */
+
+#ifndef __AUFS_MODULE_H__
+#define __AUFS_MODULE_H__
+
+#ifdef __KERNEL__
+
+#include <linux/slab.h>
+#include "debug.h"
+
+struct path;
+struct seq_file;
+
+/* module parameters */
+extern int sysaufs_brs;
+extern bool au_userns;
+
+/* ---------------------------------------------------------------------- */
+
+extern int au_dir_roflags;
+
+void *au_krealloc(void *p, unsigned int new_sz, gfp_t gfp, int may_shrink);
+void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp,
+		   int may_shrink);
+
+static inline int au_kmidx_sub(size_t sz, size_t new_sz)
+{
+#ifndef CONFIG_SLOB
+	return kmalloc_index(sz) - kmalloc_index(new_sz);
+#else
+	return -1; /* SLOB is untested */
+#endif
+}
+
+int au_seq_path(struct seq_file *seq, struct path *path);
+
+#ifdef CONFIG_PROC_FS
+/* procfs.c */
+int __init au_procfs_init(void);
+void au_procfs_fin(void);
+#else
+AuStubInt0(au_procfs_init, void);
+AuStubVoid(au_procfs_fin, void);
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+/* kmem cache and delayed free */
+enum {
+	AuCache_DINFO,
+	AuCache_ICNTNR,
+	AuCache_FINFO,
+	AuCache_VDIR,
+	AuCache_DEHSTR,
+	AuCache_HNOTIFY, /* must be last */
+	AuCache_Last
+};
+
+enum {
+	AU_DFREE_KFREE,
+	AU_DFREE_FREE_PAGE,
+	AU_DFREE_Last
+};
+
+struct au_cache {
+	struct kmem_cache	*cache;
+	struct llist_head	llist;	/* delayed free */
+};
+
+/*
+ * in order to reduce the cost of the internal timer, consolidate all the
+ * delayed-free work into a single delayed_work.
+ */
+struct au_dfree {
+	struct au_cache		cache[AuCache_Last];
+	struct llist_head	llist[AU_DFREE_Last];
+	struct delayed_work	dwork;
+};
+
+extern struct au_dfree au_dfree;
+
+#define AuCacheFlags		(SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD)
+#define AuCache(type)		KMEM_CACHE(type, AuCacheFlags)
+#define AuCacheCtor(type, ctor)	\
+	kmem_cache_create(#type, sizeof(struct type), \
+			  __alignof__(struct type), AuCacheFlags, ctor)
+
+#define AU_DFREE_DELAY		msecs_to_jiffies(10)
+#define AU_DFREE_BODY(lnode, llist) do {				\
+		if (llist_add(lnode, llist))				\
+			schedule_delayed_work(&au_dfree.dwork,		\
+					      AU_DFREE_DELAY);		\
+	} while (0)
+#define AU_CACHE_DFREE_FUNC(name, idx, lnode)				\
+	void au_cache_dfree_##name(struct au_##name *p)			\
+	{								\
+		struct au_cache *cp = au_dfree.cache + AuCache_##idx;	\
+		AU_DFREE_BODY(&p->lnode, &cp->llist);			\
+	}
+
+#define AuCacheFuncs(name, index) \
+static inline struct au_##name *au_cache_alloc_##name(void) \
+{ return kmem_cache_alloc(au_dfree.cache[AuCache_##index].cache, GFP_NOFS); } \
+static inline void au_cache_free_##name(struct au_##name *p) \
+{ kmem_cache_free(au_dfree.cache[AuCache_##index].cache, p); } \
+void au_cache_dfree_##name(struct au_##name *p)
+
+AuCacheFuncs(dinfo, DINFO);
+AuCacheFuncs(icntnr, ICNTNR);
+AuCacheFuncs(finfo, FINFO);
+AuCacheFuncs(vdir, VDIR);
+AuCacheFuncs(vdir_dehstr, DEHSTR);
+#ifdef CONFIG_AUFS_HNOTIFY
+AuCacheFuncs(hnotify, HNOTIFY);
+#endif
+
+static inline void au_delayed_kfree(const void *p)
+{
+	AuDebugOn(!p);
+	AuDebugOn(ksize(p) < sizeof(struct llist_node));
+
+	AU_DFREE_BODY((void *)p, au_dfree.llist + AU_DFREE_KFREE);
+}
+
+/* cast only */
+static inline void au_free_page(void *p)
+{
+	free_page((unsigned long)p);
+}
+
+static inline void au_delayed_free_page(unsigned long addr)
+{
+	AU_DFREE_BODY((void *)addr, au_dfree.llist + AU_DFREE_FREE_PAGE);
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_MODULE_H__ */
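
For reference, a sketch of roughly what one AuCacheFuncs() declaration here, combined with the matching AU_CACHE_DFREE_FUNC() instantiation in module.c, generates. The names and the fi_lnode member come straight from those macros, so this is a restatement of the pattern rather than new API.

	/* sketch: approximate expansion of AuCacheFuncs(finfo, FINFO)
	 * plus AU_CACHE_DFREE_FUNC(finfo, FINFO, fi_lnode) in module.c */
	static inline struct au_finfo *au_cache_alloc_finfo(void)
	{
		return kmem_cache_alloc(au_dfree.cache[AuCache_FINFO].cache, GFP_NOFS);
	}

	static inline void au_cache_free_finfo(struct au_finfo *p)
	{
		kmem_cache_free(au_dfree.cache[AuCache_FINFO].cache, p);
	}

	void au_cache_dfree_finfo(struct au_finfo *p)
	{
		struct au_cache *cp = au_dfree.cache + AuCache_FINFO;

		/* queue on the lock-free list; the single delayed_work frees it later */
		AU_DFREE_BODY(&p->fi_lnode, &cp->llist);
	}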
diff --git b/fs/aufs/mvdown.c b/fs/aufs/mvdown.c
new file mode 100644
index 0000000..b0e5b84
--- /dev/null
+++ b/fs/aufs/mvdown.c
@@ -0,0 +1,691 @@
+/*
+ * Copyright (C) 2011-2016 Junjiro R. Okajima
+ */
+
+/*
+ * move-down, opposite of copy-up
+ */
+
+#include "aufs.h"
+
+struct au_mvd_args {
+	struct {
+		struct super_block *h_sb;
+		struct dentry *h_parent;
+		struct au_hinode *hdir;
+		struct inode *h_dir, *h_inode;
+		struct au_pin pin;
+	} info[AUFS_MVDOWN_NARRAY];
+
+	struct aufs_mvdown mvdown;
+	struct dentry *dentry, *parent;
+	struct inode *inode, *dir;
+	struct super_block *sb;
+	aufs_bindex_t bopq, bwh, bfound;
+	unsigned char rename_lock;
+};
+
+#define mvd_errno		mvdown.au_errno
+#define mvd_bsrc		mvdown.stbr[AUFS_MVDOWN_UPPER].bindex
+#define mvd_src_brid		mvdown.stbr[AUFS_MVDOWN_UPPER].brid
+#define mvd_bdst		mvdown.stbr[AUFS_MVDOWN_LOWER].bindex
+#define mvd_dst_brid		mvdown.stbr[AUFS_MVDOWN_LOWER].brid
+
+#define mvd_h_src_sb		info[AUFS_MVDOWN_UPPER].h_sb
+#define mvd_h_src_parent	info[AUFS_MVDOWN_UPPER].h_parent
+#define mvd_hdir_src		info[AUFS_MVDOWN_UPPER].hdir
+#define mvd_h_src_dir		info[AUFS_MVDOWN_UPPER].h_dir
+#define mvd_h_src_inode		info[AUFS_MVDOWN_UPPER].h_inode
+#define mvd_pin_src		info[AUFS_MVDOWN_UPPER].pin
+
+#define mvd_h_dst_sb		info[AUFS_MVDOWN_LOWER].h_sb
+#define mvd_h_dst_parent	info[AUFS_MVDOWN_LOWER].h_parent
+#define mvd_hdir_dst		info[AUFS_MVDOWN_LOWER].hdir
+#define mvd_h_dst_dir		info[AUFS_MVDOWN_LOWER].h_dir
+#define mvd_h_dst_inode		info[AUFS_MVDOWN_LOWER].h_inode
+#define mvd_pin_dst		info[AUFS_MVDOWN_LOWER].pin
+
+#define AU_MVD_PR(flag, ...) do {			\
+		if (flag)				\
+			pr_err(__VA_ARGS__);		\
+	} while (0)
+
+static int find_lower_writable(struct au_mvd_args *a)
+{
+	struct super_block *sb;
+	aufs_bindex_t bindex, bbot;
+	struct au_branch *br;
+
+	sb = a->sb;
+	bindex = a->mvd_bsrc;
+	bbot = au_sbbot(sb);
+	if (a->mvdown.flags & AUFS_MVDOWN_FHSM_LOWER)
+		for (bindex++; bindex <= bbot; bindex++) {
+			br = au_sbr(sb, bindex);
+			if (au_br_fhsm(br->br_perm)
+			    && (!(au_br_sb(br)->s_flags & MS_RDONLY)))
+				return bindex;
+		}
+	else if (!(a->mvdown.flags & AUFS_MVDOWN_ROLOWER))
+		for (bindex++; bindex <= bbot; bindex++) {
+			br = au_sbr(sb, bindex);
+			if (!au_br_rdonly(br))
+				return bindex;
+		}
+	else
+		for (bindex++; bindex <= bbot; bindex++) {
+			br = au_sbr(sb, bindex);
+			if (!(au_br_sb(br)->s_flags & MS_RDONLY)) {
+				if (au_br_rdonly(br))
+					a->mvdown.flags
+						|= AUFS_MVDOWN_ROLOWER_R;
+				return bindex;
+			}
+		}
+
+	return -1;
+}
+
+/* make the parent dir on bdst */
+static int au_do_mkdir(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+
+	err = 0;
+	a->mvd_hdir_src = au_hi(a->dir, a->mvd_bsrc);
+	a->mvd_hdir_dst = au_hi(a->dir, a->mvd_bdst);
+	a->mvd_h_src_parent = au_h_dptr(a->parent, a->mvd_bsrc);
+	a->mvd_h_dst_parent = NULL;
+	if (au_dbbot(a->parent) >= a->mvd_bdst)
+		a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst);
+	if (!a->mvd_h_dst_parent) {
+		err = au_cpdown_dirs(a->dentry, a->mvd_bdst);
+		if (unlikely(err)) {
+			AU_MVD_PR(dmsg, "cpdown_dirs failed\n");
+			goto out;
+		}
+		a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst);
+	}
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* lock them all */
+static int au_do_lock(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+	struct dentry *h_trap;
+
+	a->mvd_h_src_sb = au_sbr_sb(a->sb, a->mvd_bsrc);
+	a->mvd_h_dst_sb = au_sbr_sb(a->sb, a->mvd_bdst);
+	err = au_pin(&a->mvd_pin_dst, a->dentry, a->mvd_bdst,
+		     au_opt_udba(a->sb),
+		     AuPin_MNT_WRITE | AuPin_DI_LOCKED);
+	AuTraceErr(err);
+	if (unlikely(err)) {
+		AU_MVD_PR(dmsg, "pin_dst failed\n");
+		goto out;
+	}
+
+	if (a->mvd_h_src_sb != a->mvd_h_dst_sb) {
+		a->rename_lock = 0;
+		au_pin_init(&a->mvd_pin_src, a->dentry, a->mvd_bsrc,
+			    AuLsc_DI_PARENT, AuLsc_I_PARENT3,
+			    au_opt_udba(a->sb),
+			    AuPin_MNT_WRITE | AuPin_DI_LOCKED);
+		err = au_do_pin(&a->mvd_pin_src);
+		AuTraceErr(err);
+		a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent);
+		if (unlikely(err)) {
+			AU_MVD_PR(dmsg, "pin_src failed\n");
+			goto out_dst;
+		}
+		goto out; /* success */
+	}
+
+	a->rename_lock = 1;
+	au_pin_hdir_unlock(&a->mvd_pin_dst);
+	err = au_pin(&a->mvd_pin_src, a->dentry, a->mvd_bsrc,
+		     au_opt_udba(a->sb),
+		     AuPin_MNT_WRITE | AuPin_DI_LOCKED);
+	AuTraceErr(err);
+	a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent);
+	if (unlikely(err)) {
+		AU_MVD_PR(dmsg, "pin_src failed\n");
+		au_pin_hdir_lock(&a->mvd_pin_dst);
+		goto out_dst;
+	}
+	au_pin_hdir_unlock(&a->mvd_pin_src);
+	h_trap = vfsub_lock_rename(a->mvd_h_src_parent, a->mvd_hdir_src,
+				   a->mvd_h_dst_parent, a->mvd_hdir_dst);
+	if (h_trap) {
+		err = (h_trap != a->mvd_h_src_parent);
+		if (err)
+			err = (h_trap != a->mvd_h_dst_parent);
+	}
+	BUG_ON(err); /* it should never happen */
+	if (unlikely(a->mvd_h_src_dir != au_pinned_h_dir(&a->mvd_pin_src))) {
+		err = -EBUSY;
+		AuTraceErr(err);
+		vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src,
+				    a->mvd_h_dst_parent, a->mvd_hdir_dst);
+		au_pin_hdir_lock(&a->mvd_pin_src);
+		au_unpin(&a->mvd_pin_src);
+		au_pin_hdir_lock(&a->mvd_pin_dst);
+		goto out_dst;
+	}
+	goto out; /* success */
+
+out_dst:
+	au_unpin(&a->mvd_pin_dst);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static void au_do_unlock(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	if (!a->rename_lock)
+		au_unpin(&a->mvd_pin_src);
+	else {
+		vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src,
+				    a->mvd_h_dst_parent, a->mvd_hdir_dst);
+		au_pin_hdir_lock(&a->mvd_pin_src);
+		au_unpin(&a->mvd_pin_src);
+		au_pin_hdir_lock(&a->mvd_pin_dst);
+	}
+	au_unpin(&a->mvd_pin_dst);
+}
+
+/* copy-down the file */
+static int au_do_cpdown(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+	struct au_cp_generic cpg = {
+		.dentry	= a->dentry,
+		.bdst	= a->mvd_bdst,
+		.bsrc	= a->mvd_bsrc,
+		.len	= -1,
+		.pin	= &a->mvd_pin_dst,
+		.flags	= AuCpup_DTIME | AuCpup_HOPEN
+	};
+
+	AuDbg("b%d, b%d\n", cpg.bsrc, cpg.bdst);
+	if (a->mvdown.flags & AUFS_MVDOWN_OWLOWER)
+		au_fset_cpup(cpg.flags, OVERWRITE);
+	if (a->mvdown.flags & AUFS_MVDOWN_ROLOWER)
+		au_fset_cpup(cpg.flags, RWDST);
+	err = au_sio_cpdown_simple(&cpg);
+	if (unlikely(err))
+		AU_MVD_PR(dmsg, "cpdown failed\n");
+
+	AuTraceErr(err);
+	return err;
+}
+
+/*
+ * unlink the whiteout on bdst if it exists; it may have been created by UDBA
+ * while we were sleeping
+ */
+static int au_do_unlink_wh(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+	struct path h_path;
+	struct au_branch *br;
+	struct inode *delegated;
+
+	br = au_sbr(a->sb, a->mvd_bdst);
+	h_path.dentry = au_wh_lkup(a->mvd_h_dst_parent, &a->dentry->d_name, br);
+	err = PTR_ERR(h_path.dentry);
+	if (IS_ERR(h_path.dentry)) {
+		AU_MVD_PR(dmsg, "wh_lkup failed\n");
+		goto out;
+	}
+
+	err = 0;
+	if (d_is_positive(h_path.dentry)) {
+		h_path.mnt = au_br_mnt(br);
+		delegated = NULL;
+		err = vfsub_unlink(d_inode(a->mvd_h_dst_parent), &h_path,
+				   &delegated, /*force*/0);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal unlink\n");
+			iput(delegated);
+		}
+		if (unlikely(err))
+			AU_MVD_PR(dmsg, "wh_unlink failed\n");
+	}
+	dput(h_path.dentry);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/*
+ * unlink the topmost h_dentry
+ */
+static int au_do_unlink(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+	struct path h_path;
+	struct inode *delegated;
+
+	h_path.mnt = au_sbr_mnt(a->sb, a->mvd_bsrc);
+	h_path.dentry = au_h_dptr(a->dentry, a->mvd_bsrc);
+	delegated = NULL;
+	err = vfsub_unlink(a->mvd_h_src_dir, &h_path, &delegated, /*force*/0);
+	if (unlikely(err == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal unlink\n");
+		iput(delegated);
+	}
+	if (unlikely(err))
+		AU_MVD_PR(dmsg, "unlink failed\n");
+
+	AuTraceErr(err);
+	return err;
+}
+
+/* Since mvdown succeeded, we ignore errors from this function */
+static void au_do_stfs(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+	struct au_branch *br;
+
+	a->mvdown.flags |= AUFS_MVDOWN_STFS_FAILED;
+	br = au_sbr(a->sb, a->mvd_bsrc);
+	err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_UPPER].stfs);
+	if (!err) {
+		br = au_sbr(a->sb, a->mvd_bdst);
+		a->mvdown.stbr[AUFS_MVDOWN_LOWER].brid = br->br_id;
+		err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_LOWER].stfs);
+	}
+	if (!err)
+		a->mvdown.flags &= ~AUFS_MVDOWN_STFS_FAILED;
+	else
+		AU_MVD_PR(dmsg, "statfs failed (%d), ignored\n", err);
+}
+
+/*
+ * copy-down the file and unlink the bsrc file.
+ * - unlink the whiteout on bdst if it exists
+ * - copy-down the file (with whtmp name and rename)
+ * - unlink the bsrc file
+ */
+static int au_do_mvdown(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+
+	err = au_do_mkdir(dmsg, a);
+	if (!err)
+		err = au_do_lock(dmsg, a);
+	if (unlikely(err))
+		goto out;
+
+	/*
+	 * do not revert the activities we made on bdst since they should be
+	 * harmless in aufs.
+	 */
+
+	err = au_do_cpdown(dmsg, a);
+	if (!err)
+		err = au_do_unlink_wh(dmsg, a);
+	if (!err && !(a->mvdown.flags & AUFS_MVDOWN_KUPPER))
+		err = au_do_unlink(dmsg, a);
+	if (unlikely(err))
+		goto out_unlock;
+
+	AuDbg("%pd2, 0x%x, %d --> %d\n",
+	      a->dentry, a->mvdown.flags, a->mvd_bsrc, a->mvd_bdst);
+	if (find_lower_writable(a) < 0)
+		a->mvdown.flags |= AUFS_MVDOWN_BOTTOM;
+
+	if (a->mvdown.flags & AUFS_MVDOWN_STFS)
+		au_do_stfs(dmsg, a);
+
+	/* maintain internal array */
+	if (!(a->mvdown.flags & AUFS_MVDOWN_KUPPER)) {
+		au_set_h_dptr(a->dentry, a->mvd_bsrc, NULL);
+		au_set_dbtop(a->dentry, a->mvd_bdst);
+		au_set_h_iptr(a->inode, a->mvd_bsrc, NULL, /*flags*/0);
+		au_set_ibtop(a->inode, a->mvd_bdst);
+	} else {
+		/* hide the lower */
+		au_set_h_dptr(a->dentry, a->mvd_bdst, NULL);
+		au_set_dbbot(a->dentry, a->mvd_bsrc);
+		au_set_h_iptr(a->inode, a->mvd_bdst, NULL, /*flags*/0);
+		au_set_ibbot(a->inode, a->mvd_bsrc);
+	}
+	if (au_dbbot(a->dentry) < a->mvd_bdst)
+		au_set_dbbot(a->dentry, a->mvd_bdst);
+	if (au_ibbot(a->inode) < a->mvd_bdst)
+		au_set_ibbot(a->inode, a->mvd_bdst);
+
+out_unlock:
+	au_do_unlock(dmsg, a);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* make sure the file is idle */
+static int au_mvd_args_busy(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err, plinked;
+
+	err = 0;
+	plinked = !!au_opt_test(au_mntflags(a->sb), PLINK);
+	if (au_dbtop(a->dentry) == a->mvd_bsrc
+	    && au_dcount(a->dentry) == 1
+	    && atomic_read(&a->inode->i_count) == 1
+	    /* && a->mvd_h_src_inode->i_nlink == 1 */
+	    && (!plinked || !au_plink_test(a->inode))
+	    && a->inode->i_nlink == 1)
+		goto out;
+
+	err = -EBUSY;
+	AU_MVD_PR(dmsg,
+		  "b%d, d{b%d, c%d?}, i{c%d?, l%u}, hi{l%u}, p{%d, %d}\n",
+		  a->mvd_bsrc, au_dbtop(a->dentry), au_dcount(a->dentry),
+		  atomic_read(&a->inode->i_count), a->inode->i_nlink,
+		  a->mvd_h_src_inode->i_nlink,
+		  plinked, plinked ? au_plink_test(a->inode) : 0);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* make sure the parent dir is fine */
+static int au_mvd_args_parent(const unsigned char dmsg,
+			      struct au_mvd_args *a)
+{
+	int err;
+	aufs_bindex_t bindex;
+
+	err = 0;
+	if (unlikely(au_alive_dir(a->parent))) {
+		err = -ENOENT;
+		AU_MVD_PR(dmsg, "parent dir is dead\n");
+		goto out;
+	}
+
+	a->bopq = au_dbdiropq(a->parent);
+	bindex = au_wbr_nonopq(a->dentry, a->mvd_bdst);
+	AuDbg("b%d\n", bindex);
+	if (unlikely((bindex >= 0 && bindex < a->mvd_bdst)
+		     || (a->bopq != -1 && a->bopq < a->mvd_bdst))) {
+		err = -EINVAL;
+		a->mvd_errno = EAU_MVDOWN_OPAQUE;
+		AU_MVD_PR(dmsg, "ancestor is opaque b%d, b%d\n",
+			  a->bopq, a->mvd_bdst);
+	}
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_mvd_args_intermediate(const unsigned char dmsg,
+				    struct au_mvd_args *a)
+{
+	int err;
+	struct au_dinfo *dinfo, *tmp;
+
+	/* lookup the next lower positive entry */
+	err = -ENOMEM;
+	tmp = au_di_alloc(a->sb, AuLsc_DI_TMP);
+	if (unlikely(!tmp))
+		goto out;
+
+	a->bfound = -1;
+	a->bwh = -1;
+	dinfo = au_di(a->dentry);
+	au_di_cp(tmp, dinfo);
+	au_di_swap(tmp, dinfo);
+
+	/* returns the number of positive dentries */
+	err = au_lkup_dentry(a->dentry, a->mvd_bsrc + 1,
+			     /* AuLkup_IGNORE_PERM */ 0);
+	if (!err)
+		a->bwh = au_dbwh(a->dentry);
+	else if (err > 0)
+		a->bfound = au_dbtop(a->dentry);
+
+	au_di_swap(tmp, dinfo);
+	au_rw_write_unlock(&tmp->di_rwsem);
+	au_di_free(tmp);
+	if (unlikely(err < 0))
+		AU_MVD_PR(dmsg, "failed look-up lower\n");
+
+	/*
+	 * here, we have these cases.
+	 * bfound == -1
+	 *	no positive dentry under bsrc. there are more sub-cases.
+	 *	bwh < 0
+	 *		there is no whiteout; we can safely move down.
+	 *	bwh <= bsrc
+	 *		impossible
+	 *	bsrc < bwh && bwh < bdst
+	 *		there is a whiteout on an RO branch. cannot proceed.
+	 *	bwh == bdst
+	 *		there is a whiteout on the RW target branch. it should
+	 *		be removed.
+	 *	bdst < bwh
+	 *		there is a whiteout on some unrelated branch.
+	 * -1 < bfound && bfound <= bsrc
+	 *	impossible.
+	 * bfound < bdst
+	 *	found, but it is on an RO branch between bsrc and bdst. cannot
+	 *	proceed.
+	 * bfound == bdst
+	 *	found, replace it if AUFS_MVDOWN_FORCE is set. otherwise return
+	 *	error.
+	 * bdst < bfound
+	 *	found, after we create the file on bdst, it will be hidden.
+	 */
+
+	AuDebugOn(a->bfound == -1
+		  && a->bwh != -1
+		  && a->bwh <= a->mvd_bsrc);
+	AuDebugOn(-1 < a->bfound
+		  && a->bfound <= a->mvd_bsrc);
+
+	err = -EINVAL;
+	if (a->bfound == -1
+	    && a->mvd_bsrc < a->bwh
+	    && a->bwh != -1
+	    && a->bwh < a->mvd_bdst) {
+		a->mvd_errno = EAU_MVDOWN_WHITEOUT;
+		AU_MVD_PR(dmsg, "bsrc %d, bdst %d, bfound %d, bwh %d\n",
+			  a->mvd_bsrc, a->mvd_bdst, a->bfound, a->bwh);
+		goto out;
+	} else if (a->bfound != -1 && a->bfound < a->mvd_bdst) {
+		a->mvd_errno = EAU_MVDOWN_UPPER;
+		AU_MVD_PR(dmsg, "bdst %d, bfound %d\n",
+			  a->mvd_bdst, a->bfound);
+		goto out;
+	}
+
+	err = 0; /* success */
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_mvd_args_exist(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+
+	err = 0;
+	if (!(a->mvdown.flags & AUFS_MVDOWN_OWLOWER)
+	    && a->bfound == a->mvd_bdst)
+		err = -EEXIST;
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_mvd_args(const unsigned char dmsg, struct au_mvd_args *a)
+{
+	int err;
+	struct au_branch *br;
+
+	err = -EISDIR;
+	if (unlikely(S_ISDIR(a->inode->i_mode)))
+		goto out;
+
+	err = -EINVAL;
+	if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_UPPER))
+		a->mvd_bsrc = au_ibtop(a->inode);
+	else {
+		a->mvd_bsrc = au_br_index(a->sb, a->mvd_src_brid);
+		if (unlikely(a->mvd_bsrc < 0
+			     || (a->mvd_bsrc < au_dbtop(a->dentry)
+				 || au_dbbot(a->dentry) < a->mvd_bsrc
+				 || !au_h_dptr(a->dentry, a->mvd_bsrc))
+			     || (a->mvd_bsrc < au_ibtop(a->inode)
+				 || au_ibbot(a->inode) < a->mvd_bsrc
+				 || !au_h_iptr(a->inode, a->mvd_bsrc)))) {
+			a->mvd_errno = EAU_MVDOWN_NOUPPER;
+			AU_MVD_PR(dmsg, "no upper\n");
+			goto out;
+		}
+	}
+	if (unlikely(a->mvd_bsrc == au_sbbot(a->sb))) {
+		a->mvd_errno = EAU_MVDOWN_BOTTOM;
+		AU_MVD_PR(dmsg, "on the bottom\n");
+		goto out;
+	}
+	a->mvd_h_src_inode = au_h_iptr(a->inode, a->mvd_bsrc);
+	br = au_sbr(a->sb, a->mvd_bsrc);
+	err = au_br_rdonly(br);
+	if (!(a->mvdown.flags & AUFS_MVDOWN_ROUPPER)) {
+		if (unlikely(err))
+			goto out;
+	} else if (!(vfsub_native_ro(a->mvd_h_src_inode)
+		     || IS_APPEND(a->mvd_h_src_inode))) {
+		if (err)
+			a->mvdown.flags |= AUFS_MVDOWN_ROUPPER_R;
+		/* go on */
+	} else
+		goto out;
+
+	err = -EINVAL;
+	if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_LOWER)) {
+		a->mvd_bdst = find_lower_writable(a);
+		if (unlikely(a->mvd_bdst < 0)) {
+			a->mvd_errno = EAU_MVDOWN_BOTTOM;
+			AU_MVD_PR(dmsg, "no writable lower branch\n");
+			goto out;
+		}
+	} else {
+		a->mvd_bdst = au_br_index(a->sb, a->mvd_dst_brid);
+		if (unlikely(a->mvd_bdst < 0
+			     || au_sbbot(a->sb) < a->mvd_bdst)) {
+			a->mvd_errno = EAU_MVDOWN_NOLOWERBR;
+			AU_MVD_PR(dmsg, "no lower brid\n");
+			goto out;
+		}
+	}
+
+	err = au_mvd_args_busy(dmsg, a);
+	if (!err)
+		err = au_mvd_args_parent(dmsg, a);
+	if (!err)
+		err = au_mvd_args_intermediate(dmsg, a);
+	if (!err)
+		err = au_mvd_args_exist(dmsg, a);
+	if (!err)
+		AuDbg("b%d, b%d\n", a->mvd_bsrc, a->mvd_bdst);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *uarg)
+{
+	int err, e;
+	unsigned char dmsg;
+	struct au_mvd_args *args;
+	struct inode *inode;
+
+	inode = d_inode(dentry);
+	err = -EPERM;
+	if (unlikely(!capable(CAP_SYS_ADMIN)))
+		goto out;
+
+	err = -ENOMEM;
+	args = kmalloc(sizeof(*args), GFP_NOFS);
+	if (unlikely(!args))
+		goto out;
+
+	err = copy_from_user(&args->mvdown, uarg, sizeof(args->mvdown));
+	if (!err)
+		err = !access_ok(VERIFY_WRITE, uarg, sizeof(*uarg));
+	if (unlikely(err)) {
+		err = -EFAULT;
+		AuTraceErr(err);
+		goto out_free;
+	}
+	AuDbg("flags 0x%x\n", args->mvdown.flags);
+	args->mvdown.flags &= ~(AUFS_MVDOWN_ROLOWER_R | AUFS_MVDOWN_ROUPPER_R);
+	args->mvdown.au_errno = 0;
+	args->dentry = dentry;
+	args->inode = inode;
+	args->sb = dentry->d_sb;
+
+	err = -ENOENT;
+	dmsg = !!(args->mvdown.flags & AUFS_MVDOWN_DMSG);
+	args->parent = dget_parent(dentry);
+	args->dir = d_inode(args->parent);
+	inode_lock_nested(args->dir, I_MUTEX_PARENT);
+	dput(args->parent);
+	if (unlikely(args->parent != dentry->d_parent)) {
+		AU_MVD_PR(dmsg, "parent dir is moved\n");
+		goto out_dir;
+	}
+
+	inode_lock_nested(inode, I_MUTEX_CHILD);
+	err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_NOPLMW);
+	if (unlikely(err))
+		goto out_inode;
+
+	di_write_lock_parent(args->parent);
+	err = au_mvd_args(dmsg, args);
+	if (unlikely(err))
+		goto out_parent;
+
+	err = au_do_mvdown(dmsg, args);
+	if (unlikely(err))
+		goto out_parent;
+
+	au_cpup_attr_timesizes(args->dir);
+	au_cpup_attr_timesizes(inode);
+	if (!(args->mvdown.flags & AUFS_MVDOWN_KUPPER))
+		au_cpup_igen(inode, au_h_iptr(inode, args->mvd_bdst));
+	/* au_digen_dec(dentry); */
+
+out_parent:
+	di_write_unlock(args->parent);
+	aufs_read_unlock(dentry, AuLock_DW);
+out_inode:
+	inode_unlock(inode);
+out_dir:
+	inode_unlock(args->dir);
+out_free:
+	e = copy_to_user(uarg, &args->mvdown, sizeof(args->mvdown));
+	if (unlikely(e))
+		err = -EFAULT;
+	au_delayed_kfree(args);
+out:
+	AuTraceErr(err);
+	return err;
+}
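
A hedged userspace sketch of the AUFS_CTL_MVDOWN ioctl served by au_mvdown() above; note that the kernel side requires CAP_SYS_ADMIN. The flags and au_errno fields of struct aufs_mvdown appear in the code above; the uapi header name is an assumption and error handling is minimal.

	/* userspace sketch, not part of the patch */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/aufs_type.h>	/* assumed header providing AUFS_CTL_MVDOWN */

	int move_down(const char *path)
	{
		struct aufs_mvdown mvd = { .flags = AUFS_MVDOWN_DMSG };
		int fd, err;

		fd = open(path, O_RDONLY);
		if (fd < 0)
			return -1;
		err = ioctl(fd, AUFS_CTL_MVDOWN, &mvd);
		if (err)
			fprintf(stderr, "mvdown failed, au_errno %u\n", mvd.au_errno);
		close(fd);
		return err;
	}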
diff --git b/fs/aufs/opts.c b/fs/aufs/opts.c
new file mode 100644
index 0000000..1d21f0d
--- /dev/null
+++ b/fs/aufs/opts.c
@@ -0,0 +1,1847 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * mount options/flags
+ */
+
+#include <linux/namei.h>
+#include <linux/types.h> /* a distribution requires this */
+#include <linux/parser.h>
+#include "aufs.h"
+
+/* ---------------------------------------------------------------------- */
+
+enum {
+	Opt_br,
+	Opt_add, Opt_del, Opt_mod, Opt_append, Opt_prepend,
+	Opt_idel, Opt_imod,
+	Opt_dirwh, Opt_rdcache, Opt_rdblk, Opt_rdhash,
+	Opt_rdblk_def, Opt_rdhash_def,
+	Opt_xino, Opt_noxino,
+	Opt_trunc_xino, Opt_trunc_xino_v, Opt_notrunc_xino,
+	Opt_trunc_xino_path, Opt_itrunc_xino,
+	Opt_trunc_xib, Opt_notrunc_xib,
+	Opt_shwh, Opt_noshwh,
+	Opt_plink, Opt_noplink, Opt_list_plink,
+	Opt_udba,
+	Opt_dio, Opt_nodio,
+	Opt_diropq_a, Opt_diropq_w,
+	Opt_warn_perm, Opt_nowarn_perm,
+	Opt_wbr_copyup, Opt_wbr_create,
+	Opt_fhsm_sec,
+	Opt_verbose, Opt_noverbose,
+	Opt_sum, Opt_nosum, Opt_wsum,
+	Opt_dirperm1, Opt_nodirperm1,
+	Opt_acl, Opt_noacl,
+	Opt_tail, Opt_ignore, Opt_ignore_silent, Opt_err
+};
+
+static match_table_t options = {
+	{Opt_br, "br=%s"},
+	{Opt_br, "br:%s"},
+
+	{Opt_add, "add=%d:%s"},
+	{Opt_add, "add:%d:%s"},
+	{Opt_add, "ins=%d:%s"},
+	{Opt_add, "ins:%d:%s"},
+	{Opt_append, "append=%s"},
+	{Opt_append, "append:%s"},
+	{Opt_prepend, "prepend=%s"},
+	{Opt_prepend, "prepend:%s"},
+
+	{Opt_del, "del=%s"},
+	{Opt_del, "del:%s"},
+	/* {Opt_idel, "idel:%d"}, */
+	{Opt_mod, "mod=%s"},
+	{Opt_mod, "mod:%s"},
+	/* {Opt_imod, "imod:%d:%s"}, */
+
+	{Opt_dirwh, "dirwh=%d"},
+
+	{Opt_xino, "xino=%s"},
+	{Opt_noxino, "noxino"},
+	{Opt_trunc_xino, "trunc_xino"},
+	{Opt_trunc_xino_v, "trunc_xino_v=%d:%d"},
+	{Opt_notrunc_xino, "notrunc_xino"},
+	{Opt_trunc_xino_path, "trunc_xino=%s"},
+	{Opt_itrunc_xino, "itrunc_xino=%d"},
+	/* {Opt_zxino, "zxino=%s"}, */
+	{Opt_trunc_xib, "trunc_xib"},
+	{Opt_notrunc_xib, "notrunc_xib"},
+
+#ifdef CONFIG_PROC_FS
+	{Opt_plink, "plink"},
+#else
+	{Opt_ignore_silent, "plink"},
+#endif
+
+	{Opt_noplink, "noplink"},
+
+#ifdef CONFIG_AUFS_DEBUG
+	{Opt_list_plink, "list_plink"},
+#endif
+
+	{Opt_udba, "udba=%s"},
+
+	{Opt_dio, "dio"},
+	{Opt_nodio, "nodio"},
+
+#ifdef CONFIG_AUFS_FHSM
+	{Opt_fhsm_sec, "fhsm_sec=%d"},
+#else
+	{Opt_ignore_silent, "fhsm_sec=%d"},
+#endif
+
+	{Opt_diropq_a, "diropq=always"},
+	{Opt_diropq_a, "diropq=a"},
+	{Opt_diropq_w, "diropq=whiteouted"},
+	{Opt_diropq_w, "diropq=w"},
+
+	{Opt_warn_perm, "warn_perm"},
+	{Opt_nowarn_perm, "nowarn_perm"},
+
+	/* keep them temporary */
+	{Opt_ignore_silent, "nodlgt"},
+	{Opt_ignore_silent, "clean_plink"},
+
+#ifdef CONFIG_AUFS_SHWH
+	{Opt_shwh, "shwh"},
+#endif
+	{Opt_noshwh, "noshwh"},
+
+	{Opt_dirperm1, "dirperm1"},
+	{Opt_nodirperm1, "nodirperm1"},
+
+	{Opt_verbose, "verbose"},
+	{Opt_verbose, "v"},
+	{Opt_noverbose, "noverbose"},
+	{Opt_noverbose, "quiet"},
+	{Opt_noverbose, "q"},
+	{Opt_noverbose, "silent"},
+
+	{Opt_sum, "sum"},
+	{Opt_nosum, "nosum"},
+	{Opt_wsum, "wsum"},
+
+	{Opt_rdcache, "rdcache=%d"},
+	{Opt_rdblk, "rdblk=%d"},
+	{Opt_rdblk_def, "rdblk=def"},
+	{Opt_rdhash, "rdhash=%d"},
+	{Opt_rdhash_def, "rdhash=def"},
+
+	{Opt_wbr_create, "create=%s"},
+	{Opt_wbr_create, "create_policy=%s"},
+	{Opt_wbr_copyup, "cpup=%s"},
+	{Opt_wbr_copyup, "copyup=%s"},
+	{Opt_wbr_copyup, "copyup_policy=%s"},
+
+	/* generic VFS flag */
+#ifdef CONFIG_FS_POSIX_ACL
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+#else
+	{Opt_ignore_silent, "acl"},
+	{Opt_ignore_silent, "noacl"},
+#endif
+
+	/* internal use for the scripts */
+	{Opt_ignore_silent, "si=%s"},
+
+	{Opt_br, "dirs=%s"},
+	{Opt_ignore, "debug=%d"},
+	{Opt_ignore, "delete=whiteout"},
+	{Opt_ignore, "delete=all"},
+	{Opt_ignore, "imap=%s"},
+
+	/* temporary workaround, due to old mount(8)? */
+	{Opt_ignore_silent, "relatime"},
+
+	{Opt_err, NULL}
+};
+
+/* ---------------------------------------------------------------------- */
+
+static const char *au_parser_pattern(int val, match_table_t tbl)
+{
+	struct match_token *p;
+
+	p = tbl;
+	while (p->pattern) {
+		if (p->token == val)
+			return p->pattern;
+		p++;
+	}
+	BUG();
+	return "??";
+}
+
+static const char *au_optstr(int *val, match_table_t tbl)
+{
+	struct match_token *p;
+	int v;
+
+	v = *val;
+	if (!v)
+		goto out;
+	p = tbl;
+	while (p->pattern) {
+		if (p->token
+		    && (v & p->token) == p->token) {
+			*val &= ~p->token;
+			return p->pattern;
+		}
+		p++;
+	}
+
+out:
+	return NULL;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static match_table_t brperm = {
+	{AuBrPerm_RO, AUFS_BRPERM_RO},
+	{AuBrPerm_RR, AUFS_BRPERM_RR},
+	{AuBrPerm_RW, AUFS_BRPERM_RW},
+	{0, NULL}
+};
+
+static match_table_t brattr = {
+	/* general */
+	{AuBrAttr_COO_REG, AUFS_BRATTR_COO_REG},
+	{AuBrAttr_COO_ALL, AUFS_BRATTR_COO_ALL},
+	/* 'unpin' attrib is meaningless since linux-3.18-rc1 */
+	{AuBrAttr_UNPIN, AUFS_BRATTR_UNPIN},
+#ifdef CONFIG_AUFS_FHSM
+	{AuBrAttr_FHSM, AUFS_BRATTR_FHSM},
+#endif
+#ifdef CONFIG_AUFS_XATTR
+	{AuBrAttr_ICEX, AUFS_BRATTR_ICEX},
+	{AuBrAttr_ICEX_SEC, AUFS_BRATTR_ICEX_SEC},
+	{AuBrAttr_ICEX_SYS, AUFS_BRATTR_ICEX_SYS},
+	{AuBrAttr_ICEX_TR, AUFS_BRATTR_ICEX_TR},
+	{AuBrAttr_ICEX_USR, AUFS_BRATTR_ICEX_USR},
+	{AuBrAttr_ICEX_OTH, AUFS_BRATTR_ICEX_OTH},
+#endif
+
+	/* ro/rr branch */
+	{AuBrRAttr_WH, AUFS_BRRATTR_WH},
+
+	/* rw branch */
+	{AuBrWAttr_MOO, AUFS_BRWATTR_MOO},
+	{AuBrWAttr_NoLinkWH, AUFS_BRWATTR_NLWH},
+
+	{0, NULL}
+};
+
+static int br_attr_val(char *str, match_table_t table, substring_t args[])
+{
+	int attr, v;
+	char *p;
+
+	attr = 0;
+	do {
+		p = strchr(str, '+');
+		if (p)
+			*p = 0;
+		v = match_token(str, table, args);
+		if (v) {
+			if (v & AuBrAttr_CMOO_Mask)
+				attr &= ~AuBrAttr_CMOO_Mask;
+			attr |= v;
+		} else {
+			if (p)
+				*p = '+';
+			pr_warn("ignored branch attribute %s\n", str);
+			break;
+		}
+		if (p)
+			str = p + 1;
+	} while (p);
+
+	return attr;
+}
+
+static int au_do_optstr_br_attr(au_br_perm_str_t *str, int perm)
+{
+	int sz;
+	const char *p;
+	char *q;
+
+	q = str->a;
+	*q = 0;
+	p = au_optstr(&perm, brattr);
+	if (p) {
+		sz = strlen(p);
+		memcpy(q, p, sz + 1);
+		q += sz;
+	} else
+		goto out;
+
+	do {
+		p = au_optstr(&perm, brattr);
+		if (p) {
+			*q++ = '+';
+			sz = strlen(p);
+			memcpy(q, p, sz + 1);
+			q += sz;
+		}
+	} while (p);
+
+out:
+	return q - str->a;
+}
+
+static int noinline_for_stack br_perm_val(char *perm)
+{
+	int val, bad, sz;
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	au_br_perm_str_t attr;
+
+	p = strchr(perm, '+');
+	if (p)
+		*p = 0;
+	val = match_token(perm, brperm, args);
+	if (!val) {
+		if (p)
+			*p = '+';
+		pr_warn("ignored branch permission %s\n", perm);
+		val = AuBrPerm_RO;
+		goto out;
+	}
+	if (!p)
+		goto out;
+
+	val |= br_attr_val(p + 1, brattr, args);
+
+	bad = 0;
+	switch (val & AuBrPerm_Mask) {
+	case AuBrPerm_RO:
+	case AuBrPerm_RR:
+		bad = val & AuBrWAttr_Mask;
+		val &= ~AuBrWAttr_Mask;
+		break;
+	case AuBrPerm_RW:
+		bad = val & AuBrRAttr_Mask;
+		val &= ~AuBrRAttr_Mask;
+		break;
+	}
+
+	/*
+	 * the 'unpin' attrib has been meaningless since linux-3.18-rc1, but aufs
+	 * does not treat it as an error, just a warning.
+	 * this is a tiny guard for the user's operation.
+	 */
+	if (val & AuBrAttr_UNPIN) {
+		bad |= AuBrAttr_UNPIN;
+		val &= ~AuBrAttr_UNPIN;
+	}
+
+	if (unlikely(bad)) {
+		sz = au_do_optstr_br_attr(&attr, bad);
+		AuDebugOn(!sz);
+		pr_warn("ignored branch attribute %s\n", attr.a);
+	}
+
+out:
+	return val;
+}
+
+void au_optstr_br_perm(au_br_perm_str_t *str, int perm)
+{
+	au_br_perm_str_t attr;
+	const char *p;
+	char *q;
+	int sz;
+
+	q = str->a;
+	p = au_optstr(&perm, brperm);
+	AuDebugOn(!p || !*p);
+	sz = strlen(p);
+	memcpy(q, p, sz + 1);
+	q += sz;
+
+	sz = au_do_optstr_br_attr(&attr, perm);
+	if (sz) {
+		*q++ = '+';
+		memcpy(q, attr.a, sz + 1);
+	}
+
+	AuDebugOn(strlen(str->a) >= sizeof(str->a));
+}
+
+/* ---------------------------------------------------------------------- */
+
+static match_table_t udbalevel = {
+	{AuOpt_UDBA_REVAL, "reval"},
+	{AuOpt_UDBA_NONE, "none"},
+#ifdef CONFIG_AUFS_HNOTIFY
+	{AuOpt_UDBA_HNOTIFY, "notify"}, /* abstraction */
+#ifdef CONFIG_AUFS_HFSNOTIFY
+	{AuOpt_UDBA_HNOTIFY, "fsnotify"},
+#endif
+#endif
+	{-1, NULL}
+};
+
+static int noinline_for_stack udba_val(char *str)
+{
+	substring_t args[MAX_OPT_ARGS];
+
+	return match_token(str, udbalevel, args);
+}
+
+const char *au_optstr_udba(int udba)
+{
+	return au_parser_pattern(udba, udbalevel);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static match_table_t au_wbr_create_policy = {
+	{AuWbrCreate_TDP, "tdp"},
+	{AuWbrCreate_TDP, "top-down-parent"},
+	{AuWbrCreate_RR, "rr"},
+	{AuWbrCreate_RR, "round-robin"},
+	{AuWbrCreate_MFS, "mfs"},
+	{AuWbrCreate_MFS, "most-free-space"},
+	{AuWbrCreate_MFSV, "mfs:%d"},
+	{AuWbrCreate_MFSV, "most-free-space:%d"},
+
+	{AuWbrCreate_MFSRR, "mfsrr:%d"},
+	{AuWbrCreate_MFSRRV, "mfsrr:%d:%d"},
+	{AuWbrCreate_PMFS, "pmfs"},
+	{AuWbrCreate_PMFSV, "pmfs:%d"},
+	{AuWbrCreate_PMFSRR, "pmfsrr:%d"},
+	{AuWbrCreate_PMFSRRV, "pmfsrr:%d:%d"},
+
+	{-1, NULL}
+};
+
+/*
+ * cf. linux/lib/parser.c and cmdline.c
+ * we gave up calling memparse() since it uses simple_strtoull() instead of
+ * kstrto...().
+ */
+static int noinline_for_stack
+au_match_ull(substring_t *s, unsigned long long *result)
+{
+	int err;
+	unsigned int len;
+	char a[32];
+
+	err = -ERANGE;
+	len = s->to - s->from;
+	if (len + 1 <= sizeof(a)) {
+		memcpy(a, s->from, len);
+		a[len] = '\0';
+		err = kstrtoull(a, 0, result);
+	}
+	return err;
+}
+
+static int au_wbr_mfs_wmark(substring_t *arg, char *str,
+			    struct au_opt_wbr_create *create)
+{
+	int err;
+	unsigned long long ull;
+
+	err = 0;
+	if (!au_match_ull(arg, &ull))
+		create->mfsrr_watermark = ull;
+	else {
+		pr_err("bad integer in %s\n", str);
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int au_wbr_mfs_sec(substring_t *arg, char *str,
+			  struct au_opt_wbr_create *create)
+{
+	int n, err;
+
+	err = 0;
+	if (!match_int(arg, &n) && 0 <= n && n <= AUFS_MFS_MAX_SEC)
+		create->mfs_second = n;
+	else {
+		pr_err("bad integer in %s\n", str);
+		err = -EINVAL;
+	}
+
+	return err;
+}
+
+static int noinline_for_stack
+au_wbr_create_val(char *str, struct au_opt_wbr_create *create)
+{
+	int err, e;
+	substring_t args[MAX_OPT_ARGS];
+
+	err = match_token(str, au_wbr_create_policy, args);
+	create->wbr_create = err;
+	switch (err) {
+	case AuWbrCreate_MFSRRV:
+	case AuWbrCreate_PMFSRRV:
+		e = au_wbr_mfs_wmark(&args[0], str, create);
+		if (!e)
+			e = au_wbr_mfs_sec(&args[1], str, create);
+		if (unlikely(e))
+			err = e;
+		break;
+	case AuWbrCreate_MFSRR:
+	case AuWbrCreate_PMFSRR:
+		e = au_wbr_mfs_wmark(&args[0], str, create);
+		if (unlikely(e)) {
+			err = e;
+			break;
+		}
+		/*FALLTHROUGH*/
+	case AuWbrCreate_MFS:
+	case AuWbrCreate_PMFS:
+		create->mfs_second = AUFS_MFS_DEF_SEC;
+		break;
+	case AuWbrCreate_MFSV:
+	case AuWbrCreate_PMFSV:
+		e = au_wbr_mfs_sec(&args[0], str, create);
+		if (unlikely(e))
+			err = e;
+		break;
+	}
+
+	return err;
+}
+
+const char *au_optstr_wbr_create(int wbr_create)
+{
+	return au_parser_pattern(wbr_create, au_wbr_create_policy);
+}
+
+static match_table_t au_wbr_copyup_policy = {
+	{AuWbrCopyup_TDP, "tdp"},
+	{AuWbrCopyup_TDP, "top-down-parent"},
+	{AuWbrCopyup_BUP, "bup"},
+	{AuWbrCopyup_BUP, "bottom-up-parent"},
+	{AuWbrCopyup_BU, "bu"},
+	{AuWbrCopyup_BU, "bottom-up"},
+	{-1, NULL}
+};
+
+static int noinline_for_stack au_wbr_copyup_val(char *str)
+{
+	substring_t args[MAX_OPT_ARGS];
+
+	return match_token(str, au_wbr_copyup_policy, args);
+}
+
+const char *au_optstr_wbr_copyup(int wbr_copyup)
+{
+	return au_parser_pattern(wbr_copyup, au_wbr_copyup_policy);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static const int lkup_dirflags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
+
+static void dump_opts(struct au_opts *opts)
+{
+#ifdef CONFIG_AUFS_DEBUG
+	/* reduce stack space */
+	union {
+		struct au_opt_add *add;
+		struct au_opt_del *del;
+		struct au_opt_mod *mod;
+		struct au_opt_xino *xino;
+		struct au_opt_xino_itrunc *xino_itrunc;
+		struct au_opt_wbr_create *create;
+	} u;
+	struct au_opt *opt;
+
+	opt = opts->opt;
+	while (opt->type != Opt_tail) {
+		switch (opt->type) {
+		case Opt_add:
+			u.add = &opt->add;
+			AuDbg("add {b%d, %s, 0x%x, %p}\n",
+				  u.add->bindex, u.add->pathname, u.add->perm,
+				  u.add->path.dentry);
+			break;
+		case Opt_del:
+		case Opt_idel:
+			u.del = &opt->del;
+			AuDbg("del {%s, %p}\n",
+			      u.del->pathname, u.del->h_path.dentry);
+			break;
+		case Opt_mod:
+		case Opt_imod:
+			u.mod = &opt->mod;
+			AuDbg("mod {%s, 0x%x, %p}\n",
+				  u.mod->path, u.mod->perm, u.mod->h_root);
+			break;
+		case Opt_append:
+			u.add = &opt->add;
+			AuDbg("append {b%d, %s, 0x%x, %p}\n",
+				  u.add->bindex, u.add->pathname, u.add->perm,
+				  u.add->path.dentry);
+			break;
+		case Opt_prepend:
+			u.add = &opt->add;
+			AuDbg("prepend {b%d, %s, 0x%x, %p}\n",
+				  u.add->bindex, u.add->pathname, u.add->perm,
+				  u.add->path.dentry);
+			break;
+		case Opt_dirwh:
+			AuDbg("dirwh %d\n", opt->dirwh);
+			break;
+		case Opt_rdcache:
+			AuDbg("rdcache %d\n", opt->rdcache);
+			break;
+		case Opt_rdblk:
+			AuDbg("rdblk %u\n", opt->rdblk);
+			break;
+		case Opt_rdblk_def:
+			AuDbg("rdblk_def\n");
+			break;
+		case Opt_rdhash:
+			AuDbg("rdhash %u\n", opt->rdhash);
+			break;
+		case Opt_rdhash_def:
+			AuDbg("rdhash_def\n");
+			break;
+		case Opt_xino:
+			u.xino = &opt->xino;
+			AuDbg("xino {%s %pD}\n", u.xino->path, u.xino->file);
+			break;
+		case Opt_trunc_xino:
+			AuLabel(trunc_xino);
+			break;
+		case Opt_notrunc_xino:
+			AuLabel(notrunc_xino);
+			break;
+		case Opt_trunc_xino_path:
+		case Opt_itrunc_xino:
+			u.xino_itrunc = &opt->xino_itrunc;
+			AuDbg("trunc_xino %d\n", u.xino_itrunc->bindex);
+			break;
+		case Opt_noxino:
+			AuLabel(noxino);
+			break;
+		case Opt_trunc_xib:
+			AuLabel(trunc_xib);
+			break;
+		case Opt_notrunc_xib:
+			AuLabel(notrunc_xib);
+			break;
+		case Opt_shwh:
+			AuLabel(shwh);
+			break;
+		case Opt_noshwh:
+			AuLabel(noshwh);
+			break;
+		case Opt_dirperm1:
+			AuLabel(dirperm1);
+			break;
+		case Opt_nodirperm1:
+			AuLabel(nodirperm1);
+			break;
+		case Opt_plink:
+			AuLabel(plink);
+			break;
+		case Opt_noplink:
+			AuLabel(noplink);
+			break;
+		case Opt_list_plink:
+			AuLabel(list_plink);
+			break;
+		case Opt_udba:
+			AuDbg("udba %d, %s\n",
+				  opt->udba, au_optstr_udba(opt->udba));
+			break;
+		case Opt_dio:
+			AuLabel(dio);
+			break;
+		case Opt_nodio:
+			AuLabel(nodio);
+			break;
+		case Opt_diropq_a:
+			AuLabel(diropq_a);
+			break;
+		case Opt_diropq_w:
+			AuLabel(diropq_w);
+			break;
+		case Opt_warn_perm:
+			AuLabel(warn_perm);
+			break;
+		case Opt_nowarn_perm:
+			AuLabel(nowarn_perm);
+			break;
+		case Opt_verbose:
+			AuLabel(verbose);
+			break;
+		case Opt_noverbose:
+			AuLabel(noverbose);
+			break;
+		case Opt_sum:
+			AuLabel(sum);
+			break;
+		case Opt_nosum:
+			AuLabel(nosum);
+			break;
+		case Opt_wsum:
+			AuLabel(wsum);
+			break;
+		case Opt_wbr_create:
+			u.create = &opt->wbr_create;
+			AuDbg("create %d, %s\n", u.create->wbr_create,
+				  au_optstr_wbr_create(u.create->wbr_create));
+			switch (u.create->wbr_create) {
+			case AuWbrCreate_MFSV:
+			case AuWbrCreate_PMFSV:
+				AuDbg("%d sec\n", u.create->mfs_second);
+				break;
+			case AuWbrCreate_MFSRR:
+				AuDbg("%llu watermark\n",
+					  u.create->mfsrr_watermark);
+				break;
+			case AuWbrCreate_MFSRRV:
+			case AuWbrCreate_PMFSRRV:
+				AuDbg("%llu watermark, %d sec\n",
+					  u.create->mfsrr_watermark,
+					  u.create->mfs_second);
+				break;
+			}
+			break;
+		case Opt_wbr_copyup:
+			AuDbg("copyup %d, %s\n", opt->wbr_copyup,
+				  au_optstr_wbr_copyup(opt->wbr_copyup));
+			break;
+		case Opt_fhsm_sec:
+			AuDbg("fhsm_sec %u\n", opt->fhsm_second);
+			break;
+		case Opt_acl:
+			AuLabel(acl);
+			break;
+		case Opt_noacl:
+			AuLabel(noacl);
+			break;
+		default:
+			BUG();
+		}
+		opt++;
+	}
+#endif
+}
+
+void au_opts_free(struct au_opts *opts)
+{
+	struct au_opt *opt;
+
+	opt = opts->opt;
+	while (opt->type != Opt_tail) {
+		switch (opt->type) {
+		case Opt_add:
+		case Opt_append:
+		case Opt_prepend:
+			path_put(&opt->add.path);
+			break;
+		case Opt_del:
+		case Opt_idel:
+			path_put(&opt->del.h_path);
+			break;
+		case Opt_mod:
+		case Opt_imod:
+			dput(opt->mod.h_root);
+			break;
+		case Opt_xino:
+			fput(opt->xino.file);
+			break;
+		}
+		opt++;
+	}
+}
+
+static int opt_add(struct au_opt *opt, char *opt_str, unsigned long sb_flags,
+		   aufs_bindex_t bindex)
+{
+	int err;
+	struct au_opt_add *add = &opt->add;
+	char *p;
+
+	add->bindex = bindex;
+	add->perm = AuBrPerm_RO;
+	add->pathname = opt_str;
+	p = strchr(opt_str, '=');
+	if (p) {
+		*p++ = 0;
+		if (*p)
+			add->perm = br_perm_val(p);
+	}
+
+	err = vfsub_kern_path(add->pathname, lkup_dirflags, &add->path);
+	if (!err) {
+		if (!p) {
+			add->perm = AuBrPerm_RO;
+			if (au_test_fs_rr(add->path.dentry->d_sb))
+				add->perm = AuBrPerm_RR;
+			else if (!bindex && !(sb_flags & MS_RDONLY))
+				add->perm = AuBrPerm_RW;
+		}
+		opt->type = Opt_add;
+		goto out;
+	}
+	pr_err("lookup failed %s (%d)\n", add->pathname, err);
+	err = -EINVAL;
+
+out:
+	return err;
+}
+
+static int au_opts_parse_del(struct au_opt_del *del, substring_t args[])
+{
+	int err;
+
+	del->pathname = args[0].from;
+	AuDbg("del path %s\n", del->pathname);
+
+	err = vfsub_kern_path(del->pathname, lkup_dirflags, &del->h_path);
+	if (unlikely(err))
+		pr_err("lookup failed %s (%d)\n", del->pathname, err);
+
+	return err;
+}
+
+#if 0 /* reserved for future use */
+static int au_opts_parse_idel(struct super_block *sb, aufs_bindex_t bindex,
+			      struct au_opt_del *del, substring_t args[])
+{
+	int err;
+	struct dentry *root;
+
+	err = -EINVAL;
+	root = sb->s_root;
+	aufs_read_lock(root, AuLock_FLUSH);
+	if (bindex < 0 || au_sbbot(sb) < bindex) {
+		pr_err("out of bounds, %d\n", bindex);
+		goto out;
+	}
+
+	err = 0;
+	del->h_path.dentry = dget(au_h_dptr(root, bindex));
+	del->h_path.mnt = mntget(au_sbr_mnt(sb, bindex));
+
+out:
+	aufs_read_unlock(root, !AuLock_IR);
+	return err;
+}
+#endif
+
+static int noinline_for_stack
+au_opts_parse_mod(struct au_opt_mod *mod, substring_t args[])
+{
+	int err;
+	struct path path;
+	char *p;
+
+	err = -EINVAL;
+	mod->path = args[0].from;
+	p = strchr(mod->path, '=');
+	if (unlikely(!p)) {
+		pr_err("no permssion %s\n", args[0].from);
+		goto out;
+	}
+
+	*p++ = 0;
+	err = vfsub_kern_path(mod->path, lkup_dirflags, &path);
+	if (unlikely(err)) {
+		pr_err("lookup failed %s (%d)\n", mod->path, err);
+		goto out;
+	}
+
+	mod->perm = br_perm_val(p);
+	AuDbg("mod path %s, perm 0x%x, %s\n", mod->path, mod->perm, p);
+	mod->h_root = dget(path.dentry);
+	path_put(&path);
+
+out:
+	return err;
+}
+
+#if 0 /* reserved for future use */
+static int au_opts_parse_imod(struct super_block *sb, aufs_bindex_t bindex,
+			      struct au_opt_mod *mod, substring_t args[])
+{
+	int err;
+	struct dentry *root;
+
+	err = -EINVAL;
+	root = sb->s_root;
+	aufs_read_lock(root, AuLock_FLUSH);
+	if (bindex < 0 || au_sbbot(sb) < bindex) {
+		pr_err("out of bounds, %d\n", bindex);
+		goto out;
+	}
+
+	err = 0;
+	mod->perm = br_perm_val(args[1].from);
+	AuDbg("mod path %s, perm 0x%x, %s\n",
+	      mod->path, mod->perm, args[1].from);
+	mod->h_root = dget(au_h_dptr(root, bindex));
+
+out:
+	aufs_read_unlock(root, !AuLock_IR);
+	return err;
+}
+#endif
+
+static int au_opts_parse_xino(struct super_block *sb, struct au_opt_xino *xino,
+			      substring_t args[])
+{
+	int err;
+	struct file *file;
+
+	file = au_xino_create(sb, args[0].from, /*silent*/0);
+	err = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto out;
+
+	err = -EINVAL;
+	if (unlikely(file->f_path.dentry->d_sb == sb)) {
+		fput(file);
+		pr_err("%s must be outside\n", args[0].from);
+		goto out;
+	}
+
+	err = 0;
+	xino->file = file;
+	xino->path = args[0].from;
+
+out:
+	return err;
+}
+
+static int noinline_for_stack
+au_opts_parse_xino_itrunc_path(struct super_block *sb,
+			       struct au_opt_xino_itrunc *xino_itrunc,
+			       substring_t args[])
+{
+	int err;
+	aufs_bindex_t bbot, bindex;
+	struct path path;
+	struct dentry *root;
+
+	err = vfsub_kern_path(args[0].from, lkup_dirflags, &path);
+	if (unlikely(err)) {
+		pr_err("lookup failed %s (%d)\n", args[0].from, err);
+		goto out;
+	}
+
+	xino_itrunc->bindex = -1;
+	root = sb->s_root;
+	aufs_read_lock(root, AuLock_FLUSH);
+	bbot = au_sbbot(sb);
+	for (bindex = 0; bindex <= bbot; bindex++) {
+		if (au_h_dptr(root, bindex) == path.dentry) {
+			xino_itrunc->bindex = bindex;
+			break;
+		}
+	}
+	aufs_read_unlock(root, !AuLock_IR);
+	path_put(&path);
+
+	if (unlikely(xino_itrunc->bindex < 0)) {
+		pr_err("no such branch %s\n", args[0].from);
+		err = -EINVAL;
+	}
+
+out:
+	return err;
+}
+
+/* called without aufs lock */
+int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts)
+{
+	int err, n, token;
+	aufs_bindex_t bindex;
+	unsigned char skipped;
+	struct dentry *root;
+	struct au_opt *opt, *opt_tail;
+	char *opt_str;
+	/* reduce the stack space */
+	union {
+		struct au_opt_xino_itrunc *xino_itrunc;
+		struct au_opt_wbr_create *create;
+	} u;
+	struct {
+		substring_t args[MAX_OPT_ARGS];
+	} *a;
+
+	err = -ENOMEM;
+	a = kmalloc(sizeof(*a), GFP_NOFS);
+	if (unlikely(!a))
+		goto out;
+
+	root = sb->s_root;
+	err = 0;
+	bindex = 0;
+	opt = opts->opt;
+	opt_tail = opt + opts->max_opt - 1;
+	opt->type = Opt_tail;
+	while (!err && (opt_str = strsep(&str, ",")) && *opt_str) {
+		err = -EINVAL;
+		skipped = 0;
+		token = match_token(opt_str, options, a->args);
+		switch (token) {
+		case Opt_br:
+			err = 0;
+			while (!err && (opt_str = strsep(&a->args[0].from, ":"))
+			       && *opt_str) {
+				err = opt_add(opt, opt_str, opts->sb_flags,
+					      bindex++);
+				if (unlikely(!err && ++opt > opt_tail)) {
+					err = -E2BIG;
+					break;
+				}
+				opt->type = Opt_tail;
+				skipped = 1;
+			}
+			break;
+		case Opt_add:
+			if (unlikely(match_int(&a->args[0], &n))) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			bindex = n;
+			err = opt_add(opt, a->args[1].from, opts->sb_flags,
+				      bindex);
+			if (!err)
+				opt->type = token;
+			break;
+		case Opt_append:
+			err = opt_add(opt, a->args[0].from, opts->sb_flags,
+				      /*dummy bindex*/1);
+			if (!err)
+				opt->type = token;
+			break;
+		case Opt_prepend:
+			err = opt_add(opt, a->args[0].from, opts->sb_flags,
+				      /*bindex*/0);
+			if (!err)
+				opt->type = token;
+			break;
+		case Opt_del:
+			err = au_opts_parse_del(&opt->del, a->args);
+			if (!err)
+				opt->type = token;
+			break;
+#if 0 /* reserved for future use */
+		case Opt_idel:
+			opt->del.pathname = "(indexed)";
+			if (unlikely(match_int(&a->args[0], &n))) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			err = au_opts_parse_idel(sb, n, &opt->del, a->args);
+			if (!err)
+				opt->type = token;
+			break;
+#endif
+		case Opt_mod:
+			err = au_opts_parse_mod(&opt->mod, a->args);
+			if (!err)
+				opt->type = token;
+			break;
+#ifdef IMOD /* reserved for future use */
+		case Opt_imod:
+			opt->mod.path = "(indexed)";
+			if (unlikely(match_int(&a->args[0], &n))) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			err = au_opts_parse_imod(sb, n, &opt->mod, a->args);
+			if (!err)
+				opt->type = token;
+			break;
+#endif
+		case Opt_xino:
+			err = au_opts_parse_xino(sb, &opt->xino, a->args);
+			if (!err)
+				opt->type = token;
+			break;
+
+		case Opt_trunc_xino_path:
+			err = au_opts_parse_xino_itrunc_path
+				(sb, &opt->xino_itrunc, a->args);
+			if (!err)
+				opt->type = token;
+			break;
+
+		case Opt_itrunc_xino:
+			u.xino_itrunc = &opt->xino_itrunc;
+			if (unlikely(match_int(&a->args[0], &n))) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			u.xino_itrunc->bindex = n;
+			aufs_read_lock(root, AuLock_FLUSH);
+			if (n < 0 || au_sbbot(sb) < n) {
+				pr_err("out of bounds, %d\n", n);
+				aufs_read_unlock(root, !AuLock_IR);
+				break;
+			}
+			aufs_read_unlock(root, !AuLock_IR);
+			err = 0;
+			opt->type = token;
+			break;
+
+		case Opt_dirwh:
+			if (unlikely(match_int(&a->args[0], &opt->dirwh)))
+				break;
+			err = 0;
+			opt->type = token;
+			break;
+
+		case Opt_rdcache:
+			if (unlikely(match_int(&a->args[0], &n))) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			if (unlikely(n > AUFS_RDCACHE_MAX)) {
+				pr_err("rdcache must be smaller than %d\n",
+				       AUFS_RDCACHE_MAX);
+				break;
+			}
+			opt->rdcache = n;
+			err = 0;
+			opt->type = token;
+			break;
+		case Opt_rdblk:
+			if (unlikely(match_int(&a->args[0], &n)
+				     || n < 0
+				     || n > KMALLOC_MAX_SIZE)) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			if (unlikely(n && n < NAME_MAX)) {
+				pr_err("rdblk must be larger than %d\n",
+				       NAME_MAX);
+				break;
+			}
+			opt->rdblk = n;
+			err = 0;
+			opt->type = token;
+			break;
+		case Opt_rdhash:
+			if (unlikely(match_int(&a->args[0], &n)
+				     || n < 0
+				     || n * sizeof(struct hlist_head)
+				     > KMALLOC_MAX_SIZE)) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			opt->rdhash = n;
+			err = 0;
+			opt->type = token;
+			break;
+
+		case Opt_trunc_xino:
+		case Opt_notrunc_xino:
+		case Opt_noxino:
+		case Opt_trunc_xib:
+		case Opt_notrunc_xib:
+		case Opt_shwh:
+		case Opt_noshwh:
+		case Opt_dirperm1:
+		case Opt_nodirperm1:
+		case Opt_plink:
+		case Opt_noplink:
+		case Opt_list_plink:
+		case Opt_dio:
+		case Opt_nodio:
+		case Opt_diropq_a:
+		case Opt_diropq_w:
+		case Opt_warn_perm:
+		case Opt_nowarn_perm:
+		case Opt_verbose:
+		case Opt_noverbose:
+		case Opt_sum:
+		case Opt_nosum:
+		case Opt_wsum:
+		case Opt_rdblk_def:
+		case Opt_rdhash_def:
+		case Opt_acl:
+		case Opt_noacl:
+			err = 0;
+			opt->type = token;
+			break;
+
+		case Opt_udba:
+			opt->udba = udba_val(a->args[0].from);
+			if (opt->udba >= 0) {
+				err = 0;
+				opt->type = token;
+			} else
+				pr_err("wrong value, %s\n", opt_str);
+			break;
+
+		case Opt_wbr_create:
+			u.create = &opt->wbr_create;
+			u.create->wbr_create
+				= au_wbr_create_val(a->args[0].from, u.create);
+			if (u.create->wbr_create >= 0) {
+				err = 0;
+				opt->type = token;
+			} else
+				pr_err("wrong value, %s\n", opt_str);
+			break;
+		case Opt_wbr_copyup:
+			opt->wbr_copyup = au_wbr_copyup_val(a->args[0].from);
+			if (opt->wbr_copyup >= 0) {
+				err = 0;
+				opt->type = token;
+			} else
+				pr_err("wrong value, %s\n", opt_str);
+			break;
+
+		case Opt_fhsm_sec:
+			if (unlikely(match_int(&a->args[0], &n)
+				     || n < 0)) {
+				pr_err("bad integer in %s\n", opt_str);
+				break;
+			}
+			if (sysaufs_brs) {
+				opt->fhsm_second = n;
+				opt->type = token;
+			} else
+				pr_warn("ignored %s\n", opt_str);
+			err = 0;
+			break;
+
+		case Opt_ignore:
+			pr_warn("ignored %s\n", opt_str);
+			/*FALLTHROUGH*/
+		case Opt_ignore_silent:
+			skipped = 1;
+			err = 0;
+			break;
+		case Opt_err:
+			pr_err("unknown option %s\n", opt_str);
+			break;
+		}
+
+		if (!err && !skipped) {
+			if (unlikely(++opt > opt_tail)) {
+				err = -E2BIG;
+				opt--;
+				opt->type = Opt_tail;
+				break;
+			}
+			opt->type = Opt_tail;
+		}
+	}
+
+	au_delayed_kfree(a);
+	dump_opts(opts);
+	if (unlikely(err))
+		au_opts_free(opts);
+
+out:
+	return err;
+}
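+
+/*
+ * Worked example (illustrative; the paths and the "reval" value are
+ * assumptions): for the mount data string
+ *	"br=/rw=rw:/ro,udba=reval"
+ * strsep() first splits on ',' into "br=/rw=rw:/ro" and "udba=reval".
+ * The "br" value is split again on ':' and each element goes to
+ * opt_add(): "/rw=rw" becomes {bindex 0, "/rw", br_perm_val("rw")} and
+ * "/ro" becomes {bindex 1, "/ro"} with the default permission chosen in
+ * opt_add() (RO, or RR when au_test_fs_rr() recognizes the branch fs).
+ * "udba=reval" matches Opt_udba and is converted by udba_val(), while an
+ * unknown key falls into Opt_err and fails the whole parse.
+ */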
+
+static int au_opt_wbr_create(struct super_block *sb,
+			     struct au_opt_wbr_create *create)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	err = 1; /* handled */
+	sbinfo = au_sbi(sb);
+	if (sbinfo->si_wbr_create_ops->fin) {
+		err = sbinfo->si_wbr_create_ops->fin(sb);
+		if (!err)
+			err = 1;
+	}
+
+	sbinfo->si_wbr_create = create->wbr_create;
+	sbinfo->si_wbr_create_ops = au_wbr_create_ops + create->wbr_create;
+	switch (create->wbr_create) {
+	case AuWbrCreate_MFSRRV:
+	case AuWbrCreate_MFSRR:
+	case AuWbrCreate_PMFSRR:
+	case AuWbrCreate_PMFSRRV:
+		sbinfo->si_wbr_mfs.mfsrr_watermark = create->mfsrr_watermark;
+		/*FALLTHROUGH*/
+	case AuWbrCreate_MFS:
+	case AuWbrCreate_MFSV:
+	case AuWbrCreate_PMFS:
+	case AuWbrCreate_PMFSV:
+		sbinfo->si_wbr_mfs.mfs_expire
+			= msecs_to_jiffies(create->mfs_second * MSEC_PER_SEC);
+		break;
+	}
+
+	if (sbinfo->si_wbr_create_ops->init)
+		sbinfo->si_wbr_create_ops->init(sb); /* ignore */
+
+	return err;
+}
+
+/*
+ * returns,
+ * plus: processed without an error
+ * zero: unprocessed
+ */
+static int au_opt_simple(struct super_block *sb, struct au_opt *opt,
+			 struct au_opts *opts)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	err = 1; /* handled */
+	sbinfo = au_sbi(sb);
+	switch (opt->type) {
+	case Opt_udba:
+		sbinfo->si_mntflags &= ~AuOptMask_UDBA;
+		sbinfo->si_mntflags |= opt->udba;
+		opts->given_udba |= opt->udba;
+		break;
+
+	case Opt_plink:
+		au_opt_set(sbinfo->si_mntflags, PLINK);
+		break;
+	case Opt_noplink:
+		if (au_opt_test(sbinfo->si_mntflags, PLINK))
+			au_plink_put(sb, /*verbose*/1);
+		au_opt_clr(sbinfo->si_mntflags, PLINK);
+		break;
+	case Opt_list_plink:
+		if (au_opt_test(sbinfo->si_mntflags, PLINK))
+			au_plink_list(sb);
+		break;
+
+	case Opt_dio:
+		au_opt_set(sbinfo->si_mntflags, DIO);
+		au_fset_opts(opts->flags, REFRESH_DYAOP);
+		break;
+	case Opt_nodio:
+		au_opt_clr(sbinfo->si_mntflags, DIO);
+		au_fset_opts(opts->flags, REFRESH_DYAOP);
+		break;
+
+	case Opt_fhsm_sec:
+		au_fhsm_set(sbinfo, opt->fhsm_second);
+		break;
+
+	case Opt_diropq_a:
+		au_opt_set(sbinfo->si_mntflags, ALWAYS_DIROPQ);
+		break;
+	case Opt_diropq_w:
+		au_opt_clr(sbinfo->si_mntflags, ALWAYS_DIROPQ);
+		break;
+
+	case Opt_warn_perm:
+		au_opt_set(sbinfo->si_mntflags, WARN_PERM);
+		break;
+	case Opt_nowarn_perm:
+		au_opt_clr(sbinfo->si_mntflags, WARN_PERM);
+		break;
+
+	case Opt_verbose:
+		au_opt_set(sbinfo->si_mntflags, VERBOSE);
+		break;
+	case Opt_noverbose:
+		au_opt_clr(sbinfo->si_mntflags, VERBOSE);
+		break;
+
+	case Opt_sum:
+		au_opt_set(sbinfo->si_mntflags, SUM);
+		break;
+	case Opt_wsum:
+		au_opt_clr(sbinfo->si_mntflags, SUM);
+		au_opt_set(sbinfo->si_mntflags, SUM_W);
+		break;
+	case Opt_nosum:
+		au_opt_clr(sbinfo->si_mntflags, SUM);
+		au_opt_clr(sbinfo->si_mntflags, SUM_W);
+		break;
+
+	case Opt_wbr_create:
+		err = au_opt_wbr_create(sb, &opt->wbr_create);
+		break;
+	case Opt_wbr_copyup:
+		sbinfo->si_wbr_copyup = opt->wbr_copyup;
+		sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + opt->wbr_copyup;
+		break;
+
+	case Opt_dirwh:
+		sbinfo->si_dirwh = opt->dirwh;
+		break;
+
+	case Opt_rdcache:
+		sbinfo->si_rdcache
+			= msecs_to_jiffies(opt->rdcache * MSEC_PER_SEC);
+		break;
+	case Opt_rdblk:
+		sbinfo->si_rdblk = opt->rdblk;
+		break;
+	case Opt_rdblk_def:
+		sbinfo->si_rdblk = AUFS_RDBLK_DEF;
+		break;
+	case Opt_rdhash:
+		sbinfo->si_rdhash = opt->rdhash;
+		break;
+	case Opt_rdhash_def:
+		sbinfo->si_rdhash = AUFS_RDHASH_DEF;
+		break;
+
+	case Opt_shwh:
+		au_opt_set(sbinfo->si_mntflags, SHWH);
+		break;
+	case Opt_noshwh:
+		au_opt_clr(sbinfo->si_mntflags, SHWH);
+		break;
+
+	case Opt_dirperm1:
+		au_opt_set(sbinfo->si_mntflags, DIRPERM1);
+		break;
+	case Opt_nodirperm1:
+		au_opt_clr(sbinfo->si_mntflags, DIRPERM1);
+		break;
+
+	case Opt_trunc_xino:
+		au_opt_set(sbinfo->si_mntflags, TRUNC_XINO);
+		break;
+	case Opt_notrunc_xino:
+		au_opt_clr(sbinfo->si_mntflags, TRUNC_XINO);
+		break;
+
+	case Opt_trunc_xino_path:
+	case Opt_itrunc_xino:
+		err = au_xino_trunc(sb, opt->xino_itrunc.bindex);
+		if (!err)
+			err = 1;
+		break;
+
+	case Opt_trunc_xib:
+		au_fset_opts(opts->flags, TRUNC_XIB);
+		break;
+	case Opt_notrunc_xib:
+		au_fclr_opts(opts->flags, TRUNC_XIB);
+		break;
+
+	case Opt_acl:
+		sb->s_flags |= MS_POSIXACL;
+		break;
+	case Opt_noacl:
+		sb->s_flags &= ~MS_POSIXACL;
+		break;
+
+	default:
+		err = 0;
+		break;
+	}
+
+	return err;
+}
+
+/*
+ * returns tri-state.
+ * plus: processed without an error
+ * zero: unprocessed
+ * minus: error
+ */
+static int au_opt_br(struct super_block *sb, struct au_opt *opt,
+		     struct au_opts *opts)
+{
+	int err, do_refresh;
+
+	err = 0;
+	switch (opt->type) {
+	case Opt_append:
+		opt->add.bindex = au_sbbot(sb) + 1;
+		if (opt->add.bindex < 0)
+			opt->add.bindex = 0;
+		goto add;
+	case Opt_prepend:
+		opt->add.bindex = 0;
+	add: /* indented label */
+	case Opt_add:
+		err = au_br_add(sb, &opt->add,
+				au_ftest_opts(opts->flags, REMOUNT));
+		if (!err) {
+			err = 1;
+			au_fset_opts(opts->flags, REFRESH);
+		}
+		break;
+
+	case Opt_del:
+	case Opt_idel:
+		err = au_br_del(sb, &opt->del,
+				au_ftest_opts(opts->flags, REMOUNT));
+		if (!err) {
+			err = 1;
+			au_fset_opts(opts->flags, TRUNC_XIB);
+			au_fset_opts(opts->flags, REFRESH);
+		}
+		break;
+
+	case Opt_mod:
+	case Opt_imod:
+		err = au_br_mod(sb, &opt->mod,
+				au_ftest_opts(opts->flags, REMOUNT),
+				&do_refresh);
+		if (!err) {
+			err = 1;
+			if (do_refresh)
+				au_fset_opts(opts->flags, REFRESH);
+		}
+		break;
+	}
+
+	return err;
+}
+
+static int au_opt_xino(struct super_block *sb, struct au_opt *opt,
+		       struct au_opt_xino **opt_xino,
+		       struct au_opts *opts)
+{
+	int err;
+	aufs_bindex_t bbot, bindex;
+	struct dentry *root, *parent, *h_root;
+
+	err = 0;
+	switch (opt->type) {
+	case Opt_xino:
+		err = au_xino_set(sb, &opt->xino,
+				  !!au_ftest_opts(opts->flags, REMOUNT));
+		if (unlikely(err))
+			break;
+
+		*opt_xino = &opt->xino;
+		au_xino_brid_set(sb, -1);
+
+		/* safe d_parent access */
+		parent = opt->xino.file->f_path.dentry->d_parent;
+		root = sb->s_root;
+		bbot = au_sbbot(sb);
+		for (bindex = 0; bindex <= bbot; bindex++) {
+			h_root = au_h_dptr(root, bindex);
+			if (h_root == parent) {
+				au_xino_brid_set(sb, au_sbr_id(sb, bindex));
+				break;
+			}
+		}
+		break;
+
+	case Opt_noxino:
+		au_xino_clr(sb);
+		au_xino_brid_set(sb, -1);
+		*opt_xino = (void *)-1;
+		break;
+	}
+
+	return err;
+}
+
+int au_opts_verify(struct super_block *sb, unsigned long sb_flags,
+		   unsigned int pending)
+{
+	int err, fhsm;
+	aufs_bindex_t bindex, bbot;
+	unsigned char do_plink, skip, do_free, can_no_dreval;
+	struct au_branch *br;
+	struct au_wbr *wbr;
+	struct dentry *root, *dentry;
+	struct inode *dir, *h_dir;
+	struct au_sbinfo *sbinfo;
+	struct au_hinode *hdir;
+
+	SiMustAnyLock(sb);
+
+	sbinfo = au_sbi(sb);
+	AuDebugOn(!(sbinfo->si_mntflags & AuOptMask_UDBA));
+
+	if (!(sb_flags & MS_RDONLY)) {
+		if (unlikely(!au_br_writable(au_sbr_perm(sb, 0))))
+			pr_warn("first branch should be rw\n");
+		if (unlikely(au_opt_test(sbinfo->si_mntflags, SHWH)))
+			pr_warn_once("shwh should be used with ro\n");
+	}
+
+	if (au_opt_test((sbinfo->si_mntflags | pending), UDBA_HNOTIFY)
+	    && !au_opt_test(sbinfo->si_mntflags, XINO))
+		pr_warn_once("udba=*notify requires xino\n");
+
+	if (au_opt_test(sbinfo->si_mntflags, DIRPERM1))
+		pr_warn_once("dirperm1 breaks the protection"
+			     " by the permission bits on the lower branch\n");
+
+	err = 0;
+	fhsm = 0;
+	root = sb->s_root;
+	dir = d_inode(root);
+	do_plink = !!au_opt_test(sbinfo->si_mntflags, PLINK);
+	can_no_dreval = !!au_opt_test((sbinfo->si_mntflags | pending),
+				      UDBA_NONE);
+	bbot = au_sbbot(sb);
+	for (bindex = 0; !err && bindex <= bbot; bindex++) {
+		skip = 0;
+		h_dir = au_h_iptr(dir, bindex);
+		br = au_sbr(sb, bindex);
+
+		if ((br->br_perm & AuBrAttr_ICEX)
+		    && !h_dir->i_op->listxattr)
+			br->br_perm &= ~AuBrAttr_ICEX;
+#if 0
+		if ((br->br_perm & AuBrAttr_ICEX_SEC)
+		    && (au_br_sb(br)->s_flags & MS_NOSEC))
+			br->br_perm &= ~AuBrAttr_ICEX_SEC;
+#endif
+
+		do_free = 0;
+		wbr = br->br_wbr;
+		if (wbr)
+			wbr_wh_read_lock(wbr);
+
+		if (!au_br_writable(br->br_perm)) {
+			do_free = !!wbr;
+			skip = (!wbr
+				|| (!wbr->wbr_whbase
+				    && !wbr->wbr_plink
+				    && !wbr->wbr_orph));
+		} else if (!au_br_wh_linkable(br->br_perm)) {
+			/* skip = (!br->br_whbase && !br->br_orph); */
+			skip = (!wbr || !wbr->wbr_whbase);
+			if (skip && wbr) {
+				if (do_plink)
+					skip = !!wbr->wbr_plink;
+				else
+					skip = !wbr->wbr_plink;
+			}
+		} else {
+			/* skip = (br->br_whbase && br->br_orph); */
+			skip = (wbr && wbr->wbr_whbase);
+			if (skip) {
+				if (do_plink)
+					skip = !!wbr->wbr_plink;
+				else
+					skip = !wbr->wbr_plink;
+			}
+		}
+		if (wbr)
+			wbr_wh_read_unlock(wbr);
+
+		if (can_no_dreval) {
+			dentry = br->br_path.dentry;
+			spin_lock(&dentry->d_lock);
+			if (dentry->d_flags &
+			    (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE))
+				can_no_dreval = 0;
+			spin_unlock(&dentry->d_lock);
+		}
+
+		if (au_br_fhsm(br->br_perm)) {
+			fhsm++;
+			AuDebugOn(!br->br_fhsm);
+		}
+
+		if (skip)
+			continue;
+
+		hdir = au_hi(dir, bindex);
+		au_hn_inode_lock_nested(hdir, AuLsc_I_PARENT);
+		if (wbr)
+			wbr_wh_write_lock(wbr);
+		err = au_wh_init(br, sb);
+		if (wbr)
+			wbr_wh_write_unlock(wbr);
+		au_hn_inode_unlock(hdir);
+
+		if (!err && do_free) {
+			if (wbr)
+				au_delayed_kfree(wbr);
+			br->br_wbr = NULL;
+		}
+	}
+
+	if (can_no_dreval)
+		au_fset_si(sbinfo, NO_DREVAL);
+	else
+		au_fclr_si(sbinfo, NO_DREVAL);
+
+	if (fhsm >= 2) {
+		au_fset_si(sbinfo, FHSM);
+		for (bindex = bbot; bindex >= 0; bindex--) {
+			br = au_sbr(sb, bindex);
+			if (au_br_fhsm(br->br_perm)) {
+				au_fhsm_set_bottom(sb, bindex);
+				break;
+			}
+		}
+	} else {
+		au_fclr_si(sbinfo, FHSM);
+		au_fhsm_set_bottom(sb, -1);
+	}
+
+	return err;
+}
+
+int au_opts_mount(struct super_block *sb, struct au_opts *opts)
+{
+	int err;
+	unsigned int tmp;
+	aufs_bindex_t bindex, bbot;
+	struct au_opt *opt;
+	struct au_opt_xino *opt_xino, xino;
+	struct au_sbinfo *sbinfo;
+	struct au_branch *br;
+	struct inode *dir;
+
+	SiMustWriteLock(sb);
+
+	err = 0;
+	opt_xino = NULL;
+	opt = opts->opt;
+	while (err >= 0 && opt->type != Opt_tail)
+		err = au_opt_simple(sb, opt++, opts);
+	if (err > 0)
+		err = 0;
+	else if (unlikely(err < 0))
+		goto out;
+
+	/* disable xino and udba temporarily */
+	sbinfo = au_sbi(sb);
+	tmp = sbinfo->si_mntflags;
+	au_opt_clr(sbinfo->si_mntflags, XINO);
+	au_opt_set_udba(sbinfo->si_mntflags, UDBA_REVAL);
+
+	opt = opts->opt;
+	while (err >= 0 && opt->type != Opt_tail)
+		err = au_opt_br(sb, opt++, opts);
+	if (err > 0)
+		err = 0;
+	else if (unlikely(err < 0))
+		goto out;
+
+	bbot = au_sbbot(sb);
+	if (unlikely(bbot < 0)) {
+		err = -EINVAL;
+		pr_err("no branches\n");
+		goto out;
+	}
+
+	if (au_opt_test(tmp, XINO))
+		au_opt_set(sbinfo->si_mntflags, XINO);
+	opt = opts->opt;
+	while (!err && opt->type != Opt_tail)
+		err = au_opt_xino(sb, opt++, &opt_xino, opts);
+	if (unlikely(err))
+		goto out;
+
+	err = au_opts_verify(sb, sb->s_flags, tmp);
+	if (unlikely(err))
+		goto out;
+
+	/* restore xino */
+	if (au_opt_test(tmp, XINO) && !opt_xino) {
+		xino.file = au_xino_def(sb);
+		err = PTR_ERR(xino.file);
+		if (IS_ERR(xino.file))
+			goto out;
+
+		err = au_xino_set(sb, &xino, /*remount*/0);
+		fput(xino.file);
+		if (unlikely(err))
+			goto out;
+	}
+
+	/* restore udba */
+	tmp &= AuOptMask_UDBA;
+	sbinfo->si_mntflags &= ~AuOptMask_UDBA;
+	sbinfo->si_mntflags |= tmp;
+	bbot = au_sbbot(sb);
+	for (bindex = 0; bindex <= bbot; bindex++) {
+		br = au_sbr(sb, bindex);
+		err = au_hnotify_reset_br(tmp, br, br->br_perm);
+		if (unlikely(err))
+			AuIOErr("hnotify failed on br %d, %d, ignored\n",
+				bindex, err);
+		/* go on even if err */
+	}
+	if (au_opt_test(tmp, UDBA_HNOTIFY)) {
+		dir = d_inode(sb->s_root);
+		au_hn_reset(dir, au_hi_flags(dir, /*isdir*/1) & ~AuHi_XINO);
+	}
+
+out:
+	return err;
+}
+
+int au_opts_remount(struct super_block *sb, struct au_opts *opts)
+{
+	int err, rerr;
+	unsigned char no_dreval;
+	struct inode *dir;
+	struct au_opt_xino *opt_xino;
+	struct au_opt *opt;
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	err = 0;
+	dir = d_inode(sb->s_root);
+	sbinfo = au_sbi(sb);
+	opt_xino = NULL;
+	opt = opts->opt;
+	while (err >= 0 && opt->type != Opt_tail) {
+		err = au_opt_simple(sb, opt, opts);
+		if (!err)
+			err = au_opt_br(sb, opt, opts);
+		if (!err)
+			err = au_opt_xino(sb, opt, &opt_xino, opts);
+		opt++;
+	}
+	if (err > 0)
+		err = 0;
+	AuTraceErr(err);
+	/* go on even if err */
+
+	no_dreval = !!au_ftest_si(sbinfo, NO_DREVAL);
+	rerr = au_opts_verify(sb, opts->sb_flags, /*pending*/0);
+	if (unlikely(rerr && !err))
+		err = rerr;
+
+	if (no_dreval != !!au_ftest_si(sbinfo, NO_DREVAL))
+		au_fset_opts(opts->flags, REFRESH_IDOP);
+
+	if (au_ftest_opts(opts->flags, TRUNC_XIB)) {
+		rerr = au_xib_trunc(sb);
+		if (unlikely(rerr && !err))
+			err = rerr;
+	}
+
+	/* will be handled by the caller */
+	if (!au_ftest_opts(opts->flags, REFRESH)
+	    && (opts->given_udba
+		|| au_opt_test(sbinfo->si_mntflags, XINO)
+		|| au_ftest_opts(opts->flags, REFRESH_IDOP)
+		    ))
+		au_fset_opts(opts->flags, REFRESH);
+
+	AuDbg("status 0x%x\n", opts->flags);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+unsigned int au_opt_udba(struct super_block *sb)
+{
+	return au_mntflags(sb) & AuOptMask_UDBA;
+}
diff --git b/fs/aufs/opts.h b/fs/aufs/opts.h
new file mode 100644
index 0000000..0d6c2e1
--- /dev/null
+++ b/fs/aufs/opts.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * mount options/flags
+ */
+
+#ifndef __AUFS_OPTS_H__
+#define __AUFS_OPTS_H__
+
+#ifdef __KERNEL__
+
+#include <linux/path.h>
+
+struct file;
+struct super_block;
+
+/* ---------------------------------------------------------------------- */
+
+/* mount flags */
+#define AuOpt_XINO		1		/* external inode number bitmap
+						   and translation table */
+#define AuOpt_TRUNC_XINO	(1 << 1)	/* truncate xino files */
+#define AuOpt_UDBA_NONE		(1 << 2)	/* users direct branch access */
+#define AuOpt_UDBA_REVAL	(1 << 3)
+#define AuOpt_UDBA_HNOTIFY	(1 << 4)
+#define AuOpt_SHWH		(1 << 5)	/* show whiteout */
+#define AuOpt_PLINK		(1 << 6)	/* pseudo-link */
+#define AuOpt_DIRPERM1		(1 << 7)	/* ignore the lower dir's perm
+						   bits */
+#define AuOpt_ALWAYS_DIROPQ	(1 << 9)	/* policy to creating diropq */
+#define AuOpt_SUM		(1 << 10)	/* summation for statfs(2) */
+#define AuOpt_SUM_W		(1 << 11)	/* unimplemented */
+#define AuOpt_WARN_PERM		(1 << 12)	/* warn when add-branch */
+#define AuOpt_VERBOSE		(1 << 13)	/* busy inode when del-branch */
+#define AuOpt_DIO		(1 << 14)	/* direct io */
+
+#ifndef CONFIG_AUFS_HNOTIFY
+#undef AuOpt_UDBA_HNOTIFY
+#define AuOpt_UDBA_HNOTIFY	0
+#endif
+#ifndef CONFIG_AUFS_SHWH
+#undef AuOpt_SHWH
+#define AuOpt_SHWH		0
+#endif
+
+#define AuOpt_Def	(AuOpt_XINO \
+			 | AuOpt_UDBA_REVAL \
+			 | AuOpt_PLINK \
+			 /* | AuOpt_DIRPERM1 */ \
+			 | AuOpt_WARN_PERM)
+#define AuOptMask_UDBA	(AuOpt_UDBA_NONE \
+			 | AuOpt_UDBA_REVAL \
+			 | AuOpt_UDBA_HNOTIFY)
+
+#define au_opt_test(flags, name)	(flags & AuOpt_##name)
+#define au_opt_set(flags, name) do { \
+	BUILD_BUG_ON(AuOpt_##name & AuOptMask_UDBA); \
+	((flags) |= AuOpt_##name); \
+} while (0)
+#define au_opt_set_udba(flags, name) do { \
+	(flags) &= ~AuOptMask_UDBA; \
+	((flags) |= AuOpt_##name); \
+} while (0)
+#define au_opt_clr(flags, name) do { \
+	((flags) &= ~AuOpt_##name); \
+} while (0)
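+
+/*
+ * Usage sketch (illustrative only): these helpers operate on the AuOpt_*
+ * mount flags above, typically on sbinfo->si_mntflags as opts.c does,
+ *	au_opt_set(sbinfo->si_mntflags, PLINK);
+ *	if (au_opt_test(sbinfo->si_mntflags, PLINK))
+ *		...;
+ *	au_opt_clr(sbinfo->si_mntflags, PLINK);
+ * The UDBA values share the AuOptMask_UDBA field and therefore must be
+ * switched via au_opt_set_udba() instead of au_opt_set().
+ */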
+
+static inline unsigned int au_opts_plink(unsigned int mntflags)
+{
+#ifdef CONFIG_PROC_FS
+	return mntflags;
+#else
+	return mntflags & ~AuOpt_PLINK;
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* policies to select one among multiple writable branches */
+enum {
+	AuWbrCreate_TDP,	/* top down parent */
+	AuWbrCreate_RR,		/* round robin */
+	AuWbrCreate_MFS,	/* most free space */
+	AuWbrCreate_MFSV,	/* mfs with seconds */
+	AuWbrCreate_MFSRR,	/* mfs then rr */
+	AuWbrCreate_MFSRRV,	/* mfs then rr with seconds */
+	AuWbrCreate_PMFS,	/* parent and mfs */
+	AuWbrCreate_PMFSV,	/* parent and mfs with seconds */
+	AuWbrCreate_PMFSRR,	/* parent, mfs and round-robin */
+	AuWbrCreate_PMFSRRV,	/* plus seconds */
+
+	AuWbrCreate_Def = AuWbrCreate_TDP
+};
+
+enum {
+	AuWbrCopyup_TDP,	/* top down parent */
+	AuWbrCopyup_BUP,	/* bottom up parent */
+	AuWbrCopyup_BU,		/* bottom up */
+
+	AuWbrCopyup_Def = AuWbrCopyup_TDP
+};
+
+/* ---------------------------------------------------------------------- */
+
+struct au_opt_add {
+	aufs_bindex_t	bindex;
+	char		*pathname;
+	int		perm;
+	struct path	path;
+};
+
+struct au_opt_del {
+	char		*pathname;
+	struct path	h_path;
+};
+
+struct au_opt_mod {
+	char		*path;
+	int		perm;
+	struct dentry	*h_root;
+};
+
+struct au_opt_xino {
+	char		*path;
+	struct file	*file;
+};
+
+struct au_opt_xino_itrunc {
+	aufs_bindex_t	bindex;
+};
+
+struct au_opt_wbr_create {
+	int			wbr_create;
+	int			mfs_second;
+	unsigned long long	mfsrr_watermark;
+};
+
+struct au_opt {
+	int type;
+	union {
+		struct au_opt_xino	xino;
+		struct au_opt_xino_itrunc xino_itrunc;
+		struct au_opt_add	add;
+		struct au_opt_del	del;
+		struct au_opt_mod	mod;
+		int			dirwh;
+		int			rdcache;
+		unsigned int		rdblk;
+		unsigned int		rdhash;
+		int			udba;
+		struct au_opt_wbr_create wbr_create;
+		int			wbr_copyup;
+		unsigned int		fhsm_second;
+	};
+};
+
+/* opts flags */
+#define AuOpts_REMOUNT		1
+#define AuOpts_REFRESH		(1 << 1)
+#define AuOpts_TRUNC_XIB	(1 << 2)
+#define AuOpts_REFRESH_DYAOP	(1 << 3)
+#define AuOpts_REFRESH_IDOP	(1 << 4)
+#define au_ftest_opts(flags, name)	((flags) & AuOpts_##name)
+#define au_fset_opts(flags, name) \
+	do { (flags) |= AuOpts_##name; } while (0)
+#define au_fclr_opts(flags, name) \
+	do { (flags) &= ~AuOpts_##name; } while (0)
+
+struct au_opts {
+	struct au_opt	*opt;
+	int		max_opt;
+
+	unsigned int	given_udba;
+	unsigned int	flags;
+	unsigned long	sb_flags;
+};
+
+/* ---------------------------------------------------------------------- */
+
+/* opts.c */
+void au_optstr_br_perm(au_br_perm_str_t *str, int perm);
+const char *au_optstr_udba(int udba);
+const char *au_optstr_wbr_copyup(int wbr_copyup);
+const char *au_optstr_wbr_create(int wbr_create);
+
+void au_opts_free(struct au_opts *opts);
+int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts);
+int au_opts_verify(struct super_block *sb, unsigned long sb_flags,
+		   unsigned int pending);
+int au_opts_mount(struct super_block *sb, struct au_opts *opts);
+int au_opts_remount(struct super_block *sb, struct au_opts *opts);
+
+unsigned int au_opt_udba(struct super_block *sb);
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_OPTS_H__ */
diff --git b/fs/aufs/plink.c b/fs/aufs/plink.c
new file mode 100644
index 0000000..d8d75e8
--- /dev/null
+++ b/fs/aufs/plink.c
@@ -0,0 +1,501 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * pseudo-link
+ */
+
+#include "aufs.h"
+
+/*
+ * the pseudo-link maintenance mode.
+ * while a user process maintains the pseudo-links,
+ * adding a new plink and manipulating branches are prohibited.
+ *
+ * Flags
+ * NOPLM:
+ *	For entry functions which will handle plink and where i_mutex is
+ *	already held by the VFS.
+ *	They cannot wait and should return an error at once.
+ *	Callers have to check the error.
+ * NOPLMW:
+ *	For entry functions which will handle plink but where i_mutex is not
+ *	held by the VFS.
+ *	They can wait for the plink maintenance mode to finish.
+ *
+ * They behave like F_SETLK and F_SETLKW.
+ * If the caller never handles plink, then both flags are unnecessary.
+ */
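+
+/*
+ * Illustrative sketch, not built: in-kernel callers choose between the two
+ * flags when taking the superblock lock, mirroring poll.c and rdu.c below.
+ */
+#if 0
+	/* cannot sleep for the maintenance mode to end; may fail with -EAGAIN */
+	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+
+	/* may sleep until the maintaining process leaves the mode */
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+#endif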
+
+int au_plink_maint(struct super_block *sb, int flags)
+{
+	int err;
+	pid_t pid, ppid;
+	struct task_struct *parent, *prev;
+	struct au_sbinfo *sbi;
+
+	SiMustAnyLock(sb);
+
+	err = 0;
+	if (!au_opt_test(au_mntflags(sb), PLINK))
+		goto out;
+
+	sbi = au_sbi(sb);
+	pid = sbi->si_plink_maint_pid;
+	if (!pid || pid == current->pid)
+		goto out;
+
+	/* todo: it highly depends upon /sbin/mount.aufs */
+	prev = NULL;
+	parent = current;
+	ppid = 0;
+	rcu_read_lock();
+	while (1) {
+		parent = rcu_dereference(parent->real_parent);
+		if (parent == prev)
+			break;
+		ppid = task_pid_vnr(parent);
+		if (pid == ppid) {
+			rcu_read_unlock();
+			goto out;
+		}
+		prev = parent;
+	}
+	rcu_read_unlock();
+
+	if (au_ftest_lock(flags, NOPLMW)) {
+		/* the VFS does not hold i_mutex here, so it is safe to wait */
+		/* AuDebugOn(!lockdep_depth(current)); */
+		while (sbi->si_plink_maint_pid) {
+			si_read_unlock(sb);
+			/* gave up wake_up_bit() */
+			wait_event(sbi->si_plink_wq, !sbi->si_plink_maint_pid);
+
+			if (au_ftest_lock(flags, FLUSH))
+				au_nwt_flush(&sbi->si_nowait);
+			si_noflush_read_lock(sb);
+		}
+	} else if (au_ftest_lock(flags, NOPLM)) {
+		AuDbg("ppid %d, pid %d\n", ppid, pid);
+		err = -EAGAIN;
+	}
+
+out:
+	return err;
+}
+
+void au_plink_maint_leave(struct au_sbinfo *sbinfo)
+{
+	spin_lock(&sbinfo->si_plink_maint_lock);
+	sbinfo->si_plink_maint_pid = 0;
+	spin_unlock(&sbinfo->si_plink_maint_lock);
+	wake_up_all(&sbinfo->si_plink_wq);
+}
+
+int au_plink_maint_enter(struct super_block *sb)
+{
+	int err;
+	struct au_sbinfo *sbinfo;
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	/* make sure i am the only one in this fs */
+	si_write_lock(sb, AuLock_FLUSH);
+	if (au_opt_test(au_mntflags(sb), PLINK)) {
+		spin_lock(&sbinfo->si_plink_maint_lock);
+		if (!sbinfo->si_plink_maint_pid)
+			sbinfo->si_plink_maint_pid = current->pid;
+		else
+			err = -EBUSY;
+		spin_unlock(&sbinfo->si_plink_maint_lock);
+	}
+	si_write_unlock(sb);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_DEBUG
+void au_plink_list(struct super_block *sb)
+{
+	int i;
+	struct au_sbinfo *sbinfo;
+	struct hlist_head *plink_hlist;
+	struct au_icntnr *icntnr;
+
+	SiMustAnyLock(sb);
+
+	sbinfo = au_sbi(sb);
+	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
+	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
+
+	for (i = 0; i < AuPlink_NHASH; i++) {
+		plink_hlist = &sbinfo->si_plink[i].head;
+		rcu_read_lock();
+		hlist_for_each_entry_rcu(icntnr, plink_hlist, plink)
+			AuDbg("%lu\n", icntnr->vfs_inode.i_ino);
+		rcu_read_unlock();
+	}
+}
+#endif
+
+/* is the inode pseudo-linked? */
+int au_plink_test(struct inode *inode)
+{
+	int found, i;
+	struct au_sbinfo *sbinfo;
+	struct hlist_head *plink_hlist;
+	struct au_icntnr *icntnr;
+
+	sbinfo = au_sbi(inode->i_sb);
+	AuRwMustAnyLock(&sbinfo->si_rwsem);
+	AuDebugOn(!au_opt_test(au_mntflags(inode->i_sb), PLINK));
+	AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM));
+
+	found = 0;
+	i = au_plink_hash(inode->i_ino);
+	plink_hlist = &sbinfo->si_plink[i].head;
+	rcu_read_lock();
+	hlist_for_each_entry_rcu(icntnr, plink_hlist, plink)
+		if (&icntnr->vfs_inode == inode) {
+			found = 1;
+			break;
+		}
+	rcu_read_unlock();
+	return found;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * generate a name for plink.
+ * the file will be stored under AUFS_WH_PLINKDIR.
+ */
+/* 20 is the maximum number of decimal digits in a 64-bit unsigned long */
+#define PLINK_NAME_LEN	((20 + 1) * 2)
+
+static int plink_name(char *name, int len, struct inode *inode,
+		      aufs_bindex_t bindex)
+{
+	int rlen;
+	struct inode *h_inode;
+
+	h_inode = au_h_iptr(inode, bindex);
+	rlen = snprintf(name, len, "%lu.%lu", inode->i_ino, h_inode->i_ino);
+	return rlen;
+}
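+
+/*
+ * Example derived from the format string above: for an aufs inode with
+ * i_ino 123 whose lower inode on the given branch has i_ino 456, the
+ * pseudo-link is named "123.456" under the branch's plink directory.
+ */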
+
+struct au_do_plink_lkup_args {
+	struct dentry **errp;
+	struct qstr *tgtname;
+	struct dentry *h_parent;
+	struct au_branch *br;
+};
+
+static struct dentry *au_do_plink_lkup(struct qstr *tgtname,
+				       struct dentry *h_parent,
+				       struct au_branch *br)
+{
+	struct dentry *h_dentry;
+	struct inode *h_inode;
+
+	h_inode = d_inode(h_parent);
+	inode_lock_nested(h_inode, AuLsc_I_CHILD2);
+	h_dentry = vfsub_lkup_one(tgtname, h_parent);
+	inode_unlock(h_inode);
+	return h_dentry;
+}
+
+static void au_call_do_plink_lkup(void *args)
+{
+	struct au_do_plink_lkup_args *a = args;
+	*a->errp = au_do_plink_lkup(a->tgtname, a->h_parent, a->br);
+}
+
+/* lookup the plink-ed @inode under the branch at @bindex */
+struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex)
+{
+	struct dentry *h_dentry, *h_parent;
+	struct au_branch *br;
+	int wkq_err;
+	char a[PLINK_NAME_LEN];
+	struct qstr tgtname = QSTR_INIT(a, 0);
+
+	AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM));
+
+	br = au_sbr(inode->i_sb, bindex);
+	h_parent = br->br_wbr->wbr_plink;
+	tgtname.len = plink_name(a, sizeof(a), inode, bindex);
+
+	if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) {
+		struct au_do_plink_lkup_args args = {
+			.errp		= &h_dentry,
+			.tgtname	= &tgtname,
+			.h_parent	= h_parent,
+			.br		= br
+		};
+
+		wkq_err = au_wkq_wait(au_call_do_plink_lkup, &args);
+		if (unlikely(wkq_err))
+			h_dentry = ERR_PTR(wkq_err);
+	} else
+		h_dentry = au_do_plink_lkup(&tgtname, h_parent, br);
+
+	return h_dentry;
+}
+
+/* create a pseudo-link */
+static int do_whplink(struct qstr *tgt, struct dentry *h_parent,
+		      struct dentry *h_dentry, struct au_branch *br)
+{
+	int err;
+	struct path h_path = {
+		.mnt = au_br_mnt(br)
+	};
+	struct inode *h_dir, *delegated;
+
+	h_dir = d_inode(h_parent);
+	inode_lock_nested(h_dir, AuLsc_I_CHILD2);
+again:
+	h_path.dentry = vfsub_lkup_one(tgt, h_parent);
+	err = PTR_ERR(h_path.dentry);
+	if (IS_ERR(h_path.dentry))
+		goto out;
+
+	err = 0;
+	/* wh.plink dir is not monitored */
+	/* todo: is it really safe? */
+	if (d_is_positive(h_path.dentry)
+	    && d_inode(h_path.dentry) != d_inode(h_dentry)) {
+		delegated = NULL;
+		err = vfsub_unlink(h_dir, &h_path, &delegated, /*force*/0);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal unlink\n");
+			iput(delegated);
+		}
+		dput(h_path.dentry);
+		h_path.dentry = NULL;
+		if (!err)
+			goto again;
+	}
+	if (!err && d_is_negative(h_path.dentry)) {
+		delegated = NULL;
+		err = vfsub_link(h_dentry, h_dir, &h_path, &delegated);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal link\n");
+			iput(delegated);
+		}
+	}
+	dput(h_path.dentry);
+
+out:
+	inode_unlock(h_dir);
+	return err;
+}
+
+struct do_whplink_args {
+	int *errp;
+	struct qstr *tgt;
+	struct dentry *h_parent;
+	struct dentry *h_dentry;
+	struct au_branch *br;
+};
+
+static void call_do_whplink(void *args)
+{
+	struct do_whplink_args *a = args;
+	*a->errp = do_whplink(a->tgt, a->h_parent, a->h_dentry, a->br);
+}
+
+static int whplink(struct dentry *h_dentry, struct inode *inode,
+		   aufs_bindex_t bindex, struct au_branch *br)
+{
+	int err, wkq_err;
+	struct au_wbr *wbr;
+	struct dentry *h_parent;
+	char a[PLINK_NAME_LEN];
+	struct qstr tgtname = QSTR_INIT(a, 0);
+
+	wbr = au_sbr(inode->i_sb, bindex)->br_wbr;
+	h_parent = wbr->wbr_plink;
+	tgtname.len = plink_name(a, sizeof(a), inode, bindex);
+
+	/* always superio. */
+	if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) {
+		struct do_whplink_args args = {
+			.errp		= &err,
+			.tgt		= &tgtname,
+			.h_parent	= h_parent,
+			.h_dentry	= h_dentry,
+			.br		= br
+		};
+		wkq_err = au_wkq_wait(call_do_whplink, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	} else
+		err = do_whplink(&tgtname, h_parent, h_dentry, br);
+
+	return err;
+}
+
+/*
+ * create a new pseudo-link for @h_dentry on @bindex.
+ * the linked inode is held in aufs @inode.
+ */
+void au_plink_append(struct inode *inode, aufs_bindex_t bindex,
+		     struct dentry *h_dentry)
+{
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+	struct hlist_head *plink_hlist;
+	struct au_icntnr *icntnr;
+	struct au_sphlhead *sphl;
+	int found, err, cnt, i;
+
+	sb = inode->i_sb;
+	sbinfo = au_sbi(sb);
+	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
+	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
+
+	found = au_plink_test(inode);
+	if (found)
+		return;
+
+	i = au_plink_hash(inode->i_ino);
+	sphl = sbinfo->si_plink + i;
+	plink_hlist = &sphl->head;
+	au_igrab(inode);
+
+	spin_lock(&sphl->spin);
+	hlist_for_each_entry(icntnr, plink_hlist, plink) {
+		if (&icntnr->vfs_inode == inode) {
+			found = 1;
+			break;
+		}
+	}
+	if (!found) {
+		icntnr = container_of(inode, struct au_icntnr, vfs_inode);
+		hlist_add_head_rcu(&icntnr->plink, plink_hlist);
+	}
+	spin_unlock(&sphl->spin);
+	if (!found) {
+		cnt = au_sphl_count(sphl);
+#define msg "unexpectedly unbalanced or too many pseudo-links"
+		if (cnt > AUFS_PLINK_WARN)
+			AuWarn1(msg ", %d\n", cnt);
+#undef msg
+		err = whplink(h_dentry, inode, bindex, au_sbr(sb, bindex));
+		if (unlikely(err)) {
+			pr_warn("err %d, damaged pseudo link.\n", err);
+			au_sphl_del_rcu(&icntnr->plink, sphl);
+			iput(&icntnr->vfs_inode);
+		}
+	} else
+		iput(&icntnr->vfs_inode);
+}
+
+/* free all plinks */
+void au_plink_put(struct super_block *sb, int verbose)
+{
+	int i, warned;
+	struct au_sbinfo *sbinfo;
+	struct hlist_head *plink_hlist;
+	struct hlist_node *tmp;
+	struct au_icntnr *icntnr;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
+	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
+
+	/* no spin_lock since sbinfo is write-locked */
+	warned = 0;
+	for (i = 0; i < AuPlink_NHASH; i++) {
+		plink_hlist = &sbinfo->si_plink[i].head;
+		if (!warned && verbose && !hlist_empty(plink_hlist)) {
+			pr_warn("pseudo-link is not flushed");
+			warned = 1;
+		}
+		hlist_for_each_entry_safe(icntnr, tmp, plink_hlist, plink)
+			iput(&icntnr->vfs_inode);
+		INIT_HLIST_HEAD(plink_hlist);
+	}
+}
+
+void au_plink_clean(struct super_block *sb, int verbose)
+{
+	struct dentry *root;
+
+	root = sb->s_root;
+	aufs_write_lock(root);
+	if (au_opt_test(au_mntflags(sb), PLINK))
+		au_plink_put(sb, verbose);
+	aufs_write_unlock(root);
+}
+
+static int au_plink_do_half_refresh(struct inode *inode, aufs_bindex_t br_id)
+{
+	int do_put;
+	aufs_bindex_t btop, bbot, bindex;
+
+	do_put = 0;
+	btop = au_ibtop(inode);
+	bbot = au_ibbot(inode);
+	if (btop >= 0) {
+		for (bindex = btop; bindex <= bbot; bindex++) {
+			if (!au_h_iptr(inode, bindex)
+			    || au_ii_br_id(inode, bindex) != br_id)
+				continue;
+			au_set_h_iptr(inode, bindex, NULL, 0);
+			do_put = 1;
+			break;
+		}
+		if (do_put)
+			for (bindex = btop; bindex <= bbot; bindex++)
+				if (au_h_iptr(inode, bindex)) {
+					do_put = 0;
+					break;
+				}
+	} else
+		do_put = 1;
+
+	return do_put;
+}
+
+/* free the plinks on a branch specified by @br_id */
+void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id)
+{
+	struct au_sbinfo *sbinfo;
+	struct hlist_head *plink_hlist;
+	struct hlist_node *tmp;
+	struct au_icntnr *icntnr;
+	struct inode *inode;
+	int i, do_put;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
+	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
+
+	/* no spin_lock since sbinfo is write-locked */
+	for (i = 0; i < AuPlink_NHASH; i++) {
+		plink_hlist = &sbinfo->si_plink[i].head;
+		hlist_for_each_entry_safe(icntnr, tmp, plink_hlist, plink) {
+			inode = au_igrab(&icntnr->vfs_inode);
+			ii_write_lock_child(inode);
+			do_put = au_plink_do_half_refresh(inode, br_id);
+			if (do_put) {
+				hlist_del(&icntnr->plink);
+				iput(inode);
+			}
+			ii_write_unlock(inode);
+			iput(inode);
+		}
+	}
+}
diff --git b/fs/aufs/poll.c b/fs/aufs/poll.c
new file mode 100644
index 0000000..dd2baf5
--- /dev/null
+++ b/fs/aufs/poll.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * poll operation
+ * Currently, only one filesystem implements the ->poll operation.
+ */
+
+#include "aufs.h"
+
+unsigned int aufs_poll(struct file *file, poll_table *wait)
+{
+	unsigned int mask;
+	int err;
+	struct file *h_file;
+	struct super_block *sb;
+
+	/* We should pretend an error happened. */
+	mask = POLLERR /* | POLLIN | POLLOUT */;
+	sb = file->f_path.dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
+
+	h_file = au_read_pre(file, /*keep_fi*/0);
+	err = PTR_ERR(h_file);
+	if (IS_ERR(h_file))
+		goto out;
+
+	/* it is not an error if h_file has no operation */
+	mask = DEFAULT_POLLMASK;
+	if (h_file->f_op->poll)
+		mask = h_file->f_op->poll(h_file, wait);
+	fput(h_file); /* instead of au_read_post() */
+
+out:
+	si_read_unlock(sb);
+	AuTraceErr((int)mask);
+	return mask;
+}
diff --git b/fs/aufs/posix_acl.c b/fs/aufs/posix_acl.c
new file mode 100644
index 0000000..1ee17ee
--- /dev/null
+++ b/fs/aufs/posix_acl.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2014-2016 Junjiro R. Okajima
+ */
+
+/*
+ * posix acl operations
+ */
+
+#include <linux/fs.h>
+#include "aufs.h"
+
+struct posix_acl *aufs_get_acl(struct inode *inode, int type)
+{
+	struct posix_acl *acl;
+	int err;
+	aufs_bindex_t bindex;
+	struct inode *h_inode;
+	struct super_block *sb;
+
+	acl = NULL;
+	sb = inode->i_sb;
+	si_read_lock(sb, AuLock_FLUSH);
+	ii_read_lock_child(inode);
+	if (!(sb->s_flags & MS_POSIXACL))
+		goto out;
+
+	bindex = au_ibtop(inode);
+	h_inode = au_h_iptr(inode, bindex);
+	if (unlikely(!h_inode
+		     || ((h_inode->i_mode & S_IFMT)
+			 != (inode->i_mode & S_IFMT)))) {
+		err = au_busy_or_stale();
+		acl = ERR_PTR(err);
+		goto out;
+	}
+
+	/* always topmost only */
+	acl = get_acl(h_inode, type);
+
+out:
+	ii_read_unlock(inode);
+	si_read_unlock(sb);
+
+	AuTraceErrPtr(acl);
+	return acl;
+}
+
+int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
+{
+	int err;
+	ssize_t ssz;
+	struct dentry *dentry;
+	struct au_srxattr arg = {
+		.type = AU_ACL_SET,
+		.u.acl_set = {
+			.acl	= acl,
+			.type	= type
+		},
+	};
+
+	IMustLock(inode);
+
+	if (inode->i_ino == AUFS_ROOT_INO)
+		dentry = dget(inode->i_sb->s_root);
+	else {
+		dentry = d_find_alias(inode);
+		if (!dentry)
+			dentry = d_find_any_alias(inode);
+		if (!dentry) {
+			pr_warn("cannot handle this inode, "
+				"please report to aufs-users ML\n");
+			err = -ENOENT;
+			goto out;
+		}
+	}
+
+	ssz = au_srxattr(dentry, inode, &arg);
+	dput(dentry);
+	err = ssz;
+	if (ssz >= 0)
+		err = 0;
+
+out:
+	return err;
+}
diff --git b/fs/aufs/procfs.c b/fs/aufs/procfs.c
new file mode 100644
index 0000000..e5a4e37
--- /dev/null
+++ b/fs/aufs/procfs.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright (C) 2010-2016 Junjiro R. Okajima
+ */
+
+/*
+ * procfs interfaces
+ */
+
+#include <linux/proc_fs.h>
+#include "aufs.h"
+
+static int au_procfs_plm_release(struct inode *inode, struct file *file)
+{
+	struct au_sbinfo *sbinfo;
+
+	sbinfo = file->private_data;
+	if (sbinfo) {
+		au_plink_maint_leave(sbinfo);
+		kobject_put(&sbinfo->si_kobj);
+	}
+
+	return 0;
+}
+
+static void au_procfs_plm_write_clean(struct file *file)
+{
+	struct au_sbinfo *sbinfo;
+
+	sbinfo = file->private_data;
+	if (sbinfo)
+		au_plink_clean(sbinfo->si_sb, /*verbose*/0);
+}
+
+static int au_procfs_plm_write_si(struct file *file, unsigned long id)
+{
+	int err;
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+
+	err = -EBUSY;
+	if (unlikely(file->private_data))
+		goto out;
+
+	sb = NULL;
+	/* don't use au_sbilist_lock() here */
+	spin_lock(&au_sbilist.spin);
+	hlist_for_each_entry(sbinfo, &au_sbilist.head, si_list)
+		if (id == sysaufs_si_id(sbinfo)) {
+			kobject_get(&sbinfo->si_kobj);
+			sb = sbinfo->si_sb;
+			break;
+		}
+	spin_unlock(&au_sbilist.spin);
+
+	err = -EINVAL;
+	if (unlikely(!sb))
+		goto out;
+
+	err = au_plink_maint_enter(sb);
+	if (!err)
+		/* keep kobject_get() */
+		file->private_data = sbinfo;
+	else
+		kobject_put(&sbinfo->si_kobj);
+out:
+	return err;
+}
+
+/*
+ * Accept a valid "si=xxxx" only.
+ * Once it is accepted successfully, accept "clean" too.
+ */
+static ssize_t au_procfs_plm_write(struct file *file, const char __user *ubuf,
+				   size_t count, loff_t *ppos)
+{
+	ssize_t err;
+	unsigned long id;
+	/* "si=" + hex id + an optional last newline + terminating NUL */
+	char buf[3 + sizeof(unsigned long) * 2 + 1 + 1];
+
+	err = -EACCES;
+	if (unlikely(!capable(CAP_SYS_ADMIN)))
+		goto out;
+
+	err = -EINVAL;
+	if (unlikely(count >= sizeof(buf)))
+		goto out;
+
+	err = copy_from_user(buf, ubuf, count);
+	if (unlikely(err)) {
+		err = -EFAULT;
+		goto out;
+	}
+	buf[count] = 0;
+
+	err = -EINVAL;
+	if (!strcmp("clean", buf)) {
+		au_procfs_plm_write_clean(file);
+		goto out_success;
+	} else if (unlikely(strncmp("si=", buf, 3)))
+		goto out;
+
+	err = kstrtoul(buf + 3, 16, &id);
+	if (unlikely(err))
+		goto out;
+
+	err = au_procfs_plm_write_si(file, id);
+	if (unlikely(err))
+		goto out;
+
+out_success:
+	err = count; /* success */
+out:
+	return err;
+}
+
+static const struct file_operations au_procfs_plm_fop = {
+	.write		= au_procfs_plm_write,
+	.release	= au_procfs_plm_release,
+	.owner		= THIS_MODULE
+};
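+
+/*
+ * Userspace usage sketch (illustrative only; the procfs path is an
+ * assumption based on AUFS_PLINK_MAINT_DIR/NAME, and "4e8f2a" stands for
+ * the internal si id of the target aufs mount):
+ */
+#if 0
+	int fd = open("/proc/fs/aufs/plink_maint", O_WRONLY);
+
+	/* enter the plink maintenance mode for that mount */
+	write(fd, "si=4e8f2a", 9);
+	/* optionally drop all of its pseudo-links */
+	write(fd, "clean", 5);
+	/* closing the file leaves the mode via au_procfs_plm_release() */
+	close(fd);
+#endif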
+
+/* ---------------------------------------------------------------------- */
+
+static struct proc_dir_entry *au_procfs_dir;
+
+void au_procfs_fin(void)
+{
+	remove_proc_entry(AUFS_PLINK_MAINT_NAME, au_procfs_dir);
+	remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL);
+}
+
+int __init au_procfs_init(void)
+{
+	int err;
+	struct proc_dir_entry *entry;
+
+	err = -ENOMEM;
+	au_procfs_dir = proc_mkdir(AUFS_PLINK_MAINT_DIR, NULL);
+	if (unlikely(!au_procfs_dir))
+		goto out;
+
+	entry = proc_create(AUFS_PLINK_MAINT_NAME, S_IFREG | S_IWUSR,
+			    au_procfs_dir, &au_procfs_plm_fop);
+	if (unlikely(!entry))
+		goto out_dir;
+
+	err = 0;
+	goto out; /* success */
+
+
+out_dir:
+	remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL);
+out:
+	return err;
+}
diff --git b/fs/aufs/rdu.c b/fs/aufs/rdu.c
new file mode 100644
index 0000000..54abc14
--- /dev/null
+++ b/fs/aufs/rdu.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * readdir in userspace.
+ */
+
+#include <linux/compat.h>
+#include <linux/fs_stack.h>
+#include <linux/security.h>
+#include "aufs.h"
+
+/* bits for struct aufs_rdu.flags */
+#define	AuRdu_CALLED	1
+#define	AuRdu_CONT	(1 << 1)
+#define	AuRdu_FULL	(1 << 2)
+#define au_ftest_rdu(flags, name)	((flags) & AuRdu_##name)
+#define au_fset_rdu(flags, name) \
+	do { (flags) |= AuRdu_##name; } while (0)
+#define au_fclr_rdu(flags, name) \
+	do { (flags) &= ~AuRdu_##name; } while (0)
+
+struct au_rdu_arg {
+	struct dir_context		ctx;
+	struct aufs_rdu			*rdu;
+	union au_rdu_ent_ul		ent;
+	unsigned long			end;
+
+	struct super_block		*sb;
+	int				err;
+};
+
+static int au_rdu_fill(struct dir_context *ctx, const char *name, int nlen,
+		       loff_t offset, u64 h_ino, unsigned int d_type)
+{
+	int err, len;
+	struct au_rdu_arg *arg = container_of(ctx, struct au_rdu_arg, ctx);
+	struct aufs_rdu *rdu = arg->rdu;
+	struct au_rdu_ent ent;
+
+	err = 0;
+	arg->err = 0;
+	au_fset_rdu(rdu->cookie.flags, CALLED);
+	len = au_rdu_len(nlen);
+	if (arg->ent.ul + len  < arg->end) {
+		ent.ino = h_ino;
+		ent.bindex = rdu->cookie.bindex;
+		ent.type = d_type;
+		ent.nlen = nlen;
+		if (unlikely(nlen > AUFS_MAX_NAMELEN))
+			ent.type = DT_UNKNOWN;
+
+		/* unnecessary to support mmap_sem since this is a dir */
+		err = -EFAULT;
+		if (copy_to_user(arg->ent.e, &ent, sizeof(ent)))
+			goto out;
+		if (copy_to_user(arg->ent.e->name, name, nlen))
+			goto out;
+		/* the terminating NULL */
+		if (__put_user(0, arg->ent.e->name + nlen))
+			goto out;
+		err = 0;
+		/* AuDbg("%p, %.*s\n", arg->ent.p, nlen, name); */
+		arg->ent.ul += len;
+		rdu->rent++;
+	} else {
+		err = -EFAULT;
+		au_fset_rdu(rdu->cookie.flags, FULL);
+		rdu->full = 1;
+		rdu->tail = arg->ent;
+	}
+
+out:
+	/* AuTraceErr(err); */
+	return err;
+}
+
+static int au_rdu_do(struct file *h_file, struct au_rdu_arg *arg)
+{
+	int err;
+	loff_t offset;
+	struct au_rdu_cookie *cookie = &arg->rdu->cookie;
+
+	/* we don't have to care about (FMODE_32BITHASH | FMODE_64BITHASH) for ext4 */
+	offset = vfsub_llseek(h_file, cookie->h_pos, SEEK_SET);
+	err = offset;
+	if (unlikely(offset != cookie->h_pos))
+		goto out;
+
+	err = 0;
+	do {
+		arg->err = 0;
+		au_fclr_rdu(cookie->flags, CALLED);
+		/* smp_mb(); */
+		err = vfsub_iterate_dir(h_file, &arg->ctx);
+		if (err >= 0)
+			err = arg->err;
+	} while (!err
+		 && au_ftest_rdu(cookie->flags, CALLED)
+		 && !au_ftest_rdu(cookie->flags, FULL));
+	cookie->h_pos = h_file->f_pos;
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_rdu(struct file *file, struct aufs_rdu *rdu)
+{
+	int err;
+	aufs_bindex_t bbot;
+	struct au_rdu_arg arg = {
+		.ctx = {
+			.actor = au_rdu_fill
+		}
+	};
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *h_file;
+	struct au_rdu_cookie *cookie = &rdu->cookie;
+
+	err = !access_ok(VERIFY_WRITE, rdu->ent.e, rdu->sz);
+	if (unlikely(err)) {
+		err = -EFAULT;
+		AuTraceErr(err);
+		goto out;
+	}
+	rdu->rent = 0;
+	rdu->tail = rdu->ent;
+	rdu->full = 0;
+	arg.rdu = rdu;
+	arg.ent = rdu->ent;
+	arg.end = arg.ent.ul;
+	arg.end += rdu->sz;
+
+	err = -ENOTDIR;
+	if (unlikely(!file->f_op->iterate && !file->f_op->iterate_shared))
+		goto out;
+
+	err = security_file_permission(file, MAY_READ);
+	AuTraceErr(err);
+	if (unlikely(err))
+		goto out;
+
+	dentry = file->f_path.dentry;
+	inode = d_inode(dentry);
+	inode_lock_shared(inode);
+
+	arg.sb = inode->i_sb;
+	err = si_read_lock(arg.sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out_mtx;
+	err = au_alive_dir(dentry);
+	if (unlikely(err))
+		goto out_si;
+	/* todo: reval? */
+	fi_read_lock(file);
+
+	err = -EAGAIN;
+	if (unlikely(au_ftest_rdu(cookie->flags, CONT)
+		     && cookie->generation != au_figen(file)))
+		goto out_unlock;
+
+	err = 0;
+	if (!rdu->blk) {
+		rdu->blk = au_sbi(arg.sb)->si_rdblk;
+		if (!rdu->blk)
+			rdu->blk = au_dir_size(file, /*dentry*/NULL);
+	}
+	bbot = au_fbtop(file);
+	if (cookie->bindex < bbot)
+		cookie->bindex = bbot;
+	bbot = au_fbbot_dir(file);
+	/* AuDbg("b%d, b%d\n", cookie->bindex, bbot); */
+	for (; !err && cookie->bindex <= bbot;
+	     cookie->bindex++, cookie->h_pos = 0) {
+		h_file = au_hf_dir(file, cookie->bindex);
+		if (!h_file)
+			continue;
+
+		au_fclr_rdu(cookie->flags, FULL);
+		err = au_rdu_do(h_file, &arg);
+		AuTraceErr(err);
+		if (unlikely(au_ftest_rdu(cookie->flags, FULL) || err))
+			break;
+	}
+	AuDbg("rent %llu\n", rdu->rent);
+
+	if (!err && !au_ftest_rdu(cookie->flags, CONT)) {
+		rdu->shwh = !!au_opt_test(au_sbi(arg.sb)->si_mntflags, SHWH);
+		au_fset_rdu(cookie->flags, CONT);
+		cookie->generation = au_figen(file);
+	}
+
+	ii_read_lock_child(inode);
+	fsstack_copy_attr_atime(inode, au_h_iptr(inode, au_ibtop(inode)));
+	ii_read_unlock(inode);
+
+out_unlock:
+	fi_read_unlock(file);
+out_si:
+	si_read_unlock(arg.sb);
+out_mtx:
+	inode_unlock_shared(inode);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_rdu_ino(struct file *file, struct aufs_rdu *rdu)
+{
+	int err;
+	ino_t ino;
+	unsigned long long nent;
+	union au_rdu_ent_ul *u;
+	struct au_rdu_ent ent;
+	struct super_block *sb;
+
+	err = 0;
+	nent = rdu->nent;
+	u = &rdu->ent;
+	sb = file->f_path.dentry->d_sb;
+	si_read_lock(sb, AuLock_FLUSH);
+	while (nent-- > 0) {
+		/* no need to worry about mmap_sem since this is a dir */
+		err = copy_from_user(&ent, u->e, sizeof(ent));
+		if (!err)
+			err = !access_ok(VERIFY_WRITE, &u->e->ino, sizeof(ino));
+		if (unlikely(err)) {
+			err = -EFAULT;
+			AuTraceErr(err);
+			break;
+		}
+
+		/* AuDbg("b%d, i%llu\n", ent.bindex, ent.ino); */
+		if (!ent.wh)
+			err = au_ino(sb, ent.bindex, ent.ino, ent.type, &ino);
+		else
+			err = au_wh_ino(sb, ent.bindex, ent.ino, ent.type,
+					&ino);
+		if (unlikely(err)) {
+			AuTraceErr(err);
+			break;
+		}
+
+		err = __put_user(ino, &u->e->ino);
+		if (unlikely(err)) {
+			err = -EFAULT;
+			AuTraceErr(err);
+			break;
+		}
+		u->ul += au_rdu_len(ent.nlen);
+	}
+	si_read_unlock(sb);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_rdu_verify(struct aufs_rdu *rdu)
+{
+	AuDbg("rdu{%llu, %p, %u | %u | %llu, %u, %u | "
+	      "%llu, b%d, 0x%x, g%u}\n",
+	      rdu->sz, rdu->ent.e, rdu->verify[AufsCtlRduV_SZ],
+	      rdu->blk,
+	      rdu->rent, rdu->shwh, rdu->full,
+	      rdu->cookie.h_pos, rdu->cookie.bindex, rdu->cookie.flags,
+	      rdu->cookie.generation);
+
+	if (rdu->verify[AufsCtlRduV_SZ] == sizeof(*rdu))
+		return 0;
+
+	AuDbg("%u:%u\n",
+	      rdu->verify[AufsCtlRduV_SZ], (unsigned int)sizeof(*rdu));
+	return -EINVAL;
+}
+
+long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long err, e;
+	struct aufs_rdu rdu;
+	void __user *p = (void __user *)arg;
+
+	err = copy_from_user(&rdu, p, sizeof(rdu));
+	if (unlikely(err)) {
+		err = -EFAULT;
+		AuTraceErr(err);
+		goto out;
+	}
+	err = au_rdu_verify(&rdu);
+	if (unlikely(err))
+		goto out;
+
+	switch (cmd) {
+	case AUFS_CTL_RDU:
+		err = au_rdu(file, &rdu);
+		if (unlikely(err))
+			break;
+
+		e = copy_to_user(p, &rdu, sizeof(rdu));
+		if (unlikely(e)) {
+			err = -EFAULT;
+			AuTraceErr(err);
+		}
+		break;
+	case AUFS_CTL_RDU_INO:
+		err = au_rdu_ino(file, &rdu);
+		break;
+
+	default:
+		/* err = -ENOTTY; */
+		err = -EINVAL;
+	}
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	long err, e;
+	struct aufs_rdu rdu;
+	void __user *p = compat_ptr(arg);
+
+	/* todo: get_user()? */
+	err = copy_from_user(&rdu, p, sizeof(rdu));
+	if (unlikely(err)) {
+		err = -EFAULT;
+		AuTraceErr(err);
+		goto out;
+	}
+	rdu.ent.e = compat_ptr(rdu.ent.ul);
+	err = au_rdu_verify(&rdu);
+	if (unlikely(err))
+		goto out;
+
+	switch (cmd) {
+	case AUFS_CTL_RDU:
+		err = au_rdu(file, &rdu);
+		if (unlikely(err))
+			break;
+
+		rdu.ent.ul = ptr_to_compat(rdu.ent.e);
+		rdu.tail.ul = ptr_to_compat(rdu.tail.e);
+		e = copy_to_user(p, &rdu, sizeof(rdu));
+		if (unlikely(e)) {
+			err = -EFAULT;
+			AuTraceErr(err);
+		}
+		break;
+	case AUFS_CTL_RDU_INO:
+		err = au_rdu_ino(file, &rdu);
+		break;
+
+	default:
+		/* err = -ENOTTY; */
+		err = -EINVAL;
+	}
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+#endif
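
The AUFS_CTL_RDU ioctl implemented above is driven from user space. Below is a
minimal, hypothetical sketch of a caller, not part of this patch: it assumes the
uapi header <linux/aufs_type.h> exposes struct aufs_rdu, union au_rdu_ent_ul and
AUFS_CTL_RDU as used above, and it only reports how many entries came back
rather than decoding them.

	/*
	 * Minimal user-space sketch (assumptions: the header name below and
	 * that the uapi layout matches what au_rdu()/au_rdu_verify() expect).
	 */
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/aufs_type.h>	/* assumed location of struct aufs_rdu et al. */

	static int rdu_count(const char *dir)
	{
		int fd, err;
		void *buf;
		struct aufs_rdu rdu;

		fd = open(dir, O_RDONLY);
		if (fd < 0)
			return -1;

		memset(&rdu, 0, sizeof(rdu));
		rdu.verify[AufsCtlRduV_SZ] = sizeof(rdu);	/* checked by au_rdu_verify() */
		rdu.sz = 64 * 1024;				/* size of the entry buffer */
		buf = malloc(rdu.sz);
		if (!buf) {
			close(fd);
			return -1;
		}
		rdu.ent.e = buf;	/* au_rdu_fill() copies au_rdu_ent records here */

		/* rdu.full is set when the buffer was too small for one pass */
		err = ioctl(fd, AUFS_CTL_RDU, &rdu);
		if (!err)
			printf("%llu entries, full %u\n",
			       (unsigned long long)rdu.rent, (unsigned int)rdu.full);

		free(buf);
		close(fd);
		return err;
	}

A real consumer would then walk the returned au_rdu_ent records and typically
follow up with AUFS_CTL_RDU_INO, which au_rdu_ino() above implements, to
translate the per-branch inode numbers.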
diff --git b/fs/aufs/rwsem.h b/fs/aufs/rwsem.h
new file mode 100644
index 0000000..6c0d5a9
--- /dev/null
+++ b/fs/aufs/rwsem.h
@@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * simple read-write semaphore wrappers
+ */
+
+#ifndef __AUFS_RWSEM_H__
+#define __AUFS_RWSEM_H__
+
+#ifdef __KERNEL__
+
+#include "debug.h"
+
+struct au_rwsem {
+	struct rw_semaphore	rwsem;
+#ifdef CONFIG_AUFS_DEBUG
+	/* just for debugging; these counters are not authoritative */
+	atomic_t		rcnt, wcnt;
+#endif
+};
+
+#ifdef CONFIG_LOCKDEP
+#define au_lockdep_set_name(rw)						\
+	lockdep_set_class_and_name(&(rw)->rwsem,			\
+				   /*original key*/(rw)->rwsem.dep_map.key, \
+				   /*name*/#rw)
+#else
+#define au_lockdep_set_name(rw) do {} while (0)
+#endif
+
+#ifdef CONFIG_AUFS_DEBUG
+#define AuDbgCntInit(rw) do { \
+	atomic_set(&(rw)->rcnt, 0); \
+	atomic_set(&(rw)->wcnt, 0); \
+	smp_mb(); /* atomic set */ \
+} while (0)
+
+#define AuDbgCnt(rw, cnt)	atomic_read(&(rw)->cnt)
+#define AuDbgCntInc(rw, cnt)	atomic_inc(&(rw)->cnt)
+#define AuDbgCntDec(rw, cnt)	WARN_ON(atomic_dec_return(&(rw)->cnt) < 0)
+#define AuDbgRcntInc(rw)	AuDbgCntInc(rw, rcnt)
+#define AuDbgRcntDec(rw)	AuDbgCntDec(rw, rcnt)
+#define AuDbgWcntInc(rw)	AuDbgCntInc(rw, wcnt)
+#define AuDbgWcntDec(rw)	AuDbgCntDec(rw, wcnt)
+#else
+#define AuDbgCnt(rw, cnt)	0
+#define AuDbgCntInit(rw)	do {} while (0)
+#define AuDbgRcntInc(rw)	do {} while (0)
+#define AuDbgRcntDec(rw)	do {} while (0)
+#define AuDbgWcntInc(rw)	do {} while (0)
+#define AuDbgWcntDec(rw)	do {} while (0)
+#endif /* CONFIG_AUFS_DEBUG */
+
+/* for easier debugging, do not make these inline functions */
+#define AuRwMustNoWaiters(rw)	AuDebugOn(rwsem_is_contended(&(rw)->rwsem))
+/* rwsem_is_locked() is unusable */
+#define AuRwMustReadLock(rw)	AuDebugOn(AuDbgCnt(rw, rcnt) <= 0)
+#define AuRwMustWriteLock(rw)	AuDebugOn(AuDbgCnt(rw, wcnt) <= 0)
+#define AuRwMustAnyLock(rw)	AuDebugOn(AuDbgCnt(rw, rcnt) <= 0	\
+					  && AuDbgCnt(rw, wcnt) <= 0)
+#define AuRwDestroy(rw)		AuDebugOn(AuDbgCnt(rw, rcnt)		\
+					  || AuDbgCnt(rw, wcnt))
+
+#define au_rw_init(rw) do {			\
+		AuDbgCntInit(rw);		\
+		init_rwsem(&(rw)->rwsem);	\
+		au_lockdep_set_name(rw);	\
+	} while (0)
+
+#define au_rw_init_wlock(rw) do {		\
+		au_rw_init(rw);			\
+		down_write(&(rw)->rwsem);	\
+		AuDbgWcntInc(rw);		\
+	} while (0)
+
+#define au_rw_init_wlock_nested(rw, lsc) do { \
+		au_rw_init(rw);				\
+		down_write_nested(&(rw)->rwsem, lsc);	\
+		AuDbgWcntInc(rw);			\
+	} while (0)
+
+static inline void au_rw_read_lock(struct au_rwsem *rw)
+{
+	down_read(&rw->rwsem);
+	AuDbgRcntInc(rw);
+}
+
+static inline void au_rw_read_lock_nested(struct au_rwsem *rw, unsigned int lsc)
+{
+	down_read_nested(&rw->rwsem, lsc);
+	AuDbgRcntInc(rw);
+}
+
+static inline void au_rw_read_unlock(struct au_rwsem *rw)
+{
+	AuRwMustReadLock(rw);
+	AuDbgRcntDec(rw);
+	up_read(&rw->rwsem);
+}
+
+static inline void au_rw_dgrade_lock(struct au_rwsem *rw)
+{
+	AuRwMustWriteLock(rw);
+	AuDbgRcntInc(rw);
+	AuDbgWcntDec(rw);
+	downgrade_write(&rw->rwsem);
+}
+
+static inline void au_rw_write_lock(struct au_rwsem *rw)
+{
+	down_write(&rw->rwsem);
+	AuDbgWcntInc(rw);
+}
+
+static inline void au_rw_write_lock_nested(struct au_rwsem *rw,
+					   unsigned int lsc)
+{
+	down_write_nested(&rw->rwsem, lsc);
+	AuDbgWcntInc(rw);
+}
+
+static inline void au_rw_write_unlock(struct au_rwsem *rw)
+{
+	AuRwMustWriteLock(rw);
+	AuDbgWcntDec(rw);
+	up_write(&rw->rwsem);
+}
+
+/* why is the _nested version not defined? */
+static inline int au_rw_read_trylock(struct au_rwsem *rw)
+{
+	int ret;
+
+	ret = down_read_trylock(&rw->rwsem);
+	if (ret)
+		AuDbgRcntInc(rw);
+	return ret;
+}
+
+static inline int au_rw_write_trylock(struct au_rwsem *rw)
+{
+	int ret;
+
+	ret = down_write_trylock(&rw->rwsem);
+	if (ret)
+		AuDbgWcntInc(rw);
+	return ret;
+}
+
+#undef AuDbgCntDec
+#undef AuDbgRcntInc
+#undef AuDbgRcntDec
+#undef AuDbgWcntDec
+
+#define AuSimpleLockRwsemFuncs(prefix, param, rwsem) \
+static inline void prefix##_read_lock(param) \
+{ au_rw_read_lock(rwsem); } \
+static inline void prefix##_write_lock(param) \
+{ au_rw_write_lock(rwsem); } \
+static inline int prefix##_read_trylock(param) \
+{ return au_rw_read_trylock(rwsem); } \
+static inline int prefix##_write_trylock(param) \
+{ return au_rw_write_trylock(rwsem); }
+/* why is the _nested version not defined? */
+/* static inline void prefix##_read_trylock_nested(param, lsc)
+{ au_rw_read_trylock_nested(rwsem, lsc); }
+static inline void prefix##_write_trylock_nested(param, lsc)
+{ au_rw_write_trylock_nested(rwsem, lsc); } */
+
+#define AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) \
+static inline void prefix##_read_unlock(param) \
+{ au_rw_read_unlock(rwsem); } \
+static inline void prefix##_write_unlock(param) \
+{ au_rw_write_unlock(rwsem); } \
+static inline void prefix##_downgrade_lock(param) \
+{ au_rw_dgrade_lock(rwsem); }
+
+#define AuSimpleRwsemFuncs(prefix, param, rwsem) \
+	AuSimpleLockRwsemFuncs(prefix, param, rwsem) \
+	AuSimpleUnlockRwsemFuncs(prefix, param, rwsem)
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_RWSEM_H__ */
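
To make the wrapper-generating macros at the end of rwsem.h concrete, this is
roughly what AuSimpleRwsemFuncs() expands to for a made-up container type; the
struct name and prefix here are illustrative only, and super.h below
instantiates the same macro for the sbinfo rwsem.

	/* illustration only: AuSimpleRwsemFuncs(foo, struct foo *f, &f->rw)
	 * with a hypothetical container type; assumes "rwsem.h" is included. */
	struct foo {
		struct au_rwsem	rw;
	};

	/* generated by AuSimpleLockRwsemFuncs(foo, ...) */
	static inline void foo_read_lock(struct foo *f)
	{ au_rw_read_lock(&f->rw); }
	static inline void foo_write_lock(struct foo *f)
	{ au_rw_write_lock(&f->rw); }
	static inline int foo_read_trylock(struct foo *f)
	{ return au_rw_read_trylock(&f->rw); }
	static inline int foo_write_trylock(struct foo *f)
	{ return au_rw_write_trylock(&f->rw); }

	/* generated by AuSimpleUnlockRwsemFuncs(foo, ...) */
	static inline void foo_read_unlock(struct foo *f)
	{ au_rw_read_unlock(&f->rw); }
	static inline void foo_write_unlock(struct foo *f)
	{ au_rw_write_unlock(&f->rw); }
	static inline void foo_downgrade_lock(struct foo *f)
	{ au_rw_dgrade_lock(&f->rw); }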
diff --git b/fs/aufs/sbinfo.c b/fs/aufs/sbinfo.c
new file mode 100644
index 0000000..e113b51
--- /dev/null
+++ b/fs/aufs/sbinfo.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * superblock private data
+ */
+
+#include "aufs.h"
+
+/*
+ * these are necessary even if sysfs is disabled.
+ */
+void au_si_free(struct kobject *kobj)
+{
+	int i;
+	struct au_sbinfo *sbinfo;
+	char *locked __maybe_unused; /* debug only */
+
+	sbinfo = container_of(kobj, struct au_sbinfo, si_kobj);
+	for (i = 0; i < AuPlink_NHASH; i++)
+		AuDebugOn(!hlist_empty(&sbinfo->si_plink[i].head));
+	AuDebugOn(atomic_read(&sbinfo->si_nowait.nw_len));
+
+	AuDebugOn(percpu_counter_sum(&sbinfo->si_ninodes));
+	percpu_counter_destroy(&sbinfo->si_ninodes);
+	AuDebugOn(percpu_counter_sum(&sbinfo->si_nfiles));
+	percpu_counter_destroy(&sbinfo->si_nfiles);
+
+	au_rw_write_lock(&sbinfo->si_rwsem);
+	au_br_free(sbinfo);
+	au_rw_write_unlock(&sbinfo->si_rwsem);
+
+	au_delayed_kfree(sbinfo->si_branch);
+	for (i = 0; i < AU_NPIDMAP; i++)
+		if (sbinfo->au_si_pid.pid_bitmap[i])
+			au_delayed_kfree(sbinfo->au_si_pid.pid_bitmap[i]);
+	mutex_destroy(&sbinfo->au_si_pid.pid_mtx);
+	mutex_destroy(&sbinfo->si_xib_mtx);
+	AuRwDestroy(&sbinfo->si_rwsem);
+
+	au_delayed_kfree(sbinfo);
+}
+
+int au_si_alloc(struct super_block *sb)
+{
+	int err, i;
+	struct au_sbinfo *sbinfo;
+
+	err = -ENOMEM;
+	sbinfo = kzalloc(sizeof(*sbinfo), GFP_NOFS);
+	if (unlikely(!sbinfo))
+		goto out;
+
+	/* will be reallocated separately */
+	sbinfo->si_branch = kzalloc(sizeof(*sbinfo->si_branch), GFP_NOFS);
+	if (unlikely(!sbinfo->si_branch))
+		goto out_sbinfo;
+
+	err = sysaufs_si_init(sbinfo);
+	if (unlikely(err))
+		goto out_br;
+
+	au_nwt_init(&sbinfo->si_nowait);
+	au_rw_init_wlock(&sbinfo->si_rwsem);
+	mutex_init(&sbinfo->au_si_pid.pid_mtx);
+
+	percpu_counter_init(&sbinfo->si_ninodes, 0, GFP_NOFS);
+	percpu_counter_init(&sbinfo->si_nfiles, 0, GFP_NOFS);
+
+	sbinfo->si_bbot = -1;
+	sbinfo->si_last_br_id = AUFS_BRANCH_MAX / 2;
+
+	sbinfo->si_wbr_copyup = AuWbrCopyup_Def;
+	sbinfo->si_wbr_create = AuWbrCreate_Def;
+	sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + sbinfo->si_wbr_copyup;
+	sbinfo->si_wbr_create_ops = au_wbr_create_ops + sbinfo->si_wbr_create;
+
+	au_fhsm_init(sbinfo);
+
+	sbinfo->si_mntflags = au_opts_plink(AuOpt_Def);
+
+	sbinfo->si_xino_jiffy = jiffies;
+	sbinfo->si_xino_expire
+		= msecs_to_jiffies(AUFS_XINO_DEF_SEC * MSEC_PER_SEC);
+	mutex_init(&sbinfo->si_xib_mtx);
+	sbinfo->si_xino_brid = -1;
+	/* leave si_xib_last_pindex and si_xib_next_bit */
+
+	au_sphl_init(&sbinfo->si_aopen);
+
+	sbinfo->si_rdcache = msecs_to_jiffies(AUFS_RDCACHE_DEF * MSEC_PER_SEC);
+	sbinfo->si_rdblk = AUFS_RDBLK_DEF;
+	sbinfo->si_rdhash = AUFS_RDHASH_DEF;
+	sbinfo->si_dirwh = AUFS_DIRWH_DEF;
+
+	for (i = 0; i < AuPlink_NHASH; i++)
+		au_sphl_init(sbinfo->si_plink + i);
+	init_waitqueue_head(&sbinfo->si_plink_wq);
+	spin_lock_init(&sbinfo->si_plink_maint_lock);
+
+	au_sphl_init(&sbinfo->si_files);
+
+	/* with getattr by default */
+	sbinfo->si_iop_array = aufs_iop;
+
+	/* leave other members for sysaufs and si_mnt. */
+	sbinfo->si_sb = sb;
+	sb->s_fs_info = sbinfo;
+	si_pid_set(sb);
+	return 0; /* success */
+
+out_br:
+	au_delayed_kfree(sbinfo->si_branch);
+out_sbinfo:
+	au_delayed_kfree(sbinfo);
+out:
+	return err;
+}
+
+int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr, int may_shrink)
+{
+	int err, sz;
+	struct au_branch **brp;
+
+	AuRwMustWriteLock(&sbinfo->si_rwsem);
+
+	err = -ENOMEM;
+	sz = sizeof(*brp) * (sbinfo->si_bbot + 1);
+	if (unlikely(!sz))
+		sz = sizeof(*brp);
+	brp = au_kzrealloc(sbinfo->si_branch, sz, sizeof(*brp) * nbr, GFP_NOFS,
+			   may_shrink);
+	if (brp) {
+		sbinfo->si_branch = brp;
+		err = 0;
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+unsigned int au_sigen_inc(struct super_block *sb)
+{
+	unsigned int gen;
+	struct inode *inode;
+
+	SiMustWriteLock(sb);
+
+	gen = ++au_sbi(sb)->si_generation;
+	au_update_digen(sb->s_root);
+	inode = d_inode(sb->s_root);
+	au_update_iigen(inode, /*half*/0);
+	inode->i_version++;
+	return gen;
+}
+
+aufs_bindex_t au_new_br_id(struct super_block *sb)
+{
+	aufs_bindex_t br_id;
+	int i;
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	for (i = 0; i <= AUFS_BRANCH_MAX; i++) {
+		br_id = ++sbinfo->si_last_br_id;
+		AuDebugOn(br_id < 0);
+		if (br_id && au_br_index(sb, br_id) < 0)
+			return br_id;
+	}
+
+	return -1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* it is ok that new 'nwt' tasks are appended while we are sleeping */
+int si_read_lock(struct super_block *sb, int flags)
+{
+	int err;
+
+	err = 0;
+	if (au_ftest_lock(flags, FLUSH))
+		au_nwt_flush(&au_sbi(sb)->si_nowait);
+
+	si_noflush_read_lock(sb);
+	err = au_plink_maint(sb, flags);
+	if (unlikely(err))
+		si_read_unlock(sb);
+
+	return err;
+}
+
+int si_write_lock(struct super_block *sb, int flags)
+{
+	int err;
+
+	if (au_ftest_lock(flags, FLUSH))
+		au_nwt_flush(&au_sbi(sb)->si_nowait);
+
+	si_noflush_write_lock(sb);
+	err = au_plink_maint(sb, flags);
+	if (unlikely(err))
+		si_write_unlock(sb);
+
+	return err;
+}
+
+/* dentry and super_block lock. call at entry point */
+int aufs_read_lock(struct dentry *dentry, int flags)
+{
+	int err;
+	struct super_block *sb;
+
+	sb = dentry->d_sb;
+	err = si_read_lock(sb, flags);
+	if (unlikely(err))
+		goto out;
+
+	if (au_ftest_lock(flags, DW))
+		di_write_lock_child(dentry);
+	else
+		di_read_lock_child(dentry, flags);
+
+	if (au_ftest_lock(flags, GEN)) {
+		err = au_digen_test(dentry, au_sigen(sb));
+		if (!au_opt_test(au_mntflags(sb), UDBA_NONE))
+			AuDebugOn(!err && au_dbrange_test(dentry));
+		else if (!err)
+			err = au_dbrange_test(dentry);
+		if (unlikely(err))
+			aufs_read_unlock(dentry, flags);
+	}
+
+out:
+	return err;
+}
+
+void aufs_read_unlock(struct dentry *dentry, int flags)
+{
+	if (au_ftest_lock(flags, DW))
+		di_write_unlock(dentry);
+	else
+		di_read_unlock(dentry, flags);
+	si_read_unlock(dentry->d_sb);
+}
+
+void aufs_write_lock(struct dentry *dentry)
+{
+	si_write_lock(dentry->d_sb, AuLock_FLUSH | AuLock_NOPLMW);
+	di_write_lock_child(dentry);
+}
+
+void aufs_write_unlock(struct dentry *dentry)
+{
+	di_write_unlock(dentry);
+	si_write_unlock(dentry->d_sb);
+}
+
+int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags)
+{
+	int err;
+	unsigned int sigen;
+	struct super_block *sb;
+
+	sb = d1->d_sb;
+	err = si_read_lock(sb, flags);
+	if (unlikely(err))
+		goto out;
+
+	di_write_lock2_child(d1, d2, au_ftest_lock(flags, DIRS));
+
+	if (au_ftest_lock(flags, GEN)) {
+		sigen = au_sigen(sb);
+		err = au_digen_test(d1, sigen);
+		AuDebugOn(!err && au_dbrange_test(d1));
+		if (!err) {
+			err = au_digen_test(d2, sigen);
+			AuDebugOn(!err && au_dbrange_test(d2));
+		}
+		if (unlikely(err))
+			aufs_read_and_write_unlock2(d1, d2);
+	}
+
+out:
+	return err;
+}
+
+void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2)
+{
+	di_write_unlock2(d1, d2);
+	si_read_unlock(d1->d_sb);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void si_pid_alloc(struct au_si_pid *au_si_pid, int idx)
+{
+	unsigned long *p;
+
+	BUILD_BUG_ON(sizeof(unsigned long) !=
+		     sizeof(*au_si_pid->pid_bitmap));
+
+	mutex_lock(&au_si_pid->pid_mtx);
+	p = au_si_pid->pid_bitmap[idx];
+	while (!p) {
+		/*
+		 * bad approach.
+		 * but keeping 'si_pid_set()' void is more important.
+		 */
+		p = kcalloc(BITS_TO_LONGS(AU_PIDSTEP),
+			    sizeof(*au_si_pid->pid_bitmap),
+			    GFP_NOFS);
+		if (p)
+			break;
+		cond_resched();
+	}
+	au_si_pid->pid_bitmap[idx] = p;
+	mutex_unlock(&au_si_pid->pid_mtx);
+}
+
+void si_pid_set(struct super_block *sb)
+{
+	pid_t bit;
+	int idx;
+	unsigned long *bitmap;
+	struct au_si_pid *au_si_pid;
+
+	si_pid_idx_bit(&idx, &bit);
+	au_si_pid = &au_sbi(sb)->au_si_pid;
+	bitmap = au_si_pid->pid_bitmap[idx];
+	if (!bitmap) {
+		si_pid_alloc(au_si_pid, idx);
+		bitmap = au_si_pid->pid_bitmap[idx];
+	}
+	AuDebugOn(test_bit(bit, bitmap));
+	set_bit(bit, bitmap);
+	/* smp_mb(); */
+}
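
The per-pid bitmap filled in by si_pid_alloc()/si_pid_set() above is addressed
by splitting a pid into a chunk index and a bit offset (si_pid_idx_bit() in
super.h below). A rough sketch of that split follows; the concrete numbers in
the comment depend on the word size and PID limits of the running kernel and
are assumptions, not guarantees.

	/*
	 * Sketch only: the same split as si_pid_idx_bit() in super.h.
	 * Pids are 1-origin, the bitmap is 0-origin, and each lazily
	 * allocated chunk covers AU_PIDSTEP pids.
	 */
	static void pid_to_idx_bit(pid_t pid, int *idx, pid_t *bit)
	{
		*bit = pid - 1;			/* pid 1 maps to bit 0 */
		*idx = *bit / AU_PIDSTEP;	/* which pid_bitmap[] chunk */
		*bit %= AU_PIDSTEP;		/* bit offset inside that chunk */
	}

	/*
	 * Example assuming 64-bit longs and PID_MAX_DEFAULT == 32768
	 * (so AU_PIDSTEP == 32768): pid 40000 -> idx 1, bit 7231, and
	 * pid_bitmap[1] is allocated on first use by si_pid_alloc().
	 */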
diff --git b/fs/aufs/spl.h b/fs/aufs/spl.h
new file mode 100644
index 0000000..8f519db
--- /dev/null
+++ b/fs/aufs/spl.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * simple list protected by a spinlock
+ */
+
+#ifndef __AUFS_SPL_H__
+#define __AUFS_SPL_H__
+
+#ifdef __KERNEL__
+
+#if 0
+struct au_splhead {
+	spinlock_t		spin;
+	struct list_head	head;
+};
+
+static inline void au_spl_init(struct au_splhead *spl)
+{
+	spin_lock_init(&spl->spin);
+	INIT_LIST_HEAD(&spl->head);
+}
+
+static inline void au_spl_add(struct list_head *list, struct au_splhead *spl)
+{
+	spin_lock(&spl->spin);
+	list_add(list, &spl->head);
+	spin_unlock(&spl->spin);
+}
+
+static inline void au_spl_del(struct list_head *list, struct au_splhead *spl)
+{
+	spin_lock(&spl->spin);
+	list_del(list);
+	spin_unlock(&spl->spin);
+}
+
+static inline void au_spl_del_rcu(struct list_head *list,
+				  struct au_splhead *spl)
+{
+	spin_lock(&spl->spin);
+	list_del_rcu(list);
+	spin_unlock(&spl->spin);
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+struct au_sphlhead {
+	spinlock_t		spin;
+	struct hlist_head	head;
+};
+
+static inline void au_sphl_init(struct au_sphlhead *sphl)
+{
+	spin_lock_init(&sphl->spin);
+	INIT_HLIST_HEAD(&sphl->head);
+}
+
+static inline void au_sphl_add(struct hlist_node *hlist,
+			       struct au_sphlhead *sphl)
+{
+	spin_lock(&sphl->spin);
+	hlist_add_head(hlist, &sphl->head);
+	spin_unlock(&sphl->spin);
+}
+
+static inline void au_sphl_del(struct hlist_node *hlist,
+			       struct au_sphlhead *sphl)
+{
+	spin_lock(&sphl->spin);
+	hlist_del(hlist);
+	spin_unlock(&sphl->spin);
+}
+
+static inline void au_sphl_del_rcu(struct hlist_node *hlist,
+				   struct au_sphlhead *sphl)
+{
+	spin_lock(&sphl->spin);
+	hlist_del_rcu(hlist);
+	spin_unlock(&sphl->spin);
+}
+
+static inline unsigned long au_sphl_count(struct au_sphlhead *sphl)
+{
+	unsigned long cnt;
+	struct hlist_node *pos;
+
+	cnt = 0;
+	spin_lock(&sphl->spin);
+	hlist_for_each(pos, &sphl->head)
+		cnt++;
+	spin_unlock(&sphl->spin);
+	return cnt;
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_SPL_H__ */
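
A small usage sketch of the spinlock-protected hlist above; the element type is
made up for illustration and it assumes <linux/slab.h> and this header are
included. aufs hooks its pseudo-link and open-file lists onto au_sphlhead in
the same way.

	/* illustration only: a hypothetical element type on an au_sphlhead */
	struct foo_ent {
		int			val;
		struct hlist_node	hnode;
	};

	static struct au_sphlhead foo_list;

	static void foo_example(void)
	{
		struct foo_ent *ent;

		au_sphl_init(&foo_list);

		ent = kmalloc(sizeof(*ent), GFP_NOFS);
		if (!ent)
			return;
		ent->val = 1;
		/* both helpers take foo_list.spin around the hlist operation */
		au_sphl_add(&ent->hnode, &foo_list);
		pr_info("%lu entries\n", au_sphl_count(&foo_list));
		au_sphl_del(&ent->hnode, &foo_list);
		kfree(ent);
	}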
diff --git b/fs/aufs/super.c b/fs/aufs/super.c
new file mode 100644
index 0000000..58a773c
--- /dev/null
+++ b/fs/aufs/super.c
@@ -0,0 +1,1025 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * mount and super_block operations
+ */
+
+#include <linux/mm.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/vmalloc.h>
+#include "aufs.h"
+
+/*
+ * super_operations
+ */
+static struct inode *aufs_alloc_inode(struct super_block *sb __maybe_unused)
+{
+	struct au_icntnr *c;
+
+	c = au_cache_alloc_icntnr();
+	if (c) {
+		au_icntnr_init(c);
+		c->vfs_inode.i_version = 1; /* sigen(sb); */
+		c->iinfo.ii_hinode = NULL;
+		return &c->vfs_inode;
+	}
+	return NULL;
+}
+
+static void aufs_destroy_inode_cb(struct rcu_head *head)
+{
+	struct inode *inode = container_of(head, struct inode, i_rcu);
+
+	au_cache_dfree_icntnr(container_of(inode, struct au_icntnr, vfs_inode));
+}
+
+static void aufs_destroy_inode(struct inode *inode)
+{
+	if (!au_is_bad_inode(inode))
+		au_iinfo_fin(inode);
+	call_rcu(&inode->i_rcu, aufs_destroy_inode_cb);
+}
+
+struct inode *au_iget_locked(struct super_block *sb, ino_t ino)
+{
+	struct inode *inode;
+	int err;
+
+	inode = iget_locked(sb, ino);
+	if (unlikely(!inode)) {
+		inode = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	if (!(inode->i_state & I_NEW))
+		goto out;
+
+	err = au_xigen_new(inode);
+	if (!err)
+		err = au_iinfo_init(inode);
+	if (!err)
+		inode->i_version++;
+	else {
+		iget_failed(inode);
+		inode = ERR_PTR(err);
+	}
+
+out:
+	/* never return NULL */
+	AuDebugOn(!inode);
+	AuTraceErrPtr(inode);
+	return inode;
+}
+
+/* lock free root dinfo */
+static int au_show_brs(struct seq_file *seq, struct super_block *sb)
+{
+	int err;
+	aufs_bindex_t bindex, bbot;
+	struct path path;
+	struct au_hdentry *hdp;
+	struct au_branch *br;
+	au_br_perm_str_t perm;
+
+	err = 0;
+	bbot = au_sbbot(sb);
+	bindex = 0;
+	hdp = au_hdentry(au_di(sb->s_root), bindex);
+	for (; !err && bindex <= bbot; bindex++, hdp++) {
+		br = au_sbr(sb, bindex);
+		path.mnt = au_br_mnt(br);
+		path.dentry = hdp->hd_dentry;
+		err = au_seq_path(seq, &path);
+		if (!err) {
+			au_optstr_br_perm(&perm, br->br_perm);
+			seq_printf(seq, "=%s", perm.a);
+			if (bindex != bbot)
+				seq_putc(seq, ':');
+		}
+	}
+	if (unlikely(err || seq_has_overflowed(seq)))
+		err = -E2BIG;
+
+	return err;
+}
+
+static void au_show_wbr_create(struct seq_file *m, int v,
+			       struct au_sbinfo *sbinfo)
+{
+	const char *pat;
+
+	AuRwMustAnyLock(&sbinfo->si_rwsem);
+
+	seq_puts(m, ",create=");
+	pat = au_optstr_wbr_create(v);
+	switch (v) {
+	case AuWbrCreate_TDP:
+	case AuWbrCreate_RR:
+	case AuWbrCreate_MFS:
+	case AuWbrCreate_PMFS:
+		seq_puts(m, pat);
+		break;
+	case AuWbrCreate_MFSV:
+		seq_printf(m, /*pat*/"mfs:%lu",
+			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
+			   / MSEC_PER_SEC);
+		break;
+	case AuWbrCreate_PMFSV:
+		seq_printf(m, /*pat*/"pmfs:%lu",
+			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
+			   / MSEC_PER_SEC);
+		break;
+	case AuWbrCreate_MFSRR:
+		seq_printf(m, /*pat*/"mfsrr:%llu",
+			   sbinfo->si_wbr_mfs.mfsrr_watermark);
+		break;
+	case AuWbrCreate_MFSRRV:
+		seq_printf(m, /*pat*/"mfsrr:%llu:%lu",
+			   sbinfo->si_wbr_mfs.mfsrr_watermark,
+			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
+			   / MSEC_PER_SEC);
+		break;
+	case AuWbrCreate_PMFSRR:
+		seq_printf(m, /*pat*/"pmfsrr:%llu",
+			   sbinfo->si_wbr_mfs.mfsrr_watermark);
+		break;
+	case AuWbrCreate_PMFSRRV:
+		seq_printf(m, /*pat*/"pmfsrr:%llu:%lu",
+			   sbinfo->si_wbr_mfs.mfsrr_watermark,
+			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
+			   / MSEC_PER_SEC);
+		break;
+	}
+}
+
+static int au_show_xino(struct seq_file *seq, struct super_block *sb)
+{
+#ifdef CONFIG_SYSFS
+	return 0;
+#else
+	int err;
+	const int len = sizeof(AUFS_XINO_FNAME) - 1;
+	aufs_bindex_t bindex, brid;
+	struct qstr *name;
+	struct file *f;
+	struct dentry *d, *h_root;
+
+	AuRwMustAnyLock(&au_sbi(sb)->si_rwsem);
+
+	err = 0;
+	f = au_sbi(sb)->si_xib;
+	if (!f)
+		goto out;
+
+	/* skip printing the default xino path on the first writable branch */
+	h_root = NULL;
+	brid = au_xino_brid(sb);
+	if (brid >= 0) {
+		bindex = au_br_index(sb, brid);
+		h_root = au_hdentry(au_di(sb->s_root), bindex)->hd_dentry;
+	}
+	d = f->f_path.dentry;
+	name = &d->d_name;
+	/* safe ->d_parent because the file is unlinked */
+	if (d->d_parent == h_root
+	    && name->len == len
+	    && !memcmp(name->name, AUFS_XINO_FNAME, len))
+		goto out;
+
+	seq_puts(seq, ",xino=");
+	err = au_xino_path(seq, f);
+
+out:
+	return err;
+#endif
+}
+
+/* seq_file will call us again if the string was too long */
+static int aufs_show_options(struct seq_file *m, struct dentry *dentry)
+{
+	int err;
+	unsigned int mnt_flags, v;
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+
+#define AuBool(name, str) do { \
+	v = au_opt_test(mnt_flags, name); \
+	if (v != au_opt_test(AuOpt_Def, name)) \
+		seq_printf(m, ",%s" #str, v ? "" : "no"); \
+} while (0)
+
+#define AuStr(name, str) do { \
+	v = mnt_flags & AuOptMask_##name; \
+	if (v != (AuOpt_Def & AuOptMask_##name)) \
+		seq_printf(m, "," #str "=%s", au_optstr_##str(v)); \
+} while (0)
+
+#define AuUInt(name, str, val) do { \
+	if (val != AUFS_##name##_DEF) \
+		seq_printf(m, "," #str "=%u", val); \
+} while (0)
+
+	sb = dentry->d_sb;
+	if (sb->s_flags & MS_POSIXACL)
+		seq_puts(m, ",acl");
+
+	/* lock free root dinfo */
+	si_noflush_read_lock(sb);
+	sbinfo = au_sbi(sb);
+	seq_printf(m, ",si=%lx", sysaufs_si_id(sbinfo));
+
+	mnt_flags = au_mntflags(sb);
+	if (au_opt_test(mnt_flags, XINO)) {
+		err = au_show_xino(m, sb);
+		if (unlikely(err))
+			goto out;
+	} else
+		seq_puts(m, ",noxino");
+
+	AuBool(TRUNC_XINO, trunc_xino);
+	AuStr(UDBA, udba);
+	AuBool(SHWH, shwh);
+	AuBool(PLINK, plink);
+	AuBool(DIO, dio);
+	AuBool(DIRPERM1, dirperm1);
+
+	v = sbinfo->si_wbr_create;
+	if (v != AuWbrCreate_Def)
+		au_show_wbr_create(m, v, sbinfo);
+
+	v = sbinfo->si_wbr_copyup;
+	if (v != AuWbrCopyup_Def)
+		seq_printf(m, ",cpup=%s", au_optstr_wbr_copyup(v));
+
+	v = au_opt_test(mnt_flags, ALWAYS_DIROPQ);
+	if (v != au_opt_test(AuOpt_Def, ALWAYS_DIROPQ))
+		seq_printf(m, ",diropq=%c", v ? 'a' : 'w');
+
+	AuUInt(DIRWH, dirwh, sbinfo->si_dirwh);
+
+	v = jiffies_to_msecs(sbinfo->si_rdcache) / MSEC_PER_SEC;
+	AuUInt(RDCACHE, rdcache, v);
+
+	AuUInt(RDBLK, rdblk, sbinfo->si_rdblk);
+	AuUInt(RDHASH, rdhash, sbinfo->si_rdhash);
+
+	au_fhsm_show(m, sbinfo);
+
+	AuBool(SUM, sum);
+	/* AuBool(SUM_W, wsum); */
+	AuBool(WARN_PERM, warn_perm);
+	AuBool(VERBOSE, verbose);
+
+out:
+	/* be sure to print "br:" last */
+	if (!sysaufs_brs) {
+		seq_puts(m, ",br:");
+		au_show_brs(m, sb);
+	}
+	si_read_unlock(sb);
+	return 0;
+
+#undef AuBool
+#undef AuStr
+#undef AuUInt
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* sum mode which returns the summation for statfs(2) */
+
+static u64 au_add_till_max(u64 a, u64 b)
+{
+	u64 old;
+
+	old = a;
+	a += b;
+	if (old <= a)
+		return a;
+	return ULLONG_MAX;
+}
+
+static u64 au_mul_till_max(u64 a, long mul)
+{
+	u64 old;
+
+	old = a;
+	a *= mul;
+	if (old <= a)
+		return a;
+	return ULLONG_MAX;
+}
+
+static int au_statfs_sum(struct super_block *sb, struct kstatfs *buf)
+{
+	int err;
+	long bsize, factor;
+	u64 blocks, bfree, bavail, files, ffree;
+	aufs_bindex_t bbot, bindex, i;
+	unsigned char shared;
+	struct path h_path;
+	struct super_block *h_sb;
+
+	err = 0;
+	bsize = LONG_MAX;
+	files = 0;
+	ffree = 0;
+	blocks = 0;
+	bfree = 0;
+	bavail = 0;
+	bbot = au_sbbot(sb);
+	for (bindex = 0; bindex <= bbot; bindex++) {
+		h_path.mnt = au_sbr_mnt(sb, bindex);
+		h_sb = h_path.mnt->mnt_sb;
+		shared = 0;
+		for (i = 0; !shared && i < bindex; i++)
+			shared = (au_sbr_sb(sb, i) == h_sb);
+		if (shared)
+			continue;
+
+		/* sb->s_root for NFS is unreliable */
+		h_path.dentry = h_path.mnt->mnt_root;
+		err = vfs_statfs(&h_path, buf);
+		if (unlikely(err))
+			goto out;
+
+		if (bsize > buf->f_bsize) {
+			/*
+			 * we will reduce bsize, so we have to expand blocks
+			 * etc. to match them again
+			 */
+			factor = (bsize / buf->f_bsize);
+			blocks = au_mul_till_max(blocks, factor);
+			bfree = au_mul_till_max(bfree, factor);
+			bavail = au_mul_till_max(bavail, factor);
+			bsize = buf->f_bsize;
+		}
+
+		factor = (buf->f_bsize / bsize);
+		blocks = au_add_till_max(blocks,
+				au_mul_till_max(buf->f_blocks, factor));
+		bfree = au_add_till_max(bfree,
+				au_mul_till_max(buf->f_bfree, factor));
+		bavail = au_add_till_max(bavail,
+				au_mul_till_max(buf->f_bavail, factor));
+		files = au_add_till_max(files, buf->f_files);
+		ffree = au_add_till_max(ffree, buf->f_ffree);
+	}
+
+	buf->f_bsize = bsize;
+	buf->f_blocks = blocks;
+	buf->f_bfree = bfree;
+	buf->f_bavail = bavail;
+	buf->f_files = files;
+	buf->f_ffree = ffree;
+	buf->f_frsize = 0;
+
+out:
+	return err;
+}
+
+static int aufs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	int err;
+	struct path h_path;
+	struct super_block *sb;
+
+	/* lock free root dinfo */
+	sb = dentry->d_sb;
+	si_noflush_read_lock(sb);
+	if (!au_opt_test(au_mntflags(sb), SUM)) {
+		/* sb->s_root for NFS is unreliable */
+		h_path.mnt = au_sbr_mnt(sb, 0);
+		h_path.dentry = h_path.mnt->mnt_root;
+		err = vfs_statfs(&h_path, buf);
+	} else
+		err = au_statfs_sum(sb, buf);
+	si_read_unlock(sb);
+
+	if (!err) {
+		buf->f_type = AUFS_SUPER_MAGIC;
+		buf->f_namelen = AUFS_MAX_NAMELEN;
+		memset(&buf->f_fsid, 0, sizeof(buf->f_fsid));
+	}
+	/* buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; */
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int aufs_sync_fs(struct super_block *sb, int wait)
+{
+	int err, e;
+	aufs_bindex_t bbot, bindex;
+	struct au_branch *br;
+	struct super_block *h_sb;
+
+	err = 0;
+	si_noflush_read_lock(sb);
+	bbot = au_sbbot(sb);
+	for (bindex = 0; bindex <= bbot; bindex++) {
+		br = au_sbr(sb, bindex);
+		if (!au_br_writable(br->br_perm))
+			continue;
+
+		h_sb = au_sbr_sb(sb, bindex);
+		if (h_sb->s_op->sync_fs) {
+			e = h_sb->s_op->sync_fs(h_sb, wait);
+			if (unlikely(e && !err))
+				err = e;
+			/* go on even if an error happens */
+		}
+	}
+	si_read_unlock(sb);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* final actions when unmounting a file system */
+static void aufs_put_super(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+
+	sbinfo = au_sbi(sb);
+	if (!sbinfo)
+		return;
+
+	dbgaufs_si_fin(sbinfo);
+	kobject_put(&sbinfo->si_kobj);
+}
+
+/* ---------------------------------------------------------------------- */
+
+void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb,
+		     struct super_block *sb, void *arg)
+{
+	void *array;
+	unsigned long long n, sz;
+
+	array = NULL;
+	n = 0;
+	if (!*hint)
+		goto out;
+
+	if (*hint > ULLONG_MAX / sizeof(array)) {
+		array = ERR_PTR(-EMFILE);
+		pr_err("hint %llu\n", *hint);
+		goto out;
+	}
+
+	sz = sizeof(array) * *hint;
+	array = kzalloc(sz, GFP_NOFS);
+	if (unlikely(!array))
+		array = vzalloc(sz);
+	if (unlikely(!array)) {
+		array = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	n = cb(sb, array, *hint, arg);
+	AuDebugOn(n > *hint);
+
+out:
+	*hint = n;
+	return array;
+}
+
+static unsigned long long au_iarray_cb(struct super_block *sb, void *a,
+				       unsigned long long max __maybe_unused,
+				       void *arg)
+{
+	unsigned long long n;
+	struct inode **p, *inode;
+	struct list_head *head;
+
+	n = 0;
+	p = a;
+	head = arg;
+	spin_lock(&sb->s_inode_list_lock);
+	list_for_each_entry(inode, head, i_sb_list) {
+		if (!au_is_bad_inode(inode)
+		    && au_ii(inode)->ii_btop >= 0) {
+			spin_lock(&inode->i_lock);
+			if (atomic_read(&inode->i_count)) {
+				au_igrab(inode);
+				*p++ = inode;
+				n++;
+				AuDebugOn(n > max);
+			}
+			spin_unlock(&inode->i_lock);
+		}
+	}
+	spin_unlock(&sb->s_inode_list_lock);
+
+	return n;
+}
+
+struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max)
+{
+	*max = au_ninodes(sb);
+	return au_array_alloc(max, au_iarray_cb, sb, &sb->s_inodes);
+}
+
+void au_iarray_free(struct inode **a, unsigned long long max)
+{
+	unsigned long long ull;
+
+	for (ull = 0; ull < max; ull++)
+		iput(a[ull]);
+	kvfree(a);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * refresh dentry and inode at remount time.
+ */
+/* todo: consolidate with simple_reval_dpath() and au_reval_for_attr() */
+static int au_do_refresh(struct dentry *dentry, unsigned int dir_flags,
+		      struct dentry *parent)
+{
+	int err;
+
+	di_write_lock_child(dentry);
+	di_read_lock_parent(parent, AuLock_IR);
+	err = au_refresh_dentry(dentry, parent);
+	if (!err && dir_flags)
+		au_hn_reset(d_inode(dentry), dir_flags);
+	di_read_unlock(parent, AuLock_IR);
+	di_write_unlock(dentry);
+
+	return err;
+}
+
+static int au_do_refresh_d(struct dentry *dentry, unsigned int sigen,
+			   struct au_sbinfo *sbinfo,
+			   const unsigned int dir_flags, unsigned int do_idop)
+{
+	int err;
+	struct dentry *parent;
+
+	err = 0;
+	parent = dget_parent(dentry);
+	if (!au_digen_test(parent, sigen) && au_digen_test(dentry, sigen)) {
+		if (d_really_is_positive(dentry)) {
+			if (!d_is_dir(dentry))
+				err = au_do_refresh(dentry, /*dir_flags*/0,
+						 parent);
+			else {
+				err = au_do_refresh(dentry, dir_flags, parent);
+				if (unlikely(err))
+					au_fset_si(sbinfo, FAILED_REFRESH_DIR);
+			}
+		} else
+			err = au_do_refresh(dentry, /*dir_flags*/0, parent);
+		AuDbgDentry(dentry);
+	}
+	dput(parent);
+
+	if (!err) {
+		if (do_idop)
+			au_refresh_dop(dentry, /*force_reval*/0);
+	} else
+		au_refresh_dop(dentry, /*force_reval*/1);
+
+	AuTraceErr(err);
+	return err;
+}
+
+static int au_refresh_d(struct super_block *sb, unsigned int do_idop)
+{
+	int err, i, j, ndentry, e;
+	unsigned int sigen;
+	struct au_dcsub_pages dpages;
+	struct au_dpage *dpage;
+	struct dentry **dentries, *d;
+	struct au_sbinfo *sbinfo;
+	struct dentry *root = sb->s_root;
+	const unsigned int dir_flags = au_hi_flags(d_inode(root), /*isdir*/1);
+
+	if (do_idop)
+		au_refresh_dop(root, /*force_reval*/0);
+
+	err = au_dpages_init(&dpages, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+	err = au_dcsub_pages(&dpages, root, NULL, NULL);
+	if (unlikely(err))
+		goto out_dpages;
+
+	sigen = au_sigen(sb);
+	sbinfo = au_sbi(sb);
+	for (i = 0; i < dpages.ndpage; i++) {
+		dpage = dpages.dpages + i;
+		dentries = dpage->dentries;
+		ndentry = dpage->ndentry;
+		for (j = 0; j < ndentry; j++) {
+			d = dentries[j];
+			e = au_do_refresh_d(d, sigen, sbinfo, dir_flags,
+					    do_idop);
+			if (unlikely(e && !err))
+				err = e;
+			/* go on even if err */
+		}
+	}
+
+out_dpages:
+	au_dpages_free(&dpages);
+out:
+	return err;
+}
+
+static int au_refresh_i(struct super_block *sb, unsigned int do_idop)
+{
+	int err, e;
+	unsigned int sigen;
+	unsigned long long max, ull;
+	struct inode *inode, **array;
+
+	array = au_iarray_alloc(sb, &max);
+	err = PTR_ERR(array);
+	if (IS_ERR(array))
+		goto out;
+
+	err = 0;
+	sigen = au_sigen(sb);
+	for (ull = 0; ull < max; ull++) {
+		inode = array[ull];
+		if (unlikely(!inode))
+			break;
+
+		e = 0;
+		ii_write_lock_child(inode);
+		if (au_iigen(inode, NULL) != sigen) {
+			e = au_refresh_hinode_self(inode);
+			if (unlikely(e)) {
+				au_refresh_iop(inode, /*force_getattr*/1);
+				pr_err("error %d, i%lu\n", e, inode->i_ino);
+				if (!err)
+					err = e;
+				/* go on even if err */
+			}
+		}
+		if (!e && do_idop)
+			au_refresh_iop(inode, /*force_getattr*/0);
+		ii_write_unlock(inode);
+	}
+
+	au_iarray_free(array, max);
+
+out:
+	return err;
+}
+
+static void au_remount_refresh(struct super_block *sb, unsigned int do_idop)
+{
+	int err, e;
+	unsigned int udba;
+	aufs_bindex_t bindex, bbot;
+	struct dentry *root;
+	struct inode *inode;
+	struct au_branch *br;
+	struct au_sbinfo *sbi;
+
+	au_sigen_inc(sb);
+	sbi = au_sbi(sb);
+	au_fclr_si(sbi, FAILED_REFRESH_DIR);
+
+	root = sb->s_root;
+	DiMustNoWaiters(root);
+	inode = d_inode(root);
+	IiMustNoWaiters(inode);
+
+	udba = au_opt_udba(sb);
+	bbot = au_sbbot(sb);
+	for (bindex = 0; bindex <= bbot; bindex++) {
+		br = au_sbr(sb, bindex);
+		err = au_hnotify_reset_br(udba, br, br->br_perm);
+		if (unlikely(err))
+			AuIOErr("hnotify failed on br %d, %d, ignored\n",
+				bindex, err);
+		/* go on even if err */
+	}
+	au_hn_reset(inode, au_hi_flags(inode, /*isdir*/1));
+
+	if (do_idop) {
+		if (au_ftest_si(sbi, NO_DREVAL)) {
+			AuDebugOn(sb->s_d_op == &aufs_dop_noreval);
+			sb->s_d_op = &aufs_dop_noreval;
+			AuDebugOn(sbi->si_iop_array == aufs_iop_nogetattr);
+			sbi->si_iop_array = aufs_iop_nogetattr;
+		} else {
+			AuDebugOn(sb->s_d_op == &aufs_dop);
+			sb->s_d_op = &aufs_dop;
+			AuDebugOn(sbi->si_iop_array == aufs_iop);
+			sbi->si_iop_array = aufs_iop;
+		}
+		pr_info("reset to %pf and %pf\n",
+			sb->s_d_op, sbi->si_iop_array);
+	}
+
+	di_write_unlock(root);
+	err = au_refresh_d(sb, do_idop);
+	e = au_refresh_i(sb, do_idop);
+	if (unlikely(e && !err))
+		err = e;
+	/* aufs_write_lock() calls ..._child() */
+	di_write_lock_child(root);
+
+	au_cpup_attr_all(inode, /*force*/1);
+
+	if (unlikely(err))
+		AuIOErr("refresh failed, ignored, %d\n", err);
+}
+
+/* keep mount(8) from over-interpreting errno and showing strange error messages */
+static int cvt_err(int err)
+{
+	AuTraceErr(err);
+
+	switch (err) {
+	case -ENOENT:
+	case -ENOTDIR:
+	case -EEXIST:
+	case -EIO:
+		err = -EINVAL;
+	}
+	return err;
+}
+
+static int aufs_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	int err, do_dx;
+	unsigned int mntflags;
+	struct au_opts opts = {
+		.opt = NULL
+	};
+	struct dentry *root;
+	struct inode *inode;
+	struct au_sbinfo *sbinfo;
+
+	err = 0;
+	root = sb->s_root;
+	if (!data || !*data) {
+		err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+		if (!err) {
+			di_write_lock_child(root);
+			err = au_opts_verify(sb, *flags, /*pending*/0);
+			aufs_write_unlock(root);
+		}
+		goto out;
+	}
+
+	err = -ENOMEM;
+	opts.opt = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!opts.opt))
+		goto out;
+	opts.max_opt = PAGE_SIZE / sizeof(*opts.opt);
+	opts.flags = AuOpts_REMOUNT;
+	opts.sb_flags = *flags;
+
+	/* parse it before aufs lock */
+	err = au_opts_parse(sb, data, &opts);
+	if (unlikely(err))
+		goto out_opts;
+
+	sbinfo = au_sbi(sb);
+	inode = d_inode(root);
+	inode_lock(inode);
+	err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out_mtx;
+	di_write_lock_child(root);
+
+	/* au_opts_remount() may return an error */
+	err = au_opts_remount(sb, &opts);
+	au_opts_free(&opts);
+
+	if (au_ftest_opts(opts.flags, REFRESH))
+		au_remount_refresh(sb, au_ftest_opts(opts.flags, REFRESH_IDOP));
+
+	if (au_ftest_opts(opts.flags, REFRESH_DYAOP)) {
+		mntflags = au_mntflags(sb);
+		do_dx = !!au_opt_test(mntflags, DIO);
+		au_dy_arefresh(do_dx);
+	}
+
+	au_fhsm_wrote_all(sb, /*force*/1); /* ?? */
+	aufs_write_unlock(root);
+
+out_mtx:
+	inode_unlock(inode);
+out_opts:
+	au_delayed_free_page((unsigned long)opts.opt);
+out:
+	err = cvt_err(err);
+	AuTraceErr(err);
+	return err;
+}
+
+static const struct super_operations aufs_sop = {
+	.alloc_inode	= aufs_alloc_inode,
+	.destroy_inode	= aufs_destroy_inode,
+	/* always deleting, no clearing */
+	.drop_inode	= generic_delete_inode,
+	.show_options	= aufs_show_options,
+	.statfs		= aufs_statfs,
+	.put_super	= aufs_put_super,
+	.sync_fs	= aufs_sync_fs,
+	.remount_fs	= aufs_remount_fs
+};
+
+/* ---------------------------------------------------------------------- */
+
+static int alloc_root(struct super_block *sb)
+{
+	int err;
+	struct inode *inode;
+	struct dentry *root;
+
+	err = -ENOMEM;
+	inode = au_iget_locked(sb, AUFS_ROOT_INO);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out;
+
+	inode->i_op = aufs_iop + AuIop_DIR; /* with getattr by default */
+	inode->i_fop = &aufs_dir_fop;
+	inode->i_mode = S_IFDIR;
+	set_nlink(inode, 2);
+	unlock_new_inode(inode);
+
+	root = d_make_root(inode);
+	if (unlikely(!root))
+		goto out;
+	err = PTR_ERR(root);
+	if (IS_ERR(root))
+		goto out;
+
+	err = au_di_init(root);
+	if (!err) {
+		sb->s_root = root;
+		return 0; /* success */
+	}
+	dput(root);
+
+out:
+	return err;
+}
+
+static int aufs_fill_super(struct super_block *sb, void *raw_data,
+			   int silent __maybe_unused)
+{
+	int err;
+	struct au_opts opts = {
+		.opt = NULL
+	};
+	struct au_sbinfo *sbinfo;
+	struct dentry *root;
+	struct inode *inode;
+	char *arg = raw_data;
+
+	if (unlikely(!arg || !*arg)) {
+		err = -EINVAL;
+		pr_err("no arg\n");
+		goto out;
+	}
+
+	err = -ENOMEM;
+	opts.opt = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!opts.opt))
+		goto out;
+	opts.max_opt = PAGE_SIZE / sizeof(*opts.opt);
+	opts.sb_flags = sb->s_flags;
+
+	err = au_si_alloc(sb);
+	if (unlikely(err))
+		goto out_opts;
+	sbinfo = au_sbi(sb);
+
+	/* all timestamps always follow the ones on the branch */
+	sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
+	sb->s_op = &aufs_sop;
+	sb->s_d_op = &aufs_dop;
+	sb->s_magic = AUFS_SUPER_MAGIC;
+	sb->s_maxbytes = 0;
+	sb->s_stack_depth = 1;
+	au_export_init(sb);
+	/* au_xattr_init(sb); */
+
+	err = alloc_root(sb);
+	if (unlikely(err)) {
+		si_write_unlock(sb);
+		goto out_info;
+	}
+	root = sb->s_root;
+	inode = d_inode(root);
+
+	/*
+	 * actually we could parse the options here regardless of the aufs lock,
+	 * but at remount time parsing must be done before taking the aufs lock,
+	 * so we follow the same rule here.
+	 */
+	ii_write_lock_parent(inode);
+	aufs_write_unlock(root);
+	err = au_opts_parse(sb, arg, &opts);
+	if (unlikely(err))
+		goto out_root;
+
+	/* lock vfs_inode first, then aufs. */
+	inode_lock(inode);
+	aufs_write_lock(root);
+	err = au_opts_mount(sb, &opts);
+	au_opts_free(&opts);
+	if (!err && au_ftest_si(sbinfo, NO_DREVAL)) {
+		sb->s_d_op = &aufs_dop_noreval;
+		pr_info("%pf\n", sb->s_d_op);
+		au_refresh_dop(root, /*force_reval*/0);
+		sbinfo->si_iop_array = aufs_iop_nogetattr;
+		au_refresh_iop(inode, /*force_getattr*/0);
+	}
+	aufs_write_unlock(root);
+	inode_unlock(inode);
+	if (!err)
+		goto out_opts; /* success */
+
+out_root:
+	dput(root);
+	sb->s_root = NULL;
+out_info:
+	dbgaufs_si_fin(sbinfo);
+	kobject_put(&sbinfo->si_kobj);
+	sb->s_fs_info = NULL;
+out_opts:
+	au_delayed_free_page((unsigned long)opts.opt);
+out:
+	AuTraceErr(err);
+	err = cvt_err(err);
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static struct dentry *aufs_mount(struct file_system_type *fs_type, int flags,
+				 const char *dev_name __maybe_unused,
+				 void *raw_data)
+{
+	struct dentry *root;
+	struct super_block *sb;
+
+	/* all timestamps always follow the ones on the branch */
+	/* mnt->mnt_flags |= MNT_NOATIME | MNT_NODIRATIME; */
+	root = mount_nodev(fs_type, flags, raw_data, aufs_fill_super);
+	if (IS_ERR(root))
+		goto out;
+
+	sb = root->d_sb;
+	si_write_lock(sb, !AuLock_FLUSH);
+	sysaufs_brs_add(sb, 0);
+	si_write_unlock(sb);
+	au_sbilist_add(sb);
+
+out:
+	return root;
+}
+
+static void aufs_kill_sb(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+
+	sbinfo = au_sbi(sb);
+	if (sbinfo) {
+		au_sbilist_del(sb);
+		aufs_write_lock(sb->s_root);
+		au_fhsm_fin(sb);
+		if (sbinfo->si_wbr_create_ops->fin)
+			sbinfo->si_wbr_create_ops->fin(sb);
+		if (au_opt_test(sbinfo->si_mntflags, UDBA_HNOTIFY)) {
+			au_opt_set_udba(sbinfo->si_mntflags, UDBA_NONE);
+			au_remount_refresh(sb, /*do_idop*/0);
+		}
+		if (au_opt_test(sbinfo->si_mntflags, PLINK))
+			au_plink_put(sb, /*verbose*/1);
+		au_xino_clr(sb);
+		sbinfo->si_sb = NULL;
+		aufs_write_unlock(sb->s_root);
+		au_nwt_flush(&sbinfo->si_nowait);
+	}
+	kill_anon_super(sb);
+}
+
+struct file_system_type aufs_fs_type = {
+	.name		= AUFS_FSTYPE,
+	/* a race between rename and others */
+	.fs_flags	= FS_RENAME_DOES_D_MOVE,
+	.mount		= aufs_mount,
+	.kill_sb	= aufs_kill_sb,
+	/* no need to __module_get() and module_put(). */
+	.owner		= THIS_MODULE,
+};
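
For clarity on the 'sum' statfs path above: au_statfs_sum() keeps its running
totals in the smallest f_bsize seen so far, expanding what it has already
accumulated whenever a branch reports a smaller block size, while
au_add_till_max()/au_mul_till_max() clamp to ULLONG_MAX instead of wrapping.
A worked example with made-up numbers:

	/*
	 * Illustrative numbers only (not from a real mount):
	 *
	 *   branch 0: f_bsize = 4096, f_blocks = 1000
	 *     bsize = 4096, blocks = 0 + 1000 * (4096 / 4096) = 1000
	 *   branch 1: f_bsize = 1024, f_blocks = 500
	 *     bsize shrinks to 1024, so the running total is expanded first:
	 *       blocks = 1000 * (4096 / 1024) = 4000
	 *     then branch 1 is added:
	 *       blocks = 4000 + 500 * (1024 / 1024) = 4500
	 *
	 * 4500 blocks of 1024 bytes == 1000 * 4096 + 500 * 1024 bytes, so the
	 * reported totals stay consistent in the final (smallest) block size.
	 */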
diff --git b/fs/aufs/super.h b/fs/aufs/super.h
new file mode 100644
index 0000000..e4ac866
--- /dev/null
+++ b/fs/aufs/super.h
@@ -0,0 +1,625 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * super_block operations
+ */
+
+#ifndef __AUFS_SUPER_H__
+#define __AUFS_SUPER_H__
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>
+#include <linux/kobject.h>
+#include "rwsem.h"
+#include "spl.h"
+#include "wkq.h"
+
+/* policies to select one among multiple writable branches */
+struct au_wbr_copyup_operations {
+	int (*copyup)(struct dentry *dentry);
+};
+
+#define AuWbr_DIR	1		/* target is a dir */
+#define AuWbr_PARENT	(1 << 1)	/* always require a parent */
+
+#define au_ftest_wbr(flags, name)	((flags) & AuWbr_##name)
+#define au_fset_wbr(flags, name)	{ (flags) |= AuWbr_##name; }
+#define au_fclr_wbr(flags, name)	{ (flags) &= ~AuWbr_##name; }
+
+struct au_wbr_create_operations {
+	int (*create)(struct dentry *dentry, unsigned int flags);
+	int (*init)(struct super_block *sb);
+	int (*fin)(struct super_block *sb);
+};
+
+struct au_wbr_mfs {
+	struct mutex	mfs_lock; /* protect this structure */
+	unsigned long	mfs_jiffy;
+	unsigned long	mfs_expire;
+	aufs_bindex_t	mfs_bindex;
+
+	unsigned long long	mfsrr_bytes;
+	unsigned long long	mfsrr_watermark;
+};
+
+#define AuPlink_NHASH 100
+static inline int au_plink_hash(ino_t ino)
+{
+	return ino % AuPlink_NHASH;
+}
+
+/* File-based Hierarchical Storage Management */
+struct au_fhsm {
+#ifdef CONFIG_AUFS_FHSM
+	/* allow only one process who can receive the notification */
+	spinlock_t		fhsm_spin;
+	pid_t			fhsm_pid;
+	wait_queue_head_t	fhsm_wqh;
+	atomic_t		fhsm_readable;
+
+	/* these are protected by si_rwsem */
+	unsigned long		fhsm_expire;
+	aufs_bindex_t		fhsm_bottom;
+#endif
+};
+
+#define AU_PIDSTEP	(int)(BITS_TO_LONGS(PID_MAX_DEFAULT) * BITS_PER_LONG)
+#define AU_NPIDMAP	(int)DIV_ROUND_UP(PID_MAX_LIMIT, AU_PIDSTEP)
+struct au_si_pid {
+	unsigned long	*pid_bitmap[AU_NPIDMAP];
+	struct mutex	pid_mtx;
+};
+
+struct au_branch;
+struct au_sbinfo {
+	/* nowait tasks in the system-wide workqueue */
+	struct au_nowait_tasks	si_nowait;
+
+	/*
+	 * tried sb->s_umount, but failed due to a lock dependency with i_mutex.
+	 * rwsem for au_sbinfo is necessary.
+	 */
+	struct au_rwsem		si_rwsem;
+
+	/* prevent recursive locking when deleting an inode */
+	struct au_si_pid	au_si_pid;
+
+	/*
+	 * dirty approach to protect sb->s_inodes and ->s_files (gone) from
+	 * remount.
+	 */
+	struct percpu_counter	si_ninodes, si_nfiles;
+
+	/* branch management */
+	unsigned int		si_generation;
+
+	/* see AuSi_ flags */
+	unsigned char		au_si_status;
+
+	aufs_bindex_t		si_bbot;
+
+	/* dirty trick to keep br_id positive */
+	unsigned int		si_last_br_id :
+				sizeof(aufs_bindex_t) * BITS_PER_BYTE - 1;
+	struct au_branch	**si_branch;
+
+	/* policy to select a writable branch */
+	unsigned char		si_wbr_copyup;
+	unsigned char		si_wbr_create;
+	struct au_wbr_copyup_operations *si_wbr_copyup_ops;
+	struct au_wbr_create_operations *si_wbr_create_ops;
+
+	/* round robin */
+	atomic_t		si_wbr_rr_next;
+
+	/* most free space */
+	struct au_wbr_mfs	si_wbr_mfs;
+
+	/* File-based Hierarchical Storage Management */
+	struct au_fhsm		si_fhsm;
+
+	/* mount flags */
+	/* include/asm-ia64/siginfo.h defines a macro named si_flags */
+	unsigned int		si_mntflags;
+
+	/* external inode number (bitmap and translation table) */
+	vfs_readf_t		si_xread;
+	vfs_writef_t		si_xwrite;
+	struct file		*si_xib;
+	struct mutex		si_xib_mtx; /* protect xib members */
+	unsigned long		*si_xib_buf;
+	unsigned long		si_xib_last_pindex;
+	int			si_xib_next_bit;
+	aufs_bindex_t		si_xino_brid;
+	unsigned long		si_xino_jiffy;
+	unsigned long		si_xino_expire;
+	/* reserved for future use */
+	/* unsigned long long	si_xib_limit; */	/* Max xib file size */
+
+#ifdef CONFIG_AUFS_EXPORT
+	/* i_generation */
+	struct file		*si_xigen;
+	atomic_t		si_xigen_next;
+#endif
+
+	/* dirty trick to support atomic_open */
+	struct au_sphlhead	si_aopen;
+
+	/* vdir parameters */
+	unsigned long		si_rdcache;	/* max cache time in jiffies */
+	unsigned int		si_rdblk;	/* deblk size */
+	unsigned int		si_rdhash;	/* hash size */
+
+	/*
+	 * If the number of whiteouts is larger than si_dirwh, leave all of
+	 * them after au_whtmp_ren to reduce the cost of rmdir(2).
+	 * A future fsck.aufs or a kernel thread will remove them later.
+	 * Otherwise, remove all whiteouts and the dir itself in rmdir(2).
+	 */
+	unsigned int		si_dirwh;
+
+	/* pseudo_link list */
+	struct au_sphlhead	si_plink[AuPlink_NHASH];
+	wait_queue_head_t	si_plink_wq;
+	spinlock_t		si_plink_maint_lock;
+	pid_t			si_plink_maint_pid;
+
+	/* file list */
+	struct au_sphlhead	si_files;
+
+	/* with/without getattr, brother of sb->s_d_op */
+	struct inode_operations *si_iop_array;
+
+	/*
+	 * sysfs and lifetime management.
+	 * this is not a small structure and it may waste memory when sysfs is
+	 * disabled, particularly when many aufs mounts exist.
+	 * but the majority of configurations enable sysfs.
+	 */
+	struct kobject		si_kobj;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry		 *si_dbgaufs;
+	struct dentry		 *si_dbgaufs_plink;
+	struct dentry		 *si_dbgaufs_xib;
+#ifdef CONFIG_AUFS_EXPORT
+	struct dentry		 *si_dbgaufs_xigen;
+#endif
+#endif
+
+#ifdef CONFIG_AUFS_SBILIST
+	struct hlist_node	si_list;
+#endif
+
+	/* dirty, necessary for unmounting, sysfs and sysrq */
+	struct super_block	*si_sb;
+};
+
+/* sbinfo status flags */
+/*
+ * set true when refresh_dirs() failed at remount time.
+ * then try refreshing dirs at access time again.
+ * if it is false, refreshing dirs at access time is unnecessary
+ */
+#define AuSi_FAILED_REFRESH_DIR	1
+#define AuSi_FHSM		(1 << 1)	/* fhsm is active now */
+#define AuSi_NO_DREVAL		(1 << 2)	/* disable all d_revalidate */
+
+#ifndef CONFIG_AUFS_FHSM
+#undef AuSi_FHSM
+#define AuSi_FHSM		0
+#endif
+
+static inline unsigned char au_do_ftest_si(struct au_sbinfo *sbi,
+					   unsigned int flag)
+{
+	AuRwMustAnyLock(&sbi->si_rwsem);
+	return sbi->au_si_status & flag;
+}
+#define au_ftest_si(sbinfo, name)	au_do_ftest_si(sbinfo, AuSi_##name)
+#define au_fset_si(sbinfo, name) do { \
+	AuRwMustWriteLock(&(sbinfo)->si_rwsem); \
+	(sbinfo)->au_si_status |= AuSi_##name; \
+} while (0)
+#define au_fclr_si(sbinfo, name) do { \
+	AuRwMustWriteLock(&(sbinfo)->si_rwsem); \
+	(sbinfo)->au_si_status &= ~AuSi_##name; \
+} while (0)
+
+/* ---------------------------------------------------------------------- */
+
+/* policy to select one among writable branches */
+#define AuWbrCopyup(sbinfo, ...) \
+	((sbinfo)->si_wbr_copyup_ops->copyup(__VA_ARGS__))
+#define AuWbrCreate(sbinfo, ...) \
+	((sbinfo)->si_wbr_create_ops->create(__VA_ARGS__))
+
+/* flags for si_read_lock()/aufs_read_lock()/di_read_lock() */
+#define AuLock_DW		1		/* write-lock dentry */
+#define AuLock_IR		(1 << 1)	/* read-lock inode */
+#define AuLock_IW		(1 << 2)	/* write-lock inode */
+#define AuLock_FLUSH		(1 << 3)	/* wait for 'nowait' tasks */
+#define AuLock_DIRS		(1 << 4)	/* target is a pair of dirs */
+#define AuLock_NOPLM		(1 << 5)	/* return err in plm mode */
+#define AuLock_NOPLMW		(1 << 6)	/* wait for plm mode ends */
+#define AuLock_GEN		(1 << 7)	/* test digen/iigen */
+#define au_ftest_lock(flags, name)	((flags) & AuLock_##name)
+#define au_fset_lock(flags, name) \
+	do { (flags) |= AuLock_##name; } while (0)
+#define au_fclr_lock(flags, name) \
+	do { (flags) &= ~AuLock_##name; } while (0)
+
+/* ---------------------------------------------------------------------- */
+
+/* super.c */
+extern struct file_system_type aufs_fs_type;
+struct inode *au_iget_locked(struct super_block *sb, ino_t ino);
+typedef unsigned long long (*au_arraycb_t)(struct super_block *sb, void *array,
+					   unsigned long long max, void *arg);
+void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb,
+		     struct super_block *sb, void *arg);
+struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max);
+void au_iarray_free(struct inode **a, unsigned long long max);
+
+/* sbinfo.c */
+void au_si_free(struct kobject *kobj);
+int au_si_alloc(struct super_block *sb);
+int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr, int may_shrink);
+
+unsigned int au_sigen_inc(struct super_block *sb);
+aufs_bindex_t au_new_br_id(struct super_block *sb);
+
+int si_read_lock(struct super_block *sb, int flags);
+int si_write_lock(struct super_block *sb, int flags);
+int aufs_read_lock(struct dentry *dentry, int flags);
+void aufs_read_unlock(struct dentry *dentry, int flags);
+void aufs_write_lock(struct dentry *dentry);
+void aufs_write_unlock(struct dentry *dentry);
+int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags);
+void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2);
+
+/* wbr_policy.c */
+extern struct au_wbr_copyup_operations au_wbr_copyup_ops[];
+extern struct au_wbr_create_operations au_wbr_create_ops[];
+int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst);
+int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex);
+int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t btop);
+
+/* mvdown.c */
+int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *arg);
+
+#ifdef CONFIG_AUFS_FHSM
+/* fhsm.c */
+
+static inline pid_t au_fhsm_pid(struct au_fhsm *fhsm)
+{
+	pid_t pid;
+
+	spin_lock(&fhsm->fhsm_spin);
+	pid = fhsm->fhsm_pid;
+	spin_unlock(&fhsm->fhsm_spin);
+
+	return pid;
+}
+
+void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force);
+void au_fhsm_wrote_all(struct super_block *sb, int force);
+int au_fhsm_fd(struct super_block *sb, int oflags);
+int au_fhsm_br_alloc(struct au_branch *br);
+void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex);
+void au_fhsm_fin(struct super_block *sb);
+void au_fhsm_init(struct au_sbinfo *sbinfo);
+void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec);
+void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo);
+#else
+AuStubVoid(au_fhsm_wrote, struct super_block *sb, aufs_bindex_t bindex,
+	   int force)
+AuStubVoid(au_fhsm_wrote_all, struct super_block *sb, int force)
+AuStub(int, au_fhsm_fd, return -EOPNOTSUPP, struct super_block *sb, int oflags)
+AuStub(pid_t, au_fhsm_pid, return 0, struct au_fhsm *fhsm)
+AuStubInt0(au_fhsm_br_alloc, struct au_branch *br)
+AuStubVoid(au_fhsm_set_bottom, struct super_block *sb, aufs_bindex_t bindex)
+AuStubVoid(au_fhsm_fin, struct super_block *sb)
+AuStubVoid(au_fhsm_init, struct au_sbinfo *sbinfo)
+AuStubVoid(au_fhsm_set, struct au_sbinfo *sbinfo, unsigned int sec)
+AuStubVoid(au_fhsm_show, struct seq_file *seq, struct au_sbinfo *sbinfo)
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct au_sbinfo *au_sbi(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_EXPORT
+int au_test_nfsd(void);
+void au_export_init(struct super_block *sb);
+void au_xigen_inc(struct inode *inode);
+int au_xigen_new(struct inode *inode);
+int au_xigen_set(struct super_block *sb, struct file *base);
+void au_xigen_clr(struct super_block *sb);
+
+static inline int au_busy_or_stale(void)
+{
+	if (!au_test_nfsd())
+		return -EBUSY;
+	return -ESTALE;
+}
+#else
+AuStubInt0(au_test_nfsd, void)
+AuStubVoid(au_export_init, struct super_block *sb)
+AuStubVoid(au_xigen_inc, struct inode *inode)
+AuStubInt0(au_xigen_new, struct inode *inode)
+AuStubInt0(au_xigen_set, struct super_block *sb, struct file *base)
+AuStubVoid(au_xigen_clr, struct super_block *sb)
+AuStub(int, au_busy_or_stale, return -EBUSY, void)
+#endif /* CONFIG_AUFS_EXPORT */
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_SBILIST
+/* module.c */
+extern struct au_sphlhead au_sbilist;
+
+static inline void au_sbilist_init(void)
+{
+	au_sphl_init(&au_sbilist);
+}
+
+static inline void au_sbilist_add(struct super_block *sb)
+{
+	au_sphl_add(&au_sbi(sb)->si_list, &au_sbilist);
+}
+
+static inline void au_sbilist_del(struct super_block *sb)
+{
+	au_sphl_del(&au_sbi(sb)->si_list, &au_sbilist);
+}
+
+#ifdef CONFIG_AUFS_MAGIC_SYSRQ
+static inline void au_sbilist_lock(void)
+{
+	spin_lock(&au_sbilist.spin);
+}
+
+static inline void au_sbilist_unlock(void)
+{
+	spin_unlock(&au_sbilist.spin);
+}
+#define AuGFP_SBILIST	GFP_ATOMIC
+#else
+AuStubVoid(au_sbilist_lock, void)
+AuStubVoid(au_sbilist_unlock, void)
+#define AuGFP_SBILIST	GFP_NOFS
+#endif /* CONFIG_AUFS_MAGIC_SYSRQ */
+#else
+AuStubVoid(au_sbilist_init, void)
+AuStubVoid(au_sbilist_add, struct super_block *sb)
+AuStubVoid(au_sbilist_del, struct super_block *sb)
+AuStubVoid(au_sbilist_lock, void)
+AuStubVoid(au_sbilist_unlock, void)
+#define AuGFP_SBILIST	GFP_NOFS
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static inline void dbgaufs_si_null(struct au_sbinfo *sbinfo)
+{
+	/*
+	 * This function is effectively a dynamic '__init' function,
+	 * so the tiny check for si_rwsem is unnecessary.
+	 */
+	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
+#ifdef CONFIG_DEBUG_FS
+	sbinfo->si_dbgaufs = NULL;
+	sbinfo->si_dbgaufs_plink = NULL;
+	sbinfo->si_dbgaufs_xib = NULL;
+#ifdef CONFIG_AUFS_EXPORT
+	sbinfo->si_dbgaufs_xigen = NULL;
+#endif
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+static inline void si_pid_idx_bit(int *idx, pid_t *bit)
+{
+	/* pids start at 1, but the bitmap index starts at 0 */
+	*bit = current->pid - 1;
+	*idx = *bit / AU_PIDSTEP;
+	*bit %= AU_PIDSTEP;
+}
+
+static inline int si_pid_test(struct super_block *sb)
+{
+	pid_t bit;
+	int idx;
+	unsigned long *bitmap;
+
+	si_pid_idx_bit(&idx, &bit);
+	bitmap = au_sbi(sb)->au_si_pid.pid_bitmap[idx];
+	if (bitmap)
+		return test_bit(bit, bitmap);
+	return 0;
+}
+
+static inline void si_pid_clr(struct super_block *sb)
+{
+	pid_t bit;
+	int idx;
+	unsigned long *bitmap;
+
+	si_pid_idx_bit(&idx, &bit);
+	bitmap = au_sbi(sb)->au_si_pid.pid_bitmap[idx];
+	BUG_ON(!bitmap);
+	AuDebugOn(!test_bit(bit, bitmap));
+	clear_bit(bit, bitmap);
+	/* smp_mb(); */
+}
+
+void si_pid_set(struct super_block *sb);
+
+/* ---------------------------------------------------------------------- */
+
+/* lock superblock. mainly for entry point functions */
+/*
+ * __si_read_lock, __si_write_lock,
+ * __si_read_unlock, __si_write_unlock, __si_downgrade_lock
+ */
+AuSimpleRwsemFuncs(__si, struct super_block *sb, &au_sbi(sb)->si_rwsem);
+
+#define SiMustNoWaiters(sb)	AuRwMustNoWaiters(&au_sbi(sb)->si_rwsem)
+#define SiMustAnyLock(sb)	AuRwMustAnyLock(&au_sbi(sb)->si_rwsem)
+#define SiMustWriteLock(sb)	AuRwMustWriteLock(&au_sbi(sb)->si_rwsem)
+
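+/*
+ * the si_noflush_* variants take si_rwsem without flushing the pending works
+ * in si_nowait (compare the reserved si_*_trylock() helpers below), and they
+ * record the caller in the pid bitmap above; si_pid_test() can then tell
+ * whether the current task already holds the lock.
+ */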
+static inline void si_noflush_read_lock(struct super_block *sb)
+{
+	__si_read_lock(sb);
+	si_pid_set(sb);
+}
+
+static inline int si_noflush_read_trylock(struct super_block *sb)
+{
+	int locked;
+
+	locked = __si_read_trylock(sb);
+	if (locked)
+		si_pid_set(sb);
+	return locked;
+}
+
+static inline void si_noflush_write_lock(struct super_block *sb)
+{
+	__si_write_lock(sb);
+	si_pid_set(sb);
+}
+
+static inline int si_noflush_write_trylock(struct super_block *sb)
+{
+	int locked;
+
+	locked = __si_write_trylock(sb);
+	if (locked)
+		si_pid_set(sb);
+	return locked;
+}
+
+#if 0 /* reserved */
+static inline int si_read_trylock(struct super_block *sb, int flags)
+{
+	if (au_ftest_lock(flags, FLUSH))
+		au_nwt_flush(&au_sbi(sb)->si_nowait);
+	return si_noflush_read_trylock(sb);
+}
+#endif
+
+static inline void si_read_unlock(struct super_block *sb)
+{
+	si_pid_clr(sb);
+	__si_read_unlock(sb);
+}
+
+#if 0 /* reserved */
+static inline int si_write_trylock(struct super_block *sb, int flags)
+{
+	if (au_ftest_lock(flags, FLUSH))
+		au_nwt_flush(&au_sbi(sb)->si_nowait);
+	return si_noflush_write_trylock(sb);
+}
+#endif
+
+static inline void si_write_unlock(struct super_block *sb)
+{
+	si_pid_clr(sb);
+	__si_write_unlock(sb);
+}
+
+#if 0 /* reserved */
+static inline void si_downgrade_lock(struct super_block *sb)
+{
+	__si_downgrade_lock(sb);
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+static inline aufs_bindex_t au_sbbot(struct super_block *sb)
+{
+	SiMustAnyLock(sb);
+	return au_sbi(sb)->si_bbot;
+}
+
+static inline unsigned int au_mntflags(struct super_block *sb)
+{
+	SiMustAnyLock(sb);
+	return au_sbi(sb)->si_mntflags;
+}
+
+static inline unsigned int au_sigen(struct super_block *sb)
+{
+	SiMustAnyLock(sb);
+	return au_sbi(sb)->si_generation;
+}
+
+static inline unsigned long long au_ninodes(struct super_block *sb)
+{
+	s64 n = percpu_counter_sum(&au_sbi(sb)->si_ninodes);
+
+	BUG_ON(n < 0);
+	return n;
+}
+
+static inline void au_ninodes_inc(struct super_block *sb)
+{
+	percpu_counter_inc(&au_sbi(sb)->si_ninodes);
+}
+
+static inline void au_ninodes_dec(struct super_block *sb)
+{
+	percpu_counter_dec(&au_sbi(sb)->si_ninodes);
+}
+
+static inline unsigned long long au_nfiles(struct super_block *sb)
+{
+	s64 n = percpu_counter_sum(&au_sbi(sb)->si_nfiles);
+
+	BUG_ON(n < 0);
+	return n;
+}
+
+static inline void au_nfiles_inc(struct super_block *sb)
+{
+	percpu_counter_inc(&au_sbi(sb)->si_nfiles);
+}
+
+static inline void au_nfiles_dec(struct super_block *sb)
+{
+	percpu_counter_dec(&au_sbi(sb)->si_nfiles);
+}
+
+static inline struct au_branch *au_sbr(struct super_block *sb,
+				       aufs_bindex_t bindex)
+{
+	SiMustAnyLock(sb);
+	return au_sbi(sb)->si_branch[0 + bindex];
+}
+
+static inline void au_xino_brid_set(struct super_block *sb, aufs_bindex_t brid)
+{
+	SiMustWriteLock(sb);
+	au_sbi(sb)->si_xino_brid = brid;
+}
+
+static inline aufs_bindex_t au_xino_brid(struct super_block *sb)
+{
+	SiMustAnyLock(sb);
+	return au_sbi(sb)->si_xino_brid;
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_SUPER_H__ */
diff --git b/fs/aufs/sysaufs.c b/fs/aufs/sysaufs.c
new file mode 100644
index 0000000..8ec10fb
--- /dev/null
+++ b/fs/aufs/sysaufs.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sysfs interface and lifetime management
+ * they are necessary regardless of whether sysfs is disabled.
+ */
+
+#include <linux/random.h>
+#include "aufs.h"
+
+unsigned long sysaufs_si_mask;
+struct kset *sysaufs_kset;
+
+#define AuSiAttr(_name) { \
+	.attr   = { .name = __stringify(_name), .mode = 0444 },	\
+	.show   = sysaufs_si_##_name,				\
+}
+
+static struct sysaufs_si_attr sysaufs_si_attr_xi_path = AuSiAttr(xi_path);
+struct attribute *sysaufs_si_attrs[] = {
+	&sysaufs_si_attr_xi_path.attr,
+	NULL,
+};
+
+static const struct sysfs_ops au_sbi_ops = {
+	.show   = sysaufs_si_show
+};
+
+static struct kobj_type au_sbi_ktype = {
+	.release	= au_si_free,
+	.sysfs_ops	= &au_sbi_ops,
+	.default_attrs	= sysaufs_si_attrs
+};
+
+/* ---------------------------------------------------------------------- */
+
+int sysaufs_si_init(struct au_sbinfo *sbinfo)
+{
+	int err;
+
+	sbinfo->si_kobj.kset = sysaufs_kset;
+	/* cf. sysaufs_name() */
+	err = kobject_init_and_add
+		(&sbinfo->si_kobj, &au_sbi_ktype, /*&sysaufs_kset->kobj*/NULL,
+		 SysaufsSiNamePrefix "%lx", sysaufs_si_id(sbinfo));
+
+	dbgaufs_si_null(sbinfo);
+	if (!err) {
+		err = dbgaufs_si_init(sbinfo);
+		if (unlikely(err))
+			kobject_put(&sbinfo->si_kobj);
+	}
+	return err;
+}
+
+void sysaufs_fin(void)
+{
+	dbgaufs_fin();
+	sysfs_remove_group(&sysaufs_kset->kobj, sysaufs_attr_group);
+	kset_unregister(sysaufs_kset);
+}
+
+int __init sysaufs_init(void)
+{
+	int err;
+
+	do {
+		get_random_bytes(&sysaufs_si_mask, sizeof(sysaufs_si_mask));
+	} while (!sysaufs_si_mask);
+
+	err = -EINVAL;
+	sysaufs_kset = kset_create_and_add(AUFS_NAME, NULL, fs_kobj);
+	if (unlikely(!sysaufs_kset))
+		goto out;
+	err = PTR_ERR(sysaufs_kset);
+	if (IS_ERR(sysaufs_kset))
+		goto out;
+	err = sysfs_create_group(&sysaufs_kset->kobj, sysaufs_attr_group);
+	if (unlikely(err)) {
+		kset_unregister(sysaufs_kset);
+		goto out;
+	}
+
+	err = dbgaufs_init();
+	if (unlikely(err))
+		sysaufs_fin();
+out:
+	return err;
+}
diff --git b/fs/aufs/sysaufs.h b/fs/aufs/sysaufs.h
new file mode 100644
index 0000000..1f79983
--- /dev/null
+++ b/fs/aufs/sysaufs.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sysfs interface and mount lifetime management
+ */
+
+#ifndef __SYSAUFS_H__
+#define __SYSAUFS_H__
+
+#ifdef __KERNEL__
+
+#include <linux/sysfs.h>
+#include "module.h"
+
+struct super_block;
+struct au_sbinfo;
+
+struct sysaufs_si_attr {
+	struct attribute attr;
+	int (*show)(struct seq_file *seq, struct super_block *sb);
+};
+
+/* ---------------------------------------------------------------------- */
+
+/* sysaufs.c */
+extern unsigned long sysaufs_si_mask;
+extern struct kset *sysaufs_kset;
+extern struct attribute *sysaufs_si_attrs[];
+int sysaufs_si_init(struct au_sbinfo *sbinfo);
+int __init sysaufs_init(void);
+void sysaufs_fin(void);
+
+/* ---------------------------------------------------------------------- */
+
+/* some people don't like to expose a raw kernel pointer */
+static inline unsigned long sysaufs_si_id(struct au_sbinfo *sbinfo)
+{
+	return sysaufs_si_mask ^ (unsigned long)sbinfo;
+}
+
+#define SysaufsSiNamePrefix	"si_"
+#define SysaufsSiNameLen	(sizeof(SysaufsSiNamePrefix) + 16)
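+/*
+ * SysaufsSiNameLen: "si_" plus at most 16 hex digits plus the trailing NUL.
+ * sizeof(SysaufsSiNamePrefix) already counts the NUL of the prefix string,
+ * so 4 + 16 == 20 bytes covers the longest name the snprintf() below can
+ * produce on a 64bit machine.
+ */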
+static inline void sysaufs_name(struct au_sbinfo *sbinfo, char *name)
+{
+	snprintf(name, SysaufsSiNameLen, SysaufsSiNamePrefix "%lx",
+		 sysaufs_si_id(sbinfo));
+}
+
+struct au_branch;
+#ifdef CONFIG_SYSFS
+/* sysfs.c */
+extern struct attribute_group *sysaufs_attr_group;
+
+int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb);
+ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr,
+			 char *buf);
+long au_brinfo_ioctl(struct file *file, unsigned long arg);
+#ifdef CONFIG_COMPAT
+long au_brinfo_compat_ioctl(struct file *file, unsigned long arg);
+#endif
+
+void sysaufs_br_init(struct au_branch *br);
+void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex);
+void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex);
+
+#define sysaufs_brs_init()	do {} while (0)
+
+#else
+#define sysaufs_attr_group	NULL
+
+AuStubInt0(sysaufs_si_xi_path, struct seq_file *seq, struct super_block *sb)
+AuStub(ssize_t, sysaufs_si_show, return 0, struct kobject *kobj,
+       struct attribute *attr, char *buf)
+AuStubVoid(sysaufs_br_init, struct au_branch *br)
+AuStubVoid(sysaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex)
+AuStubVoid(sysaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex)
+
+static inline void sysaufs_brs_init(void)
+{
+	sysaufs_brs = 0;
+}
+
+#endif /* CONFIG_SYSFS */
+
+#endif /* __KERNEL__ */
+#endif /* __SYSAUFS_H__ */
diff --git b/fs/aufs/sysfs.c b/fs/aufs/sysfs.c
new file mode 100644
index 0000000..f4c3170
--- /dev/null
+++ b/fs/aufs/sysfs.c
@@ -0,0 +1,340 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sysfs interface
+ */
+
+#include <linux/compat.h>
+#include <linux/seq_file.h>
+#include "aufs.h"
+
+static struct attribute *au_attr[] = {
+	NULL,	/* need to NULL terminate the list of attributes */
+};
+
+static struct attribute_group sysaufs_attr_group_body = {
+	.attrs = au_attr
+};
+
+struct attribute_group *sysaufs_attr_group = &sysaufs_attr_group_body;
+
+/* ---------------------------------------------------------------------- */
+
+int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb)
+{
+	int err;
+
+	SiMustAnyLock(sb);
+
+	err = 0;
+	if (au_opt_test(au_mntflags(sb), XINO)) {
+		err = au_xino_path(seq, au_sbi(sb)->si_xib);
+		seq_putc(seq, '\n');
+	}
+	return err;
+}
+
+/*
+ * the lifetime of a branch is independent of its entry under sysfs.
+ * sysfs handles the lifetime of the entry, and never calls ->show() after it
+ * is unlinked.
+ */
+static int sysaufs_si_br(struct seq_file *seq, struct super_block *sb,
+			 aufs_bindex_t bindex, int idx)
+{
+	int err;
+	struct path path;
+	struct dentry *root;
+	struct au_branch *br;
+	au_br_perm_str_t perm;
+
+	AuDbg("b%d\n", bindex);
+
+	err = 0;
+	root = sb->s_root;
+	di_read_lock_parent(root, !AuLock_IR);
+	br = au_sbr(sb, bindex);
+
+	switch (idx) {
+	case AuBrSysfs_BR:
+		path.mnt = au_br_mnt(br);
+		path.dentry = au_h_dptr(root, bindex);
+		err = au_seq_path(seq, &path);
+		if (!err) {
+			au_optstr_br_perm(&perm, br->br_perm);
+			seq_printf(seq, "=%s\n", perm.a);
+		}
+		break;
+	case AuBrSysfs_BRID:
+		seq_printf(seq, "%d\n", br->br_id);
+		break;
+	}
+	di_read_unlock(root, !AuLock_IR);
+	if (unlikely(err || seq_has_overflowed(seq)))
+		err = -E2BIG;
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
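+/*
+ * a minimal, hand-rolled seq_file: it serves only as a bounded formatting
+ * buffer over the caller-supplied memory and is never registered with any
+ * seq_operations, so only the simple buffer-appending helpers such as
+ * seq_printf() and seq_putc() are applied to it.
+ */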
+static struct seq_file *au_seq(char *p, ssize_t len)
+{
+	struct seq_file *seq;
+
+	seq = kzalloc(sizeof(*seq), GFP_NOFS);
+	if (seq) {
+		/* mutex_init(&seq.lock); */
+		seq->buf = p;
+		seq->size = len;
+		return seq; /* success */
+	}
+
+	seq = ERR_PTR(-ENOMEM);
+	return seq;
+}
+
+#define SysaufsBr_PREFIX	"br"
+#define SysaufsBrid_PREFIX	"brid"
+
+/* todo: file size may exceed PAGE_SIZE */
+ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr,
+			char *buf)
+{
+	ssize_t err;
+	int idx;
+	long l;
+	aufs_bindex_t bbot;
+	struct au_sbinfo *sbinfo;
+	struct super_block *sb;
+	struct seq_file *seq;
+	char *name;
+	struct attribute **cattr;
+
+	sbinfo = container_of(kobj, struct au_sbinfo, si_kobj);
+	sb = sbinfo->si_sb;
+
+	/*
+	 * prevent a race condition between sysfs and aufs.
+	 * for instance, sysfs_file_read() calls sysfs_get_active_two(), which
+	 * prohibits maintaining the sysfs entries.
+	 * here we acquire the read lock after sysfs_get_active_two().
+	 * on the other hand, the remount process may maintain the sysfs/aufs
+	 * entries after acquiring the write lock,
+	 * which can lead to a deadlock.
+	 * so we simply give up processing the read here.
+	 */
+	err = -EBUSY;
+	if (unlikely(!si_noflush_read_trylock(sb)))
+		goto out;
+
+	seq = au_seq(buf, PAGE_SIZE);
+	err = PTR_ERR(seq);
+	if (IS_ERR(seq))
+		goto out_unlock;
+
+	name = (void *)attr->name;
+	cattr = sysaufs_si_attrs;
+	while (*cattr) {
+		if (!strcmp(name, (*cattr)->name)) {
+			err = container_of(*cattr, struct sysaufs_si_attr, attr)
+				->show(seq, sb);
+			goto out_seq;
+		}
+		cattr++;
+	}
+
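+	/*
+	 * not one of the static attributes: the name must then be either
+	 * "br<index>" or "brid<index>".  Strip the prefix here and parse the
+	 * branch index below.
+	 */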
+	if (!strncmp(name, SysaufsBrid_PREFIX,
+		     sizeof(SysaufsBrid_PREFIX) - 1)) {
+		idx = AuBrSysfs_BRID;
+		name += sizeof(SysaufsBrid_PREFIX) - 1;
+	} else if (!strncmp(name, SysaufsBr_PREFIX,
+			    sizeof(SysaufsBr_PREFIX) - 1)) {
+		idx = AuBrSysfs_BR;
+		name += sizeof(SysaufsBr_PREFIX) - 1;
+	} else
+		BUG();
+
+	err = kstrtol(name, 10, &l);
+	if (!err) {
+		bbot = au_sbbot(sb);
+		if (l <= bbot)
+			err = sysaufs_si_br(seq, sb, (aufs_bindex_t)l, idx);
+		else
+			err = -ENOENT;
+	}
+
+out_seq:
+	if (!err) {
+		err = seq->count;
+		/* sysfs limit */
+		if (unlikely(err == PAGE_SIZE))
+			err = -EFBIG;
+	}
+	au_delayed_kfree(seq);
+out_unlock:
+	si_read_unlock(sb);
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
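+/*
+ * copy the id, the permission bits and the path of every branch into the
+ * user-supplied array of union aufs_brinfo.  When @arg is NULL, just return
+ * the current number of branches (bbot + 1), probably so that userspace can
+ * size its buffer first.
+ */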
+static int au_brinfo(struct super_block *sb, union aufs_brinfo __user *arg)
+{
+	int err;
+	int16_t brid;
+	aufs_bindex_t bindex, bbot;
+	size_t sz;
+	char *buf;
+	struct seq_file *seq;
+	struct au_branch *br;
+
+	si_read_lock(sb, AuLock_FLUSH);
+	bbot = au_sbbot(sb);
+	err = bbot + 1;
+	if (!arg)
+		goto out;
+
+	err = -ENOMEM;
+	buf = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!buf))
+		goto out;
+
+	seq = au_seq(buf, PAGE_SIZE);
+	err = PTR_ERR(seq);
+	if (IS_ERR(seq))
+		goto out_buf;
+
+	sz = sizeof(*arg) - offsetof(union aufs_brinfo, path);
+	for (bindex = 0; bindex <= bbot; bindex++, arg++) {
+		err = !access_ok(VERIFY_WRITE, arg, sizeof(*arg));
+		if (unlikely(err))
+			break;
+
+		br = au_sbr(sb, bindex);
+		brid = br->br_id;
+		BUILD_BUG_ON(sizeof(brid) != sizeof(arg->id));
+		err = __put_user(brid, &arg->id);
+		if (unlikely(err))
+			break;
+
+		BUILD_BUG_ON(sizeof(br->br_perm) != sizeof(arg->perm));
+		err = __put_user(br->br_perm, &arg->perm);
+		if (unlikely(err))
+			break;
+
+		err = au_seq_path(seq, &br->br_path);
+		if (unlikely(err))
+			break;
+		seq_putc(seq, '\0');
+		if (!seq_has_overflowed(seq)) {
+			err = copy_to_user(arg->path, seq->buf, seq->count);
+			seq->count = 0;
+			if (unlikely(err))
+				break;
+		} else {
+			err = -E2BIG;
+			goto out_seq;
+		}
+	}
+	if (unlikely(err))
+		err = -EFAULT;
+
+out_seq:
+	au_delayed_kfree(seq);
+out_buf:
+	au_delayed_free_page((unsigned long)buf);
+out:
+	si_read_unlock(sb);
+	return err;
+}
+
+long au_brinfo_ioctl(struct file *file, unsigned long arg)
+{
+	return au_brinfo(file->f_path.dentry->d_sb, (void __user *)arg);
+}
+
+#ifdef CONFIG_COMPAT
+long au_brinfo_compat_ioctl(struct file *file, unsigned long arg)
+{
+	return au_brinfo(file->f_path.dentry->d_sb, compat_ptr(arg));
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+void sysaufs_br_init(struct au_branch *br)
+{
+	int i;
+	struct au_brsysfs *br_sysfs;
+	struct attribute *attr;
+
+	br_sysfs = br->br_sysfs;
+	for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) {
+		attr = &br_sysfs->attr;
+		sysfs_attr_init(attr);
+		attr->name = br_sysfs->name;
+		attr->mode = S_IRUGO;
+		br_sysfs++;
+	}
+}
+
+void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex)
+{
+	struct au_branch *br;
+	struct kobject *kobj;
+	struct au_brsysfs *br_sysfs;
+	int i;
+	aufs_bindex_t bbot;
+
+	dbgaufs_brs_del(sb, bindex);
+
+	if (!sysaufs_brs)
+		return;
+
+	kobj = &au_sbi(sb)->si_kobj;
+	bbot = au_sbbot(sb);
+	for (; bindex <= bbot; bindex++) {
+		br = au_sbr(sb, bindex);
+		br_sysfs = br->br_sysfs;
+		for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) {
+			sysfs_remove_file(kobj, &br_sysfs->attr);
+			br_sysfs++;
+		}
+	}
+}
+
+void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex)
+{
+	int err, i;
+	aufs_bindex_t bbot;
+	struct kobject *kobj;
+	struct au_branch *br;
+	struct au_brsysfs *br_sysfs;
+
+	dbgaufs_brs_add(sb, bindex);
+
+	if (!sysaufs_brs)
+		return;
+
+	kobj = &au_sbi(sb)->si_kobj;
+	bbot = au_sbbot(sb);
+	for (; bindex <= bbot; bindex++) {
+		br = au_sbr(sb, bindex);
+		br_sysfs = br->br_sysfs;
+		snprintf(br_sysfs[AuBrSysfs_BR].name, sizeof(br_sysfs->name),
+			 SysaufsBr_PREFIX "%d", bindex);
+		snprintf(br_sysfs[AuBrSysfs_BRID].name, sizeof(br_sysfs->name),
+			 SysaufsBrid_PREFIX "%d", bindex);
+		for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) {
+			err = sysfs_create_file(kobj, &br_sysfs->attr);
+			if (unlikely(err))
+				pr_warn("failed %s under sysfs(%d)\n",
+					br_sysfs->name, err);
+			br_sysfs++;
+		}
+	}
+}
diff --git b/fs/aufs/sysrq.c b/fs/aufs/sysrq.c
new file mode 100644
index 0000000..7d22979
--- /dev/null
+++ b/fs/aufs/sysrq.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * magic sysrq handler
+ */
+
+/* #include <linux/sysrq.h> */
+#include <linux/writeback.h>
+#include "aufs.h"
+
+/* ---------------------------------------------------------------------- */
+
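+/*
+ * dump the state of one aufs mount at KERN_WARNING level: the superblock
+ * itself, the inodes on sb->s_inodes, and the non-special opened files on
+ * si_files.  au_sysrq() below walks au_sbilist and calls this for every
+ * aufs superblock.
+ */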
+static void sysrq_sb(struct super_block *sb)
+{
+	char *plevel;
+	struct au_sbinfo *sbinfo;
+	struct file *file;
+	struct au_sphlhead *files;
+	struct au_finfo *finfo;
+
+	plevel = au_plevel;
+	au_plevel = KERN_WARNING;
+
+	/* since we define pr_fmt, call printk directly */
+#define pr(str) printk(KERN_WARNING AUFS_NAME ": " str)
+
+	sbinfo = au_sbi(sb);
+	printk(KERN_WARNING "si=%lx\n", sysaufs_si_id(sbinfo));
+	pr("superblock\n");
+	au_dpri_sb(sb);
+
+#if 0
+	pr("root dentry\n");
+	au_dpri_dentry(sb->s_root);
+	pr("root inode\n");
+	au_dpri_inode(d_inode(sb->s_root));
+#endif
+
+#if 0
+	do {
+		int err, i, j, ndentry;
+		struct au_dcsub_pages dpages;
+		struct au_dpage *dpage;
+
+		err = au_dpages_init(&dpages, GFP_ATOMIC);
+		if (unlikely(err))
+			break;
+		err = au_dcsub_pages(&dpages, sb->s_root, NULL, NULL);
+		if (!err)
+			for (i = 0; i < dpages.ndpage; i++) {
+				dpage = dpages.dpages + i;
+				ndentry = dpage->ndentry;
+				for (j = 0; j < ndentry; j++)
+					au_dpri_dentry(dpage->dentries[j]);
+			}
+		au_dpages_free(&dpages);
+	} while (0);
+#endif
+
+#if 1
+	{
+		struct inode *i;
+
+		pr("isolated inode\n");
+		spin_lock(&sb->s_inode_list_lock);
+		list_for_each_entry(i, &sb->s_inodes, i_sb_list) {
+			spin_lock(&i->i_lock);
+			if (1 || hlist_empty(&i->i_dentry))
+				au_dpri_inode(i);
+			spin_unlock(&i->i_lock);
+		}
+		spin_unlock(&sb->s_inode_list_lock);
+	}
+#endif
+	pr("files\n");
+	files = &au_sbi(sb)->si_files;
+	spin_lock(&files->spin);
+	hlist_for_each_entry(finfo, &files->head, fi_hlist) {
+		umode_t mode;
+
+		file = finfo->fi_file;
+		mode = file_inode(file)->i_mode;
+		if (!special_file(mode))
+			au_dpri_file(file);
+	}
+	spin_unlock(&files->spin);
+	pr("done\n");
+
+#undef pr
+	au_plevel = plevel;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* module parameter */
+static char *aufs_sysrq_key = "a";
+module_param_named(sysrq, aufs_sysrq_key, charp, S_IRUGO);
+MODULE_PARM_DESC(sysrq, "MagicSysRq key for " AUFS_NAME);
+
+static void au_sysrq(int key __maybe_unused)
+{
+	struct au_sbinfo *sbinfo;
+
+	lockdep_off();
+	au_sbilist_lock();
+	hlist_for_each_entry(sbinfo, &au_sbilist.head, si_list)
+		sysrq_sb(sbinfo->si_sb);
+	au_sbilist_unlock();
+	lockdep_on();
+}
+
+static struct sysrq_key_op au_sysrq_op = {
+	.handler	= au_sysrq,
+	.help_msg	= "Aufs",
+	.action_msg	= "Aufs",
+	.enable_mask	= SYSRQ_ENABLE_DUMP
+};
+
+/* ---------------------------------------------------------------------- */
+
+int __init au_sysrq_init(void)
+{
+	int err;
+	char key;
+
+	err = -1;
+	key = *aufs_sysrq_key;
+	if ('a' <= key && key <= 'z')
+		err = register_sysrq_key(key, &au_sysrq_op);
+	if (unlikely(err))
+		pr_err("err %d, sysrq=%c\n", err, key);
+	return err;
+}
+
+void au_sysrq_fin(void)
+{
+	int err;
+
+	err = unregister_sysrq_key(*aufs_sysrq_key, &au_sysrq_op);
+	if (unlikely(err))
+		pr_err("err %d (ignored)\n", err);
+}
diff --git b/fs/aufs/vdir.c b/fs/aufs/vdir.c
new file mode 100644
index 0000000..b2eb4c0
--- /dev/null
+++ b/fs/aufs/vdir.c
@@ -0,0 +1,887 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * virtual or vertical directory
+ */
+
+#include "aufs.h"
+
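+/*
+ * a virtual directory is stored as an array of fixed-size "deblk" blocks
+ * (vd_deblk[]), each packed with variable-length au_vdir_de entries.
+ * calc_size() returns the slot size for a name of @nlen bytes, rounded up to
+ * the alignment of ino_t; calc_size(0) is therefore the minimum slot size,
+ * and a de_str.len of zero within the remaining space marks the end of a
+ * block (cf. set_deblk_end() and is_deblk_end() below).
+ */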
+static unsigned int calc_size(int nlen)
+{
+	return ALIGN(sizeof(struct au_vdir_de) + nlen, sizeof(ino_t));
+}
+
+static int set_deblk_end(union au_vdir_deblk_p *p,
+			 union au_vdir_deblk_p *deblk_end)
+{
+	if (calc_size(0) <= deblk_end->deblk - p->deblk) {
+		p->de->de_str.len = 0;
+		/* smp_mb(); */
+		return 0;
+	}
+	return -1; /* error */
+}
+
+/* returns true or false */
+static int is_deblk_end(union au_vdir_deblk_p *p,
+			union au_vdir_deblk_p *deblk_end)
+{
+	if (calc_size(0) <= deblk_end->deblk - p->deblk)
+		return !p->de->de_str.len;
+	return 1;
+}
+
+static unsigned char *last_deblk(struct au_vdir *vdir)
+{
+	return vdir->vd_deblk[vdir->vd_nblk - 1];
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* estimate the appropriate size for name hash table */
+unsigned int au_rdhash_est(loff_t sz)
+{
+	unsigned int n;
+
+	n = UINT_MAX;
+	sz >>= 10;
+	if (sz < n)
+		n = sz;
+	if (sz < AUFS_RDHASH_DEF)
+		n = AUFS_RDHASH_DEF;
+	/* pr_info("n %u\n", n); */
+	return n;
+}
+
+/*
+ * the allocated memory has to be freed by
+ * au_nhash_wh_free() or au_nhash_de_free().
+ */
+int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp)
+{
+	struct hlist_head *head;
+	unsigned int u;
+	size_t sz;
+
+	sz = sizeof(*nhash->nh_head) * num_hash;
+	head = kmalloc(sz, gfp);
+	if (head) {
+		nhash->nh_num = num_hash;
+		nhash->nh_head = head;
+		for (u = 0; u < num_hash; u++)
+			INIT_HLIST_HEAD(head++);
+		return 0; /* success */
+	}
+
+	return -ENOMEM;
+}
+
+static void nhash_count(struct hlist_head *head)
+{
+#if 0
+	unsigned long n;
+	struct hlist_node *pos;
+
+	n = 0;
+	hlist_for_each(pos, head)
+		n++;
+	pr_info("%lu\n", n);
+#endif
+}
+
+static void au_nhash_wh_do_free(struct hlist_head *head)
+{
+	struct au_vdir_wh *pos;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_safe(pos, node, head, wh_hash)
+		au_delayed_kfree(pos);
+}
+
+static void au_nhash_de_do_free(struct hlist_head *head)
+{
+	struct au_vdir_dehstr *pos;
+	struct hlist_node *node;
+
+	hlist_for_each_entry_safe(pos, node, head, hash)
+		au_cache_dfree_vdir_dehstr(pos);
+}
+
+static void au_nhash_do_free(struct au_nhash *nhash,
+			     void (*free)(struct hlist_head *head))
+{
+	unsigned int n;
+	struct hlist_head *head;
+
+	n = nhash->nh_num;
+	if (!n)
+		return;
+
+	head = nhash->nh_head;
+	while (n-- > 0) {
+		nhash_count(head);
+		free(head++);
+	}
+	au_delayed_kfree(nhash->nh_head);
+}
+
+void au_nhash_wh_free(struct au_nhash *whlist)
+{
+	au_nhash_do_free(whlist, au_nhash_wh_do_free);
+}
+
+static void au_nhash_de_free(struct au_nhash *delist)
+{
+	au_nhash_do_free(delist, au_nhash_de_do_free);
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt,
+			    int limit)
+{
+	int num;
+	unsigned int u, n;
+	struct hlist_head *head;
+	struct au_vdir_wh *pos;
+
+	num = 0;
+	n = whlist->nh_num;
+	head = whlist->nh_head;
+	for (u = 0; u < n; u++, head++)
+		hlist_for_each_entry(pos, head, wh_hash)
+			if (pos->wh_bindex == btgt && ++num > limit)
+				return 1;
+	return 0;
+}
+
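+/*
+ * a deliberately simple name hash: the byte sum of at most the first eight
+ * characters, modulo the number of buckets.  Collisions are resolved by the
+ * per-bucket hlist chains together with au_nhash_test_name().
+ */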
+static struct hlist_head *au_name_hash(struct au_nhash *nhash,
+				       unsigned char *name,
+				       unsigned int len)
+{
+	unsigned int v;
+	/* const unsigned int magic_bit = 12; */
+
+	AuDebugOn(!nhash->nh_num || !nhash->nh_head);
+
+	v = 0;
+	if (len > 8)
+		len = 8;
+	while (len--)
+		v += *name++;
+	/* v = hash_long(v, magic_bit); */
+	v %= nhash->nh_num;
+	return nhash->nh_head + v;
+}
+
+static int au_nhash_test_name(struct au_vdir_destr *str, const char *name,
+			      int nlen)
+{
+	return str->len == nlen && !memcmp(str->name, name, nlen);
+}
+
+/* returns found or not */
+int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen)
+{
+	struct hlist_head *head;
+	struct au_vdir_wh *pos;
+	struct au_vdir_destr *str;
+
+	head = au_name_hash(whlist, name, nlen);
+	hlist_for_each_entry(pos, head, wh_hash) {
+		str = &pos->wh_str;
+		AuDbg("%.*s\n", str->len, str->name);
+		if (au_nhash_test_name(str, name, nlen))
+			return 1;
+	}
+	return 0;
+}
+
+/* returns found(true) or not */
+static int test_known(struct au_nhash *delist, char *name, int nlen)
+{
+	struct hlist_head *head;
+	struct au_vdir_dehstr *pos;
+	struct au_vdir_destr *str;
+
+	head = au_name_hash(delist, name, nlen);
+	hlist_for_each_entry(pos, head, hash) {
+		str = pos->str;
+		AuDbg("%.*s\n", str->len, str->name);
+		if (au_nhash_test_name(str, name, nlen))
+			return 1;
+	}
+	return 0;
+}
+
+static void au_shwh_init_wh(struct au_vdir_wh *wh, ino_t ino,
+			    unsigned char d_type)
+{
+#ifdef CONFIG_AUFS_SHWH
+	wh->wh_ino = ino;
+	wh->wh_type = d_type;
+#endif
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino,
+		       unsigned int d_type, aufs_bindex_t bindex,
+		       unsigned char shwh)
+{
+	int err;
+	struct au_vdir_destr *str;
+	struct au_vdir_wh *wh;
+
+	AuDbg("%.*s\n", nlen, name);
+	AuDebugOn(!whlist->nh_num || !whlist->nh_head);
+
+	err = -ENOMEM;
+	wh = kmalloc(sizeof(*wh) + nlen, GFP_NOFS);
+	if (unlikely(!wh))
+		goto out;
+
+	err = 0;
+	wh->wh_bindex = bindex;
+	if (shwh)
+		au_shwh_init_wh(wh, ino, d_type);
+	str = &wh->wh_str;
+	str->len = nlen;
+	memcpy(str->name, name, nlen);
+	hlist_add_head(&wh->wh_hash, au_name_hash(whlist, name, nlen));
+	/* smp_mb(); */
+
+out:
+	return err;
+}
+
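+/*
+ * grow the virtual directory by one deblk: enlarge the vd_deblk[] pointer
+ * array, allocate a fresh block, point vd_last at its beginning and write
+ * the end-of-block marker into it.
+ */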
+static int append_deblk(struct au_vdir *vdir)
+{
+	int err;
+	unsigned long ul;
+	const unsigned int deblk_sz = vdir->vd_deblk_sz;
+	union au_vdir_deblk_p p, deblk_end;
+	unsigned char **o;
+
+	err = -ENOMEM;
+	o = au_krealloc(vdir->vd_deblk, sizeof(*o) * (vdir->vd_nblk + 1),
+			GFP_NOFS, /*may_shrink*/0);
+	if (unlikely(!o))
+		goto out;
+
+	vdir->vd_deblk = o;
+	p.deblk = kmalloc(deblk_sz, GFP_NOFS);
+	if (p.deblk) {
+		ul = vdir->vd_nblk++;
+		vdir->vd_deblk[ul] = p.deblk;
+		vdir->vd_last.ul = ul;
+		vdir->vd_last.p.deblk = p.deblk;
+		deblk_end.deblk = p.deblk + deblk_sz;
+		err = set_deblk_end(&p, &deblk_end);
+	}
+
+out:
+	return err;
+}
+
+static int append_de(struct au_vdir *vdir, char *name, int nlen, ino_t ino,
+		     unsigned int d_type, struct au_nhash *delist)
+{
+	int err;
+	unsigned int sz;
+	const unsigned int deblk_sz = vdir->vd_deblk_sz;
+	union au_vdir_deblk_p p, *room, deblk_end;
+	struct au_vdir_dehstr *dehstr;
+
+	p.deblk = last_deblk(vdir);
+	deblk_end.deblk = p.deblk + deblk_sz;
+	room = &vdir->vd_last.p;
+	AuDebugOn(room->deblk < p.deblk || deblk_end.deblk <= room->deblk
+		  || !is_deblk_end(room, &deblk_end));
+
+	sz = calc_size(nlen);
+	if (unlikely(sz > deblk_end.deblk - room->deblk)) {
+		err = append_deblk(vdir);
+		if (unlikely(err))
+			goto out;
+
+		p.deblk = last_deblk(vdir);
+		deblk_end.deblk = p.deblk + deblk_sz;
+		/* smp_mb(); */
+		AuDebugOn(room->deblk != p.deblk);
+	}
+
+	err = -ENOMEM;
+	dehstr = au_cache_alloc_vdir_dehstr();
+	if (unlikely(!dehstr))
+		goto out;
+
+	dehstr->str = &room->de->de_str;
+	hlist_add_head(&dehstr->hash, au_name_hash(delist, name, nlen));
+	room->de->de_ino = ino;
+	room->de->de_type = d_type;
+	room->de->de_str.len = nlen;
+	memcpy(room->de->de_str.name, name, nlen);
+
+	err = 0;
+	room->deblk += sz;
+	if (unlikely(set_deblk_end(room, &deblk_end)))
+		err = append_deblk(vdir);
+	/* smp_mb(); */
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_vdir_free(struct au_vdir *vdir, int atonce)
+{
+	unsigned char **deblk;
+
+	deblk = vdir->vd_deblk;
+	if (!atonce) {
+		while (vdir->vd_nblk--)
+			au_delayed_kfree(*deblk++);
+		au_delayed_kfree(vdir->vd_deblk);
+		au_cache_dfree_vdir(vdir);
+	} else {
+		/* not delayed */
+		while (vdir->vd_nblk--)
+			kfree(*deblk++);
+		kfree(vdir->vd_deblk);
+		au_cache_free_vdir(vdir);
+	}
+}
+
+static struct au_vdir *alloc_vdir(struct file *file)
+{
+	struct au_vdir *vdir;
+	struct super_block *sb;
+	int err;
+
+	sb = file->f_path.dentry->d_sb;
+	SiMustAnyLock(sb);
+
+	err = -ENOMEM;
+	vdir = au_cache_alloc_vdir();
+	if (unlikely(!vdir))
+		goto out;
+
+	vdir->vd_deblk = kzalloc(sizeof(*vdir->vd_deblk), GFP_NOFS);
+	if (unlikely(!vdir->vd_deblk))
+		goto out_free;
+
+	vdir->vd_deblk_sz = au_sbi(sb)->si_rdblk;
+	if (!vdir->vd_deblk_sz) {
+		/* estimate the appropriate size for deblk */
+		vdir->vd_deblk_sz = au_dir_size(file, /*dentry*/NULL);
+		/* pr_info("vd_deblk_sz %u\n", vdir->vd_deblk_sz); */
+	}
+	vdir->vd_nblk = 0;
+	vdir->vd_version = 0;
+	vdir->vd_jiffy = 0;
+	err = append_deblk(vdir);
+	if (!err)
+		return vdir; /* success */
+
+	au_delayed_kfree(vdir->vd_deblk);
+
+out_free:
+	au_cache_dfree_vdir(vdir);
+out:
+	vdir = ERR_PTR(err);
+	return vdir;
+}
+
+static int reinit_vdir(struct au_vdir *vdir)
+{
+	int err;
+	union au_vdir_deblk_p p, deblk_end;
+
+	while (vdir->vd_nblk > 1) {
+		au_delayed_kfree(vdir->vd_deblk[vdir->vd_nblk - 1]);
+		/* vdir->vd_deblk[vdir->vd_nblk - 1] = NULL; */
+		vdir->vd_nblk--;
+	}
+	p.deblk = vdir->vd_deblk[0];
+	deblk_end.deblk = p.deblk + vdir->vd_deblk_sz;
+	err = set_deblk_end(&p, &deblk_end);
+	/* keep vd_dblk_sz */
+	vdir->vd_last.ul = 0;
+	vdir->vd_last.p.deblk = vdir->vd_deblk[0];
+	vdir->vd_version = 0;
+	vdir->vd_jiffy = 0;
+	/* smp_mb(); */
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+#define AuFillVdir_CALLED	1
+#define AuFillVdir_WHABLE	(1 << 1)
+#define AuFillVdir_SHWH		(1 << 2)
+#define au_ftest_fillvdir(flags, name)	((flags) & AuFillVdir_##name)
+#define au_fset_fillvdir(flags, name) \
+	do { (flags) |= AuFillVdir_##name; } while (0)
+#define au_fclr_fillvdir(flags, name) \
+	do { (flags) &= ~AuFillVdir_##name; } while (0)
+
+#ifndef CONFIG_AUFS_SHWH
+#undef AuFillVdir_SHWH
+#define AuFillVdir_SHWH		0
+#endif
+
+struct fillvdir_arg {
+	struct dir_context	ctx;
+	struct file		*file;
+	struct au_vdir		*vdir;
+	struct au_nhash		delist;
+	struct au_nhash		whlist;
+	aufs_bindex_t		bindex;
+	unsigned int		flags;
+	int			err;
+};
+
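+/*
+ * the dir_context actor passed to vfsub_iterate_dir().  For an ordinary name
+ * it appends one entry to the vdir unless the name was already seen or
+ * whiteouted by an upper branch; for a whiteout-prefixed name it strips the
+ * prefix and records it in the whiteout hash so that the same name from
+ * lower branches will be skipped.
+ */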
+static int fillvdir(struct dir_context *ctx, const char *__name, int nlen,
+		    loff_t offset __maybe_unused, u64 h_ino,
+		    unsigned int d_type)
+{
+	struct fillvdir_arg *arg = container_of(ctx, struct fillvdir_arg, ctx);
+	char *name = (void *)__name;
+	struct super_block *sb;
+	ino_t ino;
+	const unsigned char shwh = !!au_ftest_fillvdir(arg->flags, SHWH);
+
+	arg->err = 0;
+	sb = arg->file->f_path.dentry->d_sb;
+	au_fset_fillvdir(arg->flags, CALLED);
+	/* smp_mb(); */
+	if (nlen <= AUFS_WH_PFX_LEN
+	    || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) {
+		if (test_known(&arg->delist, name, nlen)
+		    || au_nhash_test_known_wh(&arg->whlist, name, nlen))
+			goto out; /* already exists or whiteouted */
+
+		arg->err = au_ino(sb, arg->bindex, h_ino, d_type, &ino);
+		if (!arg->err) {
+			if (unlikely(nlen > AUFS_MAX_NAMELEN))
+				d_type = DT_UNKNOWN;
+			arg->err = append_de(arg->vdir, name, nlen, ino,
+					     d_type, &arg->delist);
+		}
+	} else if (au_ftest_fillvdir(arg->flags, WHABLE)) {
+		name += AUFS_WH_PFX_LEN;
+		nlen -= AUFS_WH_PFX_LEN;
+		if (au_nhash_test_known_wh(&arg->whlist, name, nlen))
+			goto out; /* already whiteouted */
+
+		if (shwh)
+			arg->err = au_wh_ino(sb, arg->bindex, h_ino, d_type,
+					     &ino);
+		if (!arg->err) {
+			if (unlikely(nlen > AUFS_MAX_NAMELEN))
+				d_type = DT_UNKNOWN;
+			arg->err = au_nhash_append_wh
+				(&arg->whlist, name, nlen, ino, d_type,
+				 arg->bindex, shwh);
+		}
+	}
+
+out:
+	if (!arg->err)
+		arg->vdir->vd_jiffy = jiffies;
+	/* smp_mb(); */
+	AuTraceErr(arg->err);
+	return arg->err;
+}
+
+static int au_handle_shwh(struct super_block *sb, struct au_vdir *vdir,
+			  struct au_nhash *whlist, struct au_nhash *delist)
+{
+#ifdef CONFIG_AUFS_SHWH
+	int err;
+	unsigned int nh, u;
+	struct hlist_head *head;
+	struct au_vdir_wh *pos;
+	struct hlist_node *n;
+	char *p, *o;
+	struct au_vdir_destr *destr;
+
+	AuDebugOn(!au_opt_test(au_mntflags(sb), SHWH));
+
+	err = -ENOMEM;
+	o = p = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!p))
+		goto out;
+
+	err = 0;
+	nh = whlist->nh_num;
+	memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
+	p += AUFS_WH_PFX_LEN;
+	for (u = 0; u < nh; u++) {
+		head = whlist->nh_head + u;
+		hlist_for_each_entry_safe(pos, n, head, wh_hash) {
+			destr = &pos->wh_str;
+			memcpy(p, destr->name, destr->len);
+			err = append_de(vdir, o, destr->len + AUFS_WH_PFX_LEN,
+					pos->wh_ino, pos->wh_type, delist);
+			if (unlikely(err))
+				break;
+		}
+	}
+
+	au_delayed_free_page((unsigned long)o);
+
+out:
+	AuTraceErr(err);
+	return err;
+#else
+	return 0;
+#endif
+}
+
+static int au_do_read_vdir(struct fillvdir_arg *arg)
+{
+	int err;
+	unsigned int rdhash;
+	loff_t offset;
+	aufs_bindex_t bbot, bindex, btop;
+	unsigned char shwh;
+	struct file *hf, *file;
+	struct super_block *sb;
+
+	file = arg->file;
+	sb = file->f_path.dentry->d_sb;
+	SiMustAnyLock(sb);
+
+	rdhash = au_sbi(sb)->si_rdhash;
+	if (!rdhash)
+		rdhash = au_rdhash_est(au_dir_size(file, /*dentry*/NULL));
+	err = au_nhash_alloc(&arg->delist, rdhash, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+	err = au_nhash_alloc(&arg->whlist, rdhash, GFP_NOFS);
+	if (unlikely(err))
+		goto out_delist;
+
+	err = 0;
+	arg->flags = 0;
+	shwh = 0;
+	if (au_opt_test(au_mntflags(sb), SHWH)) {
+		shwh = 1;
+		au_fset_fillvdir(arg->flags, SHWH);
+	}
+	btop = au_fbtop(file);
+	bbot = au_fbbot_dir(file);
+	for (bindex = btop; !err && bindex <= bbot; bindex++) {
+		hf = au_hf_dir(file, bindex);
+		if (!hf)
+			continue;
+
+		offset = vfsub_llseek(hf, 0, SEEK_SET);
+		err = offset;
+		if (unlikely(offset))
+			break;
+
+		arg->bindex = bindex;
+		au_fclr_fillvdir(arg->flags, WHABLE);
+		if (shwh
+		    || (bindex != bbot
+			&& au_br_whable(au_sbr_perm(sb, bindex))))
+			au_fset_fillvdir(arg->flags, WHABLE);
+		do {
+			arg->err = 0;
+			au_fclr_fillvdir(arg->flags, CALLED);
+			/* smp_mb(); */
+			err = vfsub_iterate_dir(hf, &arg->ctx);
+			if (err >= 0)
+				err = arg->err;
+		} while (!err && au_ftest_fillvdir(arg->flags, CALLED));
+
+		/*
+		 * dir_relax() may be good for concurrency, but aufs should not
+		 * use it since it will cause a lockdep problem.
+		 */
+	}
+
+	if (!err && shwh)
+		err = au_handle_shwh(sb, arg->vdir, &arg->whlist, &arg->delist);
+
+	au_nhash_wh_free(&arg->whlist);
+
+out_delist:
+	au_nhash_de_free(&arg->delist);
+out:
+	return err;
+}
+
+static int read_vdir(struct file *file, int may_read)
+{
+	int err;
+	unsigned long expire;
+	unsigned char do_read;
+	struct fillvdir_arg arg = {
+		.ctx = {
+			.actor = fillvdir
+		}
+	};
+	struct inode *inode;
+	struct au_vdir *vdir, *allocated;
+
+	err = 0;
+	inode = file_inode(file);
+	IMustLock(inode);
+	IiMustWriteLock(inode);
+	SiMustAnyLock(inode->i_sb);
+
+	allocated = NULL;
+	do_read = 0;
+	expire = au_sbi(inode->i_sb)->si_rdcache;
+	vdir = au_ivdir(inode);
+	if (!vdir) {
+		do_read = 1;
+		vdir = alloc_vdir(file);
+		err = PTR_ERR(vdir);
+		if (IS_ERR(vdir))
+			goto out;
+		err = 0;
+		allocated = vdir;
+	} else if (may_read
+		   && (inode->i_version != vdir->vd_version
+		       || time_after(jiffies, vdir->vd_jiffy + expire))) {
+		do_read = 1;
+		err = reinit_vdir(vdir);
+		if (unlikely(err))
+			goto out;
+	}
+
+	if (!do_read)
+		return 0; /* success */
+
+	arg.file = file;
+	arg.vdir = vdir;
+	err = au_do_read_vdir(&arg);
+	if (!err) {
+		/* file->f_pos = 0; */ /* todo: ctx->pos? */
+		vdir->vd_version = inode->i_version;
+		vdir->vd_last.ul = 0;
+		vdir->vd_last.p.deblk = vdir->vd_deblk[0];
+		if (allocated)
+			au_set_ivdir(inode, allocated);
+	} else if (allocated)
+		au_vdir_free(allocated, /*atonce*/0);
+
+out:
+	return err;
+}
+
+static int copy_vdir(struct au_vdir *tgt, struct au_vdir *src)
+{
+	int err, rerr;
+	unsigned long ul, n;
+	const unsigned int deblk_sz = src->vd_deblk_sz;
+
+	AuDebugOn(tgt->vd_nblk != 1);
+
+	err = -ENOMEM;
+	if (tgt->vd_nblk < src->vd_nblk) {
+		unsigned char **p;
+
+		p = au_krealloc(tgt->vd_deblk, sizeof(*p) * src->vd_nblk,
+				GFP_NOFS, /*may_shrink*/0);
+		if (unlikely(!p))
+			goto out;
+		tgt->vd_deblk = p;
+	}
+
+	if (tgt->vd_deblk_sz != deblk_sz) {
+		unsigned char *p;
+
+		tgt->vd_deblk_sz = deblk_sz;
+		p = au_krealloc(tgt->vd_deblk[0], deblk_sz, GFP_NOFS,
+				/*may_shrink*/1);
+		if (unlikely(!p))
+			goto out;
+		tgt->vd_deblk[0] = p;
+	}
+	memcpy(tgt->vd_deblk[0], src->vd_deblk[0], deblk_sz);
+	tgt->vd_version = src->vd_version;
+	tgt->vd_jiffy = src->vd_jiffy;
+
+	n = src->vd_nblk;
+	for (ul = 1; ul < n; ul++) {
+		tgt->vd_deblk[ul] = kmemdup(src->vd_deblk[ul], deblk_sz,
+					    GFP_NOFS);
+		if (unlikely(!tgt->vd_deblk[ul]))
+			goto out;
+		tgt->vd_nblk++;
+	}
+	tgt->vd_nblk = n;
+	tgt->vd_last.ul = src->vd_last.ul;
+	tgt->vd_last.p.deblk = tgt->vd_deblk[tgt->vd_last.ul];
+	tgt->vd_last.p.deblk += src->vd_last.p.deblk
+		- src->vd_deblk[src->vd_last.ul];
+	/* smp_mb(); */
+	return 0; /* success */
+
+out:
+	rerr = reinit_vdir(tgt);
+	BUG_ON(rerr);
+	return err;
+}
+
+int au_vdir_init(struct file *file)
+{
+	int err;
+	struct inode *inode;
+	struct au_vdir *vdir_cache, *allocated;
+
+	/* test file->f_pos here instead of ctx->pos */
+	err = read_vdir(file, !file->f_pos);
+	if (unlikely(err))
+		goto out;
+
+	allocated = NULL;
+	vdir_cache = au_fvdir_cache(file);
+	if (!vdir_cache) {
+		vdir_cache = alloc_vdir(file);
+		err = PTR_ERR(vdir_cache);
+		if (IS_ERR(vdir_cache))
+			goto out;
+		allocated = vdir_cache;
+	} else if (!file->f_pos && vdir_cache->vd_version != file->f_version) {
+		/* test file->f_pos here instead of ctx->pos */
+		err = reinit_vdir(vdir_cache);
+		if (unlikely(err))
+			goto out;
+	} else
+		return 0; /* success */
+
+	inode = file_inode(file);
+	err = copy_vdir(vdir_cache, au_ivdir(inode));
+	if (!err) {
+		file->f_version = inode->i_version;
+		if (allocated)
+			au_set_fvdir_cache(file, allocated);
+	} else if (allocated)
+		au_vdir_free(allocated, /*atonce*/0);
+
+out:
+	return err;
+}
+
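+/*
+ * the readdir position (ctx->pos) of a virtual directory is encoded as
+ * (deblk index) * vd_deblk_sz + (byte offset within that deblk);
+ * calc_offset() produces it from vd_last, and seek_vdir() below decodes it
+ * again by walking the entries of the targeted deblk.
+ */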
+static loff_t calc_offset(struct au_vdir *vdir)
+{
+	loff_t offset;
+	union au_vdir_deblk_p p;
+
+	p.deblk = vdir->vd_deblk[vdir->vd_last.ul];
+	offset = vdir->vd_last.p.deblk - p.deblk;
+	offset += vdir->vd_deblk_sz * vdir->vd_last.ul;
+	return offset;
+}
+
+/* returns true or false */
+static int seek_vdir(struct file *file, struct dir_context *ctx)
+{
+	int valid;
+	unsigned int deblk_sz;
+	unsigned long ul, n;
+	loff_t offset;
+	union au_vdir_deblk_p p, deblk_end;
+	struct au_vdir *vdir_cache;
+
+	valid = 1;
+	vdir_cache = au_fvdir_cache(file);
+	offset = calc_offset(vdir_cache);
+	AuDbg("offset %lld\n", offset);
+	if (ctx->pos == offset)
+		goto out;
+
+	vdir_cache->vd_last.ul = 0;
+	vdir_cache->vd_last.p.deblk = vdir_cache->vd_deblk[0];
+	if (!ctx->pos)
+		goto out;
+
+	valid = 0;
+	deblk_sz = vdir_cache->vd_deblk_sz;
+	ul = div64_u64(ctx->pos, deblk_sz);
+	AuDbg("ul %lu\n", ul);
+	if (ul >= vdir_cache->vd_nblk)
+		goto out;
+
+	n = vdir_cache->vd_nblk;
+	for (; ul < n; ul++) {
+		p.deblk = vdir_cache->vd_deblk[ul];
+		deblk_end.deblk = p.deblk + deblk_sz;
+		offset = ul;
+		offset *= deblk_sz;
+		while (!is_deblk_end(&p, &deblk_end) && offset < ctx->pos) {
+			unsigned int l;
+
+			l = calc_size(p.de->de_str.len);
+			offset += l;
+			p.deblk += l;
+		}
+		if (!is_deblk_end(&p, &deblk_end)) {
+			valid = 1;
+			vdir_cache->vd_last.ul = ul;
+			vdir_cache->vd_last.p = p;
+			break;
+		}
+	}
+
+out:
+	/* smp_mb(); */
+	AuTraceErr(!valid);
+	return valid;
+}
+
+int au_vdir_fill_de(struct file *file, struct dir_context *ctx)
+{
+	unsigned int l, deblk_sz;
+	union au_vdir_deblk_p deblk_end;
+	struct au_vdir *vdir_cache;
+	struct au_vdir_de *de;
+
+	vdir_cache = au_fvdir_cache(file);
+	if (!seek_vdir(file, ctx))
+		return 0;
+
+	deblk_sz = vdir_cache->vd_deblk_sz;
+	while (1) {
+		deblk_end.deblk = vdir_cache->vd_deblk[vdir_cache->vd_last.ul];
+		deblk_end.deblk += deblk_sz;
+		while (!is_deblk_end(&vdir_cache->vd_last.p, &deblk_end)) {
+			de = vdir_cache->vd_last.p.de;
+			AuDbg("%.*s, off%lld, i%lu, dt%d\n",
+			      de->de_str.len, de->de_str.name, ctx->pos,
+			      (unsigned long)de->de_ino, de->de_type);
+			if (unlikely(!dir_emit(ctx, de->de_str.name,
+					       de->de_str.len, de->de_ino,
+					       de->de_type))) {
+				/* todo: ignore the error caused by udba? */
+				/* return err; */
+				return 0;
+			}
+
+			l = calc_size(de->de_str.len);
+			vdir_cache->vd_last.p.deblk += l;
+			ctx->pos += l;
+		}
+		if (vdir_cache->vd_last.ul < vdir_cache->vd_nblk - 1) {
+			vdir_cache->vd_last.ul++;
+			vdir_cache->vd_last.p.deblk
+				= vdir_cache->vd_deblk[vdir_cache->vd_last.ul];
+			ctx->pos = deblk_sz * vdir_cache->vd_last.ul;
+			continue;
+		}
+		break;
+	}
+
+	/* smp_mb(); */
+	return 0;
+}
diff --git b/fs/aufs/vfsub.c b/fs/aufs/vfsub.c
new file mode 100644
index 0000000..2e54bad
--- /dev/null
+++ b/fs/aufs/vfsub.c
@@ -0,0 +1,871 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sub-routines for VFS
+ */
+
+#include <linux/namei.h>
+#include <linux/nsproxy.h>
+#include <linux/security.h>
+#include <linux/splice.h>
+#include "../fs/mount.h"
+#include "aufs.h"
+
+#ifdef CONFIG_AUFS_BR_FUSE
+int vfsub_test_mntns(struct vfsmount *mnt, struct super_block *h_sb)
+{
+	struct nsproxy *ns;
+
+	if (!au_test_fuse(h_sb) || !au_userns)
+		return 0;
+
+	ns = current->nsproxy;
+	/* no {get,put}_nsproxy(ns) */
+	return real_mount(mnt)->mnt_ns == ns->mnt_ns ? 0 : -EACCES;
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
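+/*
+ * refresh the cached attributes of a lower (branch) object by calling
+ * vfs_getattr() on it, but only for filesystems which are local and known to
+ * need the refresh.  The optional @did reports whether a refresh was
+ * attempted, so that callers such as vfsub_create() can apply the same
+ * policy to the parent directory too.
+ */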
+int vfsub_update_h_iattr(struct path *h_path, int *did)
+{
+	int err;
+	struct kstat st;
+	struct super_block *h_sb;
+
+	/* for remote fs, leave work for its getattr or d_revalidate */
+	/* for bad i_attr fs, handle them in aufs_getattr() */
+	/* still some fs may acquire i_mutex. we need to skip them */
+	err = 0;
+	if (!did)
+		did = &err;
+	h_sb = h_path->dentry->d_sb;
+	*did = (!au_test_fs_remote(h_sb) && au_test_fs_refresh_iattr(h_sb));
+	if (*did)
+		err = vfs_getattr(h_path, &st);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct file *vfsub_dentry_open(struct path *path, int flags)
+{
+	struct file *file;
+
+	file = dentry_open(path, flags /* | __FMODE_NONOTIFY */,
+			   current_cred());
+	if (!IS_ERR_OR_NULL(file)
+	    && (file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+		i_readcount_inc(d_inode(path->dentry));
+
+	return file;
+}
+
+struct file *vfsub_filp_open(const char *path, int oflags, int mode)
+{
+	struct file *file;
+
+	lockdep_off();
+	file = filp_open(path,
+			 oflags /* | __FMODE_NONOTIFY */,
+			 mode);
+	lockdep_on();
+	if (IS_ERR(file))
+		goto out;
+	vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
+
+out:
+	return file;
+}
+
+/*
+ * Ideally this function should call VFS:do_last() in order to keep all of its
+ * checks. But it is very hard for aufs to regenerate several VFS internal
+ * structures such as nameidata. This is a second (or third) best approach.
+ * cf. linux/fs/namei.c:do_last(), lookup_open() and atomic_open().
+ */
+int vfsub_atomic_open(struct inode *dir, struct dentry *dentry,
+		      struct vfsub_aopen_args *args, struct au_branch *br)
+{
+	int err;
+	struct file *file = args->file;
+	/* copied from linux/fs/namei.c:atomic_open() */
+	struct dentry *const DENTRY_NOT_SET = (void *)-1UL;
+
+	IMustLock(dir);
+	AuDebugOn(!dir->i_op->atomic_open);
+
+	err = au_br_test_oflag(args->open_flag, br);
+	if (unlikely(err))
+		goto out;
+
+	args->file->f_path.dentry = DENTRY_NOT_SET;
+	args->file->f_path.mnt = au_br_mnt(br);
+	err = dir->i_op->atomic_open(dir, dentry, file, args->open_flag,
+				     args->create_mode, args->opened);
+	if (err >= 0) {
+		/* some filesystems don't set FILE_CREATED even on success? */
+		if (*args->opened & FILE_CREATED)
+			fsnotify_create(dir, dentry);
+	} else
+		goto out;
+
+	if (!err) {
+		/* todo: call VFS:may_open() here */
+		err = open_check_o_direct(file);
+		/* todo: ima_file_check() too? */
+		if (!err && (args->open_flag & __FMODE_EXEC))
+			err = deny_write_access(file);
+		if (unlikely(err))
+			/* note that the file is created and still opened */
+			goto out;
+	}
+
+	au_br_get(br);
+	fsnotify_open(file);
+
+out:
+	return err;
+}
+
+int vfsub_kern_path(const char *name, unsigned int flags, struct path *path)
+{
+	int err;
+
+	err = kern_path(name, flags, path);
+	if (!err && d_is_positive(path->dentry))
+		vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/
+	return err;
+}
+
+struct dentry *vfsub_lookup_one_len_unlocked(const char *name,
+					     struct dentry *parent, int len)
+{
+	struct path path = {
+		.mnt = NULL
+	};
+
+	path.dentry = lookup_one_len_unlocked(name, parent, len);
+	if (IS_ERR(path.dentry))
+		goto out;
+	if (d_is_positive(path.dentry))
+		vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/
+
+out:
+	AuTraceErrPtr(path.dentry);
+	return path.dentry;
+}
+
+struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent,
+				    int len)
+{
+	struct path path = {
+		.mnt = NULL
+	};
+
+	/* VFS checks it too, but by WARN_ON_ONCE() */
+	IMustLock(d_inode(parent));
+
+	path.dentry = lookup_one_len(name, parent, len);
+	if (IS_ERR(path.dentry))
+		goto out;
+	if (d_is_positive(path.dentry))
+		vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/
+
+out:
+	AuTraceErrPtr(path.dentry);
+	return path.dentry;
+}
+
+void vfsub_call_lkup_one(void *args)
+{
+	struct vfsub_lkup_one_args *a = args;
+	*a->errp = vfsub_lkup_one(a->name, a->parent);
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1,
+				 struct dentry *d2, struct au_hinode *hdir2)
+{
+	struct dentry *d;
+
+	lockdep_off();
+	d = lock_rename(d1, d2);
+	lockdep_on();
+	au_hn_suspend(hdir1);
+	if (hdir1 != hdir2)
+		au_hn_suspend(hdir2);
+
+	return d;
+}
+
+void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1,
+			 struct dentry *d2, struct au_hinode *hdir2)
+{
+	au_hn_resume(hdir1);
+	if (hdir1 != hdir2)
+		au_hn_resume(hdir2);
+	lockdep_off();
+	unlock_rename(d1, d2);
+	lockdep_on();
+}
+
+/* ---------------------------------------------------------------------- */
+
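+/*
+ * note on the pattern repeated below: the security_path_*() hooks expect
+ * @path to point at the parent directory while the new child is passed as a
+ * separate dentry, hence the temporary swap of path->dentry around each of
+ * those calls.
+ */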
+int vfsub_create(struct inode *dir, struct path *path, int mode, bool want_excl)
+{
+	int err;
+	struct dentry *d;
+
+	IMustLock(dir);
+
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	err = security_path_mknod(path, d, mode, 0);
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_create(dir, path->dentry, mode, want_excl);
+	lockdep_on();
+	if (!err) {
+		struct path tmp = *path;
+		int did;
+
+		vfsub_update_h_iattr(&tmp, &did);
+		if (did) {
+			tmp.dentry = path->dentry->d_parent;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+		}
+		/*ignore*/
+	}
+
+out:
+	return err;
+}
+
+int vfsub_symlink(struct inode *dir, struct path *path, const char *symname)
+{
+	int err;
+	struct dentry *d;
+
+	IMustLock(dir);
+
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	err = security_path_symlink(path, d, symname);
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_symlink(dir, path->dentry, symname);
+	lockdep_on();
+	if (!err) {
+		struct path tmp = *path;
+		int did;
+
+		vfsub_update_h_iattr(&tmp, &did);
+		if (did) {
+			tmp.dentry = path->dentry->d_parent;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+		}
+		/*ignore*/
+	}
+
+out:
+	return err;
+}
+
+int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev)
+{
+	int err;
+	struct dentry *d;
+
+	IMustLock(dir);
+
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	err = security_path_mknod(path, d, mode, new_encode_dev(dev));
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_mknod(dir, path->dentry, mode, dev);
+	lockdep_on();
+	if (!err) {
+		struct path tmp = *path;
+		int did;
+
+		vfsub_update_h_iattr(&tmp, &did);
+		if (did) {
+			tmp.dentry = path->dentry->d_parent;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+		}
+		/*ignore*/
+	}
+
+out:
+	return err;
+}
+
+static int au_test_nlink(struct inode *inode)
+{
+	const unsigned int link_max = UINT_MAX >> 1; /* rough margin */
+
+	if (!au_test_fs_no_limit_nlink(inode->i_sb)
+	    || inode->i_nlink < link_max)
+		return 0;
+	return -EMLINK;
+}
+
+int vfsub_link(struct dentry *src_dentry, struct inode *dir, struct path *path,
+	       struct inode **delegated_inode)
+{
+	int err;
+	struct dentry *d;
+
+	IMustLock(dir);
+
+	err = au_test_nlink(d_inode(src_dentry));
+	if (unlikely(err))
+		return err;
+
+	/* we don't call may_linkat() */
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	err = security_path_link(src_dentry, path, d);
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_link(src_dentry, dir, path->dentry, delegated_inode);
+	lockdep_on();
+	if (!err) {
+		struct path tmp = *path;
+		int did;
+
+		/* fuse has a different in-memory inode for the same inumber */
+		vfsub_update_h_iattr(&tmp, &did);
+		if (did) {
+			tmp.dentry = path->dentry->d_parent;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+			tmp.dentry = src_dentry;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+		}
+		/*ignore*/
+	}
+
+out:
+	return err;
+}
+
+int vfsub_rename(struct inode *src_dir, struct dentry *src_dentry,
+		 struct inode *dir, struct path *path,
+		 struct inode **delegated_inode)
+{
+	int err;
+	struct path tmp = {
+		.mnt	= path->mnt
+	};
+	struct dentry *d;
+
+	IMustLock(dir);
+	IMustLock(src_dir);
+
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	tmp.dentry = src_dentry->d_parent;
+	err = security_path_rename(&tmp, src_dentry, path, d, /*flags*/0);
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_rename(src_dir, src_dentry, dir, path->dentry,
+			 delegated_inode, /*flags*/0);
+	lockdep_on();
+	if (!err) {
+		int did;
+
+		tmp.dentry = d->d_parent;
+		vfsub_update_h_iattr(&tmp, &did);
+		if (did) {
+			tmp.dentry = src_dentry;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+			tmp.dentry = src_dentry->d_parent;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+		}
+		/*ignore*/
+	}
+
+out:
+	return err;
+}
+
+int vfsub_mkdir(struct inode *dir, struct path *path, int mode)
+{
+	int err;
+	struct dentry *d;
+
+	IMustLock(dir);
+
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	err = security_path_mkdir(path, d, mode);
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_mkdir(dir, path->dentry, mode);
+	lockdep_on();
+	if (!err) {
+		struct path tmp = *path;
+		int did;
+
+		vfsub_update_h_iattr(&tmp, &did);
+		if (did) {
+			tmp.dentry = path->dentry->d_parent;
+			vfsub_update_h_iattr(&tmp, /*did*/NULL);
+		}
+		/*ignore*/
+	}
+
+out:
+	return err;
+}
+
+int vfsub_rmdir(struct inode *dir, struct path *path)
+{
+	int err;
+	struct dentry *d;
+
+	IMustLock(dir);
+
+	d = path->dentry;
+	path->dentry = d->d_parent;
+	err = security_path_rmdir(path, d);
+	path->dentry = d;
+	if (unlikely(err))
+		goto out;
+
+	lockdep_off();
+	err = vfs_rmdir(dir, path->dentry);
+	lockdep_on();
+	if (!err) {
+		struct path tmp = {
+			.dentry	= path->dentry->d_parent,
+			.mnt	= path->mnt
+		};
+
+		vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/
+	}
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* todo: support mmap_sem? */
+ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count,
+		     loff_t *ppos)
+{
+	ssize_t err;
+
+	lockdep_off();
+	err = vfs_read(file, ubuf, count, ppos);
+	lockdep_on();
+	if (err >= 0)
+		vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
+	return err;
+}
+
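+/*
+ * the *_k variants feed a kernel buffer through the __user-typed
+ * vfsub_read_u()/vfsub_write_u() wrappers by temporarily widening the
+ * address limit with set_fs(KERNEL_DS) and restoring it afterwards.
+ */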
+/* todo: kernel_read()? */
+ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count,
+		     loff_t *ppos)
+{
+	ssize_t err;
+	mm_segment_t oldfs;
+	union {
+		void *k;
+		char __user *u;
+	} buf;
+
+	buf.k = kbuf;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = vfsub_read_u(file, buf.u, count, ppos);
+	set_fs(oldfs);
+	return err;
+}
+
+ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count,
+		      loff_t *ppos)
+{
+	ssize_t err;
+
+	lockdep_off();
+	err = vfs_write(file, ubuf, count, ppos);
+	lockdep_on();
+	if (err >= 0)
+		vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
+	return err;
+}
+
+ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, loff_t *ppos)
+{
+	ssize_t err;
+	mm_segment_t oldfs;
+	union {
+		void *k;
+		const char __user *u;
+	} buf;
+
+	buf.k = kbuf;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	err = vfsub_write_u(file, buf.u, count, ppos);
+	set_fs(oldfs);
+	return err;
+}
+
+int vfsub_flush(struct file *file, fl_owner_t id)
+{
+	int err;
+
+	err = 0;
+	if (file->f_op->flush) {
+		if (!au_test_nfs(file->f_path.dentry->d_sb))
+			err = file->f_op->flush(file, id);
+		else {
+			lockdep_off();
+			err = file->f_op->flush(file, id);
+			lockdep_on();
+		}
+		if (!err)
+			vfsub_update_h_iattr(&file->f_path, /*did*/NULL);
+		/*ignore*/
+	}
+	return err;
+}
+
+int vfsub_iterate_dir(struct file *file, struct dir_context *ctx)
+{
+	int err;
+
+	AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos);
+
+	lockdep_off();
+	err = iterate_dir(file, ctx);
+	lockdep_on();
+	if (err >= 0)
+		vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
+	return err;
+}
+
+long vfsub_splice_to(struct file *in, loff_t *ppos,
+		     struct pipe_inode_info *pipe, size_t len,
+		     unsigned int flags)
+{
+	long err;
+
+	lockdep_off();
+	err = do_splice_to(in, ppos, pipe, len, flags);
+	lockdep_on();
+	file_accessed(in);
+	if (err >= 0)
+		vfsub_update_h_iattr(&in->f_path, /*did*/NULL); /*ignore*/
+	return err;
+}
+
+long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out,
+		       loff_t *ppos, size_t len, unsigned int flags)
+{
+	long err;
+
+	lockdep_off();
+	err = do_splice_from(pipe, out, ppos, len, flags);
+	lockdep_on();
+	if (err >= 0)
+		vfsub_update_h_iattr(&out->f_path, /*did*/NULL); /*ignore*/
+	return err;
+}
+
+int vfsub_fsync(struct file *file, struct path *path, int datasync)
+{
+	int err;
+
+	/* file can be NULL */
+	lockdep_off();
+	err = vfs_fsync(file, datasync);
+	lockdep_on();
+	if (!err) {
+		if (!path) {
+			AuDebugOn(!file);
+			path = &file->f_path;
+		}
+		vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/
+	}
+	return err;
+}
+
+/* cf. open.c:do_sys_truncate() and do_sys_ftruncate() */
+int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr,
+		struct file *h_file)
+{
+	int err;
+	struct inode *h_inode;
+	struct super_block *h_sb;
+
+	if (!h_file) {
+		err = vfsub_truncate(h_path, length);
+		goto out;
+	}
+
+	h_inode = d_inode(h_path->dentry);
+	h_sb = h_inode->i_sb;
+	lockdep_off();
+	sb_start_write(h_sb);
+	lockdep_on();
+	err = locks_verify_truncate(h_inode, h_file, length);
+	if (!err)
+		err = security_path_truncate(h_path);
+	if (!err) {
+		lockdep_off();
+		err = do_truncate(h_path->dentry, length, attr, h_file);
+		lockdep_on();
+	}
+	lockdep_off();
+	sb_end_write(h_sb);
+	lockdep_on();
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct au_vfsub_mkdir_args {
+	int *errp;
+	struct inode *dir;
+	struct path *path;
+	int mode;
+};
+
+static void au_call_vfsub_mkdir(void *args)
+{
+	struct au_vfsub_mkdir_args *a = args;
+	*a->errp = vfsub_mkdir(a->dir, a->path, a->mode);
+}
+
+int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode)
+{
+	int err, do_sio, wkq_err;
+
+	do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE);
+	if (!do_sio) {
+		lockdep_off();
+		err = vfsub_mkdir(dir, path, mode);
+		lockdep_on();
+	} else {
+		struct au_vfsub_mkdir_args args = {
+			.errp	= &err,
+			.dir	= dir,
+			.path	= path,
+			.mode	= mode
+		};
+		wkq_err = au_wkq_wait(au_call_vfsub_mkdir, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+
+	return err;
+}
+
+struct au_vfsub_rmdir_args {
+	int *errp;
+	struct inode *dir;
+	struct path *path;
+};
+
+static void au_call_vfsub_rmdir(void *args)
+{
+	struct au_vfsub_rmdir_args *a = args;
+	*a->errp = vfsub_rmdir(a->dir, a->path);
+}
+
+int vfsub_sio_rmdir(struct inode *dir, struct path *path)
+{
+	int err, do_sio, wkq_err;
+
+	do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE);
+	if (!do_sio) {
+		lockdep_off();
+		err = vfsub_rmdir(dir, path);
+		lockdep_on();
+	} else {
+		struct au_vfsub_rmdir_args args = {
+			.errp	= &err,
+			.dir	= dir,
+			.path	= path
+		};
+		wkq_err = au_wkq_wait(au_call_vfsub_rmdir, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct notify_change_args {
+	int *errp;
+	struct path *path;
+	struct iattr *ia;
+	struct inode **delegated_inode;
+};
+
+static void call_notify_change(void *args)
+{
+	struct notify_change_args *a = args;
+	struct inode *h_inode;
+
+	h_inode = d_inode(a->path->dentry);
+	IMustLock(h_inode);
+
+	*a->errp = -EPERM;
+	if (!IS_IMMUTABLE(h_inode) && !IS_APPEND(h_inode)) {
+		lockdep_off();
+		*a->errp = notify_change(a->path->dentry, a->ia,
+					 a->delegated_inode);
+		lockdep_on();
+		if (!*a->errp)
+			vfsub_update_h_iattr(a->path, /*did*/NULL); /*ignore*/
+	}
+	AuTraceErr(*a->errp);
+}
+
+int vfsub_notify_change(struct path *path, struct iattr *ia,
+			struct inode **delegated_inode)
+{
+	int err;
+	struct notify_change_args args = {
+		.errp			= &err,
+		.path			= path,
+		.ia			= ia,
+		.delegated_inode	= delegated_inode
+	};
+
+	call_notify_change(&args);
+
+	return err;
+}
+
+int vfsub_sio_notify_change(struct path *path, struct iattr *ia,
+			    struct inode **delegated_inode)
+{
+	int err, wkq_err;
+	struct notify_change_args args = {
+		.errp			= &err,
+		.path			= path,
+		.ia			= ia,
+		.delegated_inode	= delegated_inode
+	};
+
+	wkq_err = au_wkq_wait(call_notify_change, &args);
+	if (unlikely(wkq_err))
+		err = wkq_err;
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct unlink_args {
+	int *errp;
+	struct inode *dir;
+	struct path *path;
+	struct inode **delegated_inode;
+};
+
+static void call_unlink(void *args)
+{
+	struct unlink_args *a = args;
+	struct dentry *d = a->path->dentry;
+	struct inode *h_inode;
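+	/*
+	 * on NFS, when this dentry holds the last reference, skip taking an
+	 * extra ref below so that the lower fs does not silly-rename the
+	 * file instead of really unlinking it.
+	 */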
+	const int stop_sillyrename = (au_test_nfs(d->d_sb)
+				      && au_dcount(d) == 1);
+
+	IMustLock(a->dir);
+
+	a->path->dentry = d->d_parent;
+	*a->errp = security_path_unlink(a->path, d);
+	a->path->dentry = d;
+	if (unlikely(*a->errp))
+		return;
+
+	if (!stop_sillyrename)
+		dget(d);
+	h_inode = NULL;
+	if (d_is_positive(d)) {
+		h_inode = d_inode(d);
+		ihold(h_inode);
+	}
+
+	lockdep_off();
+	*a->errp = vfs_unlink(a->dir, d, a->delegated_inode);
+	lockdep_on();
+	if (!*a->errp) {
+		struct path tmp = {
+			.dentry = d->d_parent,
+			.mnt	= a->path->mnt
+		};
+		vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/
+	}
+
+	if (!stop_sillyrename)
+		dput(d);
+	if (h_inode)
+		iput(h_inode);
+
+	AuTraceErr(*a->errp);
+}
+
+/*
+ * @dir: must be locked.
+ * @path: target path; its dentry is the entry to be unlinked.
+ */
+int vfsub_unlink(struct inode *dir, struct path *path,
+		 struct inode **delegated_inode, int force)
+{
+	int err;
+	struct unlink_args args = {
+		.errp			= &err,
+		.dir			= dir,
+		.path			= path,
+		.delegated_inode	= delegated_inode
+	};
+
+	if (!force)
+		call_unlink(&args);
+	else {
+		int wkq_err;
+
+		wkq_err = au_wkq_wait(call_unlink, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+
+	return err;
+}
diff --git b/fs/aufs/vfsub.h b/fs/aufs/vfsub.h
new file mode 100644
index 0000000..64fff0f
--- /dev/null
+++ b/fs/aufs/vfsub.h
@@ -0,0 +1,303 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * sub-routines for VFS
+ */
+
+#ifndef __AUFS_VFSUB_H__
+#define __AUFS_VFSUB_H__
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/posix_acl.h>
+#include <linux/xattr.h>
+#include "debug.h"
+
+/* copied from linux/fs/internal.h */
+/* todo: BAD approach!! */
+extern void __mnt_drop_write(struct vfsmount *);
+extern int open_check_o_direct(struct file *f);
+
+/* ---------------------------------------------------------------------- */
+
+/* lock subclass for lower inode */
+/* default MAX_LOCKDEP_SUBCLASSES(8) is not enough */
+/* reduce? gave up. */
+enum {
+	AuLsc_I_Begin = I_MUTEX_PARENT2, /* 5 */
+	AuLsc_I_PARENT,		/* lower inode, parent first */
+	AuLsc_I_PARENT2,	/* copyup dirs */
+	AuLsc_I_PARENT3,	/* copyup wh */
+	AuLsc_I_CHILD,
+	AuLsc_I_CHILD2,
+	AuLsc_I_End
+};
+
+/* to make debugging easier, do not make these inline functions */
+#define MtxMustLock(mtx)	AuDebugOn(!mutex_is_locked(mtx))
+#define IMustLock(i)		AuDebugOn(!inode_is_locked(i))
+
+/* ---------------------------------------------------------------------- */
+
+static inline void vfsub_drop_nlink(struct inode *inode)
+{
+	AuDebugOn(!inode->i_nlink);
+	drop_nlink(inode);
+}
+
+static inline void vfsub_dead_dir(struct inode *inode)
+{
+	AuDebugOn(!S_ISDIR(inode->i_mode));
+	inode->i_flags |= S_DEAD;
+	clear_nlink(inode);
+}
+
+static inline int vfsub_native_ro(struct inode *inode)
+{
+	return (inode->i_sb->s_flags & MS_RDONLY)
+		|| IS_RDONLY(inode)
+		/* || IS_APPEND(inode) */
+		|| IS_IMMUTABLE(inode);
+}
+
+#ifdef CONFIG_AUFS_BR_FUSE
+int vfsub_test_mntns(struct vfsmount *mnt, struct super_block *h_sb);
+#else
+AuStubInt0(vfsub_test_mntns, struct vfsmount *mnt, struct super_block *h_sb);
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+int vfsub_update_h_iattr(struct path *h_path, int *did);
+struct file *vfsub_dentry_open(struct path *path, int flags);
+struct file *vfsub_filp_open(const char *path, int oflags, int mode);
+struct vfsub_aopen_args {
+	struct file	*file;
+	unsigned int	open_flag;
+	umode_t		create_mode;
+	int		*opened;
+};
+struct au_branch;
+int vfsub_atomic_open(struct inode *dir, struct dentry *dentry,
+		      struct vfsub_aopen_args *args, struct au_branch *br);
+int vfsub_kern_path(const char *name, unsigned int flags, struct path *path);
+
+struct dentry *vfsub_lookup_one_len_unlocked(const char *name,
+					     struct dentry *parent, int len);
+struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent,
+				    int len);
+
+struct vfsub_lkup_one_args {
+	struct dentry **errp;
+	struct qstr *name;
+	struct dentry *parent;
+};
+
+static inline struct dentry *vfsub_lkup_one(struct qstr *name,
+					    struct dentry *parent)
+{
+	return vfsub_lookup_one_len(name->name, parent, name->len);
+}
+
+void vfsub_call_lkup_one(void *args);
+
+/* ---------------------------------------------------------------------- */
+
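+/*
+ * note: many helpers here and in vfsub.c call into the branch fs with
+ * lockdep temporarily disabled; aufs takes locks of the same classes as the
+ * lower fs, which would otherwise trigger false-positive lockdep reports.
+ */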
+static inline int vfsub_mnt_want_write(struct vfsmount *mnt)
+{
+	int err;
+
+	lockdep_off();
+	err = mnt_want_write(mnt);
+	lockdep_on();
+	return err;
+}
+
+static inline void vfsub_mnt_drop_write(struct vfsmount *mnt)
+{
+	lockdep_off();
+	mnt_drop_write(mnt);
+	lockdep_on();
+}
+
+#if 0 /* reserved */
+static inline void vfsub_mnt_drop_write_file(struct file *file)
+{
+	lockdep_off();
+	mnt_drop_write_file(file);
+	lockdep_on();
+}
+#endif
+
+/* ---------------------------------------------------------------------- */
+
+struct au_hinode;
+struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1,
+				 struct dentry *d2, struct au_hinode *hdir2);
+void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1,
+			 struct dentry *d2, struct au_hinode *hdir2);
+
+int vfsub_create(struct inode *dir, struct path *path, int mode,
+		 bool want_excl);
+int vfsub_symlink(struct inode *dir, struct path *path,
+		  const char *symname);
+int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev);
+int vfsub_link(struct dentry *src_dentry, struct inode *dir,
+	       struct path *path, struct inode **delegated_inode);
+int vfsub_rename(struct inode *src_hdir, struct dentry *src_dentry,
+		 struct inode *hdir, struct path *path,
+		 struct inode **delegated_inode);
+int vfsub_mkdir(struct inode *dir, struct path *path, int mode);
+int vfsub_rmdir(struct inode *dir, struct path *path);
+
+/* ---------------------------------------------------------------------- */
+
+ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count,
+		     loff_t *ppos);
+ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count,
+			loff_t *ppos);
+ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count,
+		      loff_t *ppos);
+ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count,
+		      loff_t *ppos);
+int vfsub_flush(struct file *file, fl_owner_t id);
+int vfsub_iterate_dir(struct file *file, struct dir_context *ctx);
+
+static inline loff_t vfsub_f_size_read(struct file *file)
+{
+	return i_size_read(file_inode(file));
+}
+
+static inline unsigned int vfsub_file_flags(struct file *file)
+{
+	unsigned int flags;
+
+	spin_lock(&file->f_lock);
+	flags = file->f_flags;
+	spin_unlock(&file->f_lock);
+
+	return flags;
+}
+
+static inline int vfsub_file_execed(struct file *file)
+{
+	/* todo: direct access f_flags */
+	return !!(vfsub_file_flags(file) & __FMODE_EXEC);
+}
+
+#if 0 /* reserved */
+static inline void vfsub_file_accessed(struct file *h_file)
+{
+	file_accessed(h_file);
+	vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); /*ignore*/
+}
+#endif
+
+#if 0 /* reserved */
+static inline void vfsub_touch_atime(struct vfsmount *h_mnt,
+				     struct dentry *h_dentry)
+{
+	struct path h_path = {
+		.dentry	= h_dentry,
+		.mnt	= h_mnt
+	};
+	touch_atime(&h_path);
+	vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/
+}
+#endif
+
+static inline int vfsub_update_time(struct inode *h_inode, struct timespec *ts,
+				    int flags)
+{
+	return update_time(h_inode, ts, flags);
+	/* no vfsub_update_h_iattr() since we don't have struct path */
+}
+
+#ifdef CONFIG_FS_POSIX_ACL
+static inline int vfsub_acl_chmod(struct inode *h_inode, umode_t h_mode)
+{
+	int err;
+
+	err = posix_acl_chmod(h_inode, h_mode);
+	if (err == -EOPNOTSUPP)
+		err = 0;
+	return err;
+}
+#else
+AuStubInt0(vfsub_acl_chmod, struct inode *h_inode, umode_t h_mode);
+#endif
+
+long vfsub_splice_to(struct file *in, loff_t *ppos,
+		     struct pipe_inode_info *pipe, size_t len,
+		     unsigned int flags);
+long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out,
+		       loff_t *ppos, size_t len, unsigned int flags);
+
+static inline long vfsub_truncate(struct path *path, loff_t length)
+{
+	long err;
+
+	lockdep_off();
+	err = vfs_truncate(path, length);
+	lockdep_on();
+	return err;
+}
+
+int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr,
+		struct file *h_file);
+int vfsub_fsync(struct file *file, struct path *path, int datasync);
+
+/* ---------------------------------------------------------------------- */
+
+static inline loff_t vfsub_llseek(struct file *file, loff_t offset, int origin)
+{
+	loff_t err;
+
+	lockdep_off();
+	err = vfs_llseek(file, offset, origin);
+	lockdep_on();
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode);
+int vfsub_sio_rmdir(struct inode *dir, struct path *path);
+int vfsub_sio_notify_change(struct path *path, struct iattr *ia,
+			    struct inode **delegated_inode);
+int vfsub_notify_change(struct path *path, struct iattr *ia,
+			struct inode **delegated_inode);
+int vfsub_unlink(struct inode *dir, struct path *path,
+		 struct inode **delegated_inode, int force);
+
+/* ---------------------------------------------------------------------- */
+
+static inline int vfsub_setxattr(struct dentry *dentry, const char *name,
+				 const void *value, size_t size, int flags)
+{
+	int err;
+
+	lockdep_off();
+	err = vfs_setxattr(dentry, name, value, size, flags);
+	lockdep_on();
+
+	return err;
+}
+
+static inline int vfsub_removexattr(struct dentry *dentry, const char *name)
+{
+	int err;
+
+	lockdep_off();
+	err = vfs_removexattr(dentry, name);
+	lockdep_on();
+
+	return err;
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_VFSUB_H__ */
diff --git b/fs/aufs/wbr_policy.c b/fs/aufs/wbr_policy.c
new file mode 100644
index 0000000..79eb9b4
--- /dev/null
+++ b/fs/aufs/wbr_policy.c
@@ -0,0 +1,752 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * policies for selecting one among multiple writable branches
+ */
+
+#include <linux/statfs.h>
+#include "aufs.h"
+
+/* subset of cpup_attr() */
+static noinline_for_stack
+int au_cpdown_attr(struct path *h_path, struct dentry *h_src)
+{
+	int err, sbits;
+	struct iattr ia;
+	struct inode *h_isrc;
+
+	h_isrc = d_inode(h_src);
+	ia.ia_valid = ATTR_FORCE | ATTR_MODE | ATTR_UID | ATTR_GID;
+	ia.ia_mode = h_isrc->i_mode;
+	ia.ia_uid = h_isrc->i_uid;
+	ia.ia_gid = h_isrc->i_gid;
+	sbits = !!(ia.ia_mode & (S_ISUID | S_ISGID));
+	au_cpup_attr_flags(d_inode(h_path->dentry), h_isrc->i_flags);
+	/* no delegation since it is just created */
+	err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL);
+
+	/* is this nfs only? */
+	if (!err && sbits && au_test_nfs(h_path->dentry->d_sb)) {
+		ia.ia_valid = ATTR_FORCE | ATTR_MODE;
+		ia.ia_mode = h_isrc->i_mode;
+		err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL);
+	}
+
+	return err;
+}
+
+#define AuCpdown_PARENT_OPQ	1
+#define AuCpdown_WHED		(1 << 1)
+#define AuCpdown_MADE_DIR	(1 << 2)
+#define AuCpdown_DIROPQ		(1 << 3)
+#define au_ftest_cpdown(flags, name)	((flags) & AuCpdown_##name)
+#define au_fset_cpdown(flags, name) \
+	do { (flags) |= AuCpdown_##name; } while (0)
+#define au_fclr_cpdown(flags, name) \
+	do { (flags) &= ~AuCpdown_##name; } while (0)
+
+static int au_cpdown_dir_opq(struct dentry *dentry, aufs_bindex_t bdst,
+			     unsigned int *flags)
+{
+	int err;
+	struct dentry *opq_dentry;
+
+	opq_dentry = au_diropq_create(dentry, bdst);
+	err = PTR_ERR(opq_dentry);
+	if (IS_ERR(opq_dentry))
+		goto out;
+	dput(opq_dentry);
+	au_fset_cpdown(*flags, DIROPQ);
+
+out:
+	return err;
+}
+
+static int au_cpdown_dir_wh(struct dentry *dentry, struct dentry *h_parent,
+			    struct inode *dir, aufs_bindex_t bdst)
+{
+	int err;
+	struct path h_path;
+	struct au_branch *br;
+
+	br = au_sbr(dentry->d_sb, bdst);
+	h_path.dentry = au_wh_lkup(h_parent, &dentry->d_name, br);
+	err = PTR_ERR(h_path.dentry);
+	if (IS_ERR(h_path.dentry))
+		goto out;
+
+	err = 0;
+	if (d_is_positive(h_path.dentry)) {
+		h_path.mnt = au_br_mnt(br);
+		err = au_wh_unlink_dentry(au_h_iptr(dir, bdst), &h_path,
+					  dentry);
+	}
+	dput(h_path.dentry);
+
+out:
+	return err;
+}
+
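+/*
+ * create the directory on the branch @bdst, copying the attributes from the
+ * top branch, handling the whiteout and diropq entries, and reverting
+ * everything on failure.
+ */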
+static int au_cpdown_dir(struct dentry *dentry, aufs_bindex_t bdst,
+			 struct au_pin *pin,
+			 struct dentry *h_parent, void *arg)
+{
+	int err, rerr;
+	aufs_bindex_t bopq, btop;
+	struct path h_path;
+	struct dentry *parent;
+	struct inode *h_dir, *h_inode, *inode, *dir;
+	unsigned int *flags = arg;
+
+	btop = au_dbtop(dentry);
+	/* dentry is di-locked */
+	parent = dget_parent(dentry);
+	dir = d_inode(parent);
+	h_dir = d_inode(h_parent);
+	AuDebugOn(h_dir != au_h_iptr(dir, bdst));
+	IMustLock(h_dir);
+
+	err = au_lkup_neg(dentry, bdst, /*wh*/0);
+	if (unlikely(err < 0))
+		goto out;
+	h_path.dentry = au_h_dptr(dentry, bdst);
+	h_path.mnt = au_sbr_mnt(dentry->d_sb, bdst);
+	err = vfsub_sio_mkdir(au_h_iptr(dir, bdst), &h_path,
+			      S_IRWXU | S_IRUGO | S_IXUGO);
+	if (unlikely(err))
+		goto out_put;
+	au_fset_cpdown(*flags, MADE_DIR);
+
+	bopq = au_dbdiropq(dentry);
+	au_fclr_cpdown(*flags, WHED);
+	au_fclr_cpdown(*flags, DIROPQ);
+	if (au_dbwh(dentry) == bdst)
+		au_fset_cpdown(*flags, WHED);
+	if (!au_ftest_cpdown(*flags, PARENT_OPQ) && bopq <= bdst)
+		au_fset_cpdown(*flags, PARENT_OPQ);
+	h_inode = d_inode(h_path.dentry);
+	inode_lock_nested(h_inode, AuLsc_I_CHILD);
+	if (au_ftest_cpdown(*flags, WHED)) {
+		err = au_cpdown_dir_opq(dentry, bdst, flags);
+		if (unlikely(err)) {
+			inode_unlock(h_inode);
+			goto out_dir;
+		}
+	}
+
+	err = au_cpdown_attr(&h_path, au_h_dptr(dentry, btop));
+	inode_unlock(h_inode);
+	if (unlikely(err))
+		goto out_opq;
+
+	if (au_ftest_cpdown(*flags, WHED)) {
+		err = au_cpdown_dir_wh(dentry, h_parent, dir, bdst);
+		if (unlikely(err))
+			goto out_opq;
+	}
+
+	inode = d_inode(dentry);
+	if (au_ibbot(inode) < bdst)
+		au_set_ibbot(inode, bdst);
+	au_set_h_iptr(inode, bdst, au_igrab(h_inode),
+		      au_hi_flags(inode, /*isdir*/1));
+	au_fhsm_wrote(dentry->d_sb, bdst, /*force*/0);
+	goto out; /* success */
+
+	/* revert */
+out_opq:
+	if (au_ftest_cpdown(*flags, DIROPQ)) {
+		inode_lock_nested(h_inode, AuLsc_I_CHILD);
+		rerr = au_diropq_remove(dentry, bdst);
+		inode_unlock(h_inode);
+		if (unlikely(rerr)) {
+			AuIOErr("failed removing diropq for %pd b%d (%d)\n",
+				dentry, bdst, rerr);
+			err = -EIO;
+			goto out;
+		}
+	}
+out_dir:
+	if (au_ftest_cpdown(*flags, MADE_DIR)) {
+		rerr = vfsub_sio_rmdir(au_h_iptr(dir, bdst), &h_path);
+		if (unlikely(rerr)) {
+			AuIOErr("failed removing %pd b%d (%d)\n",
+				dentry, bdst, rerr);
+			err = -EIO;
+		}
+	}
+out_put:
+	au_set_h_dptr(dentry, bdst, NULL);
+	if (au_dbbot(dentry) == bdst)
+		au_update_dbbot(dentry);
+out:
+	dput(parent);
+	return err;
+}
+
+int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst)
+{
+	int err;
+	unsigned int flags;
+
+	flags = 0;
+	err = au_cp_dirs(dentry, bdst, au_cpdown_dir, &flags);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* policies for create */
+
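+/*
+ * return the branch index to create on: @bindex itself, unless an ancestor
+ * directory keeps a diropq on an upper branch, in which case the smallest
+ * such diropq index wins so that the new entry is not hidden by the opaque
+ * directory.
+ */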
+int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	int err, i, j, ndentry;
+	aufs_bindex_t bopq;
+	struct au_dcsub_pages dpages;
+	struct au_dpage *dpage;
+	struct dentry **dentries, *parent, *d;
+
+	err = au_dpages_init(&dpages, GFP_NOFS);
+	if (unlikely(err))
+		goto out;
+	parent = dget_parent(dentry);
+	err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/0);
+	if (unlikely(err))
+		goto out_free;
+
+	err = bindex;
+	for (i = 0; i < dpages.ndpage; i++) {
+		dpage = dpages.dpages + i;
+		dentries = dpage->dentries;
+		ndentry = dpage->ndentry;
+		for (j = 0; j < ndentry; j++) {
+			d = dentries[j];
+			di_read_lock_parent2(d, !AuLock_IR);
+			bopq = au_dbdiropq(d);
+			di_read_unlock(d, !AuLock_IR);
+			if (bopq >= 0 && bopq < err)
+				err = bopq;
+		}
+	}
+
+out_free:
+	dput(parent);
+	au_dpages_free(&dpages);
+out:
+	return err;
+}
+
+static int au_wbr_bu(struct super_block *sb, aufs_bindex_t bindex)
+{
+	for (; bindex >= 0; bindex--)
+		if (!au_br_rdonly(au_sbr(sb, bindex)))
+			return bindex;
+	return -EROFS;
+}
+
+/* top down parent */
+static int au_wbr_create_tdp(struct dentry *dentry,
+			     unsigned int flags __maybe_unused)
+{
+	int err;
+	aufs_bindex_t btop, bindex;
+	struct super_block *sb;
+	struct dentry *parent, *h_parent;
+
+	sb = dentry->d_sb;
+	btop = au_dbtop(dentry);
+	err = btop;
+	if (!au_br_rdonly(au_sbr(sb, btop)))
+		goto out;
+
+	err = -EROFS;
+	parent = dget_parent(dentry);
+	for (bindex = au_dbtop(parent); bindex < btop; bindex++) {
+		h_parent = au_h_dptr(parent, bindex);
+		if (!h_parent || d_is_negative(h_parent))
+			continue;
+
+		if (!au_br_rdonly(au_sbr(sb, bindex))) {
+			err = bindex;
+			break;
+		}
+	}
+	dput(parent);
+
+	/* bottom up here */
+	if (unlikely(err < 0)) {
+		err = au_wbr_bu(sb, btop - 1);
+		if (err >= 0)
+			err = au_wbr_nonopq(dentry, err);
+	}
+
+out:
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* an exception for the policies other than tdp */
+static int au_wbr_create_exp(struct dentry *dentry)
+{
+	int err;
+	aufs_bindex_t bwh, bdiropq;
+	struct dentry *parent;
+
+	err = -1;
+	bwh = au_dbwh(dentry);
+	parent = dget_parent(dentry);
+	bdiropq = au_dbdiropq(parent);
+	if (bwh >= 0) {
+		if (bdiropq >= 0)
+			err = min(bdiropq, bwh);
+		else
+			err = bwh;
+		AuDbg("%d\n", err);
+	} else if (bdiropq >= 0) {
+		err = bdiropq;
+		AuDbg("%d\n", err);
+	}
+	dput(parent);
+
+	if (err >= 0)
+		err = au_wbr_nonopq(dentry, err);
+
+	if (err >= 0 && au_br_rdonly(au_sbr(dentry->d_sb, err)))
+		err = -1;
+
+	AuDbg("%d\n", err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* round robin */
+static int au_wbr_create_init_rr(struct super_block *sb)
+{
+	int err;
+
+	err = au_wbr_bu(sb, au_sbbot(sb));
+	atomic_set(&au_sbi(sb)->si_wbr_rr_next, -err); /* less important */
+	/* smp_mb(); */
+
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+static int au_wbr_create_rr(struct dentry *dentry, unsigned int flags)
+{
+	int err, nbr;
+	unsigned int u;
+	aufs_bindex_t bindex, bbot;
+	struct super_block *sb;
+	atomic_t *next;
+
+	err = au_wbr_create_exp(dentry);
+	if (err >= 0)
+		goto out;
+
+	sb = dentry->d_sb;
+	next = &au_sbi(sb)->si_wbr_rr_next;
+	bbot = au_sbbot(sb);
+	nbr = bbot + 1;
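+	/*
+	 * pick the next branch round-robin: decrement the shared counter,
+	 * take it modulo the number of branches, and skip read-only
+	 * branches (at most nbr tries). for directories the counter is
+	 * only read, not advanced.
+	 */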
+	for (bindex = 0; bindex <= bbot; bindex++) {
+		if (!au_ftest_wbr(flags, DIR)) {
+			err = atomic_dec_return(next) + 1;
+			/* modulo for 0 is meaningless */
+			if (unlikely(!err))
+				err = atomic_dec_return(next) + 1;
+		} else
+			err = atomic_read(next);
+		AuDbg("%d\n", err);
+		u = err;
+		err = u % nbr;
+		AuDbg("%d\n", err);
+		if (!au_br_rdonly(au_sbr(sb, err)))
+			break;
+		err = -EROFS;
+	}
+
+	if (err >= 0)
+		err = au_wbr_nonopq(dentry, err);
+
+out:
+	AuDbg("%d\n", err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* most free space */
+static void au_mfs(struct dentry *dentry, struct dentry *parent)
+{
+	struct super_block *sb;
+	struct au_branch *br;
+	struct au_wbr_mfs *mfs;
+	struct dentry *h_parent;
+	aufs_bindex_t bindex, bbot;
+	int err;
+	unsigned long long b, bavail;
+	struct path h_path;
+	/* reduce the stack usage */
+	struct kstatfs *st;
+
+	st = kmalloc(sizeof(*st), GFP_NOFS);
+	if (unlikely(!st)) {
+		AuWarn1("failed updating mfs(%d), ignored\n", -ENOMEM);
+		return;
+	}
+
+	bavail = 0;
+	sb = dentry->d_sb;
+	mfs = &au_sbi(sb)->si_wbr_mfs;
+	MtxMustLock(&mfs->mfs_lock);
+	mfs->mfs_bindex = -EROFS;
+	mfs->mfsrr_bytes = 0;
+	if (!parent) {
+		bindex = 0;
+		bbot = au_sbbot(sb);
+	} else {
+		bindex = au_dbtop(parent);
+		bbot = au_dbtaildir(parent);
+	}
+
+	for (; bindex <= bbot; bindex++) {
+		if (parent) {
+			h_parent = au_h_dptr(parent, bindex);
+			if (!h_parent || d_is_negative(h_parent))
+				continue;
+		}
+		br = au_sbr(sb, bindex);
+		if (au_br_rdonly(br))
+			continue;
+
+		/* sb->s_root for NFS is unreliable */
+		h_path.mnt = au_br_mnt(br);
+		h_path.dentry = h_path.mnt->mnt_root;
+		err = vfs_statfs(&h_path, st);
+		if (unlikely(err)) {
+			AuWarn1("failed statfs, b%d, %d\n", bindex, err);
+			continue;
+		}
+
+		/* when the available size is equal, select the lower one */
+		BUILD_BUG_ON(sizeof(b) < sizeof(st->f_bavail)
+			     || sizeof(b) < sizeof(st->f_bsize));
+		b = st->f_bavail * st->f_bsize;
+		br->br_wbr->wbr_bytes = b;
+		if (b >= bavail) {
+			bavail = b;
+			mfs->mfs_bindex = bindex;
+			mfs->mfs_jiffy = jiffies;
+		}
+	}
+
+	mfs->mfsrr_bytes = bavail;
+	AuDbg("b%d\n", mfs->mfs_bindex);
+	au_delayed_kfree(st);
+}
+
+static int au_wbr_create_mfs(struct dentry *dentry, unsigned int flags)
+{
+	int err;
+	struct dentry *parent;
+	struct super_block *sb;
+	struct au_wbr_mfs *mfs;
+
+	err = au_wbr_create_exp(dentry);
+	if (err >= 0)
+		goto out;
+
+	sb = dentry->d_sb;
+	parent = NULL;
+	if (au_ftest_wbr(flags, PARENT))
+		parent = dget_parent(dentry);
+	mfs = &au_sbi(sb)->si_wbr_mfs;
+	mutex_lock(&mfs->mfs_lock);
+	if (time_after(jiffies, mfs->mfs_jiffy + mfs->mfs_expire)
+	    || mfs->mfs_bindex < 0
+	    || au_br_rdonly(au_sbr(sb, mfs->mfs_bindex)))
+		au_mfs(dentry, parent);
+	mutex_unlock(&mfs->mfs_lock);
+	err = mfs->mfs_bindex;
+	dput(parent);
+
+	if (err >= 0)
+		err = au_wbr_nonopq(dentry, err);
+
+out:
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+static int au_wbr_create_init_mfs(struct super_block *sb)
+{
+	struct au_wbr_mfs *mfs;
+
+	mfs = &au_sbi(sb)->si_wbr_mfs;
+	mutex_init(&mfs->mfs_lock);
+	mfs->mfs_jiffy = 0;
+	mfs->mfs_bindex = -EROFS;
+
+	return 0;
+}
+
+static int au_wbr_create_fin_mfs(struct super_block *sb __maybe_unused)
+{
+	mutex_destroy(&au_sbi(sb)->si_wbr_mfs.mfs_lock);
+	return 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* most free space and then round robin */
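+/*
+ * use the mfs policy first; when the branch it picked has less free space
+ * than the configured watermark, fall back to round-robin.
+ */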
+static int au_wbr_create_mfsrr(struct dentry *dentry, unsigned int flags)
+{
+	int err;
+	struct au_wbr_mfs *mfs;
+
+	err = au_wbr_create_mfs(dentry, flags);
+	if (err >= 0) {
+		mfs = &au_sbi(dentry->d_sb)->si_wbr_mfs;
+		mutex_lock(&mfs->mfs_lock);
+		if (mfs->mfsrr_bytes < mfs->mfsrr_watermark)
+			err = au_wbr_create_rr(dentry, flags);
+		mutex_unlock(&mfs->mfs_lock);
+	}
+
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+static int au_wbr_create_init_mfsrr(struct super_block *sb)
+{
+	int err;
+
+	au_wbr_create_init_mfs(sb); /* ignore */
+	err = au_wbr_create_init_rr(sb);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* top down parent and most free space */
+static int au_wbr_create_pmfs(struct dentry *dentry, unsigned int flags)
+{
+	int err, e2;
+	unsigned long long b;
+	aufs_bindex_t bindex, btop, bbot;
+	struct super_block *sb;
+	struct dentry *parent, *h_parent;
+	struct au_branch *br;
+
+	err = au_wbr_create_tdp(dentry, flags);
+	if (unlikely(err < 0))
+		goto out;
+	parent = dget_parent(dentry);
+	btop = au_dbtop(parent);
+	bbot = au_dbtaildir(parent);
+	if (btop == bbot)
+		goto out_parent; /* success */
+
+	e2 = au_wbr_create_mfs(dentry, flags);
+	if (e2 < 0)
+		goto out_parent; /* success */
+
+	/* when the available size is equal, select upper one */
+	sb = dentry->d_sb;
+	br = au_sbr(sb, err);
+	b = br->br_wbr->wbr_bytes;
+	AuDbg("b%d, %llu\n", err, b);
+
+	for (bindex = btop; bindex <= bbot; bindex++) {
+		h_parent = au_h_dptr(parent, bindex);
+		if (!h_parent || d_is_negative(h_parent))
+			continue;
+
+		br = au_sbr(sb, bindex);
+		if (!au_br_rdonly(br) && br->br_wbr->wbr_bytes > b) {
+			b = br->br_wbr->wbr_bytes;
+			err = bindex;
+			AuDbg("b%d, %llu\n", err, b);
+		}
+	}
+
+	if (err >= 0)
+		err = au_wbr_nonopq(dentry, err);
+
+out_parent:
+	dput(parent);
+out:
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * - top down parent
+ * - most free space with parent
+ * - most free space round-robin regardless of the parent
+ */
+static int au_wbr_create_pmfsrr(struct dentry *dentry, unsigned int flags)
+{
+	int err;
+	unsigned long long watermark;
+	struct super_block *sb;
+	struct au_branch *br;
+	struct au_wbr_mfs *mfs;
+
+	err = au_wbr_create_pmfs(dentry, flags | AuWbr_PARENT);
+	if (unlikely(err < 0))
+		goto out;
+
+	sb = dentry->d_sb;
+	br = au_sbr(sb, err);
+	mfs = &au_sbi(sb)->si_wbr_mfs;
+	mutex_lock(&mfs->mfs_lock);
+	watermark = mfs->mfsrr_watermark;
+	mutex_unlock(&mfs->mfs_lock);
+	if (br->br_wbr->wbr_bytes < watermark)
+		/* regardless of the parent dir */
+		err = au_wbr_create_mfsrr(dentry, flags);
+
+out:
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* policies for copyup */
+
+/* top down parent */
+static int au_wbr_copyup_tdp(struct dentry *dentry)
+{
+	return au_wbr_create_tdp(dentry, /*flags, anything is ok*/0);
+}
+
+/* bottom up parent */
+static int au_wbr_copyup_bup(struct dentry *dentry)
+{
+	int err;
+	aufs_bindex_t bindex, btop;
+	struct dentry *parent, *h_parent;
+	struct super_block *sb;
+
+	err = -EROFS;
+	sb = dentry->d_sb;
+	parent = dget_parent(dentry);
+	btop = au_dbtop(parent);
+	for (bindex = au_dbtop(dentry); bindex >= btop; bindex--) {
+		h_parent = au_h_dptr(parent, bindex);
+		if (!h_parent || d_is_negative(h_parent))
+			continue;
+
+		if (!au_br_rdonly(au_sbr(sb, bindex))) {
+			err = bindex;
+			break;
+		}
+	}
+	dput(parent);
+
+	/* bottom up here */
+	if (unlikely(err < 0))
+		err = au_wbr_bu(sb, btop - 1);
+
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+/* bottom up */
+int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t btop)
+{
+	int err;
+
+	err = au_wbr_bu(dentry->d_sb, btop);
+	AuDbg("b%d\n", err);
+	if (err > btop)
+		err = au_wbr_nonopq(dentry, err);
+
+	AuDbg("b%d\n", err);
+	return err;
+}
+
+static int au_wbr_copyup_bu(struct dentry *dentry)
+{
+	int err;
+	aufs_bindex_t btop;
+
+	btop = au_dbtop(dentry);
+	err = au_wbr_do_copyup_bu(dentry, btop);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct au_wbr_copyup_operations au_wbr_copyup_ops[] = {
+	[AuWbrCopyup_TDP] = {
+		.copyup	= au_wbr_copyup_tdp
+	},
+	[AuWbrCopyup_BUP] = {
+		.copyup	= au_wbr_copyup_bup
+	},
+	[AuWbrCopyup_BU] = {
+		.copyup	= au_wbr_copyup_bu
+	}
+};
+
+struct au_wbr_create_operations au_wbr_create_ops[] = {
+	[AuWbrCreate_TDP] = {
+		.create	= au_wbr_create_tdp
+	},
+	[AuWbrCreate_RR] = {
+		.create	= au_wbr_create_rr,
+		.init	= au_wbr_create_init_rr
+	},
+	[AuWbrCreate_MFS] = {
+		.create	= au_wbr_create_mfs,
+		.init	= au_wbr_create_init_mfs,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_MFSV] = {
+		.create	= au_wbr_create_mfs,
+		.init	= au_wbr_create_init_mfs,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_MFSRR] = {
+		.create	= au_wbr_create_mfsrr,
+		.init	= au_wbr_create_init_mfsrr,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_MFSRRV] = {
+		.create	= au_wbr_create_mfsrr,
+		.init	= au_wbr_create_init_mfsrr,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_PMFS] = {
+		.create	= au_wbr_create_pmfs,
+		.init	= au_wbr_create_init_mfs,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_PMFSV] = {
+		.create	= au_wbr_create_pmfs,
+		.init	= au_wbr_create_init_mfs,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_PMFSRR] = {
+		.create	= au_wbr_create_pmfsrr,
+		.init	= au_wbr_create_init_mfsrr,
+		.fin	= au_wbr_create_fin_mfs
+	},
+	[AuWbrCreate_PMFSRRV] = {
+		.create	= au_wbr_create_pmfsrr,
+		.init	= au_wbr_create_init_mfsrr,
+		.fin	= au_wbr_create_fin_mfs
+	}
+};
diff --git b/fs/aufs/whout.c b/fs/aufs/whout.c
new file mode 100644
index 0000000..54feb08
--- /dev/null
+++ b/fs/aufs/whout.c
@@ -0,0 +1,1047 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * whiteout for logical deletion and opaque directory
+ */
+
+#include "aufs.h"
+
+#define WH_MASK			S_IRUGO
+
+/*
+ * If a directory contains this file, then it is opaque.  We start with the
+ * .wh. flag so that it is blocked by lookup.
+ */
+static struct qstr diropq_name = QSTR_INIT(AUFS_WH_DIROPQ,
+					   sizeof(AUFS_WH_DIROPQ) - 1);
+
+/*
+ * generate the whiteout name, which is NOT NUL-terminated.
+ * @wh: whiteout qstr to be filled
+ * @name: original qstr (d_name of the target)
+ * returns zero on success, otherwise an error.
+ * on success, wh->name must be freed by kfree().
+ */
+int au_wh_name_alloc(struct qstr *wh, const struct qstr *name)
+{
+	char *p;
+
+	if (unlikely(name->len > PATH_MAX - AUFS_WH_PFX_LEN))
+		return -ENAMETOOLONG;
+
+	wh->len = name->len + AUFS_WH_PFX_LEN;
+	p = kmalloc(wh->len, GFP_NOFS);
+	wh->name = p;
+	if (p) {
+		memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
+		memcpy(p + AUFS_WH_PFX_LEN, name->name, name->len);
+		/* smp_mb(); */
+		return 0;
+	}
+	return -ENOMEM;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * test if the @wh_name exists under @h_parent.
+ * @try_sio specifies whether super-io is necessary.
+ */
+int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio)
+{
+	int err;
+	struct dentry *wh_dentry;
+
+	if (!try_sio)
+		wh_dentry = vfsub_lkup_one(wh_name, h_parent);
+	else
+		wh_dentry = au_sio_lkup_one(wh_name, h_parent);
+	err = PTR_ERR(wh_dentry);
+	if (IS_ERR(wh_dentry)) {
+		if (err == -ENAMETOOLONG)
+			err = 0;
+		goto out;
+	}
+
+	err = 0;
+	if (d_is_negative(wh_dentry))
+		goto out_wh; /* success */
+
+	err = 1;
+	if (d_is_reg(wh_dentry))
+		goto out_wh; /* success */
+
+	err = -EIO;
+	AuIOErr("%pd Invalid whiteout entry type 0%o.\n",
+		wh_dentry, d_inode(wh_dentry)->i_mode);
+
+out_wh:
+	dput(wh_dentry);
+out:
+	return err;
+}
+
+/*
+ * test whether @h_dentry is marked opaque.
+ */
+int au_diropq_test(struct dentry *h_dentry)
+{
+	int err;
+	struct inode *h_dir;
+
+	h_dir = d_inode(h_dentry);
+	err = au_wh_test(h_dentry, &diropq_name,
+			 au_test_h_perm_sio(h_dir, MAY_EXEC));
+	return err;
+}
+
+/*
+ * returns a negative dentry whose name is unique and temporary.
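+ * the generated name takes the form ".wh..wh.<prefix>.<hex counter>".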
+ */
+struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br,
+			     struct qstr *prefix)
+{
+	struct dentry *dentry;
+	int i;
+	char defname[NAME_MAX - AUFS_MAX_NAMELEN + DNAME_INLINE_LEN + 1],
+		*name, *p;
+	/* strict atomic_t is unnecessary here */
+	static unsigned short cnt;
+	struct qstr qs;
+
+	BUILD_BUG_ON(sizeof(cnt) * 2 > AUFS_WH_TMP_LEN);
+
+	name = defname;
+	qs.len = sizeof(defname) - DNAME_INLINE_LEN + prefix->len - 1;
+	if (unlikely(prefix->len > DNAME_INLINE_LEN)) {
+		dentry = ERR_PTR(-ENAMETOOLONG);
+		if (unlikely(qs.len > NAME_MAX))
+			goto out;
+		dentry = ERR_PTR(-ENOMEM);
+		name = kmalloc(qs.len + 1, GFP_NOFS);
+		if (unlikely(!name))
+			goto out;
+	}
+
+	/* doubly whiteout-ed */
+	memcpy(name, AUFS_WH_PFX AUFS_WH_PFX, AUFS_WH_PFX_LEN * 2);
+	p = name + AUFS_WH_PFX_LEN * 2;
+	memcpy(p, prefix->name, prefix->len);
+	p += prefix->len;
+	*p++ = '.';
+	AuDebugOn(name + qs.len + 1 - p <= AUFS_WH_TMP_LEN);
+
+	qs.name = name;
+	for (i = 0; i < 3; i++) {
+		sprintf(p, "%.*x", AUFS_WH_TMP_LEN, cnt++);
+		dentry = au_sio_lkup_one(&qs, h_parent);
+		if (IS_ERR(dentry) || d_is_negative(dentry))
+			goto out_name;
+		dput(dentry);
+	}
+	/* pr_warn("could not get random name\n"); */
+	dentry = ERR_PTR(-EEXIST);
+	AuDbg("%.*s\n", AuLNPair(&qs));
+	BUG();
+
+out_name:
+	if (name != defname)
+		au_delayed_kfree(name);
+out:
+	AuTraceErrPtr(dentry);
+	return dentry;
+}
+
+/*
+ * rename the @h_dentry on @br to the whiteouted temporary name.
+ */
+int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br)
+{
+	int err;
+	struct path h_path = {
+		.mnt = au_br_mnt(br)
+	};
+	struct inode *h_dir, *delegated;
+	struct dentry *h_parent;
+
+	h_parent = h_dentry->d_parent; /* dir inode is locked */
+	h_dir = d_inode(h_parent);
+	IMustLock(h_dir);
+
+	h_path.dentry = au_whtmp_lkup(h_parent, br, &h_dentry->d_name);
+	err = PTR_ERR(h_path.dentry);
+	if (IS_ERR(h_path.dentry))
+		goto out;
+
+	/* under the same dir, no need to lock_rename() */
+	delegated = NULL;
+	err = vfsub_rename(h_dir, h_dentry, h_dir, &h_path, &delegated);
+	AuTraceErr(err);
+	if (unlikely(err == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal rename\n");
+		iput(delegated);
+	}
+	dput(h_path.dentry);
+
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+/*
+ * functions for removing a whiteout
+ */
+
+static int do_unlink_wh(struct inode *h_dir, struct path *h_path)
+{
+	int err, force;
+	struct inode *delegated;
+
+	/*
+	 * forces superio when the dir has a sticky bit.
+	 * this may be a violation of unix fs semantics.
+	 */
+	force = (h_dir->i_mode & S_ISVTX)
+		&& !uid_eq(current_fsuid(), d_inode(h_path->dentry)->i_uid);
+	delegated = NULL;
+	err = vfsub_unlink(h_dir, h_path, &delegated, force);
+	if (unlikely(err == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal unlink\n");
+		iput(delegated);
+	}
+	return err;
+}
+
+int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path,
+			struct dentry *dentry)
+{
+	int err;
+
+	err = do_unlink_wh(h_dir, h_path);
+	if (!err && dentry)
+		au_set_dbwh(dentry, -1);
+
+	return err;
+}
+
+static int unlink_wh_name(struct dentry *h_parent, struct qstr *wh,
+			  struct au_branch *br)
+{
+	int err;
+	struct path h_path = {
+		.mnt = au_br_mnt(br)
+	};
+
+	err = 0;
+	h_path.dentry = vfsub_lkup_one(wh, h_parent);
+	if (IS_ERR(h_path.dentry))
+		err = PTR_ERR(h_path.dentry);
+	else {
+		if (d_is_reg(h_path.dentry))
+			err = do_unlink_wh(d_inode(h_parent), &h_path);
+		dput(h_path.dentry);
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+/*
+ * initialize/clean whiteout for a branch
+ */
+
+static void au_wh_clean(struct inode *h_dir, struct path *whpath,
+			const int isdir)
+{
+	int err;
+	struct inode *delegated;
+
+	if (d_is_negative(whpath->dentry))
+		return;
+
+	if (isdir)
+		err = vfsub_rmdir(h_dir, whpath);
+	else {
+		delegated = NULL;
+		err = vfsub_unlink(h_dir, whpath, &delegated, /*force*/0);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal unlink\n");
+			iput(delegated);
+		}
+	}
+	if (unlikely(err))
+		pr_warn("failed removing %pd (%d), ignored.\n",
+			whpath->dentry, err);
+}
+
+static int test_linkable(struct dentry *h_root)
+{
+	struct inode *h_dir = d_inode(h_root);
+
+	if (h_dir->i_op->link)
+		return 0;
+
+	pr_err("%pd (%s) doesn't support link(2), use noplink and rw+nolwh\n",
+	       h_root, au_sbtype(h_root->d_sb));
+	return -ENOSYS;
+}
+
+/* todo: should this mkdir be done in /sbin/mount.aufs helper? */
+static int au_whdir(struct inode *h_dir, struct path *path)
+{
+	int err;
+
+	err = -EEXIST;
+	if (d_is_negative(path->dentry)) {
+		int mode = S_IRWXU;
+
+		if (au_test_nfs(path->dentry->d_sb))
+			mode |= S_IXUGO;
+		err = vfsub_mkdir(h_dir, path, mode);
+	} else if (d_is_dir(path->dentry))
+		err = 0;
+	else
+		pr_err("unknown %pd exists\n", path->dentry);
+
+	return err;
+}
+
+struct au_wh_base {
+	const struct qstr *name;
+	struct dentry *dentry;
+};
+
+static void au_wh_init_ro(struct inode *h_dir, struct au_wh_base base[],
+			  struct path *h_path)
+{
+	h_path->dentry = base[AuBrWh_BASE].dentry;
+	au_wh_clean(h_dir, h_path, /*isdir*/0);
+	h_path->dentry = base[AuBrWh_PLINK].dentry;
+	au_wh_clean(h_dir, h_path, /*isdir*/1);
+	h_path->dentry = base[AuBrWh_ORPH].dentry;
+	au_wh_clean(h_dir, h_path, /*isdir*/1);
+}
+
+/*
+ * returns tri-state,
+ * minus: error, caller should print the message
+ * zero: success
+ * plus: error, caller should NOT print the message
+ */
+static int au_wh_init_rw_nolink(struct dentry *h_root, struct au_wbr *wbr,
+				int do_plink, struct au_wh_base base[],
+				struct path *h_path)
+{
+	int err;
+	struct inode *h_dir;
+
+	h_dir = d_inode(h_root);
+	h_path->dentry = base[AuBrWh_BASE].dentry;
+	au_wh_clean(h_dir, h_path, /*isdir*/0);
+	h_path->dentry = base[AuBrWh_PLINK].dentry;
+	if (do_plink) {
+		err = test_linkable(h_root);
+		if (unlikely(err)) {
+			err = 1;
+			goto out;
+		}
+
+		err = au_whdir(h_dir, h_path);
+		if (unlikely(err))
+			goto out;
+		wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry);
+	} else
+		au_wh_clean(h_dir, h_path, /*isdir*/1);
+	h_path->dentry = base[AuBrWh_ORPH].dentry;
+	err = au_whdir(h_dir, h_path);
+	if (unlikely(err))
+		goto out;
+	wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry);
+
+out:
+	return err;
+}
+
+/*
+ * for the moment, aufs supports branch filesystems which do not support
+ * link(2). in testing on FAT, which does not fully support i_op->setattr()
+ * either, copyup failed. in the end, such a filesystem cannot be used as a
+ * writable branch.
+ *
+ * returns tri-state, see above.
+ */
+static int au_wh_init_rw(struct dentry *h_root, struct au_wbr *wbr,
+			 int do_plink, struct au_wh_base base[],
+			 struct path *h_path)
+{
+	int err;
+	struct inode *h_dir;
+
+	WbrWhMustWriteLock(wbr);
+
+	err = test_linkable(h_root);
+	if (unlikely(err)) {
+		err = 1;
+		goto out;
+	}
+
+	/*
+	 * todo: should this create be done in /sbin/mount.aufs helper?
+	 */
+	err = -EEXIST;
+	h_dir = d_inode(h_root);
+	if (d_is_negative(base[AuBrWh_BASE].dentry)) {
+		h_path->dentry = base[AuBrWh_BASE].dentry;
+		err = vfsub_create(h_dir, h_path, WH_MASK, /*want_excl*/true);
+	} else if (d_is_reg(base[AuBrWh_BASE].dentry))
+		err = 0;
+	else
+		pr_err("unknown %pd2 exists\n", base[AuBrWh_BASE].dentry);
+	if (unlikely(err))
+		goto out;
+
+	h_path->dentry = base[AuBrWh_PLINK].dentry;
+	if (do_plink) {
+		err = au_whdir(h_dir, h_path);
+		if (unlikely(err))
+			goto out;
+		wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry);
+	} else
+		au_wh_clean(h_dir, h_path, /*isdir*/1);
+	wbr->wbr_whbase = dget(base[AuBrWh_BASE].dentry);
+
+	h_path->dentry = base[AuBrWh_ORPH].dentry;
+	err = au_whdir(h_dir, h_path);
+	if (unlikely(err))
+		goto out;
+	wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry);
+
+out:
+	return err;
+}
+
+/*
+ * initialize the whiteout base file/dir for @br.
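+ * looks up the three well-known entries (whiteout base, pseudo-link dir and
+ * orphan dir) under the branch root, then creates or removes them according
+ * to the branch permission.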
+ */
+int au_wh_init(struct au_branch *br, struct super_block *sb)
+{
+	int err, i;
+	const unsigned char do_plink
+		= !!au_opt_test(au_mntflags(sb), PLINK);
+	struct inode *h_dir;
+	struct path path = br->br_path;
+	struct dentry *h_root = path.dentry;
+	struct au_wbr *wbr = br->br_wbr;
+	static const struct qstr base_name[] = {
+		[AuBrWh_BASE] = QSTR_INIT(AUFS_BASE_NAME,
+					  sizeof(AUFS_BASE_NAME) - 1),
+		[AuBrWh_PLINK] = QSTR_INIT(AUFS_PLINKDIR_NAME,
+					   sizeof(AUFS_PLINKDIR_NAME) - 1),
+		[AuBrWh_ORPH] = QSTR_INIT(AUFS_ORPHDIR_NAME,
+					  sizeof(AUFS_ORPHDIR_NAME) - 1)
+	};
+	struct au_wh_base base[] = {
+		[AuBrWh_BASE] = {
+			.name	= base_name + AuBrWh_BASE,
+			.dentry	= NULL
+		},
+		[AuBrWh_PLINK] = {
+			.name	= base_name + AuBrWh_PLINK,
+			.dentry	= NULL
+		},
+		[AuBrWh_ORPH] = {
+			.name	= base_name + AuBrWh_ORPH,
+			.dentry	= NULL
+		}
+	};
+
+	if (wbr)
+		WbrWhMustWriteLock(wbr);
+
+	for (i = 0; i < AuBrWh_Last; i++) {
+		/* doubly whiteouted */
+		struct dentry *d;
+
+		d = au_wh_lkup(h_root, (void *)base[i].name, br);
+		err = PTR_ERR(d);
+		if (IS_ERR(d))
+			goto out;
+
+		base[i].dentry = d;
+		AuDebugOn(wbr
+			  && wbr->wbr_wh[i]
+			  && wbr->wbr_wh[i] != base[i].dentry);
+	}
+
+	if (wbr)
+		for (i = 0; i < AuBrWh_Last; i++) {
+			dput(wbr->wbr_wh[i]);
+			wbr->wbr_wh[i] = NULL;
+		}
+
+	err = 0;
+	if (!au_br_writable(br->br_perm)) {
+		h_dir = d_inode(h_root);
+		au_wh_init_ro(h_dir, base, &path);
+	} else if (!au_br_wh_linkable(br->br_perm)) {
+		err = au_wh_init_rw_nolink(h_root, wbr, do_plink, base, &path);
+		if (err > 0)
+			goto out;
+		else if (err)
+			goto out_err;
+	} else {
+		err = au_wh_init_rw(h_root, wbr, do_plink, base, &path);
+		if (err > 0)
+			goto out;
+		else if (err)
+			goto out_err;
+	}
+	goto out; /* success */
+
+out_err:
+	pr_err("an error(%d) on the writable branch %pd(%s)\n",
+	       err, h_root, au_sbtype(h_root->d_sb));
+out:
+	for (i = 0; i < AuBrWh_Last; i++)
+		dput(base[i].dentry);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+/*
+ * whiteouts are usually hard-linked to a per-branch whiteout base file.
+ * when the base's link count reaches its ceiling, we create a new whiteout
+ * base asynchronously.
+ */
+
+struct reinit_br_wh {
+	struct super_block *sb;
+	struct au_branch *br;
+};
+
+static void reinit_br_wh(void *arg)
+{
+	int err;
+	aufs_bindex_t bindex;
+	struct path h_path;
+	struct reinit_br_wh *a = arg;
+	struct au_wbr *wbr;
+	struct inode *dir, *delegated;
+	struct dentry *h_root;
+	struct au_hinode *hdir;
+
+	err = 0;
+	wbr = a->br->br_wbr;
+	/* big aufs lock */
+	si_noflush_write_lock(a->sb);
+	if (!au_br_writable(a->br->br_perm))
+		goto out;
+	bindex = au_br_index(a->sb, a->br->br_id);
+	if (unlikely(bindex < 0))
+		goto out;
+
+	di_read_lock_parent(a->sb->s_root, AuLock_IR);
+	dir = d_inode(a->sb->s_root);
+	hdir = au_hi(dir, bindex);
+	h_root = au_h_dptr(a->sb->s_root, bindex);
+	AuDebugOn(h_root != au_br_dentry(a->br));
+
+	au_hn_inode_lock_nested(hdir, AuLsc_I_PARENT);
+	wbr_wh_write_lock(wbr);
+	err = au_h_verify(wbr->wbr_whbase, au_opt_udba(a->sb), hdir->hi_inode,
+			  h_root, a->br);
+	if (!err) {
+		h_path.dentry = wbr->wbr_whbase;
+		h_path.mnt = au_br_mnt(a->br);
+		delegated = NULL;
+		err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated,
+				   /*force*/0);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal unlink\n");
+			iput(delegated);
+		}
+	} else {
+		pr_warn("%pd is moved, ignored\n", wbr->wbr_whbase);
+		err = 0;
+	}
+	dput(wbr->wbr_whbase);
+	wbr->wbr_whbase = NULL;
+	if (!err)
+		err = au_wh_init(a->br, a->sb);
+	wbr_wh_write_unlock(wbr);
+	au_hn_inode_unlock(hdir);
+	di_read_unlock(a->sb->s_root, AuLock_IR);
+	if (!err)
+		au_fhsm_wrote(a->sb, bindex, /*force*/0);
+
+out:
+	if (wbr)
+		atomic_dec(&wbr->wbr_wh_running);
+	au_br_put(a->br);
+	si_write_unlock(a->sb);
+	au_nwt_done(&au_sbi(a->sb)->si_nowait);
+	au_delayed_kfree(arg);
+	if (unlikely(err))
+		AuIOErr("err %d\n", err);
+}
+
+static void kick_reinit_br_wh(struct super_block *sb, struct au_branch *br)
+{
+	int do_dec, wkq_err;
+	struct reinit_br_wh *arg;
+
+	do_dec = 1;
+	if (atomic_inc_return(&br->br_wbr->wbr_wh_running) != 1)
+		goto out;
+
+	/* ignore ENOMEM */
+	arg = kmalloc(sizeof(*arg), GFP_NOFS);
+	if (arg) {
+		/*
+		 * dec(wh_running), kfree(arg) and dec(br_count)
+		 * in reinit function
+		 */
+		arg->sb = sb;
+		arg->br = br;
+		au_br_get(br);
+		wkq_err = au_wkq_nowait(reinit_br_wh, arg, sb, /*flags*/0);
+		if (unlikely(wkq_err)) {
+			atomic_dec(&br->br_wbr->wbr_wh_running);
+			au_br_put(br);
+			au_delayed_kfree(arg);
+		}
+		do_dec = 0;
+	}
+
+out:
+	if (do_dec)
+		atomic_dec(&br->br_wbr->wbr_wh_running);
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * create the whiteout @wh.
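+ * prefer hard-linking to the branch's whiteout base; when the base's link
+ * count is exhausted (-EMLINK), fall back to creating a plain whiteout and
+ * kick an asynchronous re-initialization of the base.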
+ */
+static int link_or_create_wh(struct super_block *sb, aufs_bindex_t bindex,
+			     struct dentry *wh)
+{
+	int err;
+	struct path h_path = {
+		.dentry = wh
+	};
+	struct au_branch *br;
+	struct au_wbr *wbr;
+	struct dentry *h_parent;
+	struct inode *h_dir, *delegated;
+
+	h_parent = wh->d_parent; /* dir inode is locked */
+	h_dir = d_inode(h_parent);
+	IMustLock(h_dir);
+
+	br = au_sbr(sb, bindex);
+	h_path.mnt = au_br_mnt(br);
+	wbr = br->br_wbr;
+	wbr_wh_read_lock(wbr);
+	if (wbr->wbr_whbase) {
+		delegated = NULL;
+		err = vfsub_link(wbr->wbr_whbase, h_dir, &h_path, &delegated);
+		if (unlikely(err == -EWOULDBLOCK)) {
+			pr_warn("cannot retry for NFSv4 delegation"
+				" for an internal link\n");
+			iput(delegated);
+		}
+		if (!err || err != -EMLINK)
+			goto out;
+
+		/* link count full. re-initialize br_whbase. */
+		kick_reinit_br_wh(sb, br);
+	}
+
+	/* return this error in this context */
+	err = vfsub_create(h_dir, &h_path, WH_MASK, /*want_excl*/true);
+	if (!err)
+		au_fhsm_wrote(sb, bindex, /*force*/0);
+
+out:
+	wbr_wh_read_unlock(wbr);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * create or remove the diropq.
+ */
+static struct dentry *do_diropq(struct dentry *dentry, aufs_bindex_t bindex,
+				unsigned int flags)
+{
+	struct dentry *opq_dentry, *h_dentry;
+	struct super_block *sb;
+	struct au_branch *br;
+	int err;
+
+	sb = dentry->d_sb;
+	br = au_sbr(sb, bindex);
+	h_dentry = au_h_dptr(dentry, bindex);
+	opq_dentry = vfsub_lkup_one(&diropq_name, h_dentry);
+	if (IS_ERR(opq_dentry))
+		goto out;
+
+	if (au_ftest_diropq(flags, CREATE)) {
+		err = link_or_create_wh(sb, bindex, opq_dentry);
+		if (!err) {
+			au_set_dbdiropq(dentry, bindex);
+			goto out; /* success */
+		}
+	} else {
+		struct path tmp = {
+			.dentry = opq_dentry,
+			.mnt	= au_br_mnt(br)
+		};
+		err = do_unlink_wh(au_h_iptr(d_inode(dentry), bindex), &tmp);
+		if (!err)
+			au_set_dbdiropq(dentry, -1);
+	}
+	dput(opq_dentry);
+	opq_dentry = ERR_PTR(err);
+
+out:
+	return opq_dentry;
+}
+
+struct do_diropq_args {
+	struct dentry **errp;
+	struct dentry *dentry;
+	aufs_bindex_t bindex;
+	unsigned int flags;
+};
+
+static void call_do_diropq(void *args)
+{
+	struct do_diropq_args *a = args;
+	*a->errp = do_diropq(a->dentry, a->bindex, a->flags);
+}
+
+struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex,
+			     unsigned int flags)
+{
+	struct dentry *diropq, *h_dentry;
+
+	h_dentry = au_h_dptr(dentry, bindex);
+	if (!au_test_h_perm_sio(d_inode(h_dentry), MAY_EXEC | MAY_WRITE))
+		diropq = do_diropq(dentry, bindex, flags);
+	else {
+		int wkq_err;
+		struct do_diropq_args args = {
+			.errp		= &diropq,
+			.dentry		= dentry,
+			.bindex		= bindex,
+			.flags		= flags
+		};
+
+		wkq_err = au_wkq_wait(call_do_diropq, &args);
+		if (unlikely(wkq_err))
+			diropq = ERR_PTR(wkq_err);
+	}
+
+	return diropq;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * lookup whiteout dentry.
+ * @h_parent: lower parent dentry which must exist and be locked
+ * @base_name: name of dentry which will be whiteouted
+ * returns dentry for whiteout.
+ */
+struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name,
+			  struct au_branch *br)
+{
+	int err;
+	struct qstr wh_name;
+	struct dentry *wh_dentry;
+
+	err = au_wh_name_alloc(&wh_name, base_name);
+	wh_dentry = ERR_PTR(err);
+	if (!err) {
+		wh_dentry = vfsub_lkup_one(&wh_name, h_parent);
+		au_delayed_kfree(wh_name.name);
+	}
+	return wh_dentry;
+}
+
+/*
+ * link/create a whiteout for @dentry on @bindex.
+ */
+struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex,
+			    struct dentry *h_parent)
+{
+	struct dentry *wh_dentry;
+	struct super_block *sb;
+	int err;
+
+	sb = dentry->d_sb;
+	wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, au_sbr(sb, bindex));
+	if (!IS_ERR(wh_dentry) && d_is_negative(wh_dentry)) {
+		err = link_or_create_wh(sb, bindex, wh_dentry);
+		if (!err) {
+			au_set_dbwh(dentry, bindex);
+			au_fhsm_wrote(sb, bindex, /*force*/0);
+		} else {
+			dput(wh_dentry);
+			wh_dentry = ERR_PTR(err);
+		}
+	}
+
+	return wh_dentry;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* Delete all whiteouts in this directory on branch bindex. */
+static int del_wh_children(struct dentry *h_dentry, struct au_nhash *whlist,
+			   aufs_bindex_t bindex, struct au_branch *br)
+{
+	int err;
+	unsigned long ul, n;
+	struct qstr wh_name;
+	char *p;
+	struct hlist_head *head;
+	struct au_vdir_wh *pos;
+	struct au_vdir_destr *str;
+
+	err = -ENOMEM;
+	p = (void *)__get_free_page(GFP_NOFS);
+	wh_name.name = p;
+	if (unlikely(!wh_name.name))
+		goto out;
+
+	err = 0;
+	memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
+	p += AUFS_WH_PFX_LEN;
+	n = whlist->nh_num;
+	head = whlist->nh_head;
+	for (ul = 0; !err && ul < n; ul++, head++) {
+		hlist_for_each_entry(pos, head, wh_hash) {
+			if (pos->wh_bindex != bindex)
+				continue;
+
+			str = &pos->wh_str;
+			if (str->len + AUFS_WH_PFX_LEN <= PATH_MAX) {
+				memcpy(p, str->name, str->len);
+				wh_name.len = AUFS_WH_PFX_LEN + str->len;
+				err = unlink_wh_name(h_dentry, &wh_name, br);
+				if (!err)
+					continue;
+				break;
+			}
+			AuIOErr("whiteout name too long %.*s\n",
+				str->len, str->name);
+			err = -EIO;
+			break;
+		}
+	}
+	au_delayed_free_page((unsigned long)wh_name.name);
+
+out:
+	return err;
+}
+
+struct del_wh_children_args {
+	int *errp;
+	struct dentry *h_dentry;
+	struct au_nhash *whlist;
+	aufs_bindex_t bindex;
+	struct au_branch *br;
+};
+
+static void call_del_wh_children(void *args)
+{
+	struct del_wh_children_args *a = args;
+	*a->errp = del_wh_children(a->h_dentry, a->whlist, a->bindex, a->br);
+}
+
+/* ---------------------------------------------------------------------- */
+
+struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp)
+{
+	struct au_whtmp_rmdir *whtmp;
+	int err;
+	unsigned int rdhash;
+
+	SiMustAnyLock(sb);
+
+	whtmp = kzalloc(sizeof(*whtmp), gfp);
+	if (unlikely(!whtmp)) {
+		whtmp = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	/* no estimation for dir size */
+	rdhash = au_sbi(sb)->si_rdhash;
+	if (!rdhash)
+		rdhash = AUFS_RDHASH_DEF;
+	err = au_nhash_alloc(&whtmp->whlist, rdhash, gfp);
+	if (unlikely(err)) {
+		au_delayed_kfree(whtmp);
+		whtmp = ERR_PTR(err);
+	}
+
+out:
+	return whtmp;
+}
+
+void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp)
+{
+	if (whtmp->br)
+		au_br_put(whtmp->br);
+	dput(whtmp->wh_dentry);
+	iput(whtmp->dir);
+	au_nhash_wh_free(&whtmp->whlist);
+	au_delayed_kfree(whtmp);
+}
+
+/*
+ * rmdir the whiteout-ed temporary dir @wh_dentry.
+ * @whlist: whiteouted children.
+ */
+int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex,
+		   struct dentry *wh_dentry, struct au_nhash *whlist)
+{
+	int err;
+	unsigned int h_nlink;
+	struct path h_tmp;
+	struct inode *wh_inode, *h_dir;
+	struct au_branch *br;
+
+	h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */
+	IMustLock(h_dir);
+
+	br = au_sbr(dir->i_sb, bindex);
+	wh_inode = d_inode(wh_dentry);
+	inode_lock_nested(wh_inode, AuLsc_I_CHILD);
+
+	/*
+	 * someone else might change some whiteouts while we were sleeping.
+	 * it means this whlist may contain an obsolete entry.
+	 */
+	if (!au_test_h_perm_sio(wh_inode, MAY_EXEC | MAY_WRITE))
+		err = del_wh_children(wh_dentry, whlist, bindex, br);
+	else {
+		int wkq_err;
+		struct del_wh_children_args args = {
+			.errp		= &err,
+			.h_dentry	= wh_dentry,
+			.whlist		= whlist,
+			.bindex		= bindex,
+			.br		= br
+		};
+
+		wkq_err = au_wkq_wait(call_del_wh_children, &args);
+		if (unlikely(wkq_err))
+			err = wkq_err;
+	}
+	inode_unlock(wh_inode);
+
+	if (!err) {
+		h_tmp.dentry = wh_dentry;
+		h_tmp.mnt = au_br_mnt(br);
+		h_nlink = h_dir->i_nlink;
+		err = vfsub_rmdir(h_dir, &h_tmp);
+		/* some filesystems don't change the parent nlink in some cases */
+		h_nlink -= h_dir->i_nlink;
+	}
+
+	if (!err) {
+		if (au_ibtop(dir) == bindex) {
+			/* todo: dir->i_mutex is necessary */
+			au_cpup_attr_timesizes(dir);
+			if (h_nlink)
+				vfsub_drop_nlink(dir);
+		}
+		return 0; /* success */
+	}
+
+	pr_warn("failed removing %pd(%d), ignored\n", wh_dentry, err);
+	return err;
+}
+
+static void call_rmdir_whtmp(void *args)
+{
+	int err;
+	aufs_bindex_t bindex;
+	struct au_whtmp_rmdir *a = args;
+	struct super_block *sb;
+	struct dentry *h_parent;
+	struct inode *h_dir;
+	struct au_hinode *hdir;
+
+	/* rmdir by nfsd may cause deadlock with this i_mutex */
+	/* inode_lock(a->dir); */
+	err = -EROFS;
+	sb = a->dir->i_sb;
+	si_read_lock(sb, !AuLock_FLUSH);
+	if (!au_br_writable(a->br->br_perm))
+		goto out;
+	bindex = au_br_index(sb, a->br->br_id);
+	if (unlikely(bindex < 0))
+		goto out;
+
+	err = -EIO;
+	ii_write_lock_parent(a->dir);
+	h_parent = dget_parent(a->wh_dentry);
+	h_dir = d_inode(h_parent);
+	hdir = au_hi(a->dir, bindex);
+	err = vfsub_mnt_want_write(au_br_mnt(a->br));
+	if (unlikely(err))
+		goto out_mnt;
+	au_hn_inode_lock_nested(hdir, AuLsc_I_PARENT);
+	err = au_h_verify(a->wh_dentry, au_opt_udba(sb), h_dir, h_parent,
+			  a->br);
+	if (!err)
+		err = au_whtmp_rmdir(a->dir, bindex, a->wh_dentry, &a->whlist);
+	au_hn_inode_unlock(hdir);
+	vfsub_mnt_drop_write(au_br_mnt(a->br));
+
+out_mnt:
+	dput(h_parent);
+	ii_write_unlock(a->dir);
+out:
+	/* inode_unlock(a->dir); */
+	au_whtmp_rmdir_free(a);
+	si_read_unlock(sb);
+	au_nwt_done(&au_sbi(sb)->si_nowait);
+	if (unlikely(err))
+		AuIOErr("err %d\n", err);
+}
+
+void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex,
+			 struct dentry *wh_dentry, struct au_whtmp_rmdir *args)
+{
+	int wkq_err;
+	struct super_block *sb;
+
+	IMustLock(dir);
+
+	/* all post-processing is done in call_rmdir_whtmp(). */
+	sb = dir->i_sb;
+	args->dir = au_igrab(dir);
+	args->br = au_sbr(sb, bindex);
+	au_br_get(args->br);
+	args->wh_dentry = dget(wh_dentry);
+	wkq_err = au_wkq_nowait(call_rmdir_whtmp, args, sb, /*flags*/0);
+	if (unlikely(wkq_err)) {
+		pr_warn("rmdir error %pd (%d), ignored\n", wh_dentry, wkq_err);
+		au_whtmp_rmdir_free(args);
+	}
+}
diff --git b/fs/aufs/whout.h b/fs/aufs/whout.h
new file mode 100644
index 0000000..4077dd1
--- /dev/null
+++ b/fs/aufs/whout.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * whiteout for logical deletion and opaque directory
+ */
+
+#ifndef __AUFS_WHOUT_H__
+#define __AUFS_WHOUT_H__
+
+#ifdef __KERNEL__
+
+#include "dir.h"
+
+/* whout.c */
+int au_wh_name_alloc(struct qstr *wh, const struct qstr *name);
+int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio);
+int au_diropq_test(struct dentry *h_dentry);
+struct au_branch;
+struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br,
+			     struct qstr *prefix);
+int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br);
+int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path,
+			struct dentry *dentry);
+int au_wh_init(struct au_branch *br, struct super_block *sb);
+
+/* diropq flags */
+#define AuDiropq_CREATE	1
+#define au_ftest_diropq(flags, name)	((flags) & AuDiropq_##name)
+#define au_fset_diropq(flags, name) \
+	do { (flags) |= AuDiropq_##name; } while (0)
+#define au_fclr_diropq(flags, name) \
+	do { (flags) &= ~AuDiropq_##name; } while (0)
+
+struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex,
+			     unsigned int flags);
+struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name,
+			  struct au_branch *br);
+struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex,
+			    struct dentry *h_parent);
+
+/* real rmdir for the whiteout-ed dir */
+struct au_whtmp_rmdir {
+	struct inode *dir;
+	struct au_branch *br;
+	struct dentry *wh_dentry;
+	struct au_nhash whlist;
+};
+
+struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp);
+void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp);
+int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex,
+		   struct dentry *wh_dentry, struct au_nhash *whlist);
+void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex,
+			 struct dentry *wh_dentry, struct au_whtmp_rmdir *args);
+
+/* ---------------------------------------------------------------------- */
+
+static inline struct dentry *au_diropq_create(struct dentry *dentry,
+					      aufs_bindex_t bindex)
+{
+	return au_diropq_sio(dentry, bindex, AuDiropq_CREATE);
+}
+
+static inline int au_diropq_remove(struct dentry *dentry, aufs_bindex_t bindex)
+{
+	return PTR_ERR(au_diropq_sio(dentry, bindex, !AuDiropq_CREATE));
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_WHOUT_H__ */
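The AuDiropq_* helpers above follow the token-pasting flag-macro pattern (au_ftest_*/au_fset_*/au_fclr_*) that aufs uses throughout. Purely as an illustration, the same pattern can be exercised as a tiny standalone C program; the ExFlag_/ex_ names below are hypothetical stand-ins, not aufs identifiers.

/* illustrative sketch only; ExFlag_/ex_ names are hypothetical stand-ins */
#include <stdio.h>

#define ExFlag_CREATE	1
#define ex_ftest(flags, name)	((flags) & ExFlag_##name)
#define ex_fset(flags, name) \
	do { (flags) |= ExFlag_##name; } while (0)
#define ex_fclr(flags, name) \
	do { (flags) &= ~ExFlag_##name; } while (0)

int main(void)
{
	unsigned int flags = 0;

	ex_fset(flags, CREATE);
	printf("CREATE set?     %d\n", !!ex_ftest(flags, CREATE)); /* prints 1 */
	ex_fclr(flags, CREATE);
	printf("CREATE cleared? %d\n", !ex_ftest(flags, CREATE));  /* prints 1 */
	return 0;
}

The same shape reappears below for the AuWkq_* flags in wkq.h.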
diff --git b/fs/aufs/wkq.c b/fs/aufs/wkq.c
new file mode 100644
index 0000000..216d08f
--- /dev/null
+++ b/fs/aufs/wkq.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * workqueue for asynchronous/super-io operations
+ * todo: try new credential scheme
+ */
+
+#include <linux/module.h>
+#include "aufs.h"
+
+/* internal workqueue named AUFS_WKQ_NAME */
+
+static struct workqueue_struct *au_wkq;
+
+struct au_wkinfo {
+	struct work_struct wk;
+	struct kobject *kobj;
+
+	unsigned int flags; /* see wkq.h */
+
+	au_wkq_func_t func;
+	void *args;
+
+	struct completion *comp;
+};
+
+/* ---------------------------------------------------------------------- */
+
+static void wkq_func(struct work_struct *wk)
+{
+	struct au_wkinfo *wkinfo = container_of(wk, struct au_wkinfo, wk);
+
+	AuDebugOn(!uid_eq(current_fsuid(), GLOBAL_ROOT_UID));
+	AuDebugOn(rlimit(RLIMIT_FSIZE) != RLIM_INFINITY);
+
+	wkinfo->func(wkinfo->args);
+	if (au_ftest_wkq(wkinfo->flags, WAIT))
+		complete(wkinfo->comp);
+	else {
+		kobject_put(wkinfo->kobj);
+		module_put(THIS_MODULE); /* todo: ?? */
+		au_delayed_kfree(wkinfo);
+	}
+}
+
+/*
+ * Since struct completion is large, try allocating it dynamically.
+ */
+#if 1 /* defined(CONFIG_4KSTACKS) || defined(AuTest4KSTACKS) */
+#define AuWkqCompDeclare(name)	struct completion *comp = NULL
+
+static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp)
+{
+	*comp = kmalloc(sizeof(**comp), GFP_NOFS);
+	if (*comp) {
+		init_completion(*comp);
+		wkinfo->comp = *comp;
+		return 0;
+	}
+	return -ENOMEM;
+}
+
+static void au_wkq_comp_free(struct completion *comp)
+{
+	au_delayed_kfree(comp);
+}
+
+#else
+
+/* no braces */
+#define AuWkqCompDeclare(name) \
+	DECLARE_COMPLETION_ONSTACK(_ ## name); \
+	struct completion *comp = &_ ## name
+
+static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp)
+{
+	wkinfo->comp = *comp;
+	return 0;
+}
+
+static void au_wkq_comp_free(struct completion *comp __maybe_unused)
+{
+	/* empty */
+}
+#endif /* 4KSTACKS */
+
+static void au_wkq_run(struct au_wkinfo *wkinfo)
+{
+	if (au_ftest_wkq(wkinfo->flags, NEST)) {
+		if (au_wkq_test()) {
+			AuWarn1("wkq from wkq, unless silly-rename on NFS,"
+				" due to a dead dir by UDBA?\n");
+			AuDebugOn(au_ftest_wkq(wkinfo->flags, WAIT));
+		}
+	} else
+		au_dbg_verify_kthread();
+
+	if (au_ftest_wkq(wkinfo->flags, WAIT)) {
+		INIT_WORK_ONSTACK(&wkinfo->wk, wkq_func);
+		queue_work(au_wkq, &wkinfo->wk);
+	} else {
+		INIT_WORK(&wkinfo->wk, wkq_func);
+		schedule_work(&wkinfo->wk);
+	}
+}
+
+/*
+ * Be careful. It is easy to make deadlock happen.
+ * processA: lock, wkq and wait
+ * processB: wkq and wait, lock in wkq
+ * --> deadlock
+ */
+int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args)
+{
+	int err;
+	AuWkqCompDeclare(comp);
+	struct au_wkinfo wkinfo = {
+		.flags	= flags,
+		.func	= func,
+		.args	= args
+	};
+
+	err = au_wkq_comp_alloc(&wkinfo, &comp);
+	if (!err) {
+		au_wkq_run(&wkinfo);
+		/* no timeout, no interrupt */
+		wait_for_completion(wkinfo.comp);
+		au_wkq_comp_free(comp);
+		destroy_work_on_stack(&wkinfo.wk);
+	}
+
+	return err;
+
+}
+
+/*
+ * Note: dget/dput() for aufs dentries in @func is not supported. It would be
+ * a problem during a concurrent unmount.
+ */
+int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb,
+		  unsigned int flags)
+{
+	int err;
+	struct au_wkinfo *wkinfo;
+
+	atomic_inc(&au_sbi(sb)->si_nowait.nw_len);
+
+	/*
+	 * wkq_func() must free this wkinfo.
+	 * it highly depends upon the workqueue implementation.
+	 */
+	err = 0;
+	wkinfo = kmalloc(sizeof(*wkinfo), GFP_NOFS);
+	if (wkinfo) {
+		wkinfo->kobj = &au_sbi(sb)->si_kobj;
+		wkinfo->flags = flags & ~AuWkq_WAIT;
+		wkinfo->func = func;
+		wkinfo->args = args;
+		wkinfo->comp = NULL;
+		kobject_get(wkinfo->kobj);
+		__module_get(THIS_MODULE); /* todo: ?? */
+
+		au_wkq_run(wkinfo);
+	} else {
+		err = -ENOMEM;
+		au_nwt_done(&au_sbi(sb)->si_nowait);
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+void au_nwt_init(struct au_nowait_tasks *nwt)
+{
+	atomic_set(&nwt->nw_len, 0);
+	/* smp_mb(); */ /* atomic_set */
+	init_waitqueue_head(&nwt->nw_wq);
+}
+
+void au_wkq_fin(void)
+{
+	destroy_workqueue(au_wkq);
+}
+
+int __init au_wkq_init(void)
+{
+	int err;
+
+	err = 0;
+	au_wkq = alloc_workqueue(AUFS_WKQ_NAME, 0, WQ_DFL_ACTIVE);
+	if (IS_ERR(au_wkq))
+		err = PTR_ERR(au_wkq);
+	else if (!au_wkq)
+		err = -ENOMEM;
+
+	return err;
+}
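au_wkq_do_wait() above is the classic "hand the function to a worker and block on a completion" helper, while au_wkq_nowait() is its fire-and-forget counterpart. The following is a minimal userspace analogue of the waiting path using POSIX threads, included only to illustrate the pattern; it is not aufs code and every ex_* name is made up.

/* illustrative userspace analogue of the submit-and-wait pattern */
#include <pthread.h>
#include <stdio.h>

struct ex_completion {
	pthread_mutex_t mtx;
	pthread_cond_t cond;
	int done;
};

struct ex_wkinfo {
	void (*func)(void *args);
	void *args;
	struct ex_completion comp;
};

static void *ex_worker(void *p)
{
	struct ex_wkinfo *wk = p;

	wk->func(wk->args);			/* run the requested function */
	pthread_mutex_lock(&wk->comp.mtx);
	wk->comp.done = 1;			/* complete(comp) */
	pthread_cond_signal(&wk->comp.cond);
	pthread_mutex_unlock(&wk->comp.mtx);
	return NULL;
}

static int ex_wkq_wait(void (*func)(void *), void *args)
{
	pthread_t th;
	struct ex_wkinfo wk = { .func = func, .args = args };

	pthread_mutex_init(&wk.comp.mtx, NULL);
	pthread_cond_init(&wk.comp.cond, NULL);
	wk.comp.done = 0;
	if (pthread_create(&th, NULL, ex_worker, &wk))
		return -1;
	/* wait_for_completion(): no timeout, no interrupt */
	pthread_mutex_lock(&wk.comp.mtx);
	while (!wk.comp.done)
		pthread_cond_wait(&wk.comp.cond, &wk.comp.mtx);
	pthread_mutex_unlock(&wk.comp.mtx);
	return pthread_join(th, NULL);
}

static void ex_func(void *args)
{
	printf("worker ran with \"%s\"\n", (const char *)args);
}

int main(void)
{
	return ex_wkq_wait(ex_func, "hello");
}

The deadlock warning above applies equally here: if the submitted function tries to take a lock the submitter already holds while waiting, neither side can make progress.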
diff --git b/fs/aufs/wkq.h b/fs/aufs/wkq.h
new file mode 100644
index 0000000..6d63f74
--- /dev/null
+++ b/fs/aufs/wkq.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * workqueue for asynchronous/super-io operations
+ * todo: try new credentials management scheme
+ */
+
+#ifndef __AUFS_WKQ_H__
+#define __AUFS_WKQ_H__
+
+#ifdef __KERNEL__
+
+#include <linux/percpu_counter.h>
+
+struct super_block;
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * in the next operation, wait for the 'nowait' tasks in the system-wide workqueue
+ */
+struct au_nowait_tasks {
+	atomic_t		nw_len;
+	wait_queue_head_t	nw_wq;
+};
+
+/* ---------------------------------------------------------------------- */
+
+typedef void (*au_wkq_func_t)(void *args);
+
+/* wkq flags */
+#define AuWkq_WAIT	1
+#define AuWkq_NEST	(1 << 1)
+#define au_ftest_wkq(flags, name)	((flags) & AuWkq_##name)
+#define au_fset_wkq(flags, name) \
+	do { (flags) |= AuWkq_##name; } while (0)
+#define au_fclr_wkq(flags, name) \
+	do { (flags) &= ~AuWkq_##name; } while (0)
+
+#ifndef CONFIG_AUFS_HNOTIFY
+#undef AuWkq_NEST
+#define AuWkq_NEST	0
+#endif
+
+/* wkq.c */
+int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args);
+int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb,
+		  unsigned int flags);
+void au_nwt_init(struct au_nowait_tasks *nwt);
+int __init au_wkq_init(void);
+void au_wkq_fin(void);
+
+/* ---------------------------------------------------------------------- */
+
+static inline int au_wkq_test(void)
+{
+	return current->flags & PF_WQ_WORKER;
+}
+
+static inline int au_wkq_wait(au_wkq_func_t func, void *args)
+{
+	return au_wkq_do_wait(AuWkq_WAIT, func, args);
+}
+
+static inline void au_nwt_done(struct au_nowait_tasks *nwt)
+{
+	if (atomic_dec_and_test(&nwt->nw_len))
+		wake_up_all(&nwt->nw_wq);
+}
+
+static inline int au_nwt_flush(struct au_nowait_tasks *nwt)
+{
+	wait_event(nwt->nw_wq, !atomic_read(&nwt->nw_len));
+	return 0;
+}
+
+#endif /* __KERNEL__ */
+#endif /* __AUFS_WKQ_H__ */
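au_nwt_done()/au_nwt_flush() above implement a simple drain: every 'nowait' task bumps nw_len when queued, decrements it when it finishes, and a flusher sleeps until the counter reaches zero. A minimal userspace analogue with POSIX threads, for illustration only; all ex_* names are hypothetical.

/* illustrative analogue of the nowait-task drain in au_nwt_done/au_nwt_flush */
#include <pthread.h>
#include <stdio.h>

struct ex_nowait_tasks {
	int nw_len;			/* atomic_t nw_len in the kernel */
	pthread_mutex_t mtx;
	pthread_cond_t wq;		/* wait_queue_head_t nw_wq */
};

static struct ex_nowait_tasks nwt = {
	.mtx = PTHREAD_MUTEX_INITIALIZER,
	.wq = PTHREAD_COND_INITIALIZER
};

static void ex_nwt_start(struct ex_nowait_tasks *p)
{
	pthread_mutex_lock(&p->mtx);
	p->nw_len++;			/* atomic_inc() in au_wkq_nowait() */
	pthread_mutex_unlock(&p->mtx);
}

static void ex_nwt_done(struct ex_nowait_tasks *p)
{
	pthread_mutex_lock(&p->mtx);
	if (!--p->nw_len)		/* atomic_dec_and_test() */
		pthread_cond_broadcast(&p->wq);	/* wake_up_all() */
	pthread_mutex_unlock(&p->mtx);
}

static void ex_nwt_flush(struct ex_nowait_tasks *p)
{
	pthread_mutex_lock(&p->mtx);
	while (p->nw_len)		/* wait_event(..., !atomic_read(...)) */
		pthread_cond_wait(&p->wq, &p->mtx);
	pthread_mutex_unlock(&p->mtx);
}

static void *ex_task(void *arg)
{
	printf("nowait task %ld finished\n", (long)arg);
	ex_nwt_done(&nwt);
	return NULL;
}

int main(void)
{
	pthread_t th[3];
	long i;

	for (i = 0; i < 3; i++) {
		ex_nwt_start(&nwt);
		pthread_create(&th[i], NULL, ex_task, (void *)i);
	}
	ex_nwt_flush(&nwt);	/* returns only after every task called ex_nwt_done() */
	for (i = 0; i < 3; i++)
		pthread_join(th[i], NULL);
	return 0;
}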
diff --git b/fs/aufs/xattr.c b/fs/aufs/xattr.c
new file mode 100644
index 0000000..2a148c9
--- /dev/null
+++ b/fs/aufs/xattr.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright (C) 2014-2016 Junjiro R. Okajima
+ */
+
+/*
+ * handling xattr functions
+ */
+
+#include <linux/xattr.h>
+#include "aufs.h"
+
+static int au_xattr_ignore(int err, char *name, unsigned int ignore_flags)
+{
+	if (!ignore_flags)
+		goto out;
+	switch (err) {
+	case -ENOMEM:
+	case -EDQUOT:
+		goto out;
+	}
+
+	if ((ignore_flags & AuBrAttr_ICEX) == AuBrAttr_ICEX) {
+		err = 0;
+		goto out;
+	}
+
+#define cmp(brattr, prefix) do {					\
+		if (!strncmp(name, XATTR_##prefix##_PREFIX,		\
+			     XATTR_##prefix##_PREFIX_LEN)) {		\
+			if (ignore_flags & AuBrAttr_ICEX_##brattr)	\
+				err = 0;				\
+			goto out;					\
+		}							\
+	} while (0)
+
+	cmp(SEC, SECURITY);
+	cmp(SYS, SYSTEM);
+	cmp(TR, TRUSTED);
+	cmp(USR, USER);
+#undef cmp
+
+	if (ignore_flags & AuBrAttr_ICEX_OTH)
+		err = 0;
+
+out:
+	return err;
+}
+
+static const int au_xattr_out_of_list = AuBrAttr_ICEX_OTH << 1;
+
+static int au_do_cpup_xattr(struct dentry *h_dst, struct dentry *h_src,
+			    char *name, char **buf, unsigned int ignore_flags,
+			    unsigned int verbose)
+{
+	int err;
+	ssize_t ssz;
+	struct inode *h_idst;
+
+	ssz = vfs_getxattr_alloc(h_src, name, buf, 0, GFP_NOFS);
+	err = ssz;
+	if (unlikely(err <= 0)) {
+		if (err == -ENODATA
+		    || (err == -EOPNOTSUPP
+			&& ((ignore_flags & au_xattr_out_of_list)
+			    || (au_test_nfs_noacl(d_inode(h_src))
+				&& (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS)
+				    || !strcmp(name,
+					       XATTR_NAME_POSIX_ACL_DEFAULT))))
+			    ))
+			err = 0;
+		if (err && (verbose || au_debug_test()))
+			pr_err("%s, err %d\n", name, err);
+		goto out;
+	}
+
+	/* unlock it temporarily */
+	h_idst = d_inode(h_dst);
+	inode_unlock(h_idst);
+	err = vfsub_setxattr(h_dst, name, *buf, ssz, /*flags*/0);
+	inode_lock_nested(h_idst, AuLsc_I_CHILD2);
+	if (unlikely(err)) {
+		if (verbose || au_debug_test())
+			pr_err("%s, err %d\n", name, err);
+		err = au_xattr_ignore(err, name, ignore_flags);
+	}
+
+out:
+	return err;
+}
+
+int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags,
+		  unsigned int verbose)
+{
+	int err, unlocked, acl_access, acl_default;
+	ssize_t ssz;
+	struct inode *h_isrc, *h_idst;
+	char *value, *p, *o, *e;
+
+	/* try to keep the source inode from being updated while we reference it */
+	/* there should be no parent-child relationship between them */
+	h_isrc = d_inode(h_src);
+	h_idst = d_inode(h_dst);
+	inode_unlock(h_idst);
+	inode_lock_nested(h_isrc, AuLsc_I_CHILD);
+	inode_lock_nested(h_idst, AuLsc_I_CHILD2);
+	unlocked = 0;
+
+	/* some filesystems don't list POSIX ACL, for example tmpfs */
+	ssz = vfs_listxattr(h_src, NULL, 0);
+	err = ssz;
+	if (unlikely(err < 0)) {
+		AuTraceErr(err);
+		if (err == -ENODATA
+		    || err == -EOPNOTSUPP)
+			err = 0;	/* ignore */
+		goto out;
+	}
+
+	err = 0;
+	p = NULL;
+	o = NULL;
+	if (ssz) {
+		err = -ENOMEM;
+		p = kmalloc(ssz, GFP_NOFS);
+		o = p;
+		if (unlikely(!p))
+			goto out;
+		err = vfs_listxattr(h_src, p, ssz);
+	}
+	inode_unlock(h_isrc);
+	unlocked = 1;
+	AuDbg("err %d, ssz %zd\n", err, ssz);
+	if (unlikely(err < 0))
+		goto out_free;
+
+	err = 0;
+	e = p + ssz;
+	value = NULL;
+	acl_access = 0;
+	acl_default = 0;
+	while (!err && p < e) {
+		acl_access |= !strncmp(p, XATTR_NAME_POSIX_ACL_ACCESS,
+				       sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1);
+		acl_default |= !strncmp(p, XATTR_NAME_POSIX_ACL_DEFAULT,
+					sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)
+					- 1);
+		err = au_do_cpup_xattr(h_dst, h_src, p, &value, ignore_flags,
+				       verbose);
+		p += strlen(p) + 1;
+	}
+	AuTraceErr(err);
+	ignore_flags |= au_xattr_out_of_list;
+	if (!err && !acl_access) {
+		err = au_do_cpup_xattr(h_dst, h_src,
+				       XATTR_NAME_POSIX_ACL_ACCESS, &value,
+				       ignore_flags, verbose);
+		AuTraceErr(err);
+	}
+	if (!err && !acl_default) {
+		err = au_do_cpup_xattr(h_dst, h_src,
+				       XATTR_NAME_POSIX_ACL_DEFAULT, &value,
+				       ignore_flags, verbose);
+		AuTraceErr(err);
+	}
+
+	if (value)
+		au_delayed_kfree(value);
+
+out_free:
+	if (o)
+		au_delayed_kfree(o);
+out:
+	if (!unlocked)
+		inode_unlock(h_isrc);
+	AuTraceErr(err);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+enum {
+	AU_XATTR_LIST,
+	AU_XATTR_GET
+};
+
+struct au_lgxattr {
+	int type;
+	union {
+		struct {
+			char	*list;
+			size_t	size;
+		} list;
+		struct {
+			const char	*name;
+			void		*value;
+			size_t		size;
+		} get;
+	} u;
+};
+
+static ssize_t au_lgxattr(struct dentry *dentry, struct au_lgxattr *arg)
+{
+	ssize_t err;
+	struct path h_path;
+	struct super_block *sb;
+
+	sb = dentry->d_sb;
+	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
+	if (unlikely(err))
+		goto out;
+	err = au_h_path_getattr(dentry, /*force*/1, &h_path);
+	if (unlikely(err))
+		goto out_si;
+	if (unlikely(!h_path.dentry))
+		/* illegally overlapped or something */
+		goto out_di; /* pretending success */
+
+	/* always topmost entry only */
+	switch (arg->type) {
+	case AU_XATTR_LIST:
+		err = vfs_listxattr(h_path.dentry,
+				    arg->u.list.list, arg->u.list.size);
+		break;
+	case AU_XATTR_GET:
+		AuDebugOn(d_is_negative(h_path.dentry));
+		err = vfs_getxattr(h_path.dentry,
+				   arg->u.get.name, arg->u.get.value,
+				   arg->u.get.size);
+		break;
+	}
+
+out_di:
+	di_read_unlock(dentry, AuLock_IR);
+out_si:
+	si_read_unlock(sb);
+out:
+	AuTraceErr(err);
+	return err;
+}
+
+ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+	struct au_lgxattr arg = {
+		.type = AU_XATTR_LIST,
+		.u.list = {
+			.list	= list,
+			.size	= size
+		},
+	};
+
+	return au_lgxattr(dentry, &arg);
+}
+
+ssize_t aufs_getxattr(struct dentry *dentry, struct inode *inode __maybe_unused,
+		      const char *name, void *value, size_t size)
+{
+	struct au_lgxattr arg = {
+		.type = AU_XATTR_GET,
+		.u.get = {
+			.name	= name,
+			.value	= value,
+			.size	= size
+		},
+	};
+
+	return au_lgxattr(dentry, &arg);
+}
+
+int aufs_setxattr(struct dentry *dentry, struct inode *inode, const char *name,
+		  const void *value, size_t size, int flags)
+{
+	struct au_srxattr arg = {
+		.type = AU_XATTR_SET,
+		.u.set = {
+			.name	= name,
+			.value	= value,
+			.size	= size,
+			.flags	= flags
+		},
+	};
+
+	return au_srxattr(dentry, inode, &arg);
+}
+
+int aufs_removexattr(struct dentry *dentry, const char *name)
+{
+	struct au_srxattr arg = {
+		.type = AU_XATTR_REMOVE,
+		.u.remove = {
+			.name	= name
+		},
+	};
+
+	return au_srxattr(dentry, d_inode(dentry), &arg);
+}
+
+/* ---------------------------------------------------------------------- */
+
+#if 0
+static size_t au_xattr_list(struct dentry *dentry, char *list, size_t list_size,
+			    const char *name, size_t name_len, int type)
+{
+	return aufs_listxattr(dentry, list, list_size);
+}
+
+static int au_xattr_get(struct dentry *dentry, const char *name, void *buffer,
+			size_t size, int type)
+{
+	return aufs_getxattr(dentry, name, buffer, size);
+}
+
+static int au_xattr_set(struct dentry *dentry, const char *name,
+			const void *value, size_t size, int flags, int type)
+{
+	return aufs_setxattr(dentry, name, value, size, flags);
+}
+
+static const struct xattr_handler au_xattr_handler = {
+	/* no prefix, no flags */
+	.list	= au_xattr_list,
+	.get	= au_xattr_get,
+	.set	= au_xattr_set
+	/* why no remove? */
+};
+
+static const struct xattr_handler *au_xattr_handlers[] = {
+	&au_xattr_handler
+};
+
+void au_xattr_init(struct super_block *sb)
+{
+	/* sb->s_xattr = au_xattr_handlers; */
+}
+#endif
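au_xattr_ignore() above decides, per xattr namespace, whether a copy-up error may be suppressed: the name's prefix selects one AuBrAttr_ICEX_* bit, anything unrecognized falls back to AuBrAttr_ICEX_OTH, and -ENOMEM/-EDQUOT are never ignored. Below is a simplified userspace sketch of that decision (it drops the "ignore everything" shortcut); the EX_ICEX_* values and the lookup table are stand-ins, and only the well-known xattr namespace prefixes are real.

/* simplified sketch of the prefix-based error suppression */
#include <errno.h>
#include <stdio.h>
#include <string.h>

#define EX_ICEX_SEC	0x01
#define EX_ICEX_SYS	0x02
#define EX_ICEX_TR	0x04
#define EX_ICEX_USR	0x08
#define EX_ICEX_OTH	0x10

static int ex_xattr_ignore(int err, const char *name, unsigned int ignore)
{
	static const struct {
		const char *prefix;
		unsigned int flag;
	} tbl[] = {
		{ "security.",	EX_ICEX_SEC },
		{ "system.",	EX_ICEX_SYS },
		{ "trusted.",	EX_ICEX_TR },
		{ "user.",	EX_ICEX_USR },
	};
	size_t i;

	if (!ignore || err == -ENOMEM || err == -EDQUOT)
		return err;	/* these are never suppressed */
	for (i = 0; i < sizeof(tbl) / sizeof(tbl[0]); i++)
		if (!strncmp(name, tbl[i].prefix, strlen(tbl[i].prefix)))
			return (ignore & tbl[i].flag) ? 0 : err;
	return (ignore & EX_ICEX_OTH) ? 0 : err;	/* unknown namespace */
}

int main(void)
{
	/* suppressed: "user." prefix and the USR bit is set */
	printf("%d\n", ex_xattr_ignore(-EOPNOTSUPP, "user.comment", EX_ICEX_USR));
	/* kept: "trusted." prefix but only the USR bit is set */
	printf("%d\n", ex_xattr_ignore(-EOPNOTSUPP, "trusted.example",
				       EX_ICEX_USR));
	return 0;
}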
diff --git b/fs/aufs/xino.c b/fs/aufs/xino.c
new file mode 100644
index 0000000..2a411f9
--- /dev/null
+++ b/fs/aufs/xino.c
@@ -0,0 +1,1305 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+/*
+ * external inode number translation table and bitmap
+ */
+
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include "aufs.h"
+
+/* todo: unnecessary to support mmap_sem since kernel-space? */
+ssize_t xino_fread(vfs_readf_t func, struct file *file, void *kbuf, size_t size,
+		   loff_t *pos)
+{
+	ssize_t err;
+	mm_segment_t oldfs;
+	union {
+		void *k;
+		char __user *u;
+	} buf;
+
+	buf.k = kbuf;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	do {
+		/* todo: signal_pending? */
+		err = func(file, buf.u, size, pos);
+	} while (err == -EAGAIN || err == -EINTR);
+	set_fs(oldfs);
+
+#if 0 /* reserved for future use */
+	if (err > 0)
+		fsnotify_access(file->f_path.dentry);
+#endif
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static ssize_t xino_fwrite_wkq(vfs_writef_t func, struct file *file, void *buf,
+			       size_t size, loff_t *pos);
+
+static ssize_t do_xino_fwrite(vfs_writef_t func, struct file *file, void *kbuf,
+			      size_t size, loff_t *pos)
+{
+	ssize_t err;
+	mm_segment_t oldfs;
+	union {
+		void *k;
+		const char __user *u;
+	} buf;
+	int i;
+	const int prevent_endless = 10;
+
+	i = 0;
+	buf.k = kbuf;
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+	do {
+		err = func(file, buf.u, size, pos);
+		if (err == -EINTR
+		    && !au_wkq_test()
+		    && fatal_signal_pending(current)) {
+			set_fs(oldfs);
+			err = xino_fwrite_wkq(func, file, kbuf, size, pos);
+			BUG_ON(err == -EINTR);
+			oldfs = get_fs();
+			set_fs(KERNEL_DS);
+		}
+	} while (i++ < prevent_endless
+		 && (err == -EAGAIN || err == -EINTR));
+	set_fs(oldfs);
+
+#if 0 /* reserved for future use */
+	if (err > 0)
+		fsnotify_modify(file->f_path.dentry);
+#endif
+
+	return err;
+}
+
+struct do_xino_fwrite_args {
+	ssize_t *errp;
+	vfs_writef_t func;
+	struct file *file;
+	void *buf;
+	size_t size;
+	loff_t *pos;
+};
+
+static void call_do_xino_fwrite(void *args)
+{
+	struct do_xino_fwrite_args *a = args;
+	*a->errp = do_xino_fwrite(a->func, a->file, a->buf, a->size, a->pos);
+}
+
+static ssize_t xino_fwrite_wkq(vfs_writef_t func, struct file *file, void *buf,
+			       size_t size, loff_t *pos)
+{
+	ssize_t err;
+	int wkq_err;
+	struct do_xino_fwrite_args args = {
+		.errp	= &err,
+		.func	= func,
+		.file	= file,
+		.buf	= buf,
+		.size	= size,
+		.pos	= pos
+	};
+
+	/*
+	 * this breaks RLIMIT_FSIZE and the normal user's limit;
+	 * users should care about quota and a real 'filesystem full.'
+	 */
+	wkq_err = au_wkq_wait(call_do_xino_fwrite, &args);
+	if (unlikely(wkq_err))
+		err = wkq_err;
+
+	return err;
+}
+
+ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf,
+		    size_t size, loff_t *pos)
+{
+	ssize_t err;
+
+	if (rlimit(RLIMIT_FSIZE) == RLIM_INFINITY) {
+		lockdep_off();
+		err = do_xino_fwrite(func, file, buf, size, pos);
+		lockdep_on();
+	} else
+		err = xino_fwrite_wkq(func, file, buf, size, pos);
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * create a new xinofile at the same place/path as @base_file.
+ */
+struct file *au_xino_create2(struct file *base_file, struct file *copy_src)
+{
+	struct file *file;
+	struct dentry *base, *parent;
+	struct inode *dir, *delegated;
+	struct qstr *name;
+	struct path path;
+	int err;
+
+	base = base_file->f_path.dentry;
+	parent = base->d_parent; /* dir inode is locked */
+	dir = d_inode(parent);
+	IMustLock(dir);
+
+	file = ERR_PTR(-EINVAL);
+	name = &base->d_name;
+	path.dentry = vfsub_lookup_one_len(name->name, parent, name->len);
+	if (IS_ERR(path.dentry)) {
+		file = (void *)path.dentry;
+		pr_err("%pd lookup err %ld\n",
+		       base, PTR_ERR(path.dentry));
+		goto out;
+	}
+
+	/* no need to mnt_want_write() since we call dentry_open() later */
+	err = vfs_create(dir, path.dentry, S_IRUGO | S_IWUGO, NULL);
+	if (unlikely(err)) {
+		file = ERR_PTR(err);
+		pr_err("%pd create err %d\n", base, err);
+		goto out_dput;
+	}
+
+	path.mnt = base_file->f_path.mnt;
+	file = vfsub_dentry_open(&path,
+				 O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE
+				 /* | __FMODE_NONOTIFY */);
+	if (IS_ERR(file)) {
+		pr_err("%pd open err %ld\n", base, PTR_ERR(file));
+		goto out_dput;
+	}
+
+	delegated = NULL;
+	err = vfsub_unlink(dir, &file->f_path, &delegated, /*force*/0);
+	if (unlikely(err == -EWOULDBLOCK)) {
+		pr_warn("cannot retry for NFSv4 delegation"
+			" for an internal unlink\n");
+		iput(delegated);
+	}
+	if (unlikely(err)) {
+		pr_err("%pd unlink err %d\n", base, err);
+		goto out_fput;
+	}
+
+	if (copy_src) {
+		/* no one can touch copy_src xino */
+		err = au_copy_file(file, copy_src, vfsub_f_size_read(copy_src));
+		if (unlikely(err)) {
+			pr_err("%pd copy err %d\n", base, err);
+			goto out_fput;
+		}
+	}
+	goto out_dput; /* success */
+
+out_fput:
+	fput(file);
+	file = ERR_PTR(err);
+out_dput:
+	dput(path.dentry);
+out:
+	return file;
+}
+
+struct au_xino_lock_dir {
+	struct au_hinode *hdir;
+	struct dentry *parent;
+	struct inode *dir;
+};
+
+static void au_xino_lock_dir(struct super_block *sb, struct file *xino,
+			     struct au_xino_lock_dir *ldir)
+{
+	aufs_bindex_t brid, bindex;
+
+	ldir->hdir = NULL;
+	bindex = -1;
+	brid = au_xino_brid(sb);
+	if (brid >= 0)
+		bindex = au_br_index(sb, brid);
+	if (bindex >= 0) {
+		ldir->hdir = au_hi(d_inode(sb->s_root), bindex);
+		au_hn_inode_lock_nested(ldir->hdir, AuLsc_I_PARENT);
+	} else {
+		ldir->parent = dget_parent(xino->f_path.dentry);
+		ldir->dir = d_inode(ldir->parent);
+		inode_lock_nested(ldir->dir, AuLsc_I_PARENT);
+	}
+}
+
+static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir)
+{
+	if (ldir->hdir)
+		au_hn_inode_unlock(ldir->hdir);
+	else {
+		inode_unlock(ldir->dir);
+		dput(ldir->parent);
+	}
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* truncate xino files asynchronously */
+
+int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex)
+{
+	int err;
+	unsigned long jiffy;
+	blkcnt_t blocks;
+	aufs_bindex_t bi, bbot;
+	struct kstatfs *st;
+	struct au_branch *br;
+	struct file *new_xino, *file;
+	struct super_block *h_sb;
+	struct au_xino_lock_dir ldir;
+
+	err = -ENOMEM;
+	st = kmalloc(sizeof(*st), GFP_NOFS);
+	if (unlikely(!st))
+		goto out;
+
+	err = -EINVAL;
+	bbot = au_sbbot(sb);
+	if (unlikely(bindex < 0 || bbot < bindex))
+		goto out_st;
+	br = au_sbr(sb, bindex);
+	file = br->br_xino.xi_file;
+	if (!file)
+		goto out_st;
+
+	err = vfs_statfs(&file->f_path, st);
+	if (unlikely(err))
+		AuErr1("statfs err %d, ignored\n", err);
+	jiffy = jiffies;
+	blocks = file_inode(file)->i_blocks;
+	pr_info("begin truncating xino(b%d), ib%llu, %llu/%llu free blks\n",
+		bindex, (u64)blocks, st->f_bfree, st->f_blocks);
+
+	au_xino_lock_dir(sb, file, &ldir);
+	/* mnt_want_write() is unnecessary here */
+	new_xino = au_xino_create2(file, file);
+	au_xino_unlock_dir(&ldir);
+	err = PTR_ERR(new_xino);
+	if (IS_ERR(new_xino)) {
+		pr_err("err %d, ignored\n", err);
+		goto out_st;
+	}
+	err = 0;
+	fput(file);
+	br->br_xino.xi_file = new_xino;
+
+	h_sb = au_br_sb(br);
+	for (bi = 0; bi <= bbot; bi++) {
+		if (unlikely(bi == bindex))
+			continue;
+		br = au_sbr(sb, bi);
+		if (au_br_sb(br) != h_sb)
+			continue;
+
+		fput(br->br_xino.xi_file);
+		br->br_xino.xi_file = new_xino;
+		get_file(new_xino);
+	}
+
+	err = vfs_statfs(&new_xino->f_path, st);
+	if (!err) {
+		pr_info("end truncating xino(b%d), ib%llu, %llu/%llu free blks\n",
+			bindex, (u64)file_inode(new_xino)->i_blocks,
+			st->f_bfree, st->f_blocks);
+		if (file_inode(new_xino)->i_blocks < blocks)
+			au_sbi(sb)->si_xino_jiffy = jiffy;
+	} else
+		AuErr1("statfs err %d, ignored\n", err);
+
+out_st:
+	au_delayed_kfree(st);
+out:
+	return err;
+}
+
+struct xino_do_trunc_args {
+	struct super_block *sb;
+	struct au_branch *br;
+};
+
+static void xino_do_trunc(void *_args)
+{
+	struct xino_do_trunc_args *args = _args;
+	struct super_block *sb;
+	struct au_branch *br;
+	struct inode *dir;
+	int err;
+	aufs_bindex_t bindex;
+
+	err = 0;
+	sb = args->sb;
+	dir = d_inode(sb->s_root);
+	br = args->br;
+
+	si_noflush_write_lock(sb);
+	ii_read_lock_parent(dir);
+	bindex = au_br_index(sb, br->br_id);
+	err = au_xino_trunc(sb, bindex);
+	ii_read_unlock(dir);
+	if (unlikely(err))
+		pr_warn("err b%d, (%d)\n", bindex, err);
+	atomic_dec(&br->br_xino_running);
+	au_br_put(br);
+	si_write_unlock(sb);
+	au_nwt_done(&au_sbi(sb)->si_nowait);
+	au_delayed_kfree(args);
+}
+
+static int xino_trunc_test(struct super_block *sb, struct au_branch *br)
+{
+	int err;
+	struct kstatfs st;
+	struct au_sbinfo *sbinfo;
+
+	/* todo: si_xino_expire and the ratio should be customizable */
+	sbinfo = au_sbi(sb);
+	if (time_before(jiffies,
+			sbinfo->si_xino_jiffy + sbinfo->si_xino_expire))
+		return 0;
+
+	/* truncation border */
+	err = vfs_statfs(&br->br_xino.xi_file->f_path, &st);
+	if (unlikely(err)) {
+		AuErr1("statfs err %d, ignored\n", err);
+		return 0;
+	}
+	if (div64_u64(st.f_bfree * 100, st.f_blocks) >= AUFS_XINO_DEF_TRUNC)
+		return 0;
+
+	return 1;
+}
+
+static void xino_try_trunc(struct super_block *sb, struct au_branch *br)
+{
+	struct xino_do_trunc_args *args;
+	int wkq_err;
+
+	if (!xino_trunc_test(sb, br))
+		return;
+
+	if (atomic_inc_return(&br->br_xino_running) > 1)
+		goto out;
+
+	/* lock and kfree() will be called in xino_do_trunc() */
+	args = kmalloc(sizeof(*args), GFP_NOFS);
+	if (unlikely(!args)) {
+		AuErr1("no memory\n");
+		goto out;
+	}
+
+	au_br_get(br);
+	args->sb = sb;
+	args->br = br;
+	wkq_err = au_wkq_nowait(xino_do_trunc, args, sb, /*flags*/0);
+	if (!wkq_err)
+		return; /* success */
+
+	pr_err("wkq %d\n", wkq_err);
+	au_br_put(br);
+	au_delayed_kfree(args);
+
+out:
+	atomic_dec(&br->br_xino_running);
+}
+
+/* ---------------------------------------------------------------------- */
+
+static int au_xino_do_write(vfs_writef_t write, struct file *file,
+			    ino_t h_ino, ino_t ino)
+{
+	loff_t pos;
+	ssize_t sz;
+
+	pos = h_ino;
+	if (unlikely(au_loff_max / sizeof(ino) - 1 < pos)) {
+		AuIOErr1("too large hi%lu\n", (unsigned long)h_ino);
+		return -EFBIG;
+	}
+	pos *= sizeof(ino);
+	sz = xino_fwrite(write, file, &ino, sizeof(ino), &pos);
+	if (sz == sizeof(ino))
+		return 0; /* success */
+
+	AuIOErr("write failed (%zd)\n", sz);
+	return -EIO;
+}
+
+/*
+ * write @ino to the xinofile for the specified branch{@sb, @bindex}
+ * at the position of @h_ino.
+ * even if @ino is zero, it is written to the xinofile and means no entry.
+ * if the size of the xino file on a specific filesystem exceeds the watermark,
+ * try truncating it.
+ */
+int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
+		  ino_t ino)
+{
+	int err;
+	unsigned int mnt_flags;
+	struct au_branch *br;
+
+	BUILD_BUG_ON(sizeof(long long) != sizeof(au_loff_max)
+		     || ((loff_t)-1) > 0);
+	SiMustAnyLock(sb);
+
+	mnt_flags = au_mntflags(sb);
+	if (!au_opt_test(mnt_flags, XINO))
+		return 0;
+
+	br = au_sbr(sb, bindex);
+	err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file,
+			       h_ino, ino);
+	if (!err) {
+		if (au_opt_test(mnt_flags, TRUNC_XINO)
+		    && au_test_fs_trunc_xino(au_br_sb(br)))
+			xino_try_trunc(sb, br);
+		return 0; /* success */
+	}
+
+	AuIOErr("write failed (%d)\n", err);
+	return -EIO;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* aufs inode number bitmap */
+
+static const int page_bits = (int)PAGE_SIZE * BITS_PER_BYTE;
+static ino_t xib_calc_ino(unsigned long pindex, int bit)
+{
+	ino_t ino;
+
+	AuDebugOn(bit < 0 || page_bits <= bit);
+	ino = AUFS_FIRST_INO + pindex * page_bits + bit;
+	return ino;
+}
+
+static void xib_calc_bit(ino_t ino, unsigned long *pindex, int *bit)
+{
+	AuDebugOn(ino < AUFS_FIRST_INO);
+	ino -= AUFS_FIRST_INO;
+	*pindex = ino / page_bits;
+	*bit = ino % page_bits;
+}
+
+static int xib_pindex(struct super_block *sb, unsigned long pindex)
+{
+	int err;
+	loff_t pos;
+	ssize_t sz;
+	struct au_sbinfo *sbinfo;
+	struct file *xib;
+	unsigned long *p;
+
+	sbinfo = au_sbi(sb);
+	MtxMustLock(&sbinfo->si_xib_mtx);
+	AuDebugOn(pindex > ULONG_MAX / PAGE_SIZE
+		  || !au_opt_test(sbinfo->si_mntflags, XINO));
+
+	if (pindex == sbinfo->si_xib_last_pindex)
+		return 0;
+
+	xib = sbinfo->si_xib;
+	p = sbinfo->si_xib_buf;
+	pos = sbinfo->si_xib_last_pindex;
+	pos *= PAGE_SIZE;
+	sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos);
+	if (unlikely(sz != PAGE_SIZE))
+		goto out;
+
+	pos = pindex;
+	pos *= PAGE_SIZE;
+	if (vfsub_f_size_read(xib) >= pos + PAGE_SIZE)
+		sz = xino_fread(sbinfo->si_xread, xib, p, PAGE_SIZE, &pos);
+	else {
+		memset(p, 0, PAGE_SIZE);
+		sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos);
+	}
+	if (sz == PAGE_SIZE) {
+		sbinfo->si_xib_last_pindex = pindex;
+		return 0; /* success */
+	}
+
+out:
+	AuIOErr1("write failed (%zd)\n", sz);
+	err = sz;
+	if (sz >= 0)
+		err = -EIO;
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+static void au_xib_clear_bit(struct inode *inode)
+{
+	int err, bit;
+	unsigned long pindex;
+	struct super_block *sb;
+	struct au_sbinfo *sbinfo;
+
+	AuDebugOn(inode->i_nlink);
+
+	sb = inode->i_sb;
+	xib_calc_bit(inode->i_ino, &pindex, &bit);
+	AuDebugOn(page_bits <= bit);
+	sbinfo = au_sbi(sb);
+	mutex_lock(&sbinfo->si_xib_mtx);
+	err = xib_pindex(sb, pindex);
+	if (!err) {
+		clear_bit(bit, sbinfo->si_xib_buf);
+		sbinfo->si_xib_next_bit = bit;
+	}
+	mutex_unlock(&sbinfo->si_xib_mtx);
+}
+
+/* for s_op->delete_inode() */
+void au_xino_delete_inode(struct inode *inode, const int unlinked)
+{
+	int err;
+	unsigned int mnt_flags;
+	aufs_bindex_t bindex, bbot, bi;
+	unsigned char try_trunc;
+	struct au_iinfo *iinfo;
+	struct super_block *sb;
+	struct au_hinode *hi;
+	struct inode *h_inode;
+	struct au_branch *br;
+	vfs_writef_t xwrite;
+
+	AuDebugOn(au_is_bad_inode(inode));
+
+	sb = inode->i_sb;
+	mnt_flags = au_mntflags(sb);
+	if (!au_opt_test(mnt_flags, XINO)
+	    || inode->i_ino == AUFS_ROOT_INO)
+		return;
+
+	if (unlinked) {
+		au_xigen_inc(inode);
+		au_xib_clear_bit(inode);
+	}
+
+	iinfo = au_ii(inode);
+	bindex = iinfo->ii_btop;
+	if (bindex < 0)
+		return;
+
+	xwrite = au_sbi(sb)->si_xwrite;
+	try_trunc = !!au_opt_test(mnt_flags, TRUNC_XINO);
+	hi = au_hinode(iinfo, bindex);
+	bbot = iinfo->ii_bbot;
+	for (; bindex <= bbot; bindex++, hi++) {
+		h_inode = hi->hi_inode;
+		if (!h_inode
+		    || (!unlinked && h_inode->i_nlink))
+			continue;
+
+		/* inode may not be revalidated */
+		bi = au_br_index(sb, hi->hi_id);
+		if (bi < 0)
+			continue;
+
+		br = au_sbr(sb, bi);
+		err = au_xino_do_write(xwrite, br->br_xino.xi_file,
+				       h_inode->i_ino, /*ino*/0);
+		if (!err && try_trunc
+		    && au_test_fs_trunc_xino(au_br_sb(br)))
+			xino_try_trunc(sb, br);
+	}
+}
+
+/* get an unused inode number from bitmap */
+ino_t au_xino_new_ino(struct super_block *sb)
+{
+	ino_t ino;
+	unsigned long *p, pindex, ul, pend;
+	struct au_sbinfo *sbinfo;
+	struct file *file;
+	int free_bit, err;
+
+	if (!au_opt_test(au_mntflags(sb), XINO))
+		return iunique(sb, AUFS_FIRST_INO);
+
+	sbinfo = au_sbi(sb);
+	mutex_lock(&sbinfo->si_xib_mtx);
+	p = sbinfo->si_xib_buf;
+	free_bit = sbinfo->si_xib_next_bit;
+	if (free_bit < page_bits && !test_bit(free_bit, p))
+		goto out; /* success */
+	free_bit = find_first_zero_bit(p, page_bits);
+	if (free_bit < page_bits)
+		goto out; /* success */
+
+	pindex = sbinfo->si_xib_last_pindex;
+	for (ul = pindex - 1; ul < ULONG_MAX; ul--) {
+		err = xib_pindex(sb, ul);
+		if (unlikely(err))
+			goto out_err;
+		free_bit = find_first_zero_bit(p, page_bits);
+		if (free_bit < page_bits)
+			goto out; /* success */
+	}
+
+	file = sbinfo->si_xib;
+	pend = vfsub_f_size_read(file) / PAGE_SIZE;
+	for (ul = pindex + 1; ul <= pend; ul++) {
+		err = xib_pindex(sb, ul);
+		if (unlikely(err))
+			goto out_err;
+		free_bit = find_first_zero_bit(p, page_bits);
+		if (free_bit < page_bits)
+			goto out; /* success */
+	}
+	BUG();
+
+out:
+	set_bit(free_bit, p);
+	sbinfo->si_xib_next_bit = free_bit + 1;
+	pindex = sbinfo->si_xib_last_pindex;
+	mutex_unlock(&sbinfo->si_xib_mtx);
+	ino = xib_calc_ino(pindex, free_bit);
+	AuDbg("i%lu\n", (unsigned long)ino);
+	return ino;
+out_err:
+	mutex_unlock(&sbinfo->si_xib_mtx);
+	AuDbg("i0\n");
+	return 0;
+}
+
+/*
+ * read @ino from the xinofile for the specified branch{@sb, @bindex}
+ * at the position of @h_ino.
+ * if no entry exists, @ino is set to zero.
+ */
+int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
+		 ino_t *ino)
+{
+	int err;
+	ssize_t sz;
+	loff_t pos;
+	struct file *file;
+	struct au_sbinfo *sbinfo;
+
+	*ino = 0;
+	if (!au_opt_test(au_mntflags(sb), XINO))
+		return 0; /* no xino */
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	pos = h_ino;
+	if (unlikely(au_loff_max / sizeof(*ino) - 1 < pos)) {
+		AuIOErr1("too large hi%lu\n", (unsigned long)h_ino);
+		return -EFBIG;
+	}
+	pos *= sizeof(*ino);
+
+	file = au_sbr(sb, bindex)->br_xino.xi_file;
+	if (vfsub_f_size_read(file) < pos + sizeof(*ino))
+		return 0; /* no ino */
+
+	sz = xino_fread(sbinfo->si_xread, file, ino, sizeof(*ino), &pos);
+	if (sz == sizeof(*ino))
+		return 0; /* success */
+
+	err = sz;
+	if (unlikely(sz >= 0)) {
+		err = -EIO;
+		AuIOErr("xino read error (%zd)\n", sz);
+	}
+
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* create and set a new xino file */
+
+struct file *au_xino_create(struct super_block *sb, char *fname, int silent)
+{
+	struct file *file;
+	struct dentry *h_parent, *d;
+	struct inode *h_dir, *inode;
+	int err;
+
+	/*
+	 * at mount time, when the xino file is the default path,
+	 * hnotify is disabled so we have no notify events to ignore.
+	 * when a user specifies the xino, we cannot get au_hdir to be ignored.
+	 */
+	file = vfsub_filp_open(fname, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE
+			       /* | __FMODE_NONOTIFY */,
+			       S_IRUGO | S_IWUGO);
+	if (IS_ERR(file)) {
+		if (!silent)
+			pr_err("open %s(%ld)\n", fname, PTR_ERR(file));
+		return file;
+	}
+
+	/* keep file count */
+	err = 0;
+	inode = file_inode(file);
+	h_parent = dget_parent(file->f_path.dentry);
+	h_dir = d_inode(h_parent);
+	inode_lock_nested(h_dir, AuLsc_I_PARENT);
+	/* mnt_want_write() is unnecessary here */
+	/* no delegation since it is just created */
+	if (inode->i_nlink)
+		err = vfsub_unlink(h_dir, &file->f_path, /*delegated*/NULL,
+				   /*force*/0);
+	inode_unlock(h_dir);
+	dput(h_parent);
+	if (unlikely(err)) {
+		if (!silent)
+			pr_err("unlink %s(%d)\n", fname, err);
+		goto out;
+	}
+
+	err = -EINVAL;
+	d = file->f_path.dentry;
+	if (unlikely(sb == d->d_sb)) {
+		if (!silent)
+			pr_err("%s must be outside\n", fname);
+		goto out;
+	}
+	if (unlikely(au_test_fs_bad_xino(d->d_sb))) {
+		if (!silent)
+			pr_err("xino doesn't support %s(%s)\n",
+			       fname, au_sbtype(d->d_sb));
+		goto out;
+	}
+	return file; /* success */
+
+out:
+	fput(file);
+	file = ERR_PTR(err);
+	return file;
+}
+
+/*
+ * find another branch which is on the same filesystem as the specified
+ * branch{@btgt}. search until @bbot.
+ */
+static int is_sb_shared(struct super_block *sb, aufs_bindex_t btgt,
+			aufs_bindex_t bbot)
+{
+	aufs_bindex_t bindex;
+	struct super_block *tgt_sb = au_sbr_sb(sb, btgt);
+
+	for (bindex = 0; bindex < btgt; bindex++)
+		if (unlikely(tgt_sb == au_sbr_sb(sb, bindex)))
+			return bindex;
+	for (bindex++; bindex <= bbot; bindex++)
+		if (unlikely(tgt_sb == au_sbr_sb(sb, bindex)))
+			return bindex;
+	return -1;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * initialize the xinofile for the specified branch @br
+ * at the place/path that @base_file indicates.
+ * test whether another branch is on the same filesystem or not,
+ * if @do_test is true.
+ */
+int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t h_ino,
+	       struct file *base_file, int do_test)
+{
+	int err;
+	ino_t ino;
+	aufs_bindex_t bbot, bindex;
+	struct au_branch *shared_br, *b;
+	struct file *file;
+	struct super_block *tgt_sb;
+
+	shared_br = NULL;
+	bbot = au_sbbot(sb);
+	if (do_test) {
+		tgt_sb = au_br_sb(br);
+		for (bindex = 0; bindex <= bbot; bindex++) {
+			b = au_sbr(sb, bindex);
+			if (tgt_sb == au_br_sb(b)) {
+				shared_br = b;
+				break;
+			}
+		}
+	}
+
+	if (!shared_br || !shared_br->br_xino.xi_file) {
+		struct au_xino_lock_dir ldir;
+
+		au_xino_lock_dir(sb, base_file, &ldir);
+		/* mnt_want_write() is unnecessary here */
+		file = au_xino_create2(base_file, NULL);
+		au_xino_unlock_dir(&ldir);
+		err = PTR_ERR(file);
+		if (IS_ERR(file))
+			goto out;
+		br->br_xino.xi_file = file;
+	} else {
+		br->br_xino.xi_file = shared_br->br_xino.xi_file;
+		get_file(br->br_xino.xi_file);
+	}
+
+	ino = AUFS_ROOT_INO;
+	err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file,
+			       h_ino, ino);
+	if (unlikely(err)) {
+		fput(br->br_xino.xi_file);
+		br->br_xino.xi_file = NULL;
+	}
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* truncate a xino bitmap file */
+
+/* todo: slow */
+static int do_xib_restore(struct super_block *sb, struct file *file, void *page)
+{
+	int err, bit;
+	ssize_t sz;
+	unsigned long pindex;
+	loff_t pos, pend;
+	struct au_sbinfo *sbinfo;
+	vfs_readf_t func;
+	ino_t *ino;
+	unsigned long *p;
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	MtxMustLock(&sbinfo->si_xib_mtx);
+	p = sbinfo->si_xib_buf;
+	func = sbinfo->si_xread;
+	pend = vfsub_f_size_read(file);
+	pos = 0;
+	while (pos < pend) {
+		sz = xino_fread(func, file, page, PAGE_SIZE, &pos);
+		err = sz;
+		if (unlikely(sz <= 0))
+			goto out;
+
+		err = 0;
+		for (ino = page; sz > 0; ino++, sz -= sizeof(ino)) {
+			if (unlikely(*ino < AUFS_FIRST_INO))
+				continue;
+
+			xib_calc_bit(*ino, &pindex, &bit);
+			AuDebugOn(page_bits <= bit);
+			err = xib_pindex(sb, pindex);
+			if (!err)
+				set_bit(bit, p);
+			else
+				goto out;
+		}
+	}
+
+out:
+	return err;
+}
+
+static int xib_restore(struct super_block *sb)
+{
+	int err;
+	aufs_bindex_t bindex, bbot;
+	void *page;
+
+	err = -ENOMEM;
+	page = (void *)__get_free_page(GFP_NOFS);
+	if (unlikely(!page))
+		goto out;
+
+	err = 0;
+	bbot = au_sbbot(sb);
+	for (bindex = 0; !err && bindex <= bbot; bindex++)
+		if (!bindex || is_sb_shared(sb, bindex, bindex - 1) < 0)
+			err = do_xib_restore
+				(sb, au_sbr(sb, bindex)->br_xino.xi_file, page);
+		else
+			AuDbg("b%d\n", bindex);
+	au_delayed_free_page((unsigned long)page);
+
+out:
+	return err;
+}
+
+int au_xib_trunc(struct super_block *sb)
+{
+	int err;
+	ssize_t sz;
+	loff_t pos;
+	struct au_xino_lock_dir ldir;
+	struct au_sbinfo *sbinfo;
+	unsigned long *p;
+	struct file *file;
+
+	SiMustWriteLock(sb);
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	if (!au_opt_test(sbinfo->si_mntflags, XINO))
+		goto out;
+
+	file = sbinfo->si_xib;
+	if (vfsub_f_size_read(file) <= PAGE_SIZE)
+		goto out;
+
+	au_xino_lock_dir(sb, file, &ldir);
+	/* mnt_want_write() is unnecessary here */
+	file = au_xino_create2(sbinfo->si_xib, NULL);
+	au_xino_unlock_dir(&ldir);
+	err = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto out;
+	fput(sbinfo->si_xib);
+	sbinfo->si_xib = file;
+
+	p = sbinfo->si_xib_buf;
+	memset(p, 0, PAGE_SIZE);
+	pos = 0;
+	sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xib, p, PAGE_SIZE, &pos);
+	if (unlikely(sz != PAGE_SIZE)) {
+		err = sz;
+		AuIOErr("err %d\n", err);
+		if (sz >= 0)
+			err = -EIO;
+		goto out;
+	}
+
+	mutex_lock(&sbinfo->si_xib_mtx);
+	/* mnt_want_write() is unnecessary here */
+	err = xib_restore(sb);
+	mutex_unlock(&sbinfo->si_xib_mtx);
+
+out:
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * xino mount option handlers
+ */
+
+/* xino bitmap */
+static void xino_clear_xib(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	sbinfo->si_xread = NULL;
+	sbinfo->si_xwrite = NULL;
+	if (sbinfo->si_xib)
+		fput(sbinfo->si_xib);
+	sbinfo->si_xib = NULL;
+	if (sbinfo->si_xib_buf)
+		au_delayed_free_page((unsigned long)sbinfo->si_xib_buf);
+	sbinfo->si_xib_buf = NULL;
+}
+
+static int au_xino_set_xib(struct super_block *sb, struct file *base)
+{
+	int err;
+	loff_t pos;
+	struct au_sbinfo *sbinfo;
+	struct file *file;
+
+	SiMustWriteLock(sb);
+
+	sbinfo = au_sbi(sb);
+	file = au_xino_create2(base, sbinfo->si_xib);
+	err = PTR_ERR(file);
+	if (IS_ERR(file))
+		goto out;
+	if (sbinfo->si_xib)
+		fput(sbinfo->si_xib);
+	sbinfo->si_xib = file;
+	sbinfo->si_xread = vfs_readf(file);
+	sbinfo->si_xwrite = vfs_writef(file);
+
+	err = -ENOMEM;
+	if (!sbinfo->si_xib_buf)
+		sbinfo->si_xib_buf = (void *)get_zeroed_page(GFP_NOFS);
+	if (unlikely(!sbinfo->si_xib_buf))
+		goto out_unset;
+
+	sbinfo->si_xib_last_pindex = 0;
+	sbinfo->si_xib_next_bit = 0;
+	if (vfsub_f_size_read(file) < PAGE_SIZE) {
+		pos = 0;
+		err = xino_fwrite(sbinfo->si_xwrite, file, sbinfo->si_xib_buf,
+				  PAGE_SIZE, &pos);
+		if (unlikely(err != PAGE_SIZE))
+			goto out_free;
+	}
+	err = 0;
+	goto out; /* success */
+
+out_free:
+	if (sbinfo->si_xib_buf)
+		au_delayed_free_page((unsigned long)sbinfo->si_xib_buf);
+	sbinfo->si_xib_buf = NULL;
+	if (err >= 0)
+		err = -EIO;
+out_unset:
+	fput(sbinfo->si_xib);
+	sbinfo->si_xib = NULL;
+	sbinfo->si_xread = NULL;
+	sbinfo->si_xwrite = NULL;
+out:
+	return err;
+}
+
+/* xino for each branch */
+static void xino_clear_br(struct super_block *sb)
+{
+	aufs_bindex_t bindex, bbot;
+	struct au_branch *br;
+
+	bbot = au_sbbot(sb);
+	for (bindex = 0; bindex <= bbot; bindex++) {
+		br = au_sbr(sb, bindex);
+		if (!br || !br->br_xino.xi_file)
+			continue;
+
+		fput(br->br_xino.xi_file);
+		br->br_xino.xi_file = NULL;
+	}
+}
+
+static int au_xino_set_br(struct super_block *sb, struct file *base)
+{
+	int err;
+	ino_t ino;
+	aufs_bindex_t bindex, bbot, bshared;
+	struct {
+		struct file *old, *new;
+	} *fpair, *p;
+	struct au_branch *br;
+	struct inode *inode;
+	vfs_writef_t writef;
+
+	SiMustWriteLock(sb);
+
+	err = -ENOMEM;
+	bbot = au_sbbot(sb);
+	fpair = kcalloc(bbot + 1, sizeof(*fpair), GFP_NOFS);
+	if (unlikely(!fpair))
+		goto out;
+
+	inode = d_inode(sb->s_root);
+	ino = AUFS_ROOT_INO;
+	writef = au_sbi(sb)->si_xwrite;
+	for (bindex = 0, p = fpair; bindex <= bbot; bindex++, p++) {
+		bshared = is_sb_shared(sb, bindex, bindex - 1);
+		if (bshared >= 0) {
+			/* shared xino */
+			*p = fpair[bshared];
+			get_file(p->new);
+		}
+
+		if (!p->new) {
+			/* new xino */
+			br = au_sbr(sb, bindex);
+			p->old = br->br_xino.xi_file;
+			p->new = au_xino_create2(base, br->br_xino.xi_file);
+			err = PTR_ERR(p->new);
+			if (IS_ERR(p->new)) {
+				p->new = NULL;
+				goto out_pair;
+			}
+		}
+
+		err = au_xino_do_write(writef, p->new,
+				       au_h_iptr(inode, bindex)->i_ino, ino);
+		if (unlikely(err))
+			goto out_pair;
+	}
+
+	for (bindex = 0, p = fpair; bindex <= bbot; bindex++, p++) {
+		br = au_sbr(sb, bindex);
+		if (br->br_xino.xi_file)
+			fput(br->br_xino.xi_file);
+		get_file(p->new);
+		br->br_xino.xi_file = p->new;
+	}
+
+out_pair:
+	for (bindex = 0, p = fpair; bindex <= bbot; bindex++, p++)
+		if (p->new)
+			fput(p->new);
+		else
+			break;
+	au_delayed_kfree(fpair);
+out:
+	return err;
+}
+
+void au_xino_clr(struct super_block *sb)
+{
+	struct au_sbinfo *sbinfo;
+
+	au_xigen_clr(sb);
+	xino_clear_xib(sb);
+	xino_clear_br(sb);
+	sbinfo = au_sbi(sb);
+	/* lvalue, do not call au_mntflags() */
+	au_opt_clr(sbinfo->si_mntflags, XINO);
+}
+
+int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount)
+{
+	int err, skip;
+	struct dentry *parent, *cur_parent;
+	struct qstr *dname, *cur_name;
+	struct file *cur_xino;
+	struct inode *dir;
+	struct au_sbinfo *sbinfo;
+
+	SiMustWriteLock(sb);
+
+	err = 0;
+	sbinfo = au_sbi(sb);
+	parent = dget_parent(xino->file->f_path.dentry);
+	if (remount) {
+		skip = 0;
+		dname = &xino->file->f_path.dentry->d_name;
+		cur_xino = sbinfo->si_xib;
+		if (cur_xino) {
+			cur_parent = dget_parent(cur_xino->f_path.dentry);
+			cur_name = &cur_xino->f_path.dentry->d_name;
+			skip = (cur_parent == parent
+				&& au_qstreq(dname, cur_name));
+			dput(cur_parent);
+		}
+		if (skip)
+			goto out;
+	}
+
+	au_opt_set(sbinfo->si_mntflags, XINO);
+	dir = d_inode(parent);
+	inode_lock_nested(dir, AuLsc_I_PARENT);
+	/* mnt_want_write() is unnecessary here */
+	err = au_xino_set_xib(sb, xino->file);
+	if (!err)
+		err = au_xigen_set(sb, xino->file);
+	if (!err)
+		err = au_xino_set_br(sb, xino->file);
+	inode_unlock(dir);
+	if (!err)
+		goto out; /* success */
+
+	/* reset all */
+	AuIOErr("failed creating xino(%d).\n", err);
+	au_xigen_clr(sb);
+	xino_clear_xib(sb);
+
+out:
+	dput(parent);
+	return err;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/*
+ * create a xinofile at the default place/path.
+ */
+struct file *au_xino_def(struct super_block *sb)
+{
+	struct file *file;
+	char *page, *p;
+	struct au_branch *br;
+	struct super_block *h_sb;
+	struct path path;
+	aufs_bindex_t bbot, bindex, bwr;
+
+	br = NULL;
+	bbot = au_sbbot(sb);
+	bwr = -1;
+	for (bindex = 0; bindex <= bbot; bindex++) {
+		br = au_sbr(sb, bindex);
+		if (au_br_writable(br->br_perm)
+		    && !au_test_fs_bad_xino(au_br_sb(br))) {
+			bwr = bindex;
+			break;
+		}
+	}
+
+	if (bwr >= 0) {
+		file = ERR_PTR(-ENOMEM);
+		page = (void *)__get_free_page(GFP_NOFS);
+		if (unlikely(!page))
+			goto out;
+		path.mnt = au_br_mnt(br);
+		path.dentry = au_h_dptr(sb->s_root, bwr);
+		p = d_path(&path, page, PATH_MAX - sizeof(AUFS_XINO_FNAME));
+		file = (void *)p;
+		if (!IS_ERR(p)) {
+			strcat(p, "/" AUFS_XINO_FNAME);
+			AuDbg("%s\n", p);
+			file = au_xino_create(sb, p, /*silent*/0);
+			if (!IS_ERR(file))
+				au_xino_brid_set(sb, br->br_id);
+		}
+		au_delayed_free_page((unsigned long)page);
+	} else {
+		file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0);
+		if (IS_ERR(file))
+			goto out;
+		h_sb = file->f_path.dentry->d_sb;
+		if (unlikely(au_test_fs_bad_xino(h_sb))) {
+			pr_err("xino doesn't support %s(%s)\n",
+			       AUFS_XINO_DEFPATH, au_sbtype(h_sb));
+			fput(file);
+			file = ERR_PTR(-EINVAL);
+		}
+		if (!IS_ERR(file))
+			au_xino_brid_set(sb, -1);
+	}
+
+out:
+	return file;
+}
+
+/* ---------------------------------------------------------------------- */
+
+int au_xino_path(struct seq_file *seq, struct file *file)
+{
+	int err;
+
+	err = au_seq_path(seq, &file->f_path);
+	if (unlikely(err))
+		goto out;
+
+#define Deleted "\\040(deleted)"
+	seq->count -= sizeof(Deleted) - 1;
+	AuDebugOn(memcmp(seq->buf + seq->count, Deleted,
+			 sizeof(Deleted) - 1));
+#undef Deleted
+
+out:
+	return err;
+}
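Two pieces of arithmetic drive the code above: the xino translation table stores one ino_t per branch inode number at offset h_ino * sizeof(ino_t) (au_xino_do_write()/au_xino_read()), and the xib bitmap maps an aufs inode number to a (page index, bit) pair and back (xib_calc_ino()/xib_calc_bit()). A standalone sketch of that arithmetic follows, for illustration only; EX_FIRST_INO is an arbitrary placeholder for AUFS_FIRST_INO, EX_PAGE_BITS assumes 4 KiB pages, and unsigned long stands in for ino_t.

/* illustrative sketch of the xino/xib address arithmetic */
#include <stdio.h>

#define EX_FIRST_INO	2UL		/* placeholder for AUFS_FIRST_INO */
#define EX_PAGE_BITS	(4096UL * 8)	/* page_bits, assuming 4 KiB pages */

/* mirrors xib_calc_ino(): (page index, bit) -> aufs inode number */
static unsigned long ex_calc_ino(unsigned long pindex, int bit)
{
	return EX_FIRST_INO + pindex * EX_PAGE_BITS + bit;
}

/* mirrors xib_calc_bit(): aufs inode number -> (page index, bit) */
static void ex_calc_bit(unsigned long ino, unsigned long *pindex, int *bit)
{
	ino -= EX_FIRST_INO;
	*pindex = ino / EX_PAGE_BITS;
	*bit = (int)(ino % EX_PAGE_BITS);
}

int main(void)
{
	unsigned long ino, pindex, h_ino = 1000;
	int bit;

	ino = ex_calc_ino(3, 17);
	ex_calc_bit(ino, &pindex, &bit);
	printf("ino %lu -> pindex %lu, bit %d\n", ino, pindex, bit); /* 3, 17 */

	/* table offset used by au_xino_do_write()/au_xino_read() */
	printf("xino offset for h_ino %lu: %lu\n",
	       h_ino, h_ino * (unsigned long)sizeof(unsigned long));
	return 0;
}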
diff --git a/fs/buffer.c b/fs/buffer.c
index 9c8eb9b..6a5f1a0 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1698,7 +1698,7 @@ int __block_write_full_page(struct inode *inode, struct page *page,
 	struct buffer_head *bh, *head;
 	unsigned int blocksize, bbits;
 	int nr_underway = 0;
-	int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
+	int write_flags = wbc_to_write_flags(wbc);
 
 	head = create_page_buffers(page, inode,
 					(1 << BH_Dirty)|(1 << BH_Uptodate));
diff --git a/fs/dcache.c b/fs/dcache.c
index 5c7cc95..76280ee 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1164,7 +1164,7 @@ enum d_walk_ret {
  *
  * The @enter() and @finish() callbacks are called with d_lock held.
  */
-static void d_walk(struct dentry *parent, void *data,
+void d_walk(struct dentry *parent, void *data,
 		   enum d_walk_ret (*enter)(void *, struct dentry *),
 		   void (*finish)(void *))
 {
@@ -1272,6 +1272,7 @@ rename_retry:
 	seq = 1;
 	goto again;
 }
+EXPORT_SYMBOL_GPL(d_walk);
 
 /*
  * Search for at least 1 mount point in the dentry's subdirs.
diff --git a/fs/exec.c b/fs/exec.c
index 6fcfb3f..2b42e5f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -19,7 +19,7 @@
  * current->executable is only used by the procfs.  This allows a dispatch
  * table to check for several different types  of binary formats.  We keep
  * trying until we recognize the file or we run out of supported binary
- * formats. 
+ * formats.
  */
 
 #include <linux/slab.h>
@@ -57,6 +57,9 @@
 #include <linux/oom.h>
 #include <linux/compat.h>
 #include <linux/vmalloc.h>
+#include <linux/ksm.h>
+
+#include <trace/events/fs.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -104,6 +107,7 @@ bool path_noexec(const struct path *path)
 	return (path->mnt->mnt_flags & MNT_NOEXEC) ||
 	       (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
 }
+EXPORT_SYMBOL_GPL(path_noexec);
 
 #ifdef CONFIG_USELIB
 /*
@@ -830,8 +834,10 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
 	if (err)
 		goto exit;
 
-	if (name->name[0] != '\0')
+	if (name->name[0] != '\0') {
 		fsnotify_open(file);
+		trace_open_exec(name->name);
+	}
 
 out:
 	return file;
@@ -1309,6 +1315,7 @@ void setup_new_exec(struct linux_binprm * bprm)
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 	current->self_exec_id++;
+
 	flush_signal_handlers(current, 0);
 	do_close_on_exec(current->files);
 }
diff --git b/fs/exfat/Kconfig b/fs/exfat/Kconfig
new file mode 100644
index 0000000..78b32aa
--- /dev/null
+++ b/fs/exfat/Kconfig
@@ -0,0 +1,39 @@
+config EXFAT_FS
+	tristate "exFAT fs support"
+	select NLS
+	help
+	  This adds support for the exFAT file system.
+
+config EXFAT_DISCARD
+	bool "enable discard support"
+	depends on EXFAT_FS
+	default y
+
+config EXFAT_DELAYED_SYNC
+	bool "enable delayed sync"
+	depends on EXFAT_FS
+	default n
+
+config EXFAT_KERNEL_DEBUG
+	bool "enable kernel debug features via ioctl"
+	depends on EXFAT_FS
+	default n
+
+config EXFAT_DEBUG_MSG
+	bool "print debug messages"
+	depends on EXFAT_FS
+	default n
+
+config EXFAT_DEFAULT_CODEPAGE
+	int "Default codepage for exFAT"
+	default 437
+	depends on EXFAT_FS
+	help
+	  This option should be set to the codepage of your exFAT filesystems.
+
+config EXFAT_DEFAULT_IOCHARSET
+	string "Default iocharset for exFAT"
+	default "utf8"
+	depends on EXFAT_FS
+	help
+	  Set this to the default input/output character set you'd like exFAT to use.
diff --git b/fs/exfat/LICENSE b/fs/exfat/LICENSE
new file mode 100644
index 0000000..d159169
--- /dev/null
+++ b/fs/exfat/LICENSE
@@ -0,0 +1,339 @@
+                    GNU GENERAL PUBLIC LICENSE
+                       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                            Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+                    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                            NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                     END OF TERMS AND CONDITIONS
+
+            How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git b/fs/exfat/Makefile b/fs/exfat/Makefile
new file mode 100644
index 0000000..fb2be9d
--- /dev/null
+++ b/fs/exfat/Makefile
@@ -0,0 +1,55 @@
+#
+# Makefile for Linux FAT12/FAT16/FAT32(VFAT)/FAT64(ExFAT) filesystem driver.
+#
+
+ifneq ($(KERNELRELEASE),)
+# call from kernel build system
+
+obj-$(CONFIG_EXFAT_FS) += exfat.o
+
+exfat-objs := exfat_core.o exfat_super.o exfat_api.o exfat_blkdev.o exfat_cache.o \
+			   exfat_data.o exfat_bitmap.o exfat_nls.o exfat_oal.o exfat_upcase.o
+
+else
+# external module build
+
+EXTRA_FLAGS += -I$(PWD)
+
+#
+# KDIR is a path to a directory containing kernel source.
+# It can be specified on the command line passed to make to enable the module to
+# be built and installed for a kernel other than the one currently running.
+# By default it is the path to the symbolic link created when
+# the current kernel's modules were installed, but
+# any valid path to the directory in which the target kernel's source is located
+# can be provided on the command line.
+#
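+# For example, to build against a different kernel tree (the path below is
+# illustrative only):
+#     make KDIR=/usr/src/linux-headers-5.4.0
+#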
+KDIR	:= /lib/modules/$(shell uname -r)/build
+MDIR	:= /lib/modules/$(shell uname -r)
+PWD	:= $(shell pwd)
+KREL	:= $(shell cd ${KDIR} && make -s kernelrelease)
+
+export CONFIG_EXFAT_FS := m
+
+all:
+	$(MAKE) -C $(KDIR) M=$(PWD) modules
+
+clean:
+	$(MAKE) -C $(KDIR) M=$(PWD) clean
+
+help:
+	$(MAKE) -C $(KDIR) M=$(PWD) help
+
+install: exfat.ko
+	rm -f ${MDIR}/kernel/fs/exfat/exfat.ko
+	install -m644 -b -D exfat.ko ${MDIR}/kernel/fs/exfat/exfat.ko
+	depmod -aq
+
+uninstall:
+	rm -rf ${MDIR}/kernel/fs/exfat
+	depmod -aq
+
+endif
+
+.PHONY : all clean install uninstall
diff --git b/fs/exfat/README.md b/fs/exfat/README.md
new file mode 100644
index 0000000..3e1887f
--- /dev/null
+++ b/fs/exfat/README.md
@@ -0,0 +1,74 @@
+exfat-nofuse
+============
+
+Linux non-fuse read/write kernel driver for the exFAT, FAT12, FAT16 and vfat (FAT32) file systems.<br />
+Originally ported from Android kernel v3.0.
+
+Kudos to ksv1986 for the mutex patch!<br />
+Thanks to JackNorris for being awesome and providing the clear_inode() patch.<br />
+<br />
+Big thanks to lqs for completing the driver!<br />
+Big thanks to benpicco for fixing 3.11.y compatibility!
+
+
+Special thanks to github user AndreiLux for spreading the word about the leak!<br />
+
+
+Installing as a stand-alone module:
+====================================
+
+    make
+    sudo make install
+
+To load the driver manually, run this as root:
+
+    modprobe exfat
+
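+Once the module is loaded, an exFAT volume can be mounted in the usual way
+(the device node and mount point below are only examples):
+
+    mount -t exfat /dev/sdb1 /mnt/exfat
+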
+You may also specify a custom toolchain by setting the CROSS_COMPILE variable; in my case:
+>CROSS_COMPILE=../dorimanx-SG2-I9100-Kernel/android-toolchain/bin/arm-eabi-
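+
+A complete cross-build invocation might then look like this (the toolchain
+prefix and kernel path are hypothetical):
+
+    make CROSS_COMPILE=arm-eabi- KDIR=/path/to/arm/kernel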
+
+Installing as a part of the kernel:
+======================================
+
+Let's take [linux] as the path to your kernel source dir...
+
+	cd [linux]
+	cp -rvf exfat-nofuse [linux]/fs/exfat
+
+edit [linux]/fs/Kconfig
+```
+ menu "DOS/FAT/NT Filesystems"
+
+  source "fs/fat/Kconfig"
+ +source "fs/exfat/Kconfig"
+  source "fs/ntfs/Kconfig"
+  endmenu
+```
+  
+
+edit [linux]/fs/Makefile
+```
+  obj-$(CONFIG_FAT_FS)    += fat/
+ +obj-$(CONFIG_EXFAT_FS)  += exfat/
+  obj-$(CONFIG_BFS_FS)    += bfs/
+```
+
+	cd [linux]
+	make menuconfig
+
+Go to:
+
+    File systems > DOS/FAT/NT
+      check exfat as MODULE (M)
+      (437) Default codepage for exFAT
+      (utf8) Default iocharset for exFAT
+
+Then press ESC back to the main menu, save the configuration (or save an
+alternate configuration file), exit menuconfig, and build your kernel.
+
+Have fun.
+
+Free Software for the Free Minds!
+=================================
diff --git b/fs/exfat/exfat_api.c b/fs/exfat/exfat_api.c
new file mode 100644
index 0000000..32b29f0
--- /dev/null
+++ b/fs/exfat/exfat_api.c
@@ -0,0 +1,528 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_api.c                                               */
+/*  PURPOSE : exFAT API Glue Layer                                      */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+
+#include "exfat_version.h"
+#include "exfat_config.h"
+#include "exfat_data.h"
+#include "exfat_oal.h"
+
+#include "exfat_nls.h"
+#include "exfat_api.h"
+#include "exfat_super.h"
+#include "exfat_core.h"
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions                                        */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Global Variable Definitions                                         */
+/*----------------------------------------------------------------------*/
+
+extern struct semaphore z_sem;
+
+/*----------------------------------------------------------------------*/
+/*  Local Variable Definitions                                          */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Local Function Declarations                                         */
+/*----------------------------------------------------------------------*/
+
+/*======================================================================*/
+/*  Global Function Definitions                                         */
+/*    - All functions for global use have the same return value format, */
+/*      that is, FFS_SUCCESS on success or one of the FFS error codes   */
+/*      on the various error conditions.                                */
+/*======================================================================*/
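+
+/*
+ * Illustrative sketch only (not code taken from this driver): callers are
+ * expected to check the result of each entry point, for example
+ *
+ *	if (FsMountVol(sb) != FFS_SUCCESS)
+ *		goto out;	(and map the FFS code to an errno)
+ */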
+
+/*----------------------------------------------------------------------*/
+/*  exFAT Filesystem Init & Exit Functions                              */
+/*----------------------------------------------------------------------*/
+
+int FsInit(void)
+{
+	return ffsInit();
+}
+
+int FsShutdown(void)
+{
+	return ffsShutdown();
+}
+
+/*----------------------------------------------------------------------*/
+/*  Volume Management Functions                                         */
+/*----------------------------------------------------------------------*/
+
+/* FsMountVol : mount the file system volume */
+int FsMountVol(struct super_block *sb)
+{
+	int err;
+
+	sm_P(&z_sem);
+
+	err = buf_init(sb);
+	if (!err)
+		err = ffsMountVol(sb);
+	else
+		buf_shutdown(sb);
+
+	sm_V(&z_sem);
+
+	return err;
+} /* end of FsMountVol */
+
+/* FsUmountVol : unmount the file system volume */
+int FsUmountVol(struct super_block *sb)
+{
+	int err;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	sm_P(&z_sem);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsUmountVol(sb);
+	buf_shutdown(sb);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	sm_V(&z_sem);
+
+	return err;
+} /* end of FsUmountVol */
+
+/* FsGetVolInfo : get the information of a file system volume */
+int FsGetVolInfo(struct super_block *sb, VOL_INFO_T *info)
+{
+	int err;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of pointer parameters */
+	if (info == NULL)
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsGetVolInfo(sb, info);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsGetVolInfo */
+
+/* FsSyncVol : synchronize a file system volume */
+int FsSyncVol(struct super_block *sb, int do_sync)
+{
+	int err;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsSyncVol(sb, do_sync);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsSyncVol */
+
+
+/*----------------------------------------------------------------------*/
+/*  File Operation Functions                                            */
+/*----------------------------------------------------------------------*/
+
+/* FsLookupFile : look up a file */
+int FsLookupFile(struct inode *inode, char *path, FILE_ID_T *fid)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of pointer parameters */
+	if ((fid == NULL) || (path == NULL) || (*path == '\0'))
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsLookupFile(inode, path, fid);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsLookupFile */
+
+/* FsCreateFile : create a file */
+int FsCreateFile(struct inode *inode, char *path, u8 mode, FILE_ID_T *fid)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of pointer parameters */
+	if ((fid == NULL) || (path == NULL) || (*path == '\0'))
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsCreateFile(inode, path, mode, fid);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsCreateFile */
+
+int FsReadFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *rcount)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of the given file id */
+	if (fid == NULL)
+		return FFS_INVALIDFID;
+
+	/* check the validity of pointer parameters */
+	if (buffer == NULL)
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsReadFile(inode, fid, buffer, count, rcount);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsReadFile */
+
+int FsWriteFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *wcount)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of the given file id */
+	if (fid == NULL)
+		return FFS_INVALIDFID;
+
+	/* check the validity of pointer parameters */
+	if (buffer == NULL)
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsWriteFile(inode, fid, buffer, count, wcount);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsWriteFile */
+
+/* FsTruncateFile : resize the file length */
+int FsTruncateFile(struct inode *inode, u64 old_size, u64 new_size)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	DPRINTK("FsTruncateFile entered (inode %p size %llu)\n", inode, new_size);
+
+	err = ffsTruncateFile(inode, old_size, new_size);
+
+	DPRINTK("FsTruncateFile exited (%d)\n", err);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsTruncateFile */
+
+/* FsMoveFile : move (rename) an old file to a new file */
+int FsMoveFile(struct inode *old_parent_inode, FILE_ID_T *fid, struct inode *new_parent_inode, struct dentry *new_dentry)
+{
+	int err;
+	struct super_block *sb = old_parent_inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of the given file id */
+	if (fid == NULL)
+		return FFS_INVALIDFID;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsMoveFile(old_parent_inode, fid, new_parent_inode, new_dentry);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsMoveFile */
+
+/* FsRemoveFile : remove a file */
+int FsRemoveFile(struct inode *inode, FILE_ID_T *fid)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of the given file id */
+	if (fid == NULL)
+		return FFS_INVALIDFID;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsRemoveFile(inode, fid);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsRemoveFile */
+
+/* FsSetAttr : set the attribute of a given file */
+int FsSetAttr(struct inode *inode, u32 attr)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsSetAttr(inode, attr);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsSetAttr */
+
+/* FsReadStat : get the information of a given file */
+int FsReadStat(struct inode *inode, DIR_ENTRY_T *info)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsGetStat(inode, info);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsReadStat */
+
+/* FsWriteStat : set the information of a given file */
+int FsWriteStat(struct inode *inode, DIR_ENTRY_T *info)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	DPRINTK("FsWriteStat entered (inode %p info %p)\n", inode, info);
+
+	err = ffsSetStat(inode, info);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	DPRINTK("FsWriteStat exited (%d)\n", err);
+
+	return err;
+} /* end of FsWriteStat */
+
+/* FsMapCluster : return the cluster number in the given cluster offset */
+int FsMapCluster(struct inode *inode, s32 clu_offset, u32 *clu)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of pointer parameters */
+	if (clu == NULL)
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsMapCluster(inode, clu_offset, clu);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsMapCluster */
+
+/*----------------------------------------------------------------------*/
+/*  Directory Operation Functions                                       */
+/*----------------------------------------------------------------------*/
+
+/* FsCreateDir : create(make) a directory */
+int FsCreateDir(struct inode *inode, char *path, FILE_ID_T *fid)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of pointer parameters */
+	if ((fid == NULL) || (path == NULL) || (*path == '\0'))
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsCreateDir(inode, path, fid);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsCreateDir */
+
+/* FsReadDir : read a directory entry from the opened directory */
+int FsReadDir(struct inode *inode, DIR_ENTRY_T *dir_entry)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of pointer parameters */
+	if (dir_entry == NULL)
+		return FFS_ERROR;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsReadDir(inode, dir_entry);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsReadDir */
+
+/* FsRemoveDir : remove a directory */
+int FsRemoveDir(struct inode *inode, FILE_ID_T *fid)
+{
+	int err;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of the given file id */
+	if (fid == NULL)
+		return FFS_INVALIDFID;
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	err = ffsRemoveDir(inode, fid);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return err;
+} /* end of FsRemoveDir */
+
+EXPORT_SYMBOL(FsMountVol);
+EXPORT_SYMBOL(FsUmountVol);
+EXPORT_SYMBOL(FsGetVolInfo);
+EXPORT_SYMBOL(FsSyncVol);
+EXPORT_SYMBOL(FsLookupFile);
+EXPORT_SYMBOL(FsCreateFile);
+EXPORT_SYMBOL(FsReadFile);
+EXPORT_SYMBOL(FsWriteFile);
+EXPORT_SYMBOL(FsTruncateFile);
+EXPORT_SYMBOL(FsMoveFile);
+EXPORT_SYMBOL(FsRemoveFile);
+EXPORT_SYMBOL(FsSetAttr);
+EXPORT_SYMBOL(FsReadStat);
+EXPORT_SYMBOL(FsWriteStat);
+EXPORT_SYMBOL(FsMapCluster);
+EXPORT_SYMBOL(FsCreateDir);
+EXPORT_SYMBOL(FsReadDir);
+EXPORT_SYMBOL(FsRemoveDir);
+
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+/* FsReleaseCache: Release FAT & buf cache */
+int FsReleaseCache(struct super_block *sb)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* acquire the lock for file system critical section */
+	sm_P(&p_fs->v_sem);
+
+	FAT_release_all(sb);
+	buf_release_all(sb);
+
+	/* release the lock for file system critical section */
+	sm_V(&p_fs->v_sem);
+
+	return 0;
+}
+/* FsReleaseCache */
+
+EXPORT_SYMBOL(FsReleaseCache);
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+
+/*======================================================================*/
+/*  Local Function Definitions                                          */
+/*======================================================================*/
diff --git b/fs/exfat/exfat_api.h b/fs/exfat/exfat_api.h
new file mode 100644
index 0000000..84bdf61
--- /dev/null
+++ b/fs/exfat/exfat_api.h
@@ -0,0 +1,206 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_api.h                                               */
+/*  PURPOSE : Header File for exFAT API Glue Layer                      */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_API_H
+#define _EXFAT_API_H
+
+#include <linux/fs.h>
+#include "exfat_config.h"
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions                                        */
+/*----------------------------------------------------------------------*/
+
+#define EXFAT_SUPER_MAGIC       (0x2011BAB0L)
+#define EXFAT_ROOT_INO          1
+
+/* FAT types */
+#define FAT12                   0x01    /* FAT12 */
+#define FAT16                   0x0E    /* Win95 FAT16 (LBA) */
+#define FAT32                   0x0C    /* Win95 FAT32 (LBA) */
+#define EXFAT                   0x07    /* exFAT */
+
+/* file name lengths */
+#define MAX_CHARSET_SIZE        3       /* max size of multi-byte character	*/
+#define MAX_PATH_DEPTH          15      /* max depth of path name */
+#define MAX_NAME_LENGTH         256     /* max len of file name including NULL */
+#define MAX_PATH_LENGTH         260     /* max len of path name including NULL */
+#define DOS_NAME_LENGTH         11      /* DOS file name length excluding NULL */
+#define DOS_PATH_LENGTH         80      /* DOS path name length excluding NULL */
+
+/* file attributes */
+#define ATTR_NORMAL             0x0000
+#define ATTR_READONLY           0x0001
+#define ATTR_HIDDEN             0x0002
+#define ATTR_SYSTEM             0x0004
+#define ATTR_VOLUME             0x0008
+#define ATTR_SUBDIR             0x0010
+#define ATTR_ARCHIVE            0x0020
+#define ATTR_SYMLINK            0x0040
+#define ATTR_EXTEND             0x000F
+#define ATTR_RWMASK             0x007E
+
+/* file creation modes */
+#define FM_REGULAR              0x00
+#define FM_SYMLINK              0x40
+
+/* return values */
+#define FFS_SUCCESS             0
+#define FFS_MEDIAERR            1
+#define FFS_FORMATERR           2
+#define FFS_MOUNTED             3
+#define FFS_NOTMOUNTED          4
+#define FFS_ALIGNMENTERR        5
+#define FFS_SEMAPHOREERR        6
+#define FFS_INVALIDPATH         7
+#define FFS_INVALIDFID          8
+#define FFS_NOTFOUND            9
+#define FFS_FILEEXIST           10
+#define FFS_PERMISSIONERR       11
+#define FFS_NOTOPENED           12
+#define FFS_MAXOPENED           13
+#define FFS_FULL                14
+#define FFS_EOF                 15
+#define FFS_DIRBUSY             16
+#define FFS_MEMORYERR           17
+#define FFS_NAMETOOLONG		18
+#define FFS_ERROR               19
+
+/*----------------------------------------------------------------------*/
+/*  Type Definitions                                                    */
+/*----------------------------------------------------------------------*/
+
+typedef struct {
+	u16      Year;
+	u16      Month;
+	u16      Day;
+	u16      Hour;
+	u16      Minute;
+	u16      Second;
+	u16      MilliSecond;
+} DATE_TIME_T;
+
+typedef struct {
+	u32      Offset;    /* start sector number of the partition */
+	u32      Size;      /* in sectors */
+} PART_INFO_T;
+
+typedef struct {
+	u32      SecSize;    /* sector size in bytes */
+	u32      DevSize;    /* block device size in sectors */
+} DEV_INFO_T;
+
+typedef struct {
+	u32      FatType;
+	u32      ClusterSize;
+	u32      NumClusters;
+	u32      FreeClusters;
+	u32      UsedClusters;
+} VOL_INFO_T;
+
+/* directory structure */
+typedef struct {
+	u32      dir;
+	s32       size;
+	u8       flags;
+} CHAIN_T;
+
+/* file id structure */
+typedef struct {
+	CHAIN_T     dir;
+	s32       entry;
+	u32      type;
+	u32      attr;
+	u32      start_clu;
+	u64      size;
+	u8       flags;
+	s64       rwoffset;
+	s32       hint_last_off;
+	u32      hint_last_clu;
+} FILE_ID_T;
+
+typedef struct {
+	char        Name[MAX_NAME_LENGTH * MAX_CHARSET_SIZE];
+	char        ShortName[DOS_NAME_LENGTH + 2];     /* used only for FAT12/16/32, not used for exFAT */
+	u32      Attr;
+	u64      Size;
+	u32      NumSubdirs;
+	DATE_TIME_T CreateTimestamp;
+	DATE_TIME_T ModifyTimestamp;
+	DATE_TIME_T AccessTimestamp;
+} DIR_ENTRY_T;
+
+/*======================================================================*/
+/*                                                                      */
+/*                     API FUNCTION DECLARATIONS                        */
+/*                  (CHANGE THIS PART IF REQUIRED)                      */
+/*                                                                      */
+/*======================================================================*/
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+/* file system initialization & shutdown functions */
+	int FsInit(void);
+	int FsShutdown(void);
+
+/* volume management functions */
+	int FsMountVol(struct super_block *sb);
+	int FsUmountVol(struct super_block *sb);
+	int FsGetVolInfo(struct super_block *sb, VOL_INFO_T *info);
+	int FsSyncVol(struct super_block *sb, int do_sync);
+
+/* file management functions */
+	int FsLookupFile(struct inode *inode, char *path, FILE_ID_T *fid);
+	int FsCreateFile(struct inode *inode, char *path, u8 mode, FILE_ID_T *fid);
+	int FsReadFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *rcount);
+	int FsWriteFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *wcount);
+	int FsTruncateFile(struct inode *inode, u64 old_size, u64 new_size);
+	int FsMoveFile(struct inode *old_parent_inode, FILE_ID_T *fid, struct inode *new_parent_inode, struct dentry *new_dentry);
+	int FsRemoveFile(struct inode *inode, FILE_ID_T *fid);
+	int FsSetAttr(struct inode *inode, u32 attr);
+	int FsReadStat(struct inode *inode, DIR_ENTRY_T *info);
+	int FsWriteStat(struct inode *inode, DIR_ENTRY_T *info);
+	int FsMapCluster(struct inode *inode, s32 clu_offset, u32 *clu);
+
+/* directory management functions */
+	int FsCreateDir(struct inode *inode, char *path, FILE_ID_T *fid);
+	int FsReadDir(struct inode *inode, DIR_ENTRY_T *dir_entry);
+	int FsRemoveDir(struct inode *inode, FILE_ID_T *fid);
+
+/* debug functions */
+s32 FsReleaseCache(struct super_block *sb);
+
+#endif /* _EXFAT_API_H */
diff --git b/fs/exfat/exfat_bitmap.c b/fs/exfat/exfat_bitmap.c
new file mode 100644
index 0000000..b0672dd
--- /dev/null
+++ b/fs/exfat/exfat_bitmap.c
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_bitmap.c                                            */
+/*  PURPOSE : exFAT Miscellaneous Functions                             */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include "exfat_config.h"
+#include "exfat_bitmap.h"
+
+/*----------------------------------------------------------------------*/
+/*  Bitmap Manipulation Functions                                       */
+/*----------------------------------------------------------------------*/
+
+#define BITMAP_LOC(v)           ((v) >> 3)
+#define BITMAP_SHIFT(v)         ((v) & 0x07)
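+/* e.g. bit 10 lives in bitmap[1] (10 >> 3), at bit position 2 (10 & 0x07) */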
+
+s32 exfat_bitmap_test(u8 *bitmap, int i)
+{
+	u8 data;
+
+	data = bitmap[BITMAP_LOC(i)];
+	if ((data >> BITMAP_SHIFT(i)) & 0x01)
+		return 1;
+	return 0;
+} /* end of Bitmap_test */
+
+void exfat_bitmap_set(u8 *bitmap, int i)
+{
+	bitmap[BITMAP_LOC(i)] |= (0x01 << BITMAP_SHIFT(i));
+} /* end of Bitmap_set */
+
+void exfat_bitmap_clear(u8 *bitmap, int i)
+{
+	bitmap[BITMAP_LOC(i)] &= ~(0x01 << BITMAP_SHIFT(i));
+} /* end of Bitmap_clear */
diff --git b/fs/exfat/exfat_bitmap.h b/fs/exfat/exfat_bitmap.h
new file mode 100644
index 0000000..4f482c7
--- /dev/null
+++ b/fs/exfat/exfat_bitmap.h
@@ -0,0 +1,55 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_bitmap.h                                            */
+/*  PURPOSE : Header File for exFAT Global Definitions & Misc Functions */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_BITMAP_H
+#define _EXFAT_BITMAP_H
+
+#include <linux/types.h>
+
+/*======================================================================*/
+/*                                                                      */
+/*       LIBRARY FUNCTION DECLARATIONS -- OTHER UTILITY FUNCTIONS       */
+/*                    (DO NOT CHANGE THIS PART !!)                      */
+/*                                                                      */
+/*======================================================================*/
+
+/*----------------------------------------------------------------------*/
+/*  Bitmap Manipulation Functions                                       */
+/*----------------------------------------------------------------------*/
+
+s32	exfat_bitmap_test(u8 *bitmap, int i);
+void	exfat_bitmap_set(u8 *bitmap, int i);
+void	exfat_bitmap_clear(u8 *bitmap, int i);
+
+#endif /* _EXFAT_BITMAP_H */
diff --git b/fs/exfat/exfat_blkdev.c b/fs/exfat/exfat_blkdev.c
new file mode 100644
index 0000000..02fa4fe
--- /dev/null
+++ b/fs/exfat/exfat_blkdev.c
@@ -0,0 +1,197 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_blkdev.c                                            */
+/*  PURPOSE : exFAT Block Device Driver Glue Layer                      */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include <linux/blkdev.h>
+#include <linux/log2.h>
+#include "exfat_config.h"
+#include "exfat_blkdev.h"
+#include "exfat_data.h"
+#include "exfat_api.h"
+#include "exfat_super.h"
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions                                        */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Global Variable Definitions                                         */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Local Variable Definitions                                          */
+/*----------------------------------------------------------------------*/
+
+/*======================================================================*/
+/*  Function Definitions                                                */
+/*======================================================================*/
+
+s32 bdev_init(void)
+{
+	return FFS_SUCCESS;
+}
+
+s32 bdev_shutdown(void)
+{
+	return FFS_SUCCESS;
+}
+
+s32 bdev_open(struct super_block *sb)
+{
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (p_bd->opened)
+		return FFS_SUCCESS;
+
+	p_bd->sector_size      = bdev_logical_block_size(sb->s_bdev);
+	p_bd->sector_size_bits = ilog2(p_bd->sector_size);
+	p_bd->sector_size_mask = p_bd->sector_size - 1;
+	p_bd->num_sectors      = i_size_read(sb->s_bdev->bd_inode) >> p_bd->sector_size_bits;
+
+	p_bd->opened = TRUE;
+
+	return FFS_SUCCESS;
+}
+
+s32 bdev_close(struct super_block *sb)
+{
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (!p_bd->opened)
+		return FFS_SUCCESS;
+
+	p_bd->opened = FALSE;
+	return FFS_SUCCESS;
+}
+
+s32 bdev_read(struct super_block *sb, u32 secno, struct buffer_head **bh, u32 num_secs, s32 read)
+{
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	long flags = sbi->debug_flags;
+
+	if (flags & EXFAT_DEBUGFLAGS_ERROR_RW)
+		return FFS_MEDIAERR;
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+
+	if (!p_bd->opened)
+		return FFS_MEDIAERR;
+
+	if (*bh)
+		__brelse(*bh);
+
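+	/* 'read' selects a populated read (__bread) vs. a bare buffer (__getblk) */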
+	if (read)
+		*bh = __bread(sb->s_bdev, secno, num_secs << p_bd->sector_size_bits);
+	else
+		*bh = __getblk(sb->s_bdev, secno, num_secs << p_bd->sector_size_bits);
+
+	if (*bh)
+		return FFS_SUCCESS;
+
+	WARN(!p_fs->dev_ejected,
+		"[EXFAT] No bh, device seems wrong or to be ejected.\n");
+
+	return FFS_MEDIAERR;
+}
+
+s32 bdev_write(struct super_block *sb, u32 secno, struct buffer_head *bh, u32 num_secs, s32 sync)
+{
+	s32 count;
+	struct buffer_head *bh2;
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	long flags = sbi->debug_flags;
+
+	if (flags & EXFAT_DEBUGFLAGS_ERROR_RW)
+		return FFS_MEDIAERR;
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+
+	if (!p_bd->opened)
+		return FFS_MEDIAERR;
+
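+	/*
+	 * If the caller's bh already maps the target sector, dirty it in
+	 * place; otherwise copy its data into a buffer_head for that sector.
+	 */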
+	if (secno == bh->b_blocknr) {
+		lock_buffer(bh);
+		set_buffer_uptodate(bh);
+		mark_buffer_dirty(bh);
+		unlock_buffer(bh);
+		if (sync && (sync_dirty_buffer(bh) != 0))
+			return FFS_MEDIAERR;
+	} else {
+		count = num_secs << p_bd->sector_size_bits;
+
+		bh2 = __getblk(sb->s_bdev, secno, count);
+
+		if (bh2 == NULL)
+			goto no_bh;
+
+		lock_buffer(bh2);
+		memcpy(bh2->b_data, bh->b_data, count);
+		set_buffer_uptodate(bh2);
+		mark_buffer_dirty(bh2);
+		unlock_buffer(bh2);
+		if (sync && (sync_dirty_buffer(bh2) != 0)) {
+			__brelse(bh2);
+			goto no_bh;
+		}
+		__brelse(bh2);
+	}
+
+	return FFS_SUCCESS;
+
+no_bh:
+	WARN(!p_fs->dev_ejected,
+		"[EXFAT] No bh, device seems wrong or to be ejected.\n");
+
+	return FFS_MEDIAERR;
+}
+
+s32 bdev_sync(struct super_block *sb)
+{
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	long flags = sbi->debug_flags;
+
+	if (flags & EXFAT_DEBUGFLAGS_ERROR_RW)
+		return FFS_MEDIAERR;
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+
+	if (!p_bd->opened)
+		return FFS_MEDIAERR;
+
+	return sync_blockdev(sb->s_bdev);
+}
diff --git b/fs/exfat/exfat_blkdev.h b/fs/exfat/exfat_blkdev.h
new file mode 100644
index 0000000..169d25d
--- /dev/null
+++ b/fs/exfat/exfat_blkdev.h
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_blkdev.h                                            */
+/*  PURPOSE : Header File for exFAT Block Device Driver Glue Layer      */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_BLKDEV_H
+#define _EXFAT_BLKDEV_H
+
+#include <linux/fs.h>
+#include "exfat_config.h"
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions (Non-Configurable)                     */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Type Definitions                                                    */
+/*----------------------------------------------------------------------*/
+
+typedef struct __BD_INFO_T {
+	s32 sector_size;      /* in bytes */
+	s32 sector_size_bits;
+	s32 sector_size_mask;
+	s32 num_sectors;      /* total number of sectors in this block device */
+	bool  opened;           /* opened or not */
+} BD_INFO_T;
+
+/*----------------------------------------------------------------------*/
+/*  External Variable Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+s32 bdev_init(void);
+s32 bdev_shutdown(void);
+s32 bdev_open(struct super_block *sb);
+s32 bdev_close(struct super_block *sb);
+s32 bdev_read(struct super_block *sb, u32 secno, struct buffer_head **bh, u32 num_secs, s32 read);
+s32 bdev_write(struct super_block *sb, u32 secno, struct buffer_head *bh, u32 num_secs, s32 sync);
+s32 bdev_sync(struct super_block *sb);
+
+#endif /* _EXFAT_BLKDEV_H */
diff --git b/fs/exfat/exfat_cache.c b/fs/exfat/exfat_cache.c
new file mode 100644
index 0000000..e6ca88b
--- /dev/null
+++ b/fs/exfat/exfat_cache.c
@@ -0,0 +1,780 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_cache.c                                             */
+/*  PURPOSE : exFAT Cache Manager                                       */
+/*            (FAT Cache & Buffer Cache)                                */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Sung-Kwan Kim] : first writing                        */
+/*                                                                      */
+/************************************************************************/
+
+#include "exfat_config.h"
+#include "exfat_data.h"
+
+#include "exfat_cache.h"
+#include "exfat_super.h"
+#include "exfat_core.h"
+
+/*----------------------------------------------------------------------*/
+/*  Global Variable Definitions                                         */
+/*----------------------------------------------------------------------*/
+
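+/*
+ * sm_P()/sm_V() compile to nothing here, so the f_sem/b_sem arguments
+ * used below do not actually serialize these caches in this build;
+ * callers presumably rely on higher-level (per-volume) locking instead.
+ */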
+#define sm_P(s)
+#define sm_V(s)
+
+static s32 __FAT_read(struct super_block *sb, u32 loc, u32 *content);
+static s32 __FAT_write(struct super_block *sb, u32 loc, u32 content);
+
+static BUF_CACHE_T *FAT_cache_find(struct super_block *sb, u32 sec);
+static BUF_CACHE_T *FAT_cache_get(struct super_block *sb, u32 sec);
+static void FAT_cache_insert_hash(struct super_block *sb, BUF_CACHE_T *bp);
+static void FAT_cache_remove_hash(BUF_CACHE_T *bp);
+
+static u8 *__buf_getblk(struct super_block *sb, u32 sec);
+
+static BUF_CACHE_T *buf_cache_find(struct super_block *sb, u32 sec);
+static BUF_CACHE_T *buf_cache_get(struct super_block *sb, u32 sec);
+static void buf_cache_insert_hash(struct super_block *sb, BUF_CACHE_T *bp);
+static void buf_cache_remove_hash(BUF_CACHE_T *bp);
+
+static void push_to_mru(BUF_CACHE_T *bp, BUF_CACHE_T *list);
+static void push_to_lru(BUF_CACHE_T *bp, BUF_CACHE_T *list);
+static void move_to_mru(BUF_CACHE_T *bp, BUF_CACHE_T *list);
+static void move_to_lru(BUF_CACHE_T *bp, BUF_CACHE_T *list);
+
+/*======================================================================*/
+/*  Cache Initialization Functions                                      */
+/*======================================================================*/
+
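+/*
+ * Both caches are fixed-size arrays of BUF_CACHE_T entries.  Every entry
+ * is threaded onto an LRU list (for victim selection) and onto one of the
+ * hash chains (for lookup by sector number); buf_init() below simply
+ * resets each entry and links it onto both structures.
+ */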
+s32 buf_init(struct super_block *sb)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	int i;
+
+	/* LRU list */
+	p_fs->FAT_cache_lru_list.next = p_fs->FAT_cache_lru_list.prev = &p_fs->FAT_cache_lru_list;
+
+	for (i = 0; i < FAT_CACHE_SIZE; i++) {
+		p_fs->FAT_cache_array[i].drv = -1;
+		p_fs->FAT_cache_array[i].sec = ~0;
+		p_fs->FAT_cache_array[i].flag = 0;
+		p_fs->FAT_cache_array[i].buf_bh = NULL;
+		p_fs->FAT_cache_array[i].prev = p_fs->FAT_cache_array[i].next = NULL;
+		push_to_mru(&(p_fs->FAT_cache_array[i]), &p_fs->FAT_cache_lru_list);
+	}
+
+	p_fs->buf_cache_lru_list.next = p_fs->buf_cache_lru_list.prev = &p_fs->buf_cache_lru_list;
+
+	for (i = 0; i < BUF_CACHE_SIZE; i++) {
+		p_fs->buf_cache_array[i].drv = -1;
+		p_fs->buf_cache_array[i].sec = ~0;
+		p_fs->buf_cache_array[i].flag = 0;
+		p_fs->buf_cache_array[i].buf_bh = NULL;
+		p_fs->buf_cache_array[i].prev = p_fs->buf_cache_array[i].next = NULL;
+		push_to_mru(&(p_fs->buf_cache_array[i]), &p_fs->buf_cache_lru_list);
+	}
+
+	/* HASH list */
+	for (i = 0; i < FAT_CACHE_HASH_SIZE; i++) {
+		p_fs->FAT_cache_hash_list[i].drv = -1;
+		p_fs->FAT_cache_hash_list[i].sec = ~0;
+		p_fs->FAT_cache_hash_list[i].hash_next = p_fs->FAT_cache_hash_list[i].hash_prev = &(p_fs->FAT_cache_hash_list[i]);
+	}
+
+	for (i = 0; i < FAT_CACHE_SIZE; i++)
+		FAT_cache_insert_hash(sb, &(p_fs->FAT_cache_array[i]));
+
+	for (i = 0; i < BUF_CACHE_HASH_SIZE; i++) {
+		p_fs->buf_cache_hash_list[i].drv = -1;
+		p_fs->buf_cache_hash_list[i].sec = ~0;
+		p_fs->buf_cache_hash_list[i].hash_next = p_fs->buf_cache_hash_list[i].hash_prev = &(p_fs->buf_cache_hash_list[i]);
+	}
+
+	for (i = 0; i < BUF_CACHE_SIZE; i++)
+		buf_cache_insert_hash(sb, &(p_fs->buf_cache_array[i]));
+
+	return FFS_SUCCESS;
+} /* end of buf_init */
+
+s32 buf_shutdown(struct super_block *sb)
+{
+	return FFS_SUCCESS;
+} /* end of buf_shutdown */
+
+/*======================================================================*/
+/*  FAT Read/Write Functions                                            */
+/*======================================================================*/
+
+/* in : sb, loc
+ * out: content
+ * returns 0 on success,
+ *         -1 on error
+ */
+s32 FAT_read(struct super_block *sb, u32 loc, u32 *content)
+{
+	s32 ret;
+
+	sm_P(&f_sem);
+
+	ret = __FAT_read(sb, loc, content);
+
+	sm_V(&f_sem);
+
+	return ret;
+} /* end of FAT_read */
+
+s32 FAT_write(struct super_block *sb, u32 loc, u32 content)
+{
+	s32 ret;
+
+	sm_P(&f_sem);
+
+	ret = __FAT_write(sb, loc, content);
+
+	sm_V(&f_sem);
+
+	return ret;
+} /* end of FAT_write */
+
+static s32 __FAT_read(struct super_block *sb, u32 loc, u32 *content)
+{
+	s32 off;
+	u32 sec, _content;
+	u8 *fat_sector, *fat_entry;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
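+	/*
+	 * A FAT12 entry is 12 bits (1.5 bytes), so entry 'loc' starts at byte
+	 * offset loc + loc/2; e.g. loc = 5 starts at byte 7 and, being odd,
+	 * occupies the high nibble of that byte plus the following byte.  An
+	 * entry may therefore straddle a sector boundary, which is the
+	 * off == sector_size - 1 case below.  FAT16 entries are 2 bytes and
+	 * FAT32/exFAT entries 4 bytes, hence the size_bits-1 / size_bits-2
+	 * shifts in the later branches.
+	 */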
+	if (p_fs->vol_type == FAT12) {
+		sec = p_fs->FAT1_start_sector + ((loc + (loc >> 1)) >> p_bd->sector_size_bits);
+		off = (loc + (loc >> 1)) & p_bd->sector_size_mask;
+
+		if (off == (p_bd->sector_size-1)) {
+			fat_sector = FAT_getblk(sb, sec);
+			if (!fat_sector)
+				return -1;
+
+			_content  = (u32) fat_sector[off];
+
+			fat_sector = FAT_getblk(sb, ++sec);
+			if (!fat_sector)
+				return -1;
+
+			_content |= (u32) fat_sector[0] << 8;
+		} else {
+			fat_sector = FAT_getblk(sb, sec);
+			if (!fat_sector)
+				return -1;
+
+			fat_entry = &(fat_sector[off]);
+			_content = GET16(fat_entry);
+		}
+
+		if (loc & 1)
+			_content >>= 4;
+
+		_content &= 0x00000FFF;
+
+		if (_content >= CLUSTER_16(0x0FF8)) {
+			*content = CLUSTER_32(~0);
+			return 0;
+		} else {
+			*content = CLUSTER_32(_content);
+			return 0;
+		}
+	} else if (p_fs->vol_type == FAT16) {
+		sec = p_fs->FAT1_start_sector + (loc >> (p_bd->sector_size_bits-1));
+		off = (loc << 1) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		fat_entry = &(fat_sector[off]);
+
+		_content = GET16_A(fat_entry);
+
+		_content &= 0x0000FFFF;
+
+		if (_content >= CLUSTER_16(0xFFF8)) {
+			*content = CLUSTER_32(~0);
+			return 0;
+		} else {
+			*content = CLUSTER_32(_content);
+			return 0;
+		}
+	} else if (p_fs->vol_type == FAT32) {
+		sec = p_fs->FAT1_start_sector + (loc >> (p_bd->sector_size_bits-2));
+		off = (loc << 2) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		fat_entry = &(fat_sector[off]);
+
+		_content = GET32_A(fat_entry);
+
+		_content &= 0x0FFFFFFF;
+
+		if (_content >= CLUSTER_32(0x0FFFFFF8)) {
+			*content = CLUSTER_32(~0);
+			return 0;
+		} else {
+			*content = CLUSTER_32(_content);
+			return 0;
+		}
+	} else {
+		sec = p_fs->FAT1_start_sector + (loc >> (p_bd->sector_size_bits-2));
+		off = (loc << 2) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		fat_entry = &(fat_sector[off]);
+		_content = GET32_A(fat_entry);
+
+		if (_content >= CLUSTER_32(0xFFFFFFF8)) {
+			*content = CLUSTER_32(~0);
+			return 0;
+		} else {
+			*content = CLUSTER_32(_content);
+			return 0;
+		}
+	}
+
+	*content = CLUSTER_32(~0);
+	return 0;
+} /* end of __FAT_read */
+
+static s32 __FAT_write(struct super_block *sb, u32 loc, u32 content)
+{
+	s32 off;
+	u32 sec;
+	u8 *fat_sector, *fat_entry;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
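+	/*
+	 * Mirrors the layout handled in __FAT_read(): for FAT12 the 12-bit
+	 * entry shares a byte with its neighbour, so only one nibble of the
+	 * shared byte may be overwritten, and an entry that straddles a
+	 * sector boundary has to be written back in two parts, with a
+	 * FAT_modify() for each dirtied sector.
+	 */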
+	if (p_fs->vol_type == FAT12) {
+
+		content &= 0x00000FFF;
+
+		sec = p_fs->FAT1_start_sector + ((loc + (loc >> 1)) >> p_bd->sector_size_bits);
+		off = (loc + (loc >> 1)) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		if (loc & 1) { /* odd */
+
+			content <<= 4;
+
+			if (off == (p_bd->sector_size-1)) {
+				fat_sector[off] = (u8)(content | (fat_sector[off] & 0x0F));
+				FAT_modify(sb, sec);
+
+				fat_sector = FAT_getblk(sb, ++sec);
+				if (!fat_sector)
+					return -1;
+
+				fat_sector[0] = (u8)(content >> 8);
+			} else {
+				fat_entry = &(fat_sector[off]);
+				content |= GET16(fat_entry) & 0x000F;
+
+				SET16(fat_entry, content);
+			}
+		} else { /* even */
+			fat_sector[off] = (u8)(content);
+
+			if (off == (p_bd->sector_size-1)) {
+				FAT_modify(sb, sec);
+
+				fat_sector = FAT_getblk(sb, ++sec);
+				if (!fat_sector)
+					return -1;
+
+				fat_sector[0] = (u8)((fat_sector[0] & 0xF0) | (content >> 8));
+			} else {
+				fat_entry = &(fat_sector[off]);
+				content |= GET16(fat_entry) & 0xF000;
+
+				SET16(fat_entry, content);
+			}
+		}
+	}
+
+	else if (p_fs->vol_type == FAT16) {
+
+		content &= 0x0000FFFF;
+
+		sec = p_fs->FAT1_start_sector + (loc >> (p_bd->sector_size_bits-1));
+		off = (loc << 1) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		fat_entry = &(fat_sector[off]);
+
+		SET16_A(fat_entry, content);
+	}
+
+	else if (p_fs->vol_type == FAT32) {
+
+		content &= 0x0FFFFFFF;
+
+		sec = p_fs->FAT1_start_sector + (loc >> (p_bd->sector_size_bits-2));
+		off = (loc << 2) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		fat_entry = &(fat_sector[off]);
+
+		content |= GET32_A(fat_entry) & 0xF0000000;
+
+		SET32_A(fat_entry, content);
+	}
+
+	else { /* p_fs->vol_type == EXFAT */
+
+		sec = p_fs->FAT1_start_sector + (loc >> (p_bd->sector_size_bits-2));
+		off = (loc << 2) & p_bd->sector_size_mask;
+
+		fat_sector = FAT_getblk(sb, sec);
+		if (!fat_sector)
+			return -1;
+
+		fat_entry = &(fat_sector[off]);
+
+		SET32_A(fat_entry, content);
+	}
+
+	FAT_modify(sb, sec);
+	return 0;
+} /* end of __FAT_write */
+
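+/*
+ * FAT_getblk() returns a pointer to the cached copy of FAT sector 'sec'.
+ * On a hit the entry is just moved to the MRU end of the LRU list; on a
+ * miss the LRU victim is re-keyed to the new sector, re-hashed and filled
+ * from disk.  A failed read puts the entry back on the LRU end and
+ * returns NULL, so every caller must check the result.
+ */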
+u8 *FAT_getblk(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	bp = FAT_cache_find(sb, sec);
+	if (bp != NULL) {
+		move_to_mru(bp, &p_fs->FAT_cache_lru_list);
+		return bp->buf_bh->b_data;
+	}
+
+	bp = FAT_cache_get(sb, sec);
+
+	FAT_cache_remove_hash(bp);
+
+	bp->drv = p_fs->drv;
+	bp->sec = sec;
+	bp->flag = 0;
+
+	FAT_cache_insert_hash(sb, bp);
+
+	if (sector_read(sb, sec, &(bp->buf_bh), 1) != FFS_SUCCESS) {
+		FAT_cache_remove_hash(bp);
+		bp->drv = -1;
+		bp->sec = ~0;
+		bp->flag = 0;
+		bp->buf_bh = NULL;
+
+		move_to_lru(bp, &p_fs->FAT_cache_lru_list);
+		return NULL;
+	}
+
+	return bp->buf_bh->b_data;
+} /* end of FAT_getblk */
+
+void FAT_modify(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+
+	bp = FAT_cache_find(sb, sec);
+	if (bp != NULL)
+		sector_write(sb, sec, bp->buf_bh, 0);
+} /* end of FAT_modify */
+
+void FAT_release_all(struct super_block *sb)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	sm_P(&f_sem);
+
+	bp = p_fs->FAT_cache_lru_list.next;
+	while (bp != &p_fs->FAT_cache_lru_list) {
+		if (bp->drv == p_fs->drv) {
+			bp->drv = -1;
+			bp->sec = ~0;
+			bp->flag = 0;
+
+			if (bp->buf_bh) {
+				__brelse(bp->buf_bh);
+				bp->buf_bh = NULL;
+			}
+		}
+		bp = bp->next;
+	}
+
+	sm_V(&f_sem);
+} /* end of FAT_release_all */
+
+void FAT_sync(struct super_block *sb)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	sm_P(&f_sem);
+
+	bp = p_fs->FAT_cache_lru_list.next;
+	while (bp != &p_fs->FAT_cache_lru_list) {
+		if ((bp->drv == p_fs->drv) && (bp->flag & DIRTYBIT)) {
+			sync_dirty_buffer(bp->buf_bh);
+			bp->flag &= ~(DIRTYBIT);
+		}
+		bp = bp->next;
+	}
+
+	sm_V(&f_sem);
+} /* end of FAT_sync */
+
+static BUF_CACHE_T *FAT_cache_find(struct super_block *sb, u32 sec)
+{
+	s32 off;
+	BUF_CACHE_T *bp, *hp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
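+	/*
+	 * The bucket index mixes the sector number with its cluster-granular
+	 * index and masks it down to the power-of-two table size.
+	 */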
+	off = (sec + (sec >> p_fs->sectors_per_clu_bits)) & (FAT_CACHE_HASH_SIZE - 1);
+
+	hp = &(p_fs->FAT_cache_hash_list[off]);
+	for (bp = hp->hash_next; bp != hp; bp = bp->hash_next) {
+		if ((bp->drv == p_fs->drv) && (bp->sec == sec)) {
+
+			WARN(!bp->buf_bh, "[EXFAT] FAT_cache has no bh. "
+					  "It will make system panic.\n");
+
+			touch_buffer(bp->buf_bh);
+			return bp;
+		}
+	}
+	return NULL;
+} /* end of FAT_cache_find */
+
+static BUF_CACHE_T *FAT_cache_get(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
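+	/*
+	 * Take the entry at the LRU end as the victim.  Unlike
+	 * buf_cache_get() below, no LOCKBIT check is needed here: FAT cache
+	 * entries are never locked in this file.
+	 */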
+	bp = p_fs->FAT_cache_lru_list.prev;
+
+	move_to_mru(bp, &p_fs->FAT_cache_lru_list);
+	return bp;
+} /* end of FAT_cache_get */
+
+static void FAT_cache_insert_hash(struct super_block *sb, BUF_CACHE_T *bp)
+{
+	s32 off;
+	BUF_CACHE_T *hp;
+	FS_INFO_T *p_fs;
+
+	p_fs = &(EXFAT_SB(sb)->fs_info);
+	off = (bp->sec + (bp->sec >> p_fs->sectors_per_clu_bits)) & (FAT_CACHE_HASH_SIZE-1);
+
+	hp = &(p_fs->FAT_cache_hash_list[off]);
+	bp->hash_next = hp->hash_next;
+	bp->hash_prev = hp;
+	hp->hash_next->hash_prev = bp;
+	hp->hash_next = bp;
+} /* end of FAT_cache_insert_hash */
+
+static void FAT_cache_remove_hash(BUF_CACHE_T *bp)
+{
+	(bp->hash_prev)->hash_next = bp->hash_next;
+	(bp->hash_next)->hash_prev = bp->hash_prev;
+} /* end of FAT_cache_remove_hash */
+
+/*======================================================================*/
+/*  Buffer Read/Write Functions                                         */
+/*======================================================================*/
+
+u8 *buf_getblk(struct super_block *sb, u32 sec)
+{
+	u8 *buf;
+
+	sm_P(&b_sem);
+
+	buf = __buf_getblk(sb, sec);
+
+	sm_V(&b_sem);
+
+	return buf;
+} /* end of buf_getblk */
+
+static u8 *__buf_getblk(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	bp = buf_cache_find(sb, sec);
+	if (bp != NULL) {
+		move_to_mru(bp, &p_fs->buf_cache_lru_list);
+		return bp->buf_bh->b_data;
+	}
+
+	bp = buf_cache_get(sb, sec);
+
+	buf_cache_remove_hash(bp);
+
+	bp->drv = p_fs->drv;
+	bp->sec = sec;
+	bp->flag = 0;
+
+	buf_cache_insert_hash(sb, bp);
+
+	if (sector_read(sb, sec, &(bp->buf_bh), 1) != FFS_SUCCESS) {
+		buf_cache_remove_hash(bp);
+		bp->drv = -1;
+		bp->sec = ~0;
+		bp->flag = 0;
+		bp->buf_bh = NULL;
+
+		move_to_lru(bp, &p_fs->buf_cache_lru_list);
+		return NULL;
+	}
+
+	return bp->buf_bh->b_data;
+} /* end of __buf_getblk */
+
+void buf_modify(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+
+	sm_P(&b_sem);
+
+	bp = buf_cache_find(sb, sec);
+	if (likely(bp != NULL))
+		sector_write(sb, sec, bp->buf_bh, 0);
+
+	WARN(!bp, "[EXFAT] failed to find buffer_cache(sector:%u).\n", sec);
+
+	sm_V(&b_sem);
+} /* end of buf_modify */
+
+void buf_lock(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+
+	sm_P(&b_sem);
+
+	bp = buf_cache_find(sb, sec);
+	if (likely(bp != NULL))
+		bp->flag |= LOCKBIT;
+
+	WARN(!bp, "[EXFAT] failed to find buffer_cache(sector:%u).\n", sec);
+
+	sm_V(&b_sem);
+} /* end of buf_lock */
+
+void buf_unlock(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+
+	sm_P(&b_sem);
+
+	bp = buf_cache_find(sb, sec);
+	if (likely(bp != NULL))
+		bp->flag &= ~(LOCKBIT);
+
+	WARN(!bp, "[EXFAT] failed to find buffer_cache(sector:%u).\n", sec);
+
+	sm_V(&b_sem);
+} /* end of buf_unlock */
+
+void buf_release(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	sm_P(&b_sem);
+
+	bp = buf_cache_find(sb, sec);
+	if (likely(bp != NULL)) {
+		bp->drv = -1;
+		bp->sec = ~0;
+		bp->flag = 0;
+
+		if (bp->buf_bh) {
+			__brelse(bp->buf_bh);
+			bp->buf_bh = NULL;
+		}
+
+		move_to_lru(bp, &p_fs->buf_cache_lru_list);
+	}
+
+	sm_V(&b_sem);
+} /* end of buf_release */
+
+void buf_release_all(struct super_block *sb)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	sm_P(&b_sem);
+
+	bp = p_fs->buf_cache_lru_list.next;
+	while (bp != &p_fs->buf_cache_lru_list) {
+		if (bp->drv == p_fs->drv) {
+			bp->drv = -1;
+			bp->sec = ~0;
+			bp->flag = 0;
+
+			if (bp->buf_bh) {
+				__brelse(bp->buf_bh);
+				bp->buf_bh = NULL;
+			}
+		}
+		bp = bp->next;
+	}
+
+	sm_V(&b_sem);
+} /* end of buf_release_all */
+
+void buf_sync(struct super_block *sb)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	sm_P(&b_sem);
+
+	bp = p_fs->buf_cache_lru_list.next;
+	while (bp != &p_fs->buf_cache_lru_list) {
+		if ((bp->drv == p_fs->drv) && (bp->flag & DIRTYBIT)) {
+			sync_dirty_buffer(bp->buf_bh);
+			bp->flag &= ~(DIRTYBIT);
+		}
+		bp = bp->next;
+	}
+
+	sm_V(&b_sem);
+} /* end of buf_sync */
+
+static BUF_CACHE_T *buf_cache_find(struct super_block *sb, u32 sec)
+{
+	s32 off;
+	BUF_CACHE_T *bp, *hp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	off = (sec + (sec >> p_fs->sectors_per_clu_bits)) & (BUF_CACHE_HASH_SIZE - 1);
+
+	hp = &(p_fs->buf_cache_hash_list[off]);
+	for (bp = hp->hash_next; bp != hp; bp = bp->hash_next) {
+		if ((bp->drv == p_fs->drv) && (bp->sec == sec)) {
+			touch_buffer(bp->buf_bh);
+			return bp;
+		}
+	}
+	return NULL;
+} /* end of buf_cache_find */
+
+static BUF_CACHE_T *buf_cache_get(struct super_block *sb, u32 sec)
+{
+	BUF_CACHE_T *bp;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	bp = p_fs->buf_cache_lru_list.prev;
+	while (bp->flag & LOCKBIT)
+		bp = bp->prev;
+
+	move_to_mru(bp, &p_fs->buf_cache_lru_list);
+	return bp;
+} /* end of buf_cache_get */
+
+static void buf_cache_insert_hash(struct super_block *sb, BUF_CACHE_T *bp)
+{
+	s32 off;
+	BUF_CACHE_T *hp;
+	FS_INFO_T *p_fs;
+
+	p_fs = &(EXFAT_SB(sb)->fs_info);
+	off = (bp->sec + (bp->sec >> p_fs->sectors_per_clu_bits)) & (BUF_CACHE_HASH_SIZE-1);
+
+	hp = &(p_fs->buf_cache_hash_list[off]);
+	bp->hash_next = hp->hash_next;
+	bp->hash_prev = hp;
+	hp->hash_next->hash_prev = bp;
+	hp->hash_next = bp;
+} /* end of buf_cache_insert_hash */
+
+static void buf_cache_remove_hash(BUF_CACHE_T *bp)
+{
+	(bp->hash_prev)->hash_next = bp->hash_next;
+	(bp->hash_next)->hash_prev = bp->hash_prev;
+} /* end of buf_cache_remove_hash */
+
+/*======================================================================*/
+/*  Local Function Definitions                                          */
+/*======================================================================*/
+
+static void push_to_mru(BUF_CACHE_T *bp, BUF_CACHE_T *list)
+{
+	bp->next = list->next;
+	bp->prev = list;
+	list->next->prev = bp;
+	list->next = bp;
+} /* end of push_to_mru */
+
+static void push_to_lru(BUF_CACHE_T *bp, BUF_CACHE_T *list)
+{
+	bp->prev = list->prev;
+	bp->next = list;
+	list->prev->next = bp;
+	list->prev = bp;
+} /* end of push_to_lru */
+
+static void move_to_mru(BUF_CACHE_T *bp, BUF_CACHE_T *list)
+{
+	bp->prev->next = bp->next;
+	bp->next->prev = bp->prev;
+	push_to_mru(bp, list);
+} /* end of move_to_mru */
+
+static void move_to_lru(BUF_CACHE_T *bp, BUF_CACHE_T *list)
+{
+	bp->prev->next = bp->next;
+	bp->next->prev = bp->prev;
+	push_to_lru(bp, list);
+} /* end of move_to_lru */
diff --git b/fs/exfat/exfat_cache.h b/fs/exfat/exfat_cache.h
new file mode 100644
index 0000000..d82aad5
--- /dev/null
+++ b/fs/exfat/exfat_cache.h
@@ -0,0 +1,85 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_cache.h                                             */
+/*  PURPOSE : Header File for exFAT Cache Manager                       */
+/*            (FAT Cache & Buffer Cache)                                */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Sung-Kwan Kim] : first writing                        */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_CACHE_H
+#define _EXFAT_CACHE_H
+
+#include <linux/fs.h>
+#include <linux/types.h>
+#include "exfat_config.h"
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions                                        */
+/*----------------------------------------------------------------------*/
+
+#define LOCKBIT                 0x01
+#define DIRTYBIT                0x02
+
+/*----------------------------------------------------------------------*/
+/*  Type Definitions                                                    */
+/*----------------------------------------------------------------------*/
+
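+/*
+ * One cache entry: 'sec' is the cached sector of drive 'drv', backed by
+ * the buffer_head in 'buf_bh'.  Each entry sits on an LRU list via
+ * next/prev and on a hash chain via hash_next/hash_prev; 'flag' holds
+ * LOCKBIT/DIRTYBIT.
+ */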
+typedef struct __BUF_CACHE_T {
+	struct __BUF_CACHE_T *next;
+	struct __BUF_CACHE_T *prev;
+	struct __BUF_CACHE_T *hash_next;
+	struct __BUF_CACHE_T *hash_prev;
+	s32                drv;
+	u32               sec;
+	u32               flag;
+	struct buffer_head   *buf_bh;
+} BUF_CACHE_T;
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+s32  buf_init(struct super_block *sb);
+s32  buf_shutdown(struct super_block *sb);
+s32  FAT_read(struct super_block *sb, u32 loc, u32 *content);
+s32  FAT_write(struct super_block *sb, u32 loc, u32 content);
+u8 *FAT_getblk(struct super_block *sb, u32 sec);
+void   FAT_modify(struct super_block *sb, u32 sec);
+void   FAT_release_all(struct super_block *sb);
+void   FAT_sync(struct super_block *sb);
+u8 *buf_getblk(struct super_block *sb, u32 sec);
+void   buf_modify(struct super_block *sb, u32 sec);
+void   buf_lock(struct super_block *sb, u32 sec);
+void   buf_unlock(struct super_block *sb, u32 sec);
+void   buf_release(struct super_block *sb, u32 sec);
+void   buf_release_all(struct super_block *sb);
+void   buf_sync(struct super_block *sb);
+
+#endif /* _EXFAT_CACHE_H */
diff --git b/fs/exfat/exfat_config.h b/fs/exfat/exfat_config.h
new file mode 100644
index 0000000..33c6525
--- /dev/null
+++ b/fs/exfat/exfat_config.h
@@ -0,0 +1,69 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_config.h                                            */
+/*  PURPOSE : Header File for exFAT Configurable Policies               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_CONFIG_H
+#define _EXFAT_CONFIG_H
+
+/*======================================================================*/
+/*                                                                      */
+/*                        FFS CONFIGURATIONS                            */
+/*                  (CHANGE THIS PART IF REQUIRED)                      */
+/*                                                                      */
+/*======================================================================*/
+
+/*----------------------------------------------------------------------*/
+/* Feature Config                                                       */
+/*----------------------------------------------------------------------*/
+#ifndef CONFIG_EXFAT_DISCARD
+#define CONFIG_EXFAT_DISCARD		1	/* mount option -o discard support */
+#endif
+
+#ifndef CONFIG_EXFAT_DELAYED_SYNC
+#define CONFIG_EXFAT_DELAYED_SYNC 0
+#endif
+
+#ifndef CONFIG_EXFAT_KERNEL_DEBUG
+#define CONFIG_EXFAT_KERNEL_DEBUG	1	/* kernel debug features via ioctl */
+#endif
+
+#ifndef CONFIG_EXFAT_DEBUG_MSG
+#define CONFIG_EXFAT_DEBUG_MSG		0	/* debugging message on/off */
+#endif
+
+#ifndef CONFIG_EXFAT_DEFAULT_CODEPAGE
+#define CONFIG_EXFAT_DEFAULT_CODEPAGE	437
+#define CONFIG_EXFAT_DEFAULT_IOCHARSET	"utf8"
+#endif
+
+#endif /* _EXFAT_CONFIG_H */
diff --git b/fs/exfat/exfat_core.c b/fs/exfat/exfat_core.c
new file mode 100644
index 0000000..73c089f
--- /dev/null
+++ b/fs/exfat/exfat_core.c
@@ -0,0 +1,5124 @@
+/* Some of the source code in this file came from "linux/fs/fat/misc.c".  */
+/*
+ *  linux/fs/fat/misc.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980
+ *         and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru)
+ */
+
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_core.c                                              */
+/*  PURPOSE : exFAT File Manager                                        */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include <linux/version.h>
+#include <linux/param.h>
+#include <linux/log2.h>
+
+#include "exfat_bitmap.h"
+#include "exfat_config.h"
+#include "exfat_data.h"
+#include "exfat_oal.h"
+#include "exfat_blkdev.h"
+#include "exfat_cache.h"
+#include "exfat_nls.h"
+#include "exfat_api.h"
+#include "exfat_super.h"
+#include "exfat_core.h"
+
+#include <linux/blkdev.h>
+
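+/*
+ * Older kernels expose a per-superblock dirty flag (sb->s_dirt) that the
+ * VFS uses to drive write_super(); on newer kernels that field is gone,
+ * so the equivalent flag is kept in the driver's own exfat_sb_info.
+ */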
+static void __set_sb_dirty(struct super_block *sb)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
+	sb->s_dirt = 1;
+#else
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	sbi->s_dirt = 1;
+#endif
+}
+
+/*----------------------------------------------------------------------*/
+/*  Global Variable Definitions                                         */
+/*----------------------------------------------------------------------*/
+
+extern u8 uni_upcase[];
+
+/*----------------------------------------------------------------------*/
+/*  Local Variable Definitions                                          */
+/*----------------------------------------------------------------------*/
+
+static u8 name_buf[MAX_PATH_LENGTH * MAX_CHARSET_SIZE];
+
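+/*
+ * Legacy DOS device names, padded to the 8-character short-name field;
+ * presumably checked so that they are refused as FAT file names.
+ */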
+static char *reserved_names[] = {
+	"AUX     ", "CON     ", "NUL     ", "PRN     ",
+	"COM1    ", "COM2    ", "COM3    ", "COM4    ",
+	"COM5    ", "COM6    ", "COM7    ", "COM8    ", "COM9    ",
+	"LPT1    ", "LPT2    ", "LPT3    ", "LPT4    ",
+	"LPT5    ", "LPT6    ", "LPT7    ", "LPT8    ", "LPT9    ",
+	NULL
+};
+
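+/*
+ * Per-byte lookup tables: free_bit[v] is the index of the lowest clear
+ * bit in v (v = 0xFF has none, hence only 255 entries), and used_bit[v]
+ * is the number of set bits in v.  They appear to back the
+ * allocation-bitmap scanning and used-cluster counting elsewhere in
+ * this file.
+ */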
+static u8 free_bit[] = {
+	0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, /*   0 ~  19 */
+	0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, /*  20 ~  39 */
+	0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, /*  40 ~  59 */
+	0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, /*  60 ~  79 */
+	0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, /*  80 ~  99 */
+	0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, /* 100 ~ 119 */
+	0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, /* 120 ~ 139 */
+	0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, /* 140 ~ 159 */
+	0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, /* 160 ~ 179 */
+	0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, /* 180 ~ 199 */
+	0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, /* 200 ~ 219 */
+	0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, /* 220 ~ 239 */
+	0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0                 /* 240 ~ 254 */
+};
+
+static u8 used_bit[] = {
+	0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, /*   0 ~  19 */
+	2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, /*  20 ~  39 */
+	2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, /*  40 ~  59 */
+	4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, /*  60 ~  79 */
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, /*  80 ~  99 */
+	3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, /* 100 ~ 119 */
+	4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, /* 120 ~ 139 */
+	3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, /* 140 ~ 159 */
+	2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, /* 160 ~ 179 */
+	4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, /* 180 ~ 199 */
+	3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, /* 200 ~ 219 */
+	5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, /* 220 ~ 239 */
+	4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8              /* 240 ~ 255 */
+};
+
+/*======================================================================*/
+/*  Global Function Definitions                                         */
+/*======================================================================*/
+
+/* ffsInit : reset the file system layer to its initial state */
+s32 ffsInit(void)
+{
+	s32 ret;
+
+	ret = bdev_init();
+	if (ret)
+		return ret;
+
+	ret = fs_init();
+	if (ret)
+		return ret;
+
+	return FFS_SUCCESS;
+} /* end of ffsInit */
+
+/* ffsShutdown : free all memory-allocated global buffers */
+s32 ffsShutdown(void)
+{
+	s32 ret;
+	ret = fs_shutdown();
+	if (ret)
+		return ret;
+
+	ret = bdev_shutdown();
+	if (ret)
+		return ret;
+
+	return FFS_SUCCESS;
+} /* end of ffsShutdown */
+
+/* ffsMountVol : mount the file system volume */
+s32 ffsMountVol(struct super_block *sb)
+{
+	int i, ret;
+	PBR_SECTOR_T *p_pbr;
+	struct buffer_head *tmp_bh = NULL;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	printk("[EXFAT] trying to mount...\n");
+
+	sm_init(&p_fs->v_sem);
+	p_fs->dev_ejected = FALSE;
+
+	/* open the block device */
+	if (bdev_open(sb))
+		return FFS_MEDIAERR;
+
+	if (p_bd->sector_size < sb->s_blocksize)
+		return FFS_MEDIAERR;
+	if (p_bd->sector_size > sb->s_blocksize)
+		sb_set_blocksize(sb, p_bd->sector_size);
+
+	/* read Sector 0 */
+	if (sector_read(sb, 0, &tmp_bh, 1) != FFS_SUCCESS)
+		return FFS_MEDIAERR;
+
+	p_fs->PBR_sector = 0;
+
+	p_pbr = (PBR_SECTOR_T *) tmp_bh->b_data;
+
+	/* check the validity of PBR */
+	if (GET16_A(p_pbr->signature) != PBR_SIGNATURE) {
+		brelse(tmp_bh);
+		bdev_close(sb);
+		return FFS_FORMATERR;
+	}
+
+	/* fill fs_struct */
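+	/*
+	 * The exFAT boot sector keeps the legacy BPB area zeroed, so any
+	 * non-zero byte in bpb[0..52] indicates a FAT12/16/32 volume; for
+	 * those, a non-zero 16-bit FAT-size field (num_fat_sectors)
+	 * distinguishes FAT12/16 from FAT32.
+	 */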
+	for (i = 0; i < 53; i++)
+		if (p_pbr->bpb[i])
+			break;
+
+	if (i < 53) {
+		if (GET16(p_pbr->bpb+11)) /* num_fat_sectors */
+			ret = fat16_mount(sb, p_pbr);
+		else
+			ret = fat32_mount(sb, p_pbr);
+	} else {
+		ret = exfat_mount(sb, p_pbr);
+	}
+
+	brelse(tmp_bh);
+
+	if (ret) {
+		bdev_close(sb);
+		return ret;
+	}
+
+	if (p_fs->vol_type == EXFAT) {
+		ret = load_alloc_bitmap(sb);
+		if (ret) {
+			bdev_close(sb);
+			return ret;
+		}
+		ret = load_upcase_table(sb);
+		if (ret) {
+			free_alloc_bitmap(sb);
+			bdev_close(sb);
+			return ret;
+		}
+	}
+
+	if (p_fs->dev_ejected) {
+		if (p_fs->vol_type == EXFAT) {
+			free_upcase_table(sb);
+			free_alloc_bitmap(sb);
+		}
+		bdev_close(sb);
+		return FFS_MEDIAERR;
+	}
+
+	printk("[EXFAT] mounted successfully\n");
+
+	return FFS_SUCCESS;
+} /* end of ffsMountVol */
+
+/* ffsUmountVol : umount the file system volume */
+s32 ffsUmountVol(struct super_block *sb)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	printk("[EXFAT] trying to unmount...\n");
+
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+
+	if (p_fs->vol_type == EXFAT) {
+		free_upcase_table(sb);
+		free_alloc_bitmap(sb);
+	}
+
+	FAT_release_all(sb);
+	buf_release_all(sb);
+
+	/* close the block device */
+	bdev_close(sb);
+
+	if (p_fs->dev_ejected) {
+		printk("[EXFAT] unmounted with media errors; "
+			"the device has already been ejected.\n");
+		return FFS_MEDIAERR;
+	}
+
+	printk("[EXFAT] unmounted successfully\n");
+
+	return FFS_SUCCESS;
+} /* end of ffsUmountVol */
+
+/* ffsGetVolInfo : get the information of a file system volume */
+s32 ffsGetVolInfo(struct super_block *sb, VOL_INFO_T *info)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_fs->used_clusters == (u32) ~0)
+		p_fs->used_clusters = p_fs->fs_func->count_used_clusters(sb);
+
+	info->FatType = p_fs->vol_type;
+	info->ClusterSize = p_fs->cluster_size;
+	info->NumClusters = p_fs->num_clusters - 2; /* clu 0 & 1 */
+	info->UsedClusters = p_fs->used_clusters;
+	info->FreeClusters = info->NumClusters - info->UsedClusters;
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsGetVolInfo */
+
+/* ffsSyncVol : synchronize all file system volumes */
+s32 ffsSyncVol(struct super_block *sb, s32 do_sync)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* synchronize the file system */
+	fs_sync(sb, do_sync);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsSyncVol */
+
+/*----------------------------------------------------------------------*/
+/*  File Operation Functions                                            */
+/*----------------------------------------------------------------------*/
+
+/* ffsLookupFile : lookup a file */
+s32 ffsLookupFile(struct inode *inode, char *path, FILE_ID_T *fid)
+{
+	s32 ret, dentry, num_entries;
+	CHAIN_T dir;
+	UNI_NAME_T uni_name;
+	DOS_NAME_T dos_name;
+	DENTRY_T *ep, *ep2;
+	ENTRY_SET_CACHE_T *es = NULL;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	DPRINTK("ffsLookupFile entered\n");
+
+	/* check the validity of directory name in the given pathname */
+	ret = resolve_path(inode, path, &dir, &uni_name);
+	if (ret)
+		return ret;
+
+	ret = get_num_entries_and_dos_name(sb, &dir, &uni_name, &num_entries, &dos_name);
+	if (ret)
+		return ret;
+
+	/* search the file name for directories */
+	dentry = p_fs->fs_func->find_dir_entry(sb, &dir, &uni_name, num_entries, &dos_name, TYPE_ALL);
+	if (dentry < -1)
+		return FFS_NOTFOUND;
+
+	fid->dir.dir = dir.dir;
+	fid->dir.size = dir.size;
+	fid->dir.flags = dir.flags;
+	fid->entry = dentry;
+
+	if (dentry == -1) {
+		fid->type = TYPE_DIR;
+		fid->rwoffset = 0;
+		fid->hint_last_off = -1;
+
+		fid->attr = ATTR_SUBDIR;
+		fid->flags = 0x01;
+		fid->size = 0;
+		fid->start_clu = p_fs->root_dir;
+	} else {
+		if (p_fs->vol_type == EXFAT) {
+			es = get_entry_set_in_dir(sb, &dir, dentry, ES_2_ENTRIES, &ep);
+			if (!es)
+				return FFS_MEDIAERR;
+			ep2 = ep+1;
+		} else {
+			ep = get_entry_in_dir(sb, &dir, dentry, NULL);
+			if (!ep)
+				return FFS_MEDIAERR;
+			ep2 = ep;
+		}
+
+		fid->type = p_fs->fs_func->get_entry_type(ep);
+		fid->rwoffset = 0;
+		fid->hint_last_off = -1;
+		fid->attr = p_fs->fs_func->get_entry_attr(ep);
+
+		fid->size = p_fs->fs_func->get_entry_size(ep2);
+		if ((fid->type == TYPE_FILE) && (fid->size == 0)) {
+			fid->flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+			fid->start_clu = CLUSTER_32(~0);
+		} else {
+			fid->flags = p_fs->fs_func->get_entry_flag(ep2);
+			fid->start_clu = p_fs->fs_func->get_entry_clu0(ep2);
+		}
+
+		if (p_fs->vol_type == EXFAT)
+			release_entry_set(es);
+	}
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	DPRINTK("ffsLookupFile exited successfully\n");
+
+	return FFS_SUCCESS;
+} /* end of ffsLookupFile */
+
+/* ffsCreateFile : create a file */
+s32 ffsCreateFile(struct inode *inode, char *path, u8 mode, FILE_ID_T *fid)
+{
+	s32 ret/*, dentry*/;
+	CHAIN_T dir;
+	UNI_NAME_T uni_name;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	/* check the validity of directory name in the given pathname */
+	ret = resolve_path(inode, path, &dir, &uni_name);
+	if (ret)
+		return ret;
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	/* create a new file */
+	ret = create_file(inode, &dir, &uni_name, mode, fid);
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return ret;
+} /* end of ffsCreateFile */
+
+/* ffsReadFile : read data from an opened file */
+s32 ffsReadFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *rcount)
+{
+	s32 offset, sec_offset, clu_offset;
+	u32 clu, LogSector;
+	u64 oneblkread, read_bytes;
+	struct buffer_head *tmp_bh = NULL;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	/* check if the given file ID is opened */
+	if (fid->type != TYPE_FILE)
+		return FFS_PERMISSIONERR;
+
+	if (fid->rwoffset > fid->size)
+		fid->rwoffset = fid->size;
+
+	if (count > (fid->size - fid->rwoffset))
+		count = fid->size - fid->rwoffset;
+
+	if (count == 0) {
+		if (rcount != NULL)
+			*rcount = 0;
+		return FFS_EOF;
+	}
+
+	read_bytes = 0;
+
+	while (count > 0) {
+		clu_offset = (s32)(fid->rwoffset >> p_fs->cluster_size_bits);
+		clu = fid->start_clu;
+
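+		/*
+		 * flags == 0x03 marks an exFAT "no FAT chain" (contiguous)
+		 * file, so the n-th cluster is simply start_clu + n; with
+		 * flags == 0x01 the chain has to be walked through the FAT.
+		 */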
+		if (fid->flags == 0x03) {
+			clu += clu_offset;
+		} else {
+			/* hint information */
+			if ((clu_offset > 0) && (fid->hint_last_off > 0) &&
+				(clu_offset >= fid->hint_last_off)) {
+				clu_offset -= fid->hint_last_off;
+				clu = fid->hint_last_clu;
+			}
+
+			while (clu_offset > 0) {
+				/* clu = FAT_read(sb, clu); */
+				if (FAT_read(sb, clu, &clu) == -1)
+					return FFS_MEDIAERR;
+
+				clu_offset--;
+			}
+		}
+
+		/* hint information */
+		fid->hint_last_off = (s32)(fid->rwoffset >> p_fs->cluster_size_bits);
+		fid->hint_last_clu = clu;
+
+		offset = (s32)(fid->rwoffset & (p_fs->cluster_size-1)); /* byte offset in cluster   */
+		sec_offset = offset >> p_bd->sector_size_bits;            /* sector offset in cluster */
+		offset &= p_bd->sector_size_mask;                         /* byte offset in sector    */
+
+		LogSector = START_SECTOR(clu) + sec_offset;
+
+		oneblkread = (u64)(p_bd->sector_size - offset);
+		if (oneblkread > count)
+			oneblkread = count;
+
+		if ((offset == 0) && (oneblkread == p_bd->sector_size)) {
+			if (sector_read(sb, LogSector, &tmp_bh, 1) != FFS_SUCCESS)
+				goto err_out;
+			memcpy(((char *) buffer)+read_bytes, ((char *) tmp_bh->b_data), (s32) oneblkread);
+		} else {
+			if (sector_read(sb, LogSector, &tmp_bh, 1) != FFS_SUCCESS)
+				goto err_out;
+			memcpy(((char *) buffer)+read_bytes, ((char *) tmp_bh->b_data)+offset, (s32) oneblkread);
+		}
+		count -= oneblkread;
+		read_bytes += oneblkread;
+		fid->rwoffset += oneblkread;
+	}
+	brelse(tmp_bh);
+
+err_out:
+	/* set the size of read bytes */
+	if (rcount != NULL)
+		*rcount = read_bytes;
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsReadFile */
+
+/* ffsWriteFile : write data into an opened file */
+s32 ffsWriteFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *wcount)
+{
+	s32 modified = FALSE, offset, sec_offset, clu_offset;
+	s32 num_clusters, num_alloc, num_alloced = (s32) ~0;
+	u32 clu, last_clu, LogSector, sector = 0;
+	u64 oneblkwrite, write_bytes;
+	CHAIN_T new_clu;
+	TIMESTAMP_T tm;
+	DENTRY_T *ep, *ep2;
+	ENTRY_SET_CACHE_T *es = NULL;
+	struct buffer_head *tmp_bh = NULL;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	/* check if the given file ID is opened */
+	if (fid->type != TYPE_FILE)
+		return FFS_PERMISSIONERR;
+
+	if (fid->rwoffset > fid->size)
+		fid->rwoffset = fid->size;
+
+	if (count == 0) {
+		if (wcount != NULL)
+			*wcount = 0;
+		return FFS_SUCCESS;
+	}
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	if (fid->size == 0)
+		num_clusters = 0;
+	else
+		num_clusters = (s32)((fid->size-1) >> p_fs->cluster_size_bits) + 1;
+
+	write_bytes = 0;
+
+	while (count > 0) {
+		clu_offset = (s32)(fid->rwoffset >> p_fs->cluster_size_bits);
+		clu = last_clu = fid->start_clu;
+
+		if (fid->flags == 0x03) {
+			if ((clu_offset > 0) && (clu != CLUSTER_32(~0))) {
+				last_clu += clu_offset - 1;
+
+				if (clu_offset == num_clusters)
+					clu = CLUSTER_32(~0);
+				else
+					clu += clu_offset;
+			}
+		} else {
+			/* hint information */
+			if ((clu_offset > 0) && (fid->hint_last_off > 0) &&
+				(clu_offset >= fid->hint_last_off)) {
+				clu_offset -= fid->hint_last_off;
+				clu = fid->hint_last_clu;
+			}
+
+			while ((clu_offset > 0) && (clu != CLUSTER_32(~0))) {
+				last_clu = clu;
+				/* clu = FAT_read(sb, clu); */
+				if (FAT_read(sb, clu, &clu) == -1)
+					return FFS_MEDIAERR;
+
+				clu_offset--;
+			}
+		}
+
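+		/*
+		 * Writing past the end of the current cluster chain: allocate
+		 * enough clusters for the rest of the request and hook them
+		 * onto the chain (or make them the first clusters of an empty
+		 * file), falling back from a contiguous exFAT file to a
+		 * FAT-chained one if the allocator could not keep it
+		 * contiguous.
+		 */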
+		if (clu == CLUSTER_32(~0)) {
+			num_alloc = (s32)((count-1) >> p_fs->cluster_size_bits) + 1;
+			new_clu.dir = (last_clu == CLUSTER_32(~0)) ? CLUSTER_32(~0) : last_clu+1;
+			new_clu.size = 0;
+			new_clu.flags = fid->flags;
+
+			/* (1) allocate a chain of clusters */
+			num_alloced = p_fs->fs_func->alloc_cluster(sb, num_alloc, &new_clu);
+			if (num_alloced == 0)
+				break;
+			else if (num_alloced < 0)
+				return FFS_MEDIAERR;
+
+			/* (2) append to the FAT chain */
+			if (last_clu == CLUSTER_32(~0)) {
+				if (new_clu.flags == 0x01)
+					fid->flags = 0x01;
+				fid->start_clu = new_clu.dir;
+				modified = TRUE;
+			} else {
+				if (new_clu.flags != fid->flags) {
+					exfat_chain_cont_cluster(sb, fid->start_clu, num_clusters);
+					fid->flags = 0x01;
+					modified = TRUE;
+				}
+				if (new_clu.flags == 0x01)
+					FAT_write(sb, last_clu, new_clu.dir);
+			}
+
+			num_clusters += num_alloced;
+			clu = new_clu.dir;
+		}
+
+		/* hint information */
+		fid->hint_last_off = (s32)(fid->rwoffset >> p_fs->cluster_size_bits);
+		fid->hint_last_clu = clu;
+
+		offset = (s32)(fid->rwoffset & (p_fs->cluster_size-1)); /* byte offset in cluster   */
+		sec_offset = offset >> p_bd->sector_size_bits;            /* sector offset in cluster */
+		offset &= p_bd->sector_size_mask;                         /* byte offset in sector    */
+
+		LogSector = START_SECTOR(clu) + sec_offset;
+
+		oneblkwrite = (u64)(p_bd->sector_size - offset);
+		if (oneblkwrite > count)
+			oneblkwrite = count;
+
+		if ((offset == 0) && (oneblkwrite == p_bd->sector_size)) {
+			if (sector_read(sb, LogSector, &tmp_bh, 0) != FFS_SUCCESS)
+				goto err_out;
+			memcpy(((char *) tmp_bh->b_data), ((char *) buffer)+write_bytes, (s32) oneblkwrite);
+			if (sector_write(sb, LogSector, tmp_bh, 0) != FFS_SUCCESS) {
+				brelse(tmp_bh);
+				goto err_out;
+			}
+		} else {
+			if ((offset > 0) || ((fid->rwoffset+oneblkwrite) < fid->size)) {
+				if (sector_read(sb, LogSector, &tmp_bh, 1) != FFS_SUCCESS)
+					goto err_out;
+			} else {
+				if (sector_read(sb, LogSector, &tmp_bh, 0) != FFS_SUCCESS)
+					goto err_out;
+			}
+
+			memcpy(((char *) tmp_bh->b_data)+offset, ((char *) buffer)+write_bytes, (s32) oneblkwrite);
+			if (sector_write(sb, LogSector, tmp_bh, 0) != FFS_SUCCESS) {
+				brelse(tmp_bh);
+				goto err_out;
+			}
+		}
+
+		count -= oneblkwrite;
+		write_bytes += oneblkwrite;
+		fid->rwoffset += oneblkwrite;
+
+		fid->attr |= ATTR_ARCHIVE;
+
+		if (fid->size < fid->rwoffset) {
+			fid->size = fid->rwoffset;
+			modified = TRUE;
+		}
+	}
+
+	brelse(tmp_bh);
+
+	/* (3) update the directory entry */
+	if (p_fs->vol_type == EXFAT) {
+		es = get_entry_set_in_dir(sb, &(fid->dir), fid->entry, ES_ALL_ENTRIES, &ep);
+		if (es == NULL)
+			goto err_out;
+		ep2 = ep+1;
+	} else {
+		ep = get_entry_in_dir(sb, &(fid->dir), fid->entry, &sector);
+		if (!ep)
+			goto err_out;
+		ep2 = ep;
+	}
+
+	p_fs->fs_func->set_entry_time(ep, tm_current(&tm), TM_MODIFY);
+	p_fs->fs_func->set_entry_attr(ep, fid->attr);
+
+	if (p_fs->vol_type != EXFAT)
+		buf_modify(sb, sector);
+
+	if (modified) {
+		if (p_fs->fs_func->get_entry_flag(ep2) != fid->flags)
+			p_fs->fs_func->set_entry_flag(ep2, fid->flags);
+
+		if (p_fs->fs_func->get_entry_size(ep2) != fid->size)
+			p_fs->fs_func->set_entry_size(ep2, fid->size);
+
+		if (p_fs->fs_func->get_entry_clu0(ep2) != fid->start_clu)
+			p_fs->fs_func->set_entry_clu0(ep2, fid->start_clu);
+
+		if (p_fs->vol_type != EXFAT)
+			buf_modify(sb, sector);
+	}
+
+	if (p_fs->vol_type == EXFAT) {
+		update_dir_checksum_with_entry_set(sb, es);
+		release_entry_set(es);
+	}
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+err_out:
+	/* set the size of written bytes */
+	if (wcount != NULL)
+		*wcount = write_bytes;
+
+	if (num_alloced == 0)
+		return FFS_FULL;
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsWriteFile */
+
+/* ffsTruncateFile : resize the file length */
+s32 ffsTruncateFile(struct inode *inode, u64 old_size, u64 new_size)
+{
+	s32 num_clusters;
+	u32 last_clu = CLUSTER_32(0), sector = 0;
+	CHAIN_T clu;
+	TIMESTAMP_T tm;
+	DENTRY_T *ep, *ep2;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+	ENTRY_SET_CACHE_T *es = NULL;
+
+	/* check if the given file ID is opened */
+	if (fid->type != TYPE_FILE)
+		return FFS_PERMISSIONERR;
+
+	if (fid->size != old_size) {
+		printk(KERN_ERR "[EXFAT] truncate: cannot skip it because of a "
+				"size mismatch (old:%llu -> fid:%llu).\n",
+				old_size, fid->size);
+	}
+
+	if (old_size <= new_size)
+		return FFS_SUCCESS;
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	clu.dir = fid->start_clu;
+	clu.size = (s32)((old_size-1) >> p_fs->cluster_size_bits) + 1;
+	clu.flags = fid->flags;
+
+	if (new_size > 0) {
+		num_clusters = (s32)((new_size-1) >> p_fs->cluster_size_bits) + 1;
+
+		if (clu.flags == 0x03) {
+			clu.dir += num_clusters;
+		} else {
+			while (num_clusters > 0) {
+				last_clu = clu.dir;
+				if (FAT_read(sb, clu.dir, &(clu.dir)) == -1)
+					return FFS_MEDIAERR;
+				num_clusters--;
+			}
+		}
+
+		clu.size -= num_clusters;
+	}
+
+	fid->size = new_size;
+	fid->attr |= ATTR_ARCHIVE;
+	if (new_size == 0) {
+		fid->flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+		fid->start_clu = CLUSTER_32(~0);
+	}
+
+	/* (1) update the directory entry */
+	if (p_fs->vol_type == EXFAT) {
+		es = get_entry_set_in_dir(sb, &(fid->dir), fid->entry, ES_ALL_ENTRIES, &ep);
+		if (es == NULL)
+			return FFS_MEDIAERR;
+		ep2 = ep+1;
+	} else {
+		ep = get_entry_in_dir(sb, &(fid->dir), fid->entry, &sector);
+		if (!ep)
+			return FFS_MEDIAERR;
+		ep2 = ep;
+	}
+
+	p_fs->fs_func->set_entry_time(ep, tm_current(&tm), TM_MODIFY);
+	p_fs->fs_func->set_entry_attr(ep, fid->attr);
+
+	p_fs->fs_func->set_entry_size(ep2, new_size);
+	if (new_size == 0) {
+		p_fs->fs_func->set_entry_flag(ep2, 0x01);
+		p_fs->fs_func->set_entry_clu0(ep2, CLUSTER_32(0));
+	}
+
+	if (p_fs->vol_type != EXFAT)
+		buf_modify(sb, sector);
+	else {
+		update_dir_checksum_with_entry_set(sb, es);
+		release_entry_set(es);
+	}
+
+	/* (2) cut off from the FAT chain */
+	if (last_clu != CLUSTER_32(0)) {
+		if (fid->flags == 0x01)
+			FAT_write(sb, last_clu, CLUSTER_32(~0));
+	}
+
+	/* (3) free the clusters */
+	p_fs->fs_func->free_cluster(sb, &clu, 0);
+
+	/* hint information */
+	fid->hint_last_off = -1;
+	if (fid->rwoffset > fid->size)
+		fid->rwoffset = fid->size;
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsTruncateFile */
+
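+/*
+ * The parent-directory location cached in fid->dir can go stale if the
+ * parent itself has been moved or resized, so refresh it from the parent
+ * inode's own FILE_ID_T before using it for a rename or delete.
+ */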
+static void update_parent_info(FILE_ID_T *fid, struct inode *parent_inode)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(parent_inode->i_sb)->fs_info);
+	FILE_ID_T *parent_fid = &(EXFAT_I(parent_inode)->fid);
+
+	if (unlikely((parent_fid->flags != fid->dir.flags)
+		|| (parent_fid->size != (fid->dir.size<<p_fs->cluster_size_bits))
+		|| (parent_fid->start_clu != fid->dir.dir))) {
+
+		fid->dir.dir = parent_fid->start_clu;
+		fid->dir.flags = parent_fid->flags;
+		fid->dir.size = ((parent_fid->size + (p_fs->cluster_size-1))
+						>> p_fs->cluster_size_bits);
+	}
+}
+
+/* ffsMoveFile : move (rename) an old file to a new file */
+s32 ffsMoveFile(struct inode *old_parent_inode, FILE_ID_T *fid, struct inode *new_parent_inode, struct dentry *new_dentry)
+{
+	s32 ret;
+	s32 dentry;
+	CHAIN_T olddir, newdir;
+	CHAIN_T *p_dir = NULL;
+	UNI_NAME_T uni_name;
+	DENTRY_T *ep;
+	struct super_block *sb = old_parent_inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	u8 *new_path = (u8 *) new_dentry->d_name.name;
+	struct inode *new_inode = new_dentry->d_inode;
+	int num_entries;
+	FILE_ID_T *new_fid = NULL;
+	s32 new_entry = 0;
+
+	/* check the validity of pointer parameters */
+	if ((new_path == NULL) || (*new_path == '\0'))
+		return FFS_ERROR;
+
+	update_parent_info(fid, old_parent_inode);
+
+	olddir.dir = fid->dir.dir;
+	olddir.size = fid->dir.size;
+	olddir.flags = fid->dir.flags;
+
+	dentry = fid->entry;
+
+	/* check if the old file is "." or ".." */
+	if (p_fs->vol_type != EXFAT) {
+		if ((olddir.dir != p_fs->root_dir) && (dentry < 2))
+			return FFS_PERMISSIONERR;
+	}
+
+	ep = get_entry_in_dir(sb, &olddir, dentry, NULL);
+	if (!ep)
+		return FFS_MEDIAERR;
+
+	if (p_fs->fs_func->get_entry_attr(ep) & ATTR_READONLY)
+		return FFS_PERMISSIONERR;
+
+	/* check whether new dir is existing directory and empty */
+	if (new_inode) {
+		u32 entry_type;
+
+		ret = FFS_MEDIAERR;
+		new_fid = &EXFAT_I(new_inode)->fid;
+
+		update_parent_info(new_fid, new_parent_inode);
+
+		p_dir = &(new_fid->dir);
+		new_entry = new_fid->entry;
+		ep = get_entry_in_dir(sb, p_dir, new_entry, NULL);
+		if (!ep)
+			goto out;
+
+		entry_type = p_fs->fs_func->get_entry_type(ep);
+
+		if (entry_type == TYPE_DIR) {
+			CHAIN_T new_clu;
+			new_clu.dir = new_fid->start_clu;
+			new_clu.size = (s32)((new_fid->size-1) >> p_fs->cluster_size_bits) + 1;
+			new_clu.flags = new_fid->flags;
+
+			if (!is_dir_empty(sb, &new_clu))
+				return FFS_FILEEXIST;
+		}
+	}
+
+	/* check the validity of directory name in the given new pathname */
+	ret = resolve_path(new_parent_inode, new_path, &newdir, &uni_name);
+	if (ret)
+		return ret;
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	if (olddir.dir == newdir.dir)
+		ret = rename_file(new_parent_inode, &olddir, dentry, &uni_name, fid);
+	else
+		ret = move_file(new_parent_inode, &olddir, dentry, &newdir, &uni_name, fid);
+
+	if ((ret == FFS_SUCCESS) && new_inode) {
+		/* delete entries of new_dir */
+		ep = get_entry_in_dir(sb, p_dir, new_entry, NULL);
+		if (!ep)
+			goto out;
+
+		num_entries = p_fs->fs_func->count_ext_entries(sb, p_dir, new_entry, ep);
+		if (num_entries < 0)
+			goto out;
+		p_fs->fs_func->delete_dir_entry(sb, p_dir, new_entry, 0, num_entries+1);
+	}
+out:
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return ret;
+} /* end of ffsMoveFile */
+
+/* ffsRemoveFile : remove a file */
+s32 ffsRemoveFile(struct inode *inode, FILE_ID_T *fid)
+{
+	s32 dentry;
+	CHAIN_T dir, clu_to_free;
+	DENTRY_T *ep;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	dir.dir = fid->dir.dir;
+	dir.size = fid->dir.size;
+	dir.flags = fid->dir.flags;
+
+	dentry = fid->entry;
+
+	ep = get_entry_in_dir(sb, &dir, dentry, NULL);
+	if (!ep)
+		return FFS_MEDIAERR;
+
+	if (p_fs->fs_func->get_entry_attr(ep) & ATTR_READONLY)
+		return FFS_PERMISSIONERR;
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	/* (1) update the directory entry */
+	remove_file(inode, &dir, dentry);
+
+	clu_to_free.dir = fid->start_clu;
+	clu_to_free.size = (s32)((fid->size-1) >> p_fs->cluster_size_bits) + 1;
+	clu_to_free.flags = fid->flags;
+
+	/* (2) free the clusters */
+	p_fs->fs_func->free_cluster(sb, &clu_to_free, 0);
+
+	fid->size = 0;
+	fid->start_clu = CLUSTER_32(~0);
+	fid->flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsRemoveFile */
+
+/* ffsSetAttr : set the attribute of a given file */
+s32 ffsSetAttr(struct inode *inode, u32 attr)
+{
+	u32 type, sector = 0;
+	DENTRY_T *ep;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+	u8 is_dir = (fid->type == TYPE_DIR) ? 1 : 0;
+	ENTRY_SET_CACHE_T *es = NULL;
+
+	if (fid->attr == attr) {
+		if (p_fs->dev_ejected)
+			return FFS_MEDIAERR;
+		return FFS_SUCCESS;
+	}
+
+	if (is_dir) {
+		if ((fid->dir.dir == p_fs->root_dir) &&
+			(fid->entry == -1)) {
+			if (p_fs->dev_ejected)
+				return FFS_MEDIAERR;
+			return FFS_SUCCESS;
+		}
+	}
+
+	/* get the directory entry of given file */
+	if (p_fs->vol_type == EXFAT) {
+		es = get_entry_set_in_dir(sb, &(fid->dir), fid->entry, ES_ALL_ENTRIES, &ep);
+		if (es == NULL)
+			return FFS_MEDIAERR;
+	} else {
+		ep = get_entry_in_dir(sb, &(fid->dir), fid->entry, &sector);
+		if (!ep)
+			return FFS_MEDIAERR;
+	}
+
+	type = p_fs->fs_func->get_entry_type(ep);
+
+	if (((type == TYPE_FILE) && (attr & ATTR_SUBDIR)) ||
+		((type == TYPE_DIR) && (!(attr & ATTR_SUBDIR)))) {
+		s32 err;
+		if (p_fs->dev_ejected)
+			err = FFS_MEDIAERR;
+		else
+			err = FFS_ERROR;
+
+		if (p_fs->vol_type == EXFAT)
+			release_entry_set(es);
+		return err;
+	}
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	/* set the file attribute */
+	fid->attr = attr;
+	p_fs->fs_func->set_entry_attr(ep, attr);
+
+	if (p_fs->vol_type != EXFAT)
+		buf_modify(sb, sector);
+	else {
+		update_dir_checksum_with_entry_set(sb, es);
+		release_entry_set(es);
+	}
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsSetAttr */
+
+/* ffsGetStat : get the information of a given file */
+s32 ffsGetStat(struct inode *inode, DIR_ENTRY_T *info)
+{
+	u32 sector = 0;
+	s32 count;
+	CHAIN_T dir;
+	UNI_NAME_T uni_name;
+	TIMESTAMP_T tm;
+	DENTRY_T *ep, *ep2;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+	ENTRY_SET_CACHE_T *es = NULL;
+	u8 is_dir = (fid->type == TYPE_DIR) ? 1 : 0;
+
+	DPRINTK("ffsGetStat entered\n");
+
+	if (is_dir) {
+		if ((fid->dir.dir == p_fs->root_dir) &&
+			(fid->entry == -1)) {
+			info->Attr = ATTR_SUBDIR;
+			memset((char *) &info->CreateTimestamp, 0, sizeof(DATE_TIME_T));
+			memset((char *) &info->ModifyTimestamp, 0, sizeof(DATE_TIME_T));
+			memset((char *) &info->AccessTimestamp, 0, sizeof(DATE_TIME_T));
+			strcpy(info->ShortName, ".");
+			strcpy(info->Name, ".");
+
+			dir.dir = p_fs->root_dir;
+			dir.flags = 0x01;
+
+			if (p_fs->root_dir == CLUSTER_32(0)) /* FAT16 root_dir */
+				info->Size = p_fs->dentries_in_root << DENTRY_SIZE_BITS;
+			else
+				info->Size = count_num_clusters(sb, &dir) << p_fs->cluster_size_bits;
+
+			count = count_dos_name_entries(sb, &dir, TYPE_DIR);
+			if (count < 0)
+				return FFS_MEDIAERR;
+			info->NumSubdirs = count;
+
+			if (p_fs->dev_ejected)
+				return FFS_MEDIAERR;
+			return FFS_SUCCESS;
+		}
+	}
+
+	/* get the directory entry of given file or directory */
+	if (p_fs->vol_type == EXFAT) {
+		es = get_entry_set_in_dir(sb, &(fid->dir), fid->entry, ES_2_ENTRIES, &ep);
+		if (es == NULL)
+			return FFS_MEDIAERR;
+		ep2 = ep+1;
+	} else {
+		ep = get_entry_in_dir(sb, &(fid->dir), fid->entry, &sector);
+		if (!ep)
+			return FFS_MEDIAERR;
+		ep2 = ep;
+		buf_lock(sb, sector);
+	}
+
+	/* set FILE_INFO structure using the acquired DENTRY_T */
+	info->Attr = p_fs->fs_func->get_entry_attr(ep);
+
+	p_fs->fs_func->get_entry_time(ep, &tm, TM_CREATE);
+	info->CreateTimestamp.Year = tm.year;
+	info->CreateTimestamp.Month = tm.mon;
+	info->CreateTimestamp.Day = tm.day;
+	info->CreateTimestamp.Hour = tm.hour;
+	info->CreateTimestamp.Minute = tm.min;
+	info->CreateTimestamp.Second = tm.sec;
+	info->CreateTimestamp.MilliSecond = 0;
+
+	p_fs->fs_func->get_entry_time(ep, &tm, TM_MODIFY);
+	info->ModifyTimestamp.Year = tm.year;
+	info->ModifyTimestamp.Month = tm.mon;
+	info->ModifyTimestamp.Day = tm.day;
+	info->ModifyTimestamp.Hour = tm.hour;
+	info->ModifyTimestamp.Minute = tm.min;
+	info->ModifyTimestamp.Second = tm.sec;
+	info->ModifyTimestamp.MilliSecond = 0;
+
+	memset((char *) &info->AccessTimestamp, 0, sizeof(DATE_TIME_T));
+
+	*(uni_name.name) = 0x0;
+	/* XXX this is very inefficient for exfat because the name is already
+	 * included in es; the API should be revised */
+	p_fs->fs_func->get_uni_name_from_ext_entry(sb, &(fid->dir), fid->entry, uni_name.name);
+	if (*(uni_name.name) == 0x0 && p_fs->vol_type != EXFAT)
+		get_uni_name_from_dos_entry(sb, (DOS_DENTRY_T *) ep, &uni_name, 0x1);
+	nls_uniname_to_cstring(sb, info->Name, &uni_name);
+
+	if (p_fs->vol_type == EXFAT) {
+		info->NumSubdirs = 2;
+	} else {
+		buf_unlock(sb, sector);
+		get_uni_name_from_dos_entry(sb, (DOS_DENTRY_T *) ep, &uni_name, 0x0);
+		nls_uniname_to_cstring(sb, info->ShortName, &uni_name);
+		info->NumSubdirs = 0;
+	}
+
+	info->Size = p_fs->fs_func->get_entry_size(ep2);
+
+	if (p_fs->vol_type == EXFAT)
+		release_entry_set(es);
+
+	if (is_dir) {
+		dir.dir = fid->start_clu;
+		dir.flags = 0x01;
+
+		if (info->Size == 0)
+			info->Size = (u64) count_num_clusters(sb, &dir) << p_fs->cluster_size_bits;
+
+		count = count_dos_name_entries(sb, &dir, TYPE_DIR);
+		if (count < 0)
+			return FFS_MEDIAERR;
+		info->NumSubdirs += count;
+	}
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	DPRINTK("ffsGetStat exited successfully\n");
+	return FFS_SUCCESS;
+} /* end of ffsGetStat */
+
+/* ffsSetStat : set the information of a given file */
+s32 ffsSetStat(struct inode *inode, DIR_ENTRY_T *info)
+{
+	u32 sector = 0;
+	TIMESTAMP_T tm;
+	DENTRY_T *ep, *ep2;
+	ENTRY_SET_CACHE_T *es = NULL;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+	u8 is_dir = (fid->type == TYPE_DIR) ? 1 : 0;
+
+	if (is_dir) {
+		if ((fid->dir.dir == p_fs->root_dir) &&
+			(fid->entry == -1)) {
+			if (p_fs->dev_ejected)
+				return FFS_MEDIAERR;
+			return FFS_SUCCESS;
+		}
+	}
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	/* get the directory entry of given file or directory */
+	if (p_fs->vol_type == EXFAT) {
+		es = get_entry_set_in_dir(sb, &(fid->dir), fid->entry, ES_ALL_ENTRIES, &ep);
+		if (es == NULL)
+			return FFS_MEDIAERR;
+		ep2 = ep+1;
+	} else {
+		/* for other than exfat */
+		ep = get_entry_in_dir(sb, &(fid->dir), fid->entry, &sector);
+		if (!ep)
+			return FFS_MEDIAERR;
+		ep2 = ep;
+	}
+
+	p_fs->fs_func->set_entry_attr(ep, info->Attr);
+
+	/* set the directory entry's time fields from the given DIR_ENTRY_T */
+	tm.sec  = info->CreateTimestamp.Second;
+	tm.min  = info->CreateTimestamp.Minute;
+	tm.hour = info->CreateTimestamp.Hour;
+	tm.day  = info->CreateTimestamp.Day;
+	tm.mon  = info->CreateTimestamp.Month;
+	tm.year = info->CreateTimestamp.Year;
+	p_fs->fs_func->set_entry_time(ep, &tm, TM_CREATE);
+
+	tm.sec  = info->ModifyTimestamp.Second;
+	tm.min  = info->ModifyTimestamp.Minute;
+	tm.hour = info->ModifyTimestamp.Hour;
+	tm.day  = info->ModifyTimestamp.Day;
+	tm.mon  = info->ModifyTimestamp.Month;
+	tm.year = info->ModifyTimestamp.Year;
+	p_fs->fs_func->set_entry_time(ep, &tm, TM_MODIFY);
+
+	p_fs->fs_func->set_entry_size(ep2, info->Size);
+
+	if (p_fs->vol_type != EXFAT) {
+		buf_modify(sb, sector);
+	} else {
+		update_dir_checksum_with_entry_set(sb, es);
+		release_entry_set(es);
+	}
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsSetStat */
+
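+/* ffsMapCluster : map a cluster offset within a file to a volume cluster,
+ * allocating and linking in a new cluster when the offset is just past the
+ * end of the current chain */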
+s32 ffsMapCluster(struct inode *inode, s32 clu_offset, u32 *clu)
+{
+	s32 num_clusters, num_alloced, modified = FALSE;
+	u32 last_clu, sector = 0;
+	CHAIN_T new_clu;
+	DENTRY_T *ep;
+	ENTRY_SET_CACHE_T *es = NULL;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+
+	fid->rwoffset = (s64)(clu_offset) << p_fs->cluster_size_bits;
+
+	if (EXFAT_I(inode)->mmu_private == 0)
+		num_clusters = 0;
+	else
+		num_clusters = (s32)((EXFAT_I(inode)->mmu_private-1) >> p_fs->cluster_size_bits) + 1;
+
+	*clu = last_clu = fid->start_clu;
+
+	if (fid->flags == 0x03) {
+		if ((clu_offset > 0) && (*clu != CLUSTER_32(~0))) {
+			last_clu += clu_offset - 1;
+
+			if (clu_offset == num_clusters)
+				*clu = CLUSTER_32(~0);
+			else
+				*clu += clu_offset;
+		}
+	} else {
+		/* hint information */
+		if ((clu_offset > 0) && (fid->hint_last_off > 0) &&
+			(clu_offset >= fid->hint_last_off)) {
+			clu_offset -= fid->hint_last_off;
+			*clu = fid->hint_last_clu;
+		}
+
+		while ((clu_offset > 0) && (*clu != CLUSTER_32(~0))) {
+			last_clu = *clu;
+			if (FAT_read(sb, *clu, clu) == -1)
+				return FFS_MEDIAERR;
+			clu_offset--;
+		}
+	}
+
+	if (*clu == CLUSTER_32(~0)) {
+		fs_set_vol_flags(sb, VOL_DIRTY);
+
+		new_clu.dir = (last_clu == CLUSTER_32(~0)) ? CLUSTER_32(~0) : last_clu+1;
+		new_clu.size = 0;
+		new_clu.flags = fid->flags;
+
+		/* (1) allocate a cluster */
+		num_alloced = p_fs->fs_func->alloc_cluster(sb, 1, &new_clu);
+		if (num_alloced < 0)
+			return FFS_MEDIAERR;
+		else if (num_alloced == 0)
+			return FFS_FULL;
+
+		/* (2) append to the FAT chain */
+		if (last_clu == CLUSTER_32(~0)) {
+			if (new_clu.flags == 0x01)
+				fid->flags = 0x01;
+			fid->start_clu = new_clu.dir;
+			modified = TRUE;
+		} else {
+			if (new_clu.flags != fid->flags) {
+				exfat_chain_cont_cluster(sb, fid->start_clu, num_clusters);
+				fid->flags = 0x01;
+				modified = TRUE;
+			}
+			if (new_clu.flags == 0x01)
+				FAT_write(sb, last_clu, new_clu.dir);
+		}
+
+		num_clusters += num_alloced;
+		*clu = new_clu.dir;
+
+		if (p_fs->vol_type == EXFAT) {
+			es = get_entry_set_in_dir(sb, &(fid->dir), fid->entry, ES_ALL_ENTRIES, &ep);
+			if (es == NULL)
+				return FFS_MEDIAERR;
+			/* get stream entry */
+			ep++;
+		}
+
+		/* (3) update directory entry */
+		if (modified) {
+			if (p_fs->vol_type != EXFAT) {
+				ep = get_entry_in_dir(sb, &(fid->dir), fid->entry, &sector);
+				if (!ep)
+					return FFS_MEDIAERR;
+			}
+
+			if (p_fs->fs_func->get_entry_flag(ep) != fid->flags)
+				p_fs->fs_func->set_entry_flag(ep, fid->flags);
+
+			if (p_fs->fs_func->get_entry_clu0(ep) != fid->start_clu)
+				p_fs->fs_func->set_entry_clu0(ep, fid->start_clu);
+
+			if (p_fs->vol_type != EXFAT)
+				buf_modify(sb, sector);
+		}
+
+		if (p_fs->vol_type == EXFAT) {
+			update_dir_checksum_with_entry_set(sb, es);
+			release_entry_set(es);
+		}
+
+		/* add number of new blocks to inode */
+		inode->i_blocks += num_alloced << (p_fs->cluster_size_bits - 9);
+	}
+
+	/* hint information */
+	fid->hint_last_off = (s32)(fid->rwoffset >> p_fs->cluster_size_bits);
+	fid->hint_last_clu = *clu;
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsMapCluster */
+
+/*----------------------------------------------------------------------*/
+/*  Directory Operation Functions                                       */
+/*----------------------------------------------------------------------*/
+
+/* ffsCreateDir : create(make) a directory */
+s32 ffsCreateDir(struct inode *inode, char *path, FILE_ID_T *fid)
+{
+	s32 ret/*, dentry*/;
+	CHAIN_T dir;
+	UNI_NAME_T uni_name;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	DPRINTK("ffsCreateDir entered\n");
+
+	/* check the validity of directory name in the given old pathname */
+	ret = resolve_path(inode, path, &dir, &uni_name);
+	if (ret)
+		return ret;
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	ret = create_dir(inode, &dir, &uni_name, fid);
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return ret;
+} /* end of ffsCreateDir */
+
+/* ffsReadDir : read a directory entry from the opened directory */
+s32 ffsReadDir(struct inode *inode, DIR_ENTRY_T *dir_entry)
+{
+	int i, dentry, clu_offset;
+	s32 dentries_per_clu, dentries_per_clu_bits = 0;
+	u32 type, sector;
+	CHAIN_T dir, clu;
+	UNI_NAME_T uni_name;
+	TIMESTAMP_T tm;
+	DENTRY_T *ep;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+
+	/* check if the given file ID is opened */
+	if (fid->type != TYPE_DIR)
+		return FFS_PERMISSIONERR;
+
+	if (fid->entry == -1) {
+		dir.dir = p_fs->root_dir;
+		dir.flags = 0x01;
+	} else {
+		dir.dir = fid->start_clu;
+		dir.size = (s32)(fid->size >> p_fs->cluster_size_bits);
+		dir.flags = fid->flags;
+	}
+
+	dentry = (s32) fid->rwoffset;
+
+	if (dir.dir == CLUSTER_32(0)) { /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+
+		if (dentry == dentries_per_clu) {
+			clu.dir = CLUSTER_32(~0);
+		} else {
+			clu.dir = dir.dir;
+			clu.size = dir.size;
+			clu.flags = dir.flags;
+		}
+	} else {
+		dentries_per_clu = p_fs->dentries_per_clu;
+		dentries_per_clu_bits = ilog2(dentries_per_clu);
+
+		clu_offset = dentry >> dentries_per_clu_bits;
+		clu.dir = dir.dir;
+		clu.size = dir.size;
+		clu.flags = dir.flags;
+
+		if (clu.flags == 0x03) {
+			clu.dir += clu_offset;
+			clu.size -= clu_offset;
+		} else {
+			/* hint information */
+			if ((clu_offset > 0) && (fid->hint_last_off > 0) &&
+				(clu_offset >= fid->hint_last_off)) {
+				clu_offset -= fid->hint_last_off;
+				clu.dir = fid->hint_last_clu;
+			}
+
+			while (clu_offset > 0) {
+				/* clu.dir = FAT_read(sb, clu.dir); */
+				if (FAT_read(sb, clu.dir, &(clu.dir)) == -1)
+					return FFS_MEDIAERR;
+
+				clu_offset--;
+			}
+		}
+	}
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		if (dir.dir == CLUSTER_32(0)) /* FAT16 root_dir */
+			i = dentry % dentries_per_clu;
+		else
+			i = dentry & (dentries_per_clu-1);
+
+		for ( ; i < dentries_per_clu; i++, dentry++) {
+			ep = get_entry_in_dir(sb, &clu, i, &sector);
+			if (!ep)
+				return FFS_MEDIAERR;
+
+			type = p_fs->fs_func->get_entry_type(ep);
+
+			if (type == TYPE_UNUSED)
+				break;
+
+			if ((type != TYPE_FILE) && (type != TYPE_DIR))
+				continue;
+
+			buf_lock(sb, sector);
+			dir_entry->Attr = p_fs->fs_func->get_entry_attr(ep);
+
+			p_fs->fs_func->get_entry_time(ep, &tm, TM_CREATE);
+			dir_entry->CreateTimestamp.Year = tm.year;
+			dir_entry->CreateTimestamp.Month = tm.mon;
+			dir_entry->CreateTimestamp.Day = tm.day;
+			dir_entry->CreateTimestamp.Hour = tm.hour;
+			dir_entry->CreateTimestamp.Minute = tm.min;
+			dir_entry->CreateTimestamp.Second = tm.sec;
+			dir_entry->CreateTimestamp.MilliSecond = 0;
+
+			p_fs->fs_func->get_entry_time(ep, &tm, TM_MODIFY);
+			dir_entry->ModifyTimestamp.Year = tm.year;
+			dir_entry->ModifyTimestamp.Month = tm.mon;
+			dir_entry->ModifyTimestamp.Day = tm.day;
+			dir_entry->ModifyTimestamp.Hour = tm.hour;
+			dir_entry->ModifyTimestamp.Minute = tm.min;
+			dir_entry->ModifyTimestamp.Second = tm.sec;
+			dir_entry->ModifyTimestamp.MilliSecond = 0;
+
+			memset((char *) &dir_entry->AccessTimestamp, 0, sizeof(DATE_TIME_T));
+
+			*(uni_name.name) = 0x0;
+			p_fs->fs_func->get_uni_name_from_ext_entry(sb, &dir, dentry, uni_name.name);
+			if (*(uni_name.name) == 0x0 && p_fs->vol_type != EXFAT)
+				get_uni_name_from_dos_entry(sb, (DOS_DENTRY_T *) ep, &uni_name, 0x1);
+			nls_uniname_to_cstring(sb, dir_entry->Name, &uni_name);
+			buf_unlock(sb, sector);
+
+			if (p_fs->vol_type == EXFAT) {
+				ep = get_entry_in_dir(sb, &clu, i+1, NULL);
+				if (!ep)
+					return FFS_MEDIAERR;
+			} else {
+				get_uni_name_from_dos_entry(sb, (DOS_DENTRY_T *) ep, &uni_name, 0x0);
+				nls_uniname_to_cstring(sb, dir_entry->ShortName, &uni_name);
+			}
+
+			dir_entry->Size = p_fs->fs_func->get_entry_size(ep);
+
+			/* hint information */
+			/* hint information (not kept for the FAT16 root_dir) */
+			if (dir.dir != CLUSTER_32(0)) {
+				fid->hint_last_off = dentry >> dentries_per_clu_bits;
+				fid->hint_last_clu = clu.dir;
+			}
+
+			fid->rwoffset = (s64) ++dentry;
+
+			if (p_fs->dev_ejected)
+				return FFS_MEDIAERR;
+
+			return FFS_SUCCESS;
+		}
+
+		if (dir.dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (clu.flags == 0x03) {
+			if ((--clu.size) > 0)
+				clu.dir++;
+			else
+				clu.dir = CLUSTER_32(~0);
+		} else {
+			/* clu.dir = FAT_read(sb, clu.dir); */
+			if (FAT_read(sb, clu.dir, &(clu.dir)) == -1)
+				return FFS_MEDIAERR;
+		}
+	}
+
+	*(dir_entry->Name) = '\0';
+
+	fid->rwoffset = (s64) ++dentry;
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsReadDir */
+
+/* ffsRemoveDir : remove a directory */
+s32 ffsRemoveDir(struct inode *inode, FILE_ID_T *fid)
+{
+	s32 dentry;
+	CHAIN_T dir, clu_to_free;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	dir.dir = fid->dir.dir;
+	dir.size = fid->dir.size;
+	dir.flags = fid->dir.flags;
+
+	dentry = fid->entry;
+
+	/* check if the file is "." or ".." */
+	if (p_fs->vol_type != EXFAT) {
+		if ((dir.dir != p_fs->root_dir) && (dentry < 2))
+			return FFS_PERMISSIONERR;
+	}
+
+	clu_to_free.dir = fid->start_clu;
+	clu_to_free.size = (s32)((fid->size-1) >> p_fs->cluster_size_bits) + 1;
+	clu_to_free.flags = fid->flags;
+
+	if (!is_dir_empty(sb, &clu_to_free))
+		return FFS_FILEEXIST;
+
+	fs_set_vol_flags(sb, VOL_DIRTY);
+
+	/* (1) update the directory entry */
+	remove_file(inode, &dir, dentry);
+
+	/* (2) free the clusters */
+	p_fs->fs_func->free_cluster(sb, &clu_to_free, 1);
+
+	fid->size = 0;
+	fid->start_clu = CLUSTER_32(~0);
+	fid->flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+
+#ifdef CONFIG_EXFAT_DELAYED_SYNC
+	fs_sync(sb, 0);
+	fs_set_vol_flags(sb, VOL_CLEAN);
+#endif
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	return FFS_SUCCESS;
+} /* end of ffsRemoveDir */
+
+/*======================================================================*/
+/*  Local Function Definitions                                          */
+/*======================================================================*/
+
+/*
+ *  File System Management Functions
+ */
+
+s32 fs_init(void)
+{
+	/* critical check for system requirement on size of DENTRY_T structure */
+	if (sizeof(DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(DOS_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(EXT_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(FILE_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(STRM_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(NAME_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(BMAP_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(CASE_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	if (sizeof(VOLM_DENTRY_T) != DENTRY_SIZE)
+		return FFS_ALIGNMENTERR;
+
+	return FFS_SUCCESS;
+} /* end of fs_init */
+
+s32 fs_shutdown(void)
+{
+	return FFS_SUCCESS;
+} /* end of fs_shutdown */
+
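+/* fs_set_vol_flags : update the cached volume flag and, on exFAT, write the
+ * new VOL_DIRTY/VOL_CLEAN state into the volume flags field of the PBR */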
+void fs_set_vol_flags(struct super_block *sb, u32 new_flag)
+{
+	PBR_SECTOR_T *p_pbr;
+	BPBEX_T *p_bpb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_fs->vol_flag == new_flag)
+		return;
+
+	p_fs->vol_flag = new_flag;
+
+	if (p_fs->vol_type == EXFAT) {
+		if (p_fs->pbr_bh == NULL) {
+			if (sector_read(sb, p_fs->PBR_sector, &(p_fs->pbr_bh), 1) != FFS_SUCCESS)
+				return;
+		}
+
+		p_pbr = (PBR_SECTOR_T *) p_fs->pbr_bh->b_data;
+		p_bpb = (BPBEX_T *) p_pbr->bpb;
+		SET16(p_bpb->vol_flags, (u16) new_flag);
+
+		/* XXX duyoung
+		 what can we do here? (fs_set_vol_flags() returns void) */
+		if ((new_flag == VOL_DIRTY) && (!buffer_dirty(p_fs->pbr_bh)))
+			sector_write(sb, p_fs->PBR_sector, p_fs->pbr_bh, 1);
+		else
+			sector_write(sb, p_fs->PBR_sector, p_fs->pbr_bh, 0);
+	}
+} /* end of fs_set_vol_flags */
+
+void fs_sync(struct super_block *sb, s32 do_sync)
+{
+	if (do_sync)
+		bdev_sync(sb);
+} /* end of fs_sync */
+
+void fs_error(struct super_block *sb)
+{
+	struct exfat_mount_options *opts = &EXFAT_SB(sb)->options;
+
+	if (opts->errors == EXFAT_ERRORS_PANIC)
+		panic("[EXFAT] Filesystem panic from previous error\n");
+	else if ((opts->errors == EXFAT_ERRORS_RO) && !(sb->s_flags & MS_RDONLY)) {
+		sb->s_flags |= MS_RDONLY;
+		printk(KERN_ERR "[EXFAT] Filesystem has been set read-only\n");
+	}
+}
+
+/*
+ *  Cluster Management Functions
+ */
+
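+/* clear_cluster : zero-fill every sector of the given cluster
+ * (or the FAT16 root directory area when clu is 0) */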
+s32 clear_cluster(struct super_block *sb, u32 clu)
+{
+	u32 s, n;
+	s32 ret = FFS_SUCCESS;
+	struct buffer_head *tmp_bh = NULL;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (clu == CLUSTER_32(0)) { /* FAT16 root_dir */
+		s = p_fs->root_start_sector;
+		n = p_fs->data_start_sector;
+	} else {
+		s = START_SECTOR(clu);
+		n = s + p_fs->sectors_per_clu;
+	}
+
+	for (; s < n; s++) {
+		ret = sector_read(sb, s, &tmp_bh, 0);
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		memset((char *) tmp_bh->b_data, 0x0, p_bd->sector_size);
+		ret = sector_write(sb, s, tmp_bh, 0);
+		if (ret != FFS_SUCCESS)
+			break;
+	}
+
+	brelse(tmp_bh);
+	return ret;
+} /* end of clear_cluster */
+
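+/* fat_alloc_cluster : allocate num_alloc clusters by scanning the FAT for
+ * free entries, linking them into p_chain; returns the number of clusters
+ * actually allocated, or -1 on I/O error */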
+s32 fat_alloc_cluster(struct super_block *sb, s32 num_alloc, CHAIN_T *p_chain)
+{
+	int i, num_clusters = 0;
+	u32 new_clu, last_clu = CLUSTER_32(~0), read_clu;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	new_clu = p_chain->dir;
+	if (new_clu == CLUSTER_32(~0))
+		new_clu = p_fs->clu_srch_ptr;
+	else if (new_clu >= p_fs->num_clusters)
+		new_clu = 2;
+
+	__set_sb_dirty(sb);
+
+	p_chain->dir = CLUSTER_32(~0);
+
+	for (i = 2; i < p_fs->num_clusters; i++) {
+		if (FAT_read(sb, new_clu, &read_clu) != 0)
+			return -1;
+
+		if (read_clu == CLUSTER_32(0)) {
+			if (FAT_write(sb, new_clu, CLUSTER_32(~0)) < 0)
+				return -1;
+			num_clusters++;
+
+			if (p_chain->dir == CLUSTER_32(~0))
+				p_chain->dir = new_clu;
+			else {
+				if (FAT_write(sb, last_clu, new_clu) < 0)
+					return -1;
+			}
+
+			last_clu = new_clu;
+
+			if ((--num_alloc) == 0) {
+				p_fs->clu_srch_ptr = new_clu;
+				if (p_fs->used_clusters != (u32) ~0)
+					p_fs->used_clusters += num_clusters;
+
+				return num_clusters;
+			}
+		}
+		if ((++new_clu) >= p_fs->num_clusters)
+			new_clu = 2;
+	}
+
+	p_fs->clu_srch_ptr = new_clu;
+	if (p_fs->used_clusters != (u32) ~0)
+		p_fs->used_clusters += num_clusters;
+
+	return num_clusters;
+} /* end of fat_alloc_cluster */
+
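+/* exfat_alloc_cluster : allocate clusters from the allocation bitmap; the
+ * chain keeps the contiguous flag (0x03) while the allocation stays
+ * contiguous, otherwise it is converted into a regular FAT chain (0x01) */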
+s32 exfat_alloc_cluster(struct super_block *sb, s32 num_alloc, CHAIN_T *p_chain)
+{
+	s32 num_clusters = 0;
+	u32 hint_clu, new_clu, last_clu = CLUSTER_32(~0);
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	hint_clu = p_chain->dir;
+	if (hint_clu == CLUSTER_32(~0)) {
+		hint_clu = test_alloc_bitmap(sb, p_fs->clu_srch_ptr-2);
+		if (hint_clu == CLUSTER_32(~0))
+			return 0;
+	} else if (hint_clu >= p_fs->num_clusters) {
+		hint_clu = 2;
+		p_chain->flags = 0x01;
+	}
+
+	__set_sb_dirty(sb);
+
+	p_chain->dir = CLUSTER_32(~0);
+
+	while ((new_clu = test_alloc_bitmap(sb, hint_clu-2)) != CLUSTER_32(~0)) {
+		if (new_clu != hint_clu) {
+			if (p_chain->flags == 0x03) {
+				exfat_chain_cont_cluster(sb, p_chain->dir, num_clusters);
+				p_chain->flags = 0x01;
+			}
+		}
+
+		if (set_alloc_bitmap(sb, new_clu-2) != FFS_SUCCESS)
+			return -1;
+
+		num_clusters++;
+
+		if (p_chain->flags == 0x01) {
+			if (FAT_write(sb, new_clu, CLUSTER_32(~0)) < 0)
+				return -1;
+		}
+
+		if (p_chain->dir == CLUSTER_32(~0)) {
+			p_chain->dir = new_clu;
+		} else {
+			if (p_chain->flags == 0x01) {
+				if (FAT_write(sb, last_clu, new_clu) < 0)
+					return -1;
+			}
+		}
+		last_clu = new_clu;
+
+		if ((--num_alloc) == 0) {
+			p_fs->clu_srch_ptr = hint_clu;
+			if (p_fs->used_clusters != (u32) ~0)
+				p_fs->used_clusters += num_clusters;
+
+			p_chain->size += num_clusters;
+			return num_clusters;
+		}
+
+		hint_clu = new_clu + 1;
+		if (hint_clu >= p_fs->num_clusters) {
+			hint_clu = 2;
+
+			if (p_chain->flags == 0x03) {
+				exfat_chain_cont_cluster(sb, p_chain->dir, num_clusters);
+				p_chain->flags = 0x01;
+			}
+		}
+	}
+
+	p_fs->clu_srch_ptr = hint_clu;
+	if (p_fs->used_clusters != (u32) ~0)
+		p_fs->used_clusters += num_clusters;
+
+	p_chain->size += num_clusters;
+	return num_clusters;
+} /* end of exfat_alloc_cluster */
+
+void fat_free_cluster(struct super_block *sb, CHAIN_T *p_chain, s32 do_relse)
+{
+	s32 num_clusters = 0;
+	u32 clu, prev;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	int i;
+	u32 sector;
+
+	if ((p_chain->dir == CLUSTER_32(0)) || (p_chain->dir == CLUSTER_32(~0)))
+		return;
+	__set_sb_dirty(sb);
+	clu = p_chain->dir;
+
+	if (p_chain->size <= 0)
+		return;
+
+	do {
+		if (p_fs->dev_ejected)
+			break;
+
+		if (do_relse) {
+			sector = START_SECTOR(clu);
+			for (i = 0; i < p_fs->sectors_per_clu; i++)
+				buf_release(sb, sector+i);
+		}
+
+		prev = clu;
+		if (FAT_read(sb, clu, &clu) == -1)
+			break;
+
+		if (FAT_write(sb, prev, CLUSTER_32(0)) < 0)
+			break;
+		num_clusters++;
+
+	} while (clu != CLUSTER_32(~0));
+
+	if (p_fs->used_clusters != (u32) ~0)
+		p_fs->used_clusters -= num_clusters;
+} /* end of fat_free_cluster */
+
+void exfat_free_cluster(struct super_block *sb, CHAIN_T *p_chain, s32 do_relse)
+{
+	s32 num_clusters = 0;
+	u32 clu;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	int i;
+	u32 sector;
+
+	if ((p_chain->dir == CLUSTER_32(0)) || (p_chain->dir == CLUSTER_32(~0)))
+		return;
+
+	if (p_chain->size <= 0) {
+		printk(KERN_ERR "[EXFAT] free_cluster : skip free-req clu:%u, "
+				"because of zero-size truncation\n",
+				p_chain->dir);
+		return;
+	}
+
+	__set_sb_dirty(sb);
+	clu = p_chain->dir;
+
+	if (p_chain->flags == 0x03) {
+		do {
+			if (do_relse) {
+				sector = START_SECTOR(clu);
+				for (i = 0; i < p_fs->sectors_per_clu; i++)
+					buf_release(sb, sector+i);
+			}
+
+			if (clr_alloc_bitmap(sb, clu-2) != FFS_SUCCESS)
+				break;
+			clu++;
+
+			num_clusters++;
+		} while (num_clusters < p_chain->size);
+	} else {
+		do {
+			if (p_fs->dev_ejected)
+				break;
+
+			if (do_relse) {
+				sector = START_SECTOR(clu);
+				for (i = 0; i < p_fs->sectors_per_clu; i++)
+					buf_release(sb, sector+i);
+			}
+
+			if (clr_alloc_bitmap(sb, clu-2) != FFS_SUCCESS)
+				break;
+
+			if (FAT_read(sb, clu, &clu) == -1)
+				break;
+			num_clusters++;
+		} while ((clu != CLUSTER_32(0)) && (clu != CLUSTER_32(~0)));
+	}
+
+	if (p_fs->used_clusters != (u32) ~0)
+		p_fs->used_clusters -= num_clusters;
+} /* end of exfat_free_cluster */
+
+u32 find_last_cluster(struct super_block *sb, CHAIN_T *p_chain)
+{
+	u32 clu, next;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	clu = p_chain->dir;
+
+	if (p_chain->flags == 0x03) {
+		clu += p_chain->size - 1;
+	} else {
+		while ((FAT_read(sb, clu, &next) == 0) && (next != CLUSTER_32(~0))) {
+			if (p_fs->dev_ejected)
+				break;
+			clu = next;
+		}
+	}
+
+	return clu;
+} /* end of find_last_cluster */
+
+s32 count_num_clusters(struct super_block *sb, CHAIN_T *p_chain)
+{
+	int i, count = 0;
+	u32 clu;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if ((p_chain->dir == CLUSTER_32(0)) || (p_chain->dir == CLUSTER_32(~0)))
+		return 0;
+
+	clu = p_chain->dir;
+
+	if (p_chain->flags == 0x03) {
+		count = p_chain->size;
+	} else {
+		for (i = 2; i < p_fs->num_clusters; i++) {
+			count++;
+			if (FAT_read(sb, clu, &clu) != 0)
+				return 0;
+			if (clu == CLUSTER_32(~0))
+				break;
+		}
+	}
+
+	return count;
+} /* end of count_num_clusters */
+
+s32 fat_count_used_clusters(struct super_block *sb)
+{
+	int i, count = 0;
+	u32 clu;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	for (i = 2; i < p_fs->num_clusters; i++) {
+		if (FAT_read(sb, i, &clu) != 0)
+			break;
+		if (clu != CLUSTER_32(0))
+			count++;
+	}
+
+	return count;
+} /* end of fat_count_used_clusters */
+
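+/* exfat_count_used_clusters : count used clusters by summing the set bits
+ * of the allocation bitmap, one byte (8 clusters) at a time */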
+s32 exfat_count_used_clusters(struct super_block *sb)
+{
+	int i, map_i, map_b, count = 0;
+	u8 k;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	map_i = map_b = 0;
+
+	for (i = 2; i < p_fs->num_clusters; i += 8) {
+		k = *(((u8 *) p_fs->vol_amap[map_i]->b_data) + map_b);
+		count += used_bit[k];
+
+		if ((++map_b) >= p_bd->sector_size) {
+			map_i++;
+			map_b = 0;
+		}
+	}
+
+	return count;
+} /* end of exfat_count_used_clusters */
+
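+/* exfat_chain_cont_cluster : turn a contiguous run of len clusters starting
+ * at chain into an explicit FAT chain terminated by an end-of-chain mark */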
+void exfat_chain_cont_cluster(struct super_block *sb, u32 chain, s32 len)
+{
+	if (len == 0)
+		return;
+
+	while (len > 1) {
+		if (FAT_write(sb, chain, chain+1) < 0)
+			break;
+		chain++;
+		len--;
+	}
+	FAT_write(sb, chain, CLUSTER_32(~0));
+} /* end of exfat_chain_cont_cluster */
+
+/*
+ *  Allocation Bitmap Management Functions
+ */
+
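+/* load_alloc_bitmap : find the allocation bitmap entry in the root directory
+ * and read all of its sectors into p_fs->vol_amap */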
+s32 load_alloc_bitmap(struct super_block *sb)
+{
+	int i, j, ret;
+	u32 map_size;
+	u32 type, sector;
+	CHAIN_T clu;
+	BMAP_DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	clu.dir = p_fs->root_dir;
+	clu.flags = 0x01;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		for (i = 0; i < p_fs->dentries_per_clu; i++) {
+			ep = (BMAP_DENTRY_T *) get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return FFS_MEDIAERR;
+
+			type = p_fs->fs_func->get_entry_type((DENTRY_T *) ep);
+
+			if (type == TYPE_UNUSED)
+				break;
+			if (type != TYPE_BITMAP)
+				continue;
+
+			if (ep->flags == 0x0) {
+				p_fs->map_clu  = GET32_A(ep->start_clu);
+				map_size = (u32) GET64_A(ep->size);
+
+				p_fs->map_sectors = ((map_size-1) >> p_bd->sector_size_bits) + 1;
+
+				p_fs->vol_amap = (struct buffer_head **) kmalloc(sizeof(struct buffer_head *) * p_fs->map_sectors, GFP_KERNEL);
+				if (p_fs->vol_amap == NULL)
+					return FFS_MEMORYERR;
+
+				sector = START_SECTOR(p_fs->map_clu);
+
+				for (j = 0; j < p_fs->map_sectors; j++) {
+					p_fs->vol_amap[j] = NULL;
+					ret = sector_read(sb, sector+j, &(p_fs->vol_amap[j]), 1);
+					if (ret != FFS_SUCCESS) {
+						/*  release all buffers and free vol_amap */
+						i = 0;
+						while (i < j)
+							brelse(p_fs->vol_amap[i++]);
+
+						if (p_fs->vol_amap)
+							kfree(p_fs->vol_amap);
+						p_fs->vol_amap = NULL;
+						return ret;
+					}
+				}
+
+				p_fs->pbr_bh = NULL;
+				return FFS_SUCCESS;
+			}
+		}
+
+		if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+			return FFS_MEDIAERR;
+	}
+
+	return FFS_FORMATERR;
+} /* end of load_alloc_bitmap */
+
+void free_alloc_bitmap(struct super_block *sb)
+{
+	int i;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	brelse(p_fs->pbr_bh);
+
+	for (i = 0; i < p_fs->map_sectors; i++)
+		__brelse(p_fs->vol_amap[i]);
+
+	if (p_fs->vol_amap)
+		kfree(p_fs->vol_amap);
+	p_fs->vol_amap = NULL;
+} /* end of free_alloc_bitmap */
+
+s32 set_alloc_bitmap(struct super_block *sb, u32 clu)
+{
+	int i, b;
+	u32 sector;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	i = clu >> (p_bd->sector_size_bits + 3);
+	b = clu & ((p_bd->sector_size << 3) - 1);
+
+	sector = START_SECTOR(p_fs->map_clu) + i;
+
+	exfat_bitmap_set((u8 *) p_fs->vol_amap[i]->b_data, b);
+
+	return sector_write(sb, sector, p_fs->vol_amap[i], 0);
+} /* end of set_alloc_bitmap */
+
+s32 clr_alloc_bitmap(struct super_block *sb, u32 clu)
+{
+	int i, b;
+	u32 sector;
+#ifdef CONFIG_EXFAT_DISCARD
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	struct exfat_mount_options *opts = &sbi->options;
+	int ret;
+#endif /* CONFIG_EXFAT_DISCARD */
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	i = clu >> (p_bd->sector_size_bits + 3);
+	b = clu & ((p_bd->sector_size << 3) - 1);
+
+	sector = START_SECTOR(p_fs->map_clu) + i;
+
+	exfat_bitmap_clear((u8 *) p_fs->vol_amap[i]->b_data, b);
+
+#ifdef CONFIG_EXFAT_DISCARD
+	if (opts->discard) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2, 6, 37)
+		ret = sb_issue_discard(sb, START_SECTOR(clu), (1 << p_fs->sectors_per_clu_bits));
+#else
+		ret = sb_issue_discard(sb, START_SECTOR(clu), (1 << p_fs->sectors_per_clu_bits), GFP_NOFS, 0);
+#endif
+		if (ret == -EOPNOTSUPP) {
+			printk(KERN_WARNING "discard not supported by device, disabling\n");
+			opts->discard = 0;
+		}
+	}
+#endif /* CONFIG_EXFAT_DISCARD */
+
+	return sector_write(sb, sector, p_fs->vol_amap[i], 0);
+} /* end of clr_alloc_bitmap */
+
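+/* test_alloc_bitmap : search the allocation bitmap, starting from the given
+ * bit index, for the first free cluster; wraps around and returns
+ * CLUSTER_32(~0) if no free cluster exists */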
+u32 test_alloc_bitmap(struct super_block *sb, u32 clu)
+{
+	int i, map_i, map_b;
+	u32 clu_base, clu_free;
+	u8 k, clu_mask;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	clu_base = (clu & ~(0x7)) + 2;
+	clu_mask = (1 << (clu - clu_base + 2)) - 1;
+
+	map_i = clu >> (p_bd->sector_size_bits + 3);
+	map_b = (clu >> 3) & p_bd->sector_size_mask;
+
+	for (i = 2; i < p_fs->num_clusters; i += 8) {
+		k = *(((u8 *) p_fs->vol_amap[map_i]->b_data) + map_b);
+		if (clu_mask > 0) {
+			k |= clu_mask;
+			clu_mask = 0;
+		}
+		if (k < 0xFF) {
+			clu_free = clu_base + free_bit[k];
+			if (clu_free < p_fs->num_clusters)
+				return clu_free;
+		}
+		clu_base += 8;
+
+		if (((++map_b) >= p_bd->sector_size) || (clu_base >= p_fs->num_clusters)) {
+			if ((++map_i) >= p_fs->map_sectors) {
+				clu_base = 2;
+				map_i = 0;
+			}
+			map_b = 0;
+		}
+	}
+
+	return CLUSTER_32(~0);
+} /* end of test_alloc_bitmap */
+
+void sync_alloc_bitmap(struct super_block *sb)
+{
+	int i;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_fs->vol_amap == NULL)
+		return;
+
+	for (i = 0; i < p_fs->map_sectors; i++)
+		sync_dirty_buffer(p_fs->vol_amap[i]);
+} /* end of sync_alloc_bitmap */
+
+/*
+ *  Upcase table Management Functions
+ */
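+/* __load_upcase_table : read the on-disk (run-length compressed) up-case
+ * table; a value equal to the current index is an identity mapping, 0xFFFF
+ * starts a skip run, anything else is stored in the two-level table.
+ * The computed checksum is verified against utbl_checksum. */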
+s32 __load_upcase_table(struct super_block *sb, u32 sector, u32 num_sectors, u32 utbl_checksum)
+{
+	int i, ret = FFS_ERROR;
+	u32 j;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	struct buffer_head *tmp_bh = NULL;
+
+	u8	skip = FALSE;
+	u32	index = 0;
+	u16	uni = 0;
+	u16 **upcase_table;
+
+	u32 checksum = 0;
+
+	upcase_table = p_fs->vol_utbl = (u16 **) kmalloc(UTBL_COL_COUNT * sizeof(u16 *), GFP_KERNEL);
+	if (upcase_table == NULL)
+		return FFS_MEMORYERR;
+	memset(upcase_table, 0, UTBL_COL_COUNT * sizeof(u16 *));
+
+	num_sectors += sector;
+
+	while (sector < num_sectors) {
+		ret = sector_read(sb, sector, &tmp_bh, 1);
+		if (ret != FFS_SUCCESS) {
+			DPRINTK("sector read (0x%X) failed\n", sector);
+			goto error;
+		}
+		sector++;
+
+		for (i = 0; i < p_bd->sector_size && index <= 0xFFFF; i += 2) {
+			uni = GET16(((u8 *) tmp_bh->b_data)+i);
+
+			checksum = ((checksum & 1) ? 0x80000000 : 0) + (checksum >> 1) + *(((u8 *) tmp_bh->b_data)+i);
+			checksum = ((checksum & 1) ? 0x80000000 : 0) + (checksum >> 1) + *(((u8 *) tmp_bh->b_data)+(i+1));
+
+			if (skip) {
+				DPRINTK("skip from 0x%X ", index);
+				index += uni;
+				DPRINTK("to 0x%X (amount of 0x%X)\n", index, uni);
+				skip = FALSE;
+			} else if (uni == index)
+				index++;
+			else if (uni == 0xFFFF)
+				skip = TRUE;
+			else { /* uni != index , uni != 0xFFFF */
+				u16 col_index = get_col_index(index);
+
+				if (upcase_table[col_index] == NULL) {
+					DPRINTK("alloc = 0x%X\n", col_index);
+					upcase_table[col_index] = (u16 *) kmalloc(UTBL_ROW_COUNT * sizeof(u16), GFP_KERNEL);
+					if (upcase_table[col_index] == NULL) {
+						ret = FFS_MEMORYERR;
+						goto error;
+					}
+
+					for (j = 0; j < UTBL_ROW_COUNT; j++)
+						upcase_table[col_index][j] = (col_index << LOW_INDEX_BIT) | j;
+				}
+
+				upcase_table[col_index][get_row_index(index)] = uni;
+				index++;
+			}
+		}
+	}
+	if (index >= 0xFFFF && utbl_checksum == checksum) {
+		if (tmp_bh)
+			brelse(tmp_bh);
+		return FFS_SUCCESS;
+	}
+	ret = FFS_ERROR;
+error:
+	if (tmp_bh)
+		brelse(tmp_bh);
+	free_upcase_table(sb);
+	return ret;
+}
+
+s32 __load_default_upcase_table(struct super_block *sb)
+{
+	int i, ret = FFS_ERROR;
+	u32 j;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	u8	skip = FALSE;
+	u32	index = 0;
+	u16	uni = 0;
+	u16 **upcase_table;
+
+	upcase_table = p_fs->vol_utbl = (u16 **) kmalloc(UTBL_COL_COUNT * sizeof(u16 *), GFP_KERNEL);
+	if (upcase_table == NULL)
+		return FFS_MEMORYERR;
+	memset(upcase_table, 0, UTBL_COL_COUNT * sizeof(u16 *));
+
+	for (i = 0; index <= 0xFFFF && i < NUM_UPCASE*2; i += 2) {
+		uni = GET16(uni_upcase + i);
+		if (skip) {
+			DPRINTK("skip from 0x%X ", index);
+			index += uni;
+			DPRINTK("to 0x%X (amount of 0x%X)\n", index, uni);
+			skip = FALSE;
+		} else if (uni == index)
+			index++;
+		else if (uni == 0xFFFF)
+			skip = TRUE;
+		else { /* uni != index , uni != 0xFFFF */
+			u16 col_index = get_col_index(index);
+
+			if (upcase_table[col_index] == NULL) {
+				DPRINTK("alloc = 0x%X\n", col_index);
+				upcase_table[col_index] = (u16 *) kmalloc(UTBL_ROW_COUNT * sizeof(u16), GFP_KERNEL);
+				if (upcase_table[col_index] == NULL) {
+					ret = FFS_MEMORYERR;
+					goto error;
+				}
+
+				for (j = 0; j < UTBL_ROW_COUNT; j++)
+					upcase_table[col_index][j] = (col_index << LOW_INDEX_BIT) | j;
+			}
+
+			upcase_table[col_index][get_row_index(index)] = uni;
+			index++;
+		}
+	}
+
+	if (index >= 0xFFFF)
+		return FFS_SUCCESS;
+
+error:
+	/* FATAL error: default upcase table has error */
+	free_upcase_table(sb);
+	return ret;
+}
+
+s32 load_upcase_table(struct super_block *sb)
+{
+	int i;
+	u32 tbl_clu, tbl_size;
+	u32 type, sector, num_sectors;
+	CHAIN_T clu;
+	CASE_DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	clu.dir = p_fs->root_dir;
+	clu.flags = 0x01;
+
+	if (p_fs->dev_ejected)
+		return FFS_MEDIAERR;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		for (i = 0; i < p_fs->dentries_per_clu; i++) {
+			ep = (CASE_DENTRY_T *) get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return FFS_MEDIAERR;
+
+			type = p_fs->fs_func->get_entry_type((DENTRY_T *) ep);
+
+			if (type == TYPE_UNUSED)
+				break;
+			if (type != TYPE_UPCASE)
+				continue;
+
+			tbl_clu  = GET32_A(ep->start_clu);
+			tbl_size = (u32) GET64_A(ep->size);
+
+			sector = START_SECTOR(tbl_clu);
+			num_sectors = ((tbl_size-1) >> p_bd->sector_size_bits) + 1;
+			if (__load_upcase_table(sb, sector, num_sectors, GET32_A(ep->checksum)) != FFS_SUCCESS)
+				break;
+			else
+				return FFS_SUCCESS;
+		}
+		if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+			return FFS_MEDIAERR;
+	}
+	/* load default upcase table */
+	return __load_default_upcase_table(sb);
+} /* end of load_upcase_table */
+
+void free_upcase_table(struct super_block *sb)
+{
+	u32 i;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	u16 **upcase_table;
+
+	upcase_table = p_fs->vol_utbl;
+	for (i = 0; i < UTBL_COL_COUNT; i++) {
+		if (upcase_table[i])
+			kfree(upcase_table[i]);
+	}
+
+	if (p_fs->vol_utbl)
+		kfree(p_fs->vol_utbl);
+	p_fs->vol_utbl = NULL;
+} /* end of free_upcase_table */
+
+/*
+ *  Directory Entry Management Functions
+ */
+
+u32 fat_get_entry_type(DENTRY_T *p_entry)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+
+	if (*(ep->name) == 0x0)
+		return TYPE_UNUSED;
+
+	else if (*(ep->name) == 0xE5)
+		return TYPE_DELETED;
+
+	else if (ep->attr == ATTR_EXTEND)
+		return TYPE_EXTEND;
+
+	else if ((ep->attr & (ATTR_SUBDIR|ATTR_VOLUME)) == ATTR_VOLUME)
+		return TYPE_VOLUME;
+
+	else if ((ep->attr & (ATTR_SUBDIR|ATTR_VOLUME)) == ATTR_SUBDIR)
+		return TYPE_DIR;
+
+	return TYPE_FILE;
+} /* end of fat_get_entry_type */
+
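+/* exfat_get_entry_type : classify an exFAT dentry by its type code
+ * (0x81 bitmap, 0x82 up-case table, 0x83 volume label, 0x85 file/dir,
+ * 0xC0 stream extension, 0xC1 file name, ...) */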
+u32 exfat_get_entry_type(DENTRY_T *p_entry)
+{
+	FILE_DENTRY_T *ep = (FILE_DENTRY_T *) p_entry;
+
+	if (ep->type == 0x0) {
+		return TYPE_UNUSED;
+	} else if (ep->type < 0x80) {
+		return TYPE_DELETED;
+	} else if (ep->type == 0x80) {
+		return TYPE_INVALID;
+	} else if (ep->type < 0xA0) {
+		if (ep->type == 0x81) {
+			return TYPE_BITMAP;
+		} else if (ep->type == 0x82) {
+			return TYPE_UPCASE;
+		} else if (ep->type == 0x83) {
+			return TYPE_VOLUME;
+		} else if (ep->type == 0x85) {
+			if (GET16_A(ep->attr) & ATTR_SUBDIR)
+				return TYPE_DIR;
+			else
+				return TYPE_FILE;
+		}
+		return TYPE_CRITICAL_PRI;
+	} else if (ep->type < 0xC0) {
+		if (ep->type == 0xA0)
+			return TYPE_GUID;
+		else if (ep->type == 0xA1)
+			return TYPE_PADDING;
+		else if (ep->type == 0xA2)
+			return TYPE_ACLTAB;
+		return TYPE_BENIGN_PRI;
+	} else if (ep->type < 0xE0) {
+		if (ep->type == 0xC0)
+			return TYPE_STREAM;
+		else if (ep->type == 0xC1)
+			return TYPE_EXTEND;
+		else if (ep->type == 0xC2)
+			return TYPE_ACL;
+		return TYPE_CRITICAL_SEC;
+	}
+
+	return TYPE_BENIGN_SEC;
+} /* end of exfat_get_entry_type */
+
+void fat_set_entry_type(DENTRY_T *p_entry, u32 type)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+
+	if (type == TYPE_UNUSED)
+		*(ep->name) = 0x0;
+
+	else if (type == TYPE_DELETED)
+		*(ep->name) = 0xE5;
+
+	else if (type == TYPE_EXTEND)
+		ep->attr = ATTR_EXTEND;
+
+	else if (type == TYPE_DIR)
+		ep->attr = ATTR_SUBDIR;
+
+	else if (type == TYPE_FILE)
+		ep->attr = ATTR_ARCHIVE;
+
+	else if (type == TYPE_SYMLINK)
+		ep->attr = ATTR_ARCHIVE | ATTR_SYMLINK;
+} /* end of fat_set_entry_type */
+
+void exfat_set_entry_type(DENTRY_T *p_entry, u32 type)
+{
+	FILE_DENTRY_T *ep = (FILE_DENTRY_T *) p_entry;
+
+	if (type == TYPE_UNUSED) {
+		ep->type = 0x0;
+	} else if (type == TYPE_DELETED) {
+		ep->type &= ~0x80;
+	} else if (type == TYPE_STREAM) {
+		ep->type = 0xC0;
+	} else if (type == TYPE_EXTEND) {
+		ep->type = 0xC1;
+	} else if (type == TYPE_BITMAP) {
+		ep->type = 0x81;
+	} else if (type == TYPE_UPCASE) {
+		ep->type = 0x82;
+	} else if (type == TYPE_VOLUME) {
+		ep->type = 0x83;
+	} else if (type == TYPE_DIR) {
+		ep->type = 0x85;
+		SET16_A(ep->attr, ATTR_SUBDIR);
+	} else if (type == TYPE_FILE) {
+		ep->type = 0x85;
+		SET16_A(ep->attr, ATTR_ARCHIVE);
+	} else if (type == TYPE_SYMLINK) {
+		ep->type = 0x85;
+		SET16_A(ep->attr, ATTR_ARCHIVE | ATTR_SYMLINK);
+	}
+} /* end of exfat_set_entry_type */
+
+u32 fat_get_entry_attr(DENTRY_T *p_entry)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+	return (u32) ep->attr;
+} /* end of fat_get_entry_attr */
+
+u32 exfat_get_entry_attr(DENTRY_T *p_entry)
+{
+	FILE_DENTRY_T *ep = (FILE_DENTRY_T *) p_entry;
+	return (u32) GET16_A(ep->attr);
+} /* end of exfat_get_entry_attr */
+
+void fat_set_entry_attr(DENTRY_T *p_entry, u32 attr)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+	ep->attr = (u8) attr;
+} /* end of fat_set_entry_attr */
+
+void exfat_set_entry_attr(DENTRY_T *p_entry, u32 attr)
+{
+	FILE_DENTRY_T *ep = (FILE_DENTRY_T *) p_entry;
+	SET16_A(ep->attr, (u16) attr);
+} /* end of exfat_set_entry_attr */
+
+u8 fat_get_entry_flag(DENTRY_T *p_entry)
+{
+	return 0x01;
+} /* end of fat_get_entry_flag */
+
+u8 exfat_get_entry_flag(DENTRY_T *p_entry)
+{
+	STRM_DENTRY_T *ep = (STRM_DENTRY_T *) p_entry;
+	return ep->flags;
+} /* end of exfat_get_entry_flag */
+
+void fat_set_entry_flag(DENTRY_T *p_entry, u8 flags)
+{
+} /* end of fat_set_entry_flag */
+
+void exfat_set_entry_flag(DENTRY_T *p_entry, u8 flags)
+{
+	STRM_DENTRY_T *ep = (STRM_DENTRY_T *) p_entry;
+	ep->flags = flags;
+} /* end of exfat_set_entry_flag */
+
+u32 fat_get_entry_clu0(DENTRY_T *p_entry)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+	return ((u32) GET16_A(ep->start_clu_hi) << 16) | GET16_A(ep->start_clu_lo);
+} /* end of fat_get_entry_clu0 */
+
+u32 exfat_get_entry_clu0(DENTRY_T *p_entry)
+{
+	STRM_DENTRY_T *ep = (STRM_DENTRY_T *) p_entry;
+	return GET32_A(ep->start_clu);
+} /* end of exfat_get_entry_clu0 */
+
+void fat_set_entry_clu0(DENTRY_T *p_entry, u32 start_clu)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+	SET16_A(ep->start_clu_lo, CLUSTER_16(start_clu));
+	SET16_A(ep->start_clu_hi, CLUSTER_16(start_clu >> 16));
+} /* end of fat_set_entry_clu0 */
+
+void exfat_set_entry_clu0(DENTRY_T *p_entry, u32 start_clu)
+{
+	STRM_DENTRY_T *ep = (STRM_DENTRY_T *) p_entry;
+	SET32_A(ep->start_clu, start_clu);
+} /* end of exfat_set_entry_clu0 */
+
+u64 fat_get_entry_size(DENTRY_T *p_entry)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+	return (u64) GET32_A(ep->size);
+} /* end of fat_get_entry_size */
+
+u64 exfat_get_entry_size(DENTRY_T *p_entry)
+{
+	STRM_DENTRY_T *ep = (STRM_DENTRY_T *) p_entry;
+	return GET64_A(ep->valid_size);
+} /* end of exfat_get_entry_size */
+
+void fat_set_entry_size(DENTRY_T *p_entry, u64 size)
+{
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+	SET32_A(ep->size, (u32) size);
+} /* end of fat_set_entry_size */
+
+void exfat_set_entry_size(DENTRY_T *p_entry, u64 size)
+{
+	STRM_DENTRY_T *ep = (STRM_DENTRY_T *) p_entry;
+	SET64_A(ep->valid_size, size);
+	SET64_A(ep->size, size);
+} /* end of exfat_set_entry_size */
+
+void fat_get_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode)
+{
+	u16 t = 0x00, d = 0x21;
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+
+	switch (mode) {
+	case TM_CREATE:
+		t = GET16_A(ep->create_time);
+		d = GET16_A(ep->create_date);
+		break;
+	case TM_MODIFY:
+		t = GET16_A(ep->modify_time);
+		d = GET16_A(ep->modify_date);
+		break;
+	}
+
+	tp->sec  = (t & 0x001F) << 1;
+	tp->min  = (t >> 5) & 0x003F;
+	tp->hour = (t >> 11);
+	tp->day  = (d & 0x001F);
+	tp->mon  = (d >> 5) & 0x000F;
+	tp->year = (d >> 9);
+} /* end of fat_get_entry_time */
+
+void exfat_get_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode)
+{
+	u16 t = 0x00, d = 0x21;
+	FILE_DENTRY_T *ep = (FILE_DENTRY_T *) p_entry;
+
+	switch (mode) {
+	case TM_CREATE:
+		t = GET16_A(ep->create_time);
+		d = GET16_A(ep->create_date);
+		break;
+	case TM_MODIFY:
+		t = GET16_A(ep->modify_time);
+		d = GET16_A(ep->modify_date);
+		break;
+	case TM_ACCESS:
+		t = GET16_A(ep->access_time);
+		d = GET16_A(ep->access_date);
+		break;
+	}
+
+	tp->sec  = (t & 0x001F) << 1;
+	tp->min  = (t >> 5) & 0x003F;
+	tp->hour = (t >> 11);
+	tp->day  = (d & 0x001F);
+	tp->mon  = (d >> 5) & 0x000F;
+	tp->year = (d >> 9);
+} /* end of exfat_get_entry_time */
+
+void fat_set_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode)
+{
+	u16 t, d;
+	DOS_DENTRY_T *ep = (DOS_DENTRY_T *) p_entry;
+
+	t = (tp->hour << 11) | (tp->min << 5) | (tp->sec >> 1);
+	d = (tp->year <<  9) | (tp->mon << 5) |  tp->day;
+
+	switch (mode) {
+	case TM_CREATE:
+		SET16_A(ep->create_time, t);
+		SET16_A(ep->create_date, d);
+		break;
+	case TM_MODIFY:
+		SET16_A(ep->modify_time, t);
+		SET16_A(ep->modify_date, d);
+		break;
+	}
+} /* end of fat_set_entry_time */
+
+void exfat_set_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode)
+{
+	u16 t, d;
+	FILE_DENTRY_T *ep = (FILE_DENTRY_T *) p_entry;
+
+	t = (tp->hour << 11) | (tp->min << 5) | (tp->sec >> 1);
+	d = (tp->year <<  9) | (tp->mon << 5) |  tp->day;
+
+	switch (mode) {
+	case TM_CREATE:
+		SET16_A(ep->create_time, t);
+		SET16_A(ep->create_date, d);
+		break;
+	case TM_MODIFY:
+		SET16_A(ep->modify_time, t);
+		SET16_A(ep->modify_date, d);
+		break;
+	case TM_ACCESS:
+		SET16_A(ep->access_time, t);
+		SET16_A(ep->access_date, d);
+		break;
+	}
+} /* end of exfat_set_entry_time */
+
+s32 fat_init_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type,
+						 u32 start_clu, u64 size)
+{
+	u32 sector;
+	DOS_DENTRY_T *dos_ep;
+
+	dos_ep = (DOS_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, &sector);
+	if (!dos_ep)
+		return FFS_MEDIAERR;
+
+	init_dos_entry(dos_ep, type, start_clu);
+	buf_modify(sb, sector);
+
+	return FFS_SUCCESS;
+} /* end of fat_init_dir_entry */
+
+s32 exfat_init_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type,
+						   u32 start_clu, u64 size)
+{
+	u32 sector;
+	u8 flags;
+	FILE_DENTRY_T *file_ep;
+	STRM_DENTRY_T *strm_ep;
+
+	flags = (type == TYPE_FILE) ? 0x01 : 0x03;
+
+	/* we cannot use get_entry_set_in_dir here because file ep is not initialized yet */
+	file_ep = (FILE_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, &sector);
+	if (!file_ep)
+		return FFS_MEDIAERR;
+
+	strm_ep = (STRM_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry+1, &sector);
+	if (!strm_ep)
+		return FFS_MEDIAERR;
+
+	init_file_entry(file_ep, type);
+	buf_modify(sb, sector);
+
+	init_strm_entry(strm_ep, flags, start_clu, size);
+	buf_modify(sb, sector);
+
+	return FFS_SUCCESS;
+} /* end of exfat_init_dir_entry */
+
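+/* fat_init_ext_entry : write the short (DOS) name entry and its LFN slots;
+ * the LFN slots precede the short entry in reverse order, each carrying the
+ * short-name checksum, and the last slot is marked with the 0x40 flag */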
+s32 fat_init_ext_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 num_entries,
+						 UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname)
+{
+	int i;
+	u32 sector;
+	u8 chksum;
+	u16 *uniname = p_uniname->name;
+	DOS_DENTRY_T *dos_ep;
+	EXT_DENTRY_T *ext_ep;
+
+	dos_ep = (DOS_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, &sector);
+	if (!dos_ep)
+		return FFS_MEDIAERR;
+
+	dos_ep->lcase = p_dosname->name_case;
+	memcpy(dos_ep->name, p_dosname->name, DOS_NAME_LENGTH);
+	buf_modify(sb, sector);
+
+	if ((--num_entries) > 0) {
+		chksum = calc_checksum_1byte((void *) dos_ep->name, DOS_NAME_LENGTH, 0);
+
+		for (i = 1; i < num_entries; i++) {
+			ext_ep = (EXT_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry-i, &sector);
+			if (!ext_ep)
+				return FFS_MEDIAERR;
+
+			init_ext_entry(ext_ep, i, chksum, uniname);
+			buf_modify(sb, sector);
+			uniname += 13;
+		}
+
+		ext_ep = (EXT_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry-i, &sector);
+		if (!ext_ep)
+			return FFS_MEDIAERR;
+
+		init_ext_entry(ext_ep, i+0x40, chksum, uniname);
+		buf_modify(sb, sector);
+	}
+
+	return FFS_SUCCESS;
+} /* end of fat_init_ext_entry */
+
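+/* exfat_init_ext_entry : fill in the secondary-entry count, the stream
+ * entry's name length and hash, and the name entries (15 characters each),
+ * then update the entry-set checksum */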
+s32 exfat_init_ext_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 num_entries,
+						   UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname)
+{
+	int i;
+	u32 sector;
+	u16 *uniname = p_uniname->name;
+	FILE_DENTRY_T *file_ep;
+	STRM_DENTRY_T *strm_ep;
+	NAME_DENTRY_T *name_ep;
+
+	file_ep = (FILE_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, &sector);
+	if (!file_ep)
+		return FFS_MEDIAERR;
+
+	file_ep->num_ext = (u8)(num_entries - 1);
+	buf_modify(sb, sector);
+
+	strm_ep = (STRM_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry+1, &sector);
+	if (!strm_ep)
+		return FFS_MEDIAERR;
+
+	strm_ep->name_len = p_uniname->name_len;
+	SET16_A(strm_ep->name_hash, p_uniname->name_hash);
+	buf_modify(sb, sector);
+
+	for (i = 2; i < num_entries; i++) {
+		name_ep = (NAME_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry+i, &sector);
+		if (!name_ep)
+			return FFS_MEDIAERR;
+
+		init_name_entry(name_ep, uniname);
+		buf_modify(sb, sector);
+		uniname += 15;
+	}
+
+	update_dir_checksum(sb, p_dir, entry);
+
+	return FFS_SUCCESS;
+} /* end of exfat_init_ext_entry */
+
+void init_dos_entry(DOS_DENTRY_T *ep, u32 type, u32 start_clu)
+{
+	TIMESTAMP_T tm, *tp;
+
+	fat_set_entry_type((DENTRY_T *) ep, type);
+	SET16_A(ep->start_clu_lo, CLUSTER_16(start_clu));
+	SET16_A(ep->start_clu_hi, CLUSTER_16(start_clu >> 16));
+	SET32_A(ep->size, 0);
+
+	tp = tm_current(&tm);
+	fat_set_entry_time((DENTRY_T *) ep, tp, TM_CREATE);
+	fat_set_entry_time((DENTRY_T *) ep, tp, TM_MODIFY);
+	SET16_A(ep->access_date, 0);
+	ep->create_time_ms = 0;
+} /* end of init_dos_entry */
+
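+/* init_ext_entry : fill one LFN slot with up to 13 UTF-16 characters split
+ * across its three name fields, padding with 0xFFFF after the terminator */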
+void init_ext_entry(EXT_DENTRY_T *ep, s32 order, u8 chksum, u16 *uniname)
+{
+	int i;
+	u8 end = FALSE;
+
+	fat_set_entry_type((DENTRY_T *) ep, TYPE_EXTEND);
+	ep->order = (u8) order;
+	ep->sysid = 0;
+	ep->checksum = chksum;
+	SET16_A(ep->start_clu, 0);
+
+	for (i = 0; i < 10; i += 2) {
+		if (!end) {
+			SET16(ep->unicode_0_4+i, *uniname);
+			if (*uniname == 0x0)
+				end = TRUE;
+			else
+				uniname++;
+		} else {
+			SET16(ep->unicode_0_4+i, 0xFFFF);
+		}
+	}
+
+	for (i = 0; i < 12; i += 2) {
+		if (!end) {
+			SET16_A(ep->unicode_5_10+i, *uniname);
+			if (*uniname == 0x0)
+				end = TRUE;
+			else
+				uniname++;
+		} else {
+			SET16_A(ep->unicode_5_10+i, 0xFFFF);
+		}
+	}
+
+	for (i = 0; i < 4; i += 2) {
+		if (!end) {
+			SET16_A(ep->unicode_11_12+i, *uniname);
+			if (*uniname == 0x0)
+				end = TRUE;
+			else
+				uniname++;
+		} else {
+			SET16_A(ep->unicode_11_12+i, 0xFFFF);
+		}
+	}
+} /* end of init_ext_entry */
+
+void init_file_entry(FILE_DENTRY_T *ep, u32 type)
+{
+	TIMESTAMP_T tm, *tp;
+
+	exfat_set_entry_type((DENTRY_T *) ep, type);
+
+	tp = tm_current(&tm);
+	exfat_set_entry_time((DENTRY_T *) ep, tp, TM_CREATE);
+	exfat_set_entry_time((DENTRY_T *) ep, tp, TM_MODIFY);
+	exfat_set_entry_time((DENTRY_T *) ep, tp, TM_ACCESS);
+	ep->create_time_ms = 0;
+	ep->modify_time_ms = 0;
+	ep->access_time_ms = 0;
+} /* end of init_file_entry */
+
+void init_strm_entry(STRM_DENTRY_T *ep, u8 flags, u32 start_clu, u64 size)
+{
+	exfat_set_entry_type((DENTRY_T *) ep, TYPE_STREAM);
+	ep->flags = flags;
+	SET32_A(ep->start_clu, start_clu);
+	SET64_A(ep->valid_size, size);
+	SET64_A(ep->size, size);
+} /* end of init_strm_entry */
+
+void init_name_entry(NAME_DENTRY_T *ep, u16 *uniname)
+{
+	int i;
+
+	exfat_set_entry_type((DENTRY_T *) ep, TYPE_EXTEND);
+	ep->flags = 0x0;
+
+	for (i = 0; i < 30; i += 2) {
+		SET16_A(ep->unicode_0_14+i, *uniname);
+		if (*uniname == 0x0)
+			break;
+		uniname++;
+	}
+} /* end of init_name_entry */
+
+void fat_delete_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 order, s32 num_entries)
+{
+	int i;
+	u32 sector;
+	DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	for (i = num_entries-1; i >= order; i--) {
+		ep = get_entry_in_dir(sb, p_dir, entry-i, &sector);
+		if (!ep)
+			return;
+
+		p_fs->fs_func->set_entry_type(ep, TYPE_DELETED);
+		buf_modify(sb, sector);
+	}
+} /* end of fat_delete_dir_entry */
+
+void exfat_delete_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 order, s32 num_entries)
+{
+	int i;
+	u32 sector;
+	DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	for (i = order; i < num_entries; i++) {
+		ep = get_entry_in_dir(sb, p_dir, entry+i, &sector);
+		if (!ep)
+			return;
+
+		p_fs->fs_func->set_entry_type(ep, TYPE_DELETED);
+		buf_modify(sb, sector);
+	}
+} /* end of exfat_delete_dir_entry */
+
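+/* update_dir_checksum : recompute the entry-set checksum over the file entry
+ * and all of its secondary entries and store it in the file entry */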
+void update_dir_checksum(struct super_block *sb, CHAIN_T *p_dir, s32 entry)
+{
+	int i, num_entries;
+	u32 sector;
+	u16 chksum;
+	FILE_DENTRY_T *file_ep;
+	DENTRY_T *ep;
+
+	file_ep = (FILE_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, &sector);
+	if (!file_ep)
+		return;
+
+	buf_lock(sb, sector);
+
+	num_entries = (s32) file_ep->num_ext + 1;
+	chksum = calc_checksum_2byte((void *) file_ep, DENTRY_SIZE, 0, CS_DIR_ENTRY);
+
+	for (i = 1; i < num_entries; i++) {
+		ep = get_entry_in_dir(sb, p_dir, entry+i, NULL);
+		if (!ep) {
+			buf_unlock(sb, sector);
+			return;
+		}
+
+		chksum = calc_checksum_2byte((void *) ep, DENTRY_SIZE, chksum, CS_DEFAULT);
+	}
+
+	SET16_A(file_ep->checksum, chksum);
+	buf_modify(sb, sector);
+	buf_unlock(sb, sector);
+} /* end of update_dir_checksum */
+
+void update_dir_checksum_with_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es)
+{
+	DENTRY_T *ep;
+	u16 chksum = 0;
+	s32 chksum_type = CS_DIR_ENTRY, i;
+
+	ep = (DENTRY_T *)&(es->__buf);
+	for (i = 0; i < es->num_entries; i++) {
+		DPRINTK("update_dir_checksum_with_entry_set ep %p\n", ep);
+		chksum = calc_checksum_2byte((void *) ep, DENTRY_SIZE, chksum, chksum_type);
+		ep++;
+		chksum_type = CS_DEFAULT;
+	}
+
+	ep = (DENTRY_T *)&(es->__buf);
+	SET16_A(((FILE_DENTRY_T *)ep)->checksum, chksum);
+	write_whole_entry_set(sb, es);
+}
+
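+/* _walk_fat_chain : translate a byte offset within a directory into the
+ * cluster containing it, either by arithmetic for contiguous chains (0x03)
+ * or by following the FAT */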
+static s32 _walk_fat_chain(struct super_block *sb, CHAIN_T *p_dir, s32 byte_offset, u32 *clu)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	s32 clu_offset;
+	u32 cur_clu;
+
+	clu_offset = byte_offset >> p_fs->cluster_size_bits;
+	cur_clu = p_dir->dir;
+
+	if (p_dir->flags == 0x03) {
+		cur_clu += clu_offset;
+	} else {
+		while (clu_offset > 0) {
+			if (FAT_read(sb, cur_clu, &cur_clu) == -1)
+				return FFS_MEDIAERR;
+			clu_offset--;
+		}
+	}
+
+	if (clu)
+		*clu = cur_clu;
+	return FFS_SUCCESS;
+}
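+
+/* find_location : translate a dentry index into the sector number and byte
+ * offset of that entry, treating the FAT16 root directory specially */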
+s32 find_location(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 *sector, s32 *offset)
+{
+	s32 off, ret;
+	u32 clu = 0;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	off = entry << DENTRY_SIZE_BITS;
+
+	if (p_dir->dir == CLUSTER_32(0)) { /* FAT16 root_dir */
+		*offset = off & p_bd->sector_size_mask;
+		*sector = off >> p_bd->sector_size_bits;
+		*sector += p_fs->root_start_sector;
+	} else {
+		ret = _walk_fat_chain(sb, p_dir, off, &clu);
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		off &= p_fs->cluster_size - 1;	/* byte offset in cluster */
+
+		*offset = off & p_bd->sector_size_mask;	/* byte offset in sector    */
+		*sector = off >> p_bd->sector_size_bits;	/* sector offset in cluster */
+		*sector += START_SECTOR(clu);
+	}
+	return FFS_SUCCESS;
+} /* end of find_location */
+
+DENTRY_T *get_entry_with_sector(struct super_block *sb, u32 sector, s32 offset)
+{
+	u8 *buf;
+
+	buf = buf_getblk(sb, sector);
+
+	if (buf == NULL)
+		return NULL;
+
+	return (DENTRY_T *)(buf + offset);
+} /* end of get_entry_with_sector */
+
+DENTRY_T *get_entry_in_dir(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 *sector)
+{
+	s32 off;
+	u32 sec;
+	u8 *buf;
+
+	if (find_location(sb, p_dir, entry, &sec, &off) != FFS_SUCCESS)
+		return NULL;
+
+	buf = buf_getblk(sb, sec);
+
+	if (buf == NULL)
+		return NULL;
+
+	if (sector != NULL)
+		*sector = sec;
+	return (DENTRY_T *)(buf + off);
+} /* end of get_entry_in_dir */
+
+
+/* returns a set of dentries for a file or dir.
+ * Note that this is a copy (dump) of dentries, so the caller must call write_entry_set()
+ * to apply any changes made to this entry set back to the real device.
+ * in:
+ *   sb+p_dir+entry: indicates a file/dir
+ *   type:  specifies how many dentries should be included.
+ * out:
+ *   file_ep: will point to the first dentry (= file dentry) on success
+ * return:
+ *   pointer to the entry set on success,
+ *   NULL on failure.
+ */
+
+#define ES_MODE_STARTED				0
+#define ES_MODE_GET_FILE_ENTRY			1
+#define ES_MODE_GET_STRM_ENTRY			2
+#define ES_MODE_GET_NAME_ENTRY			3
+#define ES_MODE_GET_CRITICAL_SEC_ENTRY		4
+ENTRY_SET_CACHE_T *get_entry_set_in_dir(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type, DENTRY_T **file_ep)
+{
+	s32 off, ret, byte_offset;
+	u32 clu = 0;
+	u32 sec, entry_type;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	ENTRY_SET_CACHE_T *es = NULL;
+	DENTRY_T *ep, *pos;
+	u8 *buf;
+	u8 num_entries;
+	s32 mode = ES_MODE_STARTED;
+
+	DPRINTK("get_entry_set_in_dir entered\n");
+	DPRINTK("p_dir dir %u flags %x size %d\n", p_dir->dir, p_dir->flags, p_dir->size);
+
+	byte_offset = entry << DENTRY_SIZE_BITS;
+	ret = _walk_fat_chain(sb, p_dir, byte_offset, &clu);
+	if (ret != FFS_SUCCESS)
+		return NULL;
+
+
+	byte_offset &= p_fs->cluster_size - 1;	/* byte offset in cluster */
+
+	off = byte_offset & p_bd->sector_size_mask;	/* byte offset in sector    */
+	sec = byte_offset >> p_bd->sector_size_bits;	/* sector offset in cluster */
+	sec += START_SECTOR(clu);
+
+	buf = buf_getblk(sb, sec);
+	if (buf == NULL)
+		goto err_out;
+
+
+	ep = (DENTRY_T *)(buf + off);
+	entry_type = p_fs->fs_func->get_entry_type(ep);
+
+	if ((entry_type != TYPE_FILE)
+		&& (entry_type != TYPE_DIR))
+		goto err_out;
+
+	if (type == ES_ALL_ENTRIES)
+		num_entries = ((FILE_DENTRY_T *)ep)->num_ext+1;
+	else
+		num_entries = type;
+
+	DPRINTK("trying to kmalloc %zx bytes for %d entries\n", offsetof(ENTRY_SET_CACHE_T, __buf) + (num_entries)  * sizeof(DENTRY_T), num_entries);
+	es = kmalloc(offsetof(ENTRY_SET_CACHE_T, __buf) + (num_entries)  * sizeof(DENTRY_T), GFP_KERNEL);
+	if (es == NULL)
+		goto err_out;
+
+	es->num_entries = num_entries;
+	es->sector = sec;
+	es->offset = off;
+	es->alloc_flag = p_dir->flags;
+
+	pos = (DENTRY_T *) &(es->__buf);
+
+	while (num_entries) {
+		/* instead of copying the whole sector, we check every entry.
+		 * this provides a minimum of stability and consistency.
+		 */
+
+		entry_type = p_fs->fs_func->get_entry_type(ep);
+
+		if ((entry_type == TYPE_UNUSED) || (entry_type == TYPE_DELETED))
+			goto err_out;
+
+		switch (mode) {
+		case ES_MODE_STARTED:
+			if  ((entry_type == TYPE_FILE) || (entry_type == TYPE_DIR))
+				mode = ES_MODE_GET_FILE_ENTRY;
+			else
+				goto err_out;
+			break;
+		case ES_MODE_GET_FILE_ENTRY:
+			if (entry_type == TYPE_STREAM)
+				mode = ES_MODE_GET_STRM_ENTRY;
+			else
+				goto err_out;
+			break;
+		case ES_MODE_GET_STRM_ENTRY:
+			if (entry_type == TYPE_EXTEND)
+				mode = ES_MODE_GET_NAME_ENTRY;
+			else
+				goto err_out;
+			break;
+		case ES_MODE_GET_NAME_ENTRY:
+			if (entry_type == TYPE_EXTEND)
+				break;
+			else if (entry_type == TYPE_STREAM)
+				goto err_out;
+			else if (entry_type & TYPE_CRITICAL_SEC)
+				mode = ES_MODE_GET_CRITICAL_SEC_ENTRY;
+			else
+				goto err_out;
+			break;
+		case ES_MODE_GET_CRITICAL_SEC_ENTRY:
+			if ((entry_type == TYPE_EXTEND) || (entry_type == TYPE_STREAM))
+				goto err_out;
+			else if ((entry_type & TYPE_CRITICAL_SEC) != TYPE_CRITICAL_SEC)
+				goto err_out;
+			break;
+		}
+
+		memcpy(pos, ep, sizeof(DENTRY_T));
+
+		if (--num_entries == 0)
+			break;
+
+		if (((off + DENTRY_SIZE) & p_bd->sector_size_mask) < (off &  p_bd->sector_size_mask)) {
+			/* get the next sector */
+			if (IS_LAST_SECTOR_IN_CLUSTER(sec)) {
+				if (es->alloc_flag == 0x03) {
+					clu++;
+				} else {
+					if (FAT_read(sb, clu, &clu) == -1)
+						goto err_out;
+				}
+				sec = START_SECTOR(clu);
+			} else {
+				sec++;
+			}
+			buf = buf_getblk(sb, sec);
+			if (buf == NULL)
+				goto err_out;
+			off = 0;
+			ep = (DENTRY_T *)(buf);
+		} else {
+			ep++;
+			off += DENTRY_SIZE;
+		}
+		pos++;
+	}
+
+	if (file_ep)
+		*file_ep = (DENTRY_T *)&(es->__buf);
+
+	DPRINTK("es sec %u offset %d flags %d, num_entries %u buf ptr %p\n",
+		   es->sector, es->offset, es->alloc_flag, es->num_entries, &(es->__buf));
+	DPRINTK("get_entry_set_in_dir exited %p\n", es);
+	return es;
+err_out:
+	DPRINTK("get_entry_set_in_dir exited NULL (es %p)\n", es);
+	kfree(es);
+	return NULL;
+}
+
+void release_entry_set(ENTRY_SET_CACHE_T *es)
+{
+	DPRINTK("release_entry_set %p\n", es);
+	kfree(es);
+}
+
+static s32 __write_partial_entries_in_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es, u32 sec, s32 off, u32 count)
+{
+	s32 num_entries, buf_off = (off - es->offset);
+	u32 remaining_byte_in_sector, copy_entries;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	u32 clu;
+	u8 *buf, *esbuf = (u8 *)&(es->__buf);
+
+	DPRINTK("__write_partial_entries_in_entry_set entered\n");
+	DPRINTK("es %p sec %u off %d count %d\n", es, sec, off, count);
+	num_entries = count;
+
+	while (num_entries) {
+		/* write on a per-sector basis */
+		remaining_byte_in_sector = (1 << p_bd->sector_size_bits) - off;
+		copy_entries = MIN(remaining_byte_in_sector >> DENTRY_SIZE_BITS, num_entries);
+		buf = buf_getblk(sb, sec);
+		if (buf == NULL)
+			goto err_out;
+		DPRINTK("es->buf %p buf_off %u\n", esbuf, buf_off);
+		DPRINTK("copying %d entries from %p to sector %u\n", copy_entries, (esbuf + buf_off), sec);
+		memcpy(buf + off, esbuf + buf_off, copy_entries << DENTRY_SIZE_BITS);
+		buf_modify(sb, sec);
+		num_entries -= copy_entries;
+
+		if (num_entries) {
+			/* get next sector */
+			if (IS_LAST_SECTOR_IN_CLUSTER(sec)) {
+				clu = GET_CLUSTER_FROM_SECTOR(sec);
+				if (es->alloc_flag == 0x03) {
+					clu++;
+				} else {
+					if (FAT_read(sb, clu, &clu) == -1)
+						goto err_out;
+				}
+				sec = START_SECTOR(clu);
+			} else {
+				sec++;
+			}
+			off = 0;
+			buf_off += copy_entries << DENTRY_SIZE_BITS;
+		}
+	}
+
+	DPRINTK("__write_partial_entries_in_entry_set exited successfully\n");
+	return FFS_SUCCESS;
+err_out:
+	DPRINTK("__write_partial_entries_in_entry_set failed\n");
+	return FFS_ERROR;
+}
+
+/* write back all entries in entry set */
+s32 write_whole_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es)
+{
+	return __write_partial_entries_in_entry_set(sb, es, es->sector, es->offset, es->num_entries);
+}
+
+/* write back some entries in entry set */
+s32 write_partial_entries_in_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es, DENTRY_T *ep, u32 count)
+{
+	s32 ret, byte_offset, off;
+	u32 clu = 0, sec;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	CHAIN_T dir;
+
+	/* validity check */
+	if (ep + count > ((DENTRY_T *)&(es->__buf)) + es->num_entries)
+		return FFS_ERROR;
+
+	dir.dir = GET_CLUSTER_FROM_SECTOR(es->sector);
+	dir.flags = es->alloc_flag;
+	dir.size = 0xffffffff;		/* XXX */
+
+	byte_offset = (es->sector - START_SECTOR(dir.dir)) << p_bd->sector_size_bits;
+	byte_offset += ((u8 *)ep - (u8 *)&(es->__buf)) + es->offset;
+
+	ret = _walk_fat_chain(sb, &dir, byte_offset, &clu);
+	if (ret != FFS_SUCCESS)
+		return ret;
+	byte_offset &= p_fs->cluster_size - 1;	/* byte offset in cluster */
+	off = byte_offset & p_bd->sector_size_mask;	/* byte offset in sector    */
+	sec = byte_offset >> p_bd->sector_size_bits;	/* sector offset in cluster */
+	sec += START_SECTOR(clu);
+	return __write_partial_entries_in_entry_set(sb, es, sec, off, count);
+}
+
+/* search EMPTY CONTINUOUS "num_entries" entries */
+s32 search_deleted_or_unused_entry(struct super_block *sb, CHAIN_T *p_dir, s32 num_entries)
+{
+	int i, dentry, num_empty = 0;
+	s32 dentries_per_clu;
+	u32 type;
+	CHAIN_T clu;
+	DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+	else
+		dentries_per_clu = p_fs->dentries_per_clu;
+
+	if (p_fs->hint_uentry.dir == p_dir->dir) {
+		if (p_fs->hint_uentry.entry == -1)
+			return -1;
+
+		clu.dir = p_fs->hint_uentry.clu.dir;
+		clu.size = p_fs->hint_uentry.clu.size;
+		clu.flags = p_fs->hint_uentry.clu.flags;
+
+		dentry = p_fs->hint_uentry.entry;
+	} else {
+		p_fs->hint_uentry.entry = -1;
+
+		clu.dir = p_dir->dir;
+		clu.size = p_dir->size;
+		clu.flags = p_dir->flags;
+
+		dentry = 0;
+	}
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+			i = dentry % dentries_per_clu;
+		else
+			i = dentry & (dentries_per_clu-1);
+
+		for (; i < dentries_per_clu; i++, dentry++) {
+			ep = get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return -1;
+
+			type = p_fs->fs_func->get_entry_type(ep);
+
+			if (type == TYPE_UNUSED) {
+				num_empty++;
+				if (p_fs->hint_uentry.entry == -1) {
+					p_fs->hint_uentry.dir = p_dir->dir;
+					p_fs->hint_uentry.entry = dentry;
+
+					p_fs->hint_uentry.clu.dir = clu.dir;
+					p_fs->hint_uentry.clu.size = clu.size;
+					p_fs->hint_uentry.clu.flags = clu.flags;
+				}
+			} else if (type == TYPE_DELETED) {
+				num_empty++;
+			} else {
+				num_empty = 0;
+			}
+
+			if (num_empty >= num_entries) {
+				p_fs->hint_uentry.dir = CLUSTER_32(~0);
+				p_fs->hint_uentry.entry = -1;
+
+				if (p_fs->vol_type == EXFAT)
+					return dentry - (num_entries-1);
+				else
+					return dentry;
+			}
+		}
+
+		if (p_dir->dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (clu.flags == 0x03) {
+			if ((--clu.size) > 0)
+				clu.dir++;
+			else
+				clu.dir = CLUSTER_32(~0);
+		} else {
+			if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+				return -1;
+		}
+	}
+
+	return -1;
+} /* end of search_deleted_or_unused_entry */
+
+s32 find_empty_entry(struct inode *inode, CHAIN_T *p_dir, s32 num_entries)
+{
+	s32 ret, dentry;
+	u32 last_clu, sector;
+	u64 size = 0;
+	CHAIN_T clu;
+	DENTRY_T *ep = NULL;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		return search_deleted_or_unused_entry(sb, p_dir, num_entries);
+
+	while ((dentry = search_deleted_or_unused_entry(sb, p_dir, num_entries)) < 0) {
+		if (p_fs->dev_ejected)
+			break;
+
+		if (p_fs->vol_type == EXFAT) {
+			if (p_dir->dir != p_fs->root_dir)
+				size = i_size_read(inode);
+		}
+
+		last_clu = find_last_cluster(sb, p_dir);
+		clu.dir = last_clu + 1;
+		clu.size = 0;
+		clu.flags = p_dir->flags;
+
+		/* (1) allocate a cluster */
+		ret = p_fs->fs_func->alloc_cluster(sb, 1, &clu);
+		if (ret < 1)
+			return -1;
+
+		if (clear_cluster(sb, clu.dir) != FFS_SUCCESS)
+			return -1;
+
+		/* (2) append to the FAT chain */
+		if (clu.flags != p_dir->flags) {
+			exfat_chain_cont_cluster(sb, p_dir->dir, p_dir->size);
+			p_dir->flags = 0x01;
+			p_fs->hint_uentry.clu.flags = 0x01;
+		}
+		if (clu.flags == 0x01)
+			if (FAT_write(sb, last_clu, clu.dir) < 0)
+				return -1;
+
+		if (p_fs->hint_uentry.entry == -1) {
+			p_fs->hint_uentry.dir = p_dir->dir;
+			p_fs->hint_uentry.entry = p_dir->size << (p_fs->cluster_size_bits - DENTRY_SIZE_BITS);
+
+			p_fs->hint_uentry.clu.dir = clu.dir;
+			p_fs->hint_uentry.clu.size = 0;
+			p_fs->hint_uentry.clu.flags = clu.flags;
+		}
+		p_fs->hint_uentry.clu.size++;
+		p_dir->size++;
+
+		/* (3) update the directory entry */
+		if (p_fs->vol_type == EXFAT) {
+			if (p_dir->dir != p_fs->root_dir) {
+				size += p_fs->cluster_size;
+
+				ep = get_entry_in_dir(sb, &(fid->dir), fid->entry+1, &sector);
+				if (!ep)
+					return -1;
+				p_fs->fs_func->set_entry_size(ep, size);
+				p_fs->fs_func->set_entry_flag(ep, p_dir->flags);
+				buf_modify(sb, sector);
+
+				update_dir_checksum(sb, &(fid->dir), fid->entry);
+			}
+		}
+
+		i_size_write(inode, i_size_read(inode)+p_fs->cluster_size);
+		EXFAT_I(inode)->mmu_private += p_fs->cluster_size;
+		EXFAT_I(inode)->fid.size += p_fs->cluster_size;
+		EXFAT_I(inode)->fid.flags = p_dir->flags;
+		inode->i_blocks += 1 << (p_fs->cluster_size_bits - 9);
+	}
+
+	return dentry;
+} /* end of find_empty_entry */
+
+/* return values of fat_find_dir_entry()
+   >= 0 : return dir entry position with the name in dir
+   -1 : (root dir, ".") it is the root dir itself
+   -2 : entry with the name does not exist */
+s32 fat_find_dir_entry(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 num_entries, DOS_NAME_T *p_dosname, u32 type)
+{
+	int i, dentry = 0, lossy = FALSE, len;
+	s32 order = 0, is_feasible_entry = TRUE, has_ext_entry = FALSE;
+	s32 dentries_per_clu;
+	u32 entry_type;
+	u16 entry_uniname[14], *uniname = NULL, unichar;
+	CHAIN_T clu;
+	DENTRY_T *ep;
+	DOS_DENTRY_T *dos_ep;
+	EXT_DENTRY_T *ext_ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_dir->dir == p_fs->root_dir) {
+		if ((!nls_uniname_cmp(sb, p_uniname->name, (u16 *) UNI_CUR_DIR_NAME)) ||
+			(!nls_uniname_cmp(sb, p_uniname->name, (u16 *) UNI_PAR_DIR_NAME)))
+			return -1; /* special case, root directory itself */
+	}
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+	else
+		dentries_per_clu = p_fs->dentries_per_clu;
+
+	clu.dir = p_dir->dir;
+	clu.flags = p_dir->flags;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		for (i = 0; i < dentries_per_clu; i++, dentry++) {
+			ep = get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return -2;
+
+			entry_type = p_fs->fs_func->get_entry_type(ep);
+
+			if ((entry_type == TYPE_FILE) || (entry_type == TYPE_DIR)) {
+				if ((type == TYPE_ALL) || (type == entry_type)) {
+					if (is_feasible_entry && has_ext_entry)
+						return dentry;
+
+					dos_ep = (DOS_DENTRY_T *) ep;
+					if ((!lossy) && (!nls_dosname_cmp(sb, p_dosname->name, dos_ep->name)))
+						return dentry;
+				}
+				is_feasible_entry = TRUE;
+				has_ext_entry = FALSE;
+			} else if (entry_type == TYPE_EXTEND) {
+				if (is_feasible_entry) {
+					ext_ep = (EXT_DENTRY_T *) ep;
+					if (ext_ep->order > 0x40) {
+						order = (s32)(ext_ep->order - 0x40);
+						uniname = p_uniname->name + 13 * (order-1);
+					} else {
+						order = (s32) ext_ep->order;
+						uniname -= 13;
+					}
+
+					len = extract_uni_name_from_ext_entry(ext_ep, entry_uniname, order);
+
+					unichar = *(uniname+len);
+					*(uniname+len) = 0x0;
+
+					if (nls_uniname_cmp(sb, uniname, entry_uniname))
+						is_feasible_entry = FALSE;
+
+					*(uniname+len) = unichar;
+				}
+				has_ext_entry = TRUE;
+			} else if (entry_type == TYPE_UNUSED) {
+				return -2;
+			} else {
+				is_feasible_entry = TRUE;
+				has_ext_entry = FALSE;
+			}
+		}
+
+		if (p_dir->dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+			return -2;
+	}
+
+	return -2;
+} /* end of fat_find_dir_entry */
+
+/* return values of exfat_find_dir_entry()
+   >= 0 : return dir entry position with the name in dir
+   -1 : (root dir, ".") it is the root dir itself
+   -2 : entry with the name does not exist */
+s32 exfat_find_dir_entry(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 num_entries, DOS_NAME_T *p_dosname, u32 type)
+{
+	int i = 0, dentry = 0, num_ext_entries = 0, len, step;
+	s32 order = 0, is_feasible_entry = FALSE;
+	s32 dentries_per_clu, num_empty = 0;
+	u32 entry_type;
+	u16 entry_uniname[16], *uniname = NULL, unichar;
+	CHAIN_T clu;
+	DENTRY_T *ep;
+	FILE_DENTRY_T *file_ep;
+	STRM_DENTRY_T *strm_ep;
+	NAME_DENTRY_T *name_ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_dir->dir == p_fs->root_dir) {
+		if ((!nls_uniname_cmp(sb, p_uniname->name, (u16 *) UNI_CUR_DIR_NAME)) ||
+			(!nls_uniname_cmp(sb, p_uniname->name, (u16 *) UNI_PAR_DIR_NAME)))
+			return -1; /* special case, root directory itself */
+	}
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+	else
+		dentries_per_clu = p_fs->dentries_per_clu;
+
+	clu.dir = p_dir->dir;
+	clu.size = p_dir->size;
+	clu.flags = p_dir->flags;
+
+	p_fs->hint_uentry.dir = p_dir->dir;
+	p_fs->hint_uentry.entry = -1;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		while (i < dentries_per_clu) {
+			ep = get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return -2;
+
+			entry_type = p_fs->fs_func->get_entry_type(ep);
+			step = 1;
+
+			if ((entry_type == TYPE_UNUSED) || (entry_type == TYPE_DELETED)) {
+				is_feasible_entry = FALSE;
+
+				if (p_fs->hint_uentry.entry == -1) {
+					num_empty++;
+
+					if (num_empty == 1) {
+						p_fs->hint_uentry.clu.dir = clu.dir;
+						p_fs->hint_uentry.clu.size = clu.size;
+						p_fs->hint_uentry.clu.flags = clu.flags;
+					}
+					if ((num_empty >= num_entries) || (entry_type == TYPE_UNUSED))
+						p_fs->hint_uentry.entry = dentry - (num_empty-1);
+				}
+
+				if (entry_type == TYPE_UNUSED)
+					return -2;
+			} else {
+				num_empty = 0;
+
+				if ((entry_type == TYPE_FILE) || (entry_type == TYPE_DIR)) {
+					file_ep = (FILE_DENTRY_T *) ep;
+					if ((type == TYPE_ALL) || (type == entry_type)) {
+						num_ext_entries = file_ep->num_ext;
+						is_feasible_entry = TRUE;
+					} else {
+						is_feasible_entry = FALSE;
+						step = file_ep->num_ext + 1;
+					}
+				} else if (entry_type == TYPE_STREAM) {
+					if (is_feasible_entry) {
+						strm_ep = (STRM_DENTRY_T *) ep;
+						if (p_uniname->name_hash == GET16_A(strm_ep->name_hash) &&
+						    p_uniname->name_len == strm_ep->name_len) {
+							order = 1;
+						} else {
+							is_feasible_entry = FALSE;
+							step = num_ext_entries;
+						}
+					}
+				} else if (entry_type == TYPE_EXTEND) {
+					if (is_feasible_entry) {
+						name_ep = (NAME_DENTRY_T *) ep;
+
+						if ((++order) == 2)
+							uniname = p_uniname->name;
+						else
+							uniname += 15;
+
+						len = extract_uni_name_from_name_entry(name_ep, entry_uniname, order);
+
+						unichar = *(uniname+len);
+						*(uniname+len) = 0x0;
+
+						if (nls_uniname_cmp(sb, uniname, entry_uniname)) {
+							is_feasible_entry = FALSE;
+							step = num_ext_entries - order + 1;
+						} else if (order == num_ext_entries) {
+							p_fs->hint_uentry.dir = CLUSTER_32(~0);
+							p_fs->hint_uentry.entry = -1;
+							return dentry - (num_ext_entries);
+						}
+
+						*(uniname+len) = unichar;
+					}
+				} else {
+					is_feasible_entry = FALSE;
+				}
+			}
+
+			i += step;
+			dentry += step;
+		}
+
+		i -= dentries_per_clu;
+
+		if (p_dir->dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (clu.flags == 0x03) {
+			if ((--clu.size) > 0)
+				clu.dir++;
+			else
+				clu.dir = CLUSTER_32(~0);
+		} else {
+			if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+				return -2;
+		}
+	}
+
+	return -2;
+} /* end of exfat_find_dir_entry */
+
+/* returns -1 on error */
+s32 fat_count_ext_entries(struct super_block *sb, CHAIN_T *p_dir, s32 entry, DENTRY_T *p_entry)
+{
+	s32 count = 0;
+	u8 chksum;
+	DOS_DENTRY_T *dos_ep = (DOS_DENTRY_T *) p_entry;
+	EXT_DENTRY_T *ext_ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	chksum = calc_checksum_1byte((void *) dos_ep->name, DOS_NAME_LENGTH, 0);
+
+	for (entry--; entry >= 0; entry--) {
+		ext_ep = (EXT_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, NULL);
+		if (!ext_ep)
+			return -1;
+
+		if ((p_fs->fs_func->get_entry_type((DENTRY_T *) ext_ep) == TYPE_EXTEND) &&
+			(ext_ep->checksum == chksum)) {
+			count++;
+			if (ext_ep->order > 0x40)
+				return count;
+		} else {
+			return count;
+		}
+	}
+
+	return count;
+} /* end of fat_count_ext_entries */
+
+/* returns -1 on error */
+s32 exfat_count_ext_entries(struct super_block *sb, CHAIN_T *p_dir, s32 entry, DENTRY_T *p_entry)
+{
+	int i, count = 0;
+	u32 type;
+	FILE_DENTRY_T *file_ep = (FILE_DENTRY_T *) p_entry;
+	DENTRY_T *ext_ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	for (i = 0, entry++; i < file_ep->num_ext; i++, entry++) {
+		ext_ep = get_entry_in_dir(sb, p_dir, entry, NULL);
+		if (!ext_ep)
+			return -1;
+
+		type = p_fs->fs_func->get_entry_type(ext_ep);
+		if ((type == TYPE_EXTEND) || (type == TYPE_STREAM))
+			count++;
+		else
+			return count;
+	}
+
+	return count;
+} /* end of exfat_count_ext_entries */
+
+/* returns -1 on error */
+s32 count_dos_name_entries(struct super_block *sb, CHAIN_T *p_dir, u32 type)
+{
+	int i, count = 0;
+	s32 dentries_per_clu;
+	u32 entry_type;
+	CHAIN_T clu;
+	DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+	else
+		dentries_per_clu = p_fs->dentries_per_clu;
+
+	clu.dir = p_dir->dir;
+	clu.size = p_dir->size;
+	clu.flags = p_dir->flags;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		for (i = 0; i < dentries_per_clu; i++) {
+			ep = get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return -1;
+
+			entry_type = p_fs->fs_func->get_entry_type(ep);
+
+			if (entry_type == TYPE_UNUSED)
+				return count;
+			if (!(type & TYPE_CRITICAL_PRI) && !(type & TYPE_BENIGN_PRI))
+				continue;
+
+			if ((type == TYPE_ALL) || (type == entry_type))
+				count++;
+		}
+
+		if (p_dir->dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (clu.flags == 0x03) {
+			if ((--clu.size) > 0)
+				clu.dir++;
+			else
+				clu.dir = CLUSTER_32(~0);
+		} else {
+			if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+				return -1;
+		}
+	}
+
+	return count;
+} /* end of count_dos_name_entries */
+
+bool is_dir_empty(struct super_block *sb, CHAIN_T *p_dir)
+{
+	int i, count = 0;
+	s32 dentries_per_clu;
+	u32 type;
+	CHAIN_T clu;
+	DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+	else
+		dentries_per_clu = p_fs->dentries_per_clu;
+
+	clu.dir = p_dir->dir;
+	clu.size = p_dir->size;
+	clu.flags = p_dir->flags;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		for (i = 0; i < dentries_per_clu; i++) {
+			ep = get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				break;
+
+			type = p_fs->fs_func->get_entry_type(ep);
+
+			if (type == TYPE_UNUSED)
+				return TRUE;
+			if ((type != TYPE_FILE) && (type != TYPE_DIR))
+				continue;
+
+			if (p_dir->dir == CLUSTER_32(0)) { /* FAT16 root_dir */
+				return FALSE;
+			} else {
+				if (p_fs->vol_type == EXFAT)
+					return FALSE;
+				if ((p_dir->dir == p_fs->root_dir) || ((++count) > 2))
+					return FALSE;
+			}
+		}
+
+		if (p_dir->dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (clu.flags == 0x03) {
+			if ((--clu.size) > 0)
+				clu.dir++;
+			else
+				clu.dir = CLUSTER_32(~0);
+		} else {
+			if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+				break;
+		}
+	}
+
+	return TRUE;
+} /* end of is_dir_empty */
+
+/*
+ *  Name Conversion Functions
+ */
+
+/* input  : dir, uni_name
+   output : num_of_entry, dos_name(format : aaaaaa~1.bbb) */
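+/* Worked example (hypothetical name, FAT volume): a long name whose 8.3
+ * conversion is lossy and yields the base "LONGNA" with extension "TXT"
+ * ends up, via fat_generate_dos_name(), as the DOS name "LONGNA~1TXT"
+ * (displayed as LONGNA~1.TXT), assuming no other entry in the directory
+ * already uses the ~1 tail; *entries then counts one DOS entry plus the
+ * extended name entries. */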
+s32 get_num_entries_and_dos_name(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 *entries, DOS_NAME_T *p_dosname)
+{
+	s32 ret, num_entries, lossy = FALSE;
+	char **r;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	num_entries = p_fs->fs_func->calc_num_entries(p_uniname);
+	if (num_entries == 0)
+		return FFS_INVALIDPATH;
+
+	if (p_fs->vol_type != EXFAT) {
+		nls_uniname_to_dosname(sb, p_dosname, p_uniname, &lossy);
+
+		if (lossy) {
+			ret = fat_generate_dos_name(sb, p_dir, p_dosname);
+			if (ret)
+				return ret;
+		} else {
+			for (r = reserved_names; *r; r++) {
+				if (!strncmp((void *) p_dosname->name, *r, 8))
+					return FFS_INVALIDPATH;
+			}
+
+			if (p_dosname->name_case != 0xFF)
+				num_entries = 1;
+		}
+
+		if (num_entries > 1)
+			p_dosname->name_case = 0x0;
+	}
+
+	*entries = num_entries;
+
+	return FFS_SUCCESS;
+} /* end of get_num_entries_and_dos_name */
+
+void get_uni_name_from_dos_entry(struct super_block *sb, DOS_DENTRY_T *ep, UNI_NAME_T *p_uniname, u8 mode)
+{
+	DOS_NAME_T dos_name;
+
+	if (mode == 0x0)
+		dos_name.name_case = 0x0;
+	else
+		dos_name.name_case = ep->lcase;
+
+	memcpy(dos_name.name, ep->name, DOS_NAME_LENGTH);
+	nls_dosname_to_uniname(sb, p_uniname, &dos_name);
+} /* end of get_uni_name_from_dos_entry */
+
+void fat_get_uni_name_from_ext_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u16 *uniname)
+{
+	int i;
+	EXT_DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	for (entry--, i = 1; entry >= 0; entry--, i++) {
+		ep = (EXT_DENTRY_T *) get_entry_in_dir(sb, p_dir, entry, NULL);
+		if (!ep)
+			return;
+
+		if (p_fs->fs_func->get_entry_type((DENTRY_T *) ep) == TYPE_EXTEND) {
+			extract_uni_name_from_ext_entry(ep, uniname, i);
+			if (ep->order > 0x40)
+				return;
+		} else {
+			return;
+		}
+
+		uniname += 13;
+	}
+} /* end of fat_get_uni_name_from_ext_entry */
+
+void exfat_get_uni_name_from_ext_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u16 *uniname)
+{
+	int i;
+	DENTRY_T *ep;
+	ENTRY_SET_CACHE_T *es;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	es = get_entry_set_in_dir(sb, p_dir, entry, ES_ALL_ENTRIES, &ep);
+	if (es == NULL || es->num_entries < 3) {
+		if (es)
+			release_entry_set(es);
+		return;
+	}
+
+	ep += 2;
+
+	/*
+	 * First entry  : file entry
+	 * Second entry : stream-extension entry
+	 * Third entry  : first file-name entry
+	 * So, the index of the first file-name dentry should start from 2.
+	 */
+	for (i = 2; i < es->num_entries; i++, ep++) {
+		if (p_fs->fs_func->get_entry_type(ep) == TYPE_EXTEND)
+			extract_uni_name_from_name_entry((NAME_DENTRY_T *)ep, uniname, i);
+		else
+			goto out;
+		uniname += 15;
+	}
+
+out:
+	release_entry_set(es);
+} /* end of exfat_get_uni_name_from_ext_entry */
+
+s32 extract_uni_name_from_ext_entry(EXT_DENTRY_T *ep, u16 *uniname, s32 order)
+{
+	int i, len = 0;
+
+	for (i = 0; i < 10; i += 2) {
+		*uniname = GET16(ep->unicode_0_4+i);
+		if (*uniname == 0x0)
+			return len;
+		uniname++;
+		len++;
+	}
+
+	if (order < 20) {
+		for (i = 0; i < 12; i += 2) {
+			*uniname = GET16_A(ep->unicode_5_10+i);
+			if (*uniname == 0x0)
+				return len;
+			uniname++;
+			len++;
+		}
+	} else {
+		for (i = 0; i < 8; i += 2) {
+			*uniname = GET16_A(ep->unicode_5_10+i);
+			if (*uniname == 0x0)
+				return len;
+			uniname++;
+			len++;
+		}
+		*uniname = 0x0; /* uniname[MAX_NAME_LENGTH-1] */
+		return len;
+	}
+
+	for (i = 0; i < 4; i += 2) {
+		*uniname = GET16_A(ep->unicode_11_12+i);
+		if (*uniname == 0x0)
+			return len;
+		uniname++;
+		len++;
+	}
+
+	*uniname = 0x0;
+	return len;
+
+} /* end of extract_uni_name_from_ext_entry */
+
+s32 extract_uni_name_from_name_entry(NAME_DENTRY_T *ep, u16 *uniname, s32 order)
+{
+	int i, len = 0;
+
+	for (i = 0; i < 30; i += 2) {
+		*uniname = GET16_A(ep->unicode_0_14+i);
+		if (*uniname == 0x0)
+			return len;
+		uniname++;
+		len++;
+	}
+
+	*uniname = 0x0;
+	return len;
+
+} /* end of extract_uni_name_from_name_entry */
+
+s32 fat_generate_dos_name(struct super_block *sb, CHAIN_T *p_dir, DOS_NAME_T *p_dosname)
+{
+	int i, j, count = 0, count_begin = FALSE;
+	s32 dentries_per_clu;
+	u32 type;
+	u8 bmap[128/* 1 ~ 1023 */];
+	CHAIN_T clu;
+	DOS_DENTRY_T *ep;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	memset(bmap, 0, sizeof bmap);
+	exfat_bitmap_set(bmap, 0);
+
+	if (p_dir->dir == CLUSTER_32(0)) /* FAT16 root_dir */
+		dentries_per_clu = p_fs->dentries_in_root;
+	else
+		dentries_per_clu = p_fs->dentries_per_clu;
+
+	clu.dir = p_dir->dir;
+	clu.flags = p_dir->flags;
+
+	while (clu.dir != CLUSTER_32(~0)) {
+		if (p_fs->dev_ejected)
+			break;
+
+		for (i = 0; i < dentries_per_clu; i++) {
+			ep = (DOS_DENTRY_T *) get_entry_in_dir(sb, &clu, i, NULL);
+			if (!ep)
+				return FFS_MEDIAERR;
+
+			type = p_fs->fs_func->get_entry_type((DENTRY_T *) ep);
+
+			if (type == TYPE_UNUSED)
+				break;
+			if ((type != TYPE_FILE) && (type != TYPE_DIR))
+				continue;
+
+			count = 0;
+			count_begin = FALSE;
+
+			for (j = 0; j < 8; j++) {
+				if (ep->name[j] == ' ')
+					break;
+
+				if (ep->name[j] == '~') {
+					count_begin = TRUE;
+				} else if (count_begin) {
+					if ((ep->name[j] >= '0') && (ep->name[j] <= '9')) {
+						count = count * 10 + (ep->name[j] - '0');
+					} else {
+						count = 0;
+						count_begin = FALSE;
+					}
+				}
+			}
+
+			if ((count > 0) && (count < 1024))
+				exfat_bitmap_set(bmap, count);
+		}
+
+		if (p_dir->dir == CLUSTER_32(0))
+			break; /* FAT16 root_dir */
+
+		if (FAT_read(sb, clu.dir, &(clu.dir)) != 0)
+			return FFS_MEDIAERR;
+	}
+
+	count = 0;
+	for (i = 0; i < 128; i++) {
+		if (bmap[i] != 0xFF) {
+			for (j = 0; j < 8; j++) {
+				if (exfat_bitmap_test(&(bmap[i]), j) == 0) {
+					count = (i << 3) + j;
+					break;
+				}
+			}
+			if (count != 0)
+				break;
+		}
+	}
+
+	if ((count == 0) || (count >= 1024))
+		return FFS_FILEEXIST;
+	else
+		fat_attach_count_to_dos_name(p_dosname->name, count);
+
+	/* Now dos_name has DOS~????.EXT */
+	return FFS_SUCCESS;
+} /* end of fat_generate_dos_name */
+
+void fat_attach_count_to_dos_name(u8 *dosname, s32 count)
+{
+	int i, j, length;
+	char str_count[6];
+
+	snprintf(str_count, sizeof str_count, "~%d", count);
+	length = strlen(str_count);
+
+	i = j = 0;
+	while (j <= (8 - length)) {
+		i = j;
+		if (dosname[j] == ' ')
+			break;
+		if (dosname[j] & 0x80)
+			j += 2;
+		else
+			j++;
+	}
+
+	for (j = 0; j < length; i++, j++)
+		dosname[i] = (u8) str_count[j];
+
+	if (i == 7)
+		dosname[7] = ' ';
+
+} /* end of fat_attach_count_to_dos_name */
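+
+/* Worked example for fat_attach_count_to_dos_name() (hypothetical input):
+ * for the 11-byte 8.3 buffer "LONGNA  TXT" and count 1, str_count is "~1"
+ * (length 2); the scan stops at the first space (j = 6), the two characters
+ * are copied at offsets 6 and 7, and the buffer becomes "LONGNA~1TXT",
+ * i.e. LONGNA~1.TXT. */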
+
+s32 fat_calc_num_entries(UNI_NAME_T *p_uniname)
+{
+	s32 len;
+
+	len = p_uniname->name_len;
+	if (len == 0)
+		return 0;
+
+	/* 1 dos name entry + extended entries */
+	return (len-1) / 13 + 2;
+
+} /* end of fat_calc_num_entries */
+
+s32 exfat_calc_num_entries(UNI_NAME_T *p_uniname)
+{
+	s32 len;
+
+	len = p_uniname->name_len;
+	if (len == 0)
+		return 0;
+
+	/* 1 file entry + 1 stream entry + name entries */
+	return (len-1) / 15 + 3;
+
+} /* end of exfat_calc_num_entries */
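+
+/* Worked example: for a 20-character name, fat_calc_num_entries() returns
+ * (20 - 1) / 13 + 2 = 3 (one DOS entry plus two extended entries of up to
+ * 13 characters each), while exfat_calc_num_entries() returns
+ * (20 - 1) / 15 + 3 = 4 (file entry, stream entry and two name entries of
+ * up to 15 characters each). */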
+
+u8 calc_checksum_1byte(void *data, s32 len, u8 chksum)
+{
+	int i;
+	u8 *c = (u8 *) data;
+
+	for (i = 0; i < len; i++, c++)
+		chksum = (((chksum & 1) << 7) | ((chksum & 0xFE) >> 1)) + *c;
+
+	return chksum;
+} /* end of calc_checksum_1byte */
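+
+/* Worked example of one calc_checksum_1byte() step: with chksum 0x05 and
+ * the next byte 0x41, the rotate-right-by-one gives 0x82 and adding the
+ * byte yields 0xC3 (the sum is truncated to 8 bits by the u8 type). */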
+
+u16 calc_checksum_2byte(void *data, s32 len, u16 chksum, s32 type)
+{
+	int i;
+	u8 *c = (u8 *) data;
+
+	switch (type) {
+	case CS_DIR_ENTRY:
+		for (i = 0; i < len; i++, c++) {
+			if ((i == 2) || (i == 3))
+				continue;
+			chksum = (((chksum & 1) << 15) | ((chksum & 0xFFFE) >> 1)) + (u16) *c;
+		}
+		break;
+	default:
+		for (i = 0; i < len; i++, c++)
+			chksum = (((chksum & 1) << 15) | ((chksum & 0xFFFE) >> 1)) + (u16) *c;
+	}
+
+	return chksum;
+} /* end of calc_checksum_2byte */
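+
+/* CS_DIR_ENTRY skips bytes 2 and 3 because, in an exFAT file entry, those
+ * two bytes hold the set-checksum field itself. A caller computing the
+ * checksum of a whole entry set would therefore look roughly like this
+ * (illustrative sketch only; the real caller is not part of this file):
+ *
+ *	chksum = calc_checksum_2byte((void *)ep, DENTRY_SIZE, 0, CS_DIR_ENTRY);
+ *	for (i = 1; i < num_entries; i++)
+ *		chksum = calc_checksum_2byte((void *)(ep + i), DENTRY_SIZE,
+ *					     chksum, CS_DEFAULT);
+ */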
+
+u32 calc_checksum_4byte(void *data, s32 len, u32 chksum, s32 type)
+{
+	int i;
+	u8 *c = (u8 *) data;
+
+	switch (type) {
+	case CS_PBR_SECTOR:
+		for (i = 0; i < len; i++, c++) {
+			if ((i == 106) || (i == 107) || (i == 112))
+				continue;
+			chksum = (((chksum & 1) << 31) | ((chksum & 0xFFFFFFFE) >> 1)) + (u32) *c;
+		}
+		break;
+	default:
+		for (i = 0; i < len; i++, c++)
+			chksum = (((chksum & 1) << 31) | ((chksum & 0xFFFFFFFE) >> 1)) + (u32) *c;
+	}
+
+	return chksum;
+} /* end of calc_checksum_4byte */
+
+/*
+ *  Name Resolution Functions
+ */
+
+/* return values of resolve_path()
+   FFS_SUCCESS : the name was resolved into p_dir and p_uniname
+   FFS_INVALIDPATH : the path is too long or cannot be converted losslessly */
+s32 resolve_path(struct inode *inode, char *path, CHAIN_T *p_dir, UNI_NAME_T *p_uniname)
+{
+	s32 lossy = FALSE;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+
+	if (strlen(path) >= (MAX_NAME_LENGTH * MAX_CHARSET_SIZE))
+		return FFS_INVALIDPATH;
+
+	strcpy(name_buf, path);
+
+	nls_cstring_to_uniname(sb, p_uniname, name_buf, &lossy);
+	if (lossy)
+		return FFS_INVALIDPATH;
+
+	fid->size = i_size_read(inode);
+
+	p_dir->dir = fid->start_clu;
+	p_dir->size = (s32)(fid->size >> p_fs->cluster_size_bits);
+	p_dir->flags = fid->flags;
+
+	return FFS_SUCCESS;
+}
+
+/*
+ *  File Operation Functions
+ */
+static FS_FUNC_T fat_fs_func = {
+	.alloc_cluster = fat_alloc_cluster,
+	.free_cluster = fat_free_cluster,
+	.count_used_clusters = fat_count_used_clusters,
+
+	.init_dir_entry = fat_init_dir_entry,
+	.init_ext_entry = fat_init_ext_entry,
+	.find_dir_entry = fat_find_dir_entry,
+	.delete_dir_entry = fat_delete_dir_entry,
+	.get_uni_name_from_ext_entry = fat_get_uni_name_from_ext_entry,
+	.count_ext_entries = fat_count_ext_entries,
+	.calc_num_entries = fat_calc_num_entries,
+
+	.get_entry_type = fat_get_entry_type,
+	.set_entry_type = fat_set_entry_type,
+	.get_entry_attr = fat_get_entry_attr,
+	.set_entry_attr = fat_set_entry_attr,
+	.get_entry_flag = fat_get_entry_flag,
+	.set_entry_flag = fat_set_entry_flag,
+	.get_entry_clu0 = fat_get_entry_clu0,
+	.set_entry_clu0 = fat_set_entry_clu0,
+	.get_entry_size = fat_get_entry_size,
+	.set_entry_size = fat_set_entry_size,
+	.get_entry_time = fat_get_entry_time,
+	.set_entry_time = fat_set_entry_time,
+};
+
+s32 fat16_mount(struct super_block *sb, PBR_SECTOR_T *p_pbr)
+{
+	s32 num_reserved, num_root_sectors;
+	BPB16_T *p_bpb = (BPB16_T *) p_pbr->bpb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (p_bpb->num_fats == 0)
+		return FFS_FORMATERR;
+
+	num_root_sectors = GET16(p_bpb->num_root_entries) << DENTRY_SIZE_BITS;
+	num_root_sectors = ((num_root_sectors-1) >> p_bd->sector_size_bits) + 1;
+
+	p_fs->sectors_per_clu = p_bpb->sectors_per_clu;
+	p_fs->sectors_per_clu_bits = ilog2(p_bpb->sectors_per_clu);
+	p_fs->cluster_size_bits = p_fs->sectors_per_clu_bits + p_bd->sector_size_bits;
+	p_fs->cluster_size = 1 << p_fs->cluster_size_bits;
+
+	p_fs->num_FAT_sectors = GET16(p_bpb->num_fat_sectors);
+
+	p_fs->FAT1_start_sector = p_fs->PBR_sector + GET16(p_bpb->num_reserved);
+	if (p_bpb->num_fats == 1)
+		p_fs->FAT2_start_sector = p_fs->FAT1_start_sector;
+	else
+		p_fs->FAT2_start_sector = p_fs->FAT1_start_sector + p_fs->num_FAT_sectors;
+
+	p_fs->root_start_sector = p_fs->FAT2_start_sector + p_fs->num_FAT_sectors;
+	p_fs->data_start_sector = p_fs->root_start_sector + num_root_sectors;
+
+	p_fs->num_sectors = GET16(p_bpb->num_sectors);
+	if (p_fs->num_sectors == 0)
+		p_fs->num_sectors = GET32(p_bpb->num_huge_sectors);
+
+	num_reserved = p_fs->data_start_sector - p_fs->PBR_sector;
+	p_fs->num_clusters = ((p_fs->num_sectors - num_reserved) >> p_fs->sectors_per_clu_bits) + 2;
+	/* because the cluster index starts with 2 */
+
+	if (p_fs->num_clusters < FAT12_THRESHOLD)
+		p_fs->vol_type = FAT12;
+	else
+		p_fs->vol_type = FAT16;
+	p_fs->vol_id = GET32(p_bpb->vol_serial);
+
+	p_fs->root_dir = 0;
+	p_fs->dentries_in_root = GET16(p_bpb->num_root_entries);
+	p_fs->dentries_per_clu = 1 << (p_fs->cluster_size_bits - DENTRY_SIZE_BITS);
+
+	p_fs->vol_flag = VOL_CLEAN;
+	p_fs->clu_srch_ptr = 2;
+	p_fs->used_clusters = (u32) ~0;
+
+	p_fs->fs_func = &fat_fs_func;
+
+	return FFS_SUCCESS;
+} /* end of fat16_mount */
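+
+/* Worked example for the geometry fields set up by the *_mount() helpers
+ * (hypothetical geometry): with 512-byte sectors (sector_size_bits = 9) and
+ * 8 sectors per cluster, cluster_size_bits is 12, cluster_size is 4096
+ * bytes and dentries_per_clu is 1 << (12 - 5) = 128. num_clusters gets the
+ * "+ 2" because cluster numbering starts at 2; clusters 0 and 1 do not map
+ * to the data area. */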
+
+s32 fat32_mount(struct super_block *sb, PBR_SECTOR_T *p_pbr)
+{
+	s32 num_reserved;
+	BPB32_T *p_bpb = (BPB32_T *) p_pbr->bpb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (p_bpb->num_fats == 0)
+		return FFS_FORMATERR;
+
+	p_fs->sectors_per_clu = p_bpb->sectors_per_clu;
+	p_fs->sectors_per_clu_bits = ilog2(p_bpb->sectors_per_clu);
+	p_fs->cluster_size_bits = p_fs->sectors_per_clu_bits + p_bd->sector_size_bits;
+	p_fs->cluster_size = 1 << p_fs->cluster_size_bits;
+
+	p_fs->num_FAT_sectors = GET32(p_bpb->num_fat32_sectors);
+
+	p_fs->FAT1_start_sector = p_fs->PBR_sector + GET16(p_bpb->num_reserved);
+	if (p_bpb->num_fats == 1)
+		p_fs->FAT2_start_sector = p_fs->FAT1_start_sector;
+	else
+		p_fs->FAT2_start_sector = p_fs->FAT1_start_sector + p_fs->num_FAT_sectors;
+
+	p_fs->root_start_sector = p_fs->FAT2_start_sector + p_fs->num_FAT_sectors;
+	p_fs->data_start_sector = p_fs->root_start_sector;
+
+	p_fs->num_sectors = GET32(p_bpb->num_huge_sectors);
+	num_reserved = p_fs->data_start_sector - p_fs->PBR_sector;
+
+	p_fs->num_clusters = ((p_fs->num_sectors-num_reserved) >> p_fs->sectors_per_clu_bits) + 2;
+	/* because the cluster index starts with 2 */
+
+	p_fs->vol_type = FAT32;
+	p_fs->vol_id = GET32(p_bpb->vol_serial);
+
+	p_fs->root_dir = GET32(p_bpb->root_cluster);
+	p_fs->dentries_in_root = 0;
+	p_fs->dentries_per_clu = 1 << (p_fs->cluster_size_bits - DENTRY_SIZE_BITS);
+
+	p_fs->vol_flag = VOL_CLEAN;
+	p_fs->clu_srch_ptr = 2;
+	p_fs->used_clusters = (u32) ~0;
+
+	p_fs->fs_func = &fat_fs_func;
+
+	return FFS_SUCCESS;
+} /* end of fat32_mount */
+
+static FS_FUNC_T exfat_fs_func = {
+	.alloc_cluster = exfat_alloc_cluster,
+	.free_cluster = exfat_free_cluster,
+	.count_used_clusters = exfat_count_used_clusters,
+
+	.init_dir_entry = exfat_init_dir_entry,
+	.init_ext_entry = exfat_init_ext_entry,
+	.find_dir_entry = exfat_find_dir_entry,
+	.delete_dir_entry = exfat_delete_dir_entry,
+	.get_uni_name_from_ext_entry = exfat_get_uni_name_from_ext_entry,
+	.count_ext_entries = exfat_count_ext_entries,
+	.calc_num_entries = exfat_calc_num_entries,
+
+	.get_entry_type = exfat_get_entry_type,
+	.set_entry_type = exfat_set_entry_type,
+	.get_entry_attr = exfat_get_entry_attr,
+	.set_entry_attr = exfat_set_entry_attr,
+	.get_entry_flag = exfat_get_entry_flag,
+	.set_entry_flag = exfat_set_entry_flag,
+	.get_entry_clu0 = exfat_get_entry_clu0,
+	.set_entry_clu0 = exfat_set_entry_clu0,
+	.get_entry_size = exfat_get_entry_size,
+	.set_entry_size = exfat_set_entry_size,
+	.get_entry_time = exfat_get_entry_time,
+	.set_entry_time = exfat_set_entry_time,
+};
+
+s32 exfat_mount(struct super_block *sb, PBR_SECTOR_T *p_pbr)
+{
+	BPBEX_T *p_bpb = (BPBEX_T *) p_pbr->bpb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+
+	if (p_bpb->num_fats == 0)
+		return FFS_FORMATERR;
+
+	p_fs->sectors_per_clu = 1 << p_bpb->sectors_per_clu_bits;
+	p_fs->sectors_per_clu_bits = p_bpb->sectors_per_clu_bits;
+	p_fs->cluster_size_bits = p_fs->sectors_per_clu_bits + p_bd->sector_size_bits;
+	p_fs->cluster_size = 1 << p_fs->cluster_size_bits;
+
+	p_fs->num_FAT_sectors = GET32(p_bpb->fat_length);
+
+	p_fs->FAT1_start_sector = p_fs->PBR_sector + GET32(p_bpb->fat_offset);
+	if (p_bpb->num_fats == 1)
+		p_fs->FAT2_start_sector = p_fs->FAT1_start_sector;
+	else
+		p_fs->FAT2_start_sector = p_fs->FAT1_start_sector + p_fs->num_FAT_sectors;
+
+	p_fs->root_start_sector = p_fs->PBR_sector + GET32(p_bpb->clu_offset);
+	p_fs->data_start_sector = p_fs->root_start_sector;
+
+	p_fs->num_sectors = GET64(p_bpb->vol_length);
+	p_fs->num_clusters = GET32(p_bpb->clu_count) + 2;
+	/* because the cluster index starts with 2 */
+
+	p_fs->vol_type = EXFAT;
+	p_fs->vol_id = GET32(p_bpb->vol_serial);
+
+	p_fs->root_dir = GET32(p_bpb->root_cluster);
+	p_fs->dentries_in_root = 0;
+	p_fs->dentries_per_clu = 1 << (p_fs->cluster_size_bits - DENTRY_SIZE_BITS);
+
+	p_fs->vol_flag = (u32) GET16(p_bpb->vol_flags);
+	p_fs->clu_srch_ptr = 2;
+	p_fs->used_clusters = (u32) ~0;
+
+	p_fs->fs_func = &exfat_fs_func;
+
+	return FFS_SUCCESS;
+} /* end of exfat_mount */
+
+s32 create_dir(struct inode *inode, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, FILE_ID_T *fid)
+{
+	s32 ret, dentry, num_entries;
+	u64 size;
+	CHAIN_T clu;
+	DOS_NAME_T dos_name, dot_name;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	ret = get_num_entries_and_dos_name(sb, p_dir, p_uniname, &num_entries, &dos_name);
+	if (ret)
+		return ret;
+
+	/* find_empty_entry must be called before alloc_cluster */
+	dentry = find_empty_entry(inode, p_dir, num_entries);
+	if (dentry < 0)
+		return FFS_FULL;
+
+	clu.dir = CLUSTER_32(~0);
+	clu.size = 0;
+	clu.flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+
+	/* (1) allocate a cluster */
+	ret = p_fs->fs_func->alloc_cluster(sb, 1, &clu);
+	if (ret < 0)
+		return FFS_MEDIAERR;
+	else if (ret == 0)
+		return FFS_FULL;
+
+	ret = clear_cluster(sb, clu.dir);
+	if (ret != FFS_SUCCESS)
+		return ret;
+
+	if (p_fs->vol_type == EXFAT) {
+		size = p_fs->cluster_size;
+	} else {
+		size = 0;
+
+		/* initialize the . and .. entries:
+		   . points to this directory itself,
+		   .. points to the parent dir */
+
+		dot_name.name_case = 0x0;
+		memcpy(dot_name.name, DOS_CUR_DIR_NAME, DOS_NAME_LENGTH);
+
+		ret = p_fs->fs_func->init_dir_entry(sb, &clu, 0, TYPE_DIR, clu.dir, 0);
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		ret = p_fs->fs_func->init_ext_entry(sb, &clu, 0, 1, NULL, &dot_name);
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		memcpy(dot_name.name, DOS_PAR_DIR_NAME, DOS_NAME_LENGTH);
+
+		if (p_dir->dir == p_fs->root_dir)
+			ret = p_fs->fs_func->init_dir_entry(sb, &clu, 1, TYPE_DIR, CLUSTER_32(0), 0);
+		else
+			ret = p_fs->fs_func->init_dir_entry(sb, &clu, 1, TYPE_DIR, p_dir->dir, 0);
+
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		ret = p_fs->fs_func->init_ext_entry(sb, &clu, 1, 1, NULL, &dot_name);
+		if (ret != FFS_SUCCESS)
+			return ret;
+	}
+
+	/* (2) update the directory entry */
+	/* make sub-dir entry in parent directory */
+	ret = p_fs->fs_func->init_dir_entry(sb, p_dir, dentry, TYPE_DIR, clu.dir, size);
+	if (ret != FFS_SUCCESS)
+		return ret;
+
+	ret = p_fs->fs_func->init_ext_entry(sb, p_dir, dentry, num_entries, p_uniname, &dos_name);
+	if (ret != FFS_SUCCESS)
+		return ret;
+
+	fid->dir.dir = p_dir->dir;
+	fid->dir.size = p_dir->size;
+	fid->dir.flags = p_dir->flags;
+	fid->entry = dentry;
+
+	fid->attr = ATTR_SUBDIR;
+	fid->flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+	fid->size = size;
+	fid->start_clu = clu.dir;
+
+	fid->type = TYPE_DIR;
+	fid->rwoffset = 0;
+	fid->hint_last_off = -1;
+
+	return FFS_SUCCESS;
+} /* end of create_dir */
+
+s32 create_file(struct inode *inode, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, u8 mode, FILE_ID_T *fid)
+{
+	s32 ret, dentry, num_entries;
+	DOS_NAME_T dos_name;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	ret = get_num_entries_and_dos_name(sb, p_dir, p_uniname, &num_entries, &dos_name);
+	if (ret)
+		return ret;
+
+	/* find_empty_entry must be called before alloc_cluster() */
+	dentry = find_empty_entry(inode, p_dir, num_entries);
+	if (dentry < 0)
+		return FFS_FULL;
+
+	/* (1) update the directory entry */
+	/* fill in the DOS-name directory entry information of the created file;
+	   its first cluster is not determined yet (0) */
+	ret = p_fs->fs_func->init_dir_entry(sb, p_dir, dentry, TYPE_FILE | mode, CLUSTER_32(0), 0);
+	if (ret != FFS_SUCCESS)
+		return ret;
+
+	ret = p_fs->fs_func->init_ext_entry(sb, p_dir, dentry, num_entries, p_uniname, &dos_name);
+	if (ret != FFS_SUCCESS)
+		return ret;
+
+	fid->dir.dir = p_dir->dir;
+	fid->dir.size = p_dir->size;
+	fid->dir.flags = p_dir->flags;
+	fid->entry = dentry;
+
+	fid->attr = ATTR_ARCHIVE | mode;
+	fid->flags = (p_fs->vol_type == EXFAT) ? 0x03 : 0x01;
+	fid->size = 0;
+	fid->start_clu = CLUSTER_32(~0);
+
+	fid->type = TYPE_FILE;
+	fid->rwoffset = 0;
+	fid->hint_last_off = -1;
+
+	return FFS_SUCCESS;
+} /* end of create_file */
+
+void remove_file(struct inode *inode, CHAIN_T *p_dir, s32 entry)
+{
+	s32 num_entries;
+	u32 sector;
+	DENTRY_T *ep;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	ep = get_entry_in_dir(sb, p_dir, entry, &sector);
+	if (!ep)
+		return;
+
+	buf_lock(sb, sector);
+
+	/* buf_lock() before call count_ext_entries() */
+	num_entries = p_fs->fs_func->count_ext_entries(sb, p_dir, entry, ep);
+	if (num_entries < 0) {
+		buf_unlock(sb, sector);
+		return;
+	}
+	num_entries++;
+
+	buf_unlock(sb, sector);
+
+	/* (1) update the directory entry */
+	p_fs->fs_func->delete_dir_entry(sb, p_dir, entry, 0, num_entries);
+} /* end of remove_file */
+
+s32 rename_file(struct inode *inode, CHAIN_T *p_dir, s32 oldentry, UNI_NAME_T *p_uniname, FILE_ID_T *fid)
+{
+	s32 ret, newentry = -1, num_old_entries, num_new_entries;
+	u32 sector_old, sector_new;
+	DOS_NAME_T dos_name;
+	DENTRY_T *epold, *epnew;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	epold = get_entry_in_dir(sb, p_dir, oldentry, &sector_old);
+	if (!epold)
+		return FFS_MEDIAERR;
+
+	buf_lock(sb, sector_old);
+
+	/* buf_lock() before call count_ext_entries() */
+	num_old_entries = p_fs->fs_func->count_ext_entries(sb, p_dir, oldentry, epold);
+	if (num_old_entries < 0) {
+		buf_unlock(sb, sector_old);
+		return FFS_MEDIAERR;
+	}
+	num_old_entries++;
+
+	ret = get_num_entries_and_dos_name(sb, p_dir, p_uniname, &num_new_entries, &dos_name);
+	if (ret) {
+		buf_unlock(sb, sector_old);
+		return ret;
+	}
+
+	if (num_old_entries < num_new_entries) {
+		newentry = find_empty_entry(inode, p_dir, num_new_entries);
+		if (newentry < 0) {
+			buf_unlock(sb, sector_old);
+			return FFS_FULL;
+		}
+
+		epnew = get_entry_in_dir(sb, p_dir, newentry, &sector_new);
+		if (!epnew) {
+			buf_unlock(sb, sector_old);
+			return FFS_MEDIAERR;
+		}
+
+		memcpy((void *) epnew, (void *) epold, DENTRY_SIZE);
+		if (p_fs->fs_func->get_entry_type(epnew) == TYPE_FILE) {
+			p_fs->fs_func->set_entry_attr(epnew, p_fs->fs_func->get_entry_attr(epnew) | ATTR_ARCHIVE);
+			fid->attr |= ATTR_ARCHIVE;
+		}
+		buf_modify(sb, sector_new);
+		buf_unlock(sb, sector_old);
+
+		if (p_fs->vol_type == EXFAT) {
+			epold = get_entry_in_dir(sb, p_dir, oldentry+1, &sector_old);
+			buf_lock(sb, sector_old);
+			epnew = get_entry_in_dir(sb, p_dir, newentry+1, &sector_new);
+
+			if (!epold || !epnew) {
+				buf_unlock(sb, sector_old);
+				return FFS_MEDIAERR;
+			}
+
+			memcpy((void *) epnew, (void *) epold, DENTRY_SIZE);
+			buf_modify(sb, sector_new);
+			buf_unlock(sb, sector_old);
+		}
+
+		ret = p_fs->fs_func->init_ext_entry(sb, p_dir, newentry, num_new_entries, p_uniname, &dos_name);
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		p_fs->fs_func->delete_dir_entry(sb, p_dir, oldentry, 0, num_old_entries);
+		fid->entry = newentry;
+	} else {
+		if (p_fs->fs_func->get_entry_type(epold) == TYPE_FILE) {
+			p_fs->fs_func->set_entry_attr(epold, p_fs->fs_func->get_entry_attr(epold) | ATTR_ARCHIVE);
+			fid->attr |= ATTR_ARCHIVE;
+		}
+		buf_modify(sb, sector_old);
+		buf_unlock(sb, sector_old);
+
+		ret = p_fs->fs_func->init_ext_entry(sb, p_dir, oldentry, num_new_entries, p_uniname, &dos_name);
+		if (ret != FFS_SUCCESS)
+			return ret;
+
+		p_fs->fs_func->delete_dir_entry(sb, p_dir, oldentry, num_new_entries, num_old_entries);
+	}
+
+	return FFS_SUCCESS;
+} /* end of rename_file */
+
+s32 move_file(struct inode *inode, CHAIN_T *p_olddir, s32 oldentry, CHAIN_T *p_newdir, UNI_NAME_T *p_uniname, FILE_ID_T *fid)
+{
+	s32 ret, newentry, num_new_entries, num_old_entries;
+	u32 sector_mov, sector_new;
+	CHAIN_T clu;
+	DOS_NAME_T dos_name;
+	DENTRY_T *epmov, *epnew;
+	struct super_block *sb = inode->i_sb;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	epmov = get_entry_in_dir(sb, p_olddir, oldentry, &sector_mov);
+	if (!epmov)
+		return FFS_MEDIAERR;
+
+	/* check if the source and target directory is the same */
+	if (p_fs->fs_func->get_entry_type(epmov) == TYPE_DIR &&
+		p_fs->fs_func->get_entry_clu0(epmov) == p_newdir->dir)
+		return FFS_INVALIDPATH;
+
+	buf_lock(sb, sector_mov);
+
+	/* buf_lock() before call count_ext_entries() */
+	num_old_entries = p_fs->fs_func->count_ext_entries(sb, p_olddir, oldentry, epmov);
+	if (num_old_entries < 0) {
+		buf_unlock(sb, sector_mov);
+		return FFS_MEDIAERR;
+	}
+	num_old_entries++;
+
+	ret = get_num_entries_and_dos_name(sb, p_newdir, p_uniname, &num_new_entries, &dos_name);
+	if (ret) {
+		buf_unlock(sb, sector_mov);
+		return ret;
+	}
+
+	newentry = find_empty_entry(inode, p_newdir, num_new_entries);
+	if (newentry < 0) {
+		buf_unlock(sb, sector_mov);
+		return FFS_FULL;
+	}
+
+	epnew = get_entry_in_dir(sb, p_newdir, newentry, &sector_new);
+	if (!epnew) {
+		buf_unlock(sb, sector_mov);
+		return FFS_MEDIAERR;
+	}
+
+	memcpy((void *) epnew, (void *) epmov, DENTRY_SIZE);
+	if (p_fs->fs_func->get_entry_type(epnew) == TYPE_FILE) {
+		p_fs->fs_func->set_entry_attr(epnew, p_fs->fs_func->get_entry_attr(epnew) | ATTR_ARCHIVE);
+		fid->attr |= ATTR_ARCHIVE;
+	}
+	buf_modify(sb, sector_new);
+	buf_unlock(sb, sector_mov);
+
+	if (p_fs->vol_type == EXFAT) {
+		epmov = get_entry_in_dir(sb, p_olddir, oldentry+1, &sector_mov);
+		buf_lock(sb, sector_mov);
+		epnew = get_entry_in_dir(sb, p_newdir, newentry+1, &sector_new);
+		if (!epmov || !epnew) {
+			buf_unlock(sb, sector_mov);
+			return FFS_MEDIAERR;
+		}
+
+		memcpy((void *) epnew, (void *) epmov, DENTRY_SIZE);
+		buf_modify(sb, sector_new);
+		buf_unlock(sb, sector_mov);
+	} else if (p_fs->fs_func->get_entry_type(epnew) == TYPE_DIR) {
+		/* change ".." pointer to new parent dir */
+		clu.dir = p_fs->fs_func->get_entry_clu0(epnew);
+		clu.flags = 0x01;
+
+		epnew = get_entry_in_dir(sb, &clu, 1, &sector_new);
+		if (!epnew)
+			return FFS_MEDIAERR;
+
+		if (p_newdir->dir == p_fs->root_dir)
+			p_fs->fs_func->set_entry_clu0(epnew, CLUSTER_32(0));
+		else
+			p_fs->fs_func->set_entry_clu0(epnew, p_newdir->dir);
+		buf_modify(sb, sector_new);
+	}
+
+	ret = p_fs->fs_func->init_ext_entry(sb, p_newdir, newentry, num_new_entries, p_uniname, &dos_name);
+	if (ret != FFS_SUCCESS)
+		return ret;
+
+	p_fs->fs_func->delete_dir_entry(sb, p_olddir, oldentry, 0, num_old_entries);
+
+	fid->dir.dir = p_newdir->dir;
+	fid->dir.size = p_newdir->size;
+	fid->dir.flags = p_newdir->flags;
+
+	fid->entry = newentry;
+
+	return FFS_SUCCESS;
+} /* end of move_file */
+
+/*
+ *  Sector Read/Write Functions
+ */
+
+s32 sector_read(struct super_block *sb, u32 sec, struct buffer_head **bh, s32 read)
+{
+	s32 ret = FFS_MEDIAERR;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if ((sec >= (p_fs->PBR_sector+p_fs->num_sectors)) && (p_fs->num_sectors > 0)) {
+		printk("[EXFAT] sector_read: out of range error! (sec = %d)\n", sec);
+		fs_error(sb);
+		return ret;
+	}
+
+	if (!p_fs->dev_ejected) {
+		ret = bdev_read(sb, sec, bh, 1, read);
+		if (ret != FFS_SUCCESS)
+			p_fs->dev_ejected = TRUE;
+	}
+
+	return ret;
+} /* end of sector_read */
+
+s32 sector_write(struct super_block *sb, u32 sec, struct buffer_head *bh, s32 sync)
+{
+	s32 ret = FFS_MEDIAERR;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (sec >= (p_fs->PBR_sector+p_fs->num_sectors) && (p_fs->num_sectors > 0)) {
+		printk("[EXFAT] sector_write: out of range error! (sec = %d)\n", sec);
+		fs_error(sb);
+		return ret;
+	}
+
+	if (bh == NULL) {
+		printk("[EXFAT] sector_write: bh is NULL!\n");
+		fs_error(sb);
+		return ret;
+	}
+
+	if (!p_fs->dev_ejected) {
+		ret = bdev_write(sb, sec, bh, 1, sync);
+		if (ret != FFS_SUCCESS)
+			p_fs->dev_ejected = TRUE;
+	}
+
+	return ret;
+} /* end of sector_write */
+
+s32 multi_sector_read(struct super_block *sb, u32 sec, struct buffer_head **bh, s32 num_secs, s32 read)
+{
+	s32 ret = FFS_MEDIAERR;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (((sec+num_secs) > (p_fs->PBR_sector+p_fs->num_sectors)) && (p_fs->num_sectors > 0)) {
+		printk("[EXFAT] multi_sector_read: out of range error! (sec = %d, num_secs = %d)\n", sec, num_secs);
+		fs_error(sb);
+		return ret;
+	}
+
+	if (!p_fs->dev_ejected) {
+		ret = bdev_read(sb, sec, bh, num_secs, read);
+		if (ret != FFS_SUCCESS)
+			p_fs->dev_ejected = TRUE;
+	}
+
+	return ret;
+} /* end of multi_sector_read */
+
+s32 multi_sector_write(struct super_block *sb, u32 sec, struct buffer_head *bh, s32 num_secs, s32 sync)
+{
+	s32 ret = FFS_MEDIAERR;
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if ((sec+num_secs) > (p_fs->PBR_sector+p_fs->num_sectors) && (p_fs->num_sectors > 0)) {
+		printk("[EXFAT] multi_sector_write: out of range error! (sec = %d, num_secs = %d)\n", sec, num_secs);
+		fs_error(sb);
+		return ret;
+	}
+	if (bh == NULL) {
+		printk("[EXFAT] multi_sector_write: bh is NULL!\n");
+		fs_error(sb);
+		return ret;
+	}
+
+	if (!p_fs->dev_ejected) {
+		ret = bdev_write(sb, sec, bh, num_secs, sync);
+		if (ret != FFS_SUCCESS)
+			p_fs->dev_ejected = TRUE;
+	}
+
+	return ret;
+} /* end of multi_sector_write */
diff --git b/fs/exfat/exfat_core.h b/fs/exfat/exfat_core.h
new file mode 100644
index 0000000..4bdfe4e
--- /dev/null
+++ b/fs/exfat/exfat_core.h
@@ -0,0 +1,671 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_core.h                                              */
+/*  PURPOSE : Header File for exFAT File Manager                        */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_H
+#define _EXFAT_H
+
+#include "exfat_config.h"
+#include "exfat_data.h"
+#include "exfat_oal.h"
+
+#include "exfat_blkdev.h"
+#include "exfat_cache.h"
+#include "exfat_nls.h"
+#include "exfat_api.h"
+#include "exfat_cache.h"
+
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+  /* For Debugging Purpose */
+	/* IOCTL code 'f' used by
+	 *   - file systems typically #0~0x1F
+	 *   - embedded terminal devices #128~
+	 *   - extensions for debugging purposes #99
+	 * numbers 100 and 101 are available now but may conflict
+	 */
+#define EXFAT_IOC_GET_DEBUGFLAGS       _IOR('f', 100, long)
+#define EXFAT_IOC_SET_DEBUGFLAGS       _IOW('f', 101, long)
+
+#define EXFAT_DEBUGFLAGS_INVALID_UMOUNT        0x01
+#define EXFAT_DEBUGFLAGS_ERROR_RW              0x02
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+
+	/*----------------------------------------------------------------------*/
+	/*  Constant & Macro Definitions                                        */
+	/*----------------------------------------------------------------------*/
+
+#define DENTRY_SIZE             32          /* dir entry size */
+#define DENTRY_SIZE_BITS        5
+
+/* PBR entries */
+#define PBR_SIGNATURE           0xAA55
+#define EXT_SIGNATURE           0xAA550000
+#define VOL_LABEL               "NO NAME    " /* size should be 11 */
+#define OEM_NAME                "MSWIN4.1"  /* size should be 8 */
+#define STR_FAT12               "FAT12   "  /* size should be 8 */
+#define STR_FAT16               "FAT16   "  /* size should be 8 */
+#define STR_FAT32               "FAT32   "  /* size should be 8 */
+#define STR_EXFAT               "EXFAT   "  /* size should be 8 */
+#define VOL_CLEAN               0x0000
+#define VOL_DIRTY               0x0002
+
+/* max number of clusters */
+#define FAT12_THRESHOLD         4087        /* 2^12 - 1 + 2 (clu 0 & 1) */
+#define FAT16_THRESHOLD         65527       /* 2^16 - 1 + 2 */
+#define FAT32_THRESHOLD         268435457   /* 2^28 - 1 + 2 */
+#define EXFAT_THRESHOLD         268435457   /* 2^28 - 1 + 2 */
+
+/* file types */
+#define TYPE_UNUSED             0x0000
+#define TYPE_DELETED            0x0001
+#define TYPE_INVALID            0x0002
+#define TYPE_CRITICAL_PRI       0x0100
+#define TYPE_BITMAP             0x0101
+#define TYPE_UPCASE             0x0102
+#define TYPE_VOLUME             0x0103
+#define TYPE_DIR                0x0104
+#define TYPE_FILE               0x011F
+#define TYPE_SYMLINK            0x015F
+#define TYPE_CRITICAL_SEC       0x0200
+#define TYPE_STREAM             0x0201
+#define TYPE_EXTEND             0x0202
+#define TYPE_ACL                0x0203
+#define TYPE_BENIGN_PRI         0x0400
+#define TYPE_GUID               0x0401
+#define TYPE_PADDING            0x0402
+#define TYPE_ACLTAB             0x0403
+#define TYPE_BENIGN_SEC         0x0800
+#define TYPE_ALL                0x0FFF
+
+/* time modes */
+#define TM_CREATE               0
+#define TM_MODIFY               1
+#define TM_ACCESS               2
+
+/* checksum types */
+#define CS_DIR_ENTRY            0
+#define CS_PBR_SECTOR           1
+#define CS_DEFAULT              2
+
+#define CLUSTER_16(x)           ((u16)(x))
+#define CLUSTER_32(x)           ((u32)(x))
+
+#define FALSE			0
+#define TRUE			1
+
+#define MIN(a, b)		(((a) < (b)) ? (a) : (b))
+#define MAX(a, b)		(((a) > (b)) ? (a) : (b))
+
+#define START_SECTOR(x) \
+	((((x) - 2) << p_fs->sectors_per_clu_bits) + p_fs->data_start_sector)
+
+#define IS_LAST_SECTOR_IN_CLUSTER(sec) \
+		((((sec) - p_fs->data_start_sector + 1) & ((1 <<  p_fs->sectors_per_clu_bits) - 1)) == 0)
+
+#define GET_CLUSTER_FROM_SECTOR(sec)			\
+		((((sec) - p_fs->data_start_sector) >> p_fs->sectors_per_clu_bits) + 2)
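+
+/* Worked example (hypothetical layout): with data_start_sector = 100 and
+ * sectors_per_clu_bits = 3 (8 sectors per cluster), START_SECTOR(5) is
+ * ((5 - 2) << 3) + 100 = 124, GET_CLUSTER_FROM_SECTOR(124) gives back 5,
+ * and IS_LAST_SECTOR_IN_CLUSTER(107) is true since sector 107 is the last
+ * of the eight sectors (100..107) belonging to cluster 2. */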
+
+#define GET16(p_src) \
+	(((u16)(p_src)[0]) | (((u16)(p_src)[1]) << 8))
+#define GET32(p_src) \
+	(((u32)(p_src)[0]) | (((u32)(p_src)[1]) << 8) | \
+	(((u32)(p_src)[2]) << 16) | (((u32)(p_src)[3]) << 24))
+#define GET64(p_src) \
+	(((u64)(p_src)[0]) | (((u64)(p_src)[1]) << 8) | \
+	(((u64)(p_src)[2]) << 16) | (((u64)(p_src)[3]) << 24) | \
+	(((u64)(p_src)[4]) << 32) | (((u64)(p_src)[5]) << 40) | \
+	(((u64)(p_src)[6]) << 48) | (((u64)(p_src)[7]) << 56))
+
+#define SET16(p_dst, src)                                  \
+	do {                                              \
+		(p_dst)[0] = (u8)(src);                     \
+		(p_dst)[1] = (u8)(((u16)(src)) >> 8);       \
+	} while (0)
+#define SET32(p_dst, src)                                  \
+	do {                                              \
+		(p_dst)[0] = (u8)(src);                     \
+		(p_dst)[1] = (u8)(((u32)(src)) >> 8);       \
+		(p_dst)[2] = (u8)(((u32)(src)) >> 16);      \
+		(p_dst)[3] = (u8)(((u32)(src)) >> 24);      \
+	} while (0)
+#define SET64(p_dst, src)                                  \
+	do {                                              \
+		(p_dst)[0] = (u8)(src);                   \
+		(p_dst)[1] = (u8)(((u64)(src)) >> 8);     \
+		(p_dst)[2] = (u8)(((u64)(src)) >> 16);    \
+		(p_dst)[3] = (u8)(((u64)(src)) >> 24);    \
+		(p_dst)[4] = (u8)(((u64)(src)) >> 32);    \
+		(p_dst)[5] = (u8)(((u64)(src)) >> 40);    \
+		(p_dst)[6] = (u8)(((u64)(src)) >> 48);    \
+		(p_dst)[7] = (u8)(((u64)(src)) >> 56);    \
+	} while (0)
+
+#ifdef __LITTLE_ENDIAN
+#define GET16_A(p_src)		(*((u16 *)(p_src)))
+#define GET32_A(p_src)		(*((u32 *)(p_src)))
+#define GET64_A(p_src)		(*((u64 *)(p_src)))
+#define SET16_A(p_dst, src)	(*((u16 *)(p_dst)) = (u16)(src))
+#define SET32_A(p_dst, src)	(*((u32 *)(p_dst)) = (u32)(src))
+#define SET64_A(p_dst, src)	(*((u64 *)(p_dst)) = (u64)(src))
+#else /* BIG_ENDIAN */
+#define GET16_A(p_src)		GET16(p_src)
+#define GET32_A(p_src)		GET32(p_src)
+#define GET64_A(p_src)		GET64(p_src)
+#define SET16_A(p_dst, src)	SET16(p_dst, src)
+#define SET32_A(p_dst, src)	SET32(p_dst, src)
+#define SET64_A(p_dst, src)	SET64(p_dst, src)
+#endif
+
+/* Up-case table macros: the high byte of a UTF-16 code unit selects the
+ * column (sub-table) and the low byte selects the row within it.
+ */
+#define HIGH_INDEX_BIT (8)
+#define HIGH_INDEX_MASK (0xFF00)
+#define LOW_INDEX_BIT (16 - HIGH_INDEX_BIT)
+#define UTBL_ROW_COUNT (1 << LOW_INDEX_BIT)
+#define UTBL_COL_COUNT (1 << HIGH_INDEX_BIT)
+
+#if CONFIG_EXFAT_DEBUG_MSG
+#define DPRINTK(...)			\
+	do {								\
+		printk("[EXFAT] " __VA_ARGS__);	\
+	} while (0)
+#else
+#define DPRINTK(...)
+#endif
+
+static inline u16 get_col_index(u16 i)
+{
+	return i >> LOW_INDEX_BIT;
+}
+static inline u16 get_row_index(u16 i)
+{
+	return i & ~HIGH_INDEX_MASK;
+}
+/*----------------------------------------------------------------------*/
+/*  Type Definitions                                                    */
+/*----------------------------------------------------------------------*/
+
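+/*
+ * The on-disk layouts below are described as plain byte arrays, so the
+ * structs carry no compiler padding; multi-byte fields are meant to be
+ * accessed through the GET*()/SET*() helpers defined above.
+ */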
+/* MS_DOS FAT partition boot record (512 bytes) */
+typedef struct {
+	u8       jmp_boot[3];
+	u8       oem_name[8];
+	u8       bpb[109];
+	u8       boot_code[390];
+	u8       signature[2];
+} PBR_SECTOR_T;
+
+/* MS-DOS FAT12/16 BIOS parameter block (51 bytes) */
+typedef struct {
+	u8       sector_size[2];
+	u8       sectors_per_clu;
+	u8       num_reserved[2];
+	u8       num_fats;
+	u8       num_root_entries[2];
+	u8       num_sectors[2];
+	u8       media_type;
+	u8       num_fat_sectors[2];
+	u8       sectors_in_track[2];
+	u8       num_heads[2];
+	u8       num_hid_sectors[4];
+	u8       num_huge_sectors[4];
+
+	u8       phy_drv_no;
+	u8       reserved;
+	u8       ext_signature;
+	u8       vol_serial[4];
+	u8       vol_label[11];
+	u8       vol_type[8];
+} BPB16_T;
+
+/* MS-DOS FAT32 BIOS parameter block (79 bytes) */
+typedef struct {
+	u8       sector_size[2];
+	u8       sectors_per_clu;
+	u8       num_reserved[2];
+	u8       num_fats;
+	u8       num_root_entries[2];
+	u8       num_sectors[2];
+	u8       media_type;
+	u8       num_fat_sectors[2];
+	u8       sectors_in_track[2];
+	u8       num_heads[2];
+	u8       num_hid_sectors[4];
+	u8       num_huge_sectors[4];
+	u8       num_fat32_sectors[4];
+	u8       ext_flags[2];
+	u8       fs_version[2];
+	u8       root_cluster[4];
+	u8       fsinfo_sector[2];
+	u8       backup_sector[2];
+	u8       reserved[12];
+
+	u8       phy_drv_no;
+	u8       ext_reserved;
+	u8       ext_signature;
+	u8       vol_serial[4];
+	u8       vol_label[11];
+	u8       vol_type[8];
+} BPB32_T;
+
+/* MS-DOS EXFAT BIOS parameter block (109 bytes) */
+typedef struct {
+	u8       reserved1[53];
+	u8       vol_offset[8];
+	u8       vol_length[8];
+	u8       fat_offset[4];
+	u8       fat_length[4];
+	u8       clu_offset[4];
+	u8       clu_count[4];
+	u8       root_cluster[4];
+	u8       vol_serial[4];
+	u8       fs_version[2];
+	u8       vol_flags[2];
+	u8       sector_size_bits;
+	u8       sectors_per_clu_bits;
+	u8       num_fats;
+	u8       phy_drv_no;
+	u8       perc_in_use;
+	u8       reserved2[7];
+} BPBEX_T;
+
+/* MS-DOS FAT file system information sector (512 bytes) */
+typedef struct {
+	u8       signature1[4];
+	u8       reserved1[480];
+	u8       signature2[4];
+	u8       free_cluster[4];
+	u8       next_cluster[4];
+	u8       reserved2[14];
+	u8       signature3[2];
+} FSI_SECTOR_T;
+
+/* MS-DOS FAT directory entry (32 bytes) */
+typedef struct {
+	u8       dummy[32];
+} DENTRY_T;
+
+typedef struct {
+	u8       name[DOS_NAME_LENGTH];
+	u8       attr;
+	u8       lcase;
+	u8       create_time_ms;
+	u8       create_time[2];
+	u8       create_date[2];
+	u8       access_date[2];
+	u8       start_clu_hi[2];
+	u8       modify_time[2];
+	u8       modify_date[2];
+	u8       start_clu_lo[2];
+	u8       size[4];
+} DOS_DENTRY_T;
+
+/* MS-DOS FAT extended directory entry (32 bytes) */
+typedef struct {
+	u8       order;
+	u8       unicode_0_4[10];
+	u8       attr;
+	u8       sysid;
+	u8       checksum;
+	u8       unicode_5_10[12];
+	u8       start_clu[2];
+	u8       unicode_11_12[4];
+} EXT_DENTRY_T;
+
+/* MS-DOS EXFAT file directory entry (32 bytes) */
+typedef struct {
+	u8       type;
+	u8       num_ext;
+	u8       checksum[2];
+	u8       attr[2];
+	u8       reserved1[2];
+	u8       create_time[2];
+	u8       create_date[2];
+	u8       modify_time[2];
+	u8       modify_date[2];
+	u8       access_time[2];
+	u8       access_date[2];
+	u8       create_time_ms;
+	u8       modify_time_ms;
+	u8       access_time_ms;
+	u8       reserved2[9];
+} FILE_DENTRY_T;
+
+/* MS-DOS EXFAT stream extension directory entry (32 bytes) */
+typedef struct {
+	u8       type;
+	u8       flags;
+	u8       reserved1;
+	u8       name_len;
+	u8       name_hash[2];
+	u8       reserved2[2];
+	u8       valid_size[8];
+	u8       reserved3[4];
+	u8       start_clu[4];
+	u8       size[8];
+} STRM_DENTRY_T;
+
+/* MS-DOS EXFAT file name directory entry (32 bytes) */
+typedef struct {
+	u8       type;
+	u8       flags;
+	u8       unicode_0_14[30];
+} NAME_DENTRY_T;
+
+/* MS-DOS EXFAT allocation bitmap directory entry (32 bytes) */
+typedef struct {
+	u8       type;
+	u8       flags;
+	u8       reserved[18];
+	u8       start_clu[4];
+	u8       size[8];
+} BMAP_DENTRY_T;
+
+/* MS-DOS EXFAT up-case table directory entry (32 bytes) */
+typedef struct {
+	u8       type;
+	u8       reserved1[3];
+	u8       checksum[4];
+	u8       reserved2[12];
+	u8       start_clu[4];
+	u8       size[8];
+} CASE_DENTRY_T;
+
+/* MS-DOS EXFAT volume label directory entry (32 bytes) */
+typedef struct {
+	u8       type;
+	u8       label_len;
+	u8       unicode_0_10[22];
+	u8       reserved[8];
+} VOLM_DENTRY_T;
+
+/* unused entry hint information */
+typedef struct {
+	u32      dir;
+	s32       entry;
+	CHAIN_T     clu;
+} UENTRY_T;
+
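+/*
+ * Dispatch table that hides the differences between the FAT and exFAT
+ * on-disk formats; the mount path is expected to point FS_INFO_T.fs_func
+ * at the matching fat_*()/exfat_*() helpers declared later in this file.
+ */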
+typedef struct {
+	s32       (*alloc_cluster)(struct super_block *sb, s32 num_alloc, CHAIN_T *p_chain);
+	void        (*free_cluster)(struct super_block *sb, CHAIN_T *p_chain, s32 do_relse);
+	s32       (*count_used_clusters)(struct super_block *sb);
+
+	s32      (*init_dir_entry)(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type,
+								 u32 start_clu, u64 size);
+	s32      (*init_ext_entry)(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 num_entries,
+								 UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname);
+	s32       (*find_dir_entry)(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 num_entries, DOS_NAME_T *p_dosname, u32 type);
+	void        (*delete_dir_entry)(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 offset, s32 num_entries);
+	void        (*get_uni_name_from_ext_entry)(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u16 *uniname);
+	s32       (*count_ext_entries)(struct super_block *sb, CHAIN_T *p_dir, s32 entry, DENTRY_T *p_entry);
+	s32       (*calc_num_entries)(UNI_NAME_T *p_uniname);
+
+	u32      (*get_entry_type)(DENTRY_T *p_entry);
+	void        (*set_entry_type)(DENTRY_T *p_entry, u32 type);
+	u32      (*get_entry_attr)(DENTRY_T *p_entry);
+	void        (*set_entry_attr)(DENTRY_T *p_entry, u32 attr);
+	u8       (*get_entry_flag)(DENTRY_T *p_entry);
+	void        (*set_entry_flag)(DENTRY_T *p_entry, u8 flag);
+	u32      (*get_entry_clu0)(DENTRY_T *p_entry);
+	void        (*set_entry_clu0)(DENTRY_T *p_entry, u32 clu0);
+	u64      (*get_entry_size)(DENTRY_T *p_entry);
+	void        (*set_entry_size)(DENTRY_T *p_entry, u64 size);
+	void        (*get_entry_time)(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode);
+	void        (*set_entry_time)(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode);
+} FS_FUNC_T;
+
+typedef struct __FS_INFO_T {
+	u32      drv;                    /* drive ID */
+	u32      vol_type;               /* volume FAT type */
+	u32      vol_id;                 /* volume serial number */
+
+	u32      num_sectors;            /* num of sectors in volume */
+	u32      num_clusters;           /* num of clusters in volume */
+	u32      cluster_size;           /* cluster size in bytes */
+	u32      cluster_size_bits;
+	u32      sectors_per_clu;        /* cluster size in sectors */
+	u32      sectors_per_clu_bits;
+
+	u32      PBR_sector;             /* PBR sector */
+	u32      FAT1_start_sector;      /* FAT1 start sector */
+	u32      FAT2_start_sector;      /* FAT2 start sector */
+	u32      root_start_sector;      /* root dir start sector */
+	u32      data_start_sector;      /* data area start sector */
+	u32      num_FAT_sectors;        /* num of FAT sectors */
+
+	u32      root_dir;               /* root dir cluster */
+	u32      dentries_in_root;       /* num of dentries in root dir */
+	u32      dentries_per_clu;       /* num of dentries per cluster */
+
+	u32      vol_flag;               /* volume dirty flag */
+	struct buffer_head *pbr_bh;         /* PBR sector */
+
+	u32      map_clu;                /* allocation bitmap start cluster */
+	u32      map_sectors;            /* num of allocation bitmap sectors */
+	struct buffer_head **vol_amap;      /* allocation bitmap */
+
+	u16      **vol_utbl;               /* upcase table */
+
+	u32      clu_srch_ptr;           /* cluster search pointer */
+	u32      used_clusters;          /* number of used clusters */
+	UENTRY_T    hint_uentry;         /* unused entry hint information */
+
+	u32      dev_ejected;            /* block device operation error flag */
+
+	FS_FUNC_T	*fs_func;
+	struct semaphore v_sem;
+
+	/* FAT cache */
+	BUF_CACHE_T FAT_cache_array[FAT_CACHE_SIZE];
+	BUF_CACHE_T FAT_cache_lru_list;
+	BUF_CACHE_T FAT_cache_hash_list[FAT_CACHE_HASH_SIZE];
+
+	/* buf cache */
+	BUF_CACHE_T buf_cache_array[BUF_CACHE_SIZE];
+	BUF_CACHE_T buf_cache_lru_list;
+	BUF_CACHE_T buf_cache_hash_list[BUF_CACHE_HASH_SIZE];
+} FS_INFO_T;
+
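+/*
+ * Entry-set sizes (see get_entry_set_in_dir() below): load only the first
+ * two entries of a set, the first three, or the whole set.
+ */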
+#define ES_2_ENTRIES		2
+#define ES_3_ENTRIES		3
+#define ES_ALL_ENTRIES	0
+
+typedef struct {
+	u32	sector;		/* sector number that contains file_entry */
+	s32	offset;		/* byte offset in the sector */
+	s32	alloc_flag;	/* flag in stream entry: 0x01 = cluster chain, 0x03 = contiguous clusters */
+	u32 num_entries;
+
+	/* __buf should be the last member; the cached entries are expected
+	 * to be stored in memory allocated immediately after this struct */
+	void *__buf;
+} ENTRY_SET_CACHE_T;
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+/* file system initialization & shutdown functions */
+s32 ffsInit(void);
+s32 ffsShutdown(void);
+
+/* volume management functions */
+s32 ffsMountVol(struct super_block *sb);
+s32 ffsUmountVol(struct super_block *sb);
+s32 ffsCheckVol(struct super_block *sb);
+s32 ffsGetVolInfo(struct super_block *sb, VOL_INFO_T *info);
+s32 ffsSyncVol(struct super_block *sb, s32 do_sync);
+
+/* file management functions */
+s32 ffsLookupFile(struct inode *inode, char *path, FILE_ID_T *fid);
+s32 ffsCreateFile(struct inode *inode, char *path, u8 mode, FILE_ID_T *fid);
+s32 ffsReadFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *rcount);
+s32 ffsWriteFile(struct inode *inode, FILE_ID_T *fid, void *buffer, u64 count, u64 *wcount);
+s32 ffsTruncateFile(struct inode *inode, u64 old_size, u64 new_size);
+s32 ffsMoveFile(struct inode *old_parent_inode, FILE_ID_T *fid, struct inode *new_parent_inode, struct dentry *new_dentry);
+s32 ffsRemoveFile(struct inode *inode, FILE_ID_T *fid);
+s32 ffsSetAttr(struct inode *inode, u32 attr);
+s32 ffsGetStat(struct inode *inode, DIR_ENTRY_T *info);
+s32 ffsSetStat(struct inode *inode, DIR_ENTRY_T *info);
+s32 ffsMapCluster(struct inode *inode, s32 clu_offset, u32 *clu);
+
+/* directory management functions */
+s32 ffsCreateDir(struct inode *inode, char *path, FILE_ID_T *fid);
+s32 ffsReadDir(struct inode *inode, DIR_ENTRY_T *dir_ent);
+s32 ffsRemoveDir(struct inode *inode, FILE_ID_T *fid);
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations (NOT TO UPPER LAYER)                 */
+/*----------------------------------------------------------------------*/
+
+/* fs management functions */
+s32  fs_init(void);
+s32  fs_shutdown(void);
+void   fs_set_vol_flags(struct super_block *sb, u32 new_flag);
+void   fs_sync(struct super_block *sb, s32 do_sync);
+void   fs_error(struct super_block *sb);
+
+/* cluster management functions */
+s32   clear_cluster(struct super_block *sb, u32 clu);
+s32  fat_alloc_cluster(struct super_block *sb, s32 num_alloc, CHAIN_T *p_chain);
+s32  exfat_alloc_cluster(struct super_block *sb, s32 num_alloc, CHAIN_T *p_chain);
+void   fat_free_cluster(struct super_block *sb, CHAIN_T *p_chain, s32 do_relse);
+void   exfat_free_cluster(struct super_block *sb, CHAIN_T *p_chain, s32 do_relse);
+u32 find_last_cluster(struct super_block *sb, CHAIN_T *p_chain);
+s32  count_num_clusters(struct super_block *sb, CHAIN_T *dir);
+s32  fat_count_used_clusters(struct super_block *sb);
+s32  exfat_count_used_clusters(struct super_block *sb);
+void   exfat_chain_cont_cluster(struct super_block *sb, u32 chain, s32 len);
+
+/* allocation bitmap management functions */
+s32  load_alloc_bitmap(struct super_block *sb);
+void   free_alloc_bitmap(struct super_block *sb);
+s32   set_alloc_bitmap(struct super_block *sb, u32 clu);
+s32   clr_alloc_bitmap(struct super_block *sb, u32 clu);
+u32 test_alloc_bitmap(struct super_block *sb, u32 clu);
+void   sync_alloc_bitmap(struct super_block *sb);
+
+/* upcase table management functions */
+s32  load_upcase_table(struct super_block *sb);
+void   free_upcase_table(struct super_block *sb);
+
+/* dir entry management functions */
+u32 fat_get_entry_type(DENTRY_T *p_entry);
+u32 exfat_get_entry_type(DENTRY_T *p_entry);
+void   fat_set_entry_type(DENTRY_T *p_entry, u32 type);
+void   exfat_set_entry_type(DENTRY_T *p_entry, u32 type);
+u32 fat_get_entry_attr(DENTRY_T *p_entry);
+u32 exfat_get_entry_attr(DENTRY_T *p_entry);
+void   fat_set_entry_attr(DENTRY_T *p_entry, u32 attr);
+void   exfat_set_entry_attr(DENTRY_T *p_entry, u32 attr);
+u8  fat_get_entry_flag(DENTRY_T *p_entry);
+u8  exfat_get_entry_flag(DENTRY_T *p_entry);
+void   fat_set_entry_flag(DENTRY_T *p_entry, u8 flag);
+void   exfat_set_entry_flag(DENTRY_T *p_entry, u8 flag);
+u32 fat_get_entry_clu0(DENTRY_T *p_entry);
+u32 exfat_get_entry_clu0(DENTRY_T *p_entry);
+void   fat_set_entry_clu0(DENTRY_T *p_entry, u32 start_clu);
+void   exfat_set_entry_clu0(DENTRY_T *p_entry, u32 start_clu);
+u64 fat_get_entry_size(DENTRY_T *p_entry);
+u64 exfat_get_entry_size(DENTRY_T *p_entry);
+void   fat_set_entry_size(DENTRY_T *p_entry, u64 size);
+void   exfat_set_entry_size(DENTRY_T *p_entry, u64 size);
+void   fat_get_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode);
+void   exfat_get_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode);
+void   fat_set_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode);
+void   exfat_set_entry_time(DENTRY_T *p_entry, TIMESTAMP_T *tp, u8 mode);
+s32   fat_init_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type, u32 start_clu, u64 size);
+s32   exfat_init_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type, u32 start_clu, u64 size);
+s32   fat_init_ext_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 num_entries, UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname);
+s32   exfat_init_ext_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 num_entries, UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname);
+void   init_dos_entry(DOS_DENTRY_T *ep, u32 type, u32 start_clu);
+void   init_ext_entry(EXT_DENTRY_T *ep, s32 order, u8 chksum, u16 *uniname);
+void   init_file_entry(FILE_DENTRY_T *ep, u32 type);
+void   init_strm_entry(STRM_DENTRY_T *ep, u8 flags, u32 start_clu, u64 size);
+void   init_name_entry(NAME_DENTRY_T *ep, u16 *uniname);
+void   fat_delete_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 order, s32 num_entries);
+void   exfat_delete_dir_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, s32 order, s32 num_entries);
+
+s32   find_location(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 *sector, s32 *offset);
+DENTRY_T *get_entry_with_sector(struct super_block *sb, u32 sector, s32 offset);
+DENTRY_T *get_entry_in_dir(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 *sector);
+ENTRY_SET_CACHE_T *get_entry_set_in_dir(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u32 type, DENTRY_T **file_ep);
+void release_entry_set(ENTRY_SET_CACHE_T *es);
+s32 write_whole_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es);
+s32 write_partial_entries_in_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es, DENTRY_T *ep, u32 count);
+s32  search_deleted_or_unused_entry(struct super_block *sb, CHAIN_T *p_dir, s32 num_entries);
+s32  find_empty_entry(struct inode *inode, CHAIN_T *p_dir, s32 num_entries);
+s32  fat_find_dir_entry(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 num_entries, DOS_NAME_T *p_dosname, u32 type);
+s32  exfat_find_dir_entry(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 num_entries, DOS_NAME_T *p_dosname, u32 type);
+s32  fat_count_ext_entries(struct super_block *sb, CHAIN_T *p_dir, s32 entry, DENTRY_T *p_entry);
+s32  exfat_count_ext_entries(struct super_block *sb, CHAIN_T *p_dir, s32 entry, DENTRY_T *p_entry);
+s32  count_dos_name_entries(struct super_block *sb, CHAIN_T *p_dir, u32 type);
+void   update_dir_checksum(struct super_block *sb, CHAIN_T *p_dir, s32 entry);
+void update_dir_checksum_with_entry_set(struct super_block *sb, ENTRY_SET_CACHE_T *es);
+bool   is_dir_empty(struct super_block *sb, CHAIN_T *p_dir);
+
+/* name conversion functions */
+s32  get_num_entries_and_dos_name(struct super_block *sb, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, s32 *entries, DOS_NAME_T *p_dosname);
+void   get_uni_name_from_dos_entry(struct super_block *sb, DOS_DENTRY_T *ep, UNI_NAME_T *p_uniname, u8 mode);
+void   fat_get_uni_name_from_ext_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u16 *uniname);
+void   exfat_get_uni_name_from_ext_entry(struct super_block *sb, CHAIN_T *p_dir, s32 entry, u16 *uniname);
+s32  extract_uni_name_from_ext_entry(EXT_DENTRY_T *ep, u16 *uniname, s32 order);
+s32  extract_uni_name_from_name_entry(NAME_DENTRY_T *ep, u16 *uniname, s32 order);
+s32  fat_generate_dos_name(struct super_block *sb, CHAIN_T *p_dir, DOS_NAME_T *p_dosname);
+void   fat_attach_count_to_dos_name(u8 *dosname, s32 count);
+s32  fat_calc_num_entries(UNI_NAME_T *p_uniname);
+s32  exfat_calc_num_entries(UNI_NAME_T *p_uniname);
+u8  calc_checksum_1byte(void *data, s32 len, u8 chksum);
+u16 calc_checksum_2byte(void *data, s32 len, u16 chksum, s32 type);
+u32 calc_checksum_4byte(void *data, s32 len, u32 chksum, s32 type);
+
+/* name resolution functions */
+s32  resolve_path(struct inode *inode, char *path, CHAIN_T *p_dir, UNI_NAME_T *p_uniname);
+s32  resolve_name(u8 *name, u8 **arg);
+
+/* file operation functions */
+s32  fat16_mount(struct super_block *sb, PBR_SECTOR_T *p_pbr);
+s32  fat32_mount(struct super_block *sb, PBR_SECTOR_T *p_pbr);
+s32  exfat_mount(struct super_block *sb, PBR_SECTOR_T *p_pbr);
+s32  create_dir(struct inode *inode, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, FILE_ID_T *fid);
+s32  create_file(struct inode *inode, CHAIN_T *p_dir, UNI_NAME_T *p_uniname, u8 mode, FILE_ID_T *fid);
+void   remove_file(struct inode *inode, CHAIN_T *p_dir, s32 entry);
+s32  rename_file(struct inode *inode, CHAIN_T *p_dir, s32 old_entry, UNI_NAME_T *p_uniname, FILE_ID_T *fid);
+s32  move_file(struct inode *inode, CHAIN_T *p_olddir, s32 oldentry, CHAIN_T *p_newdir, UNI_NAME_T *p_uniname, FILE_ID_T *fid);
+
+/* sector read/write functions */
+s32   sector_read(struct super_block *sb, u32 sec, struct buffer_head **bh, s32 read);
+s32   sector_write(struct super_block *sb, u32 sec, struct buffer_head *bh, s32 sync);
+s32   multi_sector_read(struct super_block *sb, u32 sec, struct buffer_head **bh, s32 num_secs, s32 read);
+s32   multi_sector_write(struct super_block *sb, u32 sec, struct buffer_head *bh, s32 num_secs, s32 sync);
+
+#endif /* _EXFAT_H */
diff --git b/fs/exfat/exfat_data.c b/fs/exfat/exfat_data.c
new file mode 100644
index 0000000..65da07a
--- /dev/null
+++ b/fs/exfat/exfat_data.c
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_data.c                                              */
+/*  PURPOSE : exFAT Configurable Data Definitions                       */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include "exfat_config.h"
+#include "exfat_data.h"
+#include "exfat_oal.h"
+
+#include "exfat_blkdev.h"
+#include "exfat_cache.h"
+#include "exfat_nls.h"
+#include "exfat_super.h"
+#include "exfat_core.h"
+
+/*======================================================================*/
+/*                                                                      */
+/*                    GLOBAL VARIABLE DEFINITIONS                       */
+/*                                                                      */
+/*======================================================================*/
+
+/*----------------------------------------------------------------------*/
+/*  File Manager                                                        */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Buffer Manager                                                      */
+/*----------------------------------------------------------------------*/
+
+/* FAT cache */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+DECLARE_MUTEX(f_sem);
+#else
+DEFINE_SEMAPHORE(f_sem);
+#endif
+BUF_CACHE_T FAT_cache_array[FAT_CACHE_SIZE];
+BUF_CACHE_T FAT_cache_lru_list;
+BUF_CACHE_T FAT_cache_hash_list[FAT_CACHE_HASH_SIZE];
+
+/* buf cache */
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+DECLARE_MUTEX(b_sem);
+#else
+DEFINE_SEMAPHORE(b_sem);
+#endif
+BUF_CACHE_T buf_cache_array[BUF_CACHE_SIZE];
+BUF_CACHE_T buf_cache_lru_list;
+BUF_CACHE_T buf_cache_hash_list[BUF_CACHE_HASH_SIZE];
diff --git b/fs/exfat/exfat_data.h b/fs/exfat/exfat_data.h
new file mode 100644
index 0000000..53b0e39
--- /dev/null
+++ b/fs/exfat/exfat_data.h
@@ -0,0 +1,58 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_data.h                                              */
+/*  PURPOSE : Header File for exFAT Configurable Constants              */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_DATA_H
+#define _EXFAT_DATA_H
+
+#include "exfat_config.h"
+
+/*======================================================================*/
+/*                                                                      */
+/*                        FFS CONFIGURATIONS                            */
+/*                  (CHANGE THIS PART IF REQUIRED)                      */
+/*                                                                      */
+/*======================================================================*/
+
+/* max number of root directory entries in FAT12/16 */
+/* (should be a power of 2)                         */
+#define MAX_DENTRY              512
+
+/* cache size (in number of sectors)                */
+/* (should be a power of 2)                         */
+#define FAT_CACHE_SIZE          128
+#define FAT_CACHE_HASH_SIZE     64
+#define BUF_CACHE_SIZE          256
+#define BUF_CACHE_HASH_SIZE     64
+
+#endif /* _EXFAT_DATA_H */
diff --git b/fs/exfat/exfat_nls.c b/fs/exfat/exfat_nls.c
new file mode 100644
index 0000000..a48b3d0
--- /dev/null
+++ b/fs/exfat/exfat_nls.c
@@ -0,0 +1,448 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_nls.c                                               */
+/*  PURPOSE : exFAT NLS Manager                                         */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include "exfat_config.h"
+#include "exfat_data.h"
+
+#include "exfat_nls.h"
+#include "exfat_api.h"
+#include "exfat_super.h"
+#include "exfat_core.h"
+
+#include <linux/nls.h>
+
+/*----------------------------------------------------------------------*/
+/*  Global Variable Definitions                                         */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Local Variable Definitions                                          */
+/*----------------------------------------------------------------------*/
+
+static u16 bad_dos_chars[] = {
+	/* + , ; = [ ] and their fullwidth forms */
+	0x002B, 0x002C, 0x003B, 0x003D, 0x005B, 0x005D,
+	0xFF0B, 0xFF0C, 0xFF1B, 0xFF1D, 0xFF3B, 0xFF3D,
+	0
+};
+
+static u16 bad_uni_chars[] = {
+	/* " * / : < > ? \ | */
+	0x0022,         0x002A, 0x002F, 0x003A,
+	0x003C, 0x003E, 0x003F, 0x005C, 0x007C,
+	0
+};
+
+/*----------------------------------------------------------------------*/
+/*  Local Function Declarations                                         */
+/*----------------------------------------------------------------------*/
+
+static s32  convert_uni_to_ch(struct nls_table *nls, u8 *ch, u16 uni, s32 *lossy);
+static s32  convert_ch_to_uni(struct nls_table *nls, u16 *uni, u8 *ch, s32 *lossy);
+
+/*======================================================================*/
+/*  Global Function Definitions                                         */
+/*======================================================================*/
+
+u16 nls_upper(struct super_block *sb, u16 a)
+{
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+
+	if (EXFAT_SB(sb)->options.casesensitive)
+		return a;
+	if (p_fs->vol_utbl != NULL && (p_fs->vol_utbl)[get_col_index(a)] != NULL)
+		return (p_fs->vol_utbl)[get_col_index(a)][get_row_index(a)];
+	else
+		return a;
+}
+
+u16 *nls_wstrchr(u16 *str, u16 wchar)
+{
+	while (*str) {
+		if (*(str++) == wchar)
+			return str;
+	}
+
+	return NULL;
+}
+
+s32 nls_dosname_cmp(struct super_block *sb, u8 *a, u8 *b)
+{
+	return strncmp((void *) a, (void *) b, DOS_NAME_LENGTH);
+} /* end of nls_dosname_cmp */
+
+s32 nls_uniname_cmp(struct super_block *sb, u16 *a, u16 *b)
+{
+	int i;
+
+	for (i = 0; i < MAX_NAME_LENGTH; i++, a++, b++) {
+		if (nls_upper(sb, *a) != nls_upper(sb, *b))
+			return 1;
+		if (*a == 0x0)
+			return 0;
+	}
+	return 0;
+} /* end of nls_uniname_cmp */
+
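+/*
+ * Build the 8.3 (DOS) alias for a Unicode name. name_case records case
+ * hints: 0x08 means the base name is all lower case, 0x10 means the
+ * extension is, and 0xFF marks a name whose case the two bits cannot
+ * describe (mixed case or multi-byte characters).
+ */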
+void nls_uniname_to_dosname(struct super_block *sb, DOS_NAME_T *p_dosname, UNI_NAME_T *p_uniname, s32 *p_lossy)
+{
+	int i, j, len, lossy = FALSE;
+	u8 buf[MAX_CHARSET_SIZE];
+	u8 lower = 0, upper = 0;
+	u8 *dosname = p_dosname->name;
+	u16 *uniname = p_uniname->name;
+	u16 *p, *last_period;
+	struct nls_table *nls = EXFAT_SB(sb)->nls_disk;
+
+	for (i = 0; i < DOS_NAME_LENGTH; i++)
+		*(dosname+i) = ' ';
+
+	if (!nls_uniname_cmp(sb, uniname, (u16 *) UNI_CUR_DIR_NAME)) {
+		*(dosname) = '.';
+		p_dosname->name_case = 0x0;
+		if (p_lossy != NULL)
+			*p_lossy = FALSE;
+		return;
+	}
+
+	if (!nls_uniname_cmp(sb, uniname, (u16 *) UNI_PAR_DIR_NAME)) {
+		*(dosname) = '.';
+		*(dosname+1) = '.';
+		p_dosname->name_case = 0x0;
+		if (p_lossy != NULL)
+			*p_lossy = FALSE;
+		return;
+	}
+
+	/* search for the last embedded period */
+	last_period = NULL;
+	for (p = uniname; *p; p++) {
+		if (*p == (u16) '.')
+			last_period = p;
+	}
+
+	i = 0;
+	while (i < DOS_NAME_LENGTH) {
+		if (i == 8) {
+			if (last_period == NULL)
+				break;
+
+			if (uniname <= last_period) {
+				if (uniname < last_period)
+					lossy = TRUE;
+				uniname = last_period + 1;
+			}
+		}
+
+		if (*uniname == (u16) '\0') {
+			break;
+		} else if (*uniname == (u16) ' ') {
+			lossy = TRUE;
+		} else if (*uniname == (u16) '.') {
+			if (uniname < last_period)
+				lossy = TRUE;
+			else
+				i = 8;
+		} else if (nls_wstrchr(bad_dos_chars, *uniname)) {
+			lossy = TRUE;
+			*(dosname+i) = '_';
+			i++;
+		} else {
+			len = convert_uni_to_ch(nls, buf, *uniname, &lossy);
+
+			if (len > 1) {
+				if ((i >= 8) && ((i+len) > DOS_NAME_LENGTH))
+					break;
+
+				if ((i <  8) && ((i+len) > 8)) {
+					i = 8;
+					continue;
+				}
+
+				lower = 0xFF;
+
+				for (j = 0; j < len; j++, i++)
+					*(dosname+i) = *(buf+j);
+			} else { /* len == 1 */
+				if ((*buf >= 'a') && (*buf <= 'z')) {
+					*(dosname+i) = *buf - ('a' - 'A');
+
+					if (i < 8)
+						lower |= 0x08;
+					else
+						lower |= 0x10;
+				} else if ((*buf >= 'A') && (*buf <= 'Z')) {
+					*(dosname+i) = *buf;
+
+					if (i < 8)
+						upper |= 0x08;
+					else
+						upper |= 0x10;
+				} else {
+					*(dosname+i) = *buf;
+				}
+				i++;
+			}
+		}
+
+		uniname++;
+	}
+
+	if (*dosname == 0xE5)
+		*dosname = 0x05;
+
+	if (*uniname != 0x0)
+		lossy = TRUE;
+
+	if (upper & lower)
+		p_dosname->name_case = 0xFF;
+	else
+		p_dosname->name_case = lower;
+
+	if (p_lossy != NULL)
+		*p_lossy = lossy;
+} /* end of nls_uniname_to_dosname */
+
+void nls_dosname_to_uniname(struct super_block *sb, UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname)
+{
+	int i = 0, j, n = 0;
+	u8 buf[DOS_NAME_LENGTH+2];
+	u8 *dosname = p_dosname->name;
+	u16 *uniname = p_uniname->name;
+	struct nls_table *nls = EXFAT_SB(sb)->nls_disk;
+
+	if (*dosname == 0x05) {
+		*buf = 0xE5;
+		i++;
+		n++;
+	}
+
+	for (; i < 8; i++, n++) {
+		if (*(dosname+i) == ' ')
+			break;
+
+		if ((*(dosname+i) >= 'A') && (*(dosname+i) <= 'Z') && (p_dosname->name_case & 0x08))
+			*(buf+n) = *(dosname+i) + ('a' - 'A');
+		else
+			*(buf+n) = *(dosname+i);
+	}
+	if (*(dosname+8) != ' ') {
+		*(buf+n) = '.';
+		n++;
+	}
+
+	for (i = 8; i < DOS_NAME_LENGTH; i++, n++) {
+		if (*(dosname+i) == ' ')
+			break;
+
+		if ((*(dosname+i) >= 'A') && (*(dosname+i) <= 'Z') && (p_dosname->name_case & 0x10))
+			*(buf+n) = *(dosname+i) + ('a' - 'A');
+		else
+			*(buf+n) = *(dosname+i);
+	}
+	*(buf+n) = '\0';
+
+	i = j = 0;
+	while (j < (MAX_NAME_LENGTH-1)) {
+		if (*(buf+i) == '\0')
+			break;
+
+		i += convert_ch_to_uni(nls, uniname, (buf+i), NULL);
+
+		uniname++;
+		j++;
+	}
+
+	*uniname = (u16) '\0';
+} /* end of nls_dosname_to_uniname */
+
+void nls_uniname_to_cstring(struct super_block *sb, u8 *p_cstring, UNI_NAME_T *p_uniname)
+{
+	int i, j, len;
+	u8 buf[MAX_CHARSET_SIZE];
+	u16 *uniname = p_uniname->name;
+	struct nls_table *nls = EXFAT_SB(sb)->nls_io;
+
+	if (nls == NULL) {
+		len = utf16s_to_utf8s(uniname, MAX_NAME_LENGTH, UTF16_HOST_ENDIAN, p_cstring, MAX_NAME_LENGTH);
+		p_cstring[len] = 0;
+		return;
+	}
+
+	i = 0;
+	while (i < (MAX_NAME_LENGTH-1)) {
+		if (*uniname == (u16) '\0')
+			break;
+
+		len = convert_uni_to_ch(nls, buf, *uniname, NULL);
+
+		if (len > 1) {
+			for (j = 0; j < len; j++)
+				*p_cstring++ = (char) *(buf+j);
+		} else { /* len == 1 */
+			*p_cstring++ = (char) *buf;
+		}
+
+		uniname++;
+		i++;
+	}
+
+	*p_cstring = '\0';
+} /* end of nls_uniname_to_cstring */
+
+void nls_cstring_to_uniname(struct super_block *sb, UNI_NAME_T *p_uniname, u8 *p_cstring, s32 *p_lossy)
+{
+	int i, j, lossy = FALSE;
+	u8 *end_of_name;
+	u8 upname[MAX_NAME_LENGTH * 2];
+	u16 *uniname = p_uniname->name;
+	struct nls_table *nls = EXFAT_SB(sb)->nls_io;
+
+	/* strip all trailing spaces */
+	end_of_name = p_cstring + strlen((char *) p_cstring);
+
+	while (*(--end_of_name) == ' ') {
+		if (end_of_name < p_cstring)
+			break;
+	}
+	*(++end_of_name) = '\0';
+
+	if (strcmp((char *) p_cstring, ".") && strcmp((char *) p_cstring, "..")) {
+
+		/* strip all trailing periods */
+		while (*(--end_of_name) == '.') {
+			if (end_of_name < p_cstring)
+				break;
+		}
+		*(++end_of_name) = '\0';
+	}
+
+	if (*p_cstring == '\0')
+		lossy = TRUE;
+
+	if (nls == NULL) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,101)
+		i = utf8s_to_utf16s(p_cstring, MAX_NAME_LENGTH, uniname);
+#else
+		i = utf8s_to_utf16s(p_cstring, MAX_NAME_LENGTH, UTF16_HOST_ENDIAN, uniname, MAX_NAME_LENGTH);
+#endif
+		for (j = 0; j < i; j++)
+			SET16_A(upname + j * 2, nls_upper(sb, uniname[j]));
+		uniname[i] = '\0';
+	} else {
+		i = j = 0;
+		while (j < (MAX_NAME_LENGTH-1)) {
+			if (*(p_cstring+i) == '\0')
+				break;
+
+			i += convert_ch_to_uni(nls, uniname, (u8 *)(p_cstring+i), &lossy);
+
+			if ((*uniname < 0x0020) || nls_wstrchr(bad_uni_chars, *uniname))
+				lossy = TRUE;
+
+			SET16_A(upname + j * 2, nls_upper(sb, *uniname));
+
+			uniname++;
+			j++;
+		}
+
+		if (*(p_cstring+i) != '\0')
+			lossy = TRUE;
+		*uniname = (u16) '\0';
+	}
+
+	p_uniname->name_len = j;
+	p_uniname->name_hash = calc_checksum_2byte((void *) upname, j<<1, 0, CS_DEFAULT);
+
+	if (p_lossy != NULL)
+		*p_lossy = lossy;
+} /* end of nls_cstring_to_uniname */
+
+/*======================================================================*/
+/*  Local Function Definitions                                          */
+/*======================================================================*/
+
+static s32 convert_ch_to_uni(struct nls_table *nls, u16 *uni, u8 *ch, s32 *lossy)
+{
+	int len;
+
+	*uni = 0x0;
+
+	if (ch[0] < 0x80) {
+		*uni = (u16) ch[0];
+		return 1;
+	}
+
+	len = nls->char2uni(ch, NLS_MAX_CHARSET_SIZE, uni);
+	if (len < 0) {
+		/* conversion failed */
+		printk("%s: fail to use nls\n", __func__);
+		if (lossy != NULL)
+			*lossy = TRUE;
+		*uni = (u16) '_';
+		if (!strcmp(nls->charset, "utf8"))
+			return 1;
+		else
+			return 2;
+	}
+
+	return len;
+} /* end of convert_ch_to_uni */
+
+static s32 convert_uni_to_ch(struct nls_table *nls, u8 *ch, u16 uni, s32 *lossy)
+{
+	int len;
+
+	ch[0] = 0x0;
+
+	if (uni < 0x0080) {
+		ch[0] = (u8) uni;
+		return 1;
+	}
+
+	len = nls->uni2char(uni, ch, NLS_MAX_CHARSET_SIZE);
+	if (len < 0) {
+		/* conversion failed */
+		printk("%s: fail to use nls\n", __func__);
+		if (lossy != NULL)
+			*lossy = TRUE;
+		ch[0] = '_';
+		return 1;
+	}
+
+	return len;
+
+} /* end of convert_uni_to_ch */
diff --git b/fs/exfat/exfat_nls.h b/fs/exfat/exfat_nls.h
new file mode 100644
index 0000000..bc516d7
--- /dev/null
+++ b/fs/exfat/exfat_nls.h
@@ -0,0 +1,91 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_nls.h                                               */
+/*  PURPOSE : Header File for exFAT NLS Manager                         */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_NLS_H
+#define _EXFAT_NLS_H
+
+#include <linux/types.h>
+#include <linux/nls.h>
+
+#include "exfat_config.h"
+#include "exfat_api.h"
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions                                        */
+/*----------------------------------------------------------------------*/
+
+#define NUM_UPCASE              2918
+
+#define DOS_CUR_DIR_NAME        ".          "
+#define DOS_PAR_DIR_NAME        "..         "
+
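+/* "." and ".." as UTF-16 code units, spelled out byte-by-byte for the
+ * host endianness so they can be compared against UNI_NAME_T names.
+ */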
+#ifdef __LITTLE_ENDIAN
+#define UNI_CUR_DIR_NAME        ".\0"
+#define UNI_PAR_DIR_NAME        ".\0.\0"
+#else
+#define UNI_CUR_DIR_NAME        "\0."
+#define UNI_PAR_DIR_NAME        "\0.\0."
+#endif
+
+/*----------------------------------------------------------------------*/
+/*  Type Definitions                                                    */
+/*----------------------------------------------------------------------*/
+
+/* DOS name structure */
+typedef struct {
+	u8       name[DOS_NAME_LENGTH];
+	u8       name_case;
+} DOS_NAME_T;
+
+/* unicode name structure */
+typedef struct {
+	u16      name[MAX_NAME_LENGTH];
+	u16      name_hash;
+	u8       name_len;
+} UNI_NAME_T;
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+/* NLS management function */
+u16 nls_upper(struct super_block *sb, u16 a);
+s32  nls_dosname_cmp(struct super_block *sb, u8 *a, u8 *b);
+s32  nls_uniname_cmp(struct super_block *sb, u16 *a, u16 *b);
+void   nls_uniname_to_dosname(struct super_block *sb, DOS_NAME_T *p_dosname, UNI_NAME_T *p_uniname, s32 *p_lossy);
+void   nls_dosname_to_uniname(struct super_block *sb, UNI_NAME_T *p_uniname, DOS_NAME_T *p_dosname);
+void   nls_uniname_to_cstring(struct super_block *sb, u8 *p_cstring, UNI_NAME_T *p_uniname);
+void   nls_cstring_to_uniname(struct super_block *sb, UNI_NAME_T *p_uniname, u8 *p_cstring, s32 *p_lossy);
+
+#endif /* _EXFAT_NLS_H */
diff --git b/fs/exfat/exfat_oal.c b/fs/exfat/exfat_oal.c
new file mode 100644
index 0000000..9b33998
--- /dev/null
+++ b/fs/exfat/exfat_oal.c
@@ -0,0 +1,190 @@
+/* Some of the source code in this file came from "linux/fs/fat/misc.c".  */
+/*
+ *  linux/fs/fat/misc.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980
+ *         and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru)
+ */
+
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_oal.c                                               */
+/*  PURPOSE : exFAT OS Adaptation Layer                                 */
+/*            (Semaphore Functions & Real-Time Clock Functions)         */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include <linux/semaphore.h>
+#include <linux/time.h>
+
+#include "exfat_config.h"
+#include "exfat_api.h"
+#include "exfat_oal.h"
+
+/*======================================================================*/
+/*                                                                      */
+/*            SEMAPHORE FUNCTIONS                                       */
+/*                                                                      */
+/*======================================================================*/
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+DECLARE_MUTEX(z_sem);
+#else
+DEFINE_SEMAPHORE(z_sem);
+#endif
+
+s32 sm_init(struct semaphore *sm)
+{
+	sema_init(sm, 1);
+	return 0;
+} /* end of sm_init */
+
+s32 sm_P(struct semaphore *sm)
+{
+	down(sm);
+	return 0;
+} /* end of sm_P */
+
+void sm_V(struct semaphore *sm)
+{
+	up(sm);
+} /* end of sm_V */
+
+
+/*======================================================================*/
+/*                                                                      */
+/*            REAL-TIME CLOCK FUNCTIONS                                 */
+/*                                                                      */
+/*======================================================================*/
+
+extern struct timezone sys_tz;
+
+/*
+ * The epoch of FAT timestamp is 1980.
+ *     :  bits  : value
+ * date:  0 -  4: day    (1 -  31)
+ * date:  5 -  8: month  (1 -  12)
+ * date:  9 - 15: year   (0 - 127) from 1980
+ * time:  0 -  4: sec    (0 -  29) 2sec counts
+ * time:  5 - 10: min    (0 -  59)
+ * time: 11 - 15: hour   (0 -  23)
+ */
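+/*
+ * For example, with the layout above, 2013-07-15 12:34:56 local time
+ * packs to date = (33 << 9) | (7 << 5) | 15 = 0x42EF and
+ * time = (12 << 11) | (34 << 5) | (56 / 2) = 0x645C.
+ */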
+#define UNIX_SECS_1980   315532800L
+
+#if BITS_PER_LONG == 64
+#define UNIX_SECS_2108   4354819200L
+#endif
+/* days between 1.1.70 and 1.1.80 (2 leap days) */
+#define DAYS_DELTA_DECADE	(365 * 10 + 2)
+/* 120 (2100 - 1980) isn't leap year */
+#define NO_LEAP_YEAR_2100    (120)
+#define IS_LEAP_YEAR(y)  (!((y) & 3) && (y) != NO_LEAP_YEAR_2100)
+
+#define SECS_PER_MIN     (60)
+#define SECS_PER_HOUR    (60 * SECS_PER_MIN)
+#define SECS_PER_DAY     (24 * SECS_PER_HOUR)
+
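+/*
+ * Number of leap days between 1980-01-01 and 1 January of (1980 + year);
+ * 1980 itself is a leap year, 2100 is not.
+ */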
+#define MAKE_LEAP_YEAR(leap_year, year)                         \
+	do {                                                    \
+		if (unlikely(year > NO_LEAP_YEAR_2100))         \
+			leap_year = ((year + 3) / 4) - 1;       \
+		else                                            \
+			leap_year = ((year + 3) / 4);           \
+	} while (0)
+
+/* Linear day numbers of the respective 1sts in non-leap years. */
+static time_t accum_days_in_year[] = {
+	/* Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
+	0,   0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
+};
+
+TIMESTAMP_T *tm_current(TIMESTAMP_T *tp)
+{
+	struct timespec ts = CURRENT_TIME_SEC;
+	time_t second = ts.tv_sec;
+	time_t day, leap_day, month, year;
+
+	second -= sys_tz.tz_minuteswest * SECS_PER_MIN;
+
+	/* Jan 1 GMT 00:00:00 1980. But what about another time zone? */
+	if (second < UNIX_SECS_1980) {
+		tp->sec  = 0;
+		tp->min  = 0;
+		tp->hour = 0;
+		tp->day  = 1;
+		tp->mon  = 1;
+		tp->year = 0;
+		return tp;
+	}
+#if BITS_PER_LONG == 64
+	if (second >= UNIX_SECS_2108) {
+		tp->sec  = 59;
+		tp->min  = 59;
+		tp->hour = 23;
+		tp->day  = 31;
+		tp->mon  = 12;
+		tp->year = 127;
+		return tp;
+	}
+#endif
+
+	day = second / SECS_PER_DAY - DAYS_DELTA_DECADE;
+	year = day / 365;
+
+	MAKE_LEAP_YEAR(leap_day, year);
+	if (year * 365 + leap_day > day)
+		year--;
+
+	MAKE_LEAP_YEAR(leap_day, year);
+
+	day -= year * 365 + leap_day;
+
+	if (IS_LEAP_YEAR(year) && day == accum_days_in_year[3]) {
+		month = 2;
+	} else {
+		if (IS_LEAP_YEAR(year) && day > accum_days_in_year[3])
+			day--;
+		for (month = 1; month < 12; month++) {
+			if (accum_days_in_year[month + 1] > day)
+				break;
+		}
+	}
+	day -= accum_days_in_year[month];
+
+	tp->sec  = second % SECS_PER_MIN;
+	tp->min  = (second / SECS_PER_MIN) % 60;
+	tp->hour = (second / SECS_PER_HOUR) % 24;
+	tp->day  = day + 1;
+	tp->mon  = month;
+	tp->year = year;
+
+	return tp;
+} /* end of tm_current */
diff --git b/fs/exfat/exfat_oal.h b/fs/exfat/exfat_oal.h
new file mode 100644
index 0000000..b6dd789
--- /dev/null
+++ b/fs/exfat/exfat_oal.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_oal.h                                               */
+/*  PURPOSE : Header File for exFAT OS Adaptation Layer                 */
+/*            (Semaphore Functions & Real-Time Clock Functions)         */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#ifndef _EXFAT_OAL_H
+#define _EXFAT_OAL_H
+
+#include <linux/semaphore.h>
+#include "exfat_config.h"
+#include <linux/version.h>
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions (Configurable)                         */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Constant & Macro Definitions (Non-Configurable)                     */
+/*----------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------*/
+/*  Type Definitions                                                    */
+/*----------------------------------------------------------------------*/
+
+typedef struct {
+	u16      sec;        /* 0 ~ 59               */
+	u16      min;        /* 0 ~ 59               */
+	u16      hour;       /* 0 ~ 23               */
+	u16      day;        /* 1 ~ 31               */
+	u16      mon;        /* 1 ~ 12               */
+	u16      year;       /* 0 ~ 127 (since 1980) */
+} TIMESTAMP_T;
+
+/*----------------------------------------------------------------------*/
+/*  External Function Declarations                                      */
+/*----------------------------------------------------------------------*/
+
+s32 sm_init(struct semaphore *sm);
+s32 sm_P(struct semaphore *sm);
+void  sm_V(struct semaphore *sm);
+
+TIMESTAMP_T *tm_current(TIMESTAMP_T *tm);
+
+#endif /* _EXFAT_OAL_H */
diff --git b/fs/exfat/exfat_super.c b/fs/exfat/exfat_super.c
new file mode 100644
index 0000000..7d9b9ab
--- /dev/null
+++ b/fs/exfat/exfat_super.c
@@ -0,0 +1,2643 @@
+/* Some of the source code in this file came from "linux/fs/fat/file.c", "linux/fs/fat/inode.c" and "linux/fs/fat/misc.c". */
+/*
+ *  linux/fs/fat/file.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *
+ *  regular file handling primitives for fat-based filesystems
+ */
+
+/*
+ *  linux/fs/fat/inode.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  VFAT extensions by Gordon Chaffee, merged with msdos fs by Henrik Storner
+ *  Rewritten for the constant inumbers support by Al Viro
+ *
+ *  Fixes:
+ *
+ *    Max Cohan: Fixed invalid FSINFO offset when info_sector is 0
+ */
+
+/*
+ *  linux/fs/fat/misc.c
+ *
+ *  Written 1992,1993 by Werner Almesberger
+ *  22/11/2000 - Fixed fat_date_unix2dos for dates earlier than 01/01/1980
+ *         and date_dos2unix for date==0 by Igor Zhbanov(bsg@uniyar.ac.ru)
+ */
+
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/version.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/time.h>
+#include <linux/slab.h>
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+#include <linux/smp_lock.h>
+#endif
+#include <linux/seq_file.h>
+#include <linux/pagemap.h>
+#include <linux/mpage.h>
+#include <linux/buffer_head.h>
+#include <linux/exportfs.h>
+#include <linux/mount.h>
+#include <linux/vfs.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0)
+#include <linux/aio.h>
+#endif
+#include <linux/parser.h>
+#include <linux/uio.h>
+#include <linux/writeback.h>
+#include <linux/log2.h>
+#include <linux/hash.h>
+#include <linux/backing-dev.h>
+#include <linux/sched.h>
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <asm/current.h>
+#include <asm/unaligned.h>
+
+#include "exfat_version.h"
+#include "exfat_config.h"
+#include "exfat_data.h"
+#include "exfat_oal.h"
+
+#include "exfat_blkdev.h"
+#include "exfat_cache.h"
+#include "exfat_nls.h"
+#include "exfat_api.h"
+#include "exfat_core.h"
+
+#include "exfat_super.h"
+
+static struct kmem_cache *exfat_inode_cachep;
+
+static int exfat_default_codepage = CONFIG_EXFAT_DEFAULT_CODEPAGE;
+static char exfat_default_iocharset[] = CONFIG_EXFAT_DEFAULT_IOCHARSET;
+
+extern struct timezone sys_tz;
+
+#define CHECK_ERR(x)	BUG_ON(x)
+
+#define UNIX_SECS_1980    315532800L
+
+#if BITS_PER_LONG == 64
+#define UNIX_SECS_2108    4354819200L
+#endif
+/* days between 1 Jan 1970 and 1 Jan 1980 (including 2 leap days) */
+#define DAYS_DELTA_DECADE    (365 * 10 + 2)
+/* year 120 (2100 = 1980 + 120) is not a leap year */
+#define NO_LEAP_YEAR_2100    (120)
+#define IS_LEAP_YEAR(y)    (!((y) & 0x3) && (y) != NO_LEAP_YEAR_2100)
+
+#define SECS_PER_MIN    (60)
+#define SECS_PER_HOUR   (60 * SECS_PER_MIN)
+#define SECS_PER_DAY    (24 * SECS_PER_HOUR)
+
+#define MAKE_LEAP_YEAR(leap_year, year)                         \
+	do {                                                    \
+		if (unlikely(year > NO_LEAP_YEAR_2100))         \
+			leap_year = ((year + 3) / 4) - 1;       \
+		else                                            \
+			leap_year = ((year + 3) / 4);           \
+	} while (0)
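+/*
+ * MAKE_LEAP_YEAR() computes the number of leap days between 1 Jan 1980 and
+ * 1 Jan (1980 + year), dropping one once 2100 (not a leap year) is passed.
+ * For example, year = 40 (i.e. 2020) gives (40 + 3) / 4 = 10: the leap days
+ * of 1980, 1984, ..., 2016.
+ */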
+
+/* Linear day numbers of the respective 1sts in non-leap years. */
+static time_t accum_days_in_year[] = {
+	/* Jan  Feb  Mar  Apr  May  Jun  Jul  Aug  Sep  Oct  Nov  Dec */
+	0,   0,  31,  59,  90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0,
+};
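+/*
+ * The table is indexed directly by the 1-based month number (entry 0 is
+ * unused); the trailing zero entries presumably keep the lookup in bounds
+ * should a corrupt month value (13..15) ever reach it.
+ */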
+
+static void _exfat_truncate(struct inode *inode, loff_t old_size);
+
+/* Convert a FAT time/date pair to a UNIX date (seconds since 1 Jan 1970). */
+void exfat_time_fat2unix(struct exfat_sb_info *sbi, struct timespec *ts,
+						 DATE_TIME_T *tp)
+{
+	time_t year = tp->Year;
+	time_t ld;
+
+	MAKE_LEAP_YEAR(ld, year);
+
+	if (IS_LEAP_YEAR(year) && (tp->Month) > 2)
+		ld++;
+
+	ts->tv_sec =  tp->Second  + tp->Minute * SECS_PER_MIN
+				  + tp->Hour * SECS_PER_HOUR
+				  + (year * 365 + ld + accum_days_in_year[(tp->Month)] + (tp->Day - 1) + DAYS_DELTA_DECADE) * SECS_PER_DAY
+				  + sys_tz.tz_minuteswest * SECS_PER_MIN;
+	ts->tv_nsec = 0;
+}
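+/*
+ * Worked example: tp = 2020-01-01 00:00:00 (Year = 40, Month = 1, Day = 1)
+ * gives ld = 10 and 40 * 365 + 10 + 0 + 0 + 3652 = 18262 days, so
+ * tv_sec = 18262 * 86400 = 1577836800 plus the sys_tz correction (the code
+ * treats the on-disk values as local time).
+ */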
+
+/* Convert linear UNIX date to a FAT time/date pair. */
+void exfat_time_unix2fat(struct exfat_sb_info *sbi, struct timespec *ts,
+						 DATE_TIME_T *tp)
+{
+	time_t second = ts->tv_sec;
+	time_t day, month, year;
+	time_t ld;
+
+	second -= sys_tz.tz_minuteswest * SECS_PER_MIN;
+
+	/*
+	 * Times before Jan 1 00:00:00 GMT 1980 cannot be represented;
+	 * clamp to the minimum. (Time-zone handling here is debatable.)
+	 */
+	if (second < UNIX_SECS_1980) {
+		tp->Second  = 0;
+		tp->Minute  = 0;
+		tp->Hour = 0;
+		tp->Day  = 1;
+		tp->Month  = 1;
+		tp->Year = 0;
+		return;
+	}
+#if (BITS_PER_LONG == 64)
+	if (second >= UNIX_SECS_2108) {
+		tp->Second  = 59;
+		tp->Minute  = 59;
+		tp->Hour = 23;
+		tp->Day  = 31;
+		tp->Month  = 12;
+		tp->Year = 127;
+		return;
+	}
+#endif
+	day = second / SECS_PER_DAY - DAYS_DELTA_DECADE;
+	year = day / 365;
+	MAKE_LEAP_YEAR(ld, year);
+	if (year * 365 + ld > day)
+		year--;
+
+	MAKE_LEAP_YEAR(ld, year);
+	day -= year * 365 + ld;
+
+	if (IS_LEAP_YEAR(year) && day == accum_days_in_year[3]) {
+		month = 2;
+	} else {
+		if (IS_LEAP_YEAR(year) && day > accum_days_in_year[3])
+			day--;
+		for (month = 1; month < 12; month++) {
+			if (accum_days_in_year[month + 1] > day)
+				break;
+		}
+	}
+	day -= accum_days_in_year[month];
+
+	tp->Second  = second % SECS_PER_MIN;
+	tp->Minute  = (second / SECS_PER_MIN) % 60;
+	tp->Hour = (second / SECS_PER_HOUR) % 24;
+	tp->Day  = day + 1;
+	tp->Month  = month;
+	tp->Year = year;
+}
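+/*
+ * The inverse conversion above clamps out-of-range times to the
+ * representable window: anything before 1980 becomes 1980-01-01 00:00:00,
+ * and (on 64-bit) anything from 2108 on becomes 2107-12-31 23:59:59.
+ */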
+
+static struct inode *exfat_iget(struct super_block *sb, loff_t i_pos);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+static int exfat_generic_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
+#else
+static long exfat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
+#endif
+static int exfat_sync_inode(struct inode *inode);
+static struct inode *exfat_build_inode(struct super_block *sb, FILE_ID_T *fid, loff_t i_pos);
+static void exfat_detach(struct inode *inode);
+static void exfat_attach(struct inode *inode, loff_t i_pos);
+static inline unsigned long exfat_hash(loff_t i_pos);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34)
+static int exfat_write_inode(struct inode *inode, int wait);
+#else
+static int exfat_write_inode(struct inode *inode, struct writeback_control *wbc);
+#endif
+static void exfat_write_super(struct super_block *sb);
+
+static void __lock_super(struct super_block *sb)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
+	lock_super(sb);
+#else
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	mutex_lock(&sbi->s_lock);
+#endif
+}
+
+static void __unlock_super(struct super_block *sb)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
+	unlock_super(sb);
+#else
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	mutex_unlock(&sbi->s_lock);
+#endif
+}
+
+static int __is_sb_dirty(struct super_block *sb)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
+	return sb->s_dirt;
+#else
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	return sbi->s_dirt;
+#endif
+}
+
+static void __set_sb_clean(struct super_block *sb)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
+	sb->s_dirt = 0;
+#else
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	sbi->s_dirt = 0;
+#endif
+}
+
+static int __exfat_revalidate(struct dentry *dentry)
+{
+	return 0;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,00)
+static int exfat_revalidate(struct dentry *dentry, unsigned int flags)
+#else
+static int exfat_revalidate(struct dentry *dentry, struct nameidata *nd)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,00)
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,00)
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+#endif
+
+	if (dentry->d_inode)
+		return 1;
+	return __exfat_revalidate(dentry);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,00)
+static int exfat_revalidate_ci(struct dentry *dentry, unsigned int flags)
+#else
+static int exfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,00)
+	if (flags & LOOKUP_RCU)
+		return -ECHILD;
+#else
+	unsigned int flags;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,00)
+	if (nd && nd->flags & LOOKUP_RCU)
+		return -ECHILD;
+#endif
+
+	flags = nd ? nd->flags : 0;
+#endif
+
+	if (dentry->d_inode)
+		return 1;
+
+	if (!flags)
+		return 0;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,00)
+	if (flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+		return 0;
+#else
+	if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
+		if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
+			return 0;
+	}
+#endif
+
+	return __exfat_revalidate(dentry);
+}
+
+static unsigned int __exfat_striptail_len(unsigned int len, const char *name)
+{
+	while (len && name[len - 1] == '.')
+		len--;
+	return len;
+}
+
+static unsigned int exfat_striptail_len(const struct qstr *qstr)
+{
+	return __exfat_striptail_len(qstr->len, qstr->name);
+}
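+/*
+ * The dentry hash/compare callbacks below strip trailing dots via the
+ * helpers above, so names differing only in trailing dots (e.g. "foo" and
+ * "foo..") hash and compare as equal.
+ */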
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+static int exfat_d_hash(const struct dentry *dentry, struct qstr *qstr)
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+static int exfat_d_hash(struct dentry *dentry, struct qstr *qstr)
+#else
+static int exfat_d_hash(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,8,0)
+	qstr->hash = full_name_hash(dentry, qstr->name, exfat_striptail_len(qstr));
+#else
+	qstr->hash = full_name_hash(qstr->name, exfat_striptail_len(qstr));
+#endif
+	return 0;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+static int exfat_d_hashi(const struct dentry *dentry, struct qstr *qstr)
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+static int exfat_d_hashi(struct dentry *dentry, struct qstr *qstr)
+#else
+static int exfat_d_hashi(const struct dentry *dentry, const struct inode *inode,
+		struct qstr *qstr)
+#endif
+{
+	struct super_block *sb = dentry->d_sb;
+	const unsigned char *name;
+	unsigned int len;
+	unsigned long hash;
+
+	name = qstr->name;
+	len = exfat_striptail_len(qstr);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,8,0)
+	hash = init_name_hash(dentry);
+#else
+	hash = init_name_hash();
+#endif
+	while (len--)
+		hash = partial_name_hash(nls_upper(sb, *name++), hash);
+	qstr->hash = end_name_hash(hash);
+
+	return 0;
+}
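+/*
+ * The case-insensitive variant hashes the upper-cased name, so names that
+ * differ only in case land in the same hash bucket and are then compared by
+ * exfat_cmpi() below.
+ */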
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,8,0)
+static int exfat_cmpi(const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+static int exfat_cmpi(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+static int exfat_cmpi(struct dentry *parent, struct qstr *a, struct qstr *b)
+#else
+static int exfat_cmpi(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,8,0)
+	struct nls_table *t = EXFAT_SB(dentry->d_sb)->nls_io;
+#else
+	struct nls_table *t = EXFAT_SB(parent->d_sb)->nls_io;
+#endif
+	unsigned int alen, blen;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+	alen = exfat_striptail_len(a);
+	blen = exfat_striptail_len(b);
+#else
+	alen = exfat_striptail_len(name);
+	blen = __exfat_striptail_len(len, str);
+#endif
+	if (alen == blen) {
+		if (t == NULL) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+			if (strncasecmp(a->name, b->name, alen) == 0)
+#else
+			if (strncasecmp(name->name, str, alen) == 0)
+#endif
+				return 0;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+		} else if (nls_strnicmp(t, a->name, b->name, alen) == 0)
+#else
+		} else if (nls_strnicmp(t, name->name, str, alen) == 0)
+#endif
+			return 0;
+	}
+	return 1;
+}
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,8,0)
+static int exfat_cmp(const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+static int exfat_cmp(const struct dentry *parent, const struct dentry *dentry,
+		unsigned int len, const char *str, const struct qstr *name)
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+static int exfat_cmp(struct dentry *parent, struct qstr *a,
+			struct qstr *b)
+#else
+static int exfat_cmp(const struct dentry *parent, const struct inode *pinode,
+		const struct dentry *dentry, const struct inode *inode,
+		unsigned int len, const char *str, const struct qstr *name)
+#endif
+{
+	unsigned int alen, blen;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+	alen = exfat_striptail_len(a);
+	blen = exfat_striptail_len(b);
+#else
+	alen = exfat_striptail_len(name);
+	blen = __exfat_striptail_len(len, str);
+#endif
+	if (alen == blen) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+		if (strncmp(a->name, b->name, alen) == 0)
+#else
+		if (strncmp(name->name, str, alen) == 0)
+#endif
+			return 0;
+	}
+	return 1;
+}
+
+static const struct dentry_operations exfat_ci_dentry_ops = {
+	.d_revalidate   = exfat_revalidate_ci,
+	.d_hash         = exfat_d_hashi,
+	.d_compare      = exfat_cmpi,
+};
+
+static const struct dentry_operations exfat_dentry_ops = {
+	.d_revalidate   = exfat_revalidate,
+	.d_hash         = exfat_d_hash,
+	.d_compare      = exfat_cmp,
+};
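+/*
+ * Which of the two dentry_operations tables is installed is decided at
+ * mount time (not in this hunk), presumably from the 'namecase' option:
+ * exfat_ci_dentry_ops for the default case-insensitive mode and
+ * exfat_dentry_ops for case-sensitive mounts.
+ */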
+
+/*======================================================================*/
+/*  Directory Entry Operations                                          */
+/*======================================================================*/
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+static int exfat_readdir(struct file *filp, struct dir_context *ctx)
+#else
+static int exfat_readdir(struct file *filp, void *dirent, filldir_t filldir)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,9,0)
+	struct inode *inode = file_inode(filp);
+#else
+	struct inode *inode = filp->f_path.dentry->d_inode;
+#endif
+	struct super_block *sb = inode->i_sb;
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	FS_INFO_T *p_fs = &(sbi->fs_info);
+	BD_INFO_T *p_bd = &(EXFAT_SB(sb)->bd_info);
+	DIR_ENTRY_T de;
+	unsigned long inum;
+	loff_t cpos;
+	int err = 0;
+
+	__lock_super(sb);
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+	cpos = ctx->pos;
+#else
+	cpos = filp->f_pos;
+#endif
+	/* Fake . and .. for the root directory (and for every directory on an exFAT volume). */
+	if ((p_fs->vol_type == EXFAT) || (inode->i_ino == EXFAT_ROOT_INO)) {
+		while (cpos < 2) {
+			if (inode->i_ino == EXFAT_ROOT_INO)
+				inum = EXFAT_ROOT_INO;
+			else if (cpos == 0)
+				inum = inode->i_ino;
+			else /* (cpos == 1) */
+				inum = parent_ino(filp->f_path.dentry);
+
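+			/*
+			 * Emit "." and "..".  The pre-3.11 branch reuses the
+			 * ".." string with length cpos + 1, so it emits "."
+			 * first (cpos == 0) and ".." second (cpos == 1).
+			 */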
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+			if (!dir_emit_dots(filp, ctx))
+#else
+			if (filldir(dirent, "..", cpos+1, cpos, inum, DT_DIR) < 0)
+#endif
+				goto out;
+			cpos++;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+			ctx->pos++;
+#else
+			filp->f_pos++;
+#endif
+		}
+		if (cpos == 2)
+			cpos = 0;
+	}
+	if (cpos & (DENTRY_SIZE - 1)) {
+		err = -ENOENT;
+		goto out;
+	}
+
+get_new:
+	EXFAT_I(inode)->fid.size = i_size_read(inode);
+	EXFAT_I(inode)->fid.rwoffset = cpos >> DENTRY_SIZE_BITS;
+
+	err = FsReadDir(inode, &de);
+	if (err) {
+		/* At least we tried to read a sector;
+		 * move cpos to the next sector position (should be aligned).
+		 */
+		if (err == FFS_MEDIAERR) {
+			cpos += 1 << p_bd->sector_size_bits;
+			cpos &= ~((1 << p_bd->sector_size_bits)-1);
+		}
+
+		err = -EIO;
+		goto end_of_dir;
+	}
+
+	cpos = EXFAT_I(inode)->fid.rwoffset << DENTRY_SIZE_BITS;
+
+	if (!de.Name[0])
+		goto end_of_dir;
+
+	if (!memcmp(de.ShortName, DOS_CUR_DIR_NAME, DOS_NAME_LENGTH)) {
+		inum = inode->i_ino;
+	} else if (!memcmp(de.ShortName, DOS_PAR_DIR_NAME, DOS_NAME_LENGTH)) {
+		inum = parent_ino(filp->f_path.dentry);
+	} else {
+		loff_t i_pos = ((loff_t) EXFAT_I(inode)->fid.start_clu << 32) |
+					   ((EXFAT_I(inode)->fid.rwoffset-1) & 0xffffffff);
+
+		struct inode *tmp = exfat_iget(sb, i_pos);
+		if (tmp) {
+			inum = tmp->i_ino;
+			iput(tmp);
+		} else {
+			inum = iunique(sb, EXFAT_ROOT_INO);
+		}
+	}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+	if (!dir_emit(ctx, de.Name, strlen(de.Name), inum,
+		(de.Attr & ATTR_SUBDIR) ? DT_DIR : DT_REG))
+#else
+	if (filldir(dirent, de.Name, strlen(de.Name), cpos-1, inum,
+				(de.Attr & ATTR_SUBDIR) ? DT_DIR : DT_REG) < 0)
+#endif
+		goto out;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+	ctx->pos = cpos;
+#else
+	filp->f_pos = cpos;
+#endif
+	goto get_new;
+
+end_of_dir:
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+	ctx->pos = cpos;
+#else
+	filp->f_pos = cpos;
+#endif
+out:
+	__unlock_super(sb);
+	return err;
+}
+
+static int exfat_ioctl_volume_id(struct inode *dir)
+{
+	struct super_block *sb = dir->i_sb;
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	FS_INFO_T *p_fs = &(sbi->fs_info);
+
+	return p_fs->vol_id;
+}
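+/*
+ * EXFAT_IOCTL_GET_VOLUME_ID hands the volume serial back as the ioctl(2)
+ * return value itself.  A rough userspace sketch (assuming the ioctl number
+ * is exported to applications by a shared header):
+ *
+ *	int id = ioctl(fd, EXFAT_IOCTL_GET_VOLUME_ID);
+ */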
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+static int exfat_generic_ioctl(struct inode *inode, struct file *filp,
+							   unsigned int cmd, unsigned long arg)
+#else
+static long exfat_generic_ioctl(struct file *filp,
+								unsigned int cmd, unsigned long arg)
+#endif
+{
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36)
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,18,3)
+	struct inode *inode = filp->f_path.dentry->d_inode;
+#else
+	struct inode *inode = filp->f_dentry->d_inode;
+#endif
+#endif
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	unsigned int flags;
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+
+	switch (cmd) {
+	case EXFAT_IOCTL_GET_VOLUME_ID:
+		return exfat_ioctl_volume_id(inode);
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	case EXFAT_IOC_GET_DEBUGFLAGS: {
+		struct super_block *sb = inode->i_sb;
+		struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+		flags = sbi->debug_flags;
+		return put_user(flags, (int __user *)arg);
+	}
+	case EXFAT_IOC_SET_DEBUGFLAGS: {
+		struct super_block *sb = inode->i_sb;
+		struct exfat_sb_info *sbi = EXFAT_SB(sb);
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (get_user(flags, (int __user *) arg))
+			return -EFAULT;
+
+		__lock_super(sb);
+		sbi->debug_flags = flags;
+		__unlock_super(sb);
+
+		return 0;
+	}
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+	default:
+		return -ENOTTY; /* Inappropriate ioctl for device */
+	}
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35)
+static int exfat_file_fsync(struct file *filp, struct dentry *dentry,
+				int datasync)
+#else
+static int exfat_file_fsync(struct file *filp, int datasync)
+#endif
+{
+	struct inode *inode = filp->f_mapping->host;
+	struct super_block *sb = inode->i_sb;
+	int res, err;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35)
+	res = simple_fsync(filp, dentry, datasync);
+#else
+	res = generic_file_fsync(filp, datasync);
+#endif
+	err = FsSyncVol(sb, 1);
+
+	return res ? res : err;
+}
+#endif
+
+const struct file_operations exfat_dir_operations = {
+	.llseek     = generic_file_llseek,
+	.read       = generic_read_dir,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0)
+	.iterate    = exfat_readdir,
+#else
+	.readdir    = exfat_readdir,
+#endif
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+	.ioctl      = exfat_generic_ioctl,
+	.fsync      = exfat_file_fsync,
+#else
+	.unlocked_ioctl = exfat_generic_ioctl,
+	.fsync      = generic_file_fsync,
+#endif
+};
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,00)
+static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+						bool excl)
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0)
+static int exfat_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+						struct nameidata *nd)
+#else
+static int exfat_create(struct inode *dir, struct dentry *dentry, int mode,
+						struct nameidata *nd)
+#endif
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct timespec ts;
+	FILE_ID_T fid;
+	loff_t i_pos;
+	int err;
+
+	__lock_super(sb);
+
+	DPRINTK("exfat_create entered\n");
+
+	ts = CURRENT_TIME_SEC;
+
+	err = FsCreateFile(dir, (u8 *) dentry->d_name.name, FM_REGULAR, &fid);
+	if (err) {
+		if (err == FFS_INVALIDPATH)
+			err = -EINVAL;
+		else if (err == FFS_FILEEXIST)
+			err = -EEXIST;
+		else if (err == FFS_FULL)
+			err = -ENOSPC;
+		else if (err == FFS_NAMETOOLONG)
+			err = -ENAMETOOLONG;
+		else
+			err = -EIO;
+		goto out;
+	}
+	dir->i_version++;
+	dir->i_ctime = dir->i_mtime = dir->i_atime = ts;
+	if (IS_DIRSYNC(dir))
+		(void) exfat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+
+	i_pos = ((loff_t) fid.dir.dir << 32) | (fid.entry & 0xffffffff);
+
+	inode = exfat_build_inode(sb, &fid, i_pos);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_version++;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unnecessary. */
+
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	d_instantiate(dentry, inode);
+
+out:
+	__unlock_super(sb);
+	DPRINTK("exfat_create exited\n");
+	return err;
+}
+
+static int exfat_find(struct inode *dir, struct qstr *qname,
+					  FILE_ID_T *fid)
+{
+	int err;
+
+	if (qname->len == 0)
+		return -ENOENT;
+
+	err = FsLookupFile(dir, (u8 *) qname->name, fid);
+	if (err)
+		return -ENOENT;
+
+	return 0;
+}
+
+static int exfat_d_anon_disconn(struct dentry *dentry)
+{
+	return IS_ROOT(dentry) && (dentry->d_flags & DCACHE_DISCONNECTED);
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,00)
+static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry,
+						   unsigned int flags)
+#else
+static struct dentry *exfat_lookup(struct inode *dir, struct dentry *dentry,
+						   struct nameidata *nd)
+#endif
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct dentry *alias;
+	int err;
+	FILE_ID_T fid;
+	loff_t i_pos;
+	u64 ret;
+	mode_t i_mode;
+
+	__lock_super(sb);
+	DPRINTK("exfat_lookup entered\n");
+	err = exfat_find(dir, &dentry->d_name, &fid);
+	if (err) {
+		if (err == -ENOENT) {
+			inode = NULL;
+			goto out;
+		}
+		goto error;
+	}
+
+	i_pos = ((loff_t) fid.dir.dir << 32) | (fid.entry & 0xffffffff);
+	inode = exfat_build_inode(sb, &fid, i_pos);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto error;
+	}
+
+	i_mode = inode->i_mode;
+	if (S_ISLNK(i_mode)) {
+		EXFAT_I(inode)->target = kmalloc(i_size_read(inode)+1, GFP_KERNEL);
+		if (!EXFAT_I(inode)->target) {
+			err = -ENOMEM;
+			goto error;
+		}
+		FsReadFile(dir, &fid, EXFAT_I(inode)->target, i_size_read(inode), &ret);
+		*(EXFAT_I(inode)->target + i_size_read(inode)) = '\0';
+	}
+
+	alias = d_find_alias(inode);
+	if (alias && !exfat_d_anon_disconn(alias)) {
+		CHECK_ERR(d_unhashed(alias));
+		if (!S_ISDIR(i_mode))
+			d_move(alias, dentry);
+		iput(inode);
+		__unlock_super(sb);
+		DPRINTK("exfat_lookup exited 1\n");
+		return alias;
+	} else {
+		dput(alias);
+	}
+out:
+	__unlock_super(sb);
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,38)
+	dentry->d_op = sb->s_root->d_op;
+	dentry = d_splice_alias(inode, dentry);
+	if (dentry) {
+		dentry->d_op = sb->s_root->d_op;
+		dentry->d_time = dentry->d_parent->d_inode->i_version;
+	}
+#else
+	dentry = d_splice_alias(inode, dentry);
+	if (dentry)
+		dentry->d_time = dentry->d_parent->d_inode->i_version;
+#endif
+	DPRINTK("exfat_lookup exited 2\n");
+	return dentry;
+
+error:
+	__unlock_super(sb);
+	DPRINTK("exfat_lookup exited 3\n");
+	return ERR_PTR(err);
+}
+
+static int exfat_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct timespec ts;
+	int err;
+
+	__lock_super(sb);
+
+	DPRINTK("exfat_unlink entered\n");
+
+	ts = CURRENT_TIME_SEC;
+
+	EXFAT_I(inode)->fid.size = i_size_read(inode);
+
+	err = FsRemoveFile(dir, &(EXFAT_I(inode)->fid));
+	if (err) {
+		if (err == FFS_PERMISSIONERR)
+			err = -EPERM;
+		else
+			err = -EIO;
+		goto out;
+	}
+	dir->i_version++;
+	dir->i_mtime = dir->i_atime = ts;
+	if (IS_DIRSYNC(dir))
+		(void) exfat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+
+	clear_nlink(inode);
+	inode->i_mtime = inode->i_atime = ts;
+	exfat_detach(inode);
+	remove_inode_hash(inode);
+
+out:
+	__unlock_super(sb);
+	DPRINTK("exfat_unlink exited\n");
+	return err;
+}
+
+static int exfat_symlink(struct inode *dir, struct dentry *dentry, const char *target)
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct timespec ts;
+	FILE_ID_T fid;
+	loff_t i_pos;
+	int err;
+	u64 len = (u64) strlen(target);
+	u64 ret;
+
+	__lock_super(sb);
+
+	DPRINTK("exfat_symlink entered\n");
+
+	ts = CURRENT_TIME_SEC;
+
+	err = FsCreateFile(dir, (u8 *) dentry->d_name.name, FM_SYMLINK, &fid);
+	if (err) {
+		if (err == FFS_INVALIDPATH)
+			err = -EINVAL;
+		else if (err == FFS_FILEEXIST)
+			err = -EEXIST;
+		else if (err == FFS_FULL)
+			err = -ENOSPC;
+		else
+			err = -EIO;
+		goto out;
+	}
+
+	err = FsWriteFile(dir, &fid, (char *) target, len, &ret);
+
+	if (err) {
+		FsRemoveFile(dir, &fid);
+
+		if (err == FFS_FULL)
+			err = -ENOSPC;
+		else
+			err = -EIO;
+		goto out;
+	}
+
+	dir->i_version++;
+	dir->i_ctime = dir->i_mtime = dir->i_atime = ts;
+	if (IS_DIRSYNC(dir))
+		(void) exfat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+
+	i_pos = ((loff_t) fid.dir.dir << 32) | (fid.entry & 0xffffffff);
+
+	inode = exfat_build_inode(sb, &fid, i_pos);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_version++;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	EXFAT_I(inode)->target = kmalloc(len+1, GFP_KERNEL);
+	if (!EXFAT_I(inode)->target) {
+		err = -ENOMEM;
+		goto out;
+	}
+	memcpy(EXFAT_I(inode)->target, target, len+1);
+
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	d_instantiate(dentry, inode);
+
+out:
+	__unlock_super(sb);
+	DPRINTK("exfat_symlink exited\n");
+	return err;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0)
+static int exfat_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+#else
+static int exfat_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+#endif
+{
+	struct super_block *sb = dir->i_sb;
+	struct inode *inode;
+	struct timespec ts;
+	FILE_ID_T fid;
+	loff_t i_pos;
+	int err;
+
+	__lock_super(sb);
+
+	DPRINTK("exfat_mkdir entered\n");
+
+	ts = CURRENT_TIME_SEC;
+
+	err = FsCreateDir(dir, (u8 *) dentry->d_name.name, &fid);
+	if (err) {
+		if (err == FFS_INVALIDPATH)
+			err = -EINVAL;
+		else if (err == FFS_FILEEXIST)
+			err = -EEXIST;
+		else if (err == FFS_FULL)
+			err = -ENOSPC;
+		else if (err == FFS_NAMETOOLONG)
+			err = -ENAMETOOLONG;
+		else
+			err = -EIO;
+		goto out;
+	}
+	dir->i_version++;
+	dir->i_ctime = dir->i_mtime = dir->i_atime = ts;
+	if (IS_DIRSYNC(dir))
+		(void) exfat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+	inc_nlink(dir);
+
+	i_pos = ((loff_t) fid.dir.dir << 32) | (fid.entry & 0xffffffff);
+
+	inode = exfat_build_inode(sb, &fid, i_pos);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
+		goto out;
+	}
+	inode->i_version++;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+	/* timestamp is already written, so mark_inode_dirty() is unneeded. */
+
+	dentry->d_time = dentry->d_parent->d_inode->i_version;
+	d_instantiate(dentry, inode);
+
+out:
+	__unlock_super(sb);
+	DPRINTK("exfat_mkdir exited\n");
+	return err;
+}
+
+static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct super_block *sb = dir->i_sb;
+	struct timespec ts;
+	int err;
+
+	__lock_super(sb);
+
+	DPRINTK("exfat_rmdir entered\n");
+
+	ts = CURRENT_TIME_SEC;
+
+	EXFAT_I(inode)->fid.size = i_size_read(inode);
+
+	err = FsRemoveDir(dir, &(EXFAT_I(inode)->fid));
+	if (err) {
+		if (err == FFS_INVALIDPATH)
+			err = -EINVAL;
+		else if (err == FFS_FILEEXIST)
+			err = -ENOTEMPTY;
+		else if (err == FFS_NOTFOUND)
+			err = -ENOENT;
+		else if (err == FFS_DIRBUSY)
+			err = -EBUSY;
+		else
+			err = -EIO;
+		goto out;
+	}
+	dir->i_version++;
+	dir->i_mtime = dir->i_atime = ts;
+	if (IS_DIRSYNC(dir))
+		(void) exfat_sync_inode(dir);
+	else
+		mark_inode_dirty(dir);
+	drop_nlink(dir);
+
+	clear_nlink(inode);
+	inode->i_mtime = inode->i_atime = ts;
+	exfat_detach(inode);
+	remove_inode_hash(inode);
+
+out:
+	__unlock_super(sb);
+	DPRINTK("exfat_rmdir exited\n");
+	return err;
+}
+
+static int exfat_rename(struct inode *old_dir, struct dentry *old_dentry,
+						struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode, *new_inode;
+	struct super_block *sb = old_dir->i_sb;
+	struct timespec ts;
+	loff_t i_pos;
+	int err;
+
+	__lock_super(sb);
+
+	DPRINTK("exfat_rename entered\n");
+
+	old_inode = old_dentry->d_inode;
+	new_inode = new_dentry->d_inode;
+
+	ts = CURRENT_TIME_SEC;
+
+	EXFAT_I(old_inode)->fid.size = i_size_read(old_inode);
+
+	err = FsMoveFile(old_dir, &(EXFAT_I(old_inode)->fid), new_dir, new_dentry);
+	if (err) {
+		if (err == FFS_PERMISSIONERR)
+			err = -EPERM;
+		else if (err == FFS_INVALIDPATH)
+			err = -EINVAL;
+		else if (err == FFS_FILEEXIST)
+			err = -EEXIST;
+		else if (err == FFS_NOTFOUND)
+			err = -ENOENT;
+		else if (err == FFS_FULL)
+			err = -ENOSPC;
+		else
+			err = -EIO;
+		goto out;
+	}
+	new_dir->i_version++;
+	new_dir->i_ctime = new_dir->i_mtime = new_dir->i_atime = ts;
+	if (IS_DIRSYNC(new_dir))
+		(void) exfat_sync_inode(new_dir);
+	else
+		mark_inode_dirty(new_dir);
+
+	i_pos = ((loff_t) EXFAT_I(old_inode)->fid.dir.dir << 32) |
+			(EXFAT_I(old_inode)->fid.entry & 0xffffffff);
+
+	exfat_detach(old_inode);
+	exfat_attach(old_inode, i_pos);
+	if (IS_DIRSYNC(new_dir))
+		(void) exfat_sync_inode(old_inode);
+	else
+		mark_inode_dirty(old_inode);
+
+	if ((S_ISDIR(old_inode->i_mode)) && (old_dir != new_dir)) {
+		drop_nlink(old_dir);
+		if (!new_inode)
+			inc_nlink(new_dir);
+	}
+
+	old_dir->i_version++;
+	old_dir->i_ctime = old_dir->i_mtime = ts;
+	if (IS_DIRSYNC(old_dir))
+		(void) exfat_sync_inode(old_dir);
+	else
+		mark_inode_dirty(old_dir);
+
+	if (new_inode) {
+		exfat_detach(new_inode);
+		drop_nlink(new_inode);
+		if (S_ISDIR(new_inode->i_mode))
+			drop_nlink(new_inode);
+		new_inode->i_ctime = ts;
+	}
+
+out:
+	__unlock_super(sb);
+	DPRINTK("exfat_rename exited\n");
+	return err;
+}
+
+static int exfat_cont_expand(struct inode *inode, loff_t size)
+{
+	struct address_space *mapping = inode->i_mapping;
+	loff_t start = i_size_read(inode), count = size - i_size_read(inode);
+	int err, err2;
+
+	err = generic_cont_expand_simple(inode, size);
+	if (err != 0)
+		return err;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	mark_inode_dirty(inode);
+
+	if (IS_SYNC(inode)) {
+		err = filemap_fdatawrite_range(mapping, start, start + count - 1);
+		err2 = sync_mapping_buffers(mapping);
+		err = (err) ? (err) : (err2);
+		err2 = write_inode_now(inode, 1);
+		err = (err) ? (err) : (err2);
+		if (!err)
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32)
+			err =  wait_on_page_writeback_range(mapping,
+					start >> PAGE_CACHE_SHIFT,
+					(start + count - 1) >> PAGE_CACHE_SHIFT);
+#else
+			err =  filemap_fdatawait_range(mapping, start, start + count - 1);
+#endif
+	}
+	return err;
+}
+
+static int exfat_allow_set_time(struct exfat_sb_info *sbi, struct inode *inode)
+{
+	mode_t allow_utime = sbi->options.allow_utime;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
+	if (!uid_eq(current_fsuid(), inode->i_uid))
+#else
+	if (current_fsuid() != inode->i_uid)
+#endif
+	{
+		if (in_group_p(inode->i_gid))
+			allow_utime >>= 3;
+		if (allow_utime & MAY_WRITE)
+			return 1;
+	}
+
+	/* use a default check */
+	return 0;
+}
+
+static int exfat_sanitize_mode(const struct exfat_sb_info *sbi,
+							   struct inode *inode, umode_t *mode_ptr)
+{
+	mode_t i_mode, mask, perm;
+
+	i_mode = inode->i_mode;
+
+	if (S_ISREG(i_mode) || S_ISLNK(i_mode))
+		mask = sbi->options.fs_fmask;
+	else
+		mask = sbi->options.fs_dmask;
+
+	perm = *mode_ptr & ~(S_IFMT | mask);
+
+	/* Of the r and x bits, all (subject to umask) must be present. */
+	if ((perm & (S_IRUGO | S_IXUGO)) != (i_mode & (S_IRUGO|S_IXUGO)))
+		return -EPERM;
+
+	if (exfat_mode_can_hold_ro(inode)) {
+		/* Of the w bits, either all (subject to umask) or none must be present. */
+		if ((perm & S_IWUGO) && ((perm & S_IWUGO) != (S_IWUGO & ~mask)))
+			return -EPERM;
+	} else {
+		/* If exfat_mode_can_hold_ro(inode) is false, can't change w bits. */
+		if ((perm & S_IWUGO) != (S_IWUGO & ~mask))
+			return -EPERM;
+	}
+
+	*mode_ptr &= S_IFMT | perm;
+
+	return 0;
+}
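+/*
+ * exFAT, like FAT, persists only a single read-only attribute, so chmod
+ * requests are sanitised above: the r/x bits may not change and the w bits
+ * may only be toggled as a group (both subject to the mount masks).
+ */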
+
+static int exfat_setattr(struct dentry *dentry, struct iattr *attr)
+{
+
+	struct exfat_sb_info *sbi = EXFAT_SB(dentry->d_sb);
+	struct inode *inode = dentry->d_inode;
+	unsigned int ia_valid;
+	int error;
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,35)
+	loff_t old_size;
+#endif
+
+	DPRINTK("exfat_setattr entered\n");
+
+	if ((attr->ia_valid & ATTR_SIZE)
+		&& (attr->ia_size > i_size_read(inode))) {
+		error = exfat_cont_expand(inode, attr->ia_size);
+		if (error || attr->ia_valid == ATTR_SIZE)
+			return error;
+		attr->ia_valid &= ~ATTR_SIZE;
+	}
+
+	ia_valid = attr->ia_valid;
+
+	if ((ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET))
+		&& exfat_allow_set_time(sbi, inode)) {
+		attr->ia_valid &= ~(ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET);
+	}
+
+	error = inode_change_ok(inode, attr);
+	attr->ia_valid = ia_valid;
+	if (error)
+		return error;
+
+	if (((attr->ia_valid & ATTR_UID) &&
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
+		 (!uid_eq(attr->ia_uid, sbi->options.fs_uid))) ||
+		((attr->ia_valid & ATTR_GID) &&
+		 (!gid_eq(attr->ia_gid, sbi->options.fs_gid))) ||
+#else
+		 (attr->ia_uid != sbi->options.fs_uid)) ||
+		((attr->ia_valid & ATTR_GID) &&
+		 (attr->ia_gid != sbi->options.fs_gid)) ||
+#endif
+		((attr->ia_valid & ATTR_MODE) &&
+		 (attr->ia_mode & ~(S_IFREG | S_IFLNK | S_IFDIR | S_IRWXUGO)))) {
+		return -EPERM;
+	}
+
+	/*
+	 * We don't return -EPERM here. Yes, strange, but this matches the
+	 * long-standing FAT behaviour.
+	 */
+	if (attr->ia_valid & ATTR_MODE) {
+		if (exfat_sanitize_mode(sbi, inode, &attr->ia_mode) < 0)
+			attr->ia_valid &= ~ATTR_MODE;
+	}
+
+	EXFAT_I(inode)->fid.size = i_size_read(inode);
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+	if (attr->ia_valid)
+		error = inode_setattr(inode, attr);
+#else
+	if (attr->ia_valid & ATTR_SIZE) {
+		old_size = i_size_read(inode);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,4,00)
+		down_write(&EXFAT_I(inode)->truncate_lock);
+		truncate_setsize(inode, attr->ia_size);
+		_exfat_truncate(inode, old_size);
+		up_write(&EXFAT_I(inode)->truncate_lock);
+#else
+		truncate_setsize(inode, attr->ia_size);
+		_exfat_truncate(inode, old_size);
+#endif
+	}
+	setattr_copy(inode, attr);
+	mark_inode_dirty(inode);
+#endif
+
+	DPRINTK("exfat_setattr exited\n");
+	return error;
+}
+
+static int exfat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+
+	DPRINTK("exfat_getattr entered\n");
+
+	generic_fillattr(inode, stat);
+	stat->blksize = EXFAT_SB(inode->i_sb)->fs_info.cluster_size;
+
+	DPRINTK("exfat_getattr exited\n");
+	return 0;
+}
+
+const struct inode_operations exfat_dir_inode_operations = {
+	.create        = exfat_create,
+	.lookup        = exfat_lookup,
+	.unlink        = exfat_unlink,
+	.symlink       = exfat_symlink,
+	.mkdir         = exfat_mkdir,
+	.rmdir         = exfat_rmdir,
+	.rename        = exfat_rename,
+	.setattr       = exfat_setattr,
+	.getattr       = exfat_getattr,
+};
+
+/*======================================================================*/
+/*  File Operations                                                     */
+/*======================================================================*/
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
+static const char *exfat_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done)
+{
+	struct exfat_inode_info *ei = EXFAT_I(inode);
+
+	return (char *)(ei->target);
+}
+#elif LINUX_VERSION_CODE > KERNEL_VERSION(4,1,0)
+static const char *exfat_follow_link(struct dentry *dentry, void **cookie)
+{
+	struct exfat_inode_info *ei = EXFAT_I(dentry->d_inode);
+	return *cookie = (char *)(ei->target);
+}
+#else
+static void *exfat_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct exfat_inode_info *ei = EXFAT_I(dentry->d_inode);
+	nd_set_link(nd, (char *)(ei->target));
+	return NULL;
+}
+#endif
+
+const struct inode_operations exfat_symlink_inode_operations = {
+	.readlink    = generic_readlink,
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0)
+	.get_link    = exfat_get_link,
+#else
+	.follow_link = exfat_follow_link,
+#endif
+};
+
+static int exfat_file_release(struct inode *inode, struct file *filp)
+{
+	struct super_block *sb = inode->i_sb;
+
+	EXFAT_I(inode)->fid.size = i_size_read(inode);
+	FsSyncVol(sb, 0);
+	return 0;
+}
+
+const struct file_operations exfat_file_operations = {
+	.llseek      = generic_file_llseek,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0)
+	.read        = do_sync_read,
+	.write       = do_sync_write,
+	.aio_read    = generic_file_aio_read,
+	.aio_write   = generic_file_aio_write,
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(4,1,0)
+	.read        = new_sync_read,
+	.write       = new_sync_write,
+#endif
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
+	.read_iter   = generic_file_read_iter,
+	.write_iter  = generic_file_write_iter,
+#endif
+	.mmap        = generic_file_mmap,
+	.release     = exfat_file_release,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+	.ioctl       = exfat_generic_ioctl,
+	.fsync       = exfat_file_fsync,
+#else
+	.unlocked_ioctl  = exfat_generic_ioctl,
+	.fsync       = generic_file_fsync,
+#endif
+	.splice_read = generic_file_splice_read,
+};
+
+static void _exfat_truncate(struct inode *inode, loff_t old_size)
+{
+	struct super_block *sb = inode->i_sb;
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	FS_INFO_T *p_fs = &(sbi->fs_info);
+	int err;
+
+	__lock_super(sb);
+
+	/*
+	 * This protects against truncating a file bigger than it was and
+	 * then trying to write into the hole.
+	 */
+	if (EXFAT_I(inode)->mmu_private > i_size_read(inode))
+		EXFAT_I(inode)->mmu_private = i_size_read(inode);
+
+	if (EXFAT_I(inode)->fid.start_clu == 0)
+		goto out;
+
+	err = FsTruncateFile(inode, old_size, i_size_read(inode));
+	if (err)
+		goto out;
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC;
+	if (IS_DIRSYNC(inode))
+		(void) exfat_sync_inode(inode);
+	else
+		mark_inode_dirty(inode);
+
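+	/* Express the size, rounded up to a whole cluster, in 512-byte blocks. */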
+	inode->i_blocks = ((i_size_read(inode) + (p_fs->cluster_size - 1))
+					   & ~((loff_t)p_fs->cluster_size - 1)) >> 9;
+out:
+	__unlock_super(sb);
+}
+
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,36)
+static void exfat_truncate(struct inode *inode)
+{
+	_exfat_truncate(inode, i_size_read(inode));
+}
+#endif
+
+const struct inode_operations exfat_file_inode_operations = {
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,36)
+	.truncate    = exfat_truncate,
+#endif
+	.setattr     = exfat_setattr,
+	.getattr     = exfat_getattr,
+};
+
+/*======================================================================*/
+/*  Address Space Operations                                            */
+/*======================================================================*/
+
+static int exfat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+					  unsigned long *mapped_blocks, int *create)
+{
+	struct super_block *sb = inode->i_sb;
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	FS_INFO_T *p_fs = &(sbi->fs_info);
+	BD_INFO_T *p_bd = &(sbi->bd_info);
+	const unsigned long blocksize = sb->s_blocksize;
+	const unsigned char blocksize_bits = sb->s_blocksize_bits;
+	sector_t last_block;
+	int err, clu_offset, sec_offset;
+	unsigned int cluster;
+
+	*phys = 0;
+	*mapped_blocks = 0;
+
+	if ((p_fs->vol_type == FAT12) || (p_fs->vol_type == FAT16)) {
+		if (inode->i_ino == EXFAT_ROOT_INO) {
+			if (sector < (p_fs->dentries_in_root >> (p_bd->sector_size_bits-DENTRY_SIZE_BITS))) {
+				*phys = sector + p_fs->root_start_sector;
+				*mapped_blocks = 1;
+			}
+			return 0;
+		}
+	}
+
+	last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+	if (sector >= last_block) {
+		if (*create == 0)
+			return 0;
+	} else {
+		*create = 0;
+	}
+
+	clu_offset = sector >> p_fs->sectors_per_clu_bits;  /* cluster offset */
+	sec_offset = sector & (p_fs->sectors_per_clu - 1);  /* sector offset in cluster */
+
+	EXFAT_I(inode)->fid.size = i_size_read(inode);
+
+	err = FsMapCluster(inode, clu_offset, &cluster);
+
+	if (err) {
+		if (err == FFS_FULL)
+			return -ENOSPC;
+		else
+			return -EIO;
+	} else if (cluster != CLUSTER_32(~0)) {
+		*phys = START_SECTOR(cluster) + sec_offset;
+		*mapped_blocks = p_fs->sectors_per_clu - sec_offset;
+	}
+
+	return 0;
+}
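+/*
+ * exfat_bmap() maps a file-relative sector to an absolute device sector and
+ * reports how many sectors remain contiguous within that cluster; *create
+ * is cleared for sectors that already lie inside i_size.
+ */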
+
+static int exfat_get_block(struct inode *inode, sector_t iblock,
+						   struct buffer_head *bh_result, int create)
+{
+	struct super_block *sb = inode->i_sb;
+	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+	int err;
+	unsigned long mapped_blocks;
+	sector_t phys;
+
+	__lock_super(sb);
+
+	err = exfat_bmap(inode, iblock, &phys, &mapped_blocks, &create);
+	if (err) {
+		__unlock_super(sb);
+		return err;
+	}
+
+	if (phys) {
+		max_blocks = min(mapped_blocks, max_blocks);
+		if (create) {
+			EXFAT_I(inode)->mmu_private += max_blocks << sb->s_blocksize_bits;
+			set_buffer_new(bh_result);
+		}
+		map_bh(bh_result, sb, phys);
+	}
+
+	bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+	__unlock_super(sb);
+
+	return 0;
+}
+
+static int exfat_readpage(struct file *file, struct page *page)
+{
+	return mpage_readpage(page, exfat_get_block);
+}
+
+static int exfat_readpages(struct file *file, struct address_space *mapping,
+			   struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, exfat_get_block);
+}
+
+static int exfat_writepage(struct page *page, struct writeback_control *wbc)
+{
+	return block_write_full_page(page, exfat_get_block, wbc);
+}
+
+static int exfat_writepages(struct address_space *mapping,
+			    struct writeback_control *wbc)
+{
+	return mpage_writepages(mapping, wbc, exfat_get_block);
+}
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34)
+static void exfat_write_failed(struct address_space *mapping, loff_t to)
+{
+	struct inode *inode = mapping->host;
+	if (to > i_size_read(inode)) {
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,12,0)
+		truncate_pagecache(inode, i_size_read(inode));
+#else
+		truncate_pagecache(inode, to, i_size_read(inode));
+#endif
+		EXFAT_I(inode)->fid.size = i_size_read(inode);
+		_exfat_truncate(inode, i_size_read(inode));
+	}
+}
+#endif
+
+static int exfat_write_begin(struct file *file, struct address_space *mapping,
+				 loff_t pos, unsigned len, unsigned flags,
+					 struct page **pagep, void **fsdata)
+{
+	int ret;
+	*pagep = NULL;
+	ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+				   exfat_get_block,
+				   &EXFAT_I(mapping->host)->mmu_private);
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34)
+	if (ret < 0)
+		exfat_write_failed(mapping, pos+len);
+#endif
+	return ret;
+}
+
+static int exfat_write_end(struct file *file, struct address_space *mapping,
+				   loff_t pos, unsigned len, unsigned copied,
+					   struct page *pagep, void *fsdata)
+{
+	struct inode *inode = mapping->host;
+	FILE_ID_T *fid = &(EXFAT_I(inode)->fid);
+	int err;
+
+	err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34)
+	if (err < len)
+		exfat_write_failed(mapping, pos+len);
+#endif
+
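+	/* First successful write with the archive bit clear: stamp the times and set the bit. */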
+	if (!(err < 0) && !(fid->attr & ATTR_ARCHIVE)) {
+		inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+		fid->attr |= ATTR_ARCHIVE;
+		mark_inode_dirty(inode);
+	}
+	return err;
+}
+
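+/*
+ * exfat_direct_IO() below carries one prototype per kernel generation; the
+ * body differs only in how the rw/offset/iovec information is obtained and
+ * handed down to blockdev_direct_IO().
+ */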
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0)
+#ifdef CONFIG_AIO_OPTIMIZATION
+static ssize_t exfat_direct_IO(int rw, struct kiocb *iocb,
+						struct iov_iter *iter, loff_t offset)
+#else
+static ssize_t exfat_direct_IO(int rw, struct kiocb *iocb,
+					   const struct iovec *iov,
+					   loff_t offset, unsigned long nr_segs)
+#endif
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(4,2,0)
+static ssize_t exfat_direct_IO(int rw, struct kiocb *iocb,
+					   struct iov_iter *iter, loff_t offset)
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0)
+static ssize_t exfat_direct_IO(struct kiocb *iocb,
+					   struct iov_iter *iter, loff_t offset)
+#else /* >= 4.7.x */
+static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+#endif
+{
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34)
+	struct address_space *mapping = iocb->ki_filp->f_mapping;
+#endif
+	ssize_t ret;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,2,0)
+	int rw;
+
+	rw = iov_iter_rw(iter);
+#endif
+
+	if (rw == WRITE) {
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,16,0)
+#ifdef CONFIG_AIO_OPTIMIZATION
+		if (EXFAT_I(inode)->mmu_private <
+					(offset + iov_iter_count(iter)))
+#else
+		if (EXFAT_I(inode)->mmu_private < (offset + iov_length(iov, nr_segs)))
+#endif
+#elif LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0)
+		if (EXFAT_I(inode)->mmu_private < (offset + iov_iter_count(iter)))
+#else
+		if (EXFAT_I(inode)->mmu_private < iov_iter_count(iter))
+#endif
+			return 0;
+	}
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,7,0)
+	ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block);
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(4,1,0)
+	ret = blockdev_direct_IO(iocb, inode, iter,
+					offset, exfat_get_block);
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
+	ret = blockdev_direct_IO(rw, iocb, inode, iter,
+					offset, exfat_get_block);
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,1,0)
+#ifdef CONFIG_AIO_OPTIMIZATION
+	ret = blockdev_direct_IO(rw, iocb, inode, iter,
+					offset, exfat_get_block);
+#else
+	ret = blockdev_direct_IO(rw, iocb, inode, iov,
+					offset, nr_segs, exfat_get_block);
+#endif
+#else
+	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+					offset, nr_segs, exfat_get_block, NULL);
+#endif
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,7,0)
+	if ((ret < 0) && (rw & WRITE))
+		exfat_write_failed(mapping, iov_iter_count(iter));
+#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)
+	if ((ret < 0) && (rw & WRITE))
+		exfat_write_failed(mapping, offset+iov_iter_count(iter));
+#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,34)
+	if ((ret < 0) && (rw & WRITE))
+#ifdef CONFIG_AIO_OPTIMIZATION
+		exfat_write_failed(mapping, offset+iov_iter_count(iter));
+#else
+		exfat_write_failed(mapping, offset+iov_length(iov, nr_segs));
+#endif
+#endif
+	return ret;
+}
+
+static sector_t _exfat_bmap(struct address_space *mapping, sector_t block)
+{
+	sector_t blocknr;
+
+	/* exfat_get_cluster() assumes the requested blocknr isn't truncated. */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,4,00)
+	down_read(&EXFAT_I(mapping->host)->truncate_lock);
+	blocknr = generic_block_bmap(mapping, block, exfat_get_block);
+	up_read(&EXFAT_I(mapping->host)->truncate_lock);
+#else
+	down_read(&EXFAT_I(mapping->host)->i_alloc_sem);
+	blocknr = generic_block_bmap(mapping, block, exfat_get_block);
+	up_read(&EXFAT_I(mapping->host)->i_alloc_sem);
+#endif
+
+	return blocknr;
+}
+
+const struct address_space_operations exfat_aops = {
+	.readpage    = exfat_readpage,
+	.readpages   = exfat_readpages,
+	.writepage   = exfat_writepage,
+	.writepages  = exfat_writepages,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39)
+	.sync_page   = block_sync_page,
+#endif
+	.write_begin = exfat_write_begin,
+	.write_end   = exfat_write_end,
+	.direct_IO   = exfat_direct_IO,
+	.bmap        = _exfat_bmap
+};
+
+/*======================================================================*/
+/*  Super Operations                                                    */
+/*======================================================================*/
+
+static inline unsigned long exfat_hash(loff_t i_pos)
+{
+	return hash_32(i_pos, EXFAT_HASH_BITS);
+}
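+/*
+ * i_pos is the 64-bit key of this private hash: the callers of
+ * exfat_build_inode() form it as (fid.dir.dir << 32) | fid.entry, i.e. the
+ * location of the directory entry backing the inode.
+ */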
+
+static struct inode *exfat_iget(struct super_block *sb, loff_t i_pos)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	struct exfat_inode_info *info;
+	struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos);
+	struct inode *inode = NULL;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0)
+	struct hlist_node *node;
+
+	spin_lock(&sbi->inode_hash_lock);
+	hlist_for_each_entry(info, node, head, i_hash_fat) {
+#else
+	spin_lock(&sbi->inode_hash_lock);
+	hlist_for_each_entry(info, head, i_hash_fat) {
+#endif
+		CHECK_ERR(info->vfs_inode.i_sb != sb);
+
+		if (i_pos != info->i_pos)
+			continue;
+		inode = igrab(&info->vfs_inode);
+		if (inode)
+			break;
+	}
+	spin_unlock(&sbi->inode_hash_lock);
+	return inode;
+}
+
+static void exfat_attach(struct inode *inode, loff_t i_pos)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+	struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos);
+
+	spin_lock(&sbi->inode_hash_lock);
+	EXFAT_I(inode)->i_pos = i_pos;
+	hlist_add_head(&EXFAT_I(inode)->i_hash_fat, head);
+	spin_unlock(&sbi->inode_hash_lock);
+}
+
+static void exfat_detach(struct inode *inode)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+
+	spin_lock(&sbi->inode_hash_lock);
+	hlist_del_init(&EXFAT_I(inode)->i_hash_fat);
+	EXFAT_I(inode)->i_pos = 0;
+	spin_unlock(&sbi->inode_hash_lock);
+}
+
+/* doesn't deal with root inode */
+static int exfat_fill_inode(struct inode *inode, FILE_ID_T *fid)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+	FS_INFO_T *p_fs = &(sbi->fs_info);
+	DIR_ENTRY_T info;
+
+	memcpy(&(EXFAT_I(inode)->fid), fid, sizeof(FILE_ID_T));
+
+	FsReadStat(inode, &info);
+
+	EXFAT_I(inode)->i_pos = 0;
+	EXFAT_I(inode)->target = NULL;
+	inode->i_uid = sbi->options.fs_uid;
+	inode->i_gid = sbi->options.fs_gid;
+	inode->i_version++;
+	inode->i_generation = get_seconds();
+
+	if (info.Attr & ATTR_SUBDIR) { /* directory */
+		inode->i_generation &= ~1;
+		inode->i_mode = exfat_make_mode(sbi, info.Attr, S_IRWXUGO);
+		inode->i_op = &exfat_dir_inode_operations;
+		inode->i_fop = &exfat_dir_operations;
+
+		i_size_write(inode, info.Size);
+		EXFAT_I(inode)->mmu_private = i_size_read(inode);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,2,00)
+		set_nlink(inode, info.NumSubdirs);
+#else
+		inode->i_nlink = info.NumSubdirs;
+#endif
+	} else if (info.Attr & ATTR_SYMLINK) { /* symbolic link */
+		inode->i_generation |= 1;
+		inode->i_mode = exfat_make_mode(sbi, info.Attr, S_IRWXUGO);
+		inode->i_op = &exfat_symlink_inode_operations;
+
+		i_size_write(inode, info.Size);
+		EXFAT_I(inode)->mmu_private = i_size_read(inode);
+	} else { /* regular file */
+		inode->i_generation |= 1;
+		inode->i_mode = exfat_make_mode(sbi, info.Attr, S_IRWXUGO);
+		inode->i_op = &exfat_file_inode_operations;
+		inode->i_fop = &exfat_file_operations;
+		inode->i_mapping->a_ops = &exfat_aops;
+		inode->i_mapping->nrpages = 0;
+
+		i_size_write(inode, info.Size);
+		EXFAT_I(inode)->mmu_private = i_size_read(inode);
+	}
+	exfat_save_attr(inode, info.Attr);
+
+	inode->i_blocks = ((i_size_read(inode) + (p_fs->cluster_size - 1))
+					   & ~((loff_t)p_fs->cluster_size - 1)) >> 9;
+
+	exfat_time_fat2unix(sbi, &inode->i_mtime, &info.ModifyTimestamp);
+	exfat_time_fat2unix(sbi, &inode->i_ctime, &info.CreateTimestamp);
+	exfat_time_fat2unix(sbi, &inode->i_atime, &info.AccessTimestamp);
+
+	return 0;
+}
+
+static struct inode *exfat_build_inode(struct super_block *sb,
+									   FILE_ID_T *fid, loff_t i_pos) {
+	struct inode *inode;
+	int err;
+
+	inode = exfat_iget(sb, i_pos);
+	if (inode)
+		goto out;
+	inode = new_inode(sb);
+	if (!inode) {
+		inode = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+	inode->i_ino = iunique(sb, EXFAT_ROOT_INO);
+	inode->i_version = 1;
+	err = exfat_fill_inode(inode, fid);
+	if (err) {
+		iput(inode);
+		inode = ERR_PTR(err);
+		goto out;
+	}
+	exfat_attach(inode, i_pos);
+	insert_inode_hash(inode);
+out:
+	return inode;
+}
+
+static int exfat_sync_inode(struct inode *inode)
+{
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34)
+	return exfat_write_inode(inode, 0);
+#else
+	return exfat_write_inode(inode, NULL);
+#endif
+}
+
+static struct inode *exfat_alloc_inode(struct super_block *sb)
+{
+	struct exfat_inode_info *ei;
+
+	ei = kmem_cache_alloc(exfat_inode_cachep, GFP_NOFS);
+	if (!ei)
+		return NULL;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,4,00)
+	init_rwsem(&ei->truncate_lock);
+#endif
+
+	return &ei->vfs_inode;
+}
+
+static void exfat_destroy_inode(struct inode *inode)
+{
+	kfree(EXFAT_I(inode)->target);
+	EXFAT_I(inode)->target = NULL;
+
+	kmem_cache_free(exfat_inode_cachep, EXFAT_I(inode));
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34)
+static int exfat_write_inode(struct inode *inode, int wait)
+#else
+static int exfat_write_inode(struct inode *inode, struct writeback_control *wbc)
+#endif
+{
+	struct super_block *sb = inode->i_sb;
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	DIR_ENTRY_T info;
+
+	if (inode->i_ino == EXFAT_ROOT_INO)
+		return 0;
+
+	info.Attr = exfat_make_attr(inode);
+	info.Size = i_size_read(inode);
+
+	exfat_time_unix2fat(sbi, &inode->i_mtime, &info.ModifyTimestamp);
+	exfat_time_unix2fat(sbi, &inode->i_ctime, &info.CreateTimestamp);
+	exfat_time_unix2fat(sbi, &inode->i_atime, &info.AccessTimestamp);
+
+	FsWriteStat(inode, &info);
+
+	return 0;
+}
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+static void exfat_delete_inode(struct inode *inode)
+{
+	truncate_inode_pages(&inode->i_data, 0);
+	clear_inode(inode);
+}
+
+static void exfat_clear_inode(struct inode *inode)
+{
+	exfat_detach(inode);
+	remove_inode_hash(inode);
+}
+#else
+static void exfat_evict_inode(struct inode *inode)
+{
+	truncate_inode_pages(&inode->i_data, 0);
+
+	if (!inode->i_nlink)
+		i_size_write(inode, 0);
+	invalidate_inode_buffers(inode);
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0)
+	end_writeback(inode);
+#else
+	clear_inode(inode);
+#endif
+	exfat_detach(inode);
+
+	remove_inode_hash(inode);
+}
+#endif
+
+static void exfat_free_super(struct exfat_sb_info *sbi)
+{
+	if (sbi->nls_disk)
+		unload_nls(sbi->nls_disk);
+	if (sbi->nls_io)
+		unload_nls(sbi->nls_io);
+	if (sbi->options.iocharset != exfat_default_iocharset)
+		kfree(sbi->options.iocharset);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,0)
+	/* s_lock is initialised in exfat_fill_super(); it exists only on 3.7+ kernels. */
+	mutex_destroy(&sbi->s_lock);
+#endif
+	kfree(sbi);
+}
+
+static void exfat_put_super(struct super_block *sb)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	if (__is_sb_dirty(sb))
+		exfat_write_super(sb);
+
+	FsUmountVol(sb);
+
+	sb->s_fs_info = NULL;
+	exfat_free_super(sbi);
+}
+
+static void exfat_write_super(struct super_block *sb)
+{
+	__lock_super(sb);
+
+	__set_sb_clean(sb);
+
+	if (!(sb->s_flags & MS_RDONLY))
+		FsSyncVol(sb, 1);
+
+	__unlock_super(sb);
+}
+
+static int exfat_sync_fs(struct super_block *sb, int wait)
+{
+	int err = 0;
+
+	if (__is_sb_dirty(sb)) {
+		__lock_super(sb);
+		__set_sb_clean(sb);
+		err = FsSyncVol(sb, 1);
+		__unlock_super(sb);
+	}
+
+	return err;
+}
+
+static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+	FS_INFO_T *p_fs = &(EXFAT_SB(sb)->fs_info);
+	VOL_INFO_T info;
+
+	if (p_fs->used_clusters == (u32) ~0) {
+		if (FFS_MEDIAERR == FsGetVolInfo(sb, &info))
+			return -EIO;
+
+	} else {
+		info.FatType = p_fs->vol_type;
+		info.ClusterSize = p_fs->cluster_size;
+		info.NumClusters = p_fs->num_clusters - 2;
+		info.UsedClusters = p_fs->used_clusters;
+		info.FreeClusters = info.NumClusters - info.UsedClusters;
+
+		if (p_fs->dev_ejected)
+			printk("[EXFAT] statfs called on an ejected device\n");
+	}
+
+	buf->f_type = sb->s_magic;
+	buf->f_bsize = info.ClusterSize;
+	buf->f_blocks = info.NumClusters;
+	buf->f_bfree = info.FreeClusters;
+	buf->f_bavail = info.FreeClusters;
+	buf->f_fsid.val[0] = (u32)id;
+	buf->f_fsid.val[1] = (u32)(id >> 32);
+	buf->f_namelen = 260;
+
+	return 0;
+}
+
+static int exfat_remount(struct super_block *sb, int *flags, char *data)
+{
+	*flags |= MS_NODIRATIME;
+	return 0;
+}
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,3,0)
+static int exfat_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(root->d_sb);
+#else
+static int exfat_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(mnt->mnt_sb);
+#endif
+	struct exfat_mount_options *opts = &sbi->options;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
+	if (__kuid_val(opts->fs_uid))
+		seq_printf(m, ",uid=%u", __kuid_val(opts->fs_uid));
+	if (__kgid_val(opts->fs_gid))
+		seq_printf(m, ",gid=%u", __kgid_val(opts->fs_gid));
+#else
+	if (opts->fs_uid != 0)
+		seq_printf(m, ",uid=%u", opts->fs_uid);
+	if (opts->fs_gid != 0)
+		seq_printf(m, ",gid=%u", opts->fs_gid);
+#endif
+	seq_printf(m, ",fmask=%04o", opts->fs_fmask);
+	seq_printf(m, ",dmask=%04o", opts->fs_dmask);
+	if (opts->allow_utime)
+		seq_printf(m, ",allow_utime=%04o", opts->allow_utime);
+	if (sbi->nls_disk)
+		seq_printf(m, ",codepage=%s", sbi->nls_disk->charset);
+	if (sbi->nls_io)
+		seq_printf(m, ",iocharset=%s", sbi->nls_io->charset);
+	seq_printf(m, ",namecase=%u", opts->casesensitive);
+	if (opts->errors == EXFAT_ERRORS_CONT)
+		seq_puts(m, ",errors=continue");
+	else if (opts->errors == EXFAT_ERRORS_PANIC)
+		seq_puts(m, ",errors=panic");
+	else
+		seq_puts(m, ",errors=remount-ro");
+#ifdef CONFIG_EXFAT_DISCARD
+	if (opts->discard)
+		seq_puts(m, ",discard");
+#endif
+	return 0;
+}
+
+const struct super_operations exfat_sops = {
+	.alloc_inode   = exfat_alloc_inode,
+	.destroy_inode = exfat_destroy_inode,
+	.write_inode   = exfat_write_inode,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36)
+	.delete_inode  = exfat_delete_inode,
+	.clear_inode   = exfat_clear_inode,
+#else
+	.evict_inode  = exfat_evict_inode,
+#endif
+	.put_super     = exfat_put_super,
+#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0)
+	.write_super   = exfat_write_super,
+#endif
+	.sync_fs       = exfat_sync_fs,
+	.statfs        = exfat_statfs,
+	.remount_fs    = exfat_remount,
+	.show_options  = exfat_show_options,
+};
+
+/*======================================================================*/
+/*  Super Block Read Operations                                         */
+/*======================================================================*/
+
+enum {
+	Opt_uid,
+	Opt_gid,
+	Opt_umask,
+	Opt_dmask,
+	Opt_fmask,
+	Opt_allow_utime,
+	Opt_codepage,
+	Opt_charset,
+	Opt_namecase,
+	Opt_debug,
+	Opt_err_cont,
+	Opt_err_panic,
+	Opt_err_ro,
+	Opt_utf8_hack,
+	Opt_err,
+#ifdef CONFIG_EXFAT_DISCARD
+	Opt_discard,
+#endif /* CONFIG_EXFAT_DISCARD */
+};
+
+static const match_table_t exfat_tokens = {
+	{Opt_uid, "uid=%u"},
+	{Opt_gid, "gid=%u"},
+	{Opt_umask, "umask=%o"},
+	{Opt_dmask, "dmask=%o"},
+	{Opt_fmask, "fmask=%o"},
+	{Opt_allow_utime, "allow_utime=%o"},
+	{Opt_codepage, "codepage=%u"},
+	{Opt_charset, "iocharset=%s"},
+	{Opt_namecase, "namecase=%u"},
+	{Opt_debug, "debug"},
+	{Opt_err_cont, "errors=continue"},
+	{Opt_err_panic, "errors=panic"},
+	{Opt_err_ro, "errors=remount-ro"},
+	{Opt_utf8_hack, "utf8"},
+#ifdef CONFIG_EXFAT_DISCARD
+	{Opt_discard, "discard"},
+#endif /* CONFIG_EXFAT_DISCARD */
+	{Opt_err, NULL}
+};
+
+static int parse_options(char *options, int silent, int *debug,
+						 struct exfat_mount_options *opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	char *iocharset;
+
+	opts->fs_uid = current_uid();
+	opts->fs_gid = current_gid();
+	opts->fs_fmask = opts->fs_dmask = current->fs->umask;
+	opts->allow_utime = (unsigned short) -1;
+	opts->codepage = exfat_default_codepage;
+	opts->iocharset = exfat_default_iocharset;
+	opts->casesensitive = 0;
+	opts->errors = EXFAT_ERRORS_RO;
+#ifdef CONFIG_EXFAT_DISCARD
+	opts->discard = 0;
+#endif
+	*debug = 0;
+
+	if (!options)
+		goto out;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		if (!*p)
+			continue;
+
+		token = match_token(p, exfat_tokens, args);
+		switch (token) {
+		case Opt_uid:
+			if (match_int(&args[0], &option))
+				return 0;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
+			opts->fs_uid = KUIDT_INIT(option);
+#else
+			opts->fs_uid = option;
+#endif
+			break;
+		case Opt_gid:
+			if (match_int(&args[0], &option))
+				return 0;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
+			opts->fs_gid = KGIDT_INIT(option);
+#else
+			opts->fs_gid = option;
+#endif
+			break;
+		case Opt_umask:
+		case Opt_dmask:
+		case Opt_fmask:
+			if (match_octal(&args[0], &option))
+				return 0;
+			if (token != Opt_dmask)
+				opts->fs_fmask = option;
+			if (token != Opt_fmask)
+				opts->fs_dmask = option;
+			break;
+		case Opt_allow_utime:
+			if (match_octal(&args[0], &option))
+				return 0;
+			opts->allow_utime = option & (S_IWGRP | S_IWOTH);
+			break;
+		case Opt_codepage:
+			if (match_int(&args[0], &option))
+				return 0;
+			opts->codepage = option;
+			break;
+		case Opt_charset:
+			if (opts->iocharset != exfat_default_iocharset)
+				kfree(opts->iocharset);
+			iocharset = match_strdup(&args[0]);
+			if (!iocharset)
+				return -ENOMEM;
+			opts->iocharset = iocharset;
+			break;
+		case Opt_namecase:
+			if (match_int(&args[0], &option))
+				return 0;
+			opts->casesensitive = option;
+			break;
+		case Opt_err_cont:
+			opts->errors = EXFAT_ERRORS_CONT;
+			break;
+		case Opt_err_panic:
+			opts->errors = EXFAT_ERRORS_PANIC;
+			break;
+		case Opt_err_ro:
+			opts->errors = EXFAT_ERRORS_RO;
+			break;
+		case Opt_debug:
+			*debug = 1;
+			break;
+#ifdef CONFIG_EXFAT_DISCARD
+		case Opt_discard:
+			opts->discard = 1;
+			break;
+#endif /* CONFIG_EXFAT_DISCARD */
+		case Opt_utf8_hack:
+			break;
+		default:
+			if (!silent)
+				printk(KERN_ERR "[EXFAT] Unrecognized mount option %s or missing value\n", p);
+			return -EINVAL;
+		}
+	}
+
+out:
+	if (opts->allow_utime == (unsigned short) -1)
+		opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH);
+
+	return 0;
+}
+
+static void exfat_hash_init(struct super_block *sb)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	int i;
+
+	spin_lock_init(&sbi->inode_hash_lock);
+	for (i = 0; i < EXFAT_HASH_SIZE; i++)
+		INIT_HLIST_HEAD(&sbi->inode_hashtable[i]);
+}
+
+static int exfat_read_root(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	struct timespec ts;
+	FS_INFO_T *p_fs = &(sbi->fs_info);
+	DIR_ENTRY_T info;
+
+	ts = CURRENT_TIME_SEC;
+
+	EXFAT_I(inode)->fid.dir.dir = p_fs->root_dir;
+	EXFAT_I(inode)->fid.dir.flags = 0x01;
+	EXFAT_I(inode)->fid.entry = -1;
+	EXFAT_I(inode)->fid.start_clu = p_fs->root_dir;
+	EXFAT_I(inode)->fid.flags = 0x01;
+	EXFAT_I(inode)->fid.type = TYPE_DIR;
+	EXFAT_I(inode)->fid.rwoffset = 0;
+	EXFAT_I(inode)->fid.hint_last_off = -1;
+
+	EXFAT_I(inode)->target = NULL;
+
+	FsReadStat(inode, &info);
+
+	inode->i_uid = sbi->options.fs_uid;
+	inode->i_gid = sbi->options.fs_gid;
+	inode->i_version++;
+	inode->i_generation = 0;
+	inode->i_mode = exfat_make_mode(sbi, ATTR_SUBDIR, S_IRWXUGO);
+	inode->i_op = &exfat_dir_inode_operations;
+	inode->i_fop = &exfat_dir_operations;
+
+	i_size_write(inode, info.Size);
+	inode->i_blocks = ((i_size_read(inode) + (p_fs->cluster_size - 1))
+					   & ~((loff_t)p_fs->cluster_size - 1)) >> 9;
+	EXFAT_I(inode)->i_pos = ((loff_t) p_fs->root_dir << 32) | 0xffffffff;
+	EXFAT_I(inode)->mmu_private = i_size_read(inode);
+
+	exfat_save_attr(inode, ATTR_SUBDIR);
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ts;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,2,0)
+	set_nlink(inode, info.NumSubdirs + 2);
+#else
+	inode->i_nlink = info.NumSubdirs + 2;
+#endif
+
+	return 0;
+}
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,37)
+static void setup_dops(struct super_block *sb)
+{
+	if (EXFAT_SB(sb)->options.casesensitive == 0)
+		sb->s_d_op = &exfat_ci_dentry_ops;
+	else
+		sb->s_d_op = &exfat_dentry_ops;
+}
+#endif
+
+static int exfat_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *root_inode = NULL;
+	struct exfat_sb_info *sbi;
+	int debug, ret;
+	long error;
+	char buf[50];
+
+	/*
+	 * GFP_KERNEL is ok here, because while we do hold the
+	 * superblock lock, memory pressure can't call back into
+	 * the filesystem, since we're only just about to mount
+	 * it and have no inodes etc active!
+	 */
+	sbi = kzalloc(sizeof(struct exfat_sb_info), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,0)
+	mutex_init(&sbi->s_lock);
+#endif
+	sb->s_fs_info = sbi;
+	sb->s_flags |= MS_NODIRATIME;
+	sb->s_magic = EXFAT_SUPER_MAGIC;
+	sb->s_op = &exfat_sops;
+
+	error = parse_options(data, silent, &debug, &sbi->options);
+	if (error)
+		goto out_fail;
+
+#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,37)
+	setup_dops(sb);
+#endif
+
+	error = -EIO;
+	sb_min_blocksize(sb, 512);
+	sb->s_maxbytes = 0x7fffffffffffffffLL;    /* maximum file size */
+
+	ret = FsMountVol(sb);
+	if (ret) {
+		if (!silent)
+			printk(KERN_ERR "[EXFAT] FsMountVol failed\n");
+
+		goto out_fail;
+	}
+
+	/* set up enough so that it can read an inode */
+	exfat_hash_init(sb);
+
+	/*
+	 * The low byte of the FAT's first entry must have the same value as
+	 * the media field.  But in the real world, too many devices write
+	 * the wrong value, so that validity check was removed.
+	 *
+	 * if (FAT_FIRST_ENT(sb, media) != first)
+	 */
+
+	/* codepage is not meaningful in exfat */
+	if (sbi->fs_info.vol_type != EXFAT) {
+		error = -EINVAL;
+		sprintf(buf, "cp%d", sbi->options.codepage);
+		sbi->nls_disk = load_nls(buf);
+		if (!sbi->nls_disk) {
+			printk(KERN_ERR "[EXFAT] Codepage %s not found\n", buf);
+			goto out_fail2;
+		}
+	}
+
+	sbi->nls_io = load_nls(sbi->options.iocharset);
+
+	error = -ENOMEM;
+	root_inode = new_inode(sb);
+	if (!root_inode)
+		goto out_fail2;
+	root_inode->i_ino = EXFAT_ROOT_INO;
+	root_inode->i_version = 1;
+	error = exfat_read_root(root_inode);
+	if (error < 0)
+		goto out_fail2;
+	error = -ENOMEM;
+	exfat_attach(root_inode, EXFAT_I(root_inode)->i_pos);
+	insert_inode_hash(root_inode);
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,4,0)
+	sb->s_root = d_make_root(root_inode);
+#else
+	sb->s_root = d_alloc_root(root_inode);
+#endif
+	if (!sb->s_root) {
+		printk(KERN_ERR "[EXFAT] Getting the root inode failed\n");
+		goto out_fail2;
+	}
+
+	return 0;
+
+out_fail2:
+	FsUmountVol(sb);
+out_fail:
+	if (root_inode)
+		iput(root_inode);
+	sb->s_fs_info = NULL;
+	exfat_free_super(sbi);
+	return error;
+}
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+static int exfat_get_sb(struct file_system_type *fs_type,
+						int flags, const char *dev_name,
+						void *data, struct vfsmount *mnt)
+{
+	return get_sb_bdev(fs_type, flags, dev_name, data, exfat_fill_super, mnt);
+}
+#else
+static struct dentry *exfat_fs_mount(struct file_system_type *fs_type,
+									 int flags, const char *dev_name,
+									 void *data) {
+	return mount_bdev(fs_type, flags, dev_name, data, exfat_fill_super);
+}
+#endif
+
+static void init_once(void *foo)
+{
+	struct exfat_inode_info *ei = (struct exfat_inode_info *)foo;
+
+	INIT_HLIST_NODE(&ei->i_hash_fat);
+	inode_init_once(&ei->vfs_inode);
+}
+
+static int __init exfat_init_inodecache(void)
+{
+	exfat_inode_cachep = kmem_cache_create("exfat_inode_cache",
+										   sizeof(struct exfat_inode_info),
+										   0, (SLAB_RECLAIM_ACCOUNT|
+												   SLAB_MEM_SPREAD),
+										   init_once);
+	if (exfat_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+static void __exit exfat_destroy_inodecache(void)
+{
+	kmem_cache_destroy(exfat_inode_cachep);
+}
+
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+static void exfat_debug_kill_sb(struct super_block *sb)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(sb);
+	struct block_device *bdev = sb->s_bdev;
+
+	long flags;
+
+	if (sbi) {
+		flags = sbi->debug_flags;
+
+		if (flags & EXFAT_DEBUGFLAGS_INVALID_UMOUNT) {
+			/* invalidate_bdev drops all device cache, including dirty
+			   data; we use this to simulate device removal */
+			FsReleaseCache(sb);
+			invalidate_bdev(bdev);
+		}
+	}
+
+	kill_block_super(sb);
+}
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+
+static struct file_system_type exfat_fs_type = {
+	.owner       = THIS_MODULE,
+#if defined(CONFIG_MACH_LGE) || defined(CONFIG_HTC_BATT_CORE)
+	.name        = "texfat",
+#else
+	.name        = "exfat",
+#endif
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,37)
+	.get_sb      = exfat_get_sb,
+#else
+	.mount       = exfat_fs_mount,
+#endif
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	.kill_sb    = exfat_debug_kill_sb,
+#else
+	.kill_sb    = kill_block_super,
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+	.fs_flags    = FS_REQUIRES_DEV,
+};
+
+static int __init init_exfat(void)
+{
+	int err;
+
+	err = FsInit();
+	if (err) {
+		if (err == FFS_MEMORYERR)
+			return -ENOMEM;
+		else
+			return -EIO;
+	}
+
+	printk(KERN_INFO "exFAT: Version %s\n", EXFAT_VERSION);
+
+	err = exfat_init_inodecache();
+	if (err)
+		goto out;
+
+	err = register_filesystem(&exfat_fs_type);
+	if (err)
+		goto out;
+
+	return 0;
+out:
+	FsShutdown();
+	return err;
+}
+
+static void __exit exit_exfat(void)
+{
+	exfat_destroy_inodecache();
+	unregister_filesystem(&exfat_fs_type);
+	FsShutdown();
+}
+
+module_init(init_exfat);
+module_exit(exit_exfat);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("exFAT Filesystem Driver");
+#ifdef MODULE_ALIAS_FS
+#if defined(CONFIG_MACH_LGE) || defined(CONFIG_HTC_BATT_CORE)
+MODULE_ALIAS_FS("texfat");
+#else
+MODULE_ALIAS_FS("exfat");
+#endif
+#endif
\ No newline at end of file
diff --git b/fs/exfat/exfat_super.h b/fs/exfat/exfat_super.h
new file mode 100644
index 0000000..916811e
--- /dev/null
+++ b/fs/exfat/exfat_super.h
@@ -0,0 +1,171 @@
+/* Some of the source code in this file came from "linux/fs/fat/fat.h".  */
+
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#ifndef _EXFAT_LINUX_H
+#define _EXFAT_LINUX_H
+
+#include <linux/buffer_head.h>
+#include <linux/string.h>
+#include <linux/nls.h>
+#include <linux/fs.h>
+#include <linux/mutex.h>
+#include <linux/swap.h>
+
+#include "exfat_config.h"
+#include "exfat_data.h"
+#include "exfat_oal.h"
+
+#include "exfat_blkdev.h"
+#include "exfat_cache.h"
+#include "exfat_nls.h"
+#include "exfat_api.h"
+#include "exfat_core.h"
+
+#define EXFAT_ERRORS_CONT  1    /* ignore error and continue */
+#define EXFAT_ERRORS_PANIC 2    /* panic on error */
+#define EXFAT_ERRORS_RO    3    /* remount r/o on error */
+
+/* ioctl command */
+#define EXFAT_IOCTL_GET_VOLUME_ID _IOR('r', 0x12, __u32)
+
+struct exfat_mount_options {
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0)
+	kuid_t fs_uid;
+	kgid_t fs_gid;
+#else
+	uid_t fs_uid;
+	gid_t fs_gid;
+#endif
+	unsigned short fs_fmask;
+	unsigned short fs_dmask;
+	unsigned short allow_utime; /* permission for setting the [am]time */
+	unsigned short codepage;    /* codepage for shortname conversions */
+	char *iocharset;            /* charset for filename input/display */
+	unsigned char casesensitive;
+	unsigned char errors;       /* on error: continue, panic, remount-ro */
+#ifdef CONFIG_EXFAT_DISCARD
+	unsigned char discard;      /* set if -o discard is specified and the device supports discard */
+#endif /* CONFIG_EXFAT_DISCARD */
+};
+
+#define EXFAT_HASH_BITS    8
+#define EXFAT_HASH_SIZE    (1UL << EXFAT_HASH_BITS)
+
+/*
+ * EXFAT file system in-core superblock data
+ */
+struct exfat_sb_info {
+	FS_INFO_T fs_info;
+	BD_INFO_T bd_info;
+
+	struct exfat_mount_options options;
+
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,7,0)
+	int s_dirt;
+	struct mutex s_lock;
+#endif
+	struct nls_table *nls_disk; /* Codepage used on disk */
+	struct nls_table *nls_io;   /* Charset used for input and display */
+
+	struct inode *fat_inode;
+
+	spinlock_t inode_hash_lock;
+	struct hlist_head inode_hashtable[EXFAT_HASH_SIZE];
+#ifdef CONFIG_EXFAT_KERNEL_DEBUG
+	long debug_flags;
+#endif /* CONFIG_EXFAT_KERNEL_DEBUG */
+};
+
+/*
+ * EXFAT file system inode data in memory
+ */
+struct exfat_inode_info {
+	FILE_ID_T fid;
+	char  *target;
+	/* NOTE: mmu_private is 64 bits, so ->i_mutex must be held to access it */
+	loff_t mmu_private;         /* physically allocated size */
+	loff_t i_pos;               /* on-disk position of directory entry or 0 */
+	struct hlist_node i_hash_fat;	/* hash by i_location */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,4,0)
+	struct rw_semaphore truncate_lock;
+#endif
+	struct inode vfs_inode;
+	struct rw_semaphore i_alloc_sem; /* protect bmap against truncate */
+};
+
+#define EXFAT_SB(sb)		((struct exfat_sb_info *)((sb)->s_fs_info))
+
+static inline struct exfat_inode_info *EXFAT_I(struct inode *inode)
+{
+	return container_of(inode, struct exfat_inode_info, vfs_inode);
+}
+
+/*
+ * If ->i_mode can't hold S_IWUGO (i.e. ATTR_RO), we use ->i_attrs to
+ * save ATTR_RO instead of ->i_mode.
+ *
+ * If it's a directory and !sbi->options.rodir, ATTR_RO isn't the
+ * read-only bit; it's just used as a flag for applications.
+ */
+static inline int exfat_mode_can_hold_ro(struct inode *inode)
+{
+	struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+
+	if (S_ISDIR(inode->i_mode))
+		return 0;
+
+	if ((~sbi->options.fs_fmask) & S_IWUGO)
+		return 1;
+	return 0;
+}
+
+/* Convert attribute bits and a mask to the UNIX mode. */
+static inline mode_t exfat_make_mode(struct exfat_sb_info *sbi,
+									 u32 attr, mode_t mode)
+{
+	if ((attr & ATTR_READONLY) && !(attr & ATTR_SUBDIR))
+		mode &= ~S_IWUGO;
+
+	if (attr & ATTR_SUBDIR)
+		return (mode & ~sbi->options.fs_dmask) | S_IFDIR;
+	else if (attr & ATTR_SYMLINK)
+		return (mode & ~sbi->options.fs_dmask) | S_IFLNK;
+	else
+		return (mode & ~sbi->options.fs_fmask) | S_IFREG;
+}
+
+/* Return the FAT attribute byte for this inode */
+static inline u32 exfat_make_attr(struct inode *inode)
+{
+	if (exfat_mode_can_hold_ro(inode) && !(inode->i_mode & S_IWUGO))
+		return (EXFAT_I(inode)->fid.attr) | ATTR_READONLY;
+	else
+		return EXFAT_I(inode)->fid.attr;
+}
+
+static inline void exfat_save_attr(struct inode *inode, u32 attr)
+{
+	if (exfat_mode_can_hold_ro(inode))
+		EXFAT_I(inode)->fid.attr = attr & ATTR_RWMASK;
+	else
+		EXFAT_I(inode)->fid.attr = attr & (ATTR_RWMASK | ATTR_READONLY);
+}
+
+#endif /* _EXFAT_LINUX_H */
diff --git b/fs/exfat/exfat_upcase.c b/fs/exfat/exfat_upcase.c
new file mode 100644
index 0000000..3807f37
--- /dev/null
+++ b/fs/exfat/exfat_upcase.c
@@ -0,0 +1,405 @@
+/*
+ *  Copyright (C) 2012-2013 Samsung Electronics Co., Ltd.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2
+ *  of the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_upcase.c                                            */
+/*  PURPOSE : exFAT Up-case Table                                       */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY (Ver 0.9)                                          */
+/*                                                                      */
+/*  - 2010.11.15 [Joosun Hahn] : first writing                          */
+/*                                                                      */
+/************************************************************************/
+
+#include "exfat_config.h"
+
+#include "exfat_nls.h"
+
+const u8 uni_upcase[NUM_UPCASE<<1] = {
+	0x00, 0x00, 0x01, 0x00, 0x02, 0x00, 0x03, 0x00, 0x04, 0x00, 0x05, 0x00, 0x06, 0x00, 0x07, 0x00,
+	0x08, 0x00, 0x09, 0x00, 0x0A, 0x00, 0x0B, 0x00, 0x0C, 0x00, 0x0D, 0x00, 0x0E, 0x00, 0x0F, 0x00,
+	0x10, 0x00, 0x11, 0x00, 0x12, 0x00, 0x13, 0x00, 0x14, 0x00, 0x15, 0x00, 0x16, 0x00, 0x17, 0x00,
+	0x18, 0x00, 0x19, 0x00, 0x1A, 0x00, 0x1B, 0x00, 0x1C, 0x00, 0x1D, 0x00, 0x1E, 0x00, 0x1F, 0x00,
+	0x20, 0x00, 0x21, 0x00, 0x22, 0x00, 0x23, 0x00, 0x24, 0x00, 0x25, 0x00, 0x26, 0x00, 0x27, 0x00,
+	0x28, 0x00, 0x29, 0x00, 0x2A, 0x00, 0x2B, 0x00, 0x2C, 0x00, 0x2D, 0x00, 0x2E, 0x00, 0x2F, 0x00,
+	0x30, 0x00, 0x31, 0x00, 0x32, 0x00, 0x33, 0x00, 0x34, 0x00, 0x35, 0x00, 0x36, 0x00, 0x37, 0x00,
+	0x38, 0x00, 0x39, 0x00, 0x3A, 0x00, 0x3B, 0x00, 0x3C, 0x00, 0x3D, 0x00, 0x3E, 0x00, 0x3F, 0x00,
+	0x40, 0x00, 0x41, 0x00, 0x42, 0x00, 0x43, 0x00, 0x44, 0x00, 0x45, 0x00, 0x46, 0x00, 0x47, 0x00,
+	0x48, 0x00, 0x49, 0x00, 0x4A, 0x00, 0x4B, 0x00, 0x4C, 0x00, 0x4D, 0x00, 0x4E, 0x00, 0x4F, 0x00,
+	0x50, 0x00, 0x51, 0x00, 0x52, 0x00, 0x53, 0x00, 0x54, 0x00, 0x55, 0x00, 0x56, 0x00, 0x57, 0x00,
+	0x58, 0x00, 0x59, 0x00, 0x5A, 0x00, 0x5B, 0x00, 0x5C, 0x00, 0x5D, 0x00, 0x5E, 0x00, 0x5F, 0x00,
+	0x60, 0x00, 0x41, 0x00, 0x42, 0x00, 0x43, 0x00, 0x44, 0x00, 0x45, 0x00, 0x46, 0x00, 0x47, 0x00,
+	0x48, 0x00, 0x49, 0x00, 0x4A, 0x00, 0x4B, 0x00, 0x4C, 0x00, 0x4D, 0x00, 0x4E, 0x00, 0x4F, 0x00,
+	0x50, 0x00, 0x51, 0x00, 0x52, 0x00, 0x53, 0x00, 0x54, 0x00, 0x55, 0x00, 0x56, 0x00, 0x57, 0x00,
+	0x58, 0x00, 0x59, 0x00, 0x5A, 0x00, 0x7B, 0x00, 0x7C, 0x00, 0x7D, 0x00, 0x7E, 0x00, 0x7F, 0x00,
+	0x80, 0x00, 0x81, 0x00, 0x82, 0x00, 0x83, 0x00, 0x84, 0x00, 0x85, 0x00, 0x86, 0x00, 0x87, 0x00,
+	0x88, 0x00, 0x89, 0x00, 0x8A, 0x00, 0x8B, 0x00, 0x8C, 0x00, 0x8D, 0x00, 0x8E, 0x00, 0x8F, 0x00,
+	0x90, 0x00, 0x91, 0x00, 0x92, 0x00, 0x93, 0x00, 0x94, 0x00, 0x95, 0x00, 0x96, 0x00, 0x97, 0x00,
+	0x98, 0x00, 0x99, 0x00, 0x9A, 0x00, 0x9B, 0x00, 0x9C, 0x00, 0x9D, 0x00, 0x9E, 0x00, 0x9F, 0x00,
+	0xA0, 0x00, 0xA1, 0x00, 0xA2, 0x00, 0xA3, 0x00, 0xA4, 0x00, 0xA5, 0x00, 0xA6, 0x00, 0xA7, 0x00,
+	0xA8, 0x00, 0xA9, 0x00, 0xAA, 0x00, 0xAB, 0x00, 0xAC, 0x00, 0xAD, 0x00, 0xAE, 0x00, 0xAF, 0x00,
+	0xB0, 0x00, 0xB1, 0x00, 0xB2, 0x00, 0xB3, 0x00, 0xB4, 0x00, 0xB5, 0x00, 0xB6, 0x00, 0xB7, 0x00,
+	0xB8, 0x00, 0xB9, 0x00, 0xBA, 0x00, 0xBB, 0x00, 0xBC, 0x00, 0xBD, 0x00, 0xBE, 0x00, 0xBF, 0x00,
+	0xC0, 0x00, 0xC1, 0x00, 0xC2, 0x00, 0xC3, 0x00, 0xC4, 0x00, 0xC5, 0x00, 0xC6, 0x00, 0xC7, 0x00,
+	0xC8, 0x00, 0xC9, 0x00, 0xCA, 0x00, 0xCB, 0x00, 0xCC, 0x00, 0xCD, 0x00, 0xCE, 0x00, 0xCF, 0x00,
+	0xD0, 0x00, 0xD1, 0x00, 0xD2, 0x00, 0xD3, 0x00, 0xD4, 0x00, 0xD5, 0x00, 0xD6, 0x00, 0xD7, 0x00,
+	0xD8, 0x00, 0xD9, 0x00, 0xDA, 0x00, 0xDB, 0x00, 0xDC, 0x00, 0xDD, 0x00, 0xDE, 0x00, 0xDF, 0x00,
+	0xC0, 0x00, 0xC1, 0x00, 0xC2, 0x00, 0xC3, 0x00, 0xC4, 0x00, 0xC5, 0x00, 0xC6, 0x00, 0xC7, 0x00,
+	0xC8, 0x00, 0xC9, 0x00, 0xCA, 0x00, 0xCB, 0x00, 0xCC, 0x00, 0xCD, 0x00, 0xCE, 0x00, 0xCF, 0x00,
+	0xD0, 0x00, 0xD1, 0x00, 0xD2, 0x00, 0xD3, 0x00, 0xD4, 0x00, 0xD5, 0x00, 0xD6, 0x00, 0xF7, 0x00,
+	0xD8, 0x00, 0xD9, 0x00, 0xDA, 0x00, 0xDB, 0x00, 0xDC, 0x00, 0xDD, 0x00, 0xDE, 0x00, 0x78, 0x01,
+	0x00, 0x01, 0x00, 0x01, 0x02, 0x01, 0x02, 0x01, 0x04, 0x01, 0x04, 0x01, 0x06, 0x01, 0x06, 0x01,
+	0x08, 0x01, 0x08, 0x01, 0x0A, 0x01, 0x0A, 0x01, 0x0C, 0x01, 0x0C, 0x01, 0x0E, 0x01, 0x0E, 0x01,
+	0x10, 0x01, 0x10, 0x01, 0x12, 0x01, 0x12, 0x01, 0x14, 0x01, 0x14, 0x01, 0x16, 0x01, 0x16, 0x01,
+	0x18, 0x01, 0x18, 0x01, 0x1A, 0x01, 0x1A, 0x01, 0x1C, 0x01, 0x1C, 0x01, 0x1E, 0x01, 0x1E, 0x01,
+	0x20, 0x01, 0x20, 0x01, 0x22, 0x01, 0x22, 0x01, 0x24, 0x01, 0x24, 0x01, 0x26, 0x01, 0x26, 0x01,
+	0x28, 0x01, 0x28, 0x01, 0x2A, 0x01, 0x2A, 0x01, 0x2C, 0x01, 0x2C, 0x01, 0x2E, 0x01, 0x2E, 0x01,
+	0x30, 0x01, 0x31, 0x01, 0x32, 0x01, 0x32, 0x01, 0x34, 0x01, 0x34, 0x01, 0x36, 0x01, 0x36, 0x01,
+	0x38, 0x01, 0x39, 0x01, 0x39, 0x01, 0x3B, 0x01, 0x3B, 0x01, 0x3D, 0x01, 0x3D, 0x01, 0x3F, 0x01,
+	0x3F, 0x01, 0x41, 0x01, 0x41, 0x01, 0x43, 0x01, 0x43, 0x01, 0x45, 0x01, 0x45, 0x01, 0x47, 0x01,
+	0x47, 0x01, 0x49, 0x01, 0x4A, 0x01, 0x4A, 0x01, 0x4C, 0x01, 0x4C, 0x01, 0x4E, 0x01, 0x4E, 0x01,
+	0x50, 0x01, 0x50, 0x01, 0x52, 0x01, 0x52, 0x01, 0x54, 0x01, 0x54, 0x01, 0x56, 0x01, 0x56, 0x01,
+	0x58, 0x01, 0x58, 0x01, 0x5A, 0x01, 0x5A, 0x01, 0x5C, 0x01, 0x5C, 0x01, 0x5E, 0x01, 0x5E, 0x01,
+	0x60, 0x01, 0x60, 0x01, 0x62, 0x01, 0x62, 0x01, 0x64, 0x01, 0x64, 0x01, 0x66, 0x01, 0x66, 0x01,
+	0x68, 0x01, 0x68, 0x01, 0x6A, 0x01, 0x6A, 0x01, 0x6C, 0x01, 0x6C, 0x01, 0x6E, 0x01, 0x6E, 0x01,
+	0x70, 0x01, 0x70, 0x01, 0x72, 0x01, 0x72, 0x01, 0x74, 0x01, 0x74, 0x01, 0x76, 0x01, 0x76, 0x01,
+	0x78, 0x01, 0x79, 0x01, 0x79, 0x01, 0x7B, 0x01, 0x7B, 0x01, 0x7D, 0x01, 0x7D, 0x01, 0x7F, 0x01,
+	0x43, 0x02, 0x81, 0x01, 0x82, 0x01, 0x82, 0x01, 0x84, 0x01, 0x84, 0x01, 0x86, 0x01, 0x87, 0x01,
+	0x87, 0x01, 0x89, 0x01, 0x8A, 0x01, 0x8B, 0x01, 0x8B, 0x01, 0x8D, 0x01, 0x8E, 0x01, 0x8F, 0x01,
+	0x90, 0x01, 0x91, 0x01, 0x91, 0x01, 0x93, 0x01, 0x94, 0x01, 0xF6, 0x01, 0x96, 0x01, 0x97, 0x01,
+	0x98, 0x01, 0x98, 0x01, 0x3D, 0x02, 0x9B, 0x01, 0x9C, 0x01, 0x9D, 0x01, 0x20, 0x02, 0x9F, 0x01,
+	0xA0, 0x01, 0xA0, 0x01, 0xA2, 0x01, 0xA2, 0x01, 0xA4, 0x01, 0xA4, 0x01, 0xA6, 0x01, 0xA7, 0x01,
+	0xA7, 0x01, 0xA9, 0x01, 0xAA, 0x01, 0xAB, 0x01, 0xAC, 0x01, 0xAC, 0x01, 0xAE, 0x01, 0xAF, 0x01,
+	0xAF, 0x01, 0xB1, 0x01, 0xB2, 0x01, 0xB3, 0x01, 0xB3, 0x01, 0xB5, 0x01, 0xB5, 0x01, 0xB7, 0x01,
+	0xB8, 0x01, 0xB8, 0x01, 0xBA, 0x01, 0xBB, 0x01, 0xBC, 0x01, 0xBC, 0x01, 0xBE, 0x01, 0xF7, 0x01,
+	0xC0, 0x01, 0xC1, 0x01, 0xC2, 0x01, 0xC3, 0x01, 0xC4, 0x01, 0xC5, 0x01, 0xC4, 0x01, 0xC7, 0x01,
+	0xC8, 0x01, 0xC7, 0x01, 0xCA, 0x01, 0xCB, 0x01, 0xCA, 0x01, 0xCD, 0x01, 0xCD, 0x01, 0xCF, 0x01,
+	0xCF, 0x01, 0xD1, 0x01, 0xD1, 0x01, 0xD3, 0x01, 0xD3, 0x01, 0xD5, 0x01, 0xD5, 0x01, 0xD7, 0x01,
+	0xD7, 0x01, 0xD9, 0x01, 0xD9, 0x01, 0xDB, 0x01, 0xDB, 0x01, 0x8E, 0x01, 0xDE, 0x01, 0xDE, 0x01,
+	0xE0, 0x01, 0xE0, 0x01, 0xE2, 0x01, 0xE2, 0x01, 0xE4, 0x01, 0xE4, 0x01, 0xE6, 0x01, 0xE6, 0x01,
+	0xE8, 0x01, 0xE8, 0x01, 0xEA, 0x01, 0xEA, 0x01, 0xEC, 0x01, 0xEC, 0x01, 0xEE, 0x01, 0xEE, 0x01,
+	0xF0, 0x01, 0xF1, 0x01, 0xF2, 0x01, 0xF1, 0x01, 0xF4, 0x01, 0xF4, 0x01, 0xF6, 0x01, 0xF7, 0x01,
+	0xF8, 0x01, 0xF8, 0x01, 0xFA, 0x01, 0xFA, 0x01, 0xFC, 0x01, 0xFC, 0x01, 0xFE, 0x01, 0xFE, 0x01,
+	0x00, 0x02, 0x00, 0x02, 0x02, 0x02, 0x02, 0x02, 0x04, 0x02, 0x04, 0x02, 0x06, 0x02, 0x06, 0x02,
+	0x08, 0x02, 0x08, 0x02, 0x0A, 0x02, 0x0A, 0x02, 0x0C, 0x02, 0x0C, 0x02, 0x0E, 0x02, 0x0E, 0x02,
+	0x10, 0x02, 0x10, 0x02, 0x12, 0x02, 0x12, 0x02, 0x14, 0x02, 0x14, 0x02, 0x16, 0x02, 0x16, 0x02,
+	0x18, 0x02, 0x18, 0x02, 0x1A, 0x02, 0x1A, 0x02, 0x1C, 0x02, 0x1C, 0x02, 0x1E, 0x02, 0x1E, 0x02,
+	0x20, 0x02, 0x21, 0x02, 0x22, 0x02, 0x22, 0x02, 0x24, 0x02, 0x24, 0x02, 0x26, 0x02, 0x26, 0x02,
+	0x28, 0x02, 0x28, 0x02, 0x2A, 0x02, 0x2A, 0x02, 0x2C, 0x02, 0x2C, 0x02, 0x2E, 0x02, 0x2E, 0x02,
+	0x30, 0x02, 0x30, 0x02, 0x32, 0x02, 0x32, 0x02, 0x34, 0x02, 0x35, 0x02, 0x36, 0x02, 0x37, 0x02,
+	0x38, 0x02, 0x39, 0x02, 0x65, 0x2C, 0x3B, 0x02, 0x3B, 0x02, 0x3D, 0x02, 0x66, 0x2C, 0x3F, 0x02,
+	0x40, 0x02, 0x41, 0x02, 0x41, 0x02, 0x43, 0x02, 0x44, 0x02, 0x45, 0x02, 0x46, 0x02, 0x46, 0x02,
+	0x48, 0x02, 0x48, 0x02, 0x4A, 0x02, 0x4A, 0x02, 0x4C, 0x02, 0x4C, 0x02, 0x4E, 0x02, 0x4E, 0x02,
+	0x50, 0x02, 0x51, 0x02, 0x52, 0x02, 0x81, 0x01, 0x86, 0x01, 0x55, 0x02, 0x89, 0x01, 0x8A, 0x01,
+	0x58, 0x02, 0x8F, 0x01, 0x5A, 0x02, 0x90, 0x01, 0x5C, 0x02, 0x5D, 0x02, 0x5E, 0x02, 0x5F, 0x02,
+	0x93, 0x01, 0x61, 0x02, 0x62, 0x02, 0x94, 0x01, 0x64, 0x02, 0x65, 0x02, 0x66, 0x02, 0x67, 0x02,
+	0x97, 0x01, 0x96, 0x01, 0x6A, 0x02, 0x62, 0x2C, 0x6C, 0x02, 0x6D, 0x02, 0x6E, 0x02, 0x9C, 0x01,
+	0x70, 0x02, 0x71, 0x02, 0x9D, 0x01, 0x73, 0x02, 0x74, 0x02, 0x9F, 0x01, 0x76, 0x02, 0x77, 0x02,
+	0x78, 0x02, 0x79, 0x02, 0x7A, 0x02, 0x7B, 0x02, 0x7C, 0x02, 0x64, 0x2C, 0x7E, 0x02, 0x7F, 0x02,
+	0xA6, 0x01, 0x81, 0x02, 0x82, 0x02, 0xA9, 0x01, 0x84, 0x02, 0x85, 0x02, 0x86, 0x02, 0x87, 0x02,
+	0xAE, 0x01, 0x44, 0x02, 0xB1, 0x01, 0xB2, 0x01, 0x45, 0x02, 0x8D, 0x02, 0x8E, 0x02, 0x8F, 0x02,
+	0x90, 0x02, 0x91, 0x02, 0xB7, 0x01, 0x93, 0x02, 0x94, 0x02, 0x95, 0x02, 0x96, 0x02, 0x97, 0x02,
+	0x98, 0x02, 0x99, 0x02, 0x9A, 0x02, 0x9B, 0x02, 0x9C, 0x02, 0x9D, 0x02, 0x9E, 0x02, 0x9F, 0x02,
+	0xA0, 0x02, 0xA1, 0x02, 0xA2, 0x02, 0xA3, 0x02, 0xA4, 0x02, 0xA5, 0x02, 0xA6, 0x02, 0xA7, 0x02,
+	0xA8, 0x02, 0xA9, 0x02, 0xAA, 0x02, 0xAB, 0x02, 0xAC, 0x02, 0xAD, 0x02, 0xAE, 0x02, 0xAF, 0x02,
+	0xB0, 0x02, 0xB1, 0x02, 0xB2, 0x02, 0xB3, 0x02, 0xB4, 0x02, 0xB5, 0x02, 0xB6, 0x02, 0xB7, 0x02,
+	0xB8, 0x02, 0xB9, 0x02, 0xBA, 0x02, 0xBB, 0x02, 0xBC, 0x02, 0xBD, 0x02, 0xBE, 0x02, 0xBF, 0x02,
+	0xC0, 0x02, 0xC1, 0x02, 0xC2, 0x02, 0xC3, 0x02, 0xC4, 0x02, 0xC5, 0x02, 0xC6, 0x02, 0xC7, 0x02,
+	0xC8, 0x02, 0xC9, 0x02, 0xCA, 0x02, 0xCB, 0x02, 0xCC, 0x02, 0xCD, 0x02, 0xCE, 0x02, 0xCF, 0x02,
+	0xD0, 0x02, 0xD1, 0x02, 0xD2, 0x02, 0xD3, 0x02, 0xD4, 0x02, 0xD5, 0x02, 0xD6, 0x02, 0xD7, 0x02,
+	0xD8, 0x02, 0xD9, 0x02, 0xDA, 0x02, 0xDB, 0x02, 0xDC, 0x02, 0xDD, 0x02, 0xDE, 0x02, 0xDF, 0x02,
+	0xE0, 0x02, 0xE1, 0x02, 0xE2, 0x02, 0xE3, 0x02, 0xE4, 0x02, 0xE5, 0x02, 0xE6, 0x02, 0xE7, 0x02,
+	0xE8, 0x02, 0xE9, 0x02, 0xEA, 0x02, 0xEB, 0x02, 0xEC, 0x02, 0xED, 0x02, 0xEE, 0x02, 0xEF, 0x02,
+	0xF0, 0x02, 0xF1, 0x02, 0xF2, 0x02, 0xF3, 0x02, 0xF4, 0x02, 0xF5, 0x02, 0xF6, 0x02, 0xF7, 0x02,
+	0xF8, 0x02, 0xF9, 0x02, 0xFA, 0x02, 0xFB, 0x02, 0xFC, 0x02, 0xFD, 0x02, 0xFE, 0x02, 0xFF, 0x02,
+	0x00, 0x03, 0x01, 0x03, 0x02, 0x03, 0x03, 0x03, 0x04, 0x03, 0x05, 0x03, 0x06, 0x03, 0x07, 0x03,
+	0x08, 0x03, 0x09, 0x03, 0x0A, 0x03, 0x0B, 0x03, 0x0C, 0x03, 0x0D, 0x03, 0x0E, 0x03, 0x0F, 0x03,
+	0x10, 0x03, 0x11, 0x03, 0x12, 0x03, 0x13, 0x03, 0x14, 0x03, 0x15, 0x03, 0x16, 0x03, 0x17, 0x03,
+	0x18, 0x03, 0x19, 0x03, 0x1A, 0x03, 0x1B, 0x03, 0x1C, 0x03, 0x1D, 0x03, 0x1E, 0x03, 0x1F, 0x03,
+	0x20, 0x03, 0x21, 0x03, 0x22, 0x03, 0x23, 0x03, 0x24, 0x03, 0x25, 0x03, 0x26, 0x03, 0x27, 0x03,
+	0x28, 0x03, 0x29, 0x03, 0x2A, 0x03, 0x2B, 0x03, 0x2C, 0x03, 0x2D, 0x03, 0x2E, 0x03, 0x2F, 0x03,
+	0x30, 0x03, 0x31, 0x03, 0x32, 0x03, 0x33, 0x03, 0x34, 0x03, 0x35, 0x03, 0x36, 0x03, 0x37, 0x03,
+	0x38, 0x03, 0x39, 0x03, 0x3A, 0x03, 0x3B, 0x03, 0x3C, 0x03, 0x3D, 0x03, 0x3E, 0x03, 0x3F, 0x03,
+	0x40, 0x03, 0x41, 0x03, 0x42, 0x03, 0x43, 0x03, 0x44, 0x03, 0x45, 0x03, 0x46, 0x03, 0x47, 0x03,
+	0x48, 0x03, 0x49, 0x03, 0x4A, 0x03, 0x4B, 0x03, 0x4C, 0x03, 0x4D, 0x03, 0x4E, 0x03, 0x4F, 0x03,
+	0x50, 0x03, 0x51, 0x03, 0x52, 0x03, 0x53, 0x03, 0x54, 0x03, 0x55, 0x03, 0x56, 0x03, 0x57, 0x03,
+	0x58, 0x03, 0x59, 0x03, 0x5A, 0x03, 0x5B, 0x03, 0x5C, 0x03, 0x5D, 0x03, 0x5E, 0x03, 0x5F, 0x03,
+	0x60, 0x03, 0x61, 0x03, 0x62, 0x03, 0x63, 0x03, 0x64, 0x03, 0x65, 0x03, 0x66, 0x03, 0x67, 0x03,
+	0x68, 0x03, 0x69, 0x03, 0x6A, 0x03, 0x6B, 0x03, 0x6C, 0x03, 0x6D, 0x03, 0x6E, 0x03, 0x6F, 0x03,
+	0x70, 0x03, 0x71, 0x03, 0x72, 0x03, 0x73, 0x03, 0x74, 0x03, 0x75, 0x03, 0x76, 0x03, 0x77, 0x03,
+	0x78, 0x03, 0x79, 0x03, 0x7A, 0x03, 0xFD, 0x03, 0xFE, 0x03, 0xFF, 0x03, 0x7E, 0x03, 0x7F, 0x03,
+	0x80, 0x03, 0x81, 0x03, 0x82, 0x03, 0x83, 0x03, 0x84, 0x03, 0x85, 0x03, 0x86, 0x03, 0x87, 0x03,
+	0x88, 0x03, 0x89, 0x03, 0x8A, 0x03, 0x8B, 0x03, 0x8C, 0x03, 0x8D, 0x03, 0x8E, 0x03, 0x8F, 0x03,
+	0x90, 0x03, 0x91, 0x03, 0x92, 0x03, 0x93, 0x03, 0x94, 0x03, 0x95, 0x03, 0x96, 0x03, 0x97, 0x03,
+	0x98, 0x03, 0x99, 0x03, 0x9A, 0x03, 0x9B, 0x03, 0x9C, 0x03, 0x9D, 0x03, 0x9E, 0x03, 0x9F, 0x03,
+	0xA0, 0x03, 0xA1, 0x03, 0xA2, 0x03, 0xA3, 0x03, 0xA4, 0x03, 0xA5, 0x03, 0xA6, 0x03, 0xA7, 0x03,
+	0xA8, 0x03, 0xA9, 0x03, 0xAA, 0x03, 0xAB, 0x03, 0x86, 0x03, 0x88, 0x03, 0x89, 0x03, 0x8A, 0x03,
+	0xB0, 0x03, 0x91, 0x03, 0x92, 0x03, 0x93, 0x03, 0x94, 0x03, 0x95, 0x03, 0x96, 0x03, 0x97, 0x03,
+	0x98, 0x03, 0x99, 0x03, 0x9A, 0x03, 0x9B, 0x03, 0x9C, 0x03, 0x9D, 0x03, 0x9E, 0x03, 0x9F, 0x03,
+	0xA0, 0x03, 0xA1, 0x03, 0xA3, 0x03, 0xA3, 0x03, 0xA4, 0x03, 0xA5, 0x03, 0xA6, 0x03, 0xA7, 0x03,
+	0xA8, 0x03, 0xA9, 0x03, 0xAA, 0x03, 0xAB, 0x03, 0x8C, 0x03, 0x8E, 0x03, 0x8F, 0x03, 0xCF, 0x03,
+	0xD0, 0x03, 0xD1, 0x03, 0xD2, 0x03, 0xD3, 0x03, 0xD4, 0x03, 0xD5, 0x03, 0xD6, 0x03, 0xD7, 0x03,
+	0xD8, 0x03, 0xD8, 0x03, 0xDA, 0x03, 0xDA, 0x03, 0xDC, 0x03, 0xDC, 0x03, 0xDE, 0x03, 0xDE, 0x03,
+	0xE0, 0x03, 0xE0, 0x03, 0xE2, 0x03, 0xE2, 0x03, 0xE4, 0x03, 0xE4, 0x03, 0xE6, 0x03, 0xE6, 0x03,
+	0xE8, 0x03, 0xE8, 0x03, 0xEA, 0x03, 0xEA, 0x03, 0xEC, 0x03, 0xEC, 0x03, 0xEE, 0x03, 0xEE, 0x03,
+	0xF0, 0x03, 0xF1, 0x03, 0xF9, 0x03, 0xF3, 0x03, 0xF4, 0x03, 0xF5, 0x03, 0xF6, 0x03, 0xF7, 0x03,
+	0xF7, 0x03, 0xF9, 0x03, 0xFA, 0x03, 0xFA, 0x03, 0xFC, 0x03, 0xFD, 0x03, 0xFE, 0x03, 0xFF, 0x03,
+	0x00, 0x04, 0x01, 0x04, 0x02, 0x04, 0x03, 0x04, 0x04, 0x04, 0x05, 0x04, 0x06, 0x04, 0x07, 0x04,
+	0x08, 0x04, 0x09, 0x04, 0x0A, 0x04, 0x0B, 0x04, 0x0C, 0x04, 0x0D, 0x04, 0x0E, 0x04, 0x0F, 0x04,
+	0x10, 0x04, 0x11, 0x04, 0x12, 0x04, 0x13, 0x04, 0x14, 0x04, 0x15, 0x04, 0x16, 0x04, 0x17, 0x04,
+	0x18, 0x04, 0x19, 0x04, 0x1A, 0x04, 0x1B, 0x04, 0x1C, 0x04, 0x1D, 0x04, 0x1E, 0x04, 0x1F, 0x04,
+	0x20, 0x04, 0x21, 0x04, 0x22, 0x04, 0x23, 0x04, 0x24, 0x04, 0x25, 0x04, 0x26, 0x04, 0x27, 0x04,
+	0x28, 0x04, 0x29, 0x04, 0x2A, 0x04, 0x2B, 0x04, 0x2C, 0x04, 0x2D, 0x04, 0x2E, 0x04, 0x2F, 0x04,
+	0x10, 0x04, 0x11, 0x04, 0x12, 0x04, 0x13, 0x04, 0x14, 0x04, 0x15, 0x04, 0x16, 0x04, 0x17, 0x04,
+	0x18, 0x04, 0x19, 0x04, 0x1A, 0x04, 0x1B, 0x04, 0x1C, 0x04, 0x1D, 0x04, 0x1E, 0x04, 0x1F, 0x04,
+	0x20, 0x04, 0x21, 0x04, 0x22, 0x04, 0x23, 0x04, 0x24, 0x04, 0x25, 0x04, 0x26, 0x04, 0x27, 0x04,
+	0x28, 0x04, 0x29, 0x04, 0x2A, 0x04, 0x2B, 0x04, 0x2C, 0x04, 0x2D, 0x04, 0x2E, 0x04, 0x2F, 0x04,
+	0x00, 0x04, 0x01, 0x04, 0x02, 0x04, 0x03, 0x04, 0x04, 0x04, 0x05, 0x04, 0x06, 0x04, 0x07, 0x04,
+	0x08, 0x04, 0x09, 0x04, 0x0A, 0x04, 0x0B, 0x04, 0x0C, 0x04, 0x0D, 0x04, 0x0E, 0x04, 0x0F, 0x04,
+	0x60, 0x04, 0x60, 0x04, 0x62, 0x04, 0x62, 0x04, 0x64, 0x04, 0x64, 0x04, 0x66, 0x04, 0x66, 0x04,
+	0x68, 0x04, 0x68, 0x04, 0x6A, 0x04, 0x6A, 0x04, 0x6C, 0x04, 0x6C, 0x04, 0x6E, 0x04, 0x6E, 0x04,
+	0x70, 0x04, 0x70, 0x04, 0x72, 0x04, 0x72, 0x04, 0x74, 0x04, 0x74, 0x04, 0x76, 0x04, 0x76, 0x04,
+	0x78, 0x04, 0x78, 0x04, 0x7A, 0x04, 0x7A, 0x04, 0x7C, 0x04, 0x7C, 0x04, 0x7E, 0x04, 0x7E, 0x04,
+	0x80, 0x04, 0x80, 0x04, 0x82, 0x04, 0x83, 0x04, 0x84, 0x04, 0x85, 0x04, 0x86, 0x04, 0x87, 0x04,
+	0x88, 0x04, 0x89, 0x04, 0x8A, 0x04, 0x8A, 0x04, 0x8C, 0x04, 0x8C, 0x04, 0x8E, 0x04, 0x8E, 0x04,
+	0x90, 0x04, 0x90, 0x04, 0x92, 0x04, 0x92, 0x04, 0x94, 0x04, 0x94, 0x04, 0x96, 0x04, 0x96, 0x04,
+	0x98, 0x04, 0x98, 0x04, 0x9A, 0x04, 0x9A, 0x04, 0x9C, 0x04, 0x9C, 0x04, 0x9E, 0x04, 0x9E, 0x04,
+	0xA0, 0x04, 0xA0, 0x04, 0xA2, 0x04, 0xA2, 0x04, 0xA4, 0x04, 0xA4, 0x04, 0xA6, 0x04, 0xA6, 0x04,
+	0xA8, 0x04, 0xA8, 0x04, 0xAA, 0x04, 0xAA, 0x04, 0xAC, 0x04, 0xAC, 0x04, 0xAE, 0x04, 0xAE, 0x04,
+	0xB0, 0x04, 0xB0, 0x04, 0xB2, 0x04, 0xB2, 0x04, 0xB4, 0x04, 0xB4, 0x04, 0xB6, 0x04, 0xB6, 0x04,
+	0xB8, 0x04, 0xB8, 0x04, 0xBA, 0x04, 0xBA, 0x04, 0xBC, 0x04, 0xBC, 0x04, 0xBE, 0x04, 0xBE, 0x04,
+	0xC0, 0x04, 0xC1, 0x04, 0xC1, 0x04, 0xC3, 0x04, 0xC3, 0x04, 0xC5, 0x04, 0xC5, 0x04, 0xC7, 0x04,
+	0xC7, 0x04, 0xC9, 0x04, 0xC9, 0x04, 0xCB, 0x04, 0xCB, 0x04, 0xCD, 0x04, 0xCD, 0x04, 0xC0, 0x04,
+	0xD0, 0x04, 0xD0, 0x04, 0xD2, 0x04, 0xD2, 0x04, 0xD4, 0x04, 0xD4, 0x04, 0xD6, 0x04, 0xD6, 0x04,
+	0xD8, 0x04, 0xD8, 0x04, 0xDA, 0x04, 0xDA, 0x04, 0xDC, 0x04, 0xDC, 0x04, 0xDE, 0x04, 0xDE, 0x04,
+	0xE0, 0x04, 0xE0, 0x04, 0xE2, 0x04, 0xE2, 0x04, 0xE4, 0x04, 0xE4, 0x04, 0xE6, 0x04, 0xE6, 0x04,
+	0xE8, 0x04, 0xE8, 0x04, 0xEA, 0x04, 0xEA, 0x04, 0xEC, 0x04, 0xEC, 0x04, 0xEE, 0x04, 0xEE, 0x04,
+	0xF0, 0x04, 0xF0, 0x04, 0xF2, 0x04, 0xF2, 0x04, 0xF4, 0x04, 0xF4, 0x04, 0xF6, 0x04, 0xF6, 0x04,
+	0xF8, 0x04, 0xF8, 0x04, 0xFA, 0x04, 0xFA, 0x04, 0xFC, 0x04, 0xFC, 0x04, 0xFE, 0x04, 0xFE, 0x04,
+	0x00, 0x05, 0x00, 0x05, 0x02, 0x05, 0x02, 0x05, 0x04, 0x05, 0x04, 0x05, 0x06, 0x05, 0x06, 0x05,
+	0x08, 0x05, 0x08, 0x05, 0x0A, 0x05, 0x0A, 0x05, 0x0C, 0x05, 0x0C, 0x05, 0x0E, 0x05, 0x0E, 0x05,
+	0x10, 0x05, 0x10, 0x05, 0x12, 0x05, 0x12, 0x05, 0x14, 0x05, 0x15, 0x05, 0x16, 0x05, 0x17, 0x05,
+	0x18, 0x05, 0x19, 0x05, 0x1A, 0x05, 0x1B, 0x05, 0x1C, 0x05, 0x1D, 0x05, 0x1E, 0x05, 0x1F, 0x05,
+	0x20, 0x05, 0x21, 0x05, 0x22, 0x05, 0x23, 0x05, 0x24, 0x05, 0x25, 0x05, 0x26, 0x05, 0x27, 0x05,
+	0x28, 0x05, 0x29, 0x05, 0x2A, 0x05, 0x2B, 0x05, 0x2C, 0x05, 0x2D, 0x05, 0x2E, 0x05, 0x2F, 0x05,
+	0x30, 0x05, 0x31, 0x05, 0x32, 0x05, 0x33, 0x05, 0x34, 0x05, 0x35, 0x05, 0x36, 0x05, 0x37, 0x05,
+	0x38, 0x05, 0x39, 0x05, 0x3A, 0x05, 0x3B, 0x05, 0x3C, 0x05, 0x3D, 0x05, 0x3E, 0x05, 0x3F, 0x05,
+	0x40, 0x05, 0x41, 0x05, 0x42, 0x05, 0x43, 0x05, 0x44, 0x05, 0x45, 0x05, 0x46, 0x05, 0x47, 0x05,
+	0x48, 0x05, 0x49, 0x05, 0x4A, 0x05, 0x4B, 0x05, 0x4C, 0x05, 0x4D, 0x05, 0x4E, 0x05, 0x4F, 0x05,
+	0x50, 0x05, 0x51, 0x05, 0x52, 0x05, 0x53, 0x05, 0x54, 0x05, 0x55, 0x05, 0x56, 0x05, 0x57, 0x05,
+	0x58, 0x05, 0x59, 0x05, 0x5A, 0x05, 0x5B, 0x05, 0x5C, 0x05, 0x5D, 0x05, 0x5E, 0x05, 0x5F, 0x05,
+	0x60, 0x05, 0x31, 0x05, 0x32, 0x05, 0x33, 0x05, 0x34, 0x05, 0x35, 0x05, 0x36, 0x05, 0x37, 0x05,
+	0x38, 0x05, 0x39, 0x05, 0x3A, 0x05, 0x3B, 0x05, 0x3C, 0x05, 0x3D, 0x05, 0x3E, 0x05, 0x3F, 0x05,
+	0x40, 0x05, 0x41, 0x05, 0x42, 0x05, 0x43, 0x05, 0x44, 0x05, 0x45, 0x05, 0x46, 0x05, 0x47, 0x05,
+	0x48, 0x05, 0x49, 0x05, 0x4A, 0x05, 0x4B, 0x05, 0x4C, 0x05, 0x4D, 0x05, 0x4E, 0x05, 0x4F, 0x05,
+	0x50, 0x05, 0x51, 0x05, 0x52, 0x05, 0x53, 0x05, 0x54, 0x05, 0x55, 0x05, 0x56, 0x05, 0xFF, 0xFF,
+	0xF6, 0x17, 0x63, 0x2C, 0x7E, 0x1D, 0x7F, 0x1D, 0x80, 0x1D, 0x81, 0x1D, 0x82, 0x1D, 0x83, 0x1D,
+	0x84, 0x1D, 0x85, 0x1D, 0x86, 0x1D, 0x87, 0x1D, 0x88, 0x1D, 0x89, 0x1D, 0x8A, 0x1D, 0x8B, 0x1D,
+	0x8C, 0x1D, 0x8D, 0x1D, 0x8E, 0x1D, 0x8F, 0x1D, 0x90, 0x1D, 0x91, 0x1D, 0x92, 0x1D, 0x93, 0x1D,
+	0x94, 0x1D, 0x95, 0x1D, 0x96, 0x1D, 0x97, 0x1D, 0x98, 0x1D, 0x99, 0x1D, 0x9A, 0x1D, 0x9B, 0x1D,
+	0x9C, 0x1D, 0x9D, 0x1D, 0x9E, 0x1D, 0x9F, 0x1D, 0xA0, 0x1D, 0xA1, 0x1D, 0xA2, 0x1D, 0xA3, 0x1D,
+	0xA4, 0x1D, 0xA5, 0x1D, 0xA6, 0x1D, 0xA7, 0x1D, 0xA8, 0x1D, 0xA9, 0x1D, 0xAA, 0x1D, 0xAB, 0x1D,
+	0xAC, 0x1D, 0xAD, 0x1D, 0xAE, 0x1D, 0xAF, 0x1D, 0xB0, 0x1D, 0xB1, 0x1D, 0xB2, 0x1D, 0xB3, 0x1D,
+	0xB4, 0x1D, 0xB5, 0x1D, 0xB6, 0x1D, 0xB7, 0x1D, 0xB8, 0x1D, 0xB9, 0x1D, 0xBA, 0x1D, 0xBB, 0x1D,
+	0xBC, 0x1D, 0xBD, 0x1D, 0xBE, 0x1D, 0xBF, 0x1D, 0xC0, 0x1D, 0xC1, 0x1D, 0xC2, 0x1D, 0xC3, 0x1D,
+	0xC4, 0x1D, 0xC5, 0x1D, 0xC6, 0x1D, 0xC7, 0x1D, 0xC8, 0x1D, 0xC9, 0x1D, 0xCA, 0x1D, 0xCB, 0x1D,
+	0xCC, 0x1D, 0xCD, 0x1D, 0xCE, 0x1D, 0xCF, 0x1D, 0xD0, 0x1D, 0xD1, 0x1D, 0xD2, 0x1D, 0xD3, 0x1D,
+	0xD4, 0x1D, 0xD5, 0x1D, 0xD6, 0x1D, 0xD7, 0x1D, 0xD8, 0x1D, 0xD9, 0x1D, 0xDA, 0x1D, 0xDB, 0x1D,
+	0xDC, 0x1D, 0xDD, 0x1D, 0xDE, 0x1D, 0xDF, 0x1D, 0xE0, 0x1D, 0xE1, 0x1D, 0xE2, 0x1D, 0xE3, 0x1D,
+	0xE4, 0x1D, 0xE5, 0x1D, 0xE6, 0x1D, 0xE7, 0x1D, 0xE8, 0x1D, 0xE9, 0x1D, 0xEA, 0x1D, 0xEB, 0x1D,
+	0xEC, 0x1D, 0xED, 0x1D, 0xEE, 0x1D, 0xEF, 0x1D, 0xF0, 0x1D, 0xF1, 0x1D, 0xF2, 0x1D, 0xF3, 0x1D,
+	0xF4, 0x1D, 0xF5, 0x1D, 0xF6, 0x1D, 0xF7, 0x1D, 0xF8, 0x1D, 0xF9, 0x1D, 0xFA, 0x1D, 0xFB, 0x1D,
+	0xFC, 0x1D, 0xFD, 0x1D, 0xFE, 0x1D, 0xFF, 0x1D, 0x00, 0x1E, 0x00, 0x1E, 0x02, 0x1E, 0x02, 0x1E,
+	0x04, 0x1E, 0x04, 0x1E, 0x06, 0x1E, 0x06, 0x1E, 0x08, 0x1E, 0x08, 0x1E, 0x0A, 0x1E, 0x0A, 0x1E,
+	0x0C, 0x1E, 0x0C, 0x1E, 0x0E, 0x1E, 0x0E, 0x1E, 0x10, 0x1E, 0x10, 0x1E, 0x12, 0x1E, 0x12, 0x1E,
+	0x14, 0x1E, 0x14, 0x1E, 0x16, 0x1E, 0x16, 0x1E, 0x18, 0x1E, 0x18, 0x1E, 0x1A, 0x1E, 0x1A, 0x1E,
+	0x1C, 0x1E, 0x1C, 0x1E, 0x1E, 0x1E, 0x1E, 0x1E, 0x20, 0x1E, 0x20, 0x1E, 0x22, 0x1E, 0x22, 0x1E,
+	0x24, 0x1E, 0x24, 0x1E, 0x26, 0x1E, 0x26, 0x1E, 0x28, 0x1E, 0x28, 0x1E, 0x2A, 0x1E, 0x2A, 0x1E,
+	0x2C, 0x1E, 0x2C, 0x1E, 0x2E, 0x1E, 0x2E, 0x1E, 0x30, 0x1E, 0x30, 0x1E, 0x32, 0x1E, 0x32, 0x1E,
+	0x34, 0x1E, 0x34, 0x1E, 0x36, 0x1E, 0x36, 0x1E, 0x38, 0x1E, 0x38, 0x1E, 0x3A, 0x1E, 0x3A, 0x1E,
+	0x3C, 0x1E, 0x3C, 0x1E, 0x3E, 0x1E, 0x3E, 0x1E, 0x40, 0x1E, 0x40, 0x1E, 0x42, 0x1E, 0x42, 0x1E,
+	0x44, 0x1E, 0x44, 0x1E, 0x46, 0x1E, 0x46, 0x1E, 0x48, 0x1E, 0x48, 0x1E, 0x4A, 0x1E, 0x4A, 0x1E,
+	0x4C, 0x1E, 0x4C, 0x1E, 0x4E, 0x1E, 0x4E, 0x1E, 0x50, 0x1E, 0x50, 0x1E, 0x52, 0x1E, 0x52, 0x1E,
+	0x54, 0x1E, 0x54, 0x1E, 0x56, 0x1E, 0x56, 0x1E, 0x58, 0x1E, 0x58, 0x1E, 0x5A, 0x1E, 0x5A, 0x1E,
+	0x5C, 0x1E, 0x5C, 0x1E, 0x5E, 0x1E, 0x5E, 0x1E, 0x60, 0x1E, 0x60, 0x1E, 0x62, 0x1E, 0x62, 0x1E,
+	0x64, 0x1E, 0x64, 0x1E, 0x66, 0x1E, 0x66, 0x1E, 0x68, 0x1E, 0x68, 0x1E, 0x6A, 0x1E, 0x6A, 0x1E,
+	0x6C, 0x1E, 0x6C, 0x1E, 0x6E, 0x1E, 0x6E, 0x1E, 0x70, 0x1E, 0x70, 0x1E, 0x72, 0x1E, 0x72, 0x1E,
+	0x74, 0x1E, 0x74, 0x1E, 0x76, 0x1E, 0x76, 0x1E, 0x78, 0x1E, 0x78, 0x1E, 0x7A, 0x1E, 0x7A, 0x1E,
+	0x7C, 0x1E, 0x7C, 0x1E, 0x7E, 0x1E, 0x7E, 0x1E, 0x80, 0x1E, 0x80, 0x1E, 0x82, 0x1E, 0x82, 0x1E,
+	0x84, 0x1E, 0x84, 0x1E, 0x86, 0x1E, 0x86, 0x1E, 0x88, 0x1E, 0x88, 0x1E, 0x8A, 0x1E, 0x8A, 0x1E,
+	0x8C, 0x1E, 0x8C, 0x1E, 0x8E, 0x1E, 0x8E, 0x1E, 0x90, 0x1E, 0x90, 0x1E, 0x92, 0x1E, 0x92, 0x1E,
+	0x94, 0x1E, 0x94, 0x1E, 0x96, 0x1E, 0x97, 0x1E, 0x98, 0x1E, 0x99, 0x1E, 0x9A, 0x1E, 0x9B, 0x1E,
+	0x9C, 0x1E, 0x9D, 0x1E, 0x9E, 0x1E, 0x9F, 0x1E, 0xA0, 0x1E, 0xA0, 0x1E, 0xA2, 0x1E, 0xA2, 0x1E,
+	0xA4, 0x1E, 0xA4, 0x1E, 0xA6, 0x1E, 0xA6, 0x1E, 0xA8, 0x1E, 0xA8, 0x1E, 0xAA, 0x1E, 0xAA, 0x1E,
+	0xAC, 0x1E, 0xAC, 0x1E, 0xAE, 0x1E, 0xAE, 0x1E, 0xB0, 0x1E, 0xB0, 0x1E, 0xB2, 0x1E, 0xB2, 0x1E,
+	0xB4, 0x1E, 0xB4, 0x1E, 0xB6, 0x1E, 0xB6, 0x1E, 0xB8, 0x1E, 0xB8, 0x1E, 0xBA, 0x1E, 0xBA, 0x1E,
+	0xBC, 0x1E, 0xBC, 0x1E, 0xBE, 0x1E, 0xBE, 0x1E, 0xC0, 0x1E, 0xC0, 0x1E, 0xC2, 0x1E, 0xC2, 0x1E,
+	0xC4, 0x1E, 0xC4, 0x1E, 0xC6, 0x1E, 0xC6, 0x1E, 0xC8, 0x1E, 0xC8, 0x1E, 0xCA, 0x1E, 0xCA, 0x1E,
+	0xCC, 0x1E, 0xCC, 0x1E, 0xCE, 0x1E, 0xCE, 0x1E, 0xD0, 0x1E, 0xD0, 0x1E, 0xD2, 0x1E, 0xD2, 0x1E,
+	0xD4, 0x1E, 0xD4, 0x1E, 0xD6, 0x1E, 0xD6, 0x1E, 0xD8, 0x1E, 0xD8, 0x1E, 0xDA, 0x1E, 0xDA, 0x1E,
+	0xDC, 0x1E, 0xDC, 0x1E, 0xDE, 0x1E, 0xDE, 0x1E, 0xE0, 0x1E, 0xE0, 0x1E, 0xE2, 0x1E, 0xE2, 0x1E,
+	0xE4, 0x1E, 0xE4, 0x1E, 0xE6, 0x1E, 0xE6, 0x1E, 0xE8, 0x1E, 0xE8, 0x1E, 0xEA, 0x1E, 0xEA, 0x1E,
+	0xEC, 0x1E, 0xEC, 0x1E, 0xEE, 0x1E, 0xEE, 0x1E, 0xF0, 0x1E, 0xF0, 0x1E, 0xF2, 0x1E, 0xF2, 0x1E,
+	0xF4, 0x1E, 0xF4, 0x1E, 0xF6, 0x1E, 0xF6, 0x1E, 0xF8, 0x1E, 0xF8, 0x1E, 0xFA, 0x1E, 0xFB, 0x1E,
+	0xFC, 0x1E, 0xFD, 0x1E, 0xFE, 0x1E, 0xFF, 0x1E, 0x08, 0x1F, 0x09, 0x1F, 0x0A, 0x1F, 0x0B, 0x1F,
+	0x0C, 0x1F, 0x0D, 0x1F, 0x0E, 0x1F, 0x0F, 0x1F, 0x08, 0x1F, 0x09, 0x1F, 0x0A, 0x1F, 0x0B, 0x1F,
+	0x0C, 0x1F, 0x0D, 0x1F, 0x0E, 0x1F, 0x0F, 0x1F, 0x18, 0x1F, 0x19, 0x1F, 0x1A, 0x1F, 0x1B, 0x1F,
+	0x1C, 0x1F, 0x1D, 0x1F, 0x16, 0x1F, 0x17, 0x1F, 0x18, 0x1F, 0x19, 0x1F, 0x1A, 0x1F, 0x1B, 0x1F,
+	0x1C, 0x1F, 0x1D, 0x1F, 0x1E, 0x1F, 0x1F, 0x1F, 0x28, 0x1F, 0x29, 0x1F, 0x2A, 0x1F, 0x2B, 0x1F,
+	0x2C, 0x1F, 0x2D, 0x1F, 0x2E, 0x1F, 0x2F, 0x1F, 0x28, 0x1F, 0x29, 0x1F, 0x2A, 0x1F, 0x2B, 0x1F,
+	0x2C, 0x1F, 0x2D, 0x1F, 0x2E, 0x1F, 0x2F, 0x1F, 0x38, 0x1F, 0x39, 0x1F, 0x3A, 0x1F, 0x3B, 0x1F,
+	0x3C, 0x1F, 0x3D, 0x1F, 0x3E, 0x1F, 0x3F, 0x1F, 0x38, 0x1F, 0x39, 0x1F, 0x3A, 0x1F, 0x3B, 0x1F,
+	0x3C, 0x1F, 0x3D, 0x1F, 0x3E, 0x1F, 0x3F, 0x1F, 0x48, 0x1F, 0x49, 0x1F, 0x4A, 0x1F, 0x4B, 0x1F,
+	0x4C, 0x1F, 0x4D, 0x1F, 0x46, 0x1F, 0x47, 0x1F, 0x48, 0x1F, 0x49, 0x1F, 0x4A, 0x1F, 0x4B, 0x1F,
+	0x4C, 0x1F, 0x4D, 0x1F, 0x4E, 0x1F, 0x4F, 0x1F, 0x50, 0x1F, 0x59, 0x1F, 0x52, 0x1F, 0x5B, 0x1F,
+	0x54, 0x1F, 0x5D, 0x1F, 0x56, 0x1F, 0x5F, 0x1F, 0x58, 0x1F, 0x59, 0x1F, 0x5A, 0x1F, 0x5B, 0x1F,
+	0x5C, 0x1F, 0x5D, 0x1F, 0x5E, 0x1F, 0x5F, 0x1F, 0x68, 0x1F, 0x69, 0x1F, 0x6A, 0x1F, 0x6B, 0x1F,
+	0x6C, 0x1F, 0x6D, 0x1F, 0x6E, 0x1F, 0x6F, 0x1F, 0x68, 0x1F, 0x69, 0x1F, 0x6A, 0x1F, 0x6B, 0x1F,
+	0x6C, 0x1F, 0x6D, 0x1F, 0x6E, 0x1F, 0x6F, 0x1F, 0xBA, 0x1F, 0xBB, 0x1F, 0xC8, 0x1F, 0xC9, 0x1F,
+	0xCA, 0x1F, 0xCB, 0x1F, 0xDA, 0x1F, 0xDB, 0x1F, 0xF8, 0x1F, 0xF9, 0x1F, 0xEA, 0x1F, 0xEB, 0x1F,
+	0xFA, 0x1F, 0xFB, 0x1F, 0x7E, 0x1F, 0x7F, 0x1F, 0x88, 0x1F, 0x89, 0x1F, 0x8A, 0x1F, 0x8B, 0x1F,
+	0x8C, 0x1F, 0x8D, 0x1F, 0x8E, 0x1F, 0x8F, 0x1F, 0x88, 0x1F, 0x89, 0x1F, 0x8A, 0x1F, 0x8B, 0x1F,
+	0x8C, 0x1F, 0x8D, 0x1F, 0x8E, 0x1F, 0x8F, 0x1F, 0x98, 0x1F, 0x99, 0x1F, 0x9A, 0x1F, 0x9B, 0x1F,
+	0x9C, 0x1F, 0x9D, 0x1F, 0x9E, 0x1F, 0x9F, 0x1F, 0x98, 0x1F, 0x99, 0x1F, 0x9A, 0x1F, 0x9B, 0x1F,
+	0x9C, 0x1F, 0x9D, 0x1F, 0x9E, 0x1F, 0x9F, 0x1F, 0xA8, 0x1F, 0xA9, 0x1F, 0xAA, 0x1F, 0xAB, 0x1F,
+	0xAC, 0x1F, 0xAD, 0x1F, 0xAE, 0x1F, 0xAF, 0x1F, 0xA8, 0x1F, 0xA9, 0x1F, 0xAA, 0x1F, 0xAB, 0x1F,
+	0xAC, 0x1F, 0xAD, 0x1F, 0xAE, 0x1F, 0xAF, 0x1F, 0xB8, 0x1F, 0xB9, 0x1F, 0xB2, 0x1F, 0xBC, 0x1F,
+	0xB4, 0x1F, 0xB5, 0x1F, 0xB6, 0x1F, 0xB7, 0x1F, 0xB8, 0x1F, 0xB9, 0x1F, 0xBA, 0x1F, 0xBB, 0x1F,
+	0xBC, 0x1F, 0xBD, 0x1F, 0xBE, 0x1F, 0xBF, 0x1F, 0xC0, 0x1F, 0xC1, 0x1F, 0xC2, 0x1F, 0xC3, 0x1F,
+	0xC4, 0x1F, 0xC5, 0x1F, 0xC6, 0x1F, 0xC7, 0x1F, 0xC8, 0x1F, 0xC9, 0x1F, 0xCA, 0x1F, 0xCB, 0x1F,
+	0xC3, 0x1F, 0xCD, 0x1F, 0xCE, 0x1F, 0xCF, 0x1F, 0xD8, 0x1F, 0xD9, 0x1F, 0xD2, 0x1F, 0xD3, 0x1F,
+	0xD4, 0x1F, 0xD5, 0x1F, 0xD6, 0x1F, 0xD7, 0x1F, 0xD8, 0x1F, 0xD9, 0x1F, 0xDA, 0x1F, 0xDB, 0x1F,
+	0xDC, 0x1F, 0xDD, 0x1F, 0xDE, 0x1F, 0xDF, 0x1F, 0xE8, 0x1F, 0xE9, 0x1F, 0xE2, 0x1F, 0xE3, 0x1F,
+	0xE4, 0x1F, 0xEC, 0x1F, 0xE6, 0x1F, 0xE7, 0x1F, 0xE8, 0x1F, 0xE9, 0x1F, 0xEA, 0x1F, 0xEB, 0x1F,
+	0xEC, 0x1F, 0xED, 0x1F, 0xEE, 0x1F, 0xEF, 0x1F, 0xF0, 0x1F, 0xF1, 0x1F, 0xF2, 0x1F, 0xF3, 0x1F,
+	0xF4, 0x1F, 0xF5, 0x1F, 0xF6, 0x1F, 0xF7, 0x1F, 0xF8, 0x1F, 0xF9, 0x1F, 0xFA, 0x1F, 0xFB, 0x1F,
+	0xF3, 0x1F, 0xFD, 0x1F, 0xFE, 0x1F, 0xFF, 0x1F, 0x00, 0x20, 0x01, 0x20, 0x02, 0x20, 0x03, 0x20,
+	0x04, 0x20, 0x05, 0x20, 0x06, 0x20, 0x07, 0x20, 0x08, 0x20, 0x09, 0x20, 0x0A, 0x20, 0x0B, 0x20,
+	0x0C, 0x20, 0x0D, 0x20, 0x0E, 0x20, 0x0F, 0x20, 0x10, 0x20, 0x11, 0x20, 0x12, 0x20, 0x13, 0x20,
+	0x14, 0x20, 0x15, 0x20, 0x16, 0x20, 0x17, 0x20, 0x18, 0x20, 0x19, 0x20, 0x1A, 0x20, 0x1B, 0x20,
+	0x1C, 0x20, 0x1D, 0x20, 0x1E, 0x20, 0x1F, 0x20, 0x20, 0x20, 0x21, 0x20, 0x22, 0x20, 0x23, 0x20,
+	0x24, 0x20, 0x25, 0x20, 0x26, 0x20, 0x27, 0x20, 0x28, 0x20, 0x29, 0x20, 0x2A, 0x20, 0x2B, 0x20,
+	0x2C, 0x20, 0x2D, 0x20, 0x2E, 0x20, 0x2F, 0x20, 0x30, 0x20, 0x31, 0x20, 0x32, 0x20, 0x33, 0x20,
+	0x34, 0x20, 0x35, 0x20, 0x36, 0x20, 0x37, 0x20, 0x38, 0x20, 0x39, 0x20, 0x3A, 0x20, 0x3B, 0x20,
+	0x3C, 0x20, 0x3D, 0x20, 0x3E, 0x20, 0x3F, 0x20, 0x40, 0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20,
+	0x44, 0x20, 0x45, 0x20, 0x46, 0x20, 0x47, 0x20, 0x48, 0x20, 0x49, 0x20, 0x4A, 0x20, 0x4B, 0x20,
+	0x4C, 0x20, 0x4D, 0x20, 0x4E, 0x20, 0x4F, 0x20, 0x50, 0x20, 0x51, 0x20, 0x52, 0x20, 0x53, 0x20,
+	0x54, 0x20, 0x55, 0x20, 0x56, 0x20, 0x57, 0x20, 0x58, 0x20, 0x59, 0x20, 0x5A, 0x20, 0x5B, 0x20,
+	0x5C, 0x20, 0x5D, 0x20, 0x5E, 0x20, 0x5F, 0x20, 0x60, 0x20, 0x61, 0x20, 0x62, 0x20, 0x63, 0x20,
+	0x64, 0x20, 0x65, 0x20, 0x66, 0x20, 0x67, 0x20, 0x68, 0x20, 0x69, 0x20, 0x6A, 0x20, 0x6B, 0x20,
+	0x6C, 0x20, 0x6D, 0x20, 0x6E, 0x20, 0x6F, 0x20, 0x70, 0x20, 0x71, 0x20, 0x72, 0x20, 0x73, 0x20,
+	0x74, 0x20, 0x75, 0x20, 0x76, 0x20, 0x77, 0x20, 0x78, 0x20, 0x79, 0x20, 0x7A, 0x20, 0x7B, 0x20,
+	0x7C, 0x20, 0x7D, 0x20, 0x7E, 0x20, 0x7F, 0x20, 0x80, 0x20, 0x81, 0x20, 0x82, 0x20, 0x83, 0x20,
+	0x84, 0x20, 0x85, 0x20, 0x86, 0x20, 0x87, 0x20, 0x88, 0x20, 0x89, 0x20, 0x8A, 0x20, 0x8B, 0x20,
+	0x8C, 0x20, 0x8D, 0x20, 0x8E, 0x20, 0x8F, 0x20, 0x90, 0x20, 0x91, 0x20, 0x92, 0x20, 0x93, 0x20,
+	0x94, 0x20, 0x95, 0x20, 0x96, 0x20, 0x97, 0x20, 0x98, 0x20, 0x99, 0x20, 0x9A, 0x20, 0x9B, 0x20,
+	0x9C, 0x20, 0x9D, 0x20, 0x9E, 0x20, 0x9F, 0x20, 0xA0, 0x20, 0xA1, 0x20, 0xA2, 0x20, 0xA3, 0x20,
+	0xA4, 0x20, 0xA5, 0x20, 0xA6, 0x20, 0xA7, 0x20, 0xA8, 0x20, 0xA9, 0x20, 0xAA, 0x20, 0xAB, 0x20,
+	0xAC, 0x20, 0xAD, 0x20, 0xAE, 0x20, 0xAF, 0x20, 0xB0, 0x20, 0xB1, 0x20, 0xB2, 0x20, 0xB3, 0x20,
+	0xB4, 0x20, 0xB5, 0x20, 0xB6, 0x20, 0xB7, 0x20, 0xB8, 0x20, 0xB9, 0x20, 0xBA, 0x20, 0xBB, 0x20,
+	0xBC, 0x20, 0xBD, 0x20, 0xBE, 0x20, 0xBF, 0x20, 0xC0, 0x20, 0xC1, 0x20, 0xC2, 0x20, 0xC3, 0x20,
+	0xC4, 0x20, 0xC5, 0x20, 0xC6, 0x20, 0xC7, 0x20, 0xC8, 0x20, 0xC9, 0x20, 0xCA, 0x20, 0xCB, 0x20,
+	0xCC, 0x20, 0xCD, 0x20, 0xCE, 0x20, 0xCF, 0x20, 0xD0, 0x20, 0xD1, 0x20, 0xD2, 0x20, 0xD3, 0x20,
+	0xD4, 0x20, 0xD5, 0x20, 0xD6, 0x20, 0xD7, 0x20, 0xD8, 0x20, 0xD9, 0x20, 0xDA, 0x20, 0xDB, 0x20,
+	0xDC, 0x20, 0xDD, 0x20, 0xDE, 0x20, 0xDF, 0x20, 0xE0, 0x20, 0xE1, 0x20, 0xE2, 0x20, 0xE3, 0x20,
+	0xE4, 0x20, 0xE5, 0x20, 0xE6, 0x20, 0xE7, 0x20, 0xE8, 0x20, 0xE9, 0x20, 0xEA, 0x20, 0xEB, 0x20,
+	0xEC, 0x20, 0xED, 0x20, 0xEE, 0x20, 0xEF, 0x20, 0xF0, 0x20, 0xF1, 0x20, 0xF2, 0x20, 0xF3, 0x20,
+	0xF4, 0x20, 0xF5, 0x20, 0xF6, 0x20, 0xF7, 0x20, 0xF8, 0x20, 0xF9, 0x20, 0xFA, 0x20, 0xFB, 0x20,
+	0xFC, 0x20, 0xFD, 0x20, 0xFE, 0x20, 0xFF, 0x20, 0x00, 0x21, 0x01, 0x21, 0x02, 0x21, 0x03, 0x21,
+	0x04, 0x21, 0x05, 0x21, 0x06, 0x21, 0x07, 0x21, 0x08, 0x21, 0x09, 0x21, 0x0A, 0x21, 0x0B, 0x21,
+	0x0C, 0x21, 0x0D, 0x21, 0x0E, 0x21, 0x0F, 0x21, 0x10, 0x21, 0x11, 0x21, 0x12, 0x21, 0x13, 0x21,
+	0x14, 0x21, 0x15, 0x21, 0x16, 0x21, 0x17, 0x21, 0x18, 0x21, 0x19, 0x21, 0x1A, 0x21, 0x1B, 0x21,
+	0x1C, 0x21, 0x1D, 0x21, 0x1E, 0x21, 0x1F, 0x21, 0x20, 0x21, 0x21, 0x21, 0x22, 0x21, 0x23, 0x21,
+	0x24, 0x21, 0x25, 0x21, 0x26, 0x21, 0x27, 0x21, 0x28, 0x21, 0x29, 0x21, 0x2A, 0x21, 0x2B, 0x21,
+	0x2C, 0x21, 0x2D, 0x21, 0x2E, 0x21, 0x2F, 0x21, 0x30, 0x21, 0x31, 0x21, 0x32, 0x21, 0x33, 0x21,
+	0x34, 0x21, 0x35, 0x21, 0x36, 0x21, 0x37, 0x21, 0x38, 0x21, 0x39, 0x21, 0x3A, 0x21, 0x3B, 0x21,
+	0x3C, 0x21, 0x3D, 0x21, 0x3E, 0x21, 0x3F, 0x21, 0x40, 0x21, 0x41, 0x21, 0x42, 0x21, 0x43, 0x21,
+	0x44, 0x21, 0x45, 0x21, 0x46, 0x21, 0x47, 0x21, 0x48, 0x21, 0x49, 0x21, 0x4A, 0x21, 0x4B, 0x21,
+	0x4C, 0x21, 0x4D, 0x21, 0x32, 0x21, 0x4F, 0x21, 0x50, 0x21, 0x51, 0x21, 0x52, 0x21, 0x53, 0x21,
+	0x54, 0x21, 0x55, 0x21, 0x56, 0x21, 0x57, 0x21, 0x58, 0x21, 0x59, 0x21, 0x5A, 0x21, 0x5B, 0x21,
+	0x5C, 0x21, 0x5D, 0x21, 0x5E, 0x21, 0x5F, 0x21, 0x60, 0x21, 0x61, 0x21, 0x62, 0x21, 0x63, 0x21,
+	0x64, 0x21, 0x65, 0x21, 0x66, 0x21, 0x67, 0x21, 0x68, 0x21, 0x69, 0x21, 0x6A, 0x21, 0x6B, 0x21,
+	0x6C, 0x21, 0x6D, 0x21, 0x6E, 0x21, 0x6F, 0x21, 0x60, 0x21, 0x61, 0x21, 0x62, 0x21, 0x63, 0x21,
+	0x64, 0x21, 0x65, 0x21, 0x66, 0x21, 0x67, 0x21, 0x68, 0x21, 0x69, 0x21, 0x6A, 0x21, 0x6B, 0x21,
+	0x6C, 0x21, 0x6D, 0x21, 0x6E, 0x21, 0x6F, 0x21, 0x80, 0x21, 0x81, 0x21, 0x82, 0x21, 0x83, 0x21,
+	0x83, 0x21, 0xFF, 0xFF, 0x4B, 0x03, 0xB6, 0x24, 0xB7, 0x24, 0xB8, 0x24, 0xB9, 0x24, 0xBA, 0x24,
+	0xBB, 0x24, 0xBC, 0x24, 0xBD, 0x24, 0xBE, 0x24, 0xBF, 0x24, 0xC0, 0x24, 0xC1, 0x24, 0xC2, 0x24,
+	0xC3, 0x24, 0xC4, 0x24, 0xC5, 0x24, 0xC6, 0x24, 0xC7, 0x24, 0xC8, 0x24, 0xC9, 0x24, 0xCA, 0x24,
+	0xCB, 0x24, 0xCC, 0x24, 0xCD, 0x24, 0xCE, 0x24, 0xCF, 0x24, 0xFF, 0xFF, 0x46, 0x07, 0x00, 0x2C,
+	0x01, 0x2C, 0x02, 0x2C, 0x03, 0x2C, 0x04, 0x2C, 0x05, 0x2C, 0x06, 0x2C, 0x07, 0x2C, 0x08, 0x2C,
+	0x09, 0x2C, 0x0A, 0x2C, 0x0B, 0x2C, 0x0C, 0x2C, 0x0D, 0x2C, 0x0E, 0x2C, 0x0F, 0x2C, 0x10, 0x2C,
+	0x11, 0x2C, 0x12, 0x2C, 0x13, 0x2C, 0x14, 0x2C, 0x15, 0x2C, 0x16, 0x2C, 0x17, 0x2C, 0x18, 0x2C,
+	0x19, 0x2C, 0x1A, 0x2C, 0x1B, 0x2C, 0x1C, 0x2C, 0x1D, 0x2C, 0x1E, 0x2C, 0x1F, 0x2C, 0x20, 0x2C,
+	0x21, 0x2C, 0x22, 0x2C, 0x23, 0x2C, 0x24, 0x2C, 0x25, 0x2C, 0x26, 0x2C, 0x27, 0x2C, 0x28, 0x2C,
+	0x29, 0x2C, 0x2A, 0x2C, 0x2B, 0x2C, 0x2C, 0x2C, 0x2D, 0x2C, 0x2E, 0x2C, 0x5F, 0x2C, 0x60, 0x2C,
+	0x60, 0x2C, 0x62, 0x2C, 0x63, 0x2C, 0x64, 0x2C, 0x65, 0x2C, 0x66, 0x2C, 0x67, 0x2C, 0x67, 0x2C,
+	0x69, 0x2C, 0x69, 0x2C, 0x6B, 0x2C, 0x6B, 0x2C, 0x6D, 0x2C, 0x6E, 0x2C, 0x6F, 0x2C, 0x70, 0x2C,
+	0x71, 0x2C, 0x72, 0x2C, 0x73, 0x2C, 0x74, 0x2C, 0x75, 0x2C, 0x75, 0x2C, 0x77, 0x2C, 0x78, 0x2C,
+	0x79, 0x2C, 0x7A, 0x2C, 0x7B, 0x2C, 0x7C, 0x2C, 0x7D, 0x2C, 0x7E, 0x2C, 0x7F, 0x2C, 0x80, 0x2C,
+	0x80, 0x2C, 0x82, 0x2C, 0x82, 0x2C, 0x84, 0x2C, 0x84, 0x2C, 0x86, 0x2C, 0x86, 0x2C, 0x88, 0x2C,
+	0x88, 0x2C, 0x8A, 0x2C, 0x8A, 0x2C, 0x8C, 0x2C, 0x8C, 0x2C, 0x8E, 0x2C, 0x8E, 0x2C, 0x90, 0x2C,
+	0x90, 0x2C, 0x92, 0x2C, 0x92, 0x2C, 0x94, 0x2C, 0x94, 0x2C, 0x96, 0x2C, 0x96, 0x2C, 0x98, 0x2C,
+	0x98, 0x2C, 0x9A, 0x2C, 0x9A, 0x2C, 0x9C, 0x2C, 0x9C, 0x2C, 0x9E, 0x2C, 0x9E, 0x2C, 0xA0, 0x2C,
+	0xA0, 0x2C, 0xA2, 0x2C, 0xA2, 0x2C, 0xA4, 0x2C, 0xA4, 0x2C, 0xA6, 0x2C, 0xA6, 0x2C, 0xA8, 0x2C,
+	0xA8, 0x2C, 0xAA, 0x2C, 0xAA, 0x2C, 0xAC, 0x2C, 0xAC, 0x2C, 0xAE, 0x2C, 0xAE, 0x2C, 0xB0, 0x2C,
+	0xB0, 0x2C, 0xB2, 0x2C, 0xB2, 0x2C, 0xB4, 0x2C, 0xB4, 0x2C, 0xB6, 0x2C, 0xB6, 0x2C, 0xB8, 0x2C,
+	0xB8, 0x2C, 0xBA, 0x2C, 0xBA, 0x2C, 0xBC, 0x2C, 0xBC, 0x2C, 0xBE, 0x2C, 0xBE, 0x2C, 0xC0, 0x2C,
+	0xC0, 0x2C, 0xC2, 0x2C, 0xC2, 0x2C, 0xC4, 0x2C, 0xC4, 0x2C, 0xC6, 0x2C, 0xC6, 0x2C, 0xC8, 0x2C,
+	0xC8, 0x2C, 0xCA, 0x2C, 0xCA, 0x2C, 0xCC, 0x2C, 0xCC, 0x2C, 0xCE, 0x2C, 0xCE, 0x2C, 0xD0, 0x2C,
+	0xD0, 0x2C, 0xD2, 0x2C, 0xD2, 0x2C, 0xD4, 0x2C, 0xD4, 0x2C, 0xD6, 0x2C, 0xD6, 0x2C, 0xD8, 0x2C,
+	0xD8, 0x2C, 0xDA, 0x2C, 0xDA, 0x2C, 0xDC, 0x2C, 0xDC, 0x2C, 0xDE, 0x2C, 0xDE, 0x2C, 0xE0, 0x2C,
+	0xE0, 0x2C, 0xE2, 0x2C, 0xE2, 0x2C, 0xE4, 0x2C, 0xE5, 0x2C, 0xE6, 0x2C, 0xE7, 0x2C, 0xE8, 0x2C,
+	0xE9, 0x2C, 0xEA, 0x2C, 0xEB, 0x2C, 0xEC, 0x2C, 0xED, 0x2C, 0xEE, 0x2C, 0xEF, 0x2C, 0xF0, 0x2C,
+	0xF1, 0x2C, 0xF2, 0x2C, 0xF3, 0x2C, 0xF4, 0x2C, 0xF5, 0x2C, 0xF6, 0x2C, 0xF7, 0x2C, 0xF8, 0x2C,
+	0xF9, 0x2C, 0xFA, 0x2C, 0xFB, 0x2C, 0xFC, 0x2C, 0xFD, 0x2C, 0xFE, 0x2C, 0xFF, 0x2C, 0xA0, 0x10,
+	0xA1, 0x10, 0xA2, 0x10, 0xA3, 0x10, 0xA4, 0x10, 0xA5, 0x10, 0xA6, 0x10, 0xA7, 0x10, 0xA8, 0x10,
+	0xA9, 0x10, 0xAA, 0x10, 0xAB, 0x10, 0xAC, 0x10, 0xAD, 0x10, 0xAE, 0x10, 0xAF, 0x10, 0xB0, 0x10,
+	0xB1, 0x10, 0xB2, 0x10, 0xB3, 0x10, 0xB4, 0x10, 0xB5, 0x10, 0xB6, 0x10, 0xB7, 0x10, 0xB8, 0x10,
+	0xB9, 0x10, 0xBA, 0x10, 0xBB, 0x10, 0xBC, 0x10, 0xBD, 0x10, 0xBE, 0x10, 0xBF, 0x10, 0xC0, 0x10,
+	0xC1, 0x10, 0xC2, 0x10, 0xC3, 0x10, 0xC4, 0x10, 0xC5, 0x10, 0xFF, 0xFF, 0x1B, 0xD2, 0x21, 0xFF,
+	0x22, 0xFF, 0x23, 0xFF, 0x24, 0xFF, 0x25, 0xFF, 0x26, 0xFF, 0x27, 0xFF, 0x28, 0xFF, 0x29, 0xFF,
+	0x2A, 0xFF, 0x2B, 0xFF, 0x2C, 0xFF, 0x2D, 0xFF, 0x2E, 0xFF, 0x2F, 0xFF, 0x30, 0xFF, 0x31, 0xFF,
+	0x32, 0xFF, 0x33, 0xFF, 0x34, 0xFF, 0x35, 0xFF, 0x36, 0xFF, 0x37, 0xFF, 0x38, 0xFF, 0x39, 0xFF,
+	0x3A, 0xFF, 0x5B, 0xFF, 0x5C, 0xFF, 0x5D, 0xFF, 0x5E, 0xFF, 0x5F, 0xFF, 0x60, 0xFF, 0x61, 0xFF,
+	0x62, 0xFF, 0x63, 0xFF, 0x64, 0xFF, 0x65, 0xFF, 0x66, 0xFF, 0x67, 0xFF, 0x68, 0xFF, 0x69, 0xFF,
+	0x6A, 0xFF, 0x6B, 0xFF, 0x6C, 0xFF, 0x6D, 0xFF, 0x6E, 0xFF, 0x6F, 0xFF, 0x70, 0xFF, 0x71, 0xFF,
+	0x72, 0xFF, 0x73, 0xFF, 0x74, 0xFF, 0x75, 0xFF, 0x76, 0xFF, 0x77, 0xFF, 0x78, 0xFF, 0x79, 0xFF,
+	0x7A, 0xFF, 0x7B, 0xFF, 0x7C, 0xFF, 0x7D, 0xFF, 0x7E, 0xFF, 0x7F, 0xFF, 0x80, 0xFF, 0x81, 0xFF,
+	0x82, 0xFF, 0x83, 0xFF, 0x84, 0xFF, 0x85, 0xFF, 0x86, 0xFF, 0x87, 0xFF, 0x88, 0xFF, 0x89, 0xFF,
+	0x8A, 0xFF, 0x8B, 0xFF, 0x8C, 0xFF, 0x8D, 0xFF, 0x8E, 0xFF, 0x8F, 0xFF, 0x90, 0xFF, 0x91, 0xFF,
+	0x92, 0xFF, 0x93, 0xFF, 0x94, 0xFF, 0x95, 0xFF, 0x96, 0xFF, 0x97, 0xFF, 0x98, 0xFF, 0x99, 0xFF,
+	0x9A, 0xFF, 0x9B, 0xFF, 0x9C, 0xFF, 0x9D, 0xFF, 0x9E, 0xFF, 0x9F, 0xFF, 0xA0, 0xFF, 0xA1, 0xFF,
+	0xA2, 0xFF, 0xA3, 0xFF, 0xA4, 0xFF, 0xA5, 0xFF, 0xA6, 0xFF, 0xA7, 0xFF, 0xA8, 0xFF, 0xA9, 0xFF,
+	0xAA, 0xFF, 0xAB, 0xFF, 0xAC, 0xFF, 0xAD, 0xFF, 0xAE, 0xFF, 0xAF, 0xFF, 0xB0, 0xFF, 0xB1, 0xFF,
+	0xB2, 0xFF, 0xB3, 0xFF, 0xB4, 0xFF, 0xB5, 0xFF, 0xB6, 0xFF, 0xB7, 0xFF, 0xB8, 0xFF, 0xB9, 0xFF,
+	0xBA, 0xFF, 0xBB, 0xFF, 0xBC, 0xFF, 0xBD, 0xFF, 0xBE, 0xFF, 0xBF, 0xFF, 0xC0, 0xFF, 0xC1, 0xFF,
+	0xC2, 0xFF, 0xC3, 0xFF, 0xC4, 0xFF, 0xC5, 0xFF, 0xC6, 0xFF, 0xC7, 0xFF, 0xC8, 0xFF, 0xC9, 0xFF,
+	0xCA, 0xFF, 0xCB, 0xFF, 0xCC, 0xFF, 0xCD, 0xFF, 0xCE, 0xFF, 0xCF, 0xFF, 0xD0, 0xFF, 0xD1, 0xFF,
+	0xD2, 0xFF, 0xD3, 0xFF, 0xD4, 0xFF, 0xD5, 0xFF, 0xD6, 0xFF, 0xD7, 0xFF, 0xD8, 0xFF, 0xD9, 0xFF,
+	0xDA, 0xFF, 0xDB, 0xFF, 0xDC, 0xFF, 0xDD, 0xFF, 0xDE, 0xFF, 0xDF, 0xFF, 0xE0, 0xFF, 0xE1, 0xFF,
+	0xE2, 0xFF, 0xE3, 0xFF, 0xE4, 0xFF, 0xE5, 0xFF, 0xE6, 0xFF, 0xE7, 0xFF, 0xE8, 0xFF, 0xE9, 0xFF,
+	0xEA, 0xFF, 0xEB, 0xFF, 0xEC, 0xFF, 0xED, 0xFF, 0xEE, 0xFF, 0xEF, 0xFF, 0xF0, 0xFF, 0xF1, 0xFF,
+	0xF2, 0xFF, 0xF3, 0xFF, 0xF4, 0xFF, 0xF5, 0xFF, 0xF6, 0xFF, 0xF7, 0xFF, 0xF8, 0xFF, 0xF9, 0xFF,
+	0xFA, 0xFF, 0xFB, 0xFF, 0xFC, 0xFF, 0xFD, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF
+};
diff --git b/fs/exfat/exfat_version.h b/fs/exfat/exfat_version.h
new file mode 100644
index 0000000..a93fa46
--- /dev/null
+++ b/fs/exfat/exfat_version.h
@@ -0,0 +1,19 @@
+/************************************************************************/
+/*                                                                      */
+/*  PROJECT : exFAT & FAT12/16/32 File System                           */
+/*  FILE    : exfat_version.h                                           */
+/*  PURPOSE : exFAT File Manager                                        */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  NOTES                                                               */
+/*                                                                      */
+/*----------------------------------------------------------------------*/
+/*  REVISION HISTORY                                                    */
+/*                                                                      */
+/*  - 2012.02.10 : Release Version 1.1.0                                */
+/*  - 2012.04.02 : P1 : Change Module License to Samsung Proprietary    */
+/*  - 2012.06.07 : P2 : Fixed incorrect filename problem                */
+/*                                                                      */
+/************************************************************************/
+
+#define EXFAT_VERSION  "1.2.9"
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index ccb401e..cb0528b 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1240,7 +1240,7 @@ static int f2fs_write_data_page(struct page *page,
 		.sbi = sbi,
 		.type = DATA,
 		.op = REQ_OP_WRITE,
-		.op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
+		.op_flags = wbc_to_write_flags(wbc),
 		.page = page,
 		.encrypted_page = NULL,
 	};
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index f75d197..c1713da 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1561,7 +1561,7 @@ static int f2fs_write_node_page(struct page *page,
 		.sbi = sbi,
 		.type = NODE,
 		.op = REQ_OP_WRITE,
-		.op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0,
+		.op_flags = wbc_to_write_flags(wbc),
 		.page = page,
 		.encrypted_page = NULL,
 	};
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 350a2c8..04fd33c 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -29,7 +29,7 @@
 
 #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
 
-static int setfl(int fd, struct file * filp, unsigned long arg)
+int setfl(int fd, struct file * filp, unsigned long arg)
 {
 	struct inode * inode = file_inode(filp);
 	int error = 0;
@@ -60,6 +60,8 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 
 	if (filp->f_op->check_flags)
 		error = filp->f_op->check_flags(arg);
+	if (!error && filp->f_op->setfl)
+		error = filp->f_op->setfl(filp, arg);
 	if (error)
 		return error;
 
@@ -80,6 +82,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
  out:
 	return error;
 }
+EXPORT_SYMBOL_GPL(setfl);
 
 static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
                      int force)
diff --git a/fs/file_table.c b/fs/file_table.c
index ad17e05..ae9f267 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -147,6 +147,7 @@ over:
 	}
 	return ERR_PTR(-ENFILE);
 }
+EXPORT_SYMBOL_GPL(get_empty_filp);
 
 /**
  * alloc_file - allocate and initialize a 'struct file'
@@ -258,6 +259,7 @@ void flush_delayed_fput(void)
 {
 	delayed_fput(NULL);
 }
+EXPORT_SYMBOL_GPL(flush_delayed_fput);
 
 static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
 
@@ -300,6 +302,7 @@ void __fput_sync(struct file *file)
 }
 
 EXPORT_SYMBOL(fput);
+EXPORT_SYMBOL_GPL(__fput_sync);
 
 void put_filp(struct file *file)
 {
@@ -308,6 +311,7 @@ void put_filp(struct file *file)
 		file_free(file);
 	}
 }
+EXPORT_SYMBOL_GPL(put_filp);
 
 void __init files_init(void)
 { 
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 950b8be..7991c62 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
 {
 	struct buffer_head *bh, *head;
 	int nr_underway = 0;
-	int write_flags = REQ_META | REQ_PRIO |
-		(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0);
+	int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc);
 
 	BUG_ON(!PageLocked(page));
 	BUG_ON(!page_has_buffers(page));
diff --git a/fs/inode.c b/fs/inode.c
index 7e3ef3a..da063d3 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1593,7 +1593,7 @@ EXPORT_SYMBOL(generic_update_time);
  * This does the actual work of updating an inodes time or version.  Must have
  * had called mnt_want_write() before calling this.
  */
-static int update_time(struct inode *inode, struct timespec *time, int flags)
+int update_time(struct inode *inode, struct timespec *time, int flags)
 {
 	int (*update_time)(struct inode *, struct timespec *, int);
 
@@ -1602,6 +1602,7 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
 
 	return update_time(inode, time, flags);
 }
+EXPORT_SYMBOL_GPL(update_time);
 
 /**
  *	touch_atime	-	update the access time
diff --git a/fs/mpage.c b/fs/mpage.c
index d2413af..d6f1afe 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc,
 	struct buffer_head map_bh;
 	loff_t i_size = i_size_read(inode);
 	int ret = 0;
-	int op_flags = (wbc->sync_mode == WB_SYNC_ALL ?  WRITE_SYNC : 0);
+	int op_flags = wbc_to_write_flags(wbc);
 
 	if (page_has_buffers(page)) {
 		struct buffer_head *head = page_buffers(page);
diff --git a/fs/namespace.c b/fs/namespace.c
index 7bb2cda..88ec098 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -463,6 +463,7 @@ void __mnt_drop_write(struct vfsmount *mnt)
 	mnt_dec_writers(real_mount(mnt));
 	preempt_enable();
 }
+EXPORT_SYMBOL_GPL(__mnt_drop_write);
 
 /**
  * mnt_drop_write - give up write access to a mount
@@ -1812,6 +1813,7 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(iterate_mounts);
 
 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
 {
diff --git a/fs/notify/group.c b/fs/notify/group.c
index b47f7cf..618bc9e 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -22,6 +22,7 @@
 #include <linux/srcu.h>
 #include <linux/rculist.h>
 #include <linux/wait.h>
+#include <linux/module.h>
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
@@ -100,6 +101,7 @@ void fsnotify_get_group(struct fsnotify_group *group)
 {
 	atomic_inc(&group->refcnt);
 }
+EXPORT_SYMBOL_GPL(fsnotify_get_group);
 
 /*
  * Drop a reference to a group.  Free it if it's through.
@@ -109,6 +111,7 @@ void fsnotify_put_group(struct fsnotify_group *group)
 	if (atomic_dec_and_test(&group->refcnt))
 		fsnotify_final_destroy_group(group);
 }
+EXPORT_SYMBOL_GPL(fsnotify_put_group);
 
 /*
  * Create a new fsnotify_group and hold a reference for the group returned.
@@ -137,6 +140,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 
 	return group;
 }
+EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
 
 int fsnotify_fasync(int fd, struct file *file, int on)
 {
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index d3fea0b..5fc06ad 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -113,6 +113,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
 		mark->free_mark(mark);
 	}
 }
+EXPORT_SYMBOL_GPL(fsnotify_put_mark);
 
 /* Calculate mask of events for a list of marks */
 u32 fsnotify_recalc_mask(struct hlist_head *head)
@@ -230,6 +231,7 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
 	mutex_unlock(&group->mark_mutex);
 	fsnotify_free_mark(mark);
 }
+EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);
 
 void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
 {
@@ -415,6 +417,7 @@ err:
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(fsnotify_add_mark);
 
 int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
 		      struct inode *inode, struct vfsmount *mnt, int allow_dups)
@@ -533,6 +536,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
 	atomic_set(&mark->refcnt, 1);
 	mark->free_mark = free_mark;
 }
+EXPORT_SYMBOL_GPL(fsnotify_init_mark);
 
 /*
  * Destroy all marks in destroy_list, waits for SRCU period to finish before
diff --git a/fs/open.c b/fs/open.c
index 4fd6e25..bce19e3 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -34,6 +34,9 @@
 
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/fs.h>
+
 int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	struct file *filp)
 {
@@ -64,6 +67,7 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	inode_unlock(dentry->d_inode);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(do_truncate);
 
 long vfs_truncate(const struct path *path, loff_t length)
 {
@@ -678,6 +682,7 @@ int open_check_o_direct(struct file *f)
 	}
 	return 0;
 }
+EXPORT_SYMBOL_GPL(open_check_o_direct);
 
 static int do_dentry_open(struct file *f,
 			  struct inode *inode,
@@ -1040,6 +1045,7 @@ long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
 		} else {
 			fsnotify_open(f);
 			fd_install(fd, f);
+			trace_do_sys_open(tmp->name, flags, mode);
 		}
 	}
 	putname(tmp);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ac0df4d..5776a2e 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -505,7 +505,7 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns,
 		seq_printf(m, "0 0 0\n");
 	else
 		seq_printf(m, "%llu %llu %lu\n",
-		   (unsigned long long)task->se.sum_exec_runtime,
+		   (unsigned long long)tsk_seruntime(task),
 		   (unsigned long long)task->sched_info.run_delay,
 		   task->sched_info.pcount);
 
@@ -1938,7 +1938,7 @@ static int map_files_get_link(struct dentry *dentry, struct path *path)
 	down_read(&mm->mmap_sem);
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (vma && vma->vm_file) {
-		*path = vma->vm_file->f_path;
+		*path = vma_pr_or_file(vma)->f_path;
 		path_get(path);
 		rc = 0;
 	}
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index b9a8c81..9765269 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -89,6 +89,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		"SUnreclaim:     %8lu kB\n"
 		"KernelStack:    %8lu kB\n"
 		"PageTables:     %8lu kB\n"
+#ifdef CONFIG_UKSM
+		"KsmZeroPages:   %8lu kB\n"
+#endif
 #ifdef CONFIG_QUICKLIST
 		"Quicklists:     %8lu kB\n"
 #endif
@@ -149,6 +152,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(global_page_state(NR_SLAB_UNRECLAIMABLE)),
 		global_page_state(NR_KERNEL_STACK_KB),
 		K(global_page_state(NR_PAGETABLE)),
+#ifdef CONFIG_UKSM
+		K(global_page_state(NR_UKSM_ZERO_PAGES)),
+#endif
 #ifdef CONFIG_QUICKLIST
 		K(quicklist_total_size()),
 #endif
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index f8595e8..cb8eda0 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -45,7 +45,10 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 	file = region->vm_file;
 
 	if (file) {
-		struct inode *inode = file_inode(region->vm_file);
+		struct inode *inode;
+
+		file = vmr_pr_or_file(region);
+		inode = file_inode(file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 	}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index f6fa99e..2750949 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -298,7 +298,10 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	const char *name = NULL;
 
 	if (file) {
-		struct inode *inode = file_inode(vma->vm_file);
+		struct inode *inode;
+
+		file = vma_pr_or_file(vma);
+		inode = file_inode(file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
@@ -1634,7 +1637,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md = &numa_priv->md;
-	struct file *file = vma->vm_file;
+	struct file *file = vma_pr_or_file(vma);
 	struct mm_struct *mm = vma->vm_mm;
 	struct mm_walk walk = {
 		.hugetlb_entry = gather_hugetlb_stats,
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index faacb0c..17b43be 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -163,7 +163,10 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 	file = vma->vm_file;
 
 	if (file) {
-		struct inode *inode = file_inode(vma->vm_file);
+		struct inode *inode;
+
+		file = vma_pr_or_file(vma);
+		inode = file_inode(file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 		pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
diff --git a/fs/read_write.c b/fs/read_write.c
index 66215a7..c643215 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -515,6 +515,30 @@ ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
 }
 EXPORT_SYMBOL(__vfs_write);
 
+vfs_readf_t vfs_readf(struct file *file)
+{
+	const struct file_operations *fop = file->f_op;
+
+	if (fop->read)
+		return fop->read;
+	if (fop->read_iter)
+		return new_sync_read;
+	return ERR_PTR(-ENOSYS);
+}
+EXPORT_SYMBOL_GPL(vfs_readf);
+
+vfs_writef_t vfs_writef(struct file *file)
+{
+	const struct file_operations *fop = file->f_op;
+
+	if (fop->write)
+		return fop->write;
+	if (fop->write_iter)
+		return new_sync_write;
+	return ERR_PTR(-ENOSYS);
+}
+EXPORT_SYMBOL_GPL(vfs_writef);
+
 ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
 {
 	mm_segment_t old_fs;
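
The two lookups added above let a stacking filesystem fetch a branch file's read/write entry point once and call it repeatedly. The sketch below is a hypothetical caller, not part of the patch: the helper name example_copy_chunk, the user-space buffer and the error handling are illustrative assumptions; only vfs_readf()/vfs_writef() and their fallbacks come from the code above.

/* hypothetical copy helper built on the exported lookups above */
static ssize_t example_copy_chunk(struct file *src, struct file *dst,
				  char __user *buf, size_t len,
				  loff_t *src_pos, loff_t *dst_pos)
{
	vfs_readf_t rd = vfs_readf(src);
	vfs_writef_t wr = vfs_writef(dst);
	ssize_t n;

	if (IS_ERR((void *)rd) || IS_ERR((void *)wr))
		return -ENOSYS;

	n = rd(src, buf, len, src_pos);		/* ->read or new_sync_read */
	if (n > 0)
		n = wr(dst, buf, n, dst_pos);	/* ->write or new_sync_write */
	return n;
}
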
diff --git a/fs/splice.c b/fs/splice.c
index dd9bf7e..0606690 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1111,8 +1111,8 @@ EXPORT_SYMBOL(generic_splice_sendpage);
 /*
  * Attempt to initiate a splice from pipe to file.
  */
-static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
-			   loff_t *ppos, size_t len, unsigned int flags)
+long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+		    loff_t *ppos, size_t len, unsigned int flags)
 {
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
 				loff_t *, size_t, unsigned int);
@@ -1124,13 +1124,14 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 
 	return splice_write(pipe, out, ppos, len, flags);
 }
+EXPORT_SYMBOL_GPL(do_splice_from);
 
 /*
  * Attempt to initiate a splice from a file to a pipe.
  */
-static long do_splice_to(struct file *in, loff_t *ppos,
-			 struct pipe_inode_info *pipe, size_t len,
-			 unsigned int flags)
+long do_splice_to(struct file *in, loff_t *ppos,
+		  struct pipe_inode_info *pipe, size_t len,
+		  unsigned int flags)
 {
 	ssize_t (*splice_read)(struct file *, loff_t *,
 			       struct pipe_inode_info *, size_t, unsigned int);
@@ -1153,6 +1154,7 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 
 	return splice_read(in, ppos, pipe, len, flags);
 }
+EXPORT_SYMBOL_GPL(do_splice_to);
 
 /**
  * splice_direct_to_actor - splices data directly between two non-pipes
diff --git a/fs/xattr.c b/fs/xattr.c
index c243905..b60dc60 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -214,6 +214,7 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
 	*xattr_value = value;
 	return error;
 }
+EXPORT_SYMBOL_GPL(vfs_getxattr_alloc);
 
 ssize_t
 vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 7575cfc..a68645a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -447,8 +447,8 @@ xfs_submit_ioend(
 
 	ioend->io_bio->bi_private = ioend;
 	ioend->io_bio->bi_end_io = xfs_end_bio;
-	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
-			 (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc));
+
 	/*
 	 * If we are failing the IO now, just mark the ioend with an
 	 * error and finish it. This will run IO completion immediately
@@ -519,8 +519,7 @@ xfs_chain_bio(
 
 	bio_chain(ioend->io_bio, new);
 	bio_get(ioend->io_bio);		/* for xfs_destroy_ioend */
-	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
-			  (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
+	bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, wbc_to_write_flags(wbc));
 	submit_bio(ioend->io_bio);
 	ioend->io_bio = new;
 }
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index d4458b6..172ceb9 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -601,12 +601,25 @@ extern void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn,
 extern void untrack_pfn_moved(struct vm_area_struct *vma);
 #endif
 
+#ifdef CONFIG_UKSM
+static inline int is_uksm_zero_pfn(unsigned long pfn)
+{
+	extern unsigned long uksm_zero_pfn;
+	return pfn == uksm_zero_pfn;
+}
+#else
+static inline int is_uksm_zero_pfn(unsigned long pfn)
+{
+	return 0;
+}
+#endif
+
 #ifdef __HAVE_COLOR_ZERO_PAGE
 static inline int is_zero_pfn(unsigned long pfn)
 {
 	extern unsigned long zero_pfn;
 	unsigned long offset_from_zero_pfn = pfn - zero_pfn;
-	return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT);
+	return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn);
 }
 
 #define my_zero_pfn(addr)	page_to_pfn(ZERO_PAGE(addr))
@@ -615,7 +628,7 @@ static inline int is_zero_pfn(unsigned long pfn)
 static inline int is_zero_pfn(unsigned long pfn)
 {
 	extern unsigned long zero_pfn;
-	return pfn == zero_pfn;
+	return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn));
 }
 
 static inline unsigned long my_zero_pfn(unsigned long addr)
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index c357f27..dc5f76d 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -116,6 +116,8 @@ struct bdi_writeback {
 	struct list_head work_list;
 	struct delayed_work dwork;	/* work item used for writeback */
 
+	unsigned long dirty_sleep;	/* last wait */
+
 	struct list_head bdi_node;	/* anchored at bdi->wb_list */
 
 #ifdef CONFIG_CGROUP_WRITEBACK
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 436f43f..95fbfa1 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -155,6 +155,7 @@ enum rq_flag_bits {
 	__REQ_INTEGRITY,	/* I/O includes block integrity payload */
 	__REQ_FUA,		/* forced unit access */
 	__REQ_PREFLUSH,		/* request for cache flush */
+	__REQ_BG,		/* background activity */
 
 	/* bio only flags */
 	__REQ_RAHEAD,		/* read ahead, can fail anytime */
@@ -198,7 +199,7 @@ enum rq_flag_bits {
 	(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
 #define REQ_COMMON_MASK \
 	(REQ_FAILFAST_MASK | REQ_SYNC | REQ_META | REQ_PRIO | REQ_NOIDLE | \
-	 REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE)
+	 REQ_PREFLUSH | REQ_FUA | REQ_INTEGRITY | REQ_NOMERGE | REQ_BG)
 #define REQ_CLONE_MASK		REQ_COMMON_MASK
 
 /* This mask is used for both bio and request merge checking */
@@ -223,6 +224,7 @@ enum rq_flag_bits {
 #define REQ_COPY_USER		(1ULL << __REQ_COPY_USER)
 #define REQ_PREFLUSH		(1ULL << __REQ_PREFLUSH)
 #define REQ_FLUSH_SEQ		(1ULL << __REQ_FLUSH_SEQ)
+#define REQ_BG			(1ULL << __REQ_BG)
 #define REQ_IO_STAT		(1ULL << __REQ_IO_STAT)
 #define REQ_MIXED_MERGE		(1ULL << __REQ_MIXED_MERGE)
 #define REQ_PM			(1ULL << __REQ_PM)
@@ -264,4 +266,16 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
 	return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
 }
 
+#define BLK_RQ_STAT_BATCH	64
+
+struct blk_rq_stat {
+	s64 mean;
+	u64 min;
+	u64 max;
+	s32 nr_samples;
+	s32 nr_batch;
+	u64 batch;
+	s64 time;
+};
+
 #endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e79055c..17d4bc5 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -24,6 +24,7 @@
 #include <linux/rcupdate.h>
 #include <linux/percpu-refcount.h>
 #include <linux/scatterlist.h>
+#include <linux/wbt.h>
 
 struct module;
 struct scsi_ioctl_command;
@@ -37,15 +38,20 @@ struct bsg_job;
 struct blkcg_gq;
 struct blk_flush_queue;
 struct pr_ops;
+struct rq_wb;
 
 #define BLKDEV_MIN_RQ	4
+#ifdef CONFIG_PCK_INTERACTIVE
+#define BLKDEV_MAX_RQ	512
+#else
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
+#endif
 
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
  */
-#define BLKCG_MAX_POLS		2
+#define BLKCG_MAX_POLS		3
 
 typedef void (rq_end_io_fn)(struct request *, int);
 
@@ -151,6 +157,7 @@ struct request {
 	struct gendisk *rq_disk;
 	struct hd_struct *part;
 	unsigned long start_time;
+	struct wb_issue_stat wb_stat;
 #ifdef CONFIG_BLK_CGROUP
 	struct request_list *rl;		/* rl this rq is alloced from */
 	unsigned long long start_time_ns;
@@ -302,6 +309,8 @@ struct request_queue {
 	int			nr_rqs[2];	/* # allocated [a]sync rqs */
 	int			nr_rqs_elvpriv;	/* # allocated rqs w/ elvpriv */
 
+	struct rq_wb		*rq_wb;
+
 	/*
 	 * If blkcg is not used, @q->root_rl serves all requests.  If blkcg
 	 * is used, root blkg allocates from @q->root_rl and all other
@@ -327,6 +336,8 @@ struct request_queue {
 	struct blk_mq_ctx __percpu	*queue_ctx;
 	unsigned int		nr_queues;
 
+	unsigned int		queue_depth;
+
 	/* hw dispatch queues */
 	struct blk_mq_hw_ctx	**queue_hw_ctx;
 	unsigned int		nr_hw_queues;
@@ -412,6 +423,9 @@ struct request_queue {
 
 	unsigned int		nr_sorted;
 	unsigned int		in_flight[2];
+
+	struct blk_rq_stat	rq_stats[2];
+
 	/*
 	 * Number of active block driver functions for which blk_drain_queue()
 	 * must wait. Must be incremented around functions that unlock the
@@ -683,6 +697,14 @@ static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
 	return false;
 }
 
+static inline unsigned int blk_queue_depth(struct request_queue *q)
+{
+	if (q->queue_depth)
+		return q->queue_depth;
+
+	return q->nr_requests;
+}
+
 /*
  * q->prep_rq_fn return values
  */
@@ -999,6 +1021,7 @@ extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
 extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
 extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
+extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
 extern void blk_set_default_limits(struct queue_limits *lim);
 extern void blk_set_stacking_limits(struct queue_limits *lim);
 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
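
blk_queue_depth() prefers an explicitly advertised device depth and only falls back to the request-list size. A driver-side sketch, where the driver name and the depth of 64 are purely hypothetical:

/* sketch: let writeback throttling see the hardware's real queue depth */
static void exampledrv_configure_queue(struct request_queue *q)
{
	blk_set_queue_depth(q, 64);	/* hypothetical NCQ-style depth */
	/*
	 * Consumers (e.g. the wbt code added elsewhere in this series) can
	 * then call blk_queue_depth(q), which returns q->queue_depth (64
	 * here) instead of falling back to q->nr_requests.
	 */
}
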
diff --git a/include/linux/file.h b/include/linux/file.h
index 7444f5f..bdac0be 100644
--- a/include/linux/file.h
+++ b/include/linux/file.h
@@ -19,6 +19,7 @@ struct dentry;
 struct path;
 extern struct file *alloc_file(struct path *, fmode_t mode,
 	const struct file_operations *fop);
+extern struct file *get_empty_filp(void);
 
 static inline void fput_light(struct file *file, int fput_needed)
 {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 7c39136..6f180af 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -189,6 +189,8 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
  * WRITE_FLUSH_FUA	Combination of WRITE_FLUSH and FUA. The IO is preceded
  *			by a cache flush and data is guaranteed to be on
  *			non-volatile media on completion.
+ * WRITE_BG		Background write. This is for background activity like
+ *			the periodic flush and background threshold writeback
  *
  */
 #define RW_MASK			REQ_OP_WRITE
@@ -202,6 +204,7 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 #define WRITE_FLUSH		(REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH)
 #define WRITE_FUA		(REQ_SYNC | REQ_NOIDLE | REQ_FUA)
 #define WRITE_FLUSH_FUA		(REQ_SYNC | REQ_NOIDLE | REQ_PREFLUSH | REQ_FUA)
+#define WRITE_BG		(REQ_NOIDLE | REQ_BG)
 
 /*
  * Attribute flags.  These should be or-ed together to figure out what
@@ -1276,6 +1279,7 @@ extern void fasync_free(struct fasync_struct *);
 /* can be called from interrupts */
 extern void kill_fasync(struct fasync_struct **, int, int);
 
+extern int setfl(int fd, struct file * filp, unsigned long arg);
 extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force);
 extern void f_setown(struct file *filp, unsigned long arg, int force);
 extern void f_delown(struct file *filp);
@@ -1700,6 +1704,7 @@ struct file_operations {
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
 	int (*check_flags)(int);
+	int (*setfl)(struct file *, unsigned long);
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
@@ -1760,6 +1765,12 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
 			      struct iovec *fast_pointer,
 			      struct iovec **ret_pointer);
 
+typedef ssize_t (*vfs_readf_t)(struct file *, char __user *, size_t, loff_t *);
+typedef ssize_t (*vfs_writef_t)(struct file *, const char __user *, size_t,
+				loff_t *);
+vfs_readf_t vfs_readf(struct file *file);
+vfs_writef_t vfs_writef(struct file *file);
+
 extern ssize_t __vfs_read(struct file *, char __user *, size_t, loff_t *);
 extern ssize_t __vfs_write(struct file *, const char __user *, size_t, loff_t *);
 extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
@@ -2124,6 +2135,7 @@ extern int current_umask(void);
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
 extern int generic_update_time(struct inode *, struct timespec *, int);
+extern int update_time(struct inode *, struct timespec *, int);
 
 /* /sys/fs */
 extern struct kobject *fs_kobj;
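
With setfl() exported and the new ->setfl file operation called from it, a stacked or special-purpose filesystem can veto or mirror F_SETFL changes on its own files. The following is a hedged sketch; the filesystem name "examplefs" and its O_DIRECT policy are made up, only the hook signature comes from the header above.

/* hypothetical filesystem refusing O_DIRECT via the new ->setfl hook */
static int examplefs_setfl(struct file *file, unsigned long arg)
{
	if (arg & O_DIRECT)
		return -EINVAL;	/* this fs cannot honour O_DIRECT */
	return 0;		/* everything else falls through to setfl() */
}

static const struct file_operations examplefs_fops = {
	/* ... regular operations ... */
	.setfl	= examplefs_setfl,
};
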
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index f8834f8..d94dc24 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -157,8 +157,6 @@ extern struct task_group root_task_group;
 # define INIT_VTIME(tsk)
 #endif
 
-#define INIT_TASK_COMM "swapper"
-
 #ifdef CONFIG_RT_MUTEXES
 # define INIT_RT_MUTEXES(tsk)						\
 	.pi_waiters = RB_ROOT,						\
@@ -187,6 +185,77 @@ extern struct task_group root_task_group;
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
  */
+#ifdef CONFIG_SCHED_MUQSS
+#define INIT_TASK_COMM "MuQSS"
+#define INIT_TASK(tsk)	\
+{									\
+	.state		= 0,						\
+	.stack		= &init_thread_info,				\
+	.usage		= ATOMIC_INIT(2),				\
+	.flags		= PF_KTHREAD,					\
+	.prio		= NORMAL_PRIO,					\
+	.static_prio	= MAX_PRIO-20,					\
+	.normal_prio	= NORMAL_PRIO,					\
+	.deadline	= 0,						\
+	.policy		= SCHED_NORMAL,					\
+	.cpus_allowed	= CPU_MASK_ALL,					\
+	.mm		= NULL,						\
+	.active_mm	= &init_mm,					\
+	.restart_block = {						\
+		.fn = do_no_restart_syscall,				\
+	},								\
+	.time_slice	= 1000000,					\
+	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
+	INIT_PUSHABLE_TASKS(tsk)					\
+	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),			\
+	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),		\
+	.real_parent	= &tsk,						\
+	.parent		= &tsk,						\
+	.children	= LIST_HEAD_INIT(tsk.children),			\
+	.sibling	= LIST_HEAD_INIT(tsk.sibling),			\
+	.group_leader	= &tsk,						\
+	RCU_POINTER_INITIALIZER(real_cred, &init_cred),			\
+	RCU_POINTER_INITIALIZER(cred, &init_cred),			\
+	.comm		= INIT_TASK_COMM,				\
+	.thread		= INIT_THREAD,					\
+	.fs		= &init_fs,					\
+	.files		= &init_files,					\
+	.signal		= &init_signals,				\
+	.sighand	= &init_sighand,				\
+	.nsproxy	= &init_nsproxy,				\
+	.pending	= {						\
+		.list = LIST_HEAD_INIT(tsk.pending.list),		\
+		.signal = {{0}}},					\
+	.blocked	= {{0}},					\
+	.alloc_lock	= __SPIN_LOCK_UNLOCKED(tsk.alloc_lock),		\
+	.journal_info	= NULL,						\
+	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
+	.pi_lock	= __RAW_SPIN_LOCK_UNLOCKED(tsk.pi_lock),		\
+	.timer_slack_ns = 50000, /* 50 usec default slack */		\
+	.pids = {							\
+		[PIDTYPE_PID]  = INIT_PID_LINK(PIDTYPE_PID),		\
+		[PIDTYPE_PGID] = INIT_PID_LINK(PIDTYPE_PGID),		\
+		[PIDTYPE_SID]  = INIT_PID_LINK(PIDTYPE_SID),		\
+	},								\
+	.thread_group	= LIST_HEAD_INIT(tsk.thread_group),		\
+	.thread_node	= LIST_HEAD_INIT(init_signals.thread_head),	\
+	INIT_IDS							\
+	INIT_PERF_EVENTS(tsk)						\
+	INIT_TRACE_IRQFLAGS						\
+	INIT_LOCKDEP							\
+	INIT_FTRACE_GRAPH						\
+	INIT_TRACE_RECURSION						\
+	INIT_TASK_RCU_PREEMPT(tsk)					\
+	INIT_TASK_RCU_TASKS(tsk)					\
+	INIT_CPUSET_SEQ(tsk)						\
+	INIT_RT_MUTEXES(tsk)						\
+	INIT_PREV_CPUTIME(tsk)						\
+	INIT_VTIME(tsk)							\
+	INIT_NUMA_BALANCING(tsk)					\
+	INIT_KASAN(tsk)							\
+}
+#else /* CONFIG_SCHED_MUQSS */
+#define INIT_TASK_COMM "swapper"
 #define INIT_TASK(tsk)	\
 {									\
 	.state		= 0,						\
@@ -261,7 +330,7 @@ extern struct task_group root_task_group;
 	INIT_NUMA_BALANCING(tsk)					\
 	INIT_KASAN(tsk)							\
 }
-
+#endif /* CONFIG_SCHED_MUQSS */
 
 #define INIT_CPU_TIMERS(cpu_timers)					\
 {									\
diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h
index beb9ce1..ce2fc3c 100644
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -52,6 +52,8 @@ enum {
  */
 static inline int task_nice_ioprio(struct task_struct *task)
 {
+	if (iso_task(task))
+		return 0;
 	return (task_nice(task) + 20) / 5;
 }
 
diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 5fdc553..893d5d1 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -164,7 +164,11 @@ static inline u64 get_jiffies_64(void)
  * Have the 32 bit jiffies value wrap 5 minutes after boot
  * so jiffies wrap bugs show up earlier.
  */
+#ifdef CONFIG_SCHED_MUQSS
+#define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-10*HZ))
+#else
 #define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ))
+#endif
 
 /*
  * Change timeval to jiffies, trying to avoid the
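
Worked example for the value above: the counter starts 10*HZ ticks below zero, so it overflows about 10 seconds after boot regardless of HZ (with HZ=250, (unsigned int)(-10 * 250) = 4294964796, i.e. 2500 ticks before the 32-bit wrap), compared with roughly 300 seconds for the mainline -300*HZ value; jiffies wrap bugs therefore surface almost immediately under MuQSS.
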
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 481c8c4..5329b23 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -19,21 +19,6 @@ struct mem_cgroup;
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
-int __ksm_enter(struct mm_struct *mm);
-void __ksm_exit(struct mm_struct *mm);
-
-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
-{
-	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
-		return __ksm_enter(mm);
-	return 0;
-}
-
-static inline void ksm_exit(struct mm_struct *mm)
-{
-	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
-		__ksm_exit(mm);
-}
 
 static inline struct stable_node *page_stable_node(struct page *page)
 {
@@ -63,6 +48,33 @@ struct page *ksm_might_need_to_copy(struct page *page,
 int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
 
+#ifdef CONFIG_KSM_LEGACY
+int __ksm_enter(struct mm_struct *mm);
+void __ksm_exit(struct mm_struct *mm);
+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
+		return __ksm_enter(mm);
+	return 0;
+}
+
+static inline void ksm_exit(struct mm_struct *mm)
+{
+	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
+		__ksm_exit(mm);
+}
+
+#elif defined(CONFIG_UKSM)
+static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+	return 0;
+}
+
+static inline void ksm_exit(struct mm_struct *mm)
+{
+}
+#endif /* !CONFIG_UKSM */
+
 #else  /* !CONFIG_KSM */
 
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
@@ -105,4 +117,6 @@ static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage)
 #endif /* CONFIG_MMU */
 #endif /* !CONFIG_KSM */
 
+#include <linux/uksm.h>
+
 #endif /* __LINUX_KSM_H */
diff --git a/include/linux/linux_logo.h b/include/linux/linux_logo.h
index ca5bd91..5489dcb 100644
--- a/include/linux/linux_logo.h
+++ b/include/linux/linux_logo.h
@@ -37,6 +37,18 @@ extern const struct linux_logo logo_linux_vga16;
 extern const struct linux_logo logo_linux_clut224;
 extern const struct linux_logo logo_blackfin_vga16;
 extern const struct linux_logo logo_blackfin_clut224;
+extern const struct linux_logo logo_zen_clut224;
+extern const struct linux_logo logo_oldzen_clut224;
+extern const struct linux_logo logo_arch_clut224;
+extern const struct linux_logo logo_gentoo_clut224;
+extern const struct linux_logo logo_exherbo_clut224;
+extern const struct linux_logo logo_slackware_clut224;
+extern const struct linux_logo logo_debian_clut224;
+extern const struct linux_logo logo_fedorasimple_clut224;
+extern const struct linux_logo logo_fedoraglossy_clut224;
+extern const struct linux_logo logo_tits_clut224;
+extern const struct linux_logo logo_bsd_clut224;
+extern const struct linux_logo logo_fbsd_clut224;
 extern const struct linux_logo logo_dec_clut224;
 extern const struct linux_logo logo_mac_clut224;
 extern const struct linux_logo logo_parisc_clut224;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 277cd39..582d30b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1278,6 +1278,28 @@ static inline int fixup_user_fault(struct task_struct *tsk,
 }
 #endif
 
+extern void vma_do_file_update_time(struct vm_area_struct *, const char[], int);
+extern struct file *vma_do_pr_or_file(struct vm_area_struct *, const char[],
+				      int);
+extern void vma_do_get_file(struct vm_area_struct *, const char[], int);
+extern void vma_do_fput(struct vm_area_struct *, const char[], int);
+
+#define vma_file_update_time(vma)	vma_do_file_update_time(vma, __func__, \
+								__LINE__)
+#define vma_pr_or_file(vma)		vma_do_pr_or_file(vma, __func__, \
+							  __LINE__)
+#define vma_get_file(vma)		vma_do_get_file(vma, __func__, __LINE__)
+#define vma_fput(vma)			vma_do_fput(vma, __func__, __LINE__)
+
+#ifndef CONFIG_MMU
+extern struct file *vmr_do_pr_or_file(struct vm_region *, const char[], int);
+extern void vmr_do_fput(struct vm_region *, const char[], int);
+
+#define vmr_pr_or_file(region)		vmr_do_pr_or_file(region, __func__, \
+							  __LINE__)
+#define vmr_fput(region)		vmr_do_fput(region, __func__, __LINE__)
+#endif /* !CONFIG_MMU */
+
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		void *buf, int len, int write);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 903200f..bf28080 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -275,6 +275,7 @@ struct vm_region {
 	unsigned long	vm_top;		/* region allocated to here */
 	unsigned long	vm_pgoff;	/* the offset in vm_file corresponding to vm_start */
 	struct file	*vm_file;	/* the backing file or NULL */
+	struct file	*vm_prfile;	/* the virtual backing file or NULL */
 
 	int		vm_usage;	/* region usage count (access under nommu_region_sem) */
 	bool		vm_icache_flushed : 1; /* true if the icache has been flushed for
@@ -349,6 +350,7 @@ struct vm_area_struct {
 	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
 					   units */
 	struct file * vm_file;		/* File we map to (can be NULL). */
+	struct file *vm_prfile;		/* shadow of vm_file */
 	void * vm_private_data;		/* was vm_pte (shared mem) */
 
 #ifndef CONFIG_MMU
@@ -358,6 +360,9 @@ struct vm_area_struct {
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef CONFIG_UKSM
+	struct vma_slot *uksm_vma_slot;
+#endif
 };
 
 struct core_thread {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 7f2ae99..89f7dd8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -138,6 +138,9 @@ enum zone_stat_item {
 	NUMA_OTHER,		/* allocation from other node */
 #endif
 	NR_FREE_CMA_PAGES,
+#ifdef CONFIG_UKSM
+	NR_UKSM_ZERO_PAGES,
+#endif
 	NR_VM_ZONE_STAT_ITEMS };
 
 enum node_stat_item {
@@ -869,7 +872,7 @@ static inline int is_highmem_idx(enum zone_type idx)
 }
 
 /**
- * is_highmem - helper function to quickly check if a struct zone is a 
+ * is_highmem - helper function to quickly check if a struct zone is a
  *              highmem zone or not.  This is an attempt to keep references
  *              to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum.
  * @zone - pointer to struct zone variable
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
index d9cf5a5..94d397e 100644
--- a/include/linux/sched/prio.h
+++ b/include/linux/sched/prio.h
@@ -19,8 +19,20 @@
  */
 
 #define MAX_USER_RT_PRIO	100
+
+#ifdef CONFIG_SCHED_MUQSS
+/* Note different MAX_RT_PRIO */
+#define MAX_RT_PRIO		(MAX_USER_RT_PRIO + 1)
+
+#define ISO_PRIO		(MAX_RT_PRIO)
+#define NORMAL_PRIO		(MAX_RT_PRIO + 1)
+#define IDLE_PRIO		(MAX_RT_PRIO + 2)
+#define PRIO_LIMIT		((IDLE_PRIO) + 1)
+#else /* CONFIG_SCHED_MUQSS */
 #define MAX_RT_PRIO		MAX_USER_RT_PRIO
 
+#endif /* CONFIG_SCHED_MUQSS */
+
 #define MAX_PRIO		(MAX_RT_PRIO + NICE_WIDTH)
 #define DEFAULT_PRIO		(MAX_RT_PRIO + NICE_WIDTH / 2)
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 62c68e5..e58ba7a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -59,6 +59,7 @@ struct sched_param {
 #include <linux/gfp.h>
 #include <linux/magic.h>
 #include <linux/cgroup-defs.h>
+#include <linux/skip_list.h>
 
 #include <asm/processor.h>
 
@@ -176,7 +177,7 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 
 extern void calc_global_load(unsigned long ticks);
 
-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS)
 extern void cpu_load_update_nohz_start(void);
 extern void cpu_load_update_nohz_stop(void);
 #else
@@ -340,8 +341,6 @@ extern void init_idle_bootup_task(struct task_struct *idle);
 
 extern cpumask_var_t cpu_isolated_map;
 
-extern int runqueue_is_locked(int cpu);
-
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void nohz_balance_enter_idle(int cpu);
 extern void set_cpu_sd_state_idle(void);
@@ -1464,9 +1463,11 @@ struct task_struct {
 	unsigned int flags;	/* per process flags, defined below */
 	unsigned int ptrace;
 
+#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS)
+	int on_cpu;
+#endif
 #ifdef CONFIG_SMP
 	struct llist_node wake_entry;
-	int on_cpu;
 	unsigned int wakee_flips;
 	unsigned long wakee_flip_decay_ts;
 	struct task_struct *last_wakee;
@@ -1474,12 +1475,26 @@ struct task_struct {
 	int wake_cpu;
 #endif
 	int on_rq;
-
 	int prio, static_prio, normal_prio;
 	unsigned int rt_priority;
+#ifdef CONFIG_SCHED_MUQSS
+	int time_slice;
+	u64 deadline;
+	skiplist_node node; /* Skip list node */
+	u64 last_ran;
+	u64 sched_time; /* sched_clock time spent running */
+#ifdef CONFIG_SMT_NICE
+	int smt_bias; /* Policy/nice level bias across smt siblings */
+#endif
+#ifdef CONFIG_HOTPLUG_CPU
+	bool zerobound; /* Bound to CPU0 for hotplug */
+#endif
+	unsigned long rt_timeout;
+#else /* CONFIG_SCHED_MUQSS */
 	const struct sched_class *sched_class;
 	struct sched_entity se;
 	struct sched_rt_entity rt;
+#endif
 #ifdef CONFIG_CGROUP_SCHED
 	struct task_group *sched_task_group;
 #endif
@@ -1603,6 +1618,10 @@ struct task_struct {
 	int __user *clear_child_tid;		/* CLONE_CHILD_CLEARTID */
 
 	cputime_t utime, stime, utimescaled, stimescaled;
+#ifdef CONFIG_SCHED_MUQSS
+	/* Unbanked cpu time */
+	unsigned long utime_ns, stime_ns;
+#endif
 	cputime_t gtime;
 	struct prev_cputime prev_cputime;
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
@@ -1939,6 +1958,40 @@ extern int arch_task_struct_size __read_mostly;
 # define arch_task_struct_size (sizeof(struct task_struct))
 #endif
 
+#ifdef CONFIG_SCHED_MUQSS
+#define tsk_seruntime(t)		((t)->sched_time)
+#define tsk_rttimeout(t)		((t)->rt_timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+}
+
+void print_scheduler_version(void);
+
+static inline bool iso_task(struct task_struct *p)
+{
+	return (p->policy == SCHED_ISO);
+}
+#else /* CFS */
+#define tsk_seruntime(t)	((t)->se.sum_exec_runtime)
+#define tsk_rttimeout(t)	((t)->rt.timeout)
+
+static inline void tsk_cpus_current(struct task_struct *p)
+{
+	p->nr_cpus_allowed = current->nr_cpus_allowed;
+}
+
+static inline void print_scheduler_version(void)
+{
+	printk(KERN_INFO"CFS CPU scheduler.\n");
+}
+
+static inline bool iso_task(struct task_struct *p)
+{
+	return false;
+}
+#endif /* CONFIG_SCHED_MUQSS */
+
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
@@ -2392,7 +2445,7 @@ static inline int set_cpus_allowed_ptr(struct task_struct *p,
 }
 #endif
 
-#ifdef CONFIG_NO_HZ_COMMON
+#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS)
 void calc_load_enter_idle(void);
 void calc_load_exit_idle(void);
 #else
@@ -2493,7 +2546,7 @@ extern unsigned long long
 task_sched_runtime(struct task_struct *task);
 
 /* sched_exec is called by processes performing an exec */
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS)
 extern void sched_exec(void);
 #else
 #define sched_exec()   {}
diff --git b/include/linux/skip_list.h b/include/linux/skip_list.h
new file mode 100644
index 0000000..d4be84b
--- /dev/null
+++ b/include/linux/skip_list.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_SKIP_LISTS_H
+#define _LINUX_SKIP_LISTS_H
+typedef u64 keyType;
+typedef void *valueType;
+
+typedef struct nodeStructure skiplist_node;
+
+struct nodeStructure {
+	int level;	/* Levels in this structure */
+	keyType key;
+	valueType value;
+	skiplist_node *next[8];
+	skiplist_node *prev[8];
+};
+
+typedef struct listStructure {
+	int entries;
+	int level;	/* Maximum level of the list
+			(1 more than the number of levels in the list) */
+	skiplist_node *header; /* pointer to header */
+} skiplist;
+
+void skiplist_init(skiplist_node *slnode);
+skiplist *new_skiplist(skiplist_node *slnode);
+void free_skiplist(skiplist *l);
+void skiplist_node_init(skiplist_node *node);
+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed);
+void skiplist_delete(skiplist *l, skiplist_node *node);
+
+static inline bool skiplist_node_empty(skiplist_node *node) {
+	return (!node->next[0]);
+}
+#endif /* _LINUX_SKIP_LISTS_H */
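
A minimal sketch of how a runqueue-style user might drive this API. The wrapper names, the static header node and the task-pointer payload are illustrative assumptions; the calls themselves match the prototypes above.

/* illustrative only: one list built around a static header node */
static skiplist_node example_header;
static skiplist *example_list;

static void example_setup(void)
{
	skiplist_init(&example_header);
	example_list = new_skiplist(&example_header);
}

static void example_enqueue(struct task_struct *p, skiplist_node *node,
			    u64 key, unsigned int randseed)
{
	skiplist_node_init(node);
	/* key orders the entries (e.g. a deadline), value carries the task */
	skiplist_insert(example_list, node, key, p, randseed);
}

static void example_dequeue(skiplist_node *node)
{
	if (!skiplist_node_empty(node))
		skiplist_delete(example_list, node);
}
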
diff --git a/include/linux/splice.h b/include/linux/splice.h
index da2751d..2e0fca6 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -83,4 +83,10 @@ extern void splice_shrink_spd(struct splice_pipe_desc *);
 extern void spd_release_page(struct splice_pipe_desc *, unsigned int);
 
 extern const struct pipe_buf_operations page_cache_pipe_buf_ops;
+
+extern long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+			   loff_t *ppos, size_t len, unsigned int flags);
+extern long do_splice_to(struct file *in, loff_t *ppos,
+			 struct pipe_inode_info *pipe, size_t len,
+			 unsigned int flags);
 #endif
diff --git b/include/linux/sradix-tree.h b/include/linux/sradix-tree.h
new file mode 100644
index 0000000..6780fdb
--- /dev/null
+++ b/include/linux/sradix-tree.h
@@ -0,0 +1,77 @@
+#ifndef _LINUX_SRADIX_TREE_H
+#define _LINUX_SRADIX_TREE_H
+
+
+#define INIT_SRADIX_TREE(root, mask)					\
+do {									\
+	(root)->height = 0;						\
+	(root)->gfp_mask = (mask);					\
+	(root)->rnode = NULL;						\
+} while (0)
+
+#define ULONG_BITS	(sizeof(unsigned long) * 8)
+#define SRADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
+//#define SRADIX_TREE_MAP_SHIFT	6
+//#define SRADIX_TREE_MAP_SIZE	(1UL << SRADIX_TREE_MAP_SHIFT)
+//#define SRADIX_TREE_MAP_MASK	(SRADIX_TREE_MAP_SIZE-1)
+
+struct sradix_tree_node {
+	unsigned int	height;		/* Height from the bottom */
+	unsigned int	count;
+	unsigned int	fulls;		/* Number of full sublevel trees */
+	struct sradix_tree_node *parent;
+	void *stores[0];
+};
+
+/* A simple radix tree implementation */
+struct sradix_tree_root {
+	unsigned int height;
+	struct sradix_tree_node *rnode;
+
+	/* A node found to have available empty stores in its sublevels */
+	struct sradix_tree_node *enter_node;
+	unsigned int shift;
+	unsigned int stores_size;
+	unsigned int mask;
+	unsigned long min;	/* The first hole index */
+	unsigned long num;
+	//unsigned long *height_to_maxindex;
+
+	/* How the node is allocated and freed. */
+	struct sradix_tree_node *(*alloc)(void);
+	void (*free)(struct sradix_tree_node *node);
+
+	/* When a new node is added and removed */
+	void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child);
+	void (*assign)(struct sradix_tree_node *node, unsigned index, void *item);
+	void (*rm)(struct sradix_tree_node *node, unsigned offset);
+};
+
+struct sradix_tree_path {
+	struct sradix_tree_node *node;
+	int offset;
+};
+
+static inline void
+init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift)
+{
+	root->height = 0;
+	root->rnode = NULL;
+	root->shift = shift;
+	root->stores_size = 1UL << shift;
+	root->mask = root->stores_size - 1;
+}
+
+
+extern void *sradix_tree_next(struct sradix_tree_root *root,
+		       struct sradix_tree_node *node, unsigned long index,
+		       int (*iter)(void *, unsigned long));
+
+extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num);
+
+extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root,
+			struct sradix_tree_node *node, unsigned long index);
+
+extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index);
+
+#endif /* _LINUX_SRADIX_TREE_H */
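
The root carries its own allocation and maintenance callbacks. Below is a hedged sketch of wiring one up: the 2^6 node width, the kzalloc sizing and the example_* names are assumptions; UKSM, the real user of this tree, supplies every hook, and whether unused hooks may safely stay NULL is not established here.

/* sketch: a root whose nodes hold 2^6 = 64 store slots each */
#define EXAMPLE_SHIFT	6

static struct sradix_tree_node *example_alloc_node(void)
{
	return kzalloc(sizeof(struct sradix_tree_node) +
		       (sizeof(void *) << EXAMPLE_SHIFT), GFP_KERNEL);
}

static void example_free_node(struct sradix_tree_node *node)
{
	kfree(node);
}

static struct sradix_tree_root example_root;

static void example_init(void)
{
	init_sradix_tree_root(&example_root, EXAMPLE_SHIFT);
	example_root.alloc = example_alloc_node;
	example_root.free  = example_free_node;
	/* extend/assign/rm notify the user as slots change; UKSM installs
	 * all of them - this sketch leaves them unset only for brevity */
}
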
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index 7be9b12..996cde6 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -19,6 +19,7 @@
 
 
 #include <linux/skbuff.h>
+#include <linux/cryptohash.h>
 #include <net/sock.h>
 #include <net/inet_connection_sock.h>
 #include <net/inet_timewait_sock.h>
@@ -348,6 +349,21 @@ struct tcp_sock {
 	struct tcp_md5sig_info	__rcu *md5sig_info;
 #endif
 
+#ifdef CONFIG_TCP_STEALTH
+/* Stealth TCP socket configuration */
+	struct {
+		#define TCP_STEALTH_MODE_AUTH		BIT(0)
+		#define TCP_STEALTH_MODE_INTEGRITY	BIT(1)
+		#define TCP_STEALTH_MODE_INTEGRITY_LEN	BIT(2)
+		u8 mode;
+		u8 secret[MD5_MESSAGE_BYTES];
+		u16 integrity_hash;
+		size_t integrity_len;
+		struct skb_mstamp mstamp;
+		bool saw_tsval;
+	} stealth;
+#endif
+
 /* TCP fastopen related information */
 	struct tcp_fastopen_request *fastopen_req;
 	/* fastopen_rsk points to request_sock that resulted in this big
diff --git b/include/linux/thinkpad_ec.h b/include/linux/thinkpad_ec.h
new file mode 100644
index 0000000..1b80d7e
--- /dev/null
+++ b/include/linux/thinkpad_ec.h
@@ -0,0 +1,47 @@
+/*
+ *  thinkpad_ec.h - interface to ThinkPad embedded controller LPC3 functions
+ *
+ *  Copyright (C) 2005 Shem Multinymous <multinymous@gmail.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#ifndef _THINKPAD_EC_H
+#define _THINKPAD_EC_H
+
+#ifdef __KERNEL__
+
+#define TP_CONTROLLER_ROW_LEN 16
+
+/* EC transactions input and output (possibly partial) vectors of 16 bytes. */
+struct thinkpad_ec_row {
+	u16 mask; /* bitmap of which entries of val[] are meaningful */
+	u8 val[TP_CONTROLLER_ROW_LEN];
+};
+
+extern int __must_check thinkpad_ec_lock(void);
+extern int __must_check thinkpad_ec_try_lock(void);
+extern void thinkpad_ec_unlock(void);
+
+extern int thinkpad_ec_read_row(const struct thinkpad_ec_row *args,
+				struct thinkpad_ec_row *data);
+extern int thinkpad_ec_try_read_row(const struct thinkpad_ec_row *args,
+				    struct thinkpad_ec_row *mask);
+extern int thinkpad_ec_prefetch_row(const struct thinkpad_ec_row *args);
+extern void thinkpad_ec_invalidate(void);
+
+
+#endif /* __KERNEL__ */
+#endif /* _THINKPAD_EC_H */
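
Consumers such as the hdaps/tp_smapi drivers bracket each row transaction with the controller lock. A hedged sketch follows: the argument row written here (0x01 in the first byte) and the assumption that a nonzero return from thinkpad_ec_lock() means the lock was not taken are illustrative, not guarantees from the header above.

/* sketch: fetch one 16-byte EC row under the controller lock */
static int example_read_ec_row(u8 out[TP_CONTROLLER_ROW_LEN])
{
	const struct thinkpad_ec_row args = { .mask = 0x0001, .val = { 0x01 } };
	struct thinkpad_ec_row data = { .mask = 0xffff };	/* want all 16 bytes */
	int ret;

	ret = thinkpad_ec_lock();	/* assumed: nonzero means no lock taken */
	if (ret)
		return ret;
	ret = thinkpad_ec_read_row(&args, &data);
	thinkpad_ec_unlock();
	if (!ret)
		memcpy(out, data.val, TP_CONTROLLER_ROW_LEN);
	return ret;
}
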
diff --git b/include/linux/uksm.h b/include/linux/uksm.h
new file mode 100644
index 0000000..825f05e
--- /dev/null
+++ b/include/linux/uksm.h
@@ -0,0 +1,149 @@
+#ifndef __LINUX_UKSM_H
+#define __LINUX_UKSM_H
+/*
+ * Memory merging support.
+ *
+ * This code enables dynamic sharing of identical pages found in different
+ * memory areas, even if they are not shared by fork().
+ */
+
+/* if !CONFIG_UKSM this file should not be compiled at all. */
+#ifdef CONFIG_UKSM
+
+#include <linux/bitops.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/sched.h>
+
+extern unsigned long zero_pfn __read_mostly;
+extern unsigned long uksm_zero_pfn __read_mostly;
+extern struct page *empty_uksm_zero_page;
+
+/* must be done before linked to mm */
+extern void uksm_vma_add_new(struct vm_area_struct *vma);
+extern void uksm_remove_vma(struct vm_area_struct *vma);
+
+#define UKSM_SLOT_NEED_SORT	(1 << 0)
+#define UKSM_SLOT_NEED_RERAND 	(1 << 1)
+#define UKSM_SLOT_SCANNED     	(1 << 2) /* It's scanned in this round */
+#define UKSM_SLOT_FUL_SCANNED 	(1 << 3)
+#define UKSM_SLOT_IN_UKSM 	(1 << 4)
+
+struct vma_slot {
+	struct sradix_tree_node *snode;
+	unsigned long sindex;
+
+	struct list_head slot_list;
+	unsigned long fully_scanned_round;
+	unsigned long dedup_num;
+	unsigned long pages_scanned;
+	unsigned long this_sampled;
+	unsigned long last_scanned;
+	unsigned long pages_to_scan;
+	struct scan_rung *rung;
+	struct page **rmap_list_pool;
+	unsigned int *pool_counts;
+	unsigned long pool_size;
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	unsigned long ctime_j;
+	unsigned long pages;
+	unsigned long flags;
+	unsigned long pages_cowed; /* pages cowed this round */
+	unsigned long pages_merged; /* pages merged this round */
+	unsigned long pages_bemerged;
+
+	/* when it has page merged in this eval round */
+	struct list_head dedup_list;
+};
+
+static inline void uksm_unmap_zero_page(pte_t pte)
+{
+	if (pte_pfn(pte) == uksm_zero_pfn)
+		__dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
+}
+
+static inline void uksm_map_zero_page(pte_t pte)
+{
+	if (pte_pfn(pte) == uksm_zero_pfn)
+		__inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES);
+}
+
+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
+{
+	if (vma->uksm_vma_slot && PageKsm(page))
+		vma->uksm_vma_slot->pages_cowed++;
+}
+
+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
+{
+	if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn)
+		vma->uksm_vma_slot->pages_cowed++;
+}
+
+static inline int uksm_flags_can_scan(unsigned long vm_flags)
+{
+#ifdef VM_SAO
+		if (vm_flags & VM_SAO)
+			return 0;
+#endif
+
+	return !(vm_flags & (VM_PFNMAP | VM_IO  | VM_DONTEXPAND |
+			     VM_HUGETLB | VM_MIXEDMAP | VM_SHARED
+			     | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN));
+}
+
+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
+{
+	if (uksm_flags_can_scan(*vm_flags_p))
+		*vm_flags_p |= VM_MERGEABLE;
+}
+
+/*
+ * Just a wrapper around BUG_ON() for places where the UKSM zero page must not
+ * appear. TODO: remove this once the UKSM zero-page code is stable enough.
+ */
+static inline void uksm_bugon_zeropage(pte_t pte)
+{
+	BUG_ON(pte_pfn(pte) == uksm_zero_pfn);
+}
+#else
+static inline void uksm_vma_add_new(struct vm_area_struct *vma)
+{
+}
+
+static inline void uksm_remove_vma(struct vm_area_struct *vma)
+{
+}
+
+static inline void uksm_unmap_zero_page(pte_t pte)
+{
+}
+
+static inline void uksm_map_zero_page(pte_t pte)
+{
+}
+
+static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page)
+{
+}
+
+static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte)
+{
+}
+
+static inline int uksm_flags_can_scan(unsigned long vm_flags)
+{
+	return 0;
+}
+
+static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p)
+{
+}
+
+static inline void uksm_bugon_zeropage(pte_t pte)
+{
+}
+#endif /* !CONFIG_UKSM */
+#endif /* __LINUX_UKSM_H */
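
uksm_vm_flags_mod() is meant to be called from the mmap paths so that ordinary anonymous mappings become VM_MERGEABLE automatically while unscannable ones are left alone. A small sketch of the flag logic; the two starting flag sets are just examples.

/* illustrative only: which mappings uksm_vm_flags_mod() marks mergeable */
static void example_flag_check(void)
{
	/* an ordinary private anonymous mapping */
	unsigned long anon_flags = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;
	/* a shared I/O mapping */
	unsigned long io_flags = VM_READ | VM_SHARED | VM_IO;

	uksm_vm_flags_mod(&anon_flags);	/* adds VM_MERGEABLE */
	uksm_vm_flags_mod(&io_flags);	/* no change: VM_SHARED and VM_IO are excluded */
}
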
diff --git b/include/linux/wbt.h b/include/linux/wbt.h
new file mode 100644
index 0000000..68ba75e
--- /dev/null
+++ b/include/linux/wbt.h
@@ -0,0 +1,141 @@
+#ifndef WB_THROTTLE_H
+#define WB_THROTTLE_H
+
+#include <linux/atomic.h>
+#include <linux/wait.h>
+#include <linux/timer.h>
+#include <linux/ktime.h>
+
+enum wbt_flags {
+	WBT_TRACKED		= 1,	/* write, tracked for throttling */
+	WBT_READ		= 2,	/* read */
+	WBT_KSWAPD		= 4,	/* write, from kswapd */
+
+	WBT_NR_BITS		= 3,	/* number of bits */
+};
+
+enum {
+	/*
+	 * Set aside 3 bits for state, rest is a time stamp
+	 */
+	ISSUE_STAT_SHIFT	= 64 - WBT_NR_BITS,
+	ISSUE_STAT_MASK 	= ~((1ULL << ISSUE_STAT_SHIFT) - 1),
+	ISSUE_STAT_TIME_MASK	= ~ISSUE_STAT_MASK,
+
+	WBT_NUM_RWQ		= 2,
+};
+
+struct wb_issue_stat {
+	u64 time;
+};
+
+static inline void wbt_issue_stat_set_time(struct wb_issue_stat *stat)
+{
+	stat->time = (stat->time & ISSUE_STAT_MASK) |
+			(ktime_to_ns(ktime_get()) & ISSUE_STAT_TIME_MASK);
+}
+
+static inline u64 wbt_issue_stat_get_time(struct wb_issue_stat *stat)
+{
+	return stat->time & ISSUE_STAT_TIME_MASK;
+}
+
+static inline void wbt_clear_state(struct wb_issue_stat *stat)
+{
+	stat->time &= ISSUE_STAT_TIME_MASK;
+}
+
+static inline enum wbt_flags wbt_stat_to_mask(struct wb_issue_stat *stat)
+{
+	return (stat->time & ISSUE_STAT_MASK) >> ISSUE_STAT_SHIFT;
+}
+
+static inline void wbt_track(struct wb_issue_stat *stat, enum wbt_flags wb_acct)
+{
+	stat->time |= ((u64) wb_acct) << ISSUE_STAT_SHIFT;
+}
+
+static inline bool wbt_is_tracked(struct wb_issue_stat *stat)
+{
+	return (stat->time >> ISSUE_STAT_SHIFT) & WBT_TRACKED;
+}
+
+static inline bool wbt_is_read(struct wb_issue_stat *stat)
+{
+	return (stat->time >> ISSUE_STAT_SHIFT) & WBT_READ;
+}
+
+struct wb_stat_ops {
+	void (*get)(void *, struct blk_rq_stat *);
+	bool (*is_current)(struct blk_rq_stat *);
+	void (*clear)(void *);
+};
+
+struct rq_wait {
+	wait_queue_head_t wait;
+	atomic_t inflight;
+};
+
+struct rq_wb {
+	/*
+	 * Settings that govern how we throttle
+	 */
+	unsigned int wb_background;		/* background writeback */
+	unsigned int wb_normal;			/* normal writeback */
+	unsigned int wb_max;			/* max throughput writeback */
+	int scale_step;
+	bool scaled_max;
+
+	/*
+	 * Number of consecutive periods where we don't have enough
+	 * information to make a firm scale up/down decision.
+	 */
+	unsigned int unknown_cnt;
+
+	u64 win_nsec;				/* default window size */
+	u64 cur_win_nsec;			/* current window size */
+
+	struct timer_list window_timer;
+
+	s64 sync_issue;
+	void *sync_cookie;
+
+	unsigned int wc;
+	unsigned int queue_depth;
+
+	unsigned long last_issue;		/* last non-throttled issue */
+	unsigned long last_comp;		/* last non-throttled comp */
+	unsigned long min_lat_nsec;
+	struct backing_dev_info *bdi;
+	struct rq_wait rq_wait[WBT_NUM_RWQ];
+
+	struct wb_stat_ops *stat_ops;
+	void *ops_data;
+};
+
+static inline unsigned int wbt_inflight(struct rq_wb *rwb)
+{
+	unsigned int i, ret = 0;
+
+	for (i = 0; i < WBT_NUM_RWQ; i++)
+		ret += atomic_read(&rwb->rq_wait[i].inflight);
+
+	return ret;
+}
+
+struct backing_dev_info;
+
+void __wbt_done(struct rq_wb *, enum wbt_flags);
+void wbt_done(struct rq_wb *, struct wb_issue_stat *);
+enum wbt_flags wbt_wait(struct rq_wb *, unsigned int, spinlock_t *);
+struct rq_wb *wbt_init(struct backing_dev_info *, struct wb_stat_ops *, void *);
+void wbt_exit(struct rq_wb *);
+void wbt_update_limits(struct rq_wb *);
+void wbt_requeue(struct rq_wb *, struct wb_issue_stat *);
+void wbt_issue(struct rq_wb *, struct wb_issue_stat *);
+void wbt_disable(struct rq_wb *);
+
+void wbt_set_queue_depth(struct rq_wb *, unsigned int);
+void wbt_set_write_cache(struct rq_wb *, bool);
+
+#endif
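
The top three bits of the 64-bit issue word carry the wbt_flags while the remaining 61 bits hold the issue time in nanoseconds. The snippet below is illustrative only, showing how the helpers above compose; the wrapper function is not part of the patch.

/* illustrative only: flag/time packing in a single 64-bit word */
static void example_track_issue(struct wb_issue_stat *stat)
{
	wbt_issue_stat_set_time(stat);		 /* low 61 bits <- ktime_get() ns */
	wbt_track(stat, WBT_TRACKED | WBT_READ); /* top 3 bits  <- accounting flags */

	WARN_ON(!wbt_is_tracked(stat));		 /* both queries read the high bits */
	WARN_ON(!wbt_is_read(stat));
	pr_info("issued at %llu ns\n", wbt_issue_stat_get_time(stat));

	wbt_clear_state(stat);			 /* drops the flags, keeps the time */
}
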
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index fc1e16c..e53abf2 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -100,6 +100,16 @@ struct writeback_control {
 #endif
 };
 
+static inline int wbc_to_write_flags(struct writeback_control *wbc)
+{
+	if (wbc->sync_mode == WB_SYNC_ALL)
+		return WRITE_SYNC;
+	else if (wbc->for_kupdate || wbc->for_background)
+		return WRITE_BG;
+
+	return 0;
+}
+
 /*
  * A wb_domain represents a domain that wb's (bdi_writeback's) belong to
  * and are measured against each other in.  There always is one global
diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h
index 3f36d45..ec392a1 100644
--- a/include/net/secure_seq.h
+++ b/include/net/secure_seq.h
@@ -14,5 +14,10 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
 				__be16 sport, __be16 dport);
 u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
 				  __be16 sport, __be16 dport);
+#ifdef CONFIG_TCP_STEALTH
+u32 tcp_stealth_do_auth(struct sock *sk, struct sk_buff *skb);
+u32 tcp_stealth_sequence_number(struct sock *sk, __be32 *daddr,
+				u32 daddr_size, __be16 dport);
+#endif
 
 #endif /* _NET_SECURE_SEQ */
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 0de6989..eb89754 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -430,6 +430,12 @@ void tcp_parse_options(const struct sk_buff *skb,
 		       struct tcp_options_received *opt_rx,
 		       int estab, struct tcp_fastopen_cookie *foc);
 const u8 *tcp_parse_md5sig_option(const struct tcphdr *th);
+#ifdef CONFIG_TCP_STEALTH
+const bool tcp_parse_tsval_option(u32 *tsval, const struct tcphdr *th);
+int tcp_stealth_integrity(u16 *hash, u8 *secret, u8 *payload, int len);
+#define be32_isn_to_be16_av(x)	(((__be16 *)&x)[0])
+#define be32_isn_to_be16_ih(x)	(((__be16 *)&x)[1])
+#endif
 
 /*
  *	TCP v4 functions exported for the inet6 API
diff --git b/include/trace/events/fs.h b/include/trace/events/fs.h
new file mode 100644
index 0000000..fb634b7
--- /dev/null
+++ b/include/trace/events/fs.h
@@ -0,0 +1,53 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM fs
+
+#if !defined(_TRACE_FS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FS_H
+
+#include <linux/fs.h>
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(do_sys_open,
+
+	TP_PROTO(const char *filename, int flags, int mode),
+
+	TP_ARGS(filename, flags, mode),
+
+	TP_STRUCT__entry(
+		__string(	filename, filename		)
+		__field(	int, flags			)
+		__field(	int, mode			)
+	),
+
+	TP_fast_assign(
+		__assign_str(filename, filename);
+		__entry->flags = flags;
+		__entry->mode = mode;
+	),
+
+	TP_printk("\"%s\" %x %o",
+		  __get_str(filename), __entry->flags, __entry->mode)
+);
+
+TRACE_EVENT(open_exec,
+
+	TP_PROTO(const char *filename),
+
+	TP_ARGS(filename),
+
+	TP_STRUCT__entry(
+		__string(	filename, filename		)
+	),
+
+	TP_fast_assign(
+		__assign_str(filename, filename);
+	),
+
+	TP_printk("\"%s\"",
+		  __get_str(filename))
+);
+
+#endif /* _TRACE_FS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
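Once these tracepoints are compiled into the kernel, they show up under the usual tracefs hierarchy. As a usage sketch (assuming tracefs is mounted at /sys/kernel/debug/tracing, the common default), a small user-space program can enable the do_sys_open event:

#include <stdio.h>

int main(void)
{
	/* path follows TRACE_SYSTEM ("fs") plus the event name defined above */
	FILE *f = fopen("/sys/kernel/debug/tracing/events/fs/do_sys_open/enable", "w");

	if (!f) {
		perror("tracefs");
		return 1;
	}
	fputs("1\n", f);	/* subsequent opens are logged to the trace buffer */
	fclose(f);
	return 0;
}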
diff --git b/include/trace/events/wbt.h b/include/trace/events/wbt.h
new file mode 100644
index 0000000..926c7ee
--- /dev/null
+++ b/include/trace/events/wbt.h
@@ -0,0 +1,153 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM wbt
+
+#if !defined(_TRACE_WBT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_WBT_H
+
+#include <linux/tracepoint.h>
+#include <linux/wbt.h>
+
+/**
+ * wbt_stat - trace stats for blk_wb
+ * @stat: array of read/write stats
+ */
+TRACE_EVENT(wbt_stat,
+
+	TP_PROTO(struct backing_dev_info *bdi, struct blk_rq_stat *stat),
+
+	TP_ARGS(bdi, stat),
+
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+		__field(s64, rmean)
+		__field(u64, rmin)
+		__field(u64, rmax)
+		__field(s64, rnr_samples)
+		__field(s64, rtime)
+		__field(s64, wmean)
+		__field(u64, wmin)
+		__field(u64, wmax)
+		__field(s64, wnr_samples)
+		__field(s64, wtime)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		__entry->rmean		= stat[0].mean;
+		__entry->rmin		= stat[0].min;
+		__entry->rmax		= stat[0].max;
+		__entry->rnr_samples	= stat[0].nr_samples;
+		__entry->wmean		= stat[1].mean;
+		__entry->wmin		= stat[1].min;
+		__entry->wmax		= stat[1].max;
+		__entry->wnr_samples	= stat[1].nr_samples;
+	),
+
+	TP_printk("%s: rmean=%llu, rmin=%llu, rmax=%llu, rsamples=%llu, "
+		  "wmean=%llu, wmin=%llu, wmax=%llu, wsamples=%llu\n",
+		  __entry->name, __entry->rmean, __entry->rmin, __entry->rmax,
+		  __entry->rnr_samples, __entry->wmean, __entry->wmin,
+		  __entry->wmax, __entry->wnr_samples)
+);
+
+/**
+ * wbt_lat - trace latency event
+ * @lat: latency trigger
+ */
+TRACE_EVENT(wbt_lat,
+
+	TP_PROTO(struct backing_dev_info *bdi, unsigned long lat),
+
+	TP_ARGS(bdi, lat),
+
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+		__field(unsigned long, lat)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		__entry->lat = div_u64(lat, 1000);
+	),
+
+	TP_printk("%s: latency %lluus\n", __entry->name,
+			(unsigned long long) __entry->lat)
+);
+
+/**
+ * wbt_step - trace wb event step
+ * @msg: context message
+ * @step: the current scale step count
+ * @window: the current monitoring window
+ * @bg: the current background queue limit
+ * @normal: the current normal writeback limit
+ * @max: the current max throughput writeback limit
+ */
+TRACE_EVENT(wbt_step,
+
+	TP_PROTO(struct backing_dev_info *bdi, const char *msg,
+		 int step, unsigned long window, unsigned int bg,
+		 unsigned int normal, unsigned int max),
+
+	TP_ARGS(bdi, msg, step, window, bg, normal, max),
+
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+		__field(const char *, msg)
+		__field(int, step)
+		__field(unsigned long, window)
+		__field(unsigned int, bg)
+		__field(unsigned int, normal)
+		__field(unsigned int, max)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		__entry->msg	= msg;
+		__entry->step	= step;
+		__entry->window	= div_u64(window, 1000);
+		__entry->bg	= bg;
+		__entry->normal	= normal;
+		__entry->max	= max;
+	),
+
+	TP_printk("%s: %s: step=%d, window=%luus, background=%u, normal=%u, max=%u\n",
+		  __entry->name, __entry->msg, __entry->step, __entry->window,
+		  __entry->bg, __entry->normal, __entry->max)
+);
+
+/**
+ * wbt_timer - trace wb timer event
+ * @status: timer state status
+ * @step: the current scale step count
+ * @inflight: tracked writes inflight
+ */
+TRACE_EVENT(wbt_timer,
+
+	TP_PROTO(struct backing_dev_info *bdi, unsigned int status,
+		 int step, unsigned int inflight),
+
+	TP_ARGS(bdi, status, step, inflight),
+
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+		__field(unsigned int, status)
+		__field(int, step)
+		__field(unsigned int, inflight)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		__entry->status		= status;
+		__entry->step		= step;
+		__entry->inflight	= inflight;
+	),
+
+	TP_printk("%s: status=%u, step=%d, inflight=%u\n", __entry->name,
+		  __entry->status, __entry->step, __entry->inflight)
+);
+
+#endif /* _TRACE_WBT_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
index 407ca0d..899e547 100644
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -59,6 +59,7 @@ header-y += atmsvc.h
 header-y += atm_tcp.h
 header-y += atm_zatm.h
 header-y += audit.h
+header-y += aufs_type.h
 header-y += auto_fs4.h
 header-y += auto_fs.h
 header-y += auxvec.h
diff --git b/include/uapi/linux/aufs_type.h b/include/uapi/linux/aufs_type.h
new file mode 100644
index 0000000..e220283
--- /dev/null
+++ b/include/uapi/linux/aufs_type.h
@@ -0,0 +1,406 @@
+/*
+ * Copyright (C) 2005-2016 Junjiro R. Okajima
+ */
+
+#ifndef __AUFS_TYPE_H__
+#define __AUFS_TYPE_H__
+
+#define AUFS_NAME	"aufs"
+
+#ifdef __KERNEL__
+/*
+ * define it before including all other headers.
+ * sched.h may use pr_* macros before defining "current", so define the
+ * no-current version first, and re-define later.
+ */
+#define pr_fmt(fmt)	AUFS_NAME " %s:%d: " fmt, __func__, __LINE__
+#include <linux/sched.h>
+#undef pr_fmt
+#define pr_fmt(fmt) \
+		AUFS_NAME " %s:%d:%.*s[%d]: " fmt, __func__, __LINE__, \
+		(int)sizeof(current->comm), current->comm, current->pid
+#else
+#include <stdint.h>
+#include <sys/types.h>
+#endif /* __KERNEL__ */
+
+#include <linux/limits.h>
+
+#define AUFS_VERSION	"4.8"
+
+/* todo? move this to linux-2.6.19/include/magic.h */
+#define AUFS_SUPER_MAGIC	('a' << 24 | 'u' << 16 | 'f' << 8 | 's')
+
+/* ---------------------------------------------------------------------- */
+
+#ifdef CONFIG_AUFS_BRANCH_MAX_127
+typedef int8_t aufs_bindex_t;
+#define AUFS_BRANCH_MAX 127
+#else
+typedef int16_t aufs_bindex_t;
+#ifdef CONFIG_AUFS_BRANCH_MAX_511
+#define AUFS_BRANCH_MAX 511
+#elif defined(CONFIG_AUFS_BRANCH_MAX_1023)
+#define AUFS_BRANCH_MAX 1023
+#elif defined(CONFIG_AUFS_BRANCH_MAX_32767)
+#define AUFS_BRANCH_MAX 32767
+#endif
+#endif
+
+#ifdef __KERNEL__
+#ifndef AUFS_BRANCH_MAX
+#error unknown CONFIG_AUFS_BRANCH_MAX value
+#endif
+#endif /* __KERNEL__ */
+
+/* ---------------------------------------------------------------------- */
+
+#define AUFS_FSTYPE		AUFS_NAME
+
+#define AUFS_ROOT_INO		2
+#define AUFS_FIRST_INO		11
+
+#define AUFS_WH_PFX		".wh."
+#define AUFS_WH_PFX_LEN		((int)sizeof(AUFS_WH_PFX) - 1)
+#define AUFS_WH_TMP_LEN		4
+/* a limit for rmdir/rename a dir and copyup */
+#define AUFS_MAX_NAMELEN	(NAME_MAX \
+				- AUFS_WH_PFX_LEN * 2	/* doubly whiteouted */\
+				- 1			/* dot */\
+				- AUFS_WH_TMP_LEN)	/* hex */
+#define AUFS_XINO_FNAME		"." AUFS_NAME ".xino"
+#define AUFS_XINO_DEFPATH	"/tmp/" AUFS_XINO_FNAME
+#define AUFS_XINO_DEF_SEC	30 /* seconds */
+#define AUFS_XINO_DEF_TRUNC	45 /* percentage */
+#define AUFS_DIRWH_DEF		3
+#define AUFS_RDCACHE_DEF	10 /* seconds */
+#define AUFS_RDCACHE_MAX	3600 /* seconds */
+#define AUFS_RDBLK_DEF		512 /* bytes */
+#define AUFS_RDHASH_DEF		32
+#define AUFS_WKQ_NAME		AUFS_NAME "d"
+#define AUFS_MFS_DEF_SEC	30 /* seconds */
+#define AUFS_MFS_MAX_SEC	3600 /* seconds */
+#define AUFS_FHSM_CACHE_DEF_SEC	30 /* seconds */
+#define AUFS_PLINK_WARN		50 /* number of plinks in a single bucket */
+
+/* pseudo-link maintenance under /proc */
+#define AUFS_PLINK_MAINT_NAME	"plink_maint"
+#define AUFS_PLINK_MAINT_DIR	"fs/" AUFS_NAME
+#define AUFS_PLINK_MAINT_PATH	AUFS_PLINK_MAINT_DIR "/" AUFS_PLINK_MAINT_NAME
+
+#define AUFS_DIROPQ_NAME	AUFS_WH_PFX ".opq" /* whiteouted doubly */
+#define AUFS_WH_DIROPQ		AUFS_WH_PFX AUFS_DIROPQ_NAME
+
+#define AUFS_BASE_NAME		AUFS_WH_PFX AUFS_NAME
+#define AUFS_PLINKDIR_NAME	AUFS_WH_PFX "plnk"
+#define AUFS_ORPHDIR_NAME	AUFS_WH_PFX "orph"
+
+/* doubly whiteouted */
+#define AUFS_WH_BASE		AUFS_WH_PFX AUFS_BASE_NAME
+#define AUFS_WH_PLINKDIR	AUFS_WH_PFX AUFS_PLINKDIR_NAME
+#define AUFS_WH_ORPHDIR		AUFS_WH_PFX AUFS_ORPHDIR_NAME
+
+/* branch permissions and attributes */
+#define AUFS_BRPERM_RW		"rw"
+#define AUFS_BRPERM_RO		"ro"
+#define AUFS_BRPERM_RR		"rr"
+#define AUFS_BRATTR_COO_REG	"coo_reg"
+#define AUFS_BRATTR_COO_ALL	"coo_all"
+#define AUFS_BRATTR_FHSM	"fhsm"
+#define AUFS_BRATTR_UNPIN	"unpin"
+#define AUFS_BRATTR_ICEX	"icex"
+#define AUFS_BRATTR_ICEX_SEC	"icexsec"
+#define AUFS_BRATTR_ICEX_SYS	"icexsys"
+#define AUFS_BRATTR_ICEX_TR	"icextr"
+#define AUFS_BRATTR_ICEX_USR	"icexusr"
+#define AUFS_BRATTR_ICEX_OTH	"icexoth"
+#define AUFS_BRRATTR_WH		"wh"
+#define AUFS_BRWATTR_NLWH	"nolwh"
+#define AUFS_BRWATTR_MOO	"moo"
+
+#define AuBrPerm_RW		1		/* writable, hardlinkable wh */
+#define AuBrPerm_RO		(1 << 1)	/* readonly */
+#define AuBrPerm_RR		(1 << 2)	/* natively readonly */
+#define AuBrPerm_Mask		(AuBrPerm_RW | AuBrPerm_RO | AuBrPerm_RR)
+
+#define AuBrAttr_COO_REG	(1 << 3)	/* copy-up on open */
+#define AuBrAttr_COO_ALL	(1 << 4)
+#define AuBrAttr_COO_Mask	(AuBrAttr_COO_REG | AuBrAttr_COO_ALL)
+
+#define AuBrAttr_FHSM		(1 << 5)	/* file-based hsm */
+#define AuBrAttr_UNPIN		(1 << 6)	/* rename-able top dir of
+						   branch. meaningless since
+						   linux-3.18-rc1 */
+
+/* ignore error in copying XATTR */
+#define AuBrAttr_ICEX_SEC	(1 << 7)
+#define AuBrAttr_ICEX_SYS	(1 << 8)
+#define AuBrAttr_ICEX_TR	(1 << 9)
+#define AuBrAttr_ICEX_USR	(1 << 10)
+#define AuBrAttr_ICEX_OTH	(1 << 11)
+#define AuBrAttr_ICEX		(AuBrAttr_ICEX_SEC	\
+				 | AuBrAttr_ICEX_SYS	\
+				 | AuBrAttr_ICEX_TR	\
+				 | AuBrAttr_ICEX_USR	\
+				 | AuBrAttr_ICEX_OTH)
+
+#define AuBrRAttr_WH		(1 << 12)	/* whiteout-able */
+#define AuBrRAttr_Mask		AuBrRAttr_WH
+
+#define AuBrWAttr_NoLinkWH	(1 << 13)	/* un-hardlinkable whiteouts */
+#define AuBrWAttr_MOO		(1 << 14)	/* move-up on open */
+#define AuBrWAttr_Mask		(AuBrWAttr_NoLinkWH | AuBrWAttr_MOO)
+
+#define AuBrAttr_CMOO_Mask	(AuBrAttr_COO_Mask | AuBrWAttr_MOO)
+
+/* #warning test userspace */
+#ifdef __KERNEL__
+#ifndef CONFIG_AUFS_FHSM
+#undef AuBrAttr_FHSM
+#define AuBrAttr_FHSM		0
+#endif
+#ifndef CONFIG_AUFS_XATTR
+#undef	AuBrAttr_ICEX
+#define AuBrAttr_ICEX		0
+#undef	AuBrAttr_ICEX_SEC
+#define AuBrAttr_ICEX_SEC	0
+#undef	AuBrAttr_ICEX_SYS
+#define AuBrAttr_ICEX_SYS	0
+#undef	AuBrAttr_ICEX_TR
+#define AuBrAttr_ICEX_TR	0
+#undef	AuBrAttr_ICEX_USR
+#define AuBrAttr_ICEX_USR	0
+#undef	AuBrAttr_ICEX_OTH
+#define AuBrAttr_ICEX_OTH	0
+#endif
+#endif
+
+/* the longest combination */
+/* AUFS_BRATTR_ICEX and AUFS_BRATTR_ICEX_TR don't affect the length here */
+#define AuBrPermStrSz	sizeof(AUFS_BRPERM_RW			\
+			       "+" AUFS_BRATTR_COO_REG		\
+			       "+" AUFS_BRATTR_FHSM		\
+			       "+" AUFS_BRATTR_UNPIN		\
+			       "+" AUFS_BRATTR_ICEX_SEC		\
+			       "+" AUFS_BRATTR_ICEX_SYS		\
+			       "+" AUFS_BRATTR_ICEX_USR		\
+			       "+" AUFS_BRATTR_ICEX_OTH		\
+			       "+" AUFS_BRWATTR_NLWH)
+
+typedef struct {
+	char a[AuBrPermStrSz];
+} au_br_perm_str_t;
+
+static inline int au_br_writable(int brperm)
+{
+	return brperm & AuBrPerm_RW;
+}
+
+static inline int au_br_whable(int brperm)
+{
+	return brperm & (AuBrPerm_RW | AuBrRAttr_WH);
+}
+
+static inline int au_br_wh_linkable(int brperm)
+{
+	return !(brperm & AuBrWAttr_NoLinkWH);
+}
+
+static inline int au_br_cmoo(int brperm)
+{
+	return brperm & AuBrAttr_CMOO_Mask;
+}
+
+static inline int au_br_fhsm(int brperm)
+{
+	return brperm & AuBrAttr_FHSM;
+}
+
+/* ---------------------------------------------------------------------- */
+
+/* ioctl */
+enum {
+	/* readdir in userspace */
+	AuCtl_RDU,
+	AuCtl_RDU_INO,
+
+	AuCtl_WBR_FD,	/* pathconf wrapper */
+	AuCtl_IBUSY,	/* busy inode */
+	AuCtl_MVDOWN,	/* move-down */
+	AuCtl_BR,	/* info about branches */
+	AuCtl_FHSM_FD	/* connection for fhsm */
+};
+
+/* borrowed from linux/include/linux/kernel.h */
+#ifndef ALIGN
+#define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
+#define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
+#endif
+
+/* borrowed from linux/include/linux/compiler-gcc3.h */
+#ifndef __aligned
+#define __aligned(x)			__attribute__((aligned(x)))
+#endif
+
+#ifdef __KERNEL__
+#ifndef __packed
+#define __packed			__attribute__((packed))
+#endif
+#endif
+
+struct au_rdu_cookie {
+	uint64_t	h_pos;
+	int16_t		bindex;
+	uint8_t		flags;
+	uint8_t		pad;
+	uint32_t	generation;
+} __aligned(8);
+
+struct au_rdu_ent {
+	uint64_t	ino;
+	int16_t		bindex;
+	uint8_t		type;
+	uint8_t		nlen;
+	uint8_t		wh;
+	char		name[0];
+} __aligned(8);
+
+static inline int au_rdu_len(int nlen)
+{
+	/* include the terminating NULL */
+	return ALIGN(sizeof(struct au_rdu_ent) + nlen + 1,
+		     sizeof(uint64_t));
+}
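For illustration, the padding behaviour can be checked entirely from user space. The sketch below duplicates the struct and macros above (GCC extensions such as typeof and the zero-length array are assumed) and prints the record length for a few name lengths; nlen excludes the NUL, which is why the +1 appears in au_rdu_len():

#include <stdio.h>
#include <stdint.h>

#define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
#define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
#ifndef __aligned
#define __aligned(x)		__attribute__((aligned(x)))
#endif

struct au_rdu_ent {
	uint64_t	ino;
	int16_t		bindex;
	uint8_t		type;
	uint8_t		nlen;
	uint8_t		wh;
	char		name[0];
} __aligned(8);

static int au_rdu_len(int nlen)
{
	/* include the terminating NUL */
	return ALIGN(sizeof(struct au_rdu_ent) + nlen + 1, sizeof(uint64_t));
}

int main(void)
{
	int nlen;

	/* each record is rounded up to the next 8-byte boundary */
	for (nlen = 1; nlen <= 16; nlen += 5)
		printf("nlen=%2d -> %d bytes\n", nlen, au_rdu_len(nlen));
	return 0;
}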
+
+union au_rdu_ent_ul {
+	struct au_rdu_ent __user	*e;
+	uint64_t			ul;
+};
+
+enum {
+	AufsCtlRduV_SZ,
+	AufsCtlRduV_End
+};
+
+struct aufs_rdu {
+	/* input */
+	union {
+		uint64_t	sz;	/* AuCtl_RDU */
+		uint64_t	nent;	/* AuCtl_RDU_INO */
+	};
+	union au_rdu_ent_ul	ent;
+	uint16_t		verify[AufsCtlRduV_End];
+
+	/* input/output */
+	uint32_t		blk;
+
+	/* output */
+	union au_rdu_ent_ul	tail;
+	/* number of entries which were added in a single call */
+	uint64_t		rent;
+	uint8_t			full;
+	uint8_t			shwh;
+
+	struct au_rdu_cookie	cookie;
+} __aligned(8);
+
+/* ---------------------------------------------------------------------- */
+
+struct aufs_wbr_fd {
+	uint32_t	oflags;
+	int16_t		brid;
+} __aligned(8);
+
+/* ---------------------------------------------------------------------- */
+
+struct aufs_ibusy {
+	uint64_t	ino, h_ino;
+	int16_t		bindex;
+} __aligned(8);
+
+/* ---------------------------------------------------------------------- */
+
+/* error code for move-down */
+/* the actual message strings are implemented in aufs-util.git */
+enum {
+	EAU_MVDOWN_OPAQUE = 1,
+	EAU_MVDOWN_WHITEOUT,
+	EAU_MVDOWN_UPPER,
+	EAU_MVDOWN_BOTTOM,
+	EAU_MVDOWN_NOUPPER,
+	EAU_MVDOWN_NOLOWERBR,
+	EAU_Last
+};
+
+/* flags for move-down */
+#define AUFS_MVDOWN_DMSG	1
+#define AUFS_MVDOWN_OWLOWER	(1 << 1)	/* overwrite lower */
+#define AUFS_MVDOWN_KUPPER	(1 << 2)	/* keep upper */
+#define AUFS_MVDOWN_ROLOWER	(1 << 3)	/* do even if lower is RO */
+#define AUFS_MVDOWN_ROLOWER_R	(1 << 4)	/* did on lower RO */
+#define AUFS_MVDOWN_ROUPPER	(1 << 5)	/* do even if upper is RO */
+#define AUFS_MVDOWN_ROUPPER_R	(1 << 6)	/* did on upper RO */
+#define AUFS_MVDOWN_BRID_UPPER	(1 << 7)	/* upper brid */
+#define AUFS_MVDOWN_BRID_LOWER	(1 << 8)	/* lower brid */
+#define AUFS_MVDOWN_FHSM_LOWER	(1 << 9)	/* find fhsm attr for lower */
+#define AUFS_MVDOWN_STFS	(1 << 10)	/* req. stfs */
+#define AUFS_MVDOWN_STFS_FAILED	(1 << 11)	/* output: stfs is unusable */
+#define AUFS_MVDOWN_BOTTOM	(1 << 12)	/* output: no more lowers */
+
+/* index for move-down */
+enum {
+	AUFS_MVDOWN_UPPER,
+	AUFS_MVDOWN_LOWER,
+	AUFS_MVDOWN_NARRAY
+};
+
+/*
+ * additional info of move-down
+ * number of free blocks and inodes.
+ * subset of struct kstatfs, but smaller and always 64bit.
+ */
+struct aufs_stfs {
+	uint64_t	f_blocks;
+	uint64_t	f_bavail;
+	uint64_t	f_files;
+	uint64_t	f_ffree;
+};
+
+struct aufs_stbr {
+	int16_t			brid;	/* optional input */
+	int16_t			bindex;	/* output */
+	struct aufs_stfs	stfs;	/* output when AUFS_MVDOWN_STFS set */
+} __aligned(8);
+
+struct aufs_mvdown {
+	uint32_t		flags;			/* input/output */
+	struct aufs_stbr	stbr[AUFS_MVDOWN_NARRAY]; /* input/output */
+	int8_t			au_errno;		/* output */
+} __aligned(8);
+
+/* ---------------------------------------------------------------------- */
+
+union aufs_brinfo {
+	/* PATH_MAX may differ between kernel-space and user-space */
+	char	_spacer[4096];
+	struct {
+		int16_t	id;
+		int	perm;
+		char	path[0];
+	};
+} __aligned(8);
+
+/* ---------------------------------------------------------------------- */
+
+#define AuCtlType		'A'
+#define AUFS_CTL_RDU		_IOWR(AuCtlType, AuCtl_RDU, struct aufs_rdu)
+#define AUFS_CTL_RDU_INO	_IOWR(AuCtlType, AuCtl_RDU_INO, struct aufs_rdu)
+#define AUFS_CTL_WBR_FD		_IOW(AuCtlType, AuCtl_WBR_FD, \
+				     struct aufs_wbr_fd)
+#define AUFS_CTL_IBUSY		_IOWR(AuCtlType, AuCtl_IBUSY, struct aufs_ibusy)
+#define AUFS_CTL_MVDOWN		_IOWR(AuCtlType, AuCtl_MVDOWN, \
+				      struct aufs_mvdown)
+#define AUFS_CTL_BRINFO		_IOW(AuCtlType, AuCtl_BR, union aufs_brinfo)
+#define AUFS_CTL_FHSM_FD	_IOW(AuCtlType, AuCtl_FHSM_FD, int)
+
+#endif /* __AUFS_TYPE_H__ */
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 5f0fe01..950a481 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -36,9 +36,16 @@
 #define SCHED_FIFO		1
 #define SCHED_RR		2
 #define SCHED_BATCH		3
-/* SCHED_ISO: reserved but not implemented yet */
+/* SCHED_ISO: Implemented on MuQSS only */
 #define SCHED_IDLE		5
+#ifdef CONFIG_SCHED_MUQSS
+#define SCHED_ISO		4
+#define SCHED_IDLEPRIO		SCHED_IDLE
+#define SCHED_MAX		(SCHED_IDLEPRIO)
+#define SCHED_RANGE(policy)	((policy) <= SCHED_MAX)
+#else /* CONFIG_SCHED_MUQSS */
 #define SCHED_DEADLINE		6
+#endif /* CONFIG_SCHED_MUQSS */
 
 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
 #define SCHED_RESET_ON_FORK     0x40000000
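For reference, a task can request the new policy through the ordinary sched_setscheduler() interface. A minimal sketch on a MuQSS kernel follows; SCHED_ISO carries no static priority, so sched_priority stays 0, and the fallback define mirrors the value added above:

#include <sched.h>
#include <stdio.h>

#ifndef SCHED_ISO
#define SCHED_ISO 4	/* matches the definition added above */
#endif

int main(void)
{
	struct sched_param sp = { .sched_priority = 0 };

	if (sched_setscheduler(0, SCHED_ISO, &sp) < 0)
		perror("SCHED_ISO");	/* fails on kernels without MuQSS */
	return 0;
}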
diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 482898f..fa24d4e 100644
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -116,6 +116,9 @@ enum {
 #define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */
 #define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */
 #define TCP_REPAIR_WINDOW	29	/* Get/set window parameters */
+#define TCP_STEALTH		30
+#define TCP_STEALTH_INTEGRITY	31
+#define TCP_STEALTH_INTEGRITY_LEN	32
 
 struct tcp_repair_opt {
 	__u32	opt_code;
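As a usage sketch (not part of this patch), a client built with these option names would install the shared knock secret on its socket before connecting. The 64-byte secret length follows the published TCP Stealth examples and is an assumption here:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

#ifndef TCP_STEALTH
#define TCP_STEALTH	30	/* matches the definition added above */
#endif

int main(void)
{
	char secret[64] = "pre-shared knock secret";	/* padded with NULs */
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || setsockopt(fd, IPPROTO_TCP, TCP_STEALTH,
				 secret, sizeof(secret)) < 0)
		perror("TCP_STEALTH");	/* fails unless CONFIG_TCP_STEALTH is set */
	return 0;
}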
diff --git a/include/uapi/linux/vt.h b/include/uapi/linux/vt.h
index f690348..18db275 100644
--- a/include/uapi/linux/vt.h
+++ b/include/uapi/linux/vt.h
@@ -3,11 +3,24 @@
 
 
 /*
+ * This definition exists solely so that packages such as splashutils can
+ * build, because they cannot know that NR_TTY_DEVICES is defined in the
+ * kernel configuration.
+ */
+#ifndef CONFIG_NR_TTY_DEVICES
+#define CONFIG_NR_TTY_DEVICES 63
+#endif
+
+/*
  * These constants are also useful for user-level apps (e.g., VC
  * resizing).
  */
 #define MIN_NR_CONSOLES 1       /* must be at least 1 */
-#define MAX_NR_CONSOLES	63	/* serial lines start at 64 */
+/*
+ * NR_TTY_DEVICES:
+ * Value MUST be at least 12 and must never be higher than 63
+ */
+#define MAX_NR_CONSOLES CONFIG_NR_TTY_DEVICES	/* serial lines start above this */
 		/* Note: the ioctl VT_GETSTATE does not work for
 		   consoles 16 and higher (since it returns a short) */
 
diff --git a/init/Kconfig b/init/Kconfig
index cac3f09..bc6cd04 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -28,6 +28,47 @@ config BUILDTIME_EXTABLE_SORT
 
 menu "General setup"
 
+config SCHED_MUQSS
+	bool "MuQSS cpu scheduler"
+	select HIGH_RES_TIMERS
+	default n
+	---help---
+	  The Multiple Queue Skiplist Scheduler provides excellent interactivity
+	  and responsiveness on the desktop, and highly scalable, deterministic
+	  low latency on any hardware.
+
+config PCK_INTERACTIVE
+	bool "Tune kernel for interactivity"
+	default y
+	help
+	  Tunes the kernel for responsiveness at the cost of throughput and power usage.
+
+	  --- Virtual Memory Subsystem ---------------------------
+
+	    Mem dirty before bg writeback..:  10 %  ->  20 %
+	    Mem dirty before sync writeback:  20 %  ->  50 %
+
+	  --- Block Layer ----------------------------------------
+
+	    Block Layer Queue Depth........: 128    ->  512
+
+	  --- CPU Scheduler (CFS) --------------------------------
+
+	    Scheduling latency.............:   6    ->   3    ms
+	    Minimal granularity............:   0.75 ->   0.3  ms
+	    Wakeup granularity.............:   1    ->   0.5  ms
+	    CPU migration cost.............:   0.5  ->   0.25 ms
+	    Bandwidth slice size...........:   5    ->   3    ms
+
+	  --- CPU Scheduler (MuQSS) ------------------------------
+
+	    Scheduling interval............:   6    ->   3    ms
+	    ISO task max realtime use......:  70 %  ->  25 %
+
+	  --- CPU Frequency Scaling ------------------------------
+
+	    Ondemand down scaling factor...:   1    ->  10
+
 config BROKEN
 	bool
 
@@ -540,7 +581,7 @@ config CONTEXT_TRACKING
 config CONTEXT_TRACKING_FORCE
 	bool "Force context tracking"
 	depends on CONTEXT_TRACKING
-	default y if !NO_HZ_FULL
+	default y if !NO_HZ_FULL && !SCHED_MUQSS
 	help
 	  The major pre-requirement for full dynticks to work is to
 	  support the context tracking subsystem. But there are also
@@ -930,6 +971,7 @@ config NUMA_BALANCING
 	depends on ARCH_SUPPORTS_NUMA_BALANCING
 	depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
 	depends on SMP && NUMA && MIGRATION
+	depends on !SCHED_MUQSS
 	help
 	  This option adds support for automatic NUMA aware memory/task placement.
 	  The mechanism is quite primitive and is based on migrating memory when
@@ -1032,9 +1074,13 @@ menuconfig CGROUP_SCHED
 	help
 	  This feature lets CPU scheduler recognize task groups and control CPU
 	  bandwidth allocation to such task groups. It uses cgroups to group
-	  tasks.
+	  tasks. In combination with MuQSS this is purely a stub that creates
+	  the files associated with the CPU controller cgroup, but most of the
+	  controls do nothing. This is useful for environments and applications
+	  that only work if this control group is present.
 
-if CGROUP_SCHED
+if CGROUP_SCHED && !SCHED_MUQSS
 config FAIR_GROUP_SCHED
 	bool "Group scheduling for SCHED_OTHER"
 	depends on CGROUP_SCHED
@@ -1130,6 +1176,7 @@ config CGROUP_DEVICE
 
 config CGROUP_CPUACCT
 	bool "Simple CPU accounting controller"
+	depends on !SCHED_MUQSS
 	help
 	  Provides a simple controller for monitoring the
 	  total CPU consumed by the tasks in a cgroup.
@@ -1228,6 +1275,7 @@ endif # NAMESPACES
 
 config SCHED_AUTOGROUP
 	bool "Automatic process group scheduling"
+	depends on !SCHED_MUQSS
 	select CGROUPS
 	select CGROUP_SCHED
 	select FAIR_GROUP_SCHED
@@ -1320,6 +1368,13 @@ config CC_OPTIMIZE_FOR_PERFORMANCE
 	  with the "-O2" compiler flag for best performance and most
 	  helpful compile-time warnings.
 
+config CC_OPTIMIZE_HARDER
+	bool "Optimize harder"
+	help
+	  This option will pass "-O3" to your compiler resulting in a
+	  larger and faster kernel. The more complex optimizations also
+	  increase compilation time and may affect stability.
+
 config CC_OPTIMIZE_FOR_SIZE
 	bool "Optimize for size"
 	help
diff --git a/init/main.c b/init/main.c
index a8a58e2..76a2d14 100644
--- a/init/main.c
+++ b/init/main.c
@@ -792,7 +792,6 @@ int __init_or_module do_one_initcall(initcall_t fn)
 	return ret;
 }
 
-
 extern initcall_t __initcall_start[];
 extern initcall_t __initcall0_start[];
 extern initcall_t __initcall1_start[];
@@ -951,6 +950,8 @@ static int __ref kernel_init(void *unused)
 
 	rcu_end_inkernel_boot();
 
+	print_scheduler_version();
+
 	if (ramdisk_execute_command) {
 		ret = run_init_process(ramdisk_execute_command);
 		if (!ret)
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 2a202a8..1532e4e 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -4,7 +4,8 @@
 
 choice
 	prompt "Timer frequency"
-	default HZ_250
+	default HZ_128 if SCHED_MUQSS
+	default HZ_250 if !SCHED_MUQSS
 	help
 	 Allows the configuration of the timer frequency. It is customary
 	 to have the timer interrupt run at 1000 Hz but 100 Hz may be more
@@ -23,6 +24,16 @@ choice
 	  with lots of processors that may show reduced performance if
 	  too many timer interrupts are occurring.
 
+	config HZ_128
+		bool "128 HZ"
+	help
+	  128 Hz is a suitable choice in combination with MuQSS, which does not
+	  rely on ticks for rescheduling interrupts and is not Hz-limited for
+	  timeouts and sleeps from either the kernel or userspace.
+	  This allows us to benefit from the lower overhead and higher
+	  throughput of fewer timer ticks, plus the micro-optimisation of Hz
+	  divisions being a power of 2.
+
 	config HZ_250
 		bool "250 HZ"
 	help
@@ -50,6 +61,7 @@ endchoice
 config HZ
 	int
 	default 100 if HZ_100
+	default 128 if HZ_128
 	default 250 if HZ_250
 	default 300 if HZ_300
 	default 1000 if HZ_1000
diff --git a/kernel/Makefile b/kernel/Makefile
index e2ec54e..7241e34 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y     = fork.o exec_domain.o panic.o \
 	    extable.o params.o \
 	    kthread.o sys_ni.o nsproxy.o \
 	    notifier.o ksysfs.o cred.o reboot.o \
-	    async.o range.o smpboot.o
+	    async.o range.o smpboot.o skip_list.o
 
 obj-$(CONFIG_MULTIUSER) += groups.o
 
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 435c14a..a80d56d 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -104,7 +104,7 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
 	 */
 	t1 = tsk->sched_info.pcount;
 	t2 = tsk->sched_info.run_delay;
-	t3 = tsk->se.sum_exec_runtime;
+	t3 = tsk_seruntime(tsk);
 
 	d->cpu_count += t1;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 091a78b..c22b37f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -134,7 +134,7 @@ static void __exit_signal(struct task_struct *tsk)
 	sig->inblock += task_io_get_inblock(tsk);
 	sig->oublock += task_io_get_oublock(tsk);
 	task_io_accounting_add(&sig->ioac, &tsk->ioac);
-	sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
+	sig->sum_sched_runtime += tsk_seruntime(tsk);
 	sig->nr_threads--;
 	__unhash_process(tsk, group_dead);
 	write_sequnlock(&sig->stats_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index beb3172..adb2389 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -457,7 +457,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 				goto fail_nomem;
 			charge = len;
 		}
-		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		tmp = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
@@ -477,7 +477,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			struct inode *inode = file_inode(file);
 			struct address_space *mapping = file->f_mapping;
 
-			get_file(file);
+			vma_get_file(tmp);
 			if (tmp->vm_flags & VM_DENYWRITE)
 				atomic_dec(&inode->i_writecount);
 			i_mmap_lock_write(mapping);
@@ -510,7 +510,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		__vma_link_rb(mm, tmp, rb_link, rb_parent);
 		rb_link = &tmp->vm_rb.rb_right;
 		rb_parent = &tmp->vm_rb;
-
+		uksm_vma_add_new(tmp);
 		mm->map_count++;
 		retval = copy_page_range(mm, oldmm, mpnt);
 
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 5e59b83..77bdf98 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -15,13 +15,18 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-obj-y += core.o loadavg.o clock.o cputime.o
+ifdef CONFIG_SCHED_MUQSS
+obj-y += MuQSS.o clock.o
+else
+obj-y += core.o loadavg.o clock.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o swait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
+obj-$(CONFIG_SMP) += cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
-obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+endif
+obj-y += wait.o swait.o completion.o idle.o cputime.o
+obj-$(CONFIG_SMP) += cpupri.o
+obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git b/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
new file mode 100644
index 0000000..ba45cf3
--- /dev/null
+++ b/kernel/sched/MuQSS.c
@@ -0,0 +1,7898 @@
+/*
+ *  kernel/sched/MuQSS.c, was kernel/sched.c
+ *
+ *  Kernel scheduler and related syscalls
+ *
+ *  Copyright (C) 1991-2002  Linus Torvalds
+ *
+ *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
+ *		make semaphores SMP safe
+ *  1998-11-19	Implemented schedule_timeout() and related stuff
+ *		by Andrea Arcangeli
+ *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
+ *		hybrid priority-list and round-robin design with
+ *		an array-switch method of distributing timeslices
+ *		and per-CPU runqueues.  Cleanups and useful suggestions
+ *		by Davide Libenzi, preemptible kernel bits by Robert Love.
+ *  2003-09-03	Interactivity tuning by Con Kolivas.
+ *  2004-04-02	Scheduler domains code by Nick Piggin
+ *  2007-04-15  Work begun on replacing all interactivity tuning with a
+ *              fair scheduling design by Con Kolivas.
+ *  2007-05-05  Load balancing (smp-nice) and other improvements
+ *              by Peter Williams
+ *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
+ *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
+ *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
+ *              Thomas Gleixner, Mike Kravetz
+ *  2009-08-13	Brainfuck deadline scheduling policy by Con Kolivas deletes
+ *              a whole lot of those previous things.
+ *  2016-10-01  Multiple Queue Skiplist Scheduler scalable evolution of BFS
+ * 		scheduler by Con Kolivas.
+ */
+
+#include <linux/kasan.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <asm/uaccess.h>
+#include <linux/highmem.h>
+#include <linux/mmu_context.h>
+#include <linux/interrupt.h>
+#include <linux/capability.h>
+#include <linux/completion.h>
+#include <linux/kernel_stat.h>
+#include <linux/debug_locks.h>
+#include <linux/perf_event.h>
+#include <linux/security.h>
+#include <linux/notifier.h>
+#include <linux/profile.h>
+#include <linux/freezer.h>
+#include <linux/vmalloc.h>
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/smp.h>
+#include <linux/threads.h>
+#include <linux/timer.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/syscalls.h>
+#include <linux/sched/sysctl.h>
+#include <linux/times.h>
+#include <linux/tsacct_kern.h>
+#include <linux/kprobes.h>
+#include <linux/delayacct.h>
+#include <linux/log2.h>
+#include <linux/bootmem.h>
+#include <linux/ftrace.h>
+#include <linux/slab.h>
+#include <linux/init_task.h>
+#include <linux/binfmts.h>
+#include <linux/context_tracking.h>
+#include <linux/sched/prio.h>
+#include <linux/tick.h>
+#include <linux/skip_list.h>
+
+#include <asm/irq_regs.h>
+#include <asm/switch_to.h>
+#include <asm/tlb.h>
+#include <asm/unistd.h>
+#include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
+
+#include "cpupri.h"
+#include "../workqueue_internal.h"
+#include "../smpboot.h"
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
+#include "MuQSS.h"
+
+#define rt_prio(prio)		unlikely((prio) < MAX_RT_PRIO)
+#define rt_task(p)		rt_prio((p)->prio)
+#define batch_task(p)		(unlikely((p)->policy == SCHED_BATCH))
+#define is_rt_policy(policy)	((policy) == SCHED_FIFO || \
+					(policy) == SCHED_RR)
+#define has_rt_policy(p)	unlikely(is_rt_policy((p)->policy))
+
+#define is_idle_policy(policy)	((policy) == SCHED_IDLEPRIO)
+#define idleprio_task(p)	unlikely(is_idle_policy((p)->policy))
+#define task_running_idle(p)	unlikely((p)->prio == IDLE_PRIO)
+
+#define is_iso_policy(policy)	((policy) == SCHED_ISO)
+#define iso_task(p)		unlikely(is_iso_policy((p)->policy))
+#define task_running_iso(p)	unlikely((p)->prio == ISO_PRIO)
+
+#define rq_idle(rq)		((rq)->rq_prio == PRIO_LIMIT)
+
+#define ISO_PERIOD		(5 * HZ)
+
+#define STOP_PRIO		(MAX_RT_PRIO - 1)
+
+/*
+ * Some helpers for converting to/from various scales. Use shifts to get
+ * approximate powers of ten for less overhead.
+ */
+#define JIFFIES_TO_NS(TIME)	((TIME) * (1073741824 / HZ))
+#define JIFFY_NS		(1073741824 / HZ)
+#define JIFFY_US		(1048576 / HZ)
+#define NS_TO_JIFFIES(TIME)	((TIME) / JIFFY_NS)
+#define HALF_JIFFY_NS		(1073741824 / HZ / 2)
+#define HALF_JIFFY_US		(1048576 / HZ / 2)
+#define MS_TO_NS(TIME)		((TIME) << 20)
+#define MS_TO_US(TIME)		((TIME) << 10)
+#define NS_TO_MS(TIME)		((TIME) >> 20)
+#define NS_TO_US(TIME)		((TIME) >> 10)
+#define US_TO_NS(TIME)		((TIME) << 10)
+
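These are deliberate approximations: 2^20 = 1048576 is close to 10^6 and 2^30 is close to 10^9, so a converted "millisecond" comes out about 4.9% long, which is fine for deadline bookkeeping and far cheaper than a division. A small user-space check of the error, assuming nothing beyond the macro copied from above:

#include <stdio.h>

#define MS_TO_NS(TIME)	((TIME) << 20)

int main(void)
{
	long ms = 6;	/* e.g. the non-interactive rr_interval default */
	long approx = MS_TO_NS(ms), exact = ms * 1000000L;

	printf("%ld ms -> %ld ns (exact %ld ns, error %+.1f%%)\n",
	       ms, approx, exact, (approx - exact) * 100.0 / exact);
	return 0;
}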
+#define RESCHED_US	(100) /* Reschedule if less than this many μs left */
+
+void print_scheduler_version(void)
+{
+	printk(KERN_INFO "MuQSS CPU scheduler v0.144 by Con Kolivas.\n");
+}
+
+/*
+ * This is the time all tasks within the same priority round robin.
+ * Value is in ms and set to a minimum of 6ms. Scales with number of cpus.
+ * Tunable via /proc interface.
+ */
+#ifdef CONFIG_PCK_INTERACTIVE
+int rr_interval __read_mostly = 3;
+#else
+int rr_interval __read_mostly = 6;
+#endif
+
+/* Tunable to choose whether to prioritise latency or throughput, simple
+ * binary yes or no */
+
+int sched_interactive __read_mostly = 1;
+
+/*
+ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks
+ * are allowed to run as real time tasks, averaged over a rolling five second
+ * period. This is the total over all online cpus.
+ */
+#ifdef CONFIG_PCK_INTERACTIVE
+int sched_iso_cpu __read_mostly = 25;
+#else
+int sched_iso_cpu __read_mostly = 70;
+#endif
+
+/*
+ * The relative length of deadline for each priority(nice) level.
+ */
+static int prio_ratios[NICE_WIDTH] __read_mostly;
+
+/*
+ * The quota handed out to tasks of all priority levels when refilling their
+ * time_slice.
+ */
+static inline int timeslice(void)
+{
+	return MS_TO_US(rr_interval);
+}
+
+static bool sched_smp_initialized __read_mostly;
+
+/*
+ * The global runqueue data that all CPUs work off. Contains atomic variables
+ * and a cpu bitmap set atomically.
+ */
+struct global_rq {
+#ifdef CONFIG_SMP
+	atomic_t nr_running ____cacheline_aligned_in_smp;
+	atomic_t nr_uninterruptible ____cacheline_aligned_in_smp;
+	atomic64_t nr_switches ____cacheline_aligned_in_smp;
+	cpumask_t cpu_idle_map ____cacheline_aligned_in_smp;
+#else
+	atomic_t nr_running ____cacheline_aligned;
+	atomic_t nr_uninterruptible ____cacheline_aligned;
+	atomic64_t nr_switches ____cacheline_aligned;
+#endif
+};
+
+#ifdef CONFIG_SMP
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+	atomic_t refcount;
+	atomic_t rto_count;
+	struct rcu_head rcu;
+	cpumask_var_t span;
+	cpumask_var_t online;
+
+	/*
+	 * The "RT overload" flag: it gets set if a CPU has more than
+	 * one runnable RT task.
+	 */
+	cpumask_var_t rto_mask;
+	struct cpupri cpupri;
+};
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+#endif /* CONFIG_SMP */
+
+/* There can be only one */
+#ifdef CONFIG_SMP
+static struct global_rq grq ____cacheline_aligned_in_smp;
+#else
+static struct global_rq grq ____cacheline_aligned;
+#endif
+
+static DEFINE_MUTEX(sched_hotcpu_mutex);
+
+/* cpus with isolated domains */
+cpumask_var_t cpu_isolated_map;
+
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#ifdef CONFIG_SMP
+struct rq *cpu_rq(int cpu)
+{
+	return &per_cpu(runqueues, (cpu));
+}
+#define task_rq(p)		cpu_rq(task_cpu(p))
+#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
+/*
+ * sched_domains_mutex serialises calls to init_sched_domains,
+ * detach_destroy_domains and partition_sched_domains.
+ */
+DEFINE_MUTEX(sched_domains_mutex);
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+       return 0*SD_ASYM_PACKING;
+}
+#else
+struct rq *uprq;
+#endif /* CONFIG_SMP */
+
+#ifdef CONFIG_SMP
+static inline int cpu_of(struct rq *rq)
+{
+	return rq->cpu;
+}
+#else /* CONFIG_SMP */
+static inline int cpu_of(struct rq *rq)
+{
+	return 0;
+}
+#endif
+
+#include "stats.h"
+
+#ifndef prepare_arch_switch
+# define prepare_arch_switch(next)	do { } while (0)
+#endif
+#ifndef finish_arch_switch
+# define finish_arch_switch(prev)	do { } while (0)
+#endif
+#ifndef finish_arch_post_lock_switch
+# define finish_arch_post_lock_switch()	do { } while (0)
+#endif
+
+/*
+ * All common locking functions are performed on rq->lock. rq->clock is local
+ * to the CPU accessing it so it can be modified just with interrupts disabled
+ * when we're not updating niffies.
+ * Looking up task_rq must be done under rq->lock to be safe.
+ */
+static void update_rq_clock_task(struct rq *rq, s64 delta);
+
+static inline void update_rq_clock(struct rq *rq)
+{
+	s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+
+	if (unlikely(delta < 0))
+		return;
+	rq->clock += delta;
+	update_rq_clock_task(rq, delta);
+}
+
+/*
+ * Niffies are a globally increasing nanosecond counter. They're only used by
+ * update_load_avg and time_slice_expired, however deadlines are based on them
+ * across CPUs. Update them whenever we will call one of those functions, and
+ * synchronise them across CPUs whenever we hold both runqueue locks.
+ */
+static inline void update_clocks(struct rq *rq)
+{
+	s64 ndiff, minndiff;
+	long jdiff;
+
+	update_rq_clock(rq);
+	ndiff = rq->clock - rq->old_clock;
+	rq->old_clock = rq->clock;
+	jdiff = jiffies - rq->last_jiffy;
+
+	/* Subtract any niffies added by balancing with other rqs */
+	ndiff -= rq->niffies - rq->last_niffy;
+	minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies;
+	if (minndiff < 0)
+		minndiff = 0;
+	ndiff = max(ndiff, minndiff);
+	rq->niffies += ndiff;
+	rq->last_niffy = rq->niffies;
+	if (jdiff) {
+		rq->last_jiffy += jdiff;
+		rq->last_jiffy_niffies = rq->niffies;
+	}
+}
+
+static inline int task_current(struct rq *rq, struct task_struct *p)
+{
+	return rq->curr == p;
+}
+
+static inline int task_running(struct rq *rq, struct task_struct *p)
+{
+#ifdef CONFIG_SMP
+	return p->on_cpu;
+#else
+	return task_current(rq, p);
+#endif
+}
+
+static inline int task_on_rq_queued(struct task_struct *p)
+{
+	return p->on_rq == TASK_ON_RQ_QUEUED;
+}
+
+static inline int task_on_rq_migrating(struct task_struct *p)
+{
+	return p->on_rq == TASK_ON_RQ_MIGRATING;
+}
+
+static inline void rq_lock(struct rq *rq)
+	__acquires(rq->lock)
+{
+	raw_spin_lock(&rq->lock);
+}
+
+static inline int rq_trylock(struct rq *rq)
+	__acquires(rq->lock)
+{
+	return raw_spin_trylock(&rq->lock);
+}
+
+static inline void rq_unlock(struct rq *rq)
+	__releases(rq->lock)
+{
+	raw_spin_unlock(&rq->lock);
+}
+
+static inline struct rq *this_rq_lock(void)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	local_irq_disable();
+	rq = this_rq();
+	raw_spin_lock(&rq->lock);
+
+	return rq;
+}
+
+/*
+ * Any time we have two runqueues locked we use that as an opportunity to
+ * synchronise niffies to the highest value as idle ticks may have artificially
+ * kept niffies low on one CPU and the truth can only be later.
+ */
+static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2)
+{
+	if (rq1->niffies > rq2->niffies)
+		rq2->niffies = rq1->niffies;
+	else
+		rq1->niffies = rq2->niffies;
+}
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+
+/* For when we know rq1 != rq2 */
+static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	if (rq1 < rq2) {
+		raw_spin_lock(&rq1->lock);
+		raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+	} else {
+		raw_spin_lock(&rq2->lock);
+		raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+	}
+}
+
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	BUG_ON(!irqs_disabled());
+	if (rq1 == rq2) {
+		raw_spin_lock(&rq1->lock);
+		__acquire(rq2->lock);	/* Fake it out ;) */
+	} else
+		__double_rq_lock(rq1, rq2);
+	synchronise_niffies(rq1, rq2);
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+	__releases(rq1->lock)
+	__releases(rq2->lock)
+{
+	raw_spin_unlock(&rq1->lock);
+	if (rq1 != rq2)
+		raw_spin_unlock(&rq2->lock);
+	else
+		__release(rq2->lock);
+}
+
+/* Must be sure rq1 != rq2 and irqs are disabled */
+static inline void lock_second_rq(struct rq *rq1, struct rq *rq2)
+	__releases(rq1->lock)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	BUG_ON(!irqs_disabled());
+	if (unlikely(!raw_spin_trylock(&rq2->lock))) {
+		raw_spin_unlock(&rq1->lock);
+		__double_rq_lock(rq1, rq2);
+	}
+	synchronise_niffies(rq1, rq2);
+}
+
+static inline void lock_all_rqs(void)
+{
+	int cpu;
+
+	preempt_disable();
+	for_each_possible_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+
+		do_raw_spin_lock(&rq->lock);
+	}
+}
+
+static inline void unlock_all_rqs(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+
+		do_raw_spin_unlock(&rq->lock);
+	}
+	preempt_enable();
+}
+
+/* Specially nest trylock an rq */
+static inline bool trylock_rq(struct rq *this_rq, struct rq *rq)
+{
+	if (unlikely(!do_raw_spin_trylock(&rq->lock)))
+		return false;
+	spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_);
+	synchronise_niffies(this_rq, rq);
+	return true;
+}
+
+/* Unlock a specially nested trylocked rq */
+static inline void unlock_rq(struct rq *rq)
+{
+	spin_release(&rq->lock.dep_map, 1, _RET_IP_);
+	do_raw_spin_unlock(&rq->lock);
+}
+
+static inline void rq_lock_irq(struct rq *rq)
+	__acquires(rq->lock)
+{
+	raw_spin_lock_irq(&rq->lock);
+}
+
+static inline void rq_unlock_irq(struct rq *rq)
+	__releases(rq->lock)
+{
+	raw_spin_unlock_irq(&rq->lock);
+}
+
+static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags)
+	__acquires(rq->lock)
+{
+	raw_spin_lock_irqsave(&rq->lock, *flags);
+}
+
+static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags)
+	__releases(rq->lock)
+{
+	raw_spin_unlock_irqrestore(&rq->lock, *flags);
+}
+
+static inline struct rq
+*task_rq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(p->pi_lock)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	while (42) {
+		raw_spin_lock_irqsave(&p->pi_lock, *flags);
+		rq = task_rq(p);
+		raw_spin_lock(&rq->lock);
+		if (likely(rq == task_rq(p)))
+			break;
+		raw_spin_unlock(&rq->lock);
+		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+	}
+	return rq;
+}
+
+static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+	__releases(rq->lock)
+	__releases(p->pi_lock)
+{
+	rq_unlock(rq);
+	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+}
+
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	lockdep_assert_held(&p->pi_lock);
+
+	while (42) {
+		rq = task_rq(p);
+		raw_spin_lock(&rq->lock);
+		if (likely(rq == task_rq(p)))
+			break;
+		raw_spin_unlock(&rq->lock);
+	}
+	return rq;
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+{
+	rq_unlock(rq);
+}
+
+/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, mask)						\
+	({								\
+		typeof(ptr) _ptr = (ptr);				\
+		typeof(mask) _mask = (mask);				\
+		typeof(*_ptr) _old, _val = *_ptr;			\
+									\
+		for (;;) {						\
+			_old = cmpxchg(_ptr, _val, _val | _mask);	\
+			if (_old == _val)				\
+				break;					\
+			_val = _old;					\
+		}							\
+	_old;								\
+})
+
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+	struct thread_info *ti = task_thread_info(p);
+	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ *
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+	struct thread_info *ti = task_thread_info(p);
+	typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+
+	for (;;) {
+		if (!(val & _TIF_POLLING_NRFLAG))
+			return false;
+		if (val & _TIF_NEED_RESCHED)
+			return true;
+		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+		if (old == val)
+			break;
+		val = old;
+	}
+	return true;
+}
+
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+	set_tsk_need_resched(p);
+	return true;
+}
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+	return false;
+}
+#endif
+#endif
+
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+	struct wake_q_node *node = &task->wake_q;
+
+	/*
+	 * Atomically grab the task, if ->wake_q is !nil already it means
+	 * its already queued (either by us or someone else) and will get the
+	 * wakeup due to that.
+	 *
+	 * This cmpxchg() implies a full barrier, which pairs with the write
+	 * barrier implied by the wakeup in wake_up_q().
+	 */
+	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+		return;
+
+	get_task_struct(task);
+
+	/*
+	 * The head is context local, there can be no concurrency.
+	 */
+	*head->lastp = node;
+	head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+	struct wake_q_node *node = head->first;
+
+	while (node != WAKE_Q_TAIL) {
+		struct task_struct *task;
+
+		task = container_of(node, struct task_struct, wake_q);
+		BUG_ON(!task);
+		/* task can safely be re-inserted now */
+		node = node->next;
+		task->wake_q.next = NULL;
+
+		/*
+		 * wake_up_process() implies a wmb() to pair with the queueing
+		 * in wake_q_add() so as not to miss wakeups.
+		 */
+		wake_up_process(task);
+		put_task_struct(task);
+	}
+}
+
+static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
+{
+	next->on_cpu = 1;
+}
+
+static inline void smp_sched_reschedule(int cpu)
+{
+	if (likely(cpu_online(cpu)))
+		smp_send_reschedule(cpu);
+}
+
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+void resched_task(struct task_struct *p)
+{
+	int cpu;
+#ifdef CONFIG_LOCKDEP
+	struct rq *rq = task_rq(p);
+
+	lockdep_assert_held(&rq->lock);
+#endif
+	if (test_tsk_need_resched(p))
+		return;
+
+	cpu = task_cpu(p);
+	if (cpu == smp_processor_id()) {
+		set_tsk_need_resched(p);
+		set_preempt_need_resched();
+		return;
+	}
+
+	if (set_nr_and_not_polling(p))
+		smp_sched_reschedule(cpu);
+	else
+		trace_sched_wake_idle_without_ipi(cpu);
+}
+
+/*
+ * A task that is not running or queued will not have a node set.
+ * A task that is queued but not running will have a node set.
+ * A task that is currently running will have ->on_cpu set but no node set.
+ */
+static inline bool task_queued(struct task_struct *p)
+{
+	return !skiplist_node_empty(&p->node);
+}
+
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
+static inline void resched_if_idle(struct rq *rq);
+
+/* Dodgy workaround till we figure out where the softirqs are going */
+static inline void do_pending_softirq(struct rq *rq, struct task_struct *next)
+{
+	if (unlikely(next == rq->idle && local_softirq_pending() && !in_interrupt()))
+		do_softirq_own_stack();
+}
+
+static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
+{
+#ifdef CONFIG_SMP
+	/*
+	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
+	 * We must ensure this doesn't happen until the switch is completely
+	 * finished.
+	 *
+	 * In particular, the load of prev->state in finish_task_switch() must
+	 * happen before this.
+	 *
+	 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
+	 */
+	smp_store_release(&prev->on_cpu, 0);
+#endif
+#ifdef CONFIG_DEBUG_SPINLOCK
+	/* this is a valid case when another task releases the spinlock */
+	rq->lock.owner = current;
+#endif
+	/*
+	 * If we are tracking spinlock dependencies then we have to
+	 * fix up the runqueue lock - which gets 'carried over' from
+	 * prev into current:
+	 */
+	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
+
+#ifdef CONFIG_SMP
+	/*
+	 * If prev was marked as migrating to another CPU in return_task, drop
+	 * the local runqueue lock but leave interrupts disabled and grab the
+	 * remote lock we're migrating it to before enabling them.
+	 */
+	if (unlikely(task_on_rq_migrating(prev))) {
+		sched_info_dequeued(rq, prev);
+		/*
+		 * We move the ownership of prev to the new cpu now. ttwu can't
+		 * activate prev to the wrong cpu since it has to grab this
+		 * runqueue in ttwu_remote.
+		 */
+		task_thread_info(prev)->cpu = prev->wake_cpu;
+		raw_spin_unlock(&rq->lock);
+
+		raw_spin_lock(&prev->pi_lock);
+		rq = __task_rq_lock(prev);
+		/* Check that someone else hasn't already queued prev */
+		if (likely(!task_queued(prev))) {
+			enqueue_task(rq, prev, 0);
+			prev->on_rq = TASK_ON_RQ_QUEUED;
+			/* Wake up the CPU if it's not already running */
+			resched_if_idle(rq);
+		}
+		raw_spin_unlock(&prev->pi_lock);
+	}
+#endif
+	rq_unlock(rq);
+
+	do_pending_softirq(rq, current);
+
+	local_irq_enable();
+}
+
+static inline bool deadline_before(u64 deadline, u64 time)
+{
+	return (deadline < time);
+}
+
+/*
+ * Deadline is "now" in niffies plus an offset by priority. Setting the
+ * deadline is the key to everything: it distributes cpu fairly amongst tasks
+ * of the same nice value, it proportions cpu according to nice level, and it
+ * means the task that woke up longest ago has the earliest deadline, ensuring
+ * that interactive tasks get low latency on wake up. The CPU proportion works
+ * out to the square of the virtual deadline difference, so this equation
+ * gives nice 19 roughly 3% of the CPU of a nice 0 task.
+ */
+static inline u64 prio_deadline_diff(int user_prio)
+{
+	return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
+}
+
+static inline u64 task_deadline_diff(struct task_struct *p)
+{
+	return prio_deadline_diff(TASK_USER_PRIO(p));
+}
+
+static inline u64 static_deadline_diff(int static_prio)
+{
+	return prio_deadline_diff(USER_PRIO(static_prio));
+}
+
+static inline int longest_deadline_diff(void)
+{
+	return prio_deadline_diff(39);
+}
+
+static inline int ms_longest_deadline_diff(void)
+{
+	return NS_TO_MS(longest_deadline_diff());
+}
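To make the "3%" figure concrete: prio_ratios is not initialised in this hunk, but it is conventionally filled so that each nice level's ratio is 10% larger than the previous one, starting from 128 at nice -20; treat that as an assumption for the sketch below. Under it, the relative deadline offset of nice 19 versus nice 0 and the resulting CPU share work out as follows:

#include <stdio.h>

int main(void)
{
	/* assumed initialisation (not shown in this hunk): 128 at nice -20
	 * (user prio 0), each following level 10% larger */
	int prio_ratios[40], i;
	double ddiff, share;

	prio_ratios[0] = 128;
	for (i = 1; i < 40; i++)
		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;

	/* nice 0 is user prio 20, nice 19 is user prio 39; CPU share goes as
	 * the square of the relative deadline difference */
	ddiff = (double)prio_ratios[39] / prio_ratios[20];
	share = 100.0 / (ddiff * ddiff);
	printf("nice 19 deadline offset is %.1fx nice 0, giving ~%.1f%% CPU\n",
	       ddiff, share);	/* roughly 6.1x and ~2.7%, i.e. about 3% */
	return 0;
}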
+
+static inline int rq_load(struct rq *rq)
+{
+	return rq->sl->entries + !rq_idle(rq);
+}
+
+static inline bool rq_local(struct rq *rq);
+
+/*
+ * Update the load average for feeding into cpu frequency governors. Use a
+ * rough estimate of a rolling average with ~ time constant of 32ms.
+ * 80/128 ~ 0.63. Multiplying by 80/32768/128 equals multiplying by 5/262144.
+ * Make sure a call to update_clocks has been made before calling this to get
+ * an updated rq->niffies.
+ */
+static void update_load_avg(struct rq *rq)
+{
+	unsigned long us_interval;
+	long load, curload;
+
+	if (unlikely(rq->niffies <= rq->load_update))
+		return;
+
+	us_interval = NS_TO_US(rq->niffies - rq->load_update);
+	curload = rq_load(rq);
+	load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144);
+	if (unlikely(load < 0))
+		load = 0;
+	load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144;
+	/* If this CPU has all the load, make it ramp up quickly */
+	if (curload > load && curload >= atomic_read(&grq.nr_running))
+		load = curload;
+	rq->load_avg = load;
+
+	rq->load_update = rq->niffies;
+	if (likely(rq_local(rq)))
+		cpufreq_trigger(rq->niffies, rq->load_avg);
+}
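A back-of-the-envelope check of those constants (a stand-alone sketch, not scheduler code): subtracting load * us * 5 / 262144 over a single 32768 us window removes 80/128 = 0.625 of the load, leaving about 0.375 of it, which is close to 1/e and therefore a time constant of roughly 32 ms:

#include <stdio.h>

int main(void)
{
	long long load = 1000000, us_interval = 32768;	/* one ~32 ms window */

	load -= load * us_interval * 5 / 262144;
	printf("remaining after 32.768 ms: %lld (%.3f of original)\n",
	       load, load / 1000000.0);	/* 0.375, close to 1/e = 0.368 */
	return 0;
}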
+
+/*
+ * Removing from the runqueue. Enter with rq locked. Deleting a task
+ * from the skip list is done via the stored node reference in the task struct
+ * and does not require a full look up. Thus it occurs in O(k) time where k
+ * is the "level" of the list the task was stored at - usually < 4, max 8.
+ */
+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
+{
+	skiplist_delete(rq->sl, &p->node);
+	rq->best_key = rq->node.next[0]->key;
+	update_clocks(rq);
+	if (!(flags & DEQUEUE_SAVE))
+		sched_info_dequeued(task_rq(p), p);
+	update_load_avg(rq);
+}
+
+#ifdef CONFIG_PREEMPT_RCU
+static bool rcu_read_critical(struct task_struct *p)
+{
+	return p->rcu_read_unlock_special.b.blocked;
+}
+#else /* CONFIG_PREEMPT_RCU */
+#define rcu_read_critical(p) (false)
+#endif /* CONFIG_PREEMPT_RCU */
+
+/*
+ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
+ * an idle task, we ensure none of the following conditions are met.
+ */
+static bool idleprio_suitable(struct task_struct *p)
+{
+	return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) &&
+		!signal_pending(p) && !rcu_read_critical(p) && !freezing(p));
+}
+
+/*
+ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check
+ * that the iso_refractory flag is not set.
+ */
+static inline bool isoprio_suitable(struct rq *rq)
+{
+	return !rq->iso_refractory;
+}
+
+/*
+ * Adding to the runqueue. Enter with rq locked.
+ */
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
+{
+	unsigned int randseed;
+	u64 sl_id;
+
+	if (!rt_task(p)) {
+		/* Check it hasn't gotten rt from PI */
+		if ((idleprio_task(p) && idleprio_suitable(p)) ||
+		   (iso_task(p) && isoprio_suitable(rq)))
+			p->prio = p->normal_prio;
+		else
+			p->prio = NORMAL_PRIO;
+	}
+	/*
+	 * The sl_id key passed to the skiplist generates a sorted list.
+	 * Realtime and sched iso tasks run FIFO so they only need be sorted
+	 * according to priority. The skiplist will put tasks of the same
+	 * key inserted later in FIFO order. Tasks of sched normal, batch
+	 * and idleprio are sorted according to their deadlines. Idleprio
+	 * tasks are offset by an impossibly large deadline value ensuring
+	 * they get sorted into last positions, but still according to their
+	 * own deadlines. This creates a "landscape" of skiplists running
+	 * from priority 0 realtime in first place to the lowest priority
+	 * idleprio tasks last. Skiplist insertion is an O(log n) process.
+	 */
+	if (p->prio <= ISO_PRIO)
+		sl_id = p->prio;
+	else {
+		sl_id = p->deadline;
+		if (idleprio_task(p)) {
+			if (p->prio == IDLE_PRIO)
+				sl_id |= 0xF000000000000000;
+			else
+				sl_id += longest_deadline_diff();
+		}
+	}
+	/*
+	 * Some architectures don't have better than microsecond resolution
+	 * so mask off the sub-microsecond bits of the seed for skiplist insertion.
+	 */
+	update_clocks(rq);
+	if (!(flags & ENQUEUE_RESTORE))
+		sched_info_queued(rq, p);
+	randseed = (rq->niffies >> 10) & 0xFFFFFFFF;
+	skiplist_insert(rq->sl, &p->node, sl_id, p, randseed);
+	rq->best_key = rq->node.next[0]->key;
+	update_load_avg(rq);
+}
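A toy ordering check of that landscape; every number below is a made-up placeholder rather than a real MuQSS constant, the point is only that the key construction keeps the classes in the intended order:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t rt       = 1;				/* RT/ISO: key is the priority itself */
	uint64_t normal   = 5000000000ULL;		/* SCHED_NORMAL: key is the deadline */
	uint64_t idleprio = normal + 40000000ULL;	/* idleprio: deadline + longest_deadline_diff() */
	uint64_t parked   = normal | 0xF000000000000000ULL; /* idleprio demoted to IDLE_PRIO */

	printf("rt < normal < idleprio < parked: %d\n",
	       rt < normal && normal < idleprio && idleprio < parked);
	return 0;
}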
+
+/*
+ * Returns the relative length of deadline all compared to the shortest
+ * deadline which is that of nice -20.
+ */
+static inline int task_prio_ratio(struct task_struct *p)
+{
+	return prio_ratios[TASK_USER_PRIO(p)];
+}
+
+/*
+ * task_timeslice - all tasks of all priorities get the exact same timeslice
+ * length. CPU distribution is handled by giving different deadlines to
+ * tasks of different priorities. Use 128 as the base value for fast shifts.
+ */
+static inline int task_timeslice(struct task_struct *p)
+{
+	return (rr_interval * task_prio_ratio(p) / 128);
+}
+
+#ifdef CONFIG_SMP
+/* Entered with rq locked */
+static inline void resched_if_idle(struct rq *rq)
+{
+	if (rq_idle(rq))
+		resched_task(rq->curr);
+}
+
+static inline bool rq_local(struct rq *rq)
+{
+	return (rq->cpu == smp_processor_id());
+}
+#ifdef CONFIG_SMT_NICE
+static const cpumask_t *thread_cpumask(int cpu);
+
+/* Find the best real time priority running on any SMT siblings of cpu and if
+ * none are running, the static priority of the best deadline task running.
+ * The lookups to the other runqueues are done locklessly, as the occasional
+ * wrong value would be harmless. */
+static int best_smt_bias(struct rq *this_rq)
+{
+	int other_cpu, best_bias = 0;
+
+	for_each_cpu(other_cpu, &this_rq->thread_mask) {
+		struct rq *rq = cpu_rq(other_cpu);
+
+		if (rq_idle(rq))
+			continue;
+		if (unlikely(!rq->online))
+			continue;
+		if (!rq->rq_mm)
+			continue;
+		if (likely(rq->rq_smt_bias > best_bias))
+			best_bias = rq->rq_smt_bias;
+	}
+	return best_bias;
+}
+
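+/*
+ * For illustration, with the kernel's usual MAX_PRIO of 140 and a nice 0
+ * static_prio of 120: an RT task dominates with a bias of 1 << 30, a task
+ * currently running as SCHED_ISO gets 1 << 29, an idleprio task gets 0, and
+ * a nice 0 SCHED_NORMAL task gets 140 - 120 = 20.
+ */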
+static int task_prio_bias(struct task_struct *p)
+{
+	if (rt_task(p))
+		return 1 << 30;
+	else if (task_running_iso(p))
+		return 1 << 29;
+	else if (task_running_idle(p))
+		return 0;
+	return MAX_PRIO - p->static_prio;
+}
+
+static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq)
+{
+	return true;
+}
+
+static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule;
+
+/* We've already decided p can run on this CPU, now test whether it shouldn't
+ * for SMT nice reasons. */
+static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq)
+{
+	int best_bias, task_bias;
+
+	/* Kernel threads always run */
+	if (unlikely(!p->mm))
+		return true;
+	if (rt_task(p))
+		return true;
+	if (!idleprio_suitable(p))
+		return true;
+	best_bias = best_smt_bias(this_rq);
+	/* The smt siblings are all idle or running IDLEPRIO */
+	if (best_bias < 1)
+		return true;
+	task_bias = task_prio_bias(p);
+	if (task_bias < 1)
+		return false;
+	if (task_bias >= best_bias)
+		return true;
+	/* Dither: give normal tasks 25% of the cpu regardless of nice difference */
+	if (best_bias % 4 == 1)
+		return true;
+	/* Sorry, you lose */
+	return false;
+}
+#else /* CONFIG_SMT_NICE */
+#define smt_schedule(p, this_rq) (true)
+#endif /* CONFIG_SMT_NICE */
+
+static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask)
+{
+	set_bit(cpu, (volatile unsigned long *)cpumask);
+}
+
+/*
+ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to
+ * allow easy lookup of whether any suitable idle CPUs are available.
+ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the
+ * idle_cpus variable than to do a full bitmask check when we are busy. The
+ * bits are set atomically but read locklessly as an occasional false positive
+ * or negative is harmless.
+ */
+static inline void set_cpuidle_map(int cpu)
+{
+	if (likely(cpu_online(cpu)))
+		atomic_set_cpu(cpu, &grq.cpu_idle_map);
+}
+
+static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask)
+{
+	clear_bit(cpu, (volatile unsigned long *)cpumask);
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+	atomic_clear_cpu(cpu, &grq.cpu_idle_map);
+}
+
+static bool suitable_idle_cpus(struct task_struct *p)
+{
+	return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map));
+}
+
+/*
+ * Resched current on rq. We don't know if rq is local to this CPU nor if it
+ * is locked so we do not use an intermediate variable for the task to avoid
+ * having it dereferenced.
+ */
+static void resched_curr(struct rq *rq)
+{
+	int cpu;
+
+	if (test_tsk_need_resched(rq->curr))
+		return;
+
+	rq->preempt = rq->curr;
+	cpu = rq->cpu;
+
+	/* We're doing this without holding the rq lock if it's not task_rq */
+
+	if (cpu == smp_processor_id()) {
+		set_tsk_need_resched(rq->curr);
+		set_preempt_need_resched();
+		return;
+	}
+
+	if (set_nr_and_not_polling(rq->curr))
+		smp_sched_reschedule(cpu);
+	else
+		trace_sched_wake_idle_without_ipi(cpu);
+}
+
+#define CPUIDLE_DIFF_THREAD	(1)
+#define CPUIDLE_DIFF_CORE	(2)
+#define CPUIDLE_CACHE_BUSY	(4)
+#define CPUIDLE_DIFF_CPU	(8)
+#define CPUIDLE_THREAD_BUSY	(16)
+#define CPUIDLE_DIFF_NODE	(32)
+
+/*
+ * The best idle CPU is chosen according to the CPUIDLE ranking above where the
+ * lowest value would give the most suitable CPU to schedule p onto next. The
+ * order works out to be the following:
+ *
+ * Same thread, idle or busy cache, idle or busy threads
+ * Other core, same cache, idle or busy cache, idle threads.
+ * Same node, other CPU, idle cache, idle threads.
+ * Same node, other CPU, busy cache, idle threads.
+ * Other core, same cache, busy threads.
+ * Same node, other CPU, busy threads.
+ * Other node, other CPU, idle cache, idle threads.
+ * Other node, other CPU, busy cache, idle threads.
+ * Other node, other CPU, busy threads.
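+ *
+ * As an illustration of how the flags combine: an SMT sibling of this CPU,
+ * even with a busy cache and busy threads, scores at most
+ * CPUIDLE_DIFF_THREAD | CPUIDLE_CACHE_BUSY | CPUIDLE_THREAD_BUSY (21),
+ * which still beats an otherwise idle CPU on another node scoring
+ * CPUIDLE_DIFF_NODE (32), so the lowest ranking wins below.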
+ */
+static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
+{
+	int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY |
+		CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE |
+		CPUIDLE_DIFF_THREAD;
+	int cpu_tmp;
+
+	if (cpumask_test_cpu(best_cpu, tmpmask))
+		goto out;
+
+	for_each_cpu(cpu_tmp, tmpmask) {
+		int ranking, locality;
+		struct rq *tmp_rq;
+
+		ranking = 0;
+		tmp_rq = cpu_rq(cpu_tmp);
+
+		locality = rq->cpu_locality[cpu_tmp];
+#ifdef CONFIG_NUMA
+		if (locality > 3)
+			ranking |= CPUIDLE_DIFF_NODE;
+		else
+#endif
+		if (locality > 2)
+			ranking |= CPUIDLE_DIFF_CPU;
+#ifdef CONFIG_SCHED_MC
+		else if (locality == 2)
+			ranking |= CPUIDLE_DIFF_CORE;
+		else if (!(tmp_rq->cache_idle(tmp_rq)))
+			ranking |= CPUIDLE_CACHE_BUSY;
+#endif
+#ifdef CONFIG_SCHED_SMT
+		if (locality == 1)
+			ranking |= CPUIDLE_DIFF_THREAD;
+		if (!(tmp_rq->siblings_idle(tmp_rq)))
+			ranking |= CPUIDLE_THREAD_BUSY;
+#endif
+		if (ranking < best_ranking) {
+			best_cpu = cpu_tmp;
+			best_ranking = ranking;
+		}
+	}
+out:
+	return best_cpu;
+}
+
+bool cpus_share_cache(int this_cpu, int that_cpu)
+{
+	struct rq *this_rq = cpu_rq(this_cpu);
+
+	return (this_rq->cpu_locality[that_cpu] < 3);
+}
+
+/* As per resched_curr but only will resched idle task */
+static inline void resched_idle(struct rq *rq)
+{
+	if (test_tsk_need_resched(rq->idle))
+		return;
+
+	rq->preempt = rq->idle;
+
+	set_tsk_need_resched(rq->idle);
+
+	if (rq_local(rq)) {
+		set_preempt_need_resched();
+		return;
+	}
+
+	smp_sched_reschedule(rq->cpu);
+}
+
+static struct rq *resched_best_idle(struct task_struct *p, int cpu)
+{
+	cpumask_t tmpmask;
+	struct rq *rq;
+	int best_cpu;
+
+	cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
+	best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask);
+	rq = cpu_rq(best_cpu);
+	if (!smt_schedule(p, rq))
+		return NULL;
+	resched_idle(rq);
+	return rq;
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+	if (suitable_idle_cpus(p))
+		resched_best_idle(p, task_cpu(p));
+}
+
+static inline struct rq *rq_order(struct rq *rq, int cpu)
+{
+	return rq->rq_order[cpu];
+}
+#else /* CONFIG_SMP */
+static inline void set_cpuidle_map(int cpu)
+{
+}
+
+static inline void clear_cpuidle_map(int cpu)
+{
+}
+
+static inline bool suitable_idle_cpus(struct task_struct *p)
+{
+	return uprq->curr == uprq->idle;
+}
+
+static inline void resched_suitable_idle(struct task_struct *p)
+{
+}
+
+static inline void resched_curr(struct rq *rq)
+{
+	resched_task(rq->curr);
+}
+
+static inline void resched_if_idle(struct rq *rq)
+{
+}
+
+static inline bool rq_local(struct rq *rq)
+{
+	return true;
+}
+
+static inline struct rq *rq_order(struct rq *rq, int cpu)
+{
+	return rq;
+}
+
+static inline bool smt_schedule(struct task_struct *p, struct rq *rq)
+{
+	return true;
+}
+#endif /* CONFIG_SMP */
+
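+/*
+ * As an example of the mapping below, with the kernel's usual MAX_RT_PRIO of
+ * 100: a SCHED_FIFO or SCHED_RR task with rt_priority 50 maps to prio 49 and
+ * rt_priority 99 maps to prio 0, while all SCHED_NORMAL tasks share
+ * NORMAL_PRIO.
+ */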
+static inline int normal_prio(struct task_struct *p)
+{
+	if (has_rt_policy(p))
+		return MAX_RT_PRIO - 1 - p->rt_priority;
+	if (idleprio_task(p))
+		return IDLE_PRIO;
+	if (iso_task(p))
+		return ISO_PRIO;
+	return NORMAL_PRIO;
+}
+
+/*
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be RT if the task got PI-boosted to an RT priority by an
+ * RT task. If not then it returns p->normal_prio.
+ */
+static int effective_prio(struct task_struct *p)
+{
+	p->normal_prio = normal_prio(p);
+	/*
+	 * If we are RT tasks or we were boosted to RT priority,
+	 * keep the priority unchanged. Otherwise, update priority
+	 * to the normal priority:
+	 */
+	if (!rt_prio(p->prio))
+		return p->normal_prio;
+	return p->prio;
+}
+
+/*
+ * activate_task - move a task to the runqueue. Enter with rq locked.
+ */
+static void activate_task(struct task_struct *p, struct rq *rq)
+{
+	resched_if_idle(rq);
+
+	/*
+	 * Sleep time is in units of nanosecs, so shift by 20 to get a
+	 * milliseconds-range estimation of the amount of time that the task
+	 * spent sleeping:
+	 */
+	if (unlikely(prof_on == SLEEP_PROFILING)) {
+		if (p->state == TASK_UNINTERRUPTIBLE)
+			profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
+				     (rq->niffies - p->last_ran) >> 20);
+	}
+
+	p->prio = effective_prio(p);
+	if (task_contributes_to_load(p))
+		atomic_dec(&grq.nr_uninterruptible);
+
+	enqueue_task(rq, p, 0);
+	p->on_rq = TASK_ON_RQ_QUEUED;
+	atomic_inc(&grq.nr_running);
+}
+
+/*
+ * deactivate_task - If it's running, it's not on the runqueue and we can just
+ * decrement the nr_running. Enter with rq locked.
+ */
+static inline void deactivate_task(struct task_struct *p, struct rq *rq)
+{
+	if (task_contributes_to_load(p))
+		atomic_inc(&grq.nr_uninterruptible);
+
+	p->on_rq = 0;
+	atomic_dec(&grq.nr_running);
+	sched_info_dequeued(rq, p);
+}
+
+#ifdef CONFIG_SMP
+void set_task_cpu(struct task_struct *p, unsigned int cpu)
+{
+	struct rq *rq = task_rq(p);
+	bool queued;
+
+#ifdef CONFIG_LOCKDEP
+	/*
+	 * The caller should hold either p->pi_lock or rq->lock, when changing
+	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+	 *
+	 * Furthermore, all task_rq users should acquire both locks, see
+	 * task_rq_lock().
+	 */
+	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+				      lockdep_is_held(&task_rq(p)->lock)));
+#endif
+	if (task_cpu(p) == cpu)
+		return;
+	trace_sched_migrate_task(p, cpu);
+	perf_event_task_migrate(p);
+
+	/*
+	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
+	 * successfully executed on another CPU. We must ensure that updates of
+	 * per-task data have been completed by this moment.
+	 */
+	smp_wmb();
+
+	if (task_running(rq, p)) {
+		/*
+		 * We should only be calling this on a running task if we're
+		 * holding rq lock.
+		 */
+		lockdep_assert_held(&rq->lock);
+
+		/*
+		 * We can't change the task_thread_info cpu on a running task
+		 * as p will still be protected by the rq lock of the cpu it
+		 * is still running on so we set the wake_cpu for it to be
+		 * lazily updated once off the cpu.
+		 */
+		p->wake_cpu = cpu;
+		return;
+	}
+
+	if ((queued = task_queued(p)))
+		dequeue_task(rq, p, 0);
+	task_thread_info(p)->cpu = p->wake_cpu = cpu;
+	if (queued)
+		enqueue_task(cpu_rq(cpu), p, 0);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Move a task off the runqueue and take it to a cpu where it will
+ * become the running task.
+ */
+static inline void take_task(struct rq *rq, int cpu, struct task_struct *p)
+{
+	struct rq *p_rq = task_rq(p);
+
+	dequeue_task(p_rq, p, DEQUEUE_SAVE);
+	if (p_rq != rq) {
+		sched_info_dequeued(p_rq, p);
+		sched_info_queued(rq, p);
+	}
+	set_task_cpu(p, cpu);
+}
+
+/*
+ * Returns a descheduling task to the runqueue unless it is being
+ * deactivated.
+ */
+static inline void return_task(struct task_struct *p, struct rq *rq,
+			       int cpu, bool deactivate)
+{
+	if (deactivate)
+		deactivate_task(p, rq);
+	else {
+#ifdef CONFIG_SMP
+		/*
+		 * set_task_cpu was called on the running task that doesn't
+		 * want to deactivate so it has to be enqueued to a different
+		 * CPU and we need its lock. Tag it to be moved when the
+		 * lock is dropped in finish_lock_switch.
+		 */
+		if (unlikely(p->wake_cpu != cpu))
+			p->on_rq = TASK_ON_RQ_MIGRATING;
+		else
+#endif
+			enqueue_task(rq, p, ENQUEUE_RESTORE);
+	}
+}
+
+/* Enter with rq lock held. We know p is on the local cpu */
+static inline void __set_tsk_resched(struct task_struct *p)
+{
+	set_tsk_need_resched(p);
+	set_preempt_need_resched();
+}
+
+/**
+ * task_curr - is this task currently executing on a CPU?
+ * @p: the task in question.
+ *
+ * Return: 1 if the task is currently executing. 0 otherwise.
+ */
+inline int task_curr(const struct task_struct *p)
+{
+	return cpu_curr(task_cpu(p)) == p;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * wait_task_inactive - wait for a thread to unschedule.
+ *
+ * If @match_state is nonzero, it's the @p->state value just checked and
+ * not expected to change.  If it changes, i.e. @p might have woken up,
+ * then return zero.  When we succeed in waiting for @p to be off its CPU,
+ * we return a positive number (its total switch count).  If a second call
+ * a short while later returns the same number, the caller can be sure that
+ * @p has remained unscheduled the whole time.
+ *
+ * The caller must ensure that the task *will* unschedule sometime soon,
+ * else this function might spin for a *long* time. This function can't
+ * be called with interrupts off, or it may introduce deadlock with
+ * smp_call_function() if an IPI is sent by the same process we are
+ * waiting to become inactive.
+ */
+unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+{
+	int running, queued;
+	unsigned long flags;
+	unsigned long ncsw;
+	struct rq *rq;
+
+	for (;;) {
+		rq = task_rq(p);
+
+		/*
+		 * If the task is actively running on another CPU
+		 * still, just relax and busy-wait without holding
+		 * any locks.
+		 *
+		 * NOTE! Since we don't hold any locks, it's not
+		 * even sure that "rq" stays as the right runqueue!
+		 * But we don't care, since this will return false
+		 * if the runqueue has changed and p is actually now
+		 * running somewhere else!
+		 */
+		while (task_running(rq, p)) {
+			if (match_state && unlikely(p->state != match_state))
+				return 0;
+			cpu_relax();
+		}
+
+		/*
+		 * Ok, time to look more closely! We need the rq
+		 * lock now, to be *sure*. If we're wrong, we'll
+		 * just go back and repeat.
+		 */
+		rq = task_rq_lock(p, &flags);
+		trace_sched_wait_task(p);
+		running = task_running(rq, p);
+		queued = task_on_rq_queued(p);
+		ncsw = 0;
+		if (!match_state || p->state == match_state)
+			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
+		task_rq_unlock(rq, p, &flags);
+
+		/*
+		 * If it changed from the expected state, bail out now.
+		 */
+		if (unlikely(!ncsw))
+			break;
+
+		/*
+		 * Was it really running after all now that we
+		 * checked with the proper locks actually held?
+		 *
+		 * Oops. Go back and try again..
+		 */
+		if (unlikely(running)) {
+			cpu_relax();
+			continue;
+		}
+
+		/*
+		 * It's not enough that it's not actively running,
+		 * it must be off the runqueue _entirely_, and not
+		 * preempted!
+		 *
+		 * So if it was still runnable (but just not actively
+		 * running right now), it's preempted, and we should
+		 * yield - it could be a while.
+		 */
+		if (unlikely(queued)) {
+			ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ);
+
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
+			continue;
+		}
+
+		/*
+		 * Ahh, all good. It wasn't running, and it wasn't
+		 * runnable, which means that it will never become
+		 * running in the future either. We're all done!
+		 */
+		break;
+	}
+
+	return ncsw;
+}
+
+/***
+ * kick_process - kick a running thread to enter/exit the kernel
+ * @p: the to-be-kicked thread
+ *
+ * Cause a process which is running on another CPU to enter
+ * kernel-mode, without any delay. (to get signals handled.)
+ *
+ * NOTE: this function doesn't have to take the runqueue lock,
+ * because all it wants to ensure is that the remote task enters
+ * the kernel. If the IPI races and the task has been migrated
+ * to another CPU then no harm is done and the purpose has been
+ * achieved as well.
+ */
+void kick_process(struct task_struct *p)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if ((cpu != smp_processor_id()) && task_curr(p))
+		smp_sched_reschedule(cpu);
+	preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kick_process);
+#endif
+
+/*
+ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the
+ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or
+ * between themselves, they cooperatively multitask. An idle rq scores as
+ * prio PRIO_LIMIT so it is always preempted.
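+ *
+ * As an illustrative example of the checks below: an RT task at prio 10
+ * preempts a CPU running at prio 20 purely on priority, two SCHED_NORMAL
+ * tasks at equal prio are compared on deadline so the earlier one wins,
+ * and a SCHED_BATCH task never preempts at equal prio.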
+ */
+static inline bool
+can_preempt(struct task_struct *p, int prio, u64 deadline)
+{
+	/* Better static priority RT task or better policy preemption */
+	if (p->prio < prio)
+		return true;
+	if (p->prio > prio)
+		return false;
+	if (p->policy == SCHED_BATCH)
+		return false;
+	/* SCHED_NORMAL and ISO will preempt based on deadline */
+	if (!deadline_before(p->deadline, deadline))
+		return false;
+	return true;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * Check to see if p can run on cpu, and if not, whether there are any online
+ * CPUs it can run on instead.
+ */
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+	if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed)))
+		return true;
+	return false;
+}
+#define cpu_online_map		(*(cpumask_t *)cpu_online_mask)
+
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+	int i, this_entries = rq_load(this_rq);
+	cpumask_t tmp;
+
+	if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p)))
+		return;
+
+	/* IDLEPRIO tasks never preempt anything but idle */
+	if (p->policy == SCHED_IDLEPRIO)
+		return;
+
+	cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed);
+
+	for (i = 0; i < num_possible_cpus(); i++) {
+		struct rq *rq = this_rq->rq_order[i];
+
+		if (!cpumask_test_cpu(rq->cpu, &tmp))
+			continue;
+
+		if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries)
+			continue;
+		if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) {
+			resched_curr(rq);
+			return;
+		}
+	}
+}
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check);
+#else /* CONFIG_SMP */
+static inline bool needs_other_cpu(struct task_struct *p, int cpu)
+{
+	return false;
+}
+
+static void try_preempt(struct task_struct *p, struct rq *this_rq)
+{
+	if (p->policy == SCHED_IDLEPRIO)
+		return;
+	if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline))
+		resched_curr(uprq);
+}
+
+static inline int __set_cpus_allowed_ptr(struct task_struct *p,
+					 const struct cpumask *new_mask, bool check)
+{
+	return set_cpus_allowed_ptr(p, new_mask);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * wake flags
+ */
+#define WF_SYNC		0x01		/* waker goes to sleep after wakeup */
+#define WF_FORK		0x02		/* child wakeup after fork */
+#define WF_MIGRATED	0x04		/* internal use, task got migrated */
+
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
+{
+#ifdef CONFIG_SCHEDSTATS
+	struct rq *rq = this_rq();
+
+#ifdef CONFIG_SMP
+	int this_cpu = smp_processor_id();
+
+	if (cpu == this_cpu)
+		schedstat_inc(rq, ttwu_local);
+	else {
+		struct sched_domain *sd;
+
+		rcu_read_lock();
+		for_each_domain(this_cpu, sd) {
+			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+				schedstat_inc(sd, ttwu_wake_remote);
+				break;
+			}
+		}
+		rcu_read_unlock();
+	}
+
+#endif /* CONFIG_SMP */
+
+	schedstat_inc(rq, ttwu_count);
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p)
+{
+	activate_task(p, rq);
+
+	/* if a worker is waking up, notify workqueue */
+	if (p->flags & PF_WQ_WORKER)
+		wq_worker_waking_up(p, cpu_of(rq));
+}
+
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+	/*
+	 * Sync wakeups (i.e. those types of wakeups where the waker
+	 * has indicated that it will leave the CPU in short order)
+	 * don't trigger a preemption if there are no idle cpus,
+	 * instead waiting for current to deschedule.
+	 */
+	if (wake_flags & WF_SYNC)
+		resched_suitable_idle(p);
+	else
+		try_preempt(p, rq);
+	p->state = TASK_RUNNING;
+	trace_sched_wakeup(p);
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+	lockdep_assert_held(&rq->lock);
+
+#ifdef CONFIG_SMP
+	if (p->sched_contributes_to_load)
+		atomic_dec(&grq.nr_uninterruptible);
+#endif
+
+	ttwu_activate(rq, p);
+	ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue;
+ * in this case we must do a remote wakeup. It's a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, as
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+	struct rq *rq;
+	int ret = 0;
+
+	rq = __task_rq_lock(p);
+	if (likely(task_on_rq_queued(p))) {
+		ttwu_do_wakeup(rq, p, wake_flags);
+		ret = 1;
+	}
+	__task_rq_unlock(rq);
+
+	return ret;
+}
+
+#ifdef CONFIG_SMP
+void sched_ttwu_pending(void)
+{
+	struct rq *rq = this_rq();
+	struct llist_node *llist = llist_del_all(&rq->wake_list);
+	struct task_struct *p;
+	unsigned long flags;
+
+	if (!llist)
+		return;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+
+	while (llist) {
+		int wake_flags = 0;
+
+		p = llist_entry(llist, struct task_struct, wake_entry);
+		llist = llist_next(llist);
+
+		ttwu_do_activate(rq, p, wake_flags);
+	}
+
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+void scheduler_ipi(void)
+{
+	/*
+	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
+	 * TIF_NEED_RESCHED remotely (for the first time) will also send
+	 * this IPI.
+	 */
+	preempt_fold_need_resched();
+
+	if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched()))
+		return;
+
+	/*
+	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+	 * traditionally all their work was done from the interrupt return
+	 * path. Now that we actually do some work, we need to make sure
+	 * we do call them.
+	 *
+	 * Some archs already do call them, luckily irq_enter/exit nest
+	 * properly.
+	 *
+	 * Arguably we should visit all archs and update all handlers,
+	 * however a fair share of IPIs are still resched only so this would
+	 * somewhat pessimize the simple resched case.
+	 */
+	irq_enter();
+	sched_ttwu_pending();
+	irq_exit();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+		if (!set_nr_if_polling(rq->idle))
+			smp_sched_reschedule(cpu);
+		else
+			trace_sched_wake_idle_without_ipi(cpu);
+	}
+}
+
+void wake_up_if_idle(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	rcu_read_lock();
+
+	if (!is_idle_task(rcu_dereference(rq->curr)))
+		goto out;
+
+	if (set_nr_if_polling(rq->idle)) {
+		trace_sched_wake_idle_without_ipi(cpu);
+	} else {
+		rq_lock_irqsave(rq, &flags);
+		if (likely(is_idle_task(rq->curr)))
+			smp_sched_reschedule(cpu);
+		/* Else cpu is not in idle, do nothing here */
+		rq_unlock_irqrestore(rq, &flags);
+	}
+
+out:
+	rcu_read_unlock();
+}
+
+static int valid_task_cpu(struct task_struct *p)
+{
+	cpumask_t valid_mask;
+
+	if (p->flags & PF_KTHREAD)
+		cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_online_mask);
+	else
+		cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_active_mask);
+
+	if (unlikely(!cpumask_weight(&valid_mask))) {
+		/* Hotplug boot threads do this before the CPU is up */
+		WARN_ON(sched_smp_initialized);
+		return cpumask_any(tsk_cpus_allowed(p));
+	}
+	return cpumask_any(&valid_mask);
+}
+
+/*
+ * For a task that's just being woken up we have a valuable balancing
+ * opportunity so choose the most lightly loaded runqueue of the nearest cache.
+ * Entered with rq locked and returns with the chosen runqueue locked.
+ */
+static inline int select_best_cpu(struct task_struct *p)
+{
+	unsigned int idlest = ~0U;
+	struct rq *rq = NULL;
+	int i;
+
+	if (suitable_idle_cpus(p)) {
+		int cpu = task_cpu(p);
+
+		if (unlikely(needs_other_cpu(p, cpu)))
+			cpu = valid_task_cpu(p);
+		rq = resched_best_idle(p, cpu);
+		if (likely(rq))
+			return rq->cpu;
+	}
+
+	for (i = 0; i < num_possible_cpus(); i++) {
+		struct rq *other_rq = task_rq(p)->rq_order[i];
+		int entries;
+
+		if (!other_rq->online)
+			continue;
+		if (needs_other_cpu(p, other_rq->cpu))
+			continue;
+		entries = rq_load(other_rq);
+		if (entries >= idlest)
+			continue;
+		idlest = entries;
+		rq = other_rq;
+	}
+	if (unlikely(!rq))
+		return task_cpu(p);
+	return rq->cpu;
+}
+#else /* CONFIG_SMP */
+static int valid_task_cpu(struct task_struct *p)
+{
+	return 0;
+}
+
+static inline int select_best_cpu(struct task_struct *p)
+{
+	return 0;
+}
+
+static struct rq *resched_best_idle(struct task_struct *p, int cpu)
+{
+	return NULL;
+}
+#endif /* CONFIG_SMP */
+
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP)
+	if (!cpus_share_cache(smp_processor_id(), cpu)) {
+		sched_clock_cpu(cpu); /* sync clocks x-cpu */
+		ttwu_queue_remote(p, cpu, wake_flags);
+		return;
+	}
+#endif
+	rq_lock(rq);
+	ttwu_do_activate(rq, p, wake_flags);
+	rq_unlock(rq);
+}
+
+/***
+ * try_to_wake_up - wake up a thread
+ * @p: the thread to be awakened
+ * @state: the mask of task states that can be woken
+ * @wake_flags: wake modifier flags (WF_*)
+ *
+ * Put it on the run-queue if it's not already there. The "current"
+ * thread is always on the run-queue (except when the actual
+ * re-schedule is in progress), and as such you're allowed to do
+ * the simpler "current->state = TASK_RUNNING" to mark yourself
+ * runnable without the overhead of this.
+ *
+ * Return: %true if @p was woken up, %false if it was already running
+ * or @state didn't match @p's state.
+ */
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+{
+	unsigned long flags;
+	int cpu, success = 0;
+
+	/*
+	 * If we are going to wake up a thread waiting for CONDITION we
+	 * need to ensure that CONDITION=1 done by the caller can not be
+	 * reordered with p->state check below. This pairs with mb() in
+	 * set_current_state() the waiting thread does.
+	 */
+	smp_mb__before_spinlock();
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	/* state is a volatile long, why, we don't know */
+	if (!((unsigned int)p->state & state))
+		goto out;
+
+	trace_sched_waking(p);
+
+	success = 1; /* we're going to change ->state */
+	cpu = task_cpu(p);
+
+	/*
+	 * Ensure we load p->on_rq _after_ p->state, otherwise it would
+	 * be possible to, falsely, observe p->on_rq == 0 and get stuck
+	 * in smp_cond_load_acquire() below.
+	 *
+	 * sched_ttwu_pending()                 try_to_wake_up()
+	 *   [S] p->on_rq = 1;                  [L] P->state
+	 *       UNLOCK rq->lock  -----.
+	 *                              \
+	 *				 +---   RMB
+	 * schedule()                   /
+	 *       LOCK rq->lock    -----'
+	 *       UNLOCK rq->lock
+	 *
+	 * [task p]
+	 *   [S] p->state = UNINTERRUPTIBLE     [L] p->on_rq
+	 *
+	 * Pairs with the UNLOCK+LOCK on rq->lock from the
+	 * last wakeup of our task and the schedule that got our task
+	 * current.
+	 */
+	smp_rmb();
+	if (p->on_rq && ttwu_remote(p, wake_flags))
+		goto stat;
+
+#ifdef CONFIG_SMP
+	/*
+	 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+	 * possible to, falsely, observe p->on_cpu == 0.
+	 *
+	 * One must be running (->on_cpu == 1) in order to remove oneself
+	 * from the runqueue.
+	 *
+	 *  [S] ->on_cpu = 1;	[L] ->on_rq
+	 *      UNLOCK rq->lock
+	 *			RMB
+	 *      LOCK   rq->lock
+	 *  [S] ->on_rq = 0;    [L] ->on_cpu
+	 *
+	 * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
+	 * from the consecutive calls to schedule(); the first switching to our
+	 * task, the second putting it to sleep.
+	 */
+	smp_rmb();
+
+	/*
+	 * If the owning (remote) cpu is still in the middle of schedule() with
+	 * this task as prev, wait until it's done referencing the task.
+	 *
+	 * Pairs with the smp_store_release() in finish_lock_switch().
+	 *
+	 * This ensures that tasks getting woken will be fully ordered against
+	 * their previous state and preserve Program Order.
+	 */
+	smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+	p->sched_contributes_to_load = !!task_contributes_to_load(p);
+	p->state = TASK_WAKING;
+
+	cpu = select_best_cpu(p);
+	if (task_cpu(p) != cpu)
+		set_task_cpu(p, cpu);
+#endif /* CONFIG_SMP */
+
+	ttwu_queue(p, cpu, wake_flags);
+stat:
+	if (schedstat_enabled())
+		ttwu_stat(p, cpu, wake_flags);
+out:
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+	return success;
+}
+
+/**
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
+ * @p: the thread to be awakened
+ *
+ * Put @p on the run-queue if it's not already there. The caller must
+ * ensure that rq is locked and @p is not the current task.
+ * rq stays locked over invocation.
+ */
+static void try_to_wake_up_local(struct task_struct *p)
+{
+	struct rq *rq = task_rq(p);
+
+	if (WARN_ON_ONCE(rq != this_rq()) ||
+	    WARN_ON_ONCE(p == current))
+		return;
+
+	lockdep_assert_held(&rq->lock);
+
+	if (!raw_spin_trylock(&p->pi_lock)) {
+		/*
+		 * This is OK, because current is on_cpu, which avoids it being
+		 * picked for load-balance and preemption/IRQs are still
+		 * disabled avoiding further scheduler activity on it and we've
+		 * not yet picked a replacement task.
+		 */
+		raw_spin_unlock(&rq->lock);
+		raw_spin_lock(&p->pi_lock);
+		raw_spin_lock(&rq->lock);
+	}
+
+	if (!(p->state & TASK_NORMAL))
+		goto out;
+
+	trace_sched_waking(p);
+
+	if (!task_on_rq_queued(p))
+		ttwu_activate(rq, p);
+
+	ttwu_do_wakeup(rq, p, 0);
+	if (schedstat_enabled())
+		ttwu_stat(p, smp_processor_id(), 0);
+out:
+	raw_spin_unlock(&p->pi_lock);
+}
+
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.
+ *
+ * Return: 1 if the process was woken up, 0 if it was already running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
+int wake_up_process(struct task_struct *p)
+{
+	return try_to_wake_up(p, TASK_NORMAL, 0);
+}
+EXPORT_SYMBOL(wake_up_process);
+
+int wake_up_state(struct task_struct *p, unsigned int state)
+{
+	return try_to_wake_up(p, state, 0);
+}
+
+static void time_slice_expired(struct task_struct *p, struct rq *rq);
+
+/*
+ * Perform scheduler related setup for a newly forked process p.
+ * p is forked by current.
+ */
+int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
+{
+	unsigned long flags;
+	int cpu = get_cpu();
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+	INIT_HLIST_HEAD(&p->preempt_notifiers);
+#endif
+	/*
+	 * We mark the process as NEW here. This guarantees that
+	 * nobody will actually run it, and a signal or other external
+	 * event cannot wake it up and insert it on the runqueue either.
+	 */
+	p->state = TASK_NEW;
+
+	/* Should be reset in fork.c but done here for ease of MuQSS patching */
+	p->on_cpu =
+	p->on_rq =
+	p->utime =
+	p->stime =
+	p->utimescaled =
+	p->stimescaled =
+	p->sched_time =
+	p->stime_ns =
+	p->utime_ns = 0;
+	skiplist_node_init(&p->node);
+
+	/*
+	 * Revert to default priority/policy on fork if requested.
+	 */
+	if (unlikely(p->sched_reset_on_fork)) {
+		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
+			p->policy = SCHED_NORMAL;
+			p->normal_prio = normal_prio(p);
+		}
+
+		if (PRIO_TO_NICE(p->static_prio) < 0) {
+			p->static_prio = NICE_TO_PRIO(0);
+			p->normal_prio = p->static_prio;
+		}
+
+		/*
+		 * We don't need the reset flag anymore after the fork. It has
+		 * fulfilled its duty:
+		 */
+		p->sched_reset_on_fork = 0;
+	}
+
+	/*
+	 * Silence PROVE_RCU.
+	 */
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	set_task_cpu(p, cpu);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+#ifdef CONFIG_SCHED_INFO
+	if (unlikely(sched_info_on()))
+		memset(&p->sched_info, 0, sizeof(p->sched_info));
+#endif
+	init_task_preempt_count(p);
+
+	put_cpu();
+	return 0;
+}
+
+#ifdef CONFIG_SCHEDSTATS
+
+DEFINE_STATIC_KEY_FALSE(sched_schedstats);
+static bool __initdata __sched_schedstats = false;
+
+static void set_schedstats(bool enabled)
+{
+	if (enabled)
+		static_branch_enable(&sched_schedstats);
+	else
+		static_branch_disable(&sched_schedstats);
+}
+
+void force_schedstat_enabled(void)
+{
+	if (!schedstat_enabled()) {
+		pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
+		static_branch_enable(&sched_schedstats);
+	}
+}
+
+static int __init setup_schedstats(char *str)
+{
+	int ret = 0;
+	if (!str)
+		goto out;
+
+	/*
+	 * This code is called before jump labels have been set up, so we can't
+	 * change the static branch directly just yet.  Instead set a temporary
+	 * variable so init_schedstats() can do it later.
+	 */
+	if (!strcmp(str, "enable")) {
+		__sched_schedstats = true;
+		ret = 1;
+	} else if (!strcmp(str, "disable")) {
+		__sched_schedstats = false;
+		ret = 1;
+	}
+out:
+	if (!ret)
+		pr_warn("Unable to parse schedstats=\n");
+
+	return ret;
+}
+__setup("schedstats=", setup_schedstats);
+
+static void __init init_schedstats(void)
+{
+	set_schedstats(__sched_schedstats);
+}
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_schedstats(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	int err;
+	int state = static_branch_likely(&sched_schedstats);
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	t = *table;
+	t.data = &state;
+	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0)
+		return err;
+	if (write)
+		set_schedstats(state);
+	return err;
+}
+#endif /* CONFIG_PROC_SYSCTL */
+#else  /* !CONFIG_SCHEDSTATS */
+static inline void init_schedstats(void) {}
+#endif /* CONFIG_SCHEDSTATS */
+
+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p);
+
+static void account_task_cpu(struct rq *rq, struct task_struct *p)
+{
+	update_clocks(rq);
+	/* This isn't really a context switch but accounting is the same */
+	update_cpu_clock_switch(rq, p);
+	p->last_ran = rq->niffies;
+}
+
+static inline int hrexpiry_enabled(struct rq *rq)
+{
+	if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized))
+		return 0;
+	return hrtimer_is_hres_active(&rq->hrexpiry_timer);
+}
+
+/*
+ * Use HR-timers to deliver accurate preemption points.
+ */
+static inline void hrexpiry_clear(struct rq *rq)
+{
+	if (!hrexpiry_enabled(rq))
+		return;
+	if (hrtimer_active(&rq->hrexpiry_timer))
+		hrtimer_cancel(&rq->hrexpiry_timer);
+}
+
+/*
+ * High-resolution time_slice expiry.
+ * Runs from hardirq context with interrupts disabled.
+ */
+static enum hrtimer_restart hrexpiry(struct hrtimer *timer)
+{
+	struct rq *rq = container_of(timer, struct rq, hrexpiry_timer);
+	struct task_struct *p;
+
+	/* This can happen during CPU hotplug / resume */
+	if (unlikely(cpu_of(rq) != smp_processor_id()))
+		goto out;
+
+	/*
+	 * We're doing this without the runqueue lock but this should always
+	 * be run on the local CPU. Time slice should run out in __schedule
+	 * but we set it to zero here in case niffies is slightly less.
+	 */
+	p = rq->curr;
+	p->time_slice = 0;
+	__set_tsk_resched(p);
+out:
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Called to set the hrexpiry timer state.
+ *
+ * called with irqs disabled from the local CPU only
+ */
+static void hrexpiry_start(struct rq *rq, u64 delay)
+{
+	if (!hrexpiry_enabled(rq))
+		return;
+
+	hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay),
+		      HRTIMER_MODE_REL_PINNED);
+}
+
+static void init_rq_hrexpiry(struct rq *rq)
+{
+	hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rq->hrexpiry_timer.function = hrexpiry;
+}
+
+static inline int rq_dither(struct rq *rq)
+{
+	if (!hrexpiry_enabled(rq))
+		return HALF_JIFFY_US;
+	return 0;
+}
+
+/*
+ * wake_up_new_task - wake up a newly created task for the first time.
+ *
+ * This function will do some initial scheduler statistics housekeeping
+ * that must be done for every newly created context, then puts the task
+ * on the runqueue and wakes it.
+ */
+void wake_up_new_task(struct task_struct *p)
+{
+	struct task_struct *parent, *rq_curr;
+	struct rq *rq, *new_rq;
+	unsigned long flags;
+
+	parent = p->parent;
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	p->state = TASK_RUNNING;
+	/* Task_rq can't change yet on a new task */
+	new_rq = rq = task_rq(p);
+	if (unlikely(needs_other_cpu(p, task_cpu(p)))) {
+		set_task_cpu(p, valid_task_cpu(p));
+		new_rq = task_rq(p);
+	}
+
+	double_rq_lock(rq, new_rq);
+	rq_curr = rq->curr;
+
+	/*
+	 * Make sure we do not leak PI boosting priority to the child.
+	 */
+	p->prio = rq_curr->normal_prio;
+
+	trace_sched_wakeup_new(p);
+
+	/*
+	 * Share the timeslice between parent and child, thus the
+	 * total amount of pending timeslices in the system doesn't change,
+	 * resulting in more scheduling fairness. If it's negative, it won't
+	 * matter since that's the same as being 0. rq->rq_deadline is only
+	 * modified within schedule() so it is always equal to
+	 * current->deadline.
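+	 *
+	 * As an example of the split below: a non-SCHED_FIFO parent with
+	 * 6000us of time_slice left keeps 3000us and the child starts with
+	 * the other 3000us; only when the halved slice falls below RESCHED_US
+	 * is the parent rescheduled and the child given a fresh slice and
+	 * deadline instead.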
+	 */
+	account_task_cpu(rq, rq_curr);
+	p->last_ran = rq_curr->last_ran;
+	if (likely(rq_curr->policy != SCHED_FIFO)) {
+		rq_curr->time_slice /= 2;
+		if (rq_curr->time_slice < RESCHED_US) {
+			/*
+			 * Forking task has run out of timeslice. Reschedule it and
+			 * start its child with a new time slice and deadline. The
+			 * child will end up running first because its deadline will
+			 * be slightly earlier.
+			 */
+			__set_tsk_resched(rq_curr);
+			time_slice_expired(p, new_rq);
+			if (suitable_idle_cpus(p))
+				resched_best_idle(p, task_cpu(p));
+			else if (unlikely(rq != new_rq))
+				try_preempt(p, new_rq);
+		} else {
+			p->time_slice = rq_curr->time_slice;
+			if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) {
+				/*
+				 * The VM isn't cloned, so we're in a good position to
+				 * do child-runs-first in anticipation of an exec. This
+				 * usually avoids a lot of COW overhead.
+				 */
+				__set_tsk_resched(rq_curr);
+			} else {
+				/*
+				 * Adjust the hrexpiry since rq_curr will keep
+				 * running and its timeslice has been shortened.
+				 */
+				hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice));
+				try_preempt(p, new_rq);
+			}
+		}
+	} else {
+		time_slice_expired(p, new_rq);
+		try_preempt(p, new_rq);
+	}
+	activate_task(p, new_rq);
+	double_rq_unlock(rq, new_rq);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+}
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+
+void preempt_notifier_inc(void)
+{
+	static_key_slow_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+	static_key_slow_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
+/**
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
+ * @notifier: notifier struct to register
+ */
+void preempt_notifier_register(struct preempt_notifier *notifier)
+{
+	if (!static_key_false(&preempt_notifier_key))
+		WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
+	hlist_add_head(&notifier->link, &current->preempt_notifiers);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_register);
+
+/**
+ * preempt_notifier_unregister - no longer interested in preemption notifications
+ * @notifier: notifier struct to unregister
+ *
+ * This is *not* safe to call from within a preemption notifier.
+ */
+void preempt_notifier_unregister(struct preempt_notifier *notifier)
+{
+	hlist_del(&notifier->link);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
+
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+	struct preempt_notifier *notifier;
+
+	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+		notifier->ops->sched_in(notifier, raw_smp_processor_id());
+}
+
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+	if (static_key_false(&preempt_notifier_key))
+		__fire_sched_in_preempt_notifiers(curr);
+}
+
+static void
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next)
+{
+	struct preempt_notifier *notifier;
+
+	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
+		notifier->ops->sched_out(notifier, next);
+}
+
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next)
+{
+	if (static_key_false(&preempt_notifier_key))
+		__fire_sched_out_preempt_notifiers(curr, next);
+}
+
+#else /* !CONFIG_PREEMPT_NOTIFIERS */
+
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+}
+
+static inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+				 struct task_struct *next)
+{
+}
+
+#endif /* CONFIG_PREEMPT_NOTIFIERS */
+
+/**
+ * prepare_task_switch - prepare to switch tasks
+ * @rq: the runqueue preparing to switch
+ * @next: the task we are going to switch to.
+ *
+ * This is called with the rq lock held and interrupts off. It must
+ * be paired with a subsequent finish_task_switch after the context
+ * switch.
+ *
+ * prepare_task_switch sets up locking and calls architecture specific
+ * hooks.
+ */
+static inline void
+prepare_task_switch(struct rq *rq, struct task_struct *prev,
+		    struct task_struct *next)
+{
+	sched_info_switch(rq, prev, next);
+	perf_event_task_sched_out(prev, next);
+	fire_sched_out_preempt_notifiers(prev, next);
+	prepare_lock_switch(rq, next);
+	prepare_arch_switch(next);
+}
+
+/**
+ * finish_task_switch - clean up after a task-switch
+ * @rq: runqueue associated with task-switch
+ * @prev: the thread we just switched away from.
+ *
+ * finish_task_switch must be called after the context switch, paired
+ * with a prepare_task_switch call before the context switch.
+ * finish_task_switch will reconcile locking set up by prepare_task_switch,
+ * and do any other architecture-specific cleanup actions.
+ *
+ * Note that we may have delayed dropping an mm in context_switch(). If
+ * so, we finish that here outside of the runqueue lock.  (Doing it
+ * with the lock held can cause deadlocks; see schedule() for
+ * details.)
+ *
+ * The context switch has flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
+ */
+static void finish_task_switch(struct task_struct *prev)
+	__releases(rq->lock)
+{
+	struct rq *rq = this_rq();
+	struct mm_struct *mm = rq->prev_mm;
+	long prev_state;
+
+	/*
+	 * The previous task will have left us with a preempt_count of 2
+	 * because it left us after:
+	 *
+	 *	schedule()
+	 *	  preempt_disable();			// 1
+	 *	  __schedule()
+	 *	    raw_spin_lock_irq(&rq->lock)	// 2
+	 *
+	 * Also, see FORK_PREEMPT_COUNT.
+	 */
+	if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
+		      "corrupted preempt_count: %s/%d/0x%x\n",
+		      current->comm, current->pid, preempt_count()))
+		preempt_count_set(FORK_PREEMPT_COUNT);
+
+	rq->prev_mm = NULL;
+
+	/*
+	 * A task struct has one reference for the use as "current".
+	 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
+	 * schedule one last time. The schedule call will never return, and
+	 * the scheduled task must drop that reference.
+	 *
+	 * We must observe prev->state before clearing prev->on_cpu (in
+	 * finish_lock_switch), otherwise a concurrent wakeup can get prev
+	 * running on another CPU and we could race with its RUNNING -> DEAD
+	 * transition, resulting in a double drop.
+	 */
+	prev_state = prev->state;
+	vtime_task_switch(prev);
+	perf_event_task_sched_in(prev, current);
+	finish_lock_switch(rq, prev);
+	finish_arch_post_lock_switch();
+
+	fire_sched_in_preempt_notifiers(current);
+	if (mm)
+		mmdrop(mm);
+	if (unlikely(prev_state == TASK_DEAD)) {
+		/*
+		 * Remove function-return probe instances associated with this
+		 * task and put them back on the free list.
+		 */
+		kprobe_flush_task(prev);
+		put_task_struct(prev);
+	}
+}
+
+/**
+ * schedule_tail - first thing a freshly forked thread must call.
+ * @prev: the thread we just switched away from.
+ */
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
+{
+	/*
+	 * New tasks start with FORK_PREEMPT_COUNT, see there and
+	 * finish_task_switch() for details.
+	 *
+	 * finish_task_switch() will drop rq->lock() and lower preempt_count
+	 * and the preempt_enable() will end up enabling preemption (on
+	 * PREEMPT_COUNT kernels).
+	 */
+
+	finish_task_switch(prev);
+	preempt_enable();
+
+	if (current->set_child_tid)
+		put_user(task_pid_vnr(current), current->set_child_tid);
+}
+
+/*
+ * context_switch - switch to the new MM and the new thread's register state.
+ */
+static __always_inline void
+context_switch(struct rq *rq, struct task_struct *prev,
+	       struct task_struct *next)
+{
+	struct mm_struct *mm, *oldmm;
+
+	prepare_task_switch(rq, prev, next);
+
+	mm = next->mm;
+	oldmm = prev->active_mm;
+	/*
+	 * For paravirt, this is coupled with an exit in switch_to to
+	 * combine the page table reload and the switch backend into
+	 * one hypercall.
+	 */
+	arch_start_context_switch(prev);
+
+	if (!mm) {
+		next->active_mm = oldmm;
+		atomic_inc(&oldmm->mm_count);
+		enter_lazy_tlb(oldmm, next);
+	} else
+		switch_mm_irqs_off(oldmm, mm, next);
+
+	if (!prev->mm) {
+		prev->active_mm = NULL;
+		rq->prev_mm = oldmm;
+	}
+	/*
+	 * The runqueue lock will be released by the next
+	 * task (which is an invalid locking op but in the case
+	 * of the scheduler it's an obvious special case), so we
+	 * do an early lockdep release here:
+	 */
+	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+
+	/* Here we just switch the register state and the stack. */
+	switch_to(prev, next, prev);
+	barrier();
+
+	finish_task_switch(prev);
+}
+
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, total number of context switches performed since bootup.
+ */
+unsigned long nr_running(void)
+{
+	return atomic_read(&grq.nr_running);
+}
+
+static unsigned long nr_uninterruptible(void)
+{
+	return atomic_read(&grq.nr_uninterruptible);
+}
+
+/*
+ * Check if only the current task is running on the cpu.
+ *
+ * Caution: this function does not check that the caller has disabled
+ * preemption, thus the result might have a time-of-check-to-time-of-use
+ * race.  The caller is responsible for using it correctly, for example:
+ *
+ * - from a non-preemptable section (of course)
+ *
+ * - from a thread that is bound to a single CPU
+ *
+ * - in a loop with very short iterations (e.g. a polling loop)
+ */
+bool single_task_running(void)
+{
+	struct rq *rq = cpu_rq(smp_processor_id());
+
+	return rq_load(rq) == 1;
+}
+EXPORT_SYMBOL(single_task_running);
+
+unsigned long long nr_context_switches(void)
+{
+	return (unsigned long long)atomic64_read(&grq.nr_switches);
+}
+
+unsigned long nr_iowait(void)
+{
+	unsigned long i, sum = 0;
+
+	for_each_possible_cpu(i)
+		sum += atomic_read(&cpu_rq(i)->nr_iowait);
+
+	return sum;
+}
+
+unsigned long nr_iowait_cpu(int cpu)
+{
+	struct rq *this = cpu_rq(cpu);
+	return atomic_read(&this->nr_iowait);
+}
+
+unsigned long nr_active(void)
+{
+	return nr_running() + nr_uninterruptible();
+}
+
+/*
+ * I/O wait is the number of running or queued tasks with their ->rq pointer
+ * set to this cpu as being the CPU they're more likely to run on.
+ */
+void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
+{
+	struct rq *rq = this_rq();
+
+	*nr_waiters = atomic_read(&rq->nr_iowait);
+	*load = rq_load(rq);
+}
+
+/* Variables and functions for calc_load */
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	unsigned long newload;
+
+	newload = load * exp + active * (FIXED_1 - exp);
+	if (active >= load)
+		newload += FIXED_1-1;
+
+	return newload / FIXED_1;
+}
+
+/*
+ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds.
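+ *
+ * With the kernel's usual fixed point constants (FIXED_1 == 2048 and
+ * EXP_1 == 1884), each update moves avenrun[0] roughly 8% of the way
+ * towards the instantaneous active count, approximating the exp(-5s/60s)
+ * decay of the classic one minute load average; EXP_5 and EXP_15 do the
+ * same for the 5 and 15 minute averages.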
+ */
+void calc_global_load(unsigned long ticks)
+{
+	long active;
+
+	if (time_before(jiffies, calc_load_update))
+		return;
+	active = nr_active() * FIXED_1;
+
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update = jiffies + LOAD_FREQ;
+}
+
+DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
+
+EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
+{
+	if (unlikely(steal > NSEC_PER_SEC))
+		return div_u64(steal, TICK_NSEC);
+
+	return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compiler should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	s64 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+	/*
+	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
+	 * this case when a previous update_rq_clock() happened inside a
+	 * {soft,}irq region.
+	 *
+	 * When this happens, we stop ->clock_task and only update the
+	 * prev_irq_time stamp to account for the part that fit, so that a next
+	 * update will consume the rest. This ensures ->clock_task is
+	 * monotonic.
+	 *
+	 * It does however cause some slight misattribution of {soft,}irq
+	 * time, a more accurate solution would be to update the irq_time using
+	 * the current rq->clock timestamp, except that would require using
+	 * atomic ops.
+	 */
+	if (irq_delta > delta)
+		irq_delta = delta;
+
+	rq->prev_irq_time += irq_delta;
+	delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	if (static_key_false((&paravirt_steal_rq_enabled))) {
+		s64 steal = paravirt_steal_clock(cpu_of(rq));
+
+		steal -= rq->prev_steal_time_rq;
+
+		if (unlikely(steal > delta))
+			steal = delta;
+
+		rq->prev_steal_time_rq += steal;
+
+		delta -= steal;
+	}
+#endif
+	rq->clock_task += delta;
+}
+
+#ifndef nsecs_to_cputime
+# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
+#endif
+
+/*
+ * On each tick, add the number of nanoseconds to the unbanked variables and
+ * once one tick's worth has accumulated, account it allowing for accurate
+ * sub-tick accounting and totals.
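+ *
+ * As a worked example, assuming HZ == 250 so JIFFY_NS == 4000000: three
+ * successive idle periods of 1.5ms bank 4.5ms in rq->idle_ns, one full
+ * jiffy is accounted to CPUTIME_IDLE and the remaining 0.5ms is carried
+ * forward to the next update.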
+ */
+static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns)
+{
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	unsigned long ticks;
+
+	if (atomic_read(&rq->nr_iowait) > 0) {
+		rq->iowait_ns += ns;
+		if (rq->iowait_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->iowait_ns);
+			cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * ticks;
+			rq->iowait_ns %= JIFFY_NS;
+		}
+	} else {
+		rq->idle_ns += ns;
+		if (rq->idle_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->idle_ns);
+			cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * ticks;
+			rq->idle_ns %= JIFFY_NS;
+		}
+	}
+	acct_update_integrals(idle);
+}
+
+static void pc_system_time(struct rq *rq, struct task_struct *p,
+			   int hardirq_offset, unsigned long ns)
+{
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	unsigned long ticks;
+
+	p->stime_ns += ns;
+	if (p->stime_ns >= JIFFY_NS) {
+		ticks = NS_TO_JIFFIES(p->stime_ns);
+		p->stime_ns %= JIFFY_NS;
+		p->stime += (__force u64)cputime_one_jiffy * ticks;
+		p->stimescaled += one_jiffy_scaled * ticks;
+		account_group_system_time(p, cputime_one_jiffy * ticks);
+	}
+	p->sched_time += ns;
+	account_group_exec_runtime(p, ns);
+
+	if (hardirq_count() - hardirq_offset) {
+		rq->irq_ns += ns;
+		if (rq->irq_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->irq_ns);
+			cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * ticks;
+			rq->irq_ns %= JIFFY_NS;
+		}
+	} else if (in_serving_softirq()) {
+		rq->softirq_ns += ns;
+		if (rq->softirq_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->softirq_ns);
+			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks;
+			rq->softirq_ns %= JIFFY_NS;
+		}
+	} else {
+		rq->system_ns += ns;
+		if (rq->system_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->system_ns);
+			cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * ticks;
+			rq->system_ns %= JIFFY_NS;
+		}
+	}
+	acct_update_integrals(p);
+}
+
+static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns)
+{
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
+	unsigned long ticks;
+
+	p->utime_ns += ns;
+	if (p->utime_ns >= JIFFY_NS) {
+		ticks = NS_TO_JIFFIES(p->utime_ns);
+		p->utime_ns %= JIFFY_NS;
+		p->utime += (__force u64)cputime_one_jiffy * ticks;
+		p->utimescaled += one_jiffy_scaled * ticks;
+		account_group_user_time(p, cputime_one_jiffy * ticks);
+	}
+	p->sched_time += ns;
+	account_group_exec_runtime(p, ns);
+
+	if (this_cpu_ksoftirqd() == p) {
+		/*
+		 * ksoftirqd time does not get accounted in cpu_softirq_time.
+		 * So, we have to handle it separately here.
+		 */
+		rq->softirq_ns += ns;
+		if (rq->softirq_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->softirq_ns);
+			cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks;
+			rq->softirq_ns %= JIFFY_NS;
+		}
+	}
+
+	if (task_nice(p) > 0 || idleprio_task(p)) {
+		rq->nice_ns += ns;
+		if (rq->nice_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->nice_ns);
+			cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * ticks;
+			rq->nice_ns %= JIFFY_NS;
+		}
+	} else {
+		rq->user_ns += ns;
+		if (rq->user_ns >= JIFFY_NS) {
+			ticks = NS_TO_JIFFIES(rq->user_ns);
+			cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * ticks;
+			rq->user_ns %= JIFFY_NS;
+		}
+	}
+	acct_update_integrals(p);
+}
+
+/*
+ * This is called on clock ticks.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here in microseconds.
+ */
+static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
+{
+	s64 account_ns = rq->niffies - p->last_ran;
+	struct task_struct *idle = rq->idle;
+
+	/* Accurate tick timekeeping */
+	if (user_mode(get_irq_regs()))
+		pc_user_time(rq, p, account_ns);
+	else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) {
+		pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns);
+	} else
+		pc_idle_time(rq, idle, account_ns);
+
+	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
+	if (p->policy != SCHED_FIFO && p != idle)
+		p->time_slice -= NS_TO_US(account_ns);
+
+	p->last_ran = rq->niffies;
+}
+
+/*
+ * This is called on context switches.
+ * Bank in p->sched_time the ns elapsed since the last tick or switch.
+ * CPU scheduler quota accounting is also performed here in microseconds.
+ */
+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
+{
+	s64 account_ns = rq->niffies - p->last_ran;
+	struct task_struct *idle = rq->idle;
+
+	/* Accurate subtick timekeeping */
+	if (p != idle)
+		pc_user_time(rq, p, account_ns);
+	else
+		pc_idle_time(rq, idle, account_ns);
+
+	/* time_slice accounting is done in usecs to avoid overflow on 32bit */
+	if (p->policy != SCHED_FIFO && p != idle)
+		p->time_slice -= NS_TO_US(account_ns);
+}
+
+/*
+ * Return any ns on the sched_clock that have not yet been accounted in
+ * @p in case that task is currently running.
+ *
+ * Called with task_rq_lock(p) held.
+ */
+static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+{
+	u64 ns = 0;
+
+	/*
+	 * Must be ->curr _and_ ->on_rq.  If dequeued, we would
+	 * project cycles that may never be accounted to this
+	 * thread, breaking clock_gettime().
+	 */
+	if (p == rq->curr && task_on_rq_queued(p)) {
+		update_clocks(rq);
+		ns = rq->niffies - p->last_ran;
+	}
+
+	return ns;
+}
+
+/*
+ * Return accounted runtime for the task.
+ * Return separately the current's pending runtime that has not been
+ * accounted yet.
+ */
+unsigned long long task_sched_runtime(struct task_struct *p)
+{
+	unsigned long flags;
+	struct rq *rq;
+	u64 ns;
+
+#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
+	/*
+	 * 64-bit doesn't need locks to atomically read a 64bit value.
+	 * So we have an optimization chance when the task's delta_exec is 0.
+	 * Reading ->on_cpu is racy, but this is ok.
+	 *
+	 * If we race with it leaving cpu, we'll take a lock. So we're correct.
+	 * If we race with it entering cpu, unaccounted time is 0. This is
+	 * indistinguishable from the read occurring a few cycles earlier.
+	 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
+	 * been accounted, so we're correct here as well.
+	 */
+	if (!p->on_cpu || !task_on_rq_queued(p))
+		return tsk_seruntime(p);
+#endif
+
+	rq = task_rq_lock(p, &flags);
+	ns = p->sched_time + do_task_delta_exec(p, rq);
+	task_rq_unlock(rq, p, &flags);
+
+	return ns;
+}
+
+/*
+ * Functions to test for when SCHED_ISO tasks have used their allocated
+ * quota as real time scheduling and convert them back to SCHED_NORMAL. All
+ * data is modified only by the local runqueue during scheduler_tick with
+ * interrupts disabled.
+ */
+
+/*
+ * Test if SCHED_ISO tasks have run longer than their allotted period as RT
+ * tasks and set the refractory flag if necessary. There is 10% hysteresis
+ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
+ * slow division.
+ */
+static inline void iso_tick(struct rq *rq)
+{
+	rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
+	rq->iso_ticks += 100;
+	if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) {
+		rq->iso_refractory = true;
+		if (unlikely(rq->iso_ticks > ISO_PERIOD * 100))
+			rq->iso_ticks = ISO_PERIOD * 100;
+	}
+}
+
+/* No SCHED_ISO task was running so decrease rq->iso_ticks */
+static inline void no_iso_tick(struct rq *rq, int ticks)
+{
+	if (rq->iso_ticks > 0 || rq->iso_refractory) {
+		rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD;
+		if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) {
+			rq->iso_refractory = false;
+			if (unlikely(rq->iso_ticks < 0))
+				rq->iso_ticks = 0;
+		}
+	}
+}
+
+/* This manages tasks that have run out of timeslice during a scheduler_tick */
+static void task_running_tick(struct rq *rq)
+{
+	struct task_struct *p = rq->curr;
+
+	/*
+	 * If a SCHED_ISO task is running we increment the iso_ticks. In
+	 * order to prevent SCHED_ISO tasks from causing starvation in the
+	 * presence of true RT tasks we account those as iso_ticks as well.
+	 */
+	if (rt_task(p) || task_running_iso(p))
+		iso_tick(rq);
+	else
+		no_iso_tick(rq, 1);
+
+	/* SCHED_FIFO tasks never run out of timeslice. */
+	if (p->policy == SCHED_FIFO)
+		return;
+
+	if (iso_task(p)) {
+		if (task_running_iso(p)) {
+			if (rq->iso_refractory) {
+				/*
+				 * SCHED_ISO task is running as RT and limit
+				 * has been hit. Force it to reschedule as
+				 * SCHED_NORMAL by zeroing its time_slice
+				 */
+				p->time_slice = 0;
+			}
+		} else if (!rq->iso_refractory) {
+			/* Can now run again ISO. Reschedule to pick up prio */
+			goto out_resched;
+		}
+	}
+
+	/*
+	 * Tasks that were scheduled in the first half of a tick are not
+	 * allowed to run into the 2nd half of the next tick if they will
+	 * run out of time slice in the interim. Otherwise, if they have
+	 * less than RESCHED_US μs of time slice left they will be rescheduled.
+	 * Dither is used as a backup for when hrexpiry is disabled or high res
+	 * timers not configured in.
+	 */
+	if (p->time_slice - rq->dither >= RESCHED_US)
+		return;
+out_resched:
+	rq_lock(rq);
+	__set_tsk_resched(p);
+	rq_unlock(rq);
+}
+
+#ifdef CONFIG_NO_HZ_FULL
+/*
+ * We can stop the timer tick any time highres timers are active since
+ * we rely entirely on highres timeouts for task expiry rescheduling.
+ */
+static void sched_stop_tick(struct rq *rq, int cpu)
+{
+	if (!hrexpiry_enabled(rq))
+		return;
+	if (!tick_nohz_full_enabled())
+		return;
+	if (!tick_nohz_full_cpu(cpu))
+		return;
+	tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+
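+/* Tell nohz_full that the scheduler needs the tick on this CPU again */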
+static inline void sched_start_tick(struct rq *rq, int cpu)
+{
+	tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+
+/**
+ * scheduler_tick_max_deferment
+ *
+ * Keep at least one tick per second when a single
+ * active task is running.
+ *
+ * This makes sure that uptime continues to move forward, even
+ * with a very low granularity.
+ *
+ * Return: Maximum deferment in nanoseconds.
+ */
+u64 scheduler_tick_max_deferment(void)
+{
+	struct rq *rq = this_rq();
+	unsigned long next, now = READ_ONCE(jiffies);
+
+	next = rq->last_jiffy + HZ;
+
+	if (time_before_eq(next, now))
+		return 0;
+
+	return jiffies_to_nsecs(next - now);
+}
+#else
+static inline void sched_stop_tick(struct rq *rq, int cpu)
+{
+}
+
+static inline void sched_start_tick(struct rq *rq, int cpu)
+{
+}
+#endif
+
+/*
+ * This function gets called by the timer code, with HZ frequency.
+ * We call it with interrupts disabled.
+ */
+void scheduler_tick(void)
+{
+	int cpu __maybe_unused = smp_processor_id();
+	struct rq *rq = cpu_rq(cpu);
+
+	sched_clock_tick();
+	update_clocks(rq);
+	update_load_avg(rq);
+	update_cpu_clock_tick(rq, rq->curr);
+	if (!rq_idle(rq))
+		task_running_tick(rq);
+	else
+		no_iso_tick(rq, rq->last_scheduler_tick - rq->last_jiffy);
+	rq->last_scheduler_tick = rq->last_jiffy;
+	rq->last_tick = rq->clock;
+	perf_event_task_tick();
+	sched_stop_tick(rq, cpu);
+}
+
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+				defined(CONFIG_PREEMPT_TRACER))
+/*
+ * If the value passed in is equal to the current preempt count
+ * then we just disabled preemption. Start timing the latency.
+ */
+static inline void preempt_latency_start(int val)
+{
+	if (preempt_count() == val) {
+		unsigned long ip = get_lock_parent_ip();
+#ifdef CONFIG_DEBUG_PREEMPT
+		current->preempt_disable_ip = ip;
+#endif
+		trace_preempt_off(CALLER_ADDR0, ip);
+	}
+}
+
+void preempt_count_add(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
+		return;
+#endif
+	__preempt_count_add(val);
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Spinlock count overflowing soon?
+	 */
+	DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
+				PREEMPT_MASK - 10);
+#endif
+	preempt_latency_start(val);
+}
+EXPORT_SYMBOL(preempt_count_add);
+NOKPROBE_SYMBOL(preempt_count_add);
+
+/*
+ * If the value passed in equals to the current preempt count
+ * then we just enabled preemption. Stop timing the latency.
+ */
+static inline void preempt_latency_stop(int val)
+{
+	if (preempt_count() == val)
+		trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
+}
+
+void preempt_count_sub(int val)
+{
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
+		return;
+	/*
+	 * Is the spinlock portion underflowing?
+	 */
+	if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
+			!(preempt_count() & PREEMPT_MASK)))
+		return;
+#endif
+
+	preempt_latency_stop(val);
+	__preempt_count_sub(val);
+}
+EXPORT_SYMBOL(preempt_count_sub);
+NOKPROBE_SYMBOL(preempt_count_sub);
+
+#else
+static inline void preempt_latency_start(int val) { }
+static inline void preempt_latency_stop(int val) { }
+#endif
+
+/*
+ * The time_slice is only refilled when it is empty and that is when we set a
+ * new deadline. Make sure update_clocks has been called recently to update
+ * rq->niffies.
+ */
+static void time_slice_expired(struct task_struct *p, struct rq *rq)
+{
+	p->time_slice = timeslice();
+	p->deadline = rq->niffies + task_deadline_diff(p);
+#ifdef CONFIG_SMT_NICE
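+	/* Set this task's weight for SMT sibling scheduling decisions */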
+	if (!p->mm)
+		p->smt_bias = 0;
+	else if (rt_task(p))
+		p->smt_bias = 1 << 30;
+	else if (task_running_iso(p))
+		p->smt_bias = 1 << 29;
+	else if (idleprio_task(p)) {
+		if (task_running_idle(p))
+			p->smt_bias = 0;
+		else
+			p->smt_bias = 1;
+	} else if (--p->smt_bias < 1)
+		p->smt_bias = MAX_PRIO - p->static_prio;
+#endif
+}
+
+/*
+ * Timeslices below RESCHED_US are considered as good as expired as there's no
+ * point rescheduling when there's so little time left. SCHED_BATCH tasks
+ * have been flagged as not latency sensitive and likely to be fully CPU
+ * bound so every time they're rescheduled they have their time_slice
+ * refilled, but get a new later deadline to have little effect on
+ * SCHED_NORMAL tasks.
+ */
+static inline void check_deadline(struct task_struct *p, struct rq *rq)
+{
+	if (p->time_slice < RESCHED_US || batch_task(p))
+		time_slice_expired(p, rq);
+}
+
+/*
+ * Task selection with skiplists is a simple matter of picking off the first
+ * task in the sorted list, an O(1) operation. The lookup is amortised O(1)
+ * being bound to the number of processors.
+ *
+ * Runqueues are selectively locked based on their unlocked data and then
+ * unlocked if not needed. At most 3 locks will be held at any time and are
+ * released as soon as they're no longer needed. All balancing between CPUs
+ * is thus done here in an extremely simple first come best fit manner.
+ *
+ * This iterates over runqueues in cache locality order. In interactive mode
+ * it iterates over all CPUs and finds the task with the best key/deadline.
+ * In non-interactive mode it will only take a task if it's from the current
+ * runqueue or a runqueue with more tasks than the current one with a better
+ * key/deadline.
+ */
+#ifdef CONFIG_SMP
+static inline struct task_struct
+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
+{
+	struct rq *locked = NULL, *chosen = NULL;
+	struct task_struct *edt = idle;
+	int i, best_entries = 0;
+	u64 best_key = ~0ULL;
+
+	for (i = 0; i < num_possible_cpus(); i++) {
+		struct rq *other_rq = rq_order(rq, i);
+		int entries = other_rq->sl->entries;
+		skiplist_node *next;
+
+		/*
+		 * Check for queued entries locklessly first. The local runqueue
+		 * is locked so entries will always be accurate.
+		 */
+		if (!sched_interactive) {
+			/*
+			 * Don't reschedule balance across nodes unless the CPU
+			 * is idle.
+			 */
+			if (edt != idle && rq->cpu_locality[other_rq->cpu] > 3)
+				break;
+			if (entries <= best_entries)
+				continue;
+		} else if (!entries)
+			continue;
+
+		/* if (i) implies other_rq != rq */
+		if (i) {
+			/* Check the best queued key locklessly first */
+			if (other_rq->best_key >= best_key)
+				continue;
+
+			if (unlikely(!trylock_rq(rq, other_rq)))
+				continue;
+
+			/* Need to reevaluate entries after locking */
+			entries = other_rq->sl->entries;
+			if (unlikely(!entries)) {
+				unlock_rq(other_rq);
+				continue;
+			}
+		}
+
+		next = &other_rq->node;
+		/*
+		 * In interactive mode we check beyond the best entry on other
+		 * runqueues if we can't get the best for smt or affinity
+		 * reasons.
+		 */
+		while ((next = next->next[0]) != &other_rq->node) {
+			struct task_struct *p;
+			u64 key = next->key;
+
+			/* Reevaluate key after locking */
+			if (key >= best_key)
+				break;
+
+			p = next->value;
+			if (!smt_schedule(p, rq)) {
+				if (i && !sched_interactive)
+					break;
+				continue;
+			}
+
+			/* Make sure affinity is ok */
+			if (i) {
+				if (needs_other_cpu(p, cpu)) {
+					if (sched_interactive)
+						continue;
+					break;
+				}
+				/* From this point on p is the best so far */
+				if (locked)
+					unlock_rq(locked);
+				chosen = locked = other_rq;
+			}
+			best_entries = entries;
+			best_key = key;
+			edt = p;
+			break;
+		}
+		if (i && other_rq != chosen)
+			unlock_rq(other_rq);
+	}
+
+	if (likely(edt != idle))
+		take_task(rq, cpu, edt);
+
+	if (locked)
+		unlock_rq(locked);
+
+	return edt;
+}
+#else /* CONFIG_SMP */
+static inline struct task_struct
+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
+{
+	struct task_struct *edt;
+
+	if (unlikely(!rq->sl->entries))
+		return idle;
+	edt = rq->node.next[0]->value;
+	take_task(rq, cpu, edt);
+	return edt;
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Print scheduling while atomic bug:
+ */
+static noinline void __schedule_bug(struct task_struct *prev)
+{
+	if (oops_in_progress)
+		return;
+
+	printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
+		prev->comm, prev->pid, preempt_count());
+
+	debug_show_held_locks(prev);
+	print_modules();
+	if (irqs_disabled())
+		print_irqtrace_events(prev);
+#ifdef CONFIG_DEBUG_PREEMPT
+	if (in_atomic_preempt_off()) {
+		pr_err("Preemption disabled at:");
+		print_ip_sym(current->preempt_disable_ip);
+		pr_cont("\n");
+	}
+#endif
+	dump_stack();
+	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
+}
+
+/*
+ * Various schedule()-time debugging checks and statistics:
+ */
+static inline void schedule_debug(struct task_struct *prev)
+{
+#ifdef CONFIG_SCHED_STACK_END_CHECK
+	if (task_stack_end_corrupted(prev))
+		panic("corrupted stack end detected inside scheduler\n");
+#endif
+
+	if (unlikely(in_atomic_preempt_off())) {
+		__schedule_bug(prev);
+		preempt_count_set(PREEMPT_DISABLED);
+	}
+	rcu_sleep_check();
+
+	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
+
+	schedstat_inc(this_rq(), sched_count);
+}
+
+/*
+ * The currently running task's information is all stored in rq local data
+ * which is only modified by the local CPU.
+ */
+static inline void set_rq_task(struct rq *rq, struct task_struct *p)
+{
+	if (p == rq->idle || p->policy == SCHED_FIFO)
+		hrexpiry_clear(rq);
+	else
+		hrexpiry_start(rq, US_TO_NS(p->time_slice));
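+	/* Dithering only applies within half a jiffy of the last tick */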
+	if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
+		rq->dither = 0;
+	else
+		rq->dither = rq_dither(rq);
+
+	rq->rq_deadline = p->deadline;
+	rq->rq_prio = p->prio;
+#ifdef CONFIG_SMT_NICE
+	rq->rq_mm = p->mm;
+	rq->rq_smt_bias = p->smt_bias;
+#endif
+}
+
+#ifdef CONFIG_SMT_NICE
+static void check_no_siblings(struct rq __maybe_unused *this_rq) {}
+static void wake_no_siblings(struct rq __maybe_unused *this_rq) {}
+static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings;
+static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings;
+
+/*
+ * Iterate over SMT siblings when we've scheduled a process on this CPU and
+ * decide whether they should continue running or be descheduled.
+ */
+static void check_smt_siblings(struct rq *this_rq)
+{
+	int other_cpu;
+
+	for_each_cpu(other_cpu, &this_rq->thread_mask) {
+		struct task_struct *p;
+		struct rq *rq;
+
+		rq = cpu_rq(other_cpu);
+		if (rq_idle(rq))
+			continue;
+		p = rq->curr;
+		if (!smt_schedule(p, this_rq))
+			resched_curr(rq);
+	}
+}
+
+static void wake_smt_siblings(struct rq *this_rq)
+{
+	int other_cpu;
+
+	for_each_cpu(other_cpu, &this_rq->thread_mask) {
+		struct rq *rq;
+
+		rq = cpu_rq(other_cpu);
+		if (rq_idle(rq))
+			resched_idle(rq);
+	}
+}
+#else
+static void check_siblings(struct rq __maybe_unused *this_rq) {}
+static void wake_siblings(struct rq __maybe_unused *this_rq) {}
+#endif
+
+/*
+ * schedule() is the main scheduler function.
+ *
+ * The main means of driving the scheduler and thus entering this function are:
+ *
+ *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
+ *
+ *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
+ *      paths. For example, see arch/x86/entry_64.S.
+ *
+ *      To drive preemption between tasks, the scheduler sets the flag in timer
+ *      interrupt handler scheduler_tick().
+ *
+ *   3. Wakeups don't really cause entry into schedule(). They add a
+ *      task to the run-queue and that's it.
+ *
+ *      Now, if the new task added to the run-queue preempts the current
+ *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
+ *      called on the nearest possible occasion:
+ *
+ *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
+ *
+ *         - in syscall or exception context, at the next outmost
+ *           preempt_enable(). (this might be as soon as the wake_up()'s
+ *           spin_unlock()!)
+ *
+ *         - in IRQ context, return from interrupt-handler to
+ *           preemptible context
+ *
+ *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
+ *         then at the next:
+ *
+ *          - cond_resched() call
+ *          - explicit schedule() call
+ *          - return from syscall or exception to user-space
+ *          - return from interrupt-handler to user-space
+ *
+ * WARNING: must be called with preemption disabled!
+ */
+static void __sched notrace __schedule(bool preempt)
+{
+	struct task_struct *prev, *next, *idle;
+	unsigned long *switch_count;
+	bool deactivate = false;
+	struct rq *rq;
+	u64 niffies;
+	int cpu;
+
+	cpu = smp_processor_id();
+	rq = cpu_rq(cpu);
+	prev = rq->curr;
+	idle = rq->idle;
+
+	/*
+	 * do_exit() calls schedule() with preemption disabled as an exception;
+	 * however we must fix that up, otherwise the next task will see an
+	 * inconsistent (higher) preempt count.
+	 *
+	 * It also avoids the below schedule_debug() test from complaining
+	 * about this.
+	 */
+	if (unlikely(prev->state == TASK_DEAD))
+		preempt_enable_no_resched_notrace();
+
+	schedule_debug(prev);
+
+	local_irq_disable();
+	rcu_note_context_switch();
+
+	/*
+	 * Make sure that signal_pending_state()->signal_pending() below
+	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
+	 * done by the caller to avoid the race with signal_wake_up().
+	 */
+	smp_mb__before_spinlock();
+	rq_lock(rq);
+#ifdef CONFIG_SMP
+	if (rq->preempt) {
+		/*
+		 * Make sure resched_curr hasn't triggered a preemption
+		 * locklessly on a task that has since scheduled away. Spurious
+		 * wakeup of idle is okay though.
+		 */
+		if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) {
+			rq->preempt = NULL;
+			clear_preempt_need_resched();
+			rq_unlock_irq(rq);
+			return;
+		}
+		rq->preempt = NULL;
+	}
+#endif
+
+	switch_count = &prev->nivcsw;
+	if (!preempt && prev->state) {
+		if (unlikely(signal_pending_state(prev->state, prev))) {
+			prev->state = TASK_RUNNING;
+		} else {
+			deactivate = true;
+			prev->on_rq = 0;
+
+			/*
+			 * If a worker is going to sleep, notify and
+			 * ask workqueue whether it wants to wake up a
+			 * task to maintain concurrency.  If so, wake
+			 * up the task.
+			 */
+			if (prev->flags & PF_WQ_WORKER) {
+				struct task_struct *to_wakeup;
+
+				to_wakeup = wq_worker_sleeping(prev);
+				if (to_wakeup)
+					try_to_wake_up_local(to_wakeup);
+			}
+		}
+		switch_count = &prev->nvcsw;
+	}
+
+	/*
+	 * Store the niffy value here for use by the next task's last_ran
+	 * below to avoid losing niffies due to update_clocks being called
+	 * again after this point.
+	 */
+	update_clocks(rq);
+	niffies = rq->niffies;
+	update_cpu_clock_switch(rq, prev);
+
+	clear_tsk_need_resched(prev);
+	clear_preempt_need_resched();
+
+	if (idle != prev) {
+		check_deadline(prev, rq);
+		return_task(prev, rq, cpu, deactivate);
+	}
+
+	next = earliest_deadline_task(rq, cpu, idle);
+	if (likely(next->prio != PRIO_LIMIT))
+		clear_cpuidle_map(cpu);
+	else {
+		set_cpuidle_map(cpu);
+		update_load_avg(rq);
+	}
+
+	set_rq_task(rq, next);
+	next->last_ran = niffies;
+
+	if (likely(prev != next)) {
+		/*
+		 * Don't reschedule an idle task or deactivated tasks
+		 */
+		if (prev != idle && !deactivate)
+			resched_suitable_idle(prev);
+		if (next != idle)
+			check_siblings(rq);
+		else
+			wake_siblings(rq);
+		atomic64_inc(&grq.nr_switches);
+		rq->curr = next;
+		++*switch_count;
+
+		trace_sched_switch(preempt, prev, next);
+		context_switch(rq, prev, next); /* unlocks the rq */
+	} else {
+		check_siblings(rq);
+		rq_unlock(rq);
+		do_pending_softirq(rq, next);
+		local_irq_enable();
+	}
+}
+
+static inline void sched_submit_work(struct task_struct *tsk)
+{
+	if (!tsk->state || tsk_is_pi_blocked(tsk) ||
+	    preempt_count() ||
+	    signal_pending_state(tsk->state, tsk))
+		return;
+
+	/*
+	 * If we are going to sleep and we have plugged IO queued,
+	 * make sure to submit it to avoid deadlocks.
+	 */
+	if (blk_needs_flush_plug(tsk))
+		blk_schedule_flush_plug(tsk);
+}
+
+asmlinkage __visible void __sched schedule(void)
+{
+	struct task_struct *tsk = current;
+
+	sched_submit_work(tsk);
+	do {
+		preempt_disable();
+		__schedule(false);
+		sched_preempt_enable_no_resched();
+	} while (need_resched());
+}
+
+EXPORT_SYMBOL(schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+asmlinkage __visible void __sched schedule_user(void)
+{
+	/*
+	 * If we come here after a random call to set_need_resched(),
+	 * or we have been woken up remotely but the IPI has not yet arrived,
+	 * we haven't yet exited the RCU idle mode. Do it here manually until
+	 * we find a better solution.
+	 *
+	 * NB: There are buggy callers of this function.  Ideally we
+	 * should warn if prev_state != IN_USER, but that will trigger
+	 * too frequently to make sense yet.
+	 */
+	enum ctx_state prev_state = exception_enter();
+	schedule();
+	exception_exit(prev_state);
+}
+#endif
+
+/**
+ * schedule_preempt_disabled - called with preemption disabled
+ *
+ * Returns with preemption disabled. Note: preempt_count must be 1
+ */
+void __sched schedule_preempt_disabled(void)
+{
+	sched_preempt_enable_no_resched();
+	schedule();
+	preempt_disable();
+}
+
+static void __sched notrace preempt_schedule_common(void)
+{
+	do {
+		/*
+		 * Because the function tracer can trace preempt_count_sub()
+		 * and it also uses preempt_enable/disable_notrace(), if
+		 * NEED_RESCHED is set, the preempt_enable_notrace() called
+		 * by the function tracer will call this function again and
+		 * cause infinite recursion.
+		 *
+		 * Preemption must be disabled here before the function
+		 * tracer can trace. Break up preempt_disable() into two
+		 * calls. One to disable preemption without fear of being
+		 * traced. The other to still record the preemption latency,
+		 * which can also be traced by the function tracer.
+		 */
+		preempt_disable_notrace();
+		preempt_latency_start(1);
+		__schedule(true);
+		preempt_latency_stop(1);
+		preempt_enable_no_resched_notrace();
+
+		/*
+		 * Check again in case we missed a preemption opportunity
+		 * between schedule and now.
+		 */
+	} while (need_resched());
+}
+
+#ifdef CONFIG_PREEMPT
+/*
+ * this is the entry point to schedule() from in-kernel preemption
+ * off of preempt_enable. Kernel preemptions off return from interrupt
+ * occur there and call schedule directly.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule(void)
+{
+	/*
+	 * If there is a non-zero preempt_count or interrupts are disabled,
+	 * we do not want to preempt the current task. Just return..
+	 */
+	if (likely(!preemptible()))
+		return;
+
+	preempt_schedule_common();
+}
+NOKPROBE_SYMBOL(preempt_schedule);
+EXPORT_SYMBOL(preempt_schedule);
+
+/**
+ * preempt_schedule_notrace - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
+{
+	enum ctx_state prev_ctx;
+
+	if (likely(!preemptible()))
+		return;
+
+	do {
+		/*
+		 * Because the function tracer can trace preempt_count_sub()
+		 * and it also uses preempt_enable/disable_notrace(), if
+		 * NEED_RESCHED is set, the preempt_enable_notrace() called
+		 * by the function tracer will call this function again and
+		 * cause infinite recursion.
+		 *
+		 * Preemption must be disabled here before the function
+		 * tracer can trace. Break up preempt_disable() into two
+		 * calls. One to disable preemption without fear of being
+		 * traced. The other to still record the preemption latency,
+		 * which can also be traced by the function tracer.
+		 */
+		preempt_disable_notrace();
+		preempt_latency_start(1);
+		/*
+		 * Needs preempt disabled in case user_exit() is traced
+		 * and the tracer calls preempt_enable_notrace() causing
+		 * an infinite recursion.
+		 */
+		prev_ctx = exception_enter();
+		__schedule(true);
+		exception_exit(prev_ctx);
+
+		preempt_latency_stop(1);
+		preempt_enable_no_resched_notrace();
+	} while (need_resched());
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
+
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * this is the entry point to schedule() from kernel preemption
+ * off of irq context.
+ * Note, that this is called and returns with irqs disabled. This will
+ * protect us against recursive calling from irq.
+ */
+asmlinkage __visible void __sched preempt_schedule_irq(void)
+{
+	enum ctx_state prev_state;
+
+	/* Catch callers which need to be fixed */
+	BUG_ON(preempt_count() || !irqs_disabled());
+
+	prev_state = exception_enter();
+
+	do {
+		preempt_disable();
+		local_irq_enable();
+		__schedule(true);
+		local_irq_disable();
+		sched_preempt_enable_no_resched();
+	} while (need_resched());
+
+	exception_exit(prev_state);
+}
+
+int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,
+			  void *key)
+{
+	return try_to_wake_up(curr->private, mode, wake_flags);
+}
+EXPORT_SYMBOL(default_wake_function);
+
+#ifdef CONFIG_RT_MUTEXES
+
+/*
+ * rt_mutex_setprio - set the current priority of a task
+ * @p: task
+ * @prio: prio value (kernel-internal form)
+ *
+ * This function changes the 'effective' priority of a task. It does
+ * not touch ->normal_prio like __setscheduler().
+ *
+ * Used by the rt_mutex code to implement priority inheritance
+ * logic. Call site only calls if the priority of the task changed.
+ */
+void rt_mutex_setprio(struct task_struct *p, int prio)
+{
+	struct rq *rq;
+	int oldprio;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = __task_rq_lock(p);
+
+	/*
+	 * Idle task boosting is a nono in general. There is one
+	 * exception, when PREEMPT_RT and NOHZ is active:
+	 *
+	 * The idle task calls get_next_timer_interrupt() and holds
+	 * the timer wheel base->lock on the CPU and another CPU wants
+	 * to access the timer (probably to cancel it). We can safely
+	 * ignore the boosting request, as the idle CPU runs this code
+	 * with interrupts disabled and will complete the lock
+	 * protected section without being interrupted. So there is no
+	 * real need to boost.
+	 */
+	if (unlikely(p == rq->idle)) {
+		WARN_ON(p != rq->curr);
+		WARN_ON(p->pi_blocked_on);
+		goto out_unlock;
+	}
+
+	trace_sched_pi_setprio(p, prio);
+	oldprio = p->prio;
+	p->prio = prio;
+	if (task_running(rq, p)) {
+		if (prio > oldprio)
+			resched_task(p);
+	} else if (task_queued(p)) {
+		dequeue_task(rq, p, DEQUEUE_SAVE);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
+		if (prio < oldprio)
+			try_preempt(p, rq);
+	}
+out_unlock:
+	__task_rq_unlock(rq);
+}
+
+#endif
+
+/*
+ * Adjust the deadline for when the priority is to change, before it's
+ * changed.
+ */
+static inline void adjust_deadline(struct task_struct *p, int new_prio)
+{
+	p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p);
+}
+
+void set_user_nice(struct task_struct *p, long nice)
+{
+	int new_static, old_static;
+	unsigned long flags;
+	struct rq *rq;
+
+	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
+		return;
+	new_static = NICE_TO_PRIO(nice);
+	/*
+	 * We have to be careful, if called from sys_setpriority(),
+	 * the task might be in the middle of scheduling on another CPU.
+	 */
+	rq = task_rq_lock(p, &flags);
+	/*
+	 * The RT priorities are set via sched_setscheduler(), but we still
+	 * allow the 'normal' nice value to be set - but as expected
+	 * it won't have any effect on scheduling while the task is
+	 * not SCHED_NORMAL/SCHED_BATCH:
+	 */
+	if (has_rt_policy(p)) {
+		p->static_prio = new_static;
+		goto out_unlock;
+	}
+
+	adjust_deadline(p, new_static);
+	old_static = p->static_prio;
+	p->static_prio = new_static;
+	p->prio = effective_prio(p);
+
+	if (task_queued(p)) {
+		dequeue_task(rq, p, DEQUEUE_SAVE);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
+		if (new_static < old_static)
+			try_preempt(p, rq);
+	} else if (task_running(rq, p)) {
+		set_rq_task(rq, p);
+		if (old_static < new_static)
+			resched_task(p);
+	}
+out_unlock:
+	task_rq_unlock(rq, p, &flags);
+}
+EXPORT_SYMBOL(set_user_nice);
+
+/*
+ * can_nice - check if a task can reduce its nice value
+ * @p: task
+ * @nice: nice value
+ */
+int can_nice(const struct task_struct *p, const int nice)
+{
+	/* convert nice value [19,-20] to rlimit style value [1,40] */
+	int nice_rlim = nice_to_rlimit(nice);
+
+	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
+		capable(CAP_SYS_NICE));
+}
+
+#ifdef __ARCH_WANT_SYS_NICE
+
+/*
+ * sys_nice - change the priority of the current process.
+ * @increment: priority increment
+ *
+ * sys_setpriority is a more generic, but much slower function that
+ * does similar things.
+ */
+SYSCALL_DEFINE1(nice, int, increment)
+{
+	long nice, retval;
+
+	/*
+	 * Setpriority might change our priority at the same moment.
+	 * We don't have to worry. Conceptually one call occurs first
+	 * and we have a single winner.
+	 */
+
+	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
+	nice = task_nice(current) + increment;
+
+	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
+	if (increment < 0 && !can_nice(current, nice))
+		return -EPERM;
+
+	retval = security_task_setnice(current, nice);
+	if (retval)
+		return retval;
+
+	set_user_nice(current, nice);
+	return 0;
+}
+
+#endif
+
+/**
+ * task_prio - return the priority value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The priority value as seen by users in /proc.
+ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes
+ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO).
+ */
+int task_prio(const struct task_struct *p)
+{
+	int delta, prio = p->prio - MAX_RT_PRIO;
+
+	/* rt tasks and iso tasks */
+	if (prio <= 0)
+		goto out;
+
+	/* Convert to ms to avoid overflows */
+	delta = NS_TO_MS(p->deadline - task_rq(p)->niffies);
+	delta = delta * 40 / ms_longest_deadline_diff();
+	if (delta > 0 && delta <= 80)
+		prio += delta;
+	if (idleprio_task(p))
+		prio += 40;
+out:
+	return prio;
+}
+
+/**
+ * idle_cpu - is a given cpu idle currently?
+ * @cpu: the processor in question.
+ *
+ * Return: 1 if the CPU is currently idle. 0 otherwise.
+ */
+int idle_cpu(int cpu)
+{
+	return cpu_curr(cpu) == cpu_rq(cpu)->idle;
+}
+
+/**
+ * idle_task - return the idle task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * Return: The idle task for the cpu @cpu.
+ */
+struct task_struct *idle_task(int cpu)
+{
+	return cpu_rq(cpu)->idle;
+}
+
+/**
+ * find_process_by_pid - find a process with a matching PID value.
+ * @pid: the pid in question.
+ *
+ * The task of @pid, if found. %NULL otherwise.
+ */
+static inline struct task_struct *find_process_by_pid(pid_t pid)
+{
+	return pid ? find_task_by_vpid(pid) : current;
+}
+
+/* Actually do priority change: must hold rq lock. */
+static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
+			   int prio, bool keep_boost)
+{
+	int oldrtprio, oldprio;
+
+	p->policy = policy;
+	oldrtprio = p->rt_priority;
+	p->rt_priority = prio;
+	p->normal_prio = normal_prio(p);
+	oldprio = p->prio;
+	/*
+	 * Keep a potential priority boosting if called from
+	 * sched_setscheduler().
+	 */
+	if (keep_boost) {
+		/*
+		 * Take priority boosted tasks into account. If the new
+		 * effective priority is unchanged, we just store the new
+		 * normal parameters and do not touch the scheduler class and
+		 * the runqueue. This will be done when the task deboosts
+		 * itself.
+		 */
+		p->prio = rt_mutex_get_effective_prio(p, p->normal_prio);
+	} else
+		p->prio = p->normal_prio;
+
+	if (task_running(rq, p)) {
+		set_rq_task(rq, p);
+		resched_task(p);
+	} else if (task_queued(p)) {
+		dequeue_task(rq, p, DEQUEUE_SAVE);
+		enqueue_task(rq, p, ENQUEUE_RESTORE);
+		if (p->prio < oldprio || p->rt_priority > oldrtprio)
+			try_preempt(p, rq);
+	}
+}
+
+/*
+ * check the target process has a UID that matches the current process's
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+	const struct cred *cred = current_cred(), *pcred;
+	bool match;
+
+	rcu_read_lock();
+	pcred = __task_cred(p);
+	match = (uid_eq(cred->euid, pcred->euid) ||
+		 uid_eq(cred->euid, pcred->uid));
+	rcu_read_unlock();
+	return match;
+}
+
+static int
+__sched_setscheduler(struct task_struct *p, int policy,
+		     const struct sched_param *param, bool user, bool pi)
+{
+	struct sched_param zero_param = { .sched_priority = 0 };
+	unsigned long flags, rlim_rtprio = 0;
+	int retval, oldpolicy = -1;
+	int reset_on_fork;
+	struct rq *rq;
+
+	/* may grab non-irq protected spin_locks */
+	BUG_ON(in_interrupt());
+
+	if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) {
+		unsigned long lflags;
+
+		if (!lock_task_sighand(p, &lflags))
+			return -ESRCH;
+		rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
+		unlock_task_sighand(p, &lflags);
+		if (rlim_rtprio)
+			goto recheck;
+		/*
+		 * If the caller requested an RT policy without having the
+		 * necessary rights, we downgrade the policy to SCHED_ISO.
+		 * We also set the parameter to zero to pass the checks.
+		 */
+		policy = SCHED_ISO;
+		param = &zero_param;
+	}
+recheck:
+	/* double check policy once rq lock held */
+	if (policy < 0) {
+		reset_on_fork = p->sched_reset_on_fork;
+		policy = oldpolicy = p->policy;
+	} else {
+		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
+		policy &= ~SCHED_RESET_ON_FORK;
+
+		if (!SCHED_RANGE(policy))
+			return -EINVAL;
+	}
+
+	/*
+	 * Valid priorities for SCHED_FIFO and SCHED_RR are
+	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and
+	 * SCHED_BATCH is 0.
+	 */
+	if (param->sched_priority < 0 ||
+	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO - 1) ||
+	    (!p->mm && param->sched_priority > MAX_RT_PRIO - 1))
+		return -EINVAL;
+	if (is_rt_policy(policy) != (param->sched_priority != 0))
+		return -EINVAL;
+
+	/*
+	 * Allow unprivileged RT tasks to decrease priority:
+	 */
+	if (user && !capable(CAP_SYS_NICE)) {
+		if (is_rt_policy(policy)) {
+			unsigned long rlim_rtprio =
+					task_rlimit(p, RLIMIT_RTPRIO);
+
+			/* can't set/change the rt policy */
+			if (policy != p->policy && !rlim_rtprio)
+				return -EPERM;
+
+			/* can't increase priority */
+			if (param->sched_priority > p->rt_priority &&
+			    param->sched_priority > rlim_rtprio)
+				return -EPERM;
+		} else {
+			switch (p->policy) {
+				/*
+				 * Can only downgrade policies but not back to
+				 * SCHED_NORMAL
+				 */
+				case SCHED_ISO:
+					if (policy == SCHED_ISO)
+						goto out;
+					if (policy != SCHED_NORMAL)
+						return -EPERM;
+					break;
+				case SCHED_BATCH:
+					if (policy == SCHED_BATCH)
+						goto out;
+					if (policy != SCHED_IDLEPRIO)
+						return -EPERM;
+					break;
+				case SCHED_IDLEPRIO:
+					if (policy == SCHED_IDLEPRIO)
+						goto out;
+					return -EPERM;
+				default:
+					break;
+			}
+		}
+
+		/* can't change other user's priorities */
+		if (!check_same_owner(p))
+			return -EPERM;
+
+		/* Normal users shall not reset the sched_reset_on_fork flag */
+		if (p->sched_reset_on_fork && !reset_on_fork)
+			return -EPERM;
+	}
+
+	if (user) {
+		retval = security_task_setscheduler(p);
+		if (retval)
+			return retval;
+	}
+
+	/*
+	 * make sure no PI-waiters arrive (or leave) while we are
+	 * changing the priority of the task:
+	 *
+	 * To be able to change p->policy safely, the runqueue lock must be
+	 * held.
+	 */
+	rq = task_rq_lock(p, &flags);
+
+	/*
+	 * Changing the policy of the stop threads is a very bad idea
+	 */
+	if (p == rq->stop) {
+		task_rq_unlock(rq, p, &flags);
+		return -EINVAL;
+	}
+
+	/*
+	 * If not changing anything there's no need to proceed further:
+	 */
+	if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
+			param->sched_priority == p->rt_priority))) {
+		task_rq_unlock(rq, p, &flags);
+		return 0;
+	}
+
+	/* recheck policy now with rq lock held */
+	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
+		policy = oldpolicy = -1;
+		task_rq_unlock(rq, p, &flags);
+		goto recheck;
+	}
+	p->sched_reset_on_fork = reset_on_fork;
+
+	__setscheduler(p, rq, policy, param->sched_priority, pi);
+	task_rq_unlock(rq, p, &flags);
+
+	if (pi)
+		rt_mutex_adjust_pi(p);
+out:
+	return 0;
+}
+
+/**
+ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ *
+ * NOTE that the task may be already dead.
+ */
+int sched_setscheduler(struct task_struct *p, int policy,
+		       const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, true, true);
+}
+
+EXPORT_SYMBOL_GPL(sched_setscheduler);
+
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+	const struct sched_param param = { .sched_priority = attr->sched_priority };
+	int policy = attr->sched_policy;
+
+	return __sched_setscheduler(p, policy, &param, true, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
+/**
+ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
+ * @p: the task in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Just like sched_setscheduler, only don't bother checking if the
+ * current context has permission.  For example, this is needed in
+ * stop_machine(): we create temporary high priority worker threads,
+ * but our caller might not have that capability.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+int sched_setscheduler_nocheck(struct task_struct *p, int policy,
+			       const struct sched_param *param)
+{
+	return __sched_setscheduler(p, policy, param, false, true);
+}
+EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
+
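+/* Common helper for the sched_setscheduler() and sched_setparam() syscalls */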
+static int
+do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+{
+	struct sched_param lparam;
+	struct task_struct *p;
+	int retval;
+
+	if (!param || pid < 0)
+		return -EINVAL;
+	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
+		return -EFAULT;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setscheduler(p, policy, &lparam);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr)
+{
+	u32 size;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+		return -EFAULT;
+
+	/*
+	 * zero the full structure, so that a short copy will be nice.
+	 */
+	memset(attr, 0, sizeof(*attr));
+
+	ret = get_user(size, &uattr->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		goto err_size;
+
+	if (!size)		/* abi compat */
+		size = SCHED_ATTR_SIZE_VER0;
+
+	if (size < SCHED_ATTR_SIZE_VER0)
+		goto err_size;
+
+	/*
+	 * If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we don't know about yet.
+	 */
+	if (size > sizeof(*attr)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uattr + sizeof(*attr);
+		end  = (void __user *)uattr + size;
+
+		for (; addr < end; addr++) {
+			ret = get_user(val, addr);
+			if (ret)
+				return ret;
+			if (val)
+				goto err_size;
+		}
+		size = sizeof(*attr);
+	}
+
+	ret = copy_from_user(attr, uattr, size);
+	if (ret)
+		return -EFAULT;
+
+	/*
+	 * XXX: do we want to be lenient like existing syscalls; or do we want
+	 * to be strict and return an error on out-of-bounds values?
+	 */
+	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+	/* sched/core.c uses zero here but we already know ret is zero */
+	return 0;
+
+err_size:
+	put_user(sizeof(*attr), &uattr->size);
+	return -E2BIG;
+}
+
+/**
+ * sys_sched_setscheduler - set/change the scheduler policy and RT priority
+ * @pid: the pid in question.
+ * @policy: new policy.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
+				       struct sched_param __user *param)
+{
+	/* negative values for policy are not valid */
+	if (policy < 0)
+		return -EINVAL;
+
+	return do_sched_setscheduler(pid, policy, param);
+}
+
+/*
+ * sched_setparam() passes in -1 for its policy, to let the functions
+ * it calls know not to change it.
+ */
+#define SETPARAM_POLICY	-1
+
+/**
+ * sys_sched_setparam - set/change the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the new RT priority.
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
+{
+	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
+}
+
+/**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+			       unsigned int, flags)
+{
+	struct sched_attr attr;
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0 || flags)
+		return -EINVAL;
+
+	retval = sched_copy_attr(uattr, &attr);
+	if (retval)
+		return retval;
+
+	if ((int)attr.sched_policy < 0)
+		return -EINVAL;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setattr(p, &attr);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/**
+ * sys_sched_getscheduler - get the policy (scheduling class) of a thread
+ * @pid: the pid in question.
+ *
+ * Return: On success, the policy of the thread. Otherwise, a negative error
+ * code.
+ */
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
+{
+	struct task_struct *p;
+	int retval = -EINVAL;
+
+	if (pid < 0)
+		goto out_nounlock;
+
+	retval = -ESRCH;
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	if (p) {
+		retval = security_task_getscheduler(p);
+		if (!retval)
+			retval = p->policy;
+	}
+	rcu_read_unlock();
+
+out_nounlock:
+	return retval;
+}
+
+/**
+ * sys_sched_getparam - get the RT priority of a thread
+ * @pid: the pid in question.
+ * @param: structure containing the RT priority.
+ *
+ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
+ * code.
+ */
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
+{
+	struct sched_param lp = { .sched_priority = 0 };
+	struct task_struct *p;
+	int retval = -EINVAL;
+
+	if (!param || pid < 0)
+		goto out_nounlock;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	if (has_rt_policy(p))
+		lp.sched_priority = p->rt_priority;
+	rcu_read_unlock();
+
+	/*
+	 * This one might sleep, we cannot do it with a spinlock held ...
+	 */
+	retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
+
+out_nounlock:
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
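+/*
+ * Copy the kernel's sched_attr out to user-space, honouring the (possibly
+ * smaller) structure size that user-space declared.
+ */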
+static int sched_read_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr,
+			   unsigned int usize)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, usize))
+		return -EFAULT;
+
+	/*
+	 * If we're handed a smaller struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. old
+	 * user-space does not get incomplete information.
+	 */
+	if (usize < sizeof(*attr)) {
+		unsigned char *addr;
+		unsigned char *end;
+
+		addr = (void *)attr + usize;
+		end  = (void *)attr + sizeof(*attr);
+
+		for (; addr < end; addr++) {
+			if (*addr)
+				return -EFBIG;
+		}
+
+		attr->size = usize;
+	}
+
+	ret = copy_to_user(uattr, attr, attr->size);
+	if (ret)
+		return -EFAULT;
+
+	/* sched/core.c uses zero here but we already know ret is zero */
+	return ret;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+		unsigned int, size, unsigned int, flags)
+{
+	struct sched_attr attr = {
+		.size = sizeof(struct sched_attr),
+	};
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0 || size > PAGE_SIZE ||
+	    size < SCHED_ATTR_SIZE_VER0 || flags)
+		return -EINVAL;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	attr.sched_policy = p->policy;
+	if (rt_task(p))
+		attr.sched_priority = p->rt_priority;
+	else
+		attr.sched_nice = task_nice(p);
+
+	rcu_read_unlock();
+
+	retval = sched_read_attr(uattr, &attr, size);
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
+	cpumask_var_t cpus_allowed, new_mask;
+	struct task_struct *p;
+	int retval;
+
+	rcu_read_lock();
+
+	p = find_process_by_pid(pid);
+	if (!p) {
+		rcu_read_unlock();
+		return -ESRCH;
+	}
+
+	/* Prevent p going away */
+	get_task_struct(p);
+	rcu_read_unlock();
+
+	if (p->flags & PF_NO_SETAFFINITY) {
+		retval = -EINVAL;
+		goto out_put_task;
+	}
+	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_put_task;
+	}
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+		retval = -ENOMEM;
+		goto out_free_cpus_allowed;
+	}
+	retval = -EPERM;
+	if (!check_same_owner(p)) {
+		rcu_read_lock();
+		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
+			rcu_read_unlock();
+			goto out_unlock;
+		}
+		rcu_read_unlock();
+	}
+
+	retval = security_task_setscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	cpuset_cpus_allowed(p, cpus_allowed);
+	cpumask_and(new_mask, in_mask, cpus_allowed);
+again:
+	retval = __set_cpus_allowed_ptr(p, new_mask, true);
+
+	if (!retval) {
+		cpuset_cpus_allowed(p, cpus_allowed);
+		if (!cpumask_subset(new_mask, cpus_allowed)) {
+			/*
+			 * We must have raced with a concurrent cpuset
+			 * update. Just reset the cpus_allowed to the
+			 * cpuset's cpus_allowed
+			 */
+			cpumask_copy(new_mask, cpus_allowed);
+			goto again;
+		}
+	}
+out_unlock:
+	free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+	free_cpumask_var(cpus_allowed);
+out_put_task:
+	put_task_struct(p);
+	return retval;
+}
+
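+/*
+ * Copy a user-supplied CPU mask of @len bytes into @new_mask, zero-filling
+ * short masks and truncating oversized ones to sizeof(cpumask_t).
+ */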
+static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
+			     cpumask_t *new_mask)
+{
+	if (len < sizeof(cpumask_t)) {
+		memset(new_mask, 0, sizeof(cpumask_t));
+	} else if (len > sizeof(cpumask_t)) {
+		len = sizeof(cpumask_t);
+	}
+	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
+}
+
+
+/**
+ * sys_sched_setaffinity - set the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to the new cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	cpumask_var_t new_mask;
+	int retval;
+
+	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
+	if (retval == 0)
+		retval = sched_setaffinity(pid, new_mask);
+	free_cpumask_var(new_mask);
+	return retval;
+}
+
+long sched_getaffinity(pid_t pid, cpumask_t *mask)
+{
+	struct task_struct *p;
+	unsigned long flags;
+	int retval;
+
+	get_online_cpus();
+	rcu_read_lock();
+
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+out_unlock:
+	rcu_read_unlock();
+	put_online_cpus();
+
+	return retval;
+}
+
+/**
+ * sys_sched_getaffinity - get the cpu affinity of a process
+ * @pid: pid of the process
+ * @len: length in bytes of the bitmask pointed to by user_mask_ptr
+ * @user_mask_ptr: user-space pointer to hold the current cpu mask
+ *
+ * Return: 0 on success. An error code otherwise.
+ */
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
+{
+	int ret;
+	cpumask_var_t mask;
+
+	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+		return -EINVAL;
+	if (len & (sizeof(unsigned long)-1))
+		return -EINVAL;
+
+	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	ret = sched_getaffinity(pid, mask);
+	if (ret == 0) {
+		size_t retlen = min_t(size_t, len, cpumask_size());
+
+		if (copy_to_user(user_mask_ptr, mask, retlen))
+			ret = -EFAULT;
+		else
+			ret = retlen;
+	}
+	free_cpumask_var(mask);
+
+	return ret;
+}
+
+/**
+ * sys_sched_yield - yield the current processor to other threads.
+ *
+ * This function yields the current CPU to other tasks. It does this by
+ * scheduling away the current task. If it still has the earliest deadline
+ * it will be scheduled again as the next task.
+ *
+ * Return: 0.
+ */
+SYSCALL_DEFINE0(sched_yield)
+{
+	struct task_struct *p;
+	struct rq *rq;
+
+	p = current;
+	rq = this_rq_lock();
+	time_slice_expired(p, rq);
+	schedstat_inc(task_rq(p), yld_count);
+
+	/*
+	 * Since we are going to call schedule() anyway, there's
+	 * no need to preempt or enable interrupts:
+	 */
+	__release(rq->lock);
+	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+	do_raw_spin_unlock(&rq->lock);
+	sched_preempt_enable_no_resched();
+
+	schedule();
+
+	return 0;
+}
+
+int __sched _cond_resched(void)
+{
+	if (should_resched(0)) {
+		preempt_schedule_common();
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(_cond_resched);
+
+/*
+ * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
+ * call schedule, and on return reacquire the lock.
+ *
+ * This works OK both with and without CONFIG_PREEMPT.  We do strange low-level
+ * operations here to prevent schedule() from being called twice (once via
+ * spin_unlock(), once by hand).
+ */
+int __cond_resched_lock(spinlock_t *lock)
+{
+	int resched = should_resched(PREEMPT_LOCK_OFFSET);
+	int ret = 0;
+
+	lockdep_assert_held(lock);
+
+	if (spin_needbreak(lock) || resched) {
+		spin_unlock(lock);
+		if (resched)
+			preempt_schedule_common();
+		else
+			cpu_relax();
+		ret = 1;
+		spin_lock(lock);
+	}
+	return ret;
+}
+EXPORT_SYMBOL(__cond_resched_lock);
+
+int __sched __cond_resched_softirq(void)
+{
+	BUG_ON(!in_softirq());
+
+	if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
+		local_bh_enable();
+		preempt_schedule_common();
+		local_bh_disable();
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(__cond_resched_softirq);
+
+/**
+ * yield - yield the current processor to other threads.
+ *
+ * Do not ever use this function, there's a 99% chance you're doing it wrong.
+ *
+ * The scheduler is at all times free to pick the calling task as the most
+ * eligible task to run, if removing the yield() call from your code breaks
+ * it, it's already broken.
+ *
+ * Typical broken usage is:
+ *
+ * while (!event)
+ * 	yield();
+ *
+ * where one assumes that yield() will let 'the other' process run that will
+ * make event true. If the current task is a SCHED_FIFO task that will never
+ * happen. Never use yield() as a progress guarantee!!
+ *
+ * If you want to use yield() to wait for something, use wait_event().
+ * If you want to use yield() to be 'nice' for others, use cond_resched().
+ * If you still want to use yield(), do not!
+ */
+void __sched yield(void)
+{
+	set_current_state(TASK_RUNNING);
+	sys_sched_yield();
+}
+EXPORT_SYMBOL(yield);
+
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Return:
+ *	true (>0) if we indeed boosted the target task.
+ *	false (0) if we failed to boost the target.
+ *	-ESRCH if there's no task to yield to.
+ */
+int __sched yield_to(struct task_struct *p, bool preempt)
+{
+	struct task_struct *rq_p;
+	struct rq *rq, *p_rq;
+	unsigned long flags;
+	int yielded = 0;
+
+	local_irq_save(flags);
+	rq = this_rq();
+
+again:
+	p_rq = task_rq(p);
+	/*
+	 * If we're the only runnable task on the rq and target rq also
+	 * has only one task, there's absolutely no point in yielding.
+	 */
+	if (task_running(p_rq, p) || p->state) {
+		yielded = -ESRCH;
+		goto out_irq;
+	}
+
+	double_rq_lock(rq, p_rq);
+	if (unlikely(task_rq(p) != p_rq)) {
+		double_rq_unlock(rq, p_rq);
+		goto again;
+	}
+
+	yielded = 1;
+	rq_p = rq->curr;
+	if (p->deadline > rq_p->deadline)
+		p->deadline = rq_p->deadline;
+	p->time_slice += rq_p->time_slice;
+	if (p->time_slice > timeslice())
+		p->time_slice = timeslice();
+	time_slice_expired(rq_p, rq);
+	if (preempt && rq != p_rq)
+		resched_task(p_rq->curr);
+	double_rq_unlock(rq, p_rq);
+out_irq:
+	local_irq_restore(flags);
+
+	if (yielded > 0)
+		schedule();
+	return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
+/*
+ * This task is about to go to sleep on IO.  Increment rq->nr_iowait so
+ * that process accounting knows that this is a task in IO wait state.
+ *
+ * But don't do that if it is a deliberate, throttling IO wait (this task
+ * has set its backing_dev_info: the queue against which it should throttle)
+ */
+
+long __sched io_schedule_timeout(long timeout)
+{
+	int old_iowait = current->in_iowait;
+	struct rq *rq;
+	long ret;
+
+	current->in_iowait = 1;
+	blk_schedule_flush_plug(current);
+
+	delayacct_blkio_start();
+	rq = raw_rq();
+	atomic_inc(&rq->nr_iowait);
+	ret = schedule_timeout(timeout);
+	current->in_iowait = old_iowait;
+	atomic_dec(&rq->nr_iowait);
+	delayacct_blkio_end();
+
+	return ret;
+}
+EXPORT_SYMBOL(io_schedule_timeout);
+
+/**
+ * sys_sched_get_priority_max - return maximum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the maximum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = MAX_USER_RT_PRIO-1;
+		break;
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+	case SCHED_ISO:
+	case SCHED_IDLEPRIO:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_get_priority_min - return minimum RT priority.
+ * @policy: scheduling class.
+ *
+ * Return: On success, this syscall returns the minimum
+ * rt_priority that can be used by a given scheduling class.
+ * On failure, a negative error code is returned.
+ */
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
+{
+	int ret = -EINVAL;
+
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		ret = 1;
+		break;
+	case SCHED_NORMAL:
+	case SCHED_BATCH:
+	case SCHED_ISO:
+	case SCHED_IDLEPRIO:
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+/**
+ * sys_sched_rr_get_interval - return the default timeslice of a process.
+ * @pid: pid of the process.
+ * @interval: userspace pointer to the timeslice value.
+ *
+ * Return: On success, 0 and the timeslice is in @interval. Otherwise,
+ * an error code.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+		struct timespec __user *, interval)
+{
+	struct task_struct *p;
+	unsigned int time_slice;
+	unsigned long flags;
+	struct timespec t;
+	struct rq *rq;
+	int retval;
+
+	if (pid < 0)
+		return -EINVAL;
+
+	retval = -ESRCH;
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	rq = task_rq_lock(p, &flags);
+	time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p));
+	task_rq_unlock(rq, p, &flags);
+
+	rcu_read_unlock();
+	t = ns_to_timespec(time_slice);
+	retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
+static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
+
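+/* Print one line of task state information plus a stack trace for @p */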
+void sched_show_task(struct task_struct *p)
+{
+	unsigned long free = 0;
+	int ppid;
+	unsigned long state = p->state;
+
+	if (state)
+		state = __ffs(state) + 1;
+	printk(KERN_INFO "%-15.15s %c", p->comm,
+		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
+#if BITS_PER_LONG == 32
+	if (state == TASK_RUNNING)
+		printk(KERN_CONT " running  ");
+	else
+		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
+#else
+	if (state == TASK_RUNNING)
+		printk(KERN_CONT "  running task    ");
+	else
+		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
+#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+	free = stack_not_used(p);
+#endif
+	ppid = 0;
+	rcu_read_lock();
+	if (pid_alive(p))
+		ppid = task_pid_nr(rcu_dereference(p->real_parent));
+	rcu_read_unlock();
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), ppid,
+		(unsigned long)task_thread_info(p)->flags);
+
+	print_worker_info(KERN_INFO, p);
+	show_stack(p, NULL);
+}
+
+void show_state_filter(unsigned long state_filter)
+{
+	struct task_struct *g, *p;
+
+#if BITS_PER_LONG == 32
+	printk(KERN_INFO
+		"  task                PC stack   pid father\n");
+#else
+	printk(KERN_INFO
+		"  task                        PC stack   pid father\n");
+#endif
+	rcu_read_lock();
+	for_each_process_thread(g, p) {
+		/*
+		 * reset the NMI-timeout, listing all tasks on a slow
+		 * console might take a lot of time:
+		 * Also, reset softlockup watchdogs on all CPUs, because
+		 * another CPU might be blocked waiting for us to process
+		 * an IPI.
+		 */
+		touch_nmi_watchdog();
+		touch_all_softlockup_watchdogs();
+		if (!state_filter || (p->state & state_filter))
+			sched_show_task(p);
+	}
+
+	rcu_read_unlock();
+	/*
+	 * Only show locks if all tasks are dumped:
+	 */
+	if (!state_filter)
+		debug_show_all_locks();
+}
+
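+/* Dump the state of the task currently running on @cpu. */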
+void dump_cpu_task(int cpu)
+{
+	pr_info("Task dump for CPU %d:\n", cpu);
+	sched_show_task(cpu_curr(cpu));
+}
+
+#ifdef CONFIG_SMP
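+/* Copy the new affinity mask into the task and cache its weight. */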
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+{
+	cpumask_copy(&p->cpus_allowed, new_mask);
+	p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+static void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	struct rq *rq = task_rq(p);
+
+	lockdep_assert_held(&p->pi_lock);
+
+	cpumask_copy(tsk_cpus_allowed(p), new_mask);
+
+	if (task_queued(p)) {
+		/*
+		 * Because __kthread_bind() calls this on blocked tasks without
+		 * holding rq->lock.
+		 */
+		lockdep_assert_held(&rq->lock);
+	}
+}
+
+/*
+ * Calling do_set_cpus_allowed() from outside the scheduler code may leave the
+ * task unable to run on its current CPU, so we reschedule it here.
+ */
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	__do_set_cpus_allowed(p, new_mask);
+	if (needs_other_cpu(p, task_cpu(p))) {
+		struct rq *rq;
+
+		set_task_cpu(p, valid_task_cpu(p));
+		rq = __task_rq_lock(p);
+		resched_task(p);
+		__task_rq_unlock(rq);
+	}
+}
+
+/*
+ * For internal scheduler calls to do_set_cpus_allowed which will resched
+ * themselves if needed.
+ */
+static void _do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	__do_set_cpus_allowed(p, new_mask);
+	/* __set_cpus_allowed_ptr will handle the reschedule in this variant */
+	if (needs_other_cpu(p, task_cpu(p)))
+		set_task_cpu(p, valid_task_cpu(p));
+}
+#endif
+
+/**
+ * init_idle - set up an idle thread for a given CPU
+ * @idle: task in question
+ * @cpu: cpu the idle task belongs to
+ *
+ * NOTE: this function does not set the idle thread's NEED_RESCHED
+ * flag, to make booting more robust.
+ */
+void init_idle(struct task_struct *idle, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&idle->pi_lock, flags);
+	raw_spin_lock(&rq->lock);
+	idle->last_ran = rq->niffies;
+	time_slice_expired(idle, rq);
+	idle->state = TASK_RUNNING;
+	/* Setting prio to illegal value shouldn't matter when never queued */
+	idle->prio = PRIO_LIMIT;
+
+	kasan_unpoison_task_stack(idle);
+
+#ifdef CONFIG_SMP
+	/*
+	 * It's possible that init_idle() gets called multiple times on a task;
+	 * in that case do_set_cpus_allowed() will not do the right thing.
+	 *
+	 * And since this is boot we can forgo the serialisation.
+	 */
+	set_cpus_allowed_common(idle, cpumask_of(cpu));
+#ifdef CONFIG_SMT_NICE
+	idle->smt_bias = 0;
+#endif
+#endif
+	set_rq_task(rq, idle);
+
+	/* Silence PROVE_RCU */
+	rcu_read_lock();
+	set_task_cpu(idle, cpu);
+	rcu_read_unlock();
+
+	rq->curr = rq->idle = idle;
+	idle->on_rq = TASK_ON_RQ_QUEUED;
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
+
+	/* Set the preempt count _outside_ the spinlocks! */
+	init_idle_preempt_count(idle, cpu);
+
+	ftrace_graph_init_idle_task(idle, cpu);
+	vtime_init_idle(idle, cpu);
+#ifdef CONFIG_SMP
+	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
+}
+
+int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur,
+			      const struct cpumask __maybe_unused *trial)
+{
+	return 1;
+}
+
+int task_can_attach(struct task_struct *p,
+		    const struct cpumask *cs_cpus_allowed)
+{
+	int ret = 0;
+
+	/*
+	 * Kthreads which disallow setaffinity shouldn't be moved
+	 * to a new cpuset; we don't want to change their cpu
+	 * affinity and isolating such threads by their set of
+	 * allowed nodes is unnecessary.  Thus, cpusets are not
+	 * applicable for such threads.  This prevents checking for
+	 * success of set_cpus_allowed_ptr() on all attached tasks
+	 * before cpus_allowed may be changed.
+	 */
+	if (p->flags & PF_NO_SETAFFINITY)
+		ret = -EINVAL;
+
+	return ret;
+}
+
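+/* Ask whatever is currently running on @cpu to reschedule. */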
+void resched_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	rq_lock_irqsave(rq, &flags);
+	resched_task(cpu_curr(cpu));
+	rq_unlock_irqrestore(rq, &flags);
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_NO_HZ_COMMON
+void nohz_balance_enter_idle(int cpu)
+{
+}
+
+void select_nohz_load_balancer(int stop_tick)
+{
+}
+
+void set_cpu_sd_state_idle(void) {}
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+#endif /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+
+/*
+ * In the semi idle case, use the nearest busy cpu for migrating timers
+ * from an idle cpu.  This is good for power-savings.
+ *
+ * We don't do similar optimization for completely idle system, as
+ * selecting an idle cpu will add more delays to the timers than intended
+ * (as that cpu's timer base may not be up to date wrt jiffies etc).
+ */
+int get_nohz_timer_target(void)
+{
+	int i, cpu = smp_processor_id();
+	struct sched_domain *sd;
+
+	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
+		return cpu;
+
+	rcu_read_lock();
+	for_each_domain(cpu, sd) {
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (cpu == i)
+				continue;
+
+			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+				cpu = i;
+				goto unlock;
+			}
+		}
+	}
+
+	if (!is_housekeeping_cpu(cpu))
+		cpu = housekeeping_any_cpu();
+unlock:
+	rcu_read_unlock();
+	return cpu;
+}
+
+/*
+ * When add_timer_on() enqueues a timer into the timer wheel of an
+ * idle CPU then this timer might expire before the next timer event
+ * which is scheduled to wake up that CPU. In case of a completely
+ * idle system the next event might even be infinite time into the
+ * future. wake_up_idle_cpu() ensures that the CPU is woken up and
+ * leaves the inner idle loop so the newly added timer is taken into
+ * account when the CPU goes back to idle and evaluates the timer
+ * wheel for the next timer event.
+ */
+void wake_up_idle_cpu(int cpu)
+{
+	if (cpu == smp_processor_id())
+		return;
+
+	if (set_nr_and_not_polling(cpu_rq(cpu)->idle))
+		smp_sched_reschedule(cpu);
+	else
+		trace_sched_wake_idle_without_ipi(cpu);
+}
+
+void wake_up_nohz_cpu(int cpu)
+{
+	wake_up_idle_cpu(cpu);
+}
+#endif /* CONFIG_NO_HZ_COMMON */
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+				  const struct cpumask *new_mask, bool check)
+{
+	const struct cpumask *cpu_valid_mask = cpu_active_mask;
+	bool queued = false, running_wrong = false, kthread;
+	struct cpumask old_mask;
+	unsigned long flags;
+	struct rq *rq;
+	int ret = 0;
+
+	rq = task_rq_lock(p, &flags);
+
+	kthread = !!(p->flags & PF_KTHREAD);
+	if (kthread) {
+		/*
+		 * Kernel threads are allowed on online && !active CPUs
+		 */
+		cpu_valid_mask = cpu_online_mask;
+	}
+
+	/*
+	 * Must re-check here, to close a race against __kthread_bind(),
+	 * sched_setaffinity() is not guaranteed to observe the flag.
+	 */
+	if (check && (p->flags & PF_NO_SETAFFINITY)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	cpumask_copy(&old_mask, tsk_cpus_allowed(p));
+	if (cpumask_equal(&old_mask, new_mask))
+		goto out;
+
+	if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	queued = task_queued(p);
+
+	_do_set_cpus_allowed(p, new_mask);
+
+	if (kthread) {
+		/*
+		 * For kernel threads that do indeed end up on online &&
+		 * !active we want to ensure they are strict per-cpu threads.
+		 */
+		WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
+			!cpumask_intersects(new_mask, cpu_active_mask) &&
+			tsk_nr_cpus_allowed(p) != 1);
+	}
+
+	/* Can the task run on the task's current CPU? If so, we're done */
+	if (cpumask_test_cpu(task_cpu(p), new_mask))
+		goto out;
+
+	if (task_running(rq, p)) {
+		/* Task is running on the wrong cpu now, reschedule it. */
+		if (rq == this_rq()) {
+			set_tsk_need_resched(p);
+			running_wrong = kthread;
+		} else
+			resched_task(p);
+	} else {
+		int dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+		struct rq *dest_rq = cpu_rq(dest_cpu);
+
+		/* Switch rq locks here */
+		lock_second_rq(rq, dest_rq);
+		set_task_cpu(p, dest_cpu);
+		rq_unlock(rq);
+
+		rq = dest_rq;
+	}
+out:
+	if (queued && !cpumask_subset(new_mask, &old_mask))
+		try_preempt(p, rq);
+	if (running_wrong)
+		preempt_disable();
+	task_rq_unlock(rq, p, &flags);
+
+	if (running_wrong) {
+		__schedule(true);
+		preempt_enable();
+	}
+
+	return ret;
+}
+
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+	return __set_cpus_allowed_ptr(p, new_mask, false);
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Run through task list and find tasks affined to the dead cpu, then remove
+ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold
+ * cpu 0 and src_cpu's runqueue locks.
+ */
+static void bind_zero(int src_cpu)
+{
+	struct task_struct *p, *t;
+	int bound = 0;
+
+	if (src_cpu == 0)
+		return;
+
+	do_each_thread(t, p) {
+		if (cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) {
+			bool local = (task_cpu(p) == src_cpu);
+
+			/* task_running is the cpu stopper thread */
+			if (local && task_running(task_rq(p), p))
+				continue;
+			atomic_clear_cpu(src_cpu, tsk_cpus_allowed(p));
+			atomic_set_cpu(0, tsk_cpus_allowed(p));
+			p->zerobound = true;
+			bound++;
+			if (local)
+				set_task_cpu(p, 0);
+		}
+	} while_each_thread(t, p);
+
+	if (bound) {
+		printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n",
+		       bound, src_cpu);
+	}
+}
+
+/*
+ * Find processes with the zerobound flag and reenable their affinity for the
+ * CPU coming alive.
+ */
+static void unbind_zero(int src_cpu)
+{
+	int unbound = 0, zerobound = 0;
+	struct task_struct *p, *t;
+
+	if (src_cpu == 0)
+		return;
+
+	do_each_thread(t, p) {
+		if (!p->mm)
+			p->zerobound = false;
+		if (p->zerobound) {
+			unbound++;
+			cpumask_set_cpu(src_cpu, tsk_cpus_allowed(p));
+			/*
+			 * Once every CPU affinity has been re-enabled, remove
+			 * the zerobound flag.
+			 */
+			if (cpumask_subset(cpu_possible_mask, tsk_cpus_allowed(p))) {
+				p->zerobound = false;
+				zerobound++;
+			}
+		}
+	} while_each_thread(t, p);
+
+	if (unbound) {
+		printk(KERN_INFO "Added affinity for %d processes to cpu %d\n",
+		       unbound, src_cpu);
+	}
+	if (zerobound) {
+		printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n",
+		       zerobound);
+	}
+}
+
+/*
+ * Ensures that the idle task is using init_mm right before its cpu goes
+ * offline.
+ */
+void idle_task_exit(void)
+{
+	struct mm_struct *mm = current->active_mm;
+
+	BUG_ON(cpu_online(smp_processor_id()));
+
+	if (mm != &init_mm) {
+		switch_mm_irqs_off(mm, &init_mm, current);
+		finish_arch_post_lock_switch();
+	}
+	mmdrop(mm);
+}
+#else /* CONFIG_HOTPLUG_CPU */
+static void unbind_zero(int src_cpu) {}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+	struct sched_param stop_param = { .sched_priority = STOP_PRIO };
+	struct sched_param start_param = { .sched_priority = 0 };
+	struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+	if (stop) {
+		/*
+		 * Make it appear like a SCHED_FIFO task, it's something
+		 * userspace knows about and won't get confused about.
+		 *
+		 * Also, it will make PI more or less work without too
+		 * much confusion -- but then, stop work should not
+		 * rely on PI working anyway.
+		 */
+		sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param);
+	}
+
+	cpu_rq(cpu)->stop = stop;
+
+	if (old_stop) {
+		/*
+		 * Reset it back to a normal scheduling policy so that
+		 * it can die in pieces.
+		 */
+		sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param);
+	}
+}
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+
+static struct ctl_table sd_ctl_dir[] = {
+	{
+		.procname	= "sched_domain",
+		.mode		= 0555,
+	},
+	{}
+};
+
+static struct ctl_table sd_ctl_root[] = {
+	{
+		.procname	= "kernel",
+		.mode		= 0555,
+		.child		= sd_ctl_dir,
+	},
+	{}
+};
+
+static struct ctl_table *sd_alloc_ctl_entry(int n)
+{
+	struct ctl_table *entry =
+		kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
+
+	return entry;
+}
+
+static void sd_free_ctl_entry(struct ctl_table **tablep)
+{
+	struct ctl_table *entry;
+
+	/*
+	 * In the intermediate directories, both the child directory and
+	 * procname are dynamically allocated and could fail but the mode
+	 * will always be set. In the lowest directory the names are
+	 * static strings and all have proc handlers.
+	 */
+	for (entry = *tablep; entry->mode; entry++) {
+		if (entry->child)
+			sd_free_ctl_entry(&entry->child);
+		if (entry->proc_handler == NULL)
+			kfree(entry->procname);
+	}
+
+	kfree(*tablep);
+	*tablep = NULL;
+}
+
+#define CPU_LOAD_IDX_MAX 5
+static int min_load_idx = 0;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
+
+static void
+set_table_entry(struct ctl_table *entry,
+		const char *procname, void *data, int maxlen,
+		umode_t mode, proc_handler *proc_handler,
+		bool load_idx)
+{
+	entry->procname = procname;
+	entry->data = data;
+	entry->maxlen = maxlen;
+	entry->mode = mode;
+	entry->proc_handler = proc_handler;
+
+	if (load_idx) {
+		entry->extra1 = &min_load_idx;
+		entry->extra2 = &max_load_idx;
+	}
+}
+
+static struct ctl_table *
+sd_alloc_ctl_domain_table(struct sched_domain *sd)
+{
+	struct ctl_table *table = sd_alloc_ctl_entry(14);
+
+	if (table == NULL)
+		return NULL;
+
+	set_table_entry(&table[0], "min_interval", &sd->min_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax, false);
+	set_table_entry(&table[1], "max_interval", &sd->max_interval,
+		sizeof(long), 0644, proc_doulongvec_minmax, false);
+	set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
+		sizeof(int), 0644, proc_dointvec_minmax, true);
+	set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax, true);
+	set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
+		sizeof(int), 0644, proc_dointvec_minmax, true);
+	set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
+		sizeof(int), 0644, proc_dointvec_minmax, true);
+	set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
+		sizeof(int), 0644, proc_dointvec_minmax, true);
+	set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
+		sizeof(int), 0644, proc_dointvec_minmax, false);
+	set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
+		sizeof(int), 0644, proc_dointvec_minmax, false);
+	set_table_entry(&table[9], "cache_nice_tries",
+		&sd->cache_nice_tries,
+		sizeof(int), 0644, proc_dointvec_minmax, false);
+	set_table_entry(&table[10], "flags", &sd->flags,
+		sizeof(int), 0644, proc_dointvec_minmax, false);
+	set_table_entry(&table[11], "max_newidle_lb_cost",
+		&sd->max_newidle_lb_cost,
+		sizeof(long), 0644, proc_doulongvec_minmax, false);
+	set_table_entry(&table[12], "name", sd->name,
+		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
+	/* &table[13] is terminator */
+
+	return table;
+}
+
+static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
+{
+	struct ctl_table *entry, *table;
+	struct sched_domain *sd;
+	int domain_num = 0, i;
+	char buf[32];
+
+	for_each_domain(cpu, sd)
+		domain_num++;
+	entry = table = sd_alloc_ctl_entry(domain_num + 1);
+	if (table == NULL)
+		return NULL;
+
+	i = 0;
+	for_each_domain(cpu, sd) {
+		snprintf(buf, 32, "domain%d", i);
+		entry->procname = kstrdup(buf, GFP_KERNEL);
+		entry->mode = 0555;
+		entry->child = sd_alloc_ctl_domain_table(sd);
+		entry++;
+		i++;
+	}
+	return table;
+}
+
+static struct ctl_table_header *sd_sysctl_header;
+void register_sched_domain_sysctl(void)
+{
+	int i, cpu_num = num_possible_cpus();
+	struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
+	char buf[32];
+
+	WARN_ON(sd_ctl_dir[0].child);
+	sd_ctl_dir[0].child = entry;
+
+	if (entry == NULL)
+		return;
+
+	for_each_possible_cpu(i) {
+		snprintf(buf, 32, "cpu%d", i);
+		entry->procname = kstrdup(buf, GFP_KERNEL);
+		entry->mode = 0555;
+		entry->child = sd_alloc_ctl_cpu_table(i);
+		entry++;
+	}
+
+	WARN_ON(sd_sysctl_header);
+	sd_sysctl_header = register_sysctl_table(sd_ctl_root);
+}
+
+/* may be called multiple times per register */
+void unregister_sched_domain_sysctl(void)
+{
+	unregister_sysctl_table(sd_sysctl_header);
+	sd_sysctl_header = NULL;
+	if (sd_ctl_dir[0].child)
+		sd_free_ctl_entry(&sd_ctl_dir[0].child);
+}
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
+
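+/* Mark a runqueue as online within its root domain. */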
+static void set_rq_online(struct rq *rq)
+{
+	if (!rq->online) {
+		cpumask_set_cpu(cpu_of(rq), rq->rd->online);
+		rq->online = true;
+	}
+}
+
+static void set_rq_offline(struct rq *rq)
+{
+	if (rq->online) {
+		int cpu = cpu_of(rq);
+
+		cpumask_clear_cpu(cpu, rq->rd->online);
+		rq->online = false;
+		clear_cpuidle_map(cpu);
+	}
+}
+
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+
+#ifdef CONFIG_SCHED_DEBUG
+
+static __read_mostly int sched_debug_enabled;
+
+static int __init sched_debug_setup(char *str)
+{
+	sched_debug_enabled = 1;
+
+	return 0;
+}
+early_param("sched_debug", sched_debug_setup);
+
+static inline bool sched_debug(void)
+{
+	return sched_debug_enabled;
+}
+
+static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
+				  struct cpumask *groupmask)
+{
+	cpumask_clear(groupmask);
+
+	printk(KERN_DEBUG "%*s domain %d: ", level, "", level);
+
+	if (!(sd->flags & SD_LOAD_BALANCE)) {
+		printk("does not load-balance\n");
+		if (sd->parent)
+			printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
+					" has parent");
+		return -1;
+	}
+
+	printk(KERN_CONT "span %*pbl level %s\n",
+	       cpumask_pr_args(sched_domain_span(sd)), sd->name);
+
+	if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+		printk(KERN_ERR "ERROR: domain->span does not contain "
+				"CPU%d\n", cpu);
+	}
+
+	printk(KERN_CONT "\n");
+
+	if (!cpumask_equal(sched_domain_span(sd), groupmask))
+		printk(KERN_ERR "ERROR: groups don't span domain->span\n");
+
+	if (sd->parent &&
+	    !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
+		printk(KERN_ERR "ERROR: parent span is not a superset "
+			"of domain->span\n");
+	return 0;
+}
+
+static void sched_domain_debug(struct sched_domain *sd, int cpu)
+{
+	int level = 0;
+
+	if (!sched_debug_enabled)
+		return;
+
+	if (!sd) {
+		printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
+		return;
+	}
+
+	printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
+
+	for (;;) {
+		if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
+			break;
+		level++;
+		sd = sd->parent;
+		if (!sd)
+			break;
+	}
+}
+#else /* !CONFIG_SCHED_DEBUG */
+# define sched_domain_debug(sd, cpu) do { } while (0)
+static inline bool sched_debug(void)
+{
+	return false;
+}
+#endif /* CONFIG_SCHED_DEBUG */
+
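+/* A degenerate domain spans a single CPU or sets no flags that make it useful. */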
+static int sd_degenerate(struct sched_domain *sd)
+{
+	if (cpumask_weight(sched_domain_span(sd)) == 1)
+		return 1;
+
+	/* Following flags don't use groups */
+	if (sd->flags & (SD_WAKE_AFFINE))
+		return 0;
+
+	return 1;
+}
+
+static int
+sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
+{
+	unsigned long cflags = sd->flags, pflags = parent->flags;
+
+	if (sd_degenerate(parent))
+		return 1;
+
+	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
+		return 0;
+
+	if (~cflags & pflags)
+		return 0;
+
+	return 1;
+}
+
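+/* RCU callback that frees a root_domain once the last reference is gone. */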
+static void free_rootdomain(struct rcu_head *rcu)
+{
+	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
+
+	cpupri_cleanup(&rd->cpupri);
+	free_cpumask_var(rd->rto_mask);
+	free_cpumask_var(rd->online);
+	free_cpumask_var(rd->span);
+	kfree(rd);
+}
+
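+/*
+ * Attach @rq to root domain @rd, releasing (and possibly freeing) its old
+ * root domain.
+ */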
+static void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+	struct root_domain *old_rd = NULL;
+	unsigned long flags;
+
+	rq_lock_irqsave(rq, &flags);
+
+	if (rq->rd) {
+		old_rd = rq->rd;
+
+		if (cpumask_test_cpu(rq->cpu, old_rd->online))
+			set_rq_offline(rq);
+
+		cpumask_clear_cpu(rq->cpu, old_rd->span);
+
+		/*
+		 * If we dont want to free the old_rd yet then
+		 * set old_rd to NULL to skip the freeing later
+		 * in this function:
+		 */
+		if (!atomic_dec_and_test(&old_rd->refcount))
+			old_rd = NULL;
+	}
+
+	atomic_inc(&rd->refcount);
+	rq->rd = rd;
+
+	cpumask_set_cpu(rq->cpu, rd->span);
+	if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
+		set_rq_online(rq);
+
+	rq_unlock_irqrestore(rq, &flags);
+
+	if (old_rd)
+		call_rcu_sched(&old_rd->rcu, free_rootdomain);
+}
+
+static int init_rootdomain(struct root_domain *rd)
+{
+	memset(rd, 0, sizeof(*rd));
+
+	if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
+		goto out;
+	if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
+		goto free_span;
+	if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+		goto free_online;
+
+	if (cpupri_init(&rd->cpupri) != 0)
+		goto free_rto_mask;
+	return 0;
+
+free_rto_mask:
+	free_cpumask_var(rd->rto_mask);
+free_online:
+	free_cpumask_var(rd->online);
+free_span:
+	free_cpumask_var(rd->span);
+out:
+	return -ENOMEM;
+}
+
+static void init_defrootdomain(void)
+{
+	init_rootdomain(&def_root_domain);
+
+	atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+	struct root_domain *rd;
+
+	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+	if (!rd)
+		return NULL;
+
+	if (init_rootdomain(rd) != 0) {
+		kfree(rd);
+		return NULL;
+	}
+
+	return rd;
+}
+
+static void free_sched_domain(struct rcu_head *rcu)
+{
+	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+
+	kfree(sd);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+{
+	call_rcu(&sd->rcu, free_sched_domain);
+}
+
+static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+{
+	for (; sd; sd = sd->parent)
+		destroy_sched_domain(sd, cpu);
+}
+
+/*
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
+ * hold the hotplug lock.
+ */
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct sched_domain *tmp;
+
+	/* Remove the sched domains which do not contribute to scheduling. */
+	for (tmp = sd; tmp; ) {
+		struct sched_domain *parent = tmp->parent;
+		if (!parent)
+			break;
+
+		if (sd_parent_degenerate(tmp, parent)) {
+			tmp->parent = parent->parent;
+			if (parent->parent)
+				parent->parent->child = tmp;
+			/*
+			 * Transfer SD_PREFER_SIBLING down in case of a
+			 * degenerate parent; the spans match for this
+			 * so the property transfers.
+			 */
+			if (parent->flags & SD_PREFER_SIBLING)
+				tmp->flags |= SD_PREFER_SIBLING;
+			destroy_sched_domain(parent, cpu);
+		} else
+			tmp = tmp->parent;
+	}
+
+	if (sd && sd_degenerate(sd)) {
+		tmp = sd;
+		sd = sd->parent;
+		destroy_sched_domain(tmp, cpu);
+		if (sd)
+			sd->child = NULL;
+	}
+
+	sched_domain_debug(sd, cpu);
+
+	rq_attach_root(rq, rd);
+	tmp = rq->sd;
+	rcu_assign_pointer(rq->sd, sd);
+	destroy_sched_domains(tmp, cpu);
+}
+
+/* Setup the mask of cpus configured for isolated domains */
+static int __init isolated_cpu_setup(char *str)
+{
+	int ret;
+
+	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	ret = cpulist_parse(str, cpu_isolated_map);
+	if (ret) {
+		pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
+		return 0;
+	}
+	return 1;
+}
+
+__setup("isolcpus=", isolated_cpu_setup);
+
+struct s_data {
+	struct sched_domain ** __percpu sd;
+	struct root_domain	*rd;
+};
+
+enum s_alloc {
+	sa_rootdomain,
+	sa_sd,
+	sa_sd_storage,
+	sa_none,
+};
+
+/*
+ * Initializers for schedule domains
+ * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
+ */
+
+static int default_relax_domain_level = -1;
+int sched_domain_level_max;
+
+static int __init setup_relax_domain_level(char *str)
+{
+	if (kstrtoint(str, 0, &default_relax_domain_level))
+		pr_warn("Unable to set relax_domain_level\n");
+
+	return 1;
+}
+__setup("relax_domain_level=", setup_relax_domain_level);
+
+static void set_domain_attribute(struct sched_domain *sd,
+				 struct sched_domain_attr *attr)
+{
+	int request;
+
+	if (!attr || attr->relax_domain_level < 0) {
+		if (default_relax_domain_level < 0)
+			return;
+		else
+			request = default_relax_domain_level;
+	} else
+		request = attr->relax_domain_level;
+	if (request < sd->level) {
+		/* turn off idle balance on this domain */
+		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+	} else {
+		/* turn on idle balance on this domain */
+		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
+	}
+}
+
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
+static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
+				 const struct cpumask *cpu_map)
+{
+	switch (what) {
+	case sa_rootdomain:
+		if (!atomic_read(&d->rd->refcount))
+			free_rootdomain(&d->rd->rcu); /* fall through */
+	case sa_sd:
+		free_percpu(d->sd); /* fall through */
+	case sa_sd_storage:
+		__sdt_free(cpu_map); /* fall through */
+	case sa_none:
+		break;
+	}
+}
+
+static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
+						   const struct cpumask *cpu_map)
+{
+	memset(d, 0, sizeof(*d));
+
+	if (__sdt_alloc(cpu_map))
+		return sa_sd_storage;
+	d->sd = alloc_percpu(struct sched_domain *);
+	if (!d->sd)
+		return sa_sd_storage;
+	d->rd = alloc_rootdomain();
+	if (!d->rd)
+		return sa_sd;
+	return sa_rootdomain;
+}
+
+/*
+ * NULL the sd_data elements we've used to build the sched_domain
+ * structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
+{
+	struct sd_data *sdd = sd->private;
+
+	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+	*per_cpu_ptr(sdd->sd, cpu) = NULL;
+}
+
+#ifdef CONFIG_NUMA
+static int sched_domains_numa_levels;
+static int *sched_domains_numa_distance;
+static struct cpumask ***sched_domains_numa_masks;
+static int sched_domains_curr_level;
+#endif
+
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUCAPACITY   - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA                - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING        - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS		\
+	(SD_SHARE_CPUCAPACITY |		\
+	 SD_SHARE_PKG_RESOURCES |	\
+	 SD_NUMA |			\
+	 SD_ASYM_PACKING |		\
+	 SD_SHARE_POWERDOMAIN)
+
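+/*
+ * Initialise the sched_domain for one topology level on @cpu, converting the
+ * topology flags into balancing behaviour.
+ */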
+static struct sched_domain *
+sd_init(struct sched_domain_topology_level *tl, int cpu)
+{
+	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
+	int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+	/*
+	 * Ugly hack to pass state to sd_numa_mask()...
+	 */
+	sched_domains_curr_level = tl->numa_level;
+#endif
+
+	sd_weight = cpumask_weight(tl->mask(cpu));
+
+	if (tl->sd_flags)
+		sd_flags = (*tl->sd_flags)();
+	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+			"wrong sd_flags in topology description\n"))
+		sd_flags &= ~TOPOLOGY_SD_FLAGS;
+
+	*sd = (struct sched_domain){
+		.min_interval		= sd_weight,
+		.max_interval		= 2*sd_weight,
+		.busy_factor		= 32,
+		.imbalance_pct		= 125,
+
+		.cache_nice_tries	= 0,
+		.busy_idx		= 0,
+		.idle_idx		= 0,
+		.newidle_idx		= 0,
+		.wake_idx		= 0,
+		.forkexec_idx		= 0,
+
+		.flags			= 1*SD_LOAD_BALANCE
+					| 1*SD_BALANCE_NEWIDLE
+					| 1*SD_BALANCE_EXEC
+					| 1*SD_BALANCE_FORK
+					| 0*SD_BALANCE_WAKE
+					| 1*SD_WAKE_AFFINE
+					| 0*SD_SHARE_CPUCAPACITY
+					| 0*SD_SHARE_PKG_RESOURCES
+					| 0*SD_SERIALIZE
+					| 0*SD_PREFER_SIBLING
+					| 0*SD_NUMA
+					| sd_flags
+					,
+
+		.last_balance		= jiffies,
+		.balance_interval	= sd_weight,
+		.smt_gain		= 0,
+		.max_newidle_lb_cost	= 0,
+		.next_decay_max_lb_cost	= jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+		.name			= tl->name,
+#endif
+	};
+
+	/*
+	 * Convert topological properties into behaviour.
+	 */
+
+	if (sd->flags & SD_SHARE_CPUCAPACITY) {
+		sd->flags |= SD_PREFER_SIBLING;
+		sd->imbalance_pct = 110;
+		sd->smt_gain = 1178; /* ~15% */
+
+	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+		sd->imbalance_pct = 117;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+	} else if (sd->flags & SD_NUMA) {
+		sd->cache_nice_tries = 2;
+		sd->busy_idx = 3;
+		sd->idle_idx = 2;
+
+		sd->flags |= SD_SERIALIZE;
+		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+			sd->flags &= ~(SD_BALANCE_EXEC |
+				       SD_BALANCE_FORK |
+				       SD_WAKE_AFFINE);
+		}
+
+#endif
+	} else {
+		sd->flags |= SD_PREFER_SIBLING;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+		sd->idle_idx = 1;
+	}
+
+	sd->private = &tl->data;
+
+	return sd;
+}
+
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology =
+	default_topology;
+
+#define for_each_sd_topology(tl)			\
+	for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+	sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
+static const struct cpumask *sd_numa_mask(int cpu)
+{
+	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
+}
+
+static void sched_numa_warn(const char *str)
+{
+	static bool done = false;
+	int i, j;
+
+	if (done)
+		return;
+
+	done = true;
+
+	printk(KERN_WARNING "ERROR: %s\n\n", str);
+
+	for (i = 0; i < nr_node_ids; i++) {
+		printk(KERN_WARNING "  ");
+		for (j = 0; j < nr_node_ids; j++)
+			printk(KERN_CONT "%02d ", node_distance(i,j));
+		printk(KERN_CONT "\n");
+	}
+	printk(KERN_WARNING "\n");
+}
+
+static bool find_numa_distance(int distance)
+{
+	int i;
+
+	if (distance == node_distance(0, 0))
+		return true;
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		if (sched_domains_numa_distance[i] == distance)
+			return true;
+	}
+
+	return false;
+}
+
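+/*
+ * Work out the unique NUMA distances in the system and append one NUMA
+ * topology level per distance to the default topology list.
+ */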
+static void sched_init_numa(void)
+{
+	int next_distance, curr_distance = node_distance(0, 0);
+	struct sched_domain_topology_level *tl;
+	int level = 0;
+	int i, j, k;
+
+	sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
+	if (!sched_domains_numa_distance)
+		return;
+
+	/*
+	 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
+	 * unique distances in the node_distance() table.
+	 *
+	 * Assumes node_distance(0,j) includes all distances in
+	 * node_distance(i,j) in order to avoid cubic time.
+	 */
+	next_distance = curr_distance;
+	for (i = 0; i < nr_node_ids; i++) {
+		for (j = 0; j < nr_node_ids; j++) {
+			for (k = 0; k < nr_node_ids; k++) {
+				int distance = node_distance(i, k);
+
+				if (distance > curr_distance &&
+				    (distance < next_distance ||
+				     next_distance == curr_distance))
+					next_distance = distance;
+
+				/*
+				 * While not a strong assumption, it would be nice to
+				 * know about cases where node A is connected to B but
+				 * B is not equally connected to A.
+				 */
+				if (sched_debug() && node_distance(k, i) != distance)
+					sched_numa_warn("Node-distance not symmetric");
+
+				if (sched_debug() && i && !find_numa_distance(distance))
+					sched_numa_warn("Node-0 not representative");
+			}
+			if (next_distance != curr_distance) {
+				sched_domains_numa_distance[level++] = next_distance;
+				sched_domains_numa_levels = level;
+				curr_distance = next_distance;
+			} else break;
+		}
+
+		/*
+		 * In case of sched_debug() we verify the above assumption.
+		 */
+		if (!sched_debug())
+			break;
+	}
+	/*
+	 * 'level' contains the number of unique distances, excluding the
+	 * identity distance node_distance(i,i).
+	 *
+	 * The sched_domains_numa_distance[] array includes the actual distance
+	 * numbers.
+	 */
+
+	/*
+	 * Here, we should temporarily reset sched_domains_numa_levels to 0.
+	 * If it fails to allocate memory for array sched_domains_numa_masks[][],
+	 * the array will contain fewer than 'level' members. This could be
+	 * dangerous when we use it to iterate array sched_domains_numa_masks[][]
+	 * in other functions.
+	 *
+	 * We reset it to 'level' at the end of this function.
+	 */
+	sched_domains_numa_levels = 0;
+
+	sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+	if (!sched_domains_numa_masks)
+		return;
+
+	/*
+	 * Now for each level, construct a mask per node which contains all
+	 * cpus of nodes that are that many hops away from us.
+	 */
+	for (i = 0; i < level; i++) {
+		sched_domains_numa_masks[i] =
+			kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
+		if (!sched_domains_numa_masks[i])
+			return;
+
+		for (j = 0; j < nr_node_ids; j++) {
+			struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+			if (!mask)
+				return;
+
+			sched_domains_numa_masks[i][j] = mask;
+
+			for_each_node(k) {
+				if (node_distance(j, k) > sched_domains_numa_distance[i])
+					continue;
+
+				cpumask_or(mask, mask, cpumask_of_node(k));
+			}
+		}
+	}
+
+	/* Compute default topology size */
+	for (i = 0; sched_domain_topology[i].mask; i++);
+
+	tl = kzalloc((i + level + 1) *
+			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
+	if (!tl)
+		return;
+
+	/*
+	 * Copy the default topology bits..
+	 */
+	for (i = 0; sched_domain_topology[i].mask; i++)
+		tl[i] = sched_domain_topology[i];
+
+	/*
+	 * .. and append 'j' levels of NUMA goodness.
+	 */
+	for (j = 0; j < level; i++, j++) {
+		tl[i] = (struct sched_domain_topology_level){
+			.mask = sd_numa_mask,
+			.sd_flags = cpu_numa_flags,
+			.flags = SDTL_OVERLAP,
+			.numa_level = j,
+			SD_INIT_NAME(NUMA)
+		};
+	}
+
+	sched_domain_topology = tl;
+
+	sched_domains_numa_levels = level;
+}
+
+static void sched_domains_numa_masks_set(int cpu)
+{
+	int node = cpu_to_node(cpu);
+	int i, j;
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		for (j = 0; j < nr_node_ids; j++) {
+			if (node_distance(j, node) <= sched_domains_numa_distance[i])
+				cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
+		}
+	}
+}
+
+static void sched_domains_numa_masks_clear(int cpu)
+{
+	int i, j;
+
+	for (i = 0; i < sched_domains_numa_levels; i++) {
+		for (j = 0; j < nr_node_ids; j++)
+			cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
+	}
+}
+
+#else
+static inline void sched_init_numa(void) { }
+static inline void sched_domains_numa_masks_set(int cpu) { }
+static inline void sched_domains_numa_masks_clear(int cpu) { }
+#endif /* CONFIG_NUMA */
+
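+/* Allocate per-CPU sched_domain storage for every topology level. */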
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+	struct sched_domain_topology_level *tl;
+	int j;
+
+	for_each_sd_topology(tl) {
+		struct sd_data *sdd = &tl->data;
+
+		sdd->sd = alloc_percpu(struct sched_domain *);
+		if (!sdd->sd)
+			return -ENOMEM;
+
+		for_each_cpu(j, cpu_map) {
+			struct sched_domain *sd;
+
+			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sd)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sd, j) = sd;
+		}
+	}
+
+	return 0;
+}
+
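+/* Free the per-CPU sched_domain storage allocated by __sdt_alloc(). */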
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+	struct sched_domain_topology_level *tl;
+	int j;
+
+	for_each_sd_topology(tl) {
+		struct sd_data *sdd = &tl->data;
+
+		for_each_cpu(j, cpu_map) {
+			struct sched_domain *sd;
+
+			if (sdd->sd) {
+				sd = *per_cpu_ptr(sdd->sd, j);
+				kfree(*per_cpu_ptr(sdd->sd, j));
+			}
+		}
+		free_percpu(sdd->sd);
+		sdd->sd = NULL;
+	}
+}
+
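+/*
+ * Construct one level of the sched_domain hierarchy for @cpu and link it to
+ * its @child level.
+ */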
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+		struct sched_domain *child, int cpu)
+{
+	struct sched_domain *sd = sd_init(tl, cpu);
+	if (!sd)
+		return child;
+
+	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+	if (child) {
+		sd->level = child->level + 1;
+		sched_domain_level_max = max(sched_domain_level_max, sd->level);
+		child->parent = sd;
+		sd->child = child;
+
+		if (!cpumask_subset(sched_domain_span(child),
+				    sched_domain_span(sd))) {
+			pr_err("BUG: arch topology borken\n");
+#ifdef CONFIG_SCHED_DEBUG
+			pr_err("     the %s domain not a subset of the %s domain\n",
+					child->name, sd->name);
+#endif
+			/* Fixup, ensure @sd has at least @child cpus. */
+			cpumask_or(sched_domain_span(sd),
+				   sched_domain_span(sd),
+				   sched_domain_span(child));
+		}
+
+	}
+	set_domain_attribute(sd, attr);
+
+	return sd;
+}
+
+/*
+ * Build sched domains for a given set of cpus and attach the sched domains
+ * to the individual cpus
+ */
+static int build_sched_domains(const struct cpumask *cpu_map,
+			       struct sched_domain_attr *attr)
+{
+	enum s_alloc alloc_state;
+	struct sched_domain *sd;
+	struct s_data d;
+	int i, ret = -ENOMEM;
+
+	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
+	if (alloc_state != sa_rootdomain)
+		goto error;
+
+	/* Set up domains for cpus specified by the cpu_map. */
+	for_each_cpu(i, cpu_map) {
+		struct sched_domain_topology_level *tl;
+
+		sd = NULL;
+		for_each_sd_topology(tl) {
+			sd = build_sched_domain(tl, cpu_map, attr, sd, i);
+			if (tl == sched_domain_topology)
+				*per_cpu_ptr(d.sd, i) = sd;
+			if (tl->flags & SDTL_OVERLAP)
+				sd->flags |= SD_OVERLAP;
+			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+				break;
+		}
+	}
+
+	/* Calculate CPU capacity for physical packages and nodes */
+	for (i = nr_cpumask_bits-1; i >= 0; i--) {
+		if (!cpumask_test_cpu(i, cpu_map))
+			continue;
+
+		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+			claim_allocations(i, sd);
+		}
+	}
+
+	/* Attach the domains */
+	rcu_read_lock();
+	for_each_cpu(i, cpu_map) {
+		sd = *per_cpu_ptr(d.sd, i);
+		cpu_attach_domain(sd, d.rd, i);
+	}
+	rcu_read_unlock();
+
+	ret = 0;
+error:
+	__free_domain_allocs(&d, alloc_state, cpu_map);
+	return ret;
+}
+
+static cpumask_var_t *doms_cur;	/* current sched domains */
+static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+				/* attributes of custom domains in 'doms_cur' */
+
+/*
+ * Special case: If a kmalloc of a doms_cur partition (array of
+ * cpumask) fails, then fallback to a single sched domain,
+ * as determined by the single cpumask fallback_doms.
+ */
+static cpumask_var_t fallback_doms;
+
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __weak arch_update_cpu_topology(void)
+{
+	return 0;
+}
+
+cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
+{
+	int i;
+	cpumask_var_t *doms;
+
+	doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
+	if (!doms)
+		return NULL;
+	for (i = 0; i < ndoms; i++) {
+		if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
+			free_sched_domains(doms, i);
+			return NULL;
+		}
+	}
+	return doms;
+}
+
+void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
+{
+	unsigned int i;
+	for (i = 0; i < ndoms; i++)
+		free_cpumask_var(doms[i]);
+	kfree(doms);
+}
+
+/*
+ * Set up scheduler domains and groups. Callers must hold the hotplug lock.
+ * For now this just excludes isolated cpus, but could be used to
+ * exclude other special cases in the future.
+ */
+static int init_sched_domains(const struct cpumask *cpu_map)
+{
+	int err;
+
+	arch_update_cpu_topology();
+	ndoms_cur = 1;
+	doms_cur = alloc_sched_domains(ndoms_cur);
+	if (!doms_cur)
+		doms_cur = &fallback_doms;
+	cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
+	err = build_sched_domains(doms_cur[0], NULL);
+	register_sched_domain_sysctl();
+
+	return err;
+}
+
+/*
+ * Detach sched domains from a group of cpus specified in cpu_map.
+ * These cpus will now be attached to the NULL domain.
+ */
+static void detach_destroy_domains(const struct cpumask *cpu_map)
+{
+	int i;
+
+	rcu_read_lock();
+	for_each_cpu(i, cpu_map)
+		cpu_attach_domain(NULL, &def_root_domain, i);
+	rcu_read_unlock();
+}
+
+/* handle null as "default" */
+static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
+			struct sched_domain_attr *new, int idx_new)
+{
+	struct sched_domain_attr tmp;
+
+	/* fast path */
+	if (!new && !cur)
+		return 1;
+
+	tmp = SD_ATTR_INIT;
+	return !memcmp(cur ? (cur + idx_cur) : &tmp,
+			new ? (new + idx_new) : &tmp,
+			sizeof(struct sched_domain_attr));
+}
+
+/*
+ * Partition sched domains as specified by the 'ndoms_new'
+ * cpumasks in the array doms_new[] of cpumasks. This compares
+ * doms_new[] to the current sched domain partitioning, doms_cur[].
+ * It destroys each deleted domain and builds each new domain.
+ *
+ * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
+ * The masks don't intersect (don't overlap). We should set up one
+ * sched domain for each mask. CPUs not in any of the cpumasks will
+ * not be load balanced. If the same cpumask appears both in the
+ * current 'doms_cur' domains and in the new 'doms_new', we can leave
+ * it as it is.
+ *
+ * The passed in 'doms_new' should be allocated using
+ * alloc_sched_domains.  This routine takes ownership of it and will
+ * free_sched_domains it when done with it. If the caller failed the
+ * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
+ * and partition_sched_domains() will fall back to the single partition
+ * 'fallback_doms'; it also forces the domains to be rebuilt.
+ *
+ * If doms_new == NULL it will be replaced with cpu_online_mask.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
+ *
+ * Call with hotplug lock held
+ */
+void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
+			     struct sched_domain_attr *dattr_new)
+{
+	int i, j, n;
+	int new_topology;
+
+	mutex_lock(&sched_domains_mutex);
+
+	/* always unregister in case we don't destroy any domains */
+	unregister_sched_domain_sysctl();
+
+	/* Let architecture update cpu core mappings. */
+	new_topology = arch_update_cpu_topology();
+
+	n = doms_new ? ndoms_new : 0;
+
+	/* Destroy deleted domains */
+	for (i = 0; i < ndoms_cur; i++) {
+		for (j = 0; j < n && !new_topology; j++) {
+			if (cpumask_equal(doms_cur[i], doms_new[j])
+			    && dattrs_equal(dattr_cur, i, dattr_new, j))
+				goto match1;
+		}
+		/* no match - a current sched domain not in new doms_new[] */
+		detach_destroy_domains(doms_cur[i]);
+match1:
+		;
+	}
+
+	n = ndoms_cur;
+	if (doms_new == NULL) {
+		n = 0;
+		doms_new = &fallback_doms;
+		cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
+		WARN_ON_ONCE(dattr_new);
+	}
+
+	/* Build new domains */
+	for (i = 0; i < ndoms_new; i++) {
+		for (j = 0; j < n && !new_topology; j++) {
+			if (cpumask_equal(doms_new[i], doms_cur[j])
+			    && dattrs_equal(dattr_new, i, dattr_cur, j))
+				goto match2;
+		}
+		/* no match - add a new doms_new */
+		build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
+match2:
+		;
+	}
+
+	/* Remember the new sched domains */
+	if (doms_cur != &fallback_doms)
+		free_sched_domains(doms_cur, ndoms_cur);
+	kfree(dattr_cur);	/* kfree(NULL) is safe */
+	doms_cur = doms_new;
+	dattr_cur = dattr_new;
+	ndoms_cur = ndoms_new;
+
+	register_sched_domain_sysctl();
+
+	mutex_unlock(&sched_domains_mutex);
+}
+
+static int num_cpus_frozen;	/* used to mark begin/end of suspend/resume */
+
+/*
+ * Update cpusets according to cpu_active mask.  If cpusets are
+ * disabled, cpuset_update_active_cpus() becomes a simple wrapper
+ * around partition_sched_domains().
+ *
+ * If we come here as part of a suspend/resume, don't touch cpusets because we
+ * want to restore it back to its original state upon resume anyway.
+ */
+static void cpuset_cpu_active(void)
+{
+	if (cpuhp_tasks_frozen) {
+		/*
+		 * num_cpus_frozen tracks how many CPUs are involved in the
+		 * suspend/resume sequence. As long as this is not the last online
+		 * operation in the resume sequence, just build a single sched
+		 * domain, ignoring cpusets.
+		 */
+		num_cpus_frozen--;
+		if (likely(num_cpus_frozen)) {
+			partition_sched_domains(1, NULL, NULL);
+			return;
+		}
+		/*
+		 * This is the last CPU online operation. So fall through and
+		 * restore the original sched domains by considering the
+		 * cpuset configurations.
+		 */
+	}
+
+	cpuset_update_active_cpus(true);
+}
+
+static int cpuset_cpu_inactive(unsigned int cpu)
+{
+	if (!cpuhp_tasks_frozen) {
+		cpuset_update_active_cpus(false);
+	} else {
+		num_cpus_frozen++;
+		partition_sched_domains(1, NULL, NULL);
+	}
+	return 0;
+}
+
+int sched_cpu_activate(unsigned int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	set_cpu_active(cpu, true);
+
+	if (sched_smp_initialized) {
+		sched_domains_numa_masks_set(cpu);
+		cpuset_cpu_active();
+	}
+
+	/*
+	 * Put the rq online, if not already. This happens:
+	 *
+	 * 1) In the early boot process, because we build the real domains
+	 *    after all cpus have been brought up.
+	 *
+	 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
+	 *    domains.
+	 */
+	rq_lock_irqsave(rq, &flags);
+	if (rq->rd) {
+		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+		set_rq_online(rq);
+	}
+	unbind_zero(cpu);
+	rq_unlock_irqrestore(rq, &flags);
+
+	return 0;
+}
+
+int sched_cpu_deactivate(unsigned int cpu)
+{
+	int ret;
+
+	set_cpu_active(cpu, false);
+	/*
+	 * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
+	 * users of this state to go away such that all new such users will
+	 * observe it.
+	 *
+	 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
+	 * not imply sync_sched(), so wait for both.
+	 *
+	 * Do the sync before parking smpboot threads to take care of the
+	 * RCU boost case.
+	 */
+	if (IS_ENABLED(CONFIG_PREEMPT))
+		synchronize_rcu_mult(call_rcu, call_rcu_sched);
+	else
+		synchronize_rcu();
+
+	if (!sched_smp_initialized)
+		return 0;
+
+	ret = cpuset_cpu_inactive(cpu);
+	if (ret) {
+		set_cpu_active(cpu, true);
+		return ret;
+	}
+	sched_domains_numa_masks_clear(cpu);
+	return 0;
+}
+
+int sched_cpu_starting(unsigned int __maybe_unused cpu)
+{
+	return 0;
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+int sched_cpu_dying(unsigned int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	local_irq_save(flags);
+	double_rq_lock(rq, cpu_rq(0));
+	if (rq->rd) {
+		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
+		set_rq_offline(rq);
+	}
+	bind_zero(cpu);
+	double_rq_unlock(rq, cpu_rq(0));
+	sched_start_tick(rq, cpu);
+	hrexpiry_clear(rq);
+	local_irq_restore(flags);
+
+	return 0;
+}
+#endif
+
+#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC)
+/*
+ * Cheaper version of the below functions in case support for SMT and MC is
+ * compiled in but CPUs have no siblings.
+ */
+static bool sole_cpu_idle(struct rq *rq)
+{
+	return rq_idle(rq);
+}
+#endif
+#ifdef CONFIG_SCHED_SMT
+static const cpumask_t *thread_cpumask(int cpu)
+{
+	return topology_sibling_cpumask(cpu);
+}
+/* All this CPU's SMT siblings are idle */
+static bool siblings_cpu_idle(struct rq *rq)
+{
+	return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map);
+}
+#endif
+#ifdef CONFIG_SCHED_MC
+static const cpumask_t *core_cpumask(int cpu)
+{
+	return topology_core_cpumask(cpu);
+}
+/* All this CPU's shared cache siblings are idle */
+static bool cache_cpu_idle(struct rq *rq)
+{
+	return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map);
+}
+#endif
+
+enum sched_domain_level {
+	SD_LV_NONE = 0,
+	SD_LV_SIBLING,
+	SD_LV_MC,
+	SD_LV_BOOK,
+	SD_LV_CPU,
+	SD_LV_NODE,
+	SD_LV_ALLNODES,
+	SD_LV_MAX
+};
+
+void __init sched_init_smp(void)
+{
+	struct sched_domain *sd;
+	int cpu, other_cpu;
+#ifdef CONFIG_SCHED_SMT
+	bool smt_threads = false;
+#endif
+	cpumask_var_t non_isolated_cpus;
+	struct rq *rq;
+
+	alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
+	alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+
+	sched_init_numa();
+
+	/*
+	 * There's no userspace yet to cause hotplug operations; hence all the
+	 * cpu masks are stable and all blatant races in the below code cannot
+	 * happen.
+	 */
+	mutex_lock(&sched_domains_mutex);
+	init_sched_domains(cpu_active_mask);
+	cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
+	if (cpumask_empty(non_isolated_cpus))
+		cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
+	mutex_unlock(&sched_domains_mutex);
+
+	/* Move init over to a non-isolated CPU */
+	if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+		BUG();
+	free_cpumask_var(non_isolated_cpus);
+
+	mutex_lock(&sched_domains_mutex);
+	local_irq_disable();
+	lock_all_rqs();
+	/*
+	 * Set up the relative cache distance of each online cpu from each
+	 * other in a simple array for quick lookup. Locality is determined
+	 * by the closest sched_domain that CPUs are separated by. CPUs with
+	 * shared cache in SMT and MC are treated as local. Separate CPUs
+	 * (within the same package or physically) within the same node are
+	 * treated as not local. CPUs not even in the same domain (different
+	 * nodes) are treated as very distant.
+	 */
+	for_each_online_cpu(cpu) {
+		rq = cpu_rq(cpu);
+
+		/* First check if this cpu is in the same node */
+		for_each_domain(cpu, sd) {
+			if (sd->level > SD_LV_MC)
+				continue;
+			/* Set locality to local node if not already found lower */
+			for_each_cpu(other_cpu, sched_domain_span(sd)) {
+				if (rq->cpu_locality[other_cpu] > 3)
+					rq->cpu_locality[other_cpu] = 3;
+			}
+		}
+
+		/*
+		 * Each runqueue has its own function in case it doesn't have
+		 * siblings of its own, allowing mixed topologies.
+		 */
+#ifdef CONFIG_SCHED_MC
+		for_each_cpu(other_cpu, core_cpumask(cpu)) {
+			if (rq->cpu_locality[other_cpu] > 2)
+				rq->cpu_locality[other_cpu] = 2;
+		}
+		if (cpumask_weight(core_cpumask(cpu)) > 1) {
+			cpumask_copy(&rq->core_mask, core_cpumask(cpu));
+			cpumask_clear_cpu(cpu, &rq->core_mask);
+			rq->cache_idle = cache_cpu_idle;
+		}
+#endif
+#ifdef CONFIG_SCHED_SMT
+		if (cpumask_weight(thread_cpumask(cpu)) > 1) {
+			cpumask_copy(&rq->thread_mask, thread_cpumask(cpu));
+			cpumask_clear_cpu(cpu, &rq->thread_mask);
+			for_each_cpu(other_cpu, thread_cpumask(cpu))
+				rq->cpu_locality[other_cpu] = 1;
+			rq->siblings_idle = siblings_cpu_idle;
+			smt_threads = true;
+		}
+#endif
+	}
+	for_each_possible_cpu(cpu) {
+		int total_cpus = 1, locality;
+
+		rq = cpu_rq(cpu);
+		for (locality = 1; locality <= 4; locality++) {
+			for_each_possible_cpu(other_cpu) {
+				if (rq->cpu_locality[other_cpu] == locality)
+					rq->rq_order[total_cpus++] = cpu_rq(other_cpu);
+			}
+		}
+	}
+#ifdef CONFIG_SMT_NICE
+	if (smt_threads) {
+		check_siblings = &check_smt_siblings;
+		wake_siblings = &wake_smt_siblings;
+		smt_schedule = &smt_should_schedule;
+	}
+#endif
+	unlock_all_rqs();
+	local_irq_enable();
+	mutex_unlock(&sched_domains_mutex);
+
+	for_each_online_cpu(cpu) {
+		rq = cpu_rq(cpu);
+
+		for_each_online_cpu(other_cpu) {
+			if (other_cpu <= cpu)
+				continue;
+			printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]);
+		}
+	}
+	sched_smp_initialized = true;
+}
+#else
+void __init sched_init_smp(void)
+{
+	sched_smp_initialized = true;
+}
+#endif /* CONFIG_SMP */
+
+int in_sched_functions(unsigned long addr)
+{
+	return in_lock_functions(addr) ||
+		(addr >= (unsigned long)__sched_text_start
+		&& addr < (unsigned long)__sched_text_end);
+}
+
+#ifdef CONFIG_CGROUP_SCHED
+/* task group related information */
+struct task_group {
+	struct cgroup_subsys_state css;
+
+	struct rcu_head rcu;
+	struct list_head list;
+
+	struct task_group *parent;
+	struct list_head siblings;
+	struct list_head children;
+};
+
+/*
+ * Default task group.
+ * Every task in the system belongs to this group at bootup.
+ */
+struct task_group root_task_group;
+LIST_HEAD(task_groups);
+
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __read_mostly;
+#endif /* CONFIG_CGROUP_SCHED */
+
+void __init sched_init(void)
+{
+#ifdef CONFIG_SMP
+	int cpu_ids;
+#endif
+	int i;
+	struct rq *rq;
+
+	prio_ratios[0] = 128;
+	for (i = 1 ; i < NICE_WIDTH ; i++)
+		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
+
+	atomic_set(&grq.nr_running, 0);
+	atomic_set(&grq.nr_uninterruptible, 0);
+	atomic64_set(&grq.nr_switches, 0);
+	skiplist_node_init(&init_task.node);
+
+#ifdef CONFIG_SMP
+	init_defrootdomain();
+	cpumask_clear(&grq.cpu_idle_map);
+#else
+	uprq = &per_cpu(runqueues, 0);
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+	task_group_cache = KMEM_CACHE(task_group, 0);
+
+	list_add(&root_task_group.list, &task_groups);
+	INIT_LIST_HEAD(&root_task_group.children);
+	INIT_LIST_HEAD(&root_task_group.siblings);
+#endif /* CONFIG_CGROUP_SCHED */
+	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+		skiplist_init(&rq->node);
+		rq->sl = new_skiplist(&rq->node);
+		raw_spin_lock_init(&rq->lock);
+		rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0;
+		rq->last_jiffy = jiffies;
+		rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns =
+			      rq->iowait_ns = rq->idle_ns = 0;
+		rq->dither = 0;
+		set_rq_task(rq, &init_task);
+		rq->iso_ticks = 0;
+		rq->iso_refractory = false;
+#ifdef CONFIG_SMP
+		rq->sd = NULL;
+		rq->rd = NULL;
+		rq->online = false;
+		rq->cpu = i;
+		rq_attach_root(rq, &def_root_domain);
+#endif
+		init_rq_hrexpiry(rq);
+		atomic_set(&rq->nr_iowait, 0);
+	}
+
+#ifdef CONFIG_SMP
+	cpu_ids = i;
+	/*
+	 * Set the base locality for cpu cache distance calculation to
+	 * "distant" (3). Make sure the distance from a CPU to itself is 0.
+	 */
+	for_each_possible_cpu(i) {
+		int j;
+
+		rq = cpu_rq(i);
+#ifdef CONFIG_SCHED_SMT
+		rq->siblings_idle = sole_cpu_idle;
+#endif
+#ifdef CONFIG_SCHED_MC
+		rq->cache_idle = sole_cpu_idle;
+#endif
+		rq->cpu_locality = kmalloc(cpu_ids * sizeof(int), GFP_ATOMIC);
+		for_each_possible_cpu(j) {
+			if (i == j)
+				rq->cpu_locality[j] = 0;
+			else
+				rq->cpu_locality[j] = 4;
+		}
+		rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC);
+		rq->rq_order[0] = rq;
+		for (j = 1; j < cpu_ids; j++)
+			rq->rq_order[j] = cpu_rq(j);
+	}
+#endif
+
+#ifdef CONFIG_PREEMPT_NOTIFIERS
+	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
+#endif
+
+	/*
+	 * The boot idle thread does lazy MMU switching as well:
+	 */
+	atomic_inc(&init_mm.mm_count);
+	enter_lazy_tlb(&init_mm, current);
+
+	/*
+	 * Make us the idle thread. Technically, schedule() should not be
+	 * called from this thread, however somewhere below it might be,
+	 * but because we are the idle thread, we just pick up running again
+	 * when this runqueue becomes "idle".
+	 */
+	init_idle(current, smp_processor_id());
+
+#ifdef CONFIG_SMP
+	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
+	/* May be allocated at isolcpus cmdline parse time */
+	if (cpu_isolated_map == NULL)
+		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+	idle_thread_set_boot_cpu();
+#endif /* SMP */
+
+	init_schedstats();
+}
+
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+static inline int preempt_count_equals(int preempt_offset)
+{
+	int nested = preempt_count() + rcu_preempt_depth();
+
+	return (nested == preempt_offset);
+}
+
+void __might_sleep(const char *file, int line, int preempt_offset)
+{
+	/*
+	 * Blocking primitives will set (and therefore destroy) current->state,
+	 * since we will exit with TASK_RUNNING make sure we enter with it,
+	 * otherwise we will destroy state.
+	 */
+	WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
+			"do not call blocking ops when !TASK_RUNNING; "
+			"state=%lx set at [<%p>] %pS\n",
+			current->state,
+			(void *)current->task_state_change,
+			(void *)current->task_state_change);
+
+	___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
+	static unsigned long prev_jiffy;	/* ratelimiting */
+
+	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
+	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
+	     !is_idle_task(current)) ||
+	    system_state != SYSTEM_RUNNING || oops_in_progress)
+		return;
+	if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
+		return;
+	prev_jiffy = jiffies;
+
+	printk(KERN_ERR
+		"BUG: sleeping function called from invalid context at %s:%d\n",
+			file, line);
+	printk(KERN_ERR
+		"in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
+			in_atomic(), irqs_disabled(),
+			current->pid, current->comm);
+
+	if (task_stack_end_corrupted(current))
+		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
+
+	debug_show_held_locks(current);
+	if (irqs_disabled())
+		print_irqtrace_events(current);
+#ifdef CONFIG_DEBUG_PREEMPT
+	if (!preempt_count_equals(preempt_offset)) {
+		pr_err("Preemption disabled at:");
+		print_ip_sym(current->preempt_disable_ip);
+		pr_cont("\n");
+	}
+#endif
+	dump_stack();
+}
+EXPORT_SYMBOL(___might_sleep);
+#endif
+
+#ifdef CONFIG_MAGIC_SYSRQ
+static inline void normalise_rt_tasks(void)
+{
+	struct task_struct *g, *p;
+	unsigned long flags;
+	struct rq *rq;
+
+	read_lock(&tasklist_lock);
+	for_each_process_thread(g, p) {
+		/*
+		 * Only normalize user tasks:
+		 */
+		if (p->flags & PF_KTHREAD)
+			continue;
+
+		if (!rt_task(p) && !iso_task(p))
+			continue;
+
+		rq = task_rq_lock(p, &flags);
+		__setscheduler(p, rq, SCHED_NORMAL, 0, false);
+		task_rq_unlock(rq, p, &flags);
+	}
+	read_unlock(&tasklist_lock);
+}
+
+void normalize_rt_tasks(void)
+{
+	normalise_rt_tasks();
+}
+#endif /* CONFIG_MAGIC_SYSRQ */
+
+#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
+/*
+ * These functions are only useful for the IA64 MCA handling, or kdb.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ *
+ * Return: The current task for @cpu.
+ */
+struct task_struct *curr_task(int cpu)
+{
+	return cpu_curr(cpu);
+}
+
+#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
+
+#ifdef CONFIG_IA64
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack.  It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner.  This function
+ * must be called with all CPUs synchronised and interrupts disabled, and
+ * the caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, struct task_struct *p)
+{
+	cpu_curr(cpu) = p;
+}
+
+#endif
+
+void init_idle_bootup_task(struct task_struct *idle)
+{}
+
+#ifdef CONFIG_SCHED_DEBUG
+void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
+{}
+
+void proc_sched_set_task(struct task_struct *p)
+{}
+#endif
+
+#ifdef CONFIG_SMP
+#define SCHED_LOAD_SHIFT	(10)
+#define SCHED_LOAD_SCALE	(1L << SCHED_LOAD_SHIFT)
+
+unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+{
+	return SCHED_LOAD_SCALE;
+}
+
+unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+{
+	unsigned long weight = cpumask_weight(sched_domain_span(sd));
+	unsigned long smt_gain = sd->smt_gain;
+
+	smt_gain /= weight;
+
+	return smt_gain;
+}
+#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+static void sched_free_group(struct task_group *tg)
+{
+	kmem_cache_free(task_group_cache, tg);
+}
+
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(struct task_group *parent)
+{
+	struct task_group *tg;
+
+	tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
+	if (!tg)
+		return ERR_PTR(-ENOMEM);
+
+	return tg;
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+}
+
+/* rcu callback to free various structures associated with a task group */
+static void sched_free_group_rcu(struct rcu_head *rhp)
+{
+	/* now it should be safe to free the task group */
+	sched_free_group(container_of(rhp, struct task_group, rcu));
+}
+
+void sched_destroy_group(struct task_group *tg)
+{
+	/* wait for possible concurrent references to the task group to complete */
+	call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
+}
+
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+{
+	return css ? container_of(css, struct task_group, css) : NULL;
+}
+
+static struct cgroup_subsys_state *
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+	struct task_group *parent = css_tg(parent_css);
+	struct task_group *tg;
+
+	if (!parent) {
+		/* This is early initialization for the top cgroup */
+		return &root_task_group.css;
+	}
+
+	tg = sched_create_group(parent);
+	if (IS_ERR(tg))
+		return ERR_PTR(-ENOMEM);
+	return &tg->css;
+}
+
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+	struct task_group *tg = css_tg(css);
+
+	sched_offline_group(tg);
+}
+
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+	struct task_group *tg = css_tg(css);
+
+	/*
+	 * Relies on the RCU grace period between css_released() and this.
+	 */
+	sched_free_group(tg);
+}
+
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+}
+
+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
+{
+	return 0;
+}
+
+static void cpu_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+
+static struct cftype cpu_files[] = {
+	{ }	/* terminate */
+};
+
+struct cgroup_subsys cpu_cgrp_subsys = {
+	.css_alloc	= cpu_cgroup_css_alloc,
+	.css_released	= cpu_cgroup_css_released,
+	.css_free	= cpu_cgroup_css_free,
+	.fork		= cpu_cgroup_fork,
+	.can_attach	= cpu_cgroup_can_attach,
+	.attach		= cpu_cgroup_attach,
+	.legacy_cftypes	= cpu_files,
+	.early_init	= true,
+};
+#endif	/* CONFIG_CGROUP_SCHED */
diff --git b/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h
new file mode 100644
index 0000000..3565a7d
--- /dev/null
+++ b/kernel/sched/MuQSS.h
@@ -0,0 +1,343 @@
+#include <linux/sched.h>
+#include <linux/cpuidle.h>
+#include <linux/interrupt.h>
+#include <linux/skip_list.h>
+#include <linux/stop_machine.h>
+#include "cpuacct.h"
+
+#ifndef MUQSS_SCHED_H
+#define MUQSS_SCHED_H
+
+/* task_struct::on_rq states: */
+#define TASK_ON_RQ_QUEUED	1
+#define TASK_ON_RQ_MIGRATING	2
+
+/*
+ * This is the main, per-CPU runqueue data structure.
+ * This data should only be modified by the local cpu.
+ */
+struct rq {
+	struct task_struct *curr, *idle, *stop;
+	struct mm_struct *prev_mm;
+
+	raw_spinlock_t lock;
+
+	/* Stored data about rq->curr to work outside rq lock */
+	u64 rq_deadline;
+	int rq_prio;
+
+	/* Best queued id for use outside lock */
+	u64 best_key;
+
+	unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */
+	unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */
+	u64 niffies; /* Last time this RQ updated rq clock */
+	u64 last_niffy; /* Last niffies as updated by local clock */
+	u64 last_jiffy_niffies; /* Niffies @ last_jiffy */
+
+	u64 load_update; /* When we last updated load */
+	unsigned long load_avg; /* Rolling load average */
+#ifdef CONFIG_SMT_NICE
+	struct mm_struct *rq_mm;
+	int rq_smt_bias; /* Policy/nice level bias across smt siblings */
+#endif
+	/* Accurate timekeeping data */
+	unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns,
+		iowait_ns, idle_ns;
+	atomic_t nr_iowait;
+
+	skiplist_node node;
+	skiplist *sl;
+#ifdef CONFIG_SMP
+	struct task_struct *preempt; /* Preempt triggered on this task */
+
+	int cpu;		/* cpu of this runqueue */
+	bool online;
+
+	struct root_domain *rd;
+	struct sched_domain *sd;
+	int *cpu_locality; /* CPU relative cache distance */
+	struct rq **rq_order; /* RQs ordered by relative cache distance */
+
+#ifdef CONFIG_SCHED_SMT
+	cpumask_t thread_mask;
+	bool (*siblings_idle)(struct rq *rq);
+	/* See if all smt siblings are idle */
+#endif /* CONFIG_SCHED_SMT */
+#ifdef CONFIG_SCHED_MC
+	cpumask_t core_mask;
+	bool (*cache_idle)(struct rq *rq);
+	/* See if all cache siblings are idle */
+#endif /* CONFIG_SCHED_MC */
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+	u64 prev_irq_time;
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#ifdef CONFIG_PARAVIRT
+	u64 prev_steal_time;
+#endif /* CONFIG_PARAVIRT */
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	u64 prev_steal_time_rq;
+#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */
+
+	u64 clock, old_clock, last_tick;
+	u64 clock_task;
+	int dither;
+
+	int iso_ticks;
+	bool iso_refractory;
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+	struct hrtimer hrexpiry_timer;
+#endif
+
+#ifdef CONFIG_SCHEDSTATS
+
+	/* latency stats */
+	struct sched_info rq_sched_info;
+	unsigned long long rq_cpu_time;
+	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
+
+	/* sys_sched_yield() stats */
+	unsigned int yld_count;
+
+	/* schedule() stats */
+	unsigned int sched_switch;
+	unsigned int sched_count;
+	unsigned int sched_goidle;
+
+	/* try_to_wake_up() stats */
+	unsigned int ttwu_count;
+	unsigned int ttwu_local;
+#endif /* CONFIG_SCHEDSTATS */
+
+#ifdef CONFIG_SMP
+	struct llist_head wake_list;
+#endif
+
+#ifdef CONFIG_CPU_IDLE
+	/* Must be inspected within a rcu lock section */
+	struct cpuidle_state *idle_state;
+#endif
+};
+
+#ifdef CONFIG_SMP
+struct rq *cpu_rq(int cpu);
+#endif
+
+#ifndef CONFIG_SMP
+extern struct rq *uprq;
+#define cpu_rq(cpu)	(uprq)
+#define this_rq()	(uprq)
+#define raw_rq()	(uprq)
+#define task_rq(p)	(uprq)
+#define cpu_curr(cpu)	((uprq)->curr)
+#else /* CONFIG_SMP */
+DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#define this_rq()		this_cpu_ptr(&runqueues)
+#define raw_rq()		raw_cpu_ptr(&runqueues)
+#endif /* CONFIG_SMP */
+
+/*
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP  - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ *                are in a known state which allows modification. Such pairs
+ *                should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ *        in the runqueue.
+ *
+ * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_MIGRATED  - the task was migrated during wakeup
+ *
+ */
+
+#define DEQUEUE_SLEEP		0x01
+#define DEQUEUE_SAVE		0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE		0x04 /* matches ENQUEUE_MOVE */
+
+#define ENQUEUE_WAKEUP		0x01
+#define ENQUEUE_RESTORE		0x02
+#define ENQUEUE_MOVE		0x04
+
+#define ENQUEUE_HEAD		0x08
+#define ENQUEUE_REPLENISH	0x10
+#ifdef CONFIG_SMP
+#define ENQUEUE_MIGRATED	0x20
+#else
+#define ENQUEUE_MIGRATED	0x00
+#endif
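+
+/*
+ * Illustrative sketch only (kept as a comment): a typical pairing of the
+ * flags above for a spurious dequeue/enqueue cycle, e.g. when changing a
+ * task's properties. The helper names are hypothetical; only the flag
+ * combinations come from the definitions above.
+ *
+ *	dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ *	...change p's priority/policy...
+ *	enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_MOVE);
+ *
+ * DEQUEUE_SAVE pairs with ENQUEUE_RESTORE and DEQUEUE_MOVE with ENQUEUE_MOVE,
+ * so the two halves of such a cycle can be matched at a glance.
+ */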
+
+static inline u64 __rq_clock_broken(struct rq *rq)
+{
+	return READ_ONCE(rq->clock);
+}
+
+static inline u64 rq_clock(struct rq *rq)
+{
+	lockdep_assert_held(&rq->lock);
+	return rq->clock;
+}
+
+static inline u64 rq_clock_task(struct rq *rq)
+{
+	lockdep_assert_held(&rq->lock);
+	return rq->clock_task;
+}
+
+extern struct mutex sched_domains_mutex;
+extern struct static_key_false sched_schedstats;
+
+#define rcu_dereference_check_sched_domain(p) \
+	rcu_dereference_check((p), \
+			      lockdep_is_held(&sched_domains_mutex))
+
+/*
+ * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
+ * See detach_destroy_domains: synchronize_sched for details.
+ *
+ * The domain tree of any CPU may only be accessed from within
+ * preempt-disabled sections.
+ */
+#define for_each_domain(cpu, __sd) \
+	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
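+
+/*
+ * Illustrative sketch only (kept as a comment): walking the domain tree of a
+ * CPU. As noted above this must run inside an RCU read-side/preempt-disabled
+ * section; the loop body is an assumed example.
+ *
+ *	struct sched_domain *sd;
+ *
+ *	rcu_read_lock();
+ *	for_each_domain(cpu, sd) {
+ *		if (cpumask_test_cpu(other_cpu, sched_domain_span(sd)))
+ *			break;
+ *	}
+ *	rcu_read_unlock();
+ */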
+
+#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
+void register_sched_domain_sysctl(void);
+void unregister_sched_domain_sysctl(void);
+#else
+static inline void register_sched_domain_sysctl(void)
+{
+}
+static inline void unregister_sched_domain_sysctl(void)
+{
+}
+#endif
+
+#ifdef CONFIG_SMP
+extern void sched_ttwu_pending(void);
+extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
+#else
+static inline void sched_ttwu_pending(void) { }
+#endif
+
+#ifdef CONFIG_CPU_IDLE
+static inline void idle_set_state(struct rq *rq,
+				  struct cpuidle_state *idle_state)
+{
+	rq->idle_state = idle_state;
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+	WARN_ON(!rcu_read_lock_held());
+	return rq->idle_state;
+}
+#else
+static inline void idle_set_state(struct rq *rq,
+				  struct cpuidle_state *idle_state)
+{
+}
+
+static inline struct cpuidle_state *idle_get_state(struct rq *rq)
+{
+	return NULL;
+}
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+DECLARE_PER_CPU(u64, cpu_hardirq_time);
+DECLARE_PER_CPU(u64, cpu_softirq_time);
+
+#ifndef CONFIG_64BIT
+DECLARE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
+{
+	__this_cpu_inc(irq_time_seq.sequence);
+	smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+	smp_wmb();
+	__this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+	u64 irq_time;
+	unsigned seq;
+
+	do {
+		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+		irq_time = per_cpu(cpu_softirq_time, cpu) +
+			   per_cpu(cpu_hardirq_time, cpu);
+	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+	return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+#endif /* CONFIG_64BIT */
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_CPU_FREQ
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+static inline void cpufreq_trigger(u64 time, unsigned long util)
+{
+       struct update_util_data *data;
+
+       if (util > SCHED_CAPACITY_SCALE)
+	       util = SCHED_CAPACITY_SCALE;
+       data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+       if (data)
+               data->func(data, time, util, SCHED_CAPACITY_SCALE);
+}
+#else
+static inline void cpufreq_trigger(u64 time, unsigned long util)
+{
+}
+#endif /* CONFIG_CPU_FREQ */
+
+#ifdef arch_scale_freq_capacity
+#ifndef arch_scale_freq_invariant
+#define arch_scale_freq_invariant()	(true)
+#endif
+#else /* arch_scale_freq_capacity */
+#define arch_scale_freq_invariant()	(false)
+#endif
+
+/*
+ * This should only be called when current == rq->idle. Dodgy workaround for
+ * when softirqs are pending and we are in the idle loop. Setting current to
+ * resched will kick us out of the idle loop and the softirqs will be serviced
+ * on our next pass through schedule().
+ */
+static inline bool softirq_pending(int cpu)
+{
+	if (likely(!local_softirq_pending()))
+		return false;
+	set_tsk_need_resched(current);
+	return true;
+}
+
+#endif /* MUQSS_SCHED_H */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 1141954..2a644b6 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -9,7 +9,11 @@
  * published by the Free Software Foundation.
  */
 
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
+#else
 #include "sched.h"
+#endif
 
 DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index a84641b..a2bf1ea 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -16,7 +16,11 @@
 #include <linux/slab.h>
 #include <trace/events/power.h>
 
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
+#else
 #include "sched.h"
+#endif
 
 struct sugov_tunables {
 	struct gov_attr_set attr_set;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a846cf8..f09077a 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -4,7 +4,12 @@
 #include <linux/kernel_stat.h>
 #include <linux/static_key.h>
 #include <linux/context_tracking.h>
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
+#include "stats.h"
+#else
 #include "sched.h"
+#endif
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #endif
@@ -671,7 +676,7 @@ out:
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	struct task_cputime cputime = {
-		.sum_exec_runtime = p->se.sum_exec_runtime,
+		.sum_exec_runtime = tsk_seruntime(p),
 	};
 
 	task_cputime(p, &cputime.utime, &cputime.stime);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8b3610c..b728c41 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,8 +47,13 @@
  * (to see the precise effective timeslice length of your workload,
  *  run vmstat and monitor the context-switches (cs) field)
  */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned int sysctl_sched_latency = 3000000ULL;
+unsigned int normalized_sysctl_sched_latency = 3000000ULL;
+#else
 unsigned int sysctl_sched_latency = 6000000ULL;
 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+#endif
 
 /*
  * The initial- and re-scaling of tunables is configurable
@@ -66,13 +71,22 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling
  * Minimal preemption granularity for CPU-bound tasks:
  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned int sysctl_sched_min_granularity = 300000ULL;
+unsigned int normalized_sysctl_sched_min_granularity = 300000ULL;
+#else
 unsigned int sysctl_sched_min_granularity = 750000ULL;
 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+#endif
 
 /*
  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
  */
+#ifdef CONFIG_PCK_INTERACTIVE
+static unsigned int sched_nr_latency = 10;
+#else
 static unsigned int sched_nr_latency = 8;
+#endif
 
 /*
  * After fork, child runs first. If set to 0 (default) then
@@ -88,10 +102,17 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
  */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned int sysctl_sched_wakeup_granularity = 500000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity = 500000UL;
+
+const_debug unsigned int sysctl_sched_migration_cost = 250000UL;
+#else
 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+#endif
 
 /*
  * The exponential sliding  window over which load is averaged for shares
@@ -111,8 +132,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
  *
  * default: 5 msec, units: microseconds
   */
+#ifdef CONFIG_PCK_INTERACTIVE
+unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL;
+#else
 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #endif
+#endif
 
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 9fb873c..51264e6 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -14,7 +14,11 @@
 
 #include <trace/events/power.h>
 
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
+#else
 #include "sched.h"
+#endif
 
 /**
  * sched_idle_set_state - Record idle state for the current CPU.
@@ -204,6 +208,8 @@ static void cpu_idle_loop(void)
 	int cpu = smp_processor_id();
 
 	while (1) {
+		bool pending = false;
+
 		/*
 		 * If the arch has a polling bit, we maintain an invariant:
 		 *
@@ -215,7 +221,10 @@ static void cpu_idle_loop(void)
 
 		__current_set_polling();
 		quiet_vmstat();
-		tick_nohz_idle_enter();
+		if (unlikely(softirq_pending(cpu)))
+			pending = true;
+		else
+			tick_nohz_idle_enter();
 
 		while (!need_resched()) {
 			check_pgt_cache();
@@ -255,7 +264,8 @@ static void cpu_idle_loop(void)
 		 * not have had an IPI to fold the state for us.
 		 */
 		preempt_set_need_resched();
-		tick_nohz_idle_exit();
+		if (!pending)
+			tick_nohz_idle_exit();
 		__current_clr_polling();
 
 		/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c64fc51..cdefab6 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1813,3 +1813,8 @@ static inline void cpufreq_trigger_update(u64 time) {}
 #else /* arch_scale_freq_capacity */
 #define arch_scale_freq_invariant()	(false)
 #endif
+
+static inline bool softirq_pending(int cpu)
+{
+	return false;
+}
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 87e2c9f..ba7b137 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -4,7 +4,11 @@
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
 
+#ifndef CONFIG_SCHED_MUQSS
 #include "sched.h"
+#else
+#include "MuQSS.h"
+#endif
 
 /*
  * bump this up when changing the output format or the meaning of an existing
diff --git b/kernel/skip_list.c b/kernel/skip_list.c
new file mode 100644
index 0000000..d525080
--- /dev/null
+++ b/kernel/skip_list.c
@@ -0,0 +1,148 @@
+/*
+  Copyright (C) 2011,2016 Con Kolivas.
+
+  Code based on example originally by William Pugh.
+
+Skip Lists are a probabilistic alternative to balanced trees, as
+described in the June 1990 issue of CACM and were invented by
+William Pugh in 1987.
+
+A couple of comments about this implementation:
+The routine randomLevel has been hard-coded to generate random
+levels using p=0.25. It can be easily changed.
+
+The insertion routine has been implemented so as to use the
+dirty hack described in the CACM paper: if a random level is
+generated that is more than the current maximum level, the
+current maximum level plus one is used instead.
+
+Levels start at zero and go up to MaxLevel (which is equal to
+MaxNumberOfLevels-1).
+
+The routines defined in this file are:
+
+init: defines slnode
+
+new_skiplist: returns a new, empty list
+
+randomLevel: Returns a random level based on a u64 random seed passed to it.
+In MuQSS, the "niffy" time is used for this purpose.
+
+insert(l,key, value): inserts the binding (key, value) into l. This operation
+occurs in O(log n) time.
+
+delnode(slnode, l, node): deletes any binding of key from l based on the
+actual node value. This operation occurs in O(k) time where k is the
+number of levels of the node in question (max 8). The original delete
+function occurred in O(log n) time and involved a search.
+
+MuQSS Notes: In this implementation of skiplists, there are bidirectional
+next/prev pointers and the insert function returns a pointer to the actual
+node where the value is stored. The key here is chosen by the scheduler so as to
+sort tasks according to the priority list requirements and is no longer used
+by the scheduler after insertion. The scheduler lookup, however, occurs in
+O(1) time because it is always the first item in the level 0 linked list.
+Since the task struct stores a copy of the node pointer upon skiplist_insert,
+it can also remove it much faster than the original implementation with the
+aid of prev<->next pointer manipulation and no searching.
+
+*/
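+
+/*
+ * Illustrative usage sketch only (kept as a comment), built from the
+ * functions defined below; the embedding task pointer, key and randseed
+ * values are assumptions:
+ *
+ *	static skiplist_node sl_header;
+ *	static skiplist *sl;
+ *
+ *	skiplist_init(&sl_header);
+ *	sl = new_skiplist(&sl_header);
+ *
+ *	skiplist_insert(sl, &p->node, key, p, niffies);
+ *	best = sl->header->next[0]->value;	// O(1) lookup of the first entry
+ *	skiplist_delete(sl, &p->node);
+ *
+ *	free_skiplist(sl);
+ */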
+
+#include <linux/slab.h>
+#include <linux/skip_list.h>
+
+#define MaxNumberOfLevels 8
+#define MaxLevel (MaxNumberOfLevels - 1)
+
+void skiplist_init(skiplist_node *slnode)
+{
+	int i;
+
+	slnode->key = 0xFFFFFFFFFFFFFFFF;
+	slnode->level = 0;
+	slnode->value = NULL;
+	for (i = 0; i < MaxNumberOfLevels; i++)
+		slnode->next[i] = slnode->prev[i] = slnode;
+}
+
+skiplist *new_skiplist(skiplist_node *slnode)
+{
+	skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC);
+
+	BUG_ON(!l);
+	l->header = slnode;
+	return l;
+}
+
+void free_skiplist(skiplist *l)
+{
+	skiplist_node *p, *q;
+
+	p = l->header;
+	do {
+		q = p->next[0];
+		p->next[0]->prev[0] = q->prev[0];
+		skiplist_node_init(p);
+		p = q;
+	} while (p != l->header);
+	kfree(l);
+}
+
+void skiplist_node_init(skiplist_node *node)
+{
+	memset(node, 0, sizeof(skiplist_node));
+}
+
+static inline unsigned int randomLevel(const long unsigned int randseed)
+{
+	return find_first_bit(&randseed, MaxLevel);
+}
+
+void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed)
+{
+	skiplist_node *update[MaxNumberOfLevels];
+	skiplist_node *p, *q;
+	int k = l->level;
+
+	p = l->header;
+	do {
+		while (q = p->next[k], q->key <= key)
+			p = q;
+		update[k] = p;
+	} while (--k >= 0);
+
+	++l->entries;
+	k = randomLevel(randseed);
+	if (k > l->level) {
+		k = ++l->level;
+		update[k] = l->header;
+	}
+
+	node->level = k;
+	node->key = key;
+	node->value = value;
+	do {
+		p = update[k];
+		node->next[k] = p->next[k];
+		p->next[k] = node;
+		node->prev[k] = p;
+		node->next[k]->prev[k] = node;
+	} while (--k >= 0);
+}
+
+void skiplist_delete(skiplist *l, skiplist_node *node)
+{
+	int k, m = node->level;
+
+	for (k = 0; k <= m; k++) {
+		node->prev[k]->next[k] = node->next[k];
+		node->next[k]->prev[k] = node->prev[k];
+	}
+	skiplist_node_init(node);
+	if (m == l->level) {
+		while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0)
+			m--;
+		l->level = m;
+	}
+	l->entries--;
+}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index a13bbda..4bd1859 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -125,8 +125,13 @@ static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
 static int __maybe_unused four = 4;
 static unsigned long one_ul = 1;
-static int one_hundred = 100;
-static int one_thousand = 1000;
+static int __read_mostly one_hundred = 100;
+static int __read_mostly one_thousand = 1000;
+#ifdef CONFIG_SCHED_MUQSS
+extern int rr_interval;
+extern int sched_interactive;
+extern int sched_iso_cpu;
+#endif
 #ifdef CONFIG_PRINTK
 static int ten_thousand = 10000;
 #endif
@@ -264,7 +269,7 @@ static struct ctl_table sysctl_base_table[] = {
 	{ }
 };
 
-#ifdef CONFIG_SCHED_DEBUG
+#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS)
 static int min_sched_granularity_ns = 100000;		/* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
@@ -281,6 +286,7 @@ static int max_extfrag_threshold = 1000;
 #endif
 
 static struct ctl_table kern_table[] = {
+#ifndef CONFIG_SCHED_MUQSS
 	{
 		.procname	= "sched_child_runs_first",
 		.data		= &sysctl_sched_child_runs_first,
@@ -449,6 +455,7 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 	},
 #endif
+#endif /* !CONFIG_SCHED_MUQSS */
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
@@ -1013,6 +1020,35 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 #endif
+#ifdef CONFIG_SCHED_MUQSS
+	{
+		.procname	= "rr_interval",
+		.data		= &rr_interval,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one_thousand,
+	},
+	{
+		.procname	= "interactive",
+		.data		= &sched_interactive,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+	{
+		.procname	= "iso_cpu",
+		.data		= &sched_iso_cpu,
+		.maxlen		= sizeof (int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#endif
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
 	{
 		.procname	= "spin_retry",
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d513051..e056d54 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -119,3 +119,4 @@ void task_work_run(void)
 		} while (work);
 	}
 }
+EXPORT_SYMBOL_GPL(task_work_run);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 2c5bc77..b96deed 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -198,8 +198,13 @@ int clockevents_tick_resume(struct clock_event_device *dev)
 
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST
 
+#ifdef CONFIG_SCHED_MUQSS
+/* Limit min_delta to 100us */
+#define MIN_DELTA_LIMIT		(NSEC_PER_SEC / 10000)
+#else
 /* Limit min_delta to a jiffie */
 #define MIN_DELTA_LIMIT		(NSEC_PER_SEC / HZ)
+#endif
 
 /**
  * clockevents_increase_min_delta - raise minimum delta of a clock event device
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 39008d7..784f3a1 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -447,7 +447,7 @@ static void cleanup_timers(struct list_head *head)
  */
 void posix_cpu_timers_exit(struct task_struct *tsk)
 {
-	add_device_randomness((const void*) &tsk->se.sum_exec_runtime,
+	add_device_randomness((const void*) &tsk_seruntime(tsk),
 						sizeof(unsigned long long));
 	cleanup_timers(tsk->cpu_timers);
 
@@ -848,7 +848,7 @@ static void check_thread_timers(struct task_struct *tsk,
 	tsk_expires->virt_exp = expires_to_cputime(expires);
 
 	tsk_expires->sched_exp = check_timers_list(++timers, firing,
-						   tsk->se.sum_exec_runtime);
+						   tsk_seruntime(tsk));
 
 	/*
 	 * Check for the special case thread timers.
@@ -859,7 +859,7 @@ static void check_thread_timers(struct task_struct *tsk,
 			READ_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max);
 
 		if (hard != RLIM_INFINITY &&
-		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+		    tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the hard limit, we just die.
 			 * No need to calculate anything else now.
@@ -867,7 +867,7 @@ static void check_thread_timers(struct task_struct *tsk,
 			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
 			return;
 		}
-		if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
+		if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) {
 			/*
 			 * At the soft limit, send a SIGXCPU every second.
 			 */
@@ -1115,7 +1115,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 		struct task_cputime task_sample;
 
 		task_cputime(tsk, &task_sample.utime, &task_sample.stime);
-		task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime;
+		task_sample.sum_exec_runtime = tsk_seruntime(tsk);
 		if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
 			return 1;
 	}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 96db64b..e2e7158 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1464,7 +1464,7 @@ static unsigned long __next_timer_interrupt(struct timer_base *base)
  * Check, if the next hrtimer event is before the next timer wheel
  * event:
  */
-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
+static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires)
 {
 	u64 nextevt = hrtimer_get_next_event();
 
@@ -1482,6 +1482,9 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
 	if (nextevt <= basem)
 		return basem;
 
+	if (nextevt < expires && nextevt - basem <= TICK_NSEC)
+		base->is_idle = false;
+
 	/*
 	 * Round up to the next jiffie. High resolution timers are
 	 * off, so the hrtimers are expired in the tick and we need to
@@ -1545,7 +1548,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	}
 	spin_unlock(&base->lock);
 
-	return cmp_next_hrtimer_event(basem, expires);
+	return cmp_next_hrtimer_event(base, basem, expires);
 }
 
 /**
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b0f86ea..69ee53a 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1039,10 +1039,15 @@ static int trace_wakeup_test_thread(void *data)
 {
 	/* Make this a -deadline thread */
 	static const struct sched_attr attr = {
+#ifdef CONFIG_SCHED_MUQSS
+		/* No deadline on MuQSS, use RR */
+		.sched_policy = SCHED_RR,
+#else
 		.sched_policy = SCHED_DEADLINE,
 		.sched_runtime = 100000ULL,
 		.sched_deadline = 10000000ULL,
 		.sched_period = 10000000ULL
+#endif
 	};
 	struct wakeup_test_data *x = data;
 
diff --git a/lib/Kconfig b/lib/Kconfig
index d79909d..c585e4c 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -550,4 +550,7 @@ config STACKDEPOT
 	bool
 	select STACKTRACE
 
+config WBT
+	bool
+
 endmenu
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cab7405..6f1b7a7 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -924,7 +924,7 @@ config SCHED_INFO
 
 config SCHEDSTATS
 	bool "Collect scheduler statistics"
-	depends on DEBUG_KERNEL && PROC_FS
+	depends on DEBUG_KERNEL && PROC_FS && !SCHED_MUQSS
 	select SCHED_INFO
 	help
 	  If you say Y here, additional code will be inserted into the
@@ -1676,6 +1676,7 @@ config LATENCYTOP
 	depends on DEBUG_KERNEL
 	depends on STACKTRACE_SUPPORT
 	depends on PROC_FS
+	depends on !SCHED_MUQSS
 	select FRAME_POINTER if !MIPS && !PPC && !S390 && !MICROBLAZE && !ARM_UNWIND && !ARC
 	select KALLSYMS
 	select KALLSYMS_ALL
diff --git a/lib/Makefile b/lib/Makefile
index 5dc77a8..5bd016b 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -17,7 +17,7 @@ KCOV_INSTRUMENT_debugobjects.o := n
 KCOV_INSTRUMENT_dynamic_debug.o := n
 
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
-	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
+	 rbtree.o radix-tree.o sradix-tree.o dump_stack.o timerqueue.o\
 	 idr.o int_sqrt.o extable.o \
 	 sha1.o chacha20.o md5.o irq_regs.o argv_split.o \
 	 flex_proportions.o ratelimit.o show_mem.o \
@@ -177,6 +177,7 @@ obj-$(CONFIG_SG_SPLIT) += sg_split.o
 obj-$(CONFIG_SG_POOL) += sg_pool.o
 obj-$(CONFIG_STMP_DEVICE) += stmp_device.o
 obj-$(CONFIG_IRQ_POLL) += irq_poll.o
+obj-$(CONFIG_WBT) += wbt.o
 
 obj-$(CONFIG_STACKDEPOT) += stackdepot.o
 KASAN_SANITIZE_stackdepot.o := n
diff --git b/lib/sradix-tree.c b/lib/sradix-tree.c
new file mode 100644
index 0000000..8d06329
--- /dev/null
+++ b/lib/sradix-tree.c
@@ -0,0 +1,476 @@
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/gcd.h>
+#include <linux/sradix-tree.h>
+
+static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node)
+{
+	return node->fulls == root->stores_size || 
+		(node->height == 1 && node->count == root->stores_size);
+}
+
+/*
+ *	Extend a sradix tree so it can store key @index.
+ */
+static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index)
+{
+	struct sradix_tree_node *node;
+	unsigned int height;
+
+	if (unlikely(root->rnode == NULL)) {
+		if (!(node = root->alloc()))
+			return -ENOMEM;
+
+		node->height = 1;
+		root->rnode = node;
+		root->height = 1;
+	}
+
+	/* Figure out what the height should be.  */
+	height = root->height;
+	index >>= root->shift * height;
+
+	while (index) {
+		index >>= root->shift;
+		height++;
+	}
+
+	while (height > root->height) {
+		unsigned int newheight;
+		if (!(node = root->alloc()))
+			return -ENOMEM;
+
+		/* Increase the height.  */
+		node->stores[0] = root->rnode;
+		root->rnode->parent = node;
+		if (root->extend)
+			root->extend(node, root->rnode);
+
+		newheight = root->height + 1;
+		node->height = newheight;
+		node->count = 1;
+		if (sradix_node_full(root, root->rnode))
+			node->fulls = 1;
+
+		root->rnode = node;
+		root->height = newheight;
+	}
+
+	return 0;
+}
+
+/*
+ * Search for the next item from the current node that is not NULL
+ * and satisfies root->iter().
+ */
+void *sradix_tree_next(struct sradix_tree_root *root,
+		       struct sradix_tree_node *node, unsigned long index,
+		       int (*iter)(void *item, unsigned long height))
+{
+	unsigned long offset;
+	void *item;
+
+	if (unlikely(node == NULL)) {
+		node = root->rnode;
+		for (offset = 0; offset < root->stores_size; offset++) {
+			item = node->stores[offset];
+			if (item && (!iter || iter(item, node->height)))
+				break;
+		}
+
+		if (unlikely(offset >= root->stores_size))
+			return NULL;
+
+		if (node->height == 1)
+			return item;
+		else
+			goto go_down;
+	}
+
+	while (node) {
+		offset = (index & root->mask) + 1;					
+		for (;offset < root->stores_size; offset++) {
+			item = node->stores[offset];
+			if (item && (!iter || iter(item, node->height)))
+				break;
+		}
+
+		if (offset < root->stores_size)
+			break;
+
+		node = node->parent;
+		index >>= root->shift;
+	}
+
+	if (!node)
+		return NULL;
+
+	while (node->height > 1) {
+go_down:
+		node = item;
+		for (offset = 0; offset < root->stores_size; offset++) {
+			item = node->stores[offset];
+			if (item && (!iter || iter(item, node->height)))
+				break;
+		}
+
+		if (unlikely(offset >= root->stores_size))
+			return NULL;
+	}
+
+	BUG_ON(offset > root->stores_size);
+
+	return item;
+}
+
+/*
+ * Blindly insert the item to the tree. Typically, we reuse the
+ * first empty store item.
+ */
+int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num)
+{
+	unsigned long index;
+	unsigned int height;
+	struct sradix_tree_node *node, *tmp = NULL;
+	int offset, offset_saved;
+	void **store = NULL;
+	int error, i, j, shift;
+
+go_on:
+	index = root->min;
+
+	if (root->enter_node && !sradix_node_full(root, root->enter_node)) {
+		node = root->enter_node;
+		BUG_ON((index >> (root->shift * root->height)));
+	} else {
+		node = root->rnode;
+		if (node == NULL || (index >> (root->shift * root->height))
+		    || sradix_node_full(root, node)) {
+			error = sradix_tree_extend(root, index);
+			if (error)
+				return error;
+
+			node = root->rnode;
+		}
+	}
+
+
+	height = node->height;
+	shift = (height - 1) * root->shift;
+	offset = (index >> shift) & root->mask;
+	while (shift > 0) {
+		offset_saved = offset;
+		for (; offset < root->stores_size; offset++) {
+			store = &node->stores[offset];
+			tmp = *store;
+
+			if (!tmp || !sradix_node_full(root, tmp))
+				break;
+		}
+		BUG_ON(offset >= root->stores_size);
+
+		if (offset != offset_saved) {
+			index += (offset - offset_saved) << shift;
+			index &= ~((1UL << shift) - 1);
+		}
+
+		if (!tmp) {
+			if (!(tmp = root->alloc()))
+				return -ENOMEM;
+
+			tmp->height = shift / root->shift;
+			*store = tmp;
+			tmp->parent = node;
+			node->count++;
+//			if (root->extend)
+//				root->extend(node, tmp);
+		}
+
+		node = tmp;
+		shift -= root->shift;
+		offset = (index >> shift) & root->mask;
+	}
+
+	BUG_ON(node->height != 1);
+
+
+	store = &node->stores[offset];
+	for (i = 0, j = 0;
+	      j < root->stores_size - node->count && 
+	      i < root->stores_size - offset && j < num; i++) {
+		if (!store[i]) {
+			store[i] = item[j];
+			if (root->assign)
+				root->assign(node, index + i, item[j]);
+			j++;
+		}
+	}
+
+	node->count += j;
+	root->num += j;
+	num -= j;
+
+	while (sradix_node_full(root, node)) {
+		node = node->parent;
+		if (!node)
+			break;
+
+		node->fulls++;
+	}
+
+	if (unlikely(!node)) {
+		/* All nodes are full */
+		root->min = 1 << (root->height * root->shift);
+		root->enter_node = NULL;
+	} else {
+		root->min = index + i - 1;
+		root->min |= (1UL << (node->height - 1)) - 1;
+		root->min++;
+		root->enter_node = node;
+	}
+
+	if (num) {
+		item += j;
+		goto go_on;
+	}
+
+	return 0;
+}
+
+
+/**
+ *	sradix_tree_shrink - shrink the height of an sradix tree to the minimum
+ *	@root: sradix tree root
+ *
+ */
+static inline void sradix_tree_shrink(struct sradix_tree_root *root)
+{
+	/* try to shrink tree height */
+	while (root->height > 1) {
+		struct sradix_tree_node *to_free = root->rnode;
+
+		/*
+		 * If the candidate node has more than one child, or its child
+		 * is not at the leftmost store, we cannot shrink.
+		 */
+		if (to_free->count != 1 || !to_free->stores[0])
+			break;
+
+		root->rnode = to_free->stores[0];
+		root->rnode->parent = NULL;
+		root->height--;
+		if (unlikely(root->enter_node == to_free)) {
+			root->enter_node = NULL;
+		}
+		root->free(to_free);
+	}
+}
+
+/*
+ * Del the item on the known leaf node and index
+ */
+void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, 
+				  struct sradix_tree_node *node, unsigned long index)
+{
+	unsigned int offset;
+	struct sradix_tree_node *start, *end;
+
+	BUG_ON(node->height != 1);
+
+	start = node;
+	while (node && !(--node->count))
+		node = node->parent;
+
+	end = node;
+	if (!node) {
+		root->rnode = NULL;
+		root->height = 0;
+		root->min = 0;
+		root->num = 0;
+		root->enter_node = NULL;
+	} else {
+		offset = (index >> (root->shift * (node->height - 1))) & root->mask;
+		if (root->rm)
+			root->rm(node, offset);
+		node->stores[offset] = NULL;
+		root->num--;
+		if (root->min > index) {
+			root->min = index;
+			root->enter_node = node;
+		}
+	}
+
+	if (start != end) {
+		do {
+			node = start;
+			start = start->parent;
+			if (unlikely(root->enter_node == node))
+				root->enter_node = end;
+			root->free(node);
+		} while (start != end);
+
+		/*
+		 * Note that shrink may free "end", so enter_node still needs to
+		 * be checked inside.
+		 */
+		sradix_tree_shrink(root);
+	} else if (node->count == root->stores_size - 1) {
+		/* It WAS a full leaf node. Update the ancestors */
+		node = node->parent;
+		while (node) {
+			node->fulls--;
+			if (node->fulls != root->stores_size - 1)
+				break;
+
+			node = node->parent;
+		}
+	}
+}
+
+void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index)
+{
+	unsigned int height, offset;
+	struct sradix_tree_node *node;
+	int shift;
+
+	node = root->rnode;
+	if (node == NULL || (index >> (root->shift * root->height)))
+		return NULL;
+
+	height = root->height;
+	shift = (height - 1) * root->shift;
+
+	do {
+		offset = (index >> shift) & root->mask;
+		node = node->stores[offset];
+		if (!node)
+			return NULL;
+
+		shift -= root->shift;
+	} while (shift >= 0);
+
+	return node;
+}
+
+/*
+ * Return the item if it exists, otherwise create it in place
+ * and return the created item.
+ */
+void *sradix_tree_lookup_create(struct sradix_tree_root *root, 
+			unsigned long index, void *(*item_alloc)(void))
+{
+	unsigned int height, offset;
+	struct sradix_tree_node *node, *tmp;
+	void *item;
+	int shift, error;
+
+	if (root->rnode == NULL || (index >> (root->shift * root->height))) {
+		if (item_alloc) {
+			error = sradix_tree_extend(root, index);
+			if (error)
+				return NULL;
+		} else {
+			return NULL;
+		}
+	}
+
+	node = root->rnode;
+	height = root->height;
+	shift = (height - 1) * root->shift;
+
+	do {
+		offset = (index >> shift) & root->mask;
+		if (!node->stores[offset]) {
+			if (!(tmp = root->alloc()))
+				return NULL;
+
+			tmp->height = shift / root->shift;
+			node->stores[offset] = tmp;
+			tmp->parent = node;
+			node->count++;
+			node = tmp;
+		} else {
+			node = node->stores[offset];
+		}
+
+		shift -= root->shift;
+	} while (shift > 0);
+
+	BUG_ON(node->height != 1);
+	offset = index & root->mask;
+	if (node->stores[offset]) {
+		return node->stores[offset];
+	} else if (item_alloc) {
+		if (!(item = item_alloc()))
+			return NULL;
+
+		node->stores[offset] = item;
+
+		/*
+		 * NOTE: we do NOT call root->assign here, since this item was
+		 * newly created by us and carries no meaning yet. The caller
+		 * can invoke root->assign itself if necessary.
+		 */
+
+		node->count++;
+		root->num++;
+
+		while (sradix_node_full(root, node)) {
+			node = node->parent;
+			if (!node)
+				break;
+
+			node->fulls++;
+		}
+
+		if (unlikely(!node)) {
+			/* All nodes are full */
+			root->min = 1 << (root->height * root->shift);
+		} else {
+			if (root->min == index) {
+				root->min |= (1UL << (node->height - 1)) - 1;
+				root->min++;
+				root->enter_node = node;
+			}
+		}
+
+		return item;
+	} else {
+		return NULL;
+	}
+
+}
+
+int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index)
+{
+	unsigned int height, offset;
+	struct sradix_tree_node *node;
+	int shift;
+
+	node = root->rnode;
+	if (node == NULL || (index >> (root->shift * root->height)))
+		return -ENOENT;
+
+	height = root->height;
+	shift = (height - 1) * root->shift;
+
+	do {
+		offset = (index >> shift) & root->mask;
+		node = node->stores[offset];
+		if (!node)
+			return -ENOENT;
+
+		shift -= root->shift;
+	} while (shift > 0);
+
+	offset = index & root->mask;
+	if (!node->stores[offset])
+		return -ENOENT;
+
+	sradix_tree_delete_from_leaf(root, node, index);
+
+	return 0;
+}
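+
+/*
+ * Illustrative usage sketch only (kept as a comment). The geometry
+ * (shift/stores_size/mask) and the node allocator callbacks chosen here are
+ * assumptions for the example; only the entry points match this file:
+ *
+ *	struct sradix_tree_root root = {
+ *		.shift		= 4,			// 16 slots per node
+ *		.stores_size	= 16,
+ *		.mask		= 15,
+ *		.alloc		= example_alloc_node,	// returns a zeroed node
+ *		.free		= example_free_node,
+ *	};
+ *	void *item = some_object;
+ *
+ *	sradix_tree_enter(&root, &item, 1);	// store at the lowest free slot
+ *	item = sradix_tree_lookup(&root, 0);	// fetch it back by index
+ *	sradix_tree_delete(&root, 0);		// remove, shrinking if possible
+ */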
diff --git b/lib/wbt.c b/lib/wbt.c
new file mode 100644
index 0000000..257c7b0
--- /dev/null
+++ b/lib/wbt.c
@@ -0,0 +1,703 @@
+/*
+ * buffered writeback throttling. loosely based on CoDel. We can't drop
+ * packets for IO scheduling, so the logic is something like this:
+ *
+ * - Monitor latencies in a defined window of time.
+ * - If the minimum latency in the above window exceeds some target, increment
+ *   scaling step and scale down queue depth by a factor of 2x. The monitoring
+ *   window is then shrunk to 100 / sqrt(scaling step + 1).
+ * - For any window where we don't have solid data on what the latencies
+ *   look like, retain status quo.
+ * - If latencies look good, decrement scaling step.
+ * - If we're only doing writes, allow the scaling step to go negative. This
+ *   will temporarily boost write performance, snapping back to a stable
+ *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
+ *   positive scaling steps where we shrink the monitoring window, a negative
+ *   scaling step retains the default step==0 window size.
+ *
+ * Copyright (C) 2016 Jens Axboe
+ *
+ */
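+
+/*
+ * Worked example of the policy above (numbers only), assuming the default
+ * 100msec window and a starting depth of RWB_DEF_DEPTH == 16:
+ *
+ *	scale_step 0: depth 16, window 100ms
+ *	scale_step 1: depth  8, window 100/sqrt(2) ~= 71ms
+ *	scale_step 2: depth  4, window 100/sqrt(3) ~= 58ms
+ *	scale_step 3: depth  2, window 100/sqrt(4)  = 50ms
+ *
+ * Negative steps instead roughly double the allowed depth (capped at 3/4 of
+ * the device queue depth) while keeping the step==0 window size.
+ */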
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <linux/wbt.h>
+#include <linux/swap.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/wbt.h>
+
+enum {
+	/*
+	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
+	 * from here depending on device stats
+	 */
+	RWB_DEF_DEPTH	= 16,
+
+	/*
+	 * 100msec window
+	 */
+	RWB_WINDOW_NSEC		= 100 * 1000 * 1000ULL,
+
+	/*
+	 * Disregard stats, if we don't meet this minimum
+	 */
+	RWB_MIN_WRITE_SAMPLES	= 3,
+
+	/*
+	 * If we have this number of consecutive windows with not enough
+	 * information to scale up or down, scale up.
+	 */
+	RWB_UNKNOWN_BUMP	= 5,
+};
+
+static inline bool rwb_enabled(struct rq_wb *rwb)
+{
+	return rwb && rwb->wb_normal != 0;
+}
+
+/*
+ * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
+ * false if 'v' + 1 would be bigger than 'below'.
+ */
+static bool atomic_inc_below(atomic_t *v, int below)
+{
+	int cur = atomic_read(v);
+
+	for (;;) {
+		int old;
+
+		if (cur >= below)
+			return false;
+		old = atomic_cmpxchg(v, cur, cur + 1);
+		if (old == cur)
+			break;
+		cur = old;
+	}
+
+	return true;
+}
+
+static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
+{
+	if (rwb_enabled(rwb)) {
+		const unsigned long cur = jiffies;
+
+		if (cur != *var)
+			*var = cur;
+	}
+}
+
+/*
+ * If a task was rate throttled in balance_dirty_pages() within the last
+ * second or so, use that to indicate a higher cleaning rate.
+ */
+static bool wb_recent_wait(struct rq_wb *rwb)
+{
+	struct bdi_writeback *wb = &rwb->bdi->wb;
+
+	return time_before(jiffies, wb->dirty_sleep + HZ);
+}
+
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
+{
+	return &rwb->rq_wait[is_kswapd];
+}
+
+static void rwb_wake_all(struct rq_wb *rwb)
+{
+	int i;
+
+	for (i = 0; i < WBT_NUM_RWQ; i++) {
+		struct rq_wait *rqw = &rwb->rq_wait[i];
+
+		if (waitqueue_active(&rqw->wait))
+			wake_up_all(&rqw->wait);
+	}
+}
+
+void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
+{
+	struct rq_wait *rqw;
+	int inflight, limit;
+
+	if (!(wb_acct & WBT_TRACKED))
+		return;
+
+	rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
+	inflight = atomic_dec_return(&rqw->inflight);
+
+	/*
+	 * wbt got disabled with IO in flight. Wake up any potential
+	 * waiters, we don't have to do more than that.
+	 */
+	if (unlikely(!rwb_enabled(rwb))) {
+		rwb_wake_all(rwb);
+		return;
+	}
+
+	/*
+	 * If the device does write back caching, drop further down
+	 * before we wake people up.
+	 */
+	if (rwb->wc && !wb_recent_wait(rwb))
+		limit = 0;
+	else
+		limit = rwb->wb_normal;
+
+	/*
+	 * Don't wake anyone up if we are above the normal limit.
+	 */
+	if (inflight && inflight >= limit)
+		return;
+
+	if (waitqueue_active(&rqw->wait)) {
+		int diff = limit - inflight;
+
+		if (!inflight || diff >= rwb->wb_background / 2)
+			wake_up(&rqw->wait);
+	}
+}
+
+/*
+ * Called on completion of a request. Note that it's also called when
+ * a request is merged, when the request gets freed.
+ */
+void wbt_done(struct rq_wb *rwb, struct wb_issue_stat *stat)
+{
+	if (!rwb)
+		return;
+
+	if (!wbt_is_tracked(stat)) {
+		if (rwb->sync_cookie == stat) {
+			rwb->sync_issue = 0;
+			rwb->sync_cookie = NULL;
+		}
+
+		if (wbt_is_read(stat))
+			wb_timestamp(rwb, &rwb->last_comp);
+		wbt_clear_state(stat);
+	} else {
+		WARN_ON_ONCE(stat == rwb->sync_cookie);
+		__wbt_done(rwb, wbt_stat_to_mask(stat));
+		wbt_clear_state(stat);
+	}
+}
+
+/*
+ * Return true, if we can't increase the depth further by scaling
+ */
+static bool calc_wb_limits(struct rq_wb *rwb)
+{
+	unsigned int depth;
+	bool ret = false;
+
+	if (!rwb->min_lat_nsec) {
+		rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
+		return false;
+	}
+
+	/*
+	 * For QD=1 devices, this is a special case. It's important for those
+	 * to have one request ready when one completes, so force a depth of
+	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
+	 * since the device can't have more than that in flight. If we're
+	 * scaling down, then keep a setting of 1/1/1.
+	 */
+	if (rwb->queue_depth == 1) {
+		if (rwb->scale_step > 0)
+			rwb->wb_max = rwb->wb_normal = 1;
+		else {
+			rwb->wb_max = rwb->wb_normal = 2;
+			ret = true;
+		}
+		rwb->wb_background = 1;
+	} else {
+		/*
+		 * scale_step == 0 is our default state. If we have suffered
+		 * latency spikes, step will be > 0, and we shrink the
+		 * allowed write depths. If step is < 0, we're only doing
+		 * writes, and we allow a temporarily higher depth to
+		 * increase performance.
+		 */
+		depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
+		if (rwb->scale_step > 0)
+			depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
+		else if (rwb->scale_step < 0) {
+			unsigned int maxd = 3 * rwb->queue_depth / 4;
+
+			depth = 1 + ((depth - 1) << -rwb->scale_step);
+			if (depth > maxd) {
+				depth = maxd;
+				ret = true;
+			}
+		}
+
+		/*
+		 * Set our max/normal/bg queue depths based on how far
+		 * we have scaled down (->scale_step).
+		 */
+		rwb->wb_max = depth;
+		rwb->wb_normal = (rwb->wb_max + 1) / 2;
+		rwb->wb_background = (rwb->wb_max + 3) / 4;
+	}
+
+	return ret;
+}
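+
+/*
+ * Worked example for the above, assuming a device queue_depth of 32 (so the
+ * starting depth is capped at RWB_DEF_DEPTH == 16):
+ *
+ *	scale_step  0: wb_max 16, wb_normal 8, wb_background 4
+ *	scale_step  1: wb_max  8, wb_normal 4, wb_background 2
+ *	scale_step -1: wb_max 24 (capped at 3/4 of 32), wb_normal 12,
+ *		       wb_background 6
+ */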
+
+static bool inline stat_sample_valid(struct blk_rq_stat *stat)
+{
+	/*
+	 * We need at least one read sample, and a minimum of
+	 * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
+	 * that it's writes impacting us, and not just some sole read on
+	 * a device that is in a lower power state.
+	 */
+	return stat[0].nr_samples >= 1 &&
+		stat[1].nr_samples >= RWB_MIN_WRITE_SAMPLES;
+}
+
+static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
+{
+	u64 now, issue = ACCESS_ONCE(rwb->sync_issue);
+
+	if (!issue || !rwb->sync_cookie)
+		return 0;
+
+	now = ktime_to_ns(ktime_get());
+	return now - issue;
+}
+
+enum {
+	LAT_OK = 1,
+	LAT_UNKNOWN,
+	LAT_UNKNOWN_WRITES,
+	LAT_EXCEEDED,
+};
+
+static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
+{
+	u64 thislat;
+
+	/*
+	 * If our stored sync issue exceeds the window size, or it
+	 * exceeds our min target AND we haven't logged any entries,
+	 * flag the latency as exceeded. wbt works off completion latencies,
+	 * but for a flooded device, a single sync IO can take a long time
+	 * to complete after being issued. If this time exceeds our
+	 * monitoring window AND we didn't see any other completions in that
+	 * window, then count that sync IO as a violation of the latency.
+	 */
+	thislat = rwb_sync_issue_lat(rwb);
+	if (thislat > rwb->cur_win_nsec ||
+	    (thislat > rwb->min_lat_nsec && !stat[0].nr_samples)) {
+		trace_wbt_lat(rwb->bdi, thislat);
+		return LAT_EXCEEDED;
+	}
+
+	/*
+	 * No read/write mix, if stat isn't valid
+	 */
+	if (!stat_sample_valid(stat)) {
+		/*
+		 * If we had writes in this stat window and the window is
+		 * current, we're only doing writes. If a task recently
+		 * waited or still has writes in flight, consider us doing
+		 * just writes as well.
+		 */
+		if ((stat[1].nr_samples && rwb->stat_ops->is_current(stat)) ||
+		    wb_recent_wait(rwb) || wbt_inflight(rwb))
+			return LAT_UNKNOWN_WRITES;
+		return LAT_UNKNOWN;
+	}
+
+	/*
+	 * If the 'min' latency exceeds our target, step down.
+	 */
+	if (stat[0].min > rwb->min_lat_nsec) {
+		trace_wbt_lat(rwb->bdi, stat[0].min);
+		trace_wbt_stat(rwb->bdi, stat);
+		return LAT_EXCEEDED;
+	}
+
+	if (rwb->scale_step)
+		trace_wbt_stat(rwb->bdi, stat);
+
+	return LAT_OK;
+}
+
+static int latency_exceeded(struct rq_wb *rwb)
+{
+	struct blk_rq_stat stat[2];
+
+	rwb->stat_ops->get(rwb->ops_data, stat);
+	return __latency_exceeded(rwb, stat);
+}
+
+static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
+{
+	trace_wbt_step(rwb->bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
+			rwb->wb_background, rwb->wb_normal, rwb->wb_max);
+}
+
+static void scale_up(struct rq_wb *rwb)
+{
+	/*
+	 * Hit max in previous round, stop here
+	 */
+	if (rwb->scaled_max)
+		return;
+
+	rwb->scale_step--;
+	rwb->unknown_cnt = 0;
+	rwb->stat_ops->clear(rwb->ops_data);
+
+	rwb->scaled_max = calc_wb_limits(rwb);
+
+	rwb_wake_all(rwb);
+
+	rwb_trace_step(rwb, "step up");
+}
+
+/*
+ * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
+ * had a latency violation.
+ */
+static void scale_down(struct rq_wb *rwb, bool hard_throttle)
+{
+	/*
+	 * Stop scaling down when we've hit the limit. This also prevents
+	 * ->scale_step from going to crazy values, if the device can't
+	 * keep up.
+	 */
+	if (rwb->wb_max == 1)
+		return;
+
+	if (rwb->scale_step < 0 && hard_throttle)
+		rwb->scale_step = 0;
+	else
+		rwb->scale_step++;
+
+	rwb->scaled_max = false;
+	rwb->unknown_cnt = 0;
+	rwb->stat_ops->clear(rwb->ops_data);
+	calc_wb_limits(rwb);
+	rwb_trace_step(rwb, "step down");
+}
+
+static void rwb_arm_timer(struct rq_wb *rwb)
+{
+	unsigned long expires;
+
+	if (rwb->scale_step > 0) {
+		/*
+		 * We should speed this up, using some variant of a fast
+		 * integer inverse square root calculation. Since we only do
+		 * this for every window expiration, it's not a huge deal,
+		 * though.
+		 */
+		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
+					int_sqrt((rwb->scale_step + 1) << 8));
+	} else {
+		/*
+		 * For step < 0, we don't want to increase/decrease the
+		 * window size.
+		 */
+		rwb->cur_win_nsec = rwb->win_nsec;
+	}
+
+	expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
+	mod_timer(&rwb->window_timer, expires);
+}
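+
+/*
+ * The fixed-point arithmetic above computes win_nsec / sqrt(scale_step + 1):
+ * the window is pre-scaled by 16 (<< 4) and the int_sqrt() argument by 256
+ * (<< 8), so the sqrt(256) == 16 factor cancels out. For the default 100ms
+ * window at scale_step == 3:
+ *
+ *	(100ms << 4) / int_sqrt(4 << 8) = 1600ms / 32 = 50ms
+ */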
+
+static void wb_timer_fn(unsigned long data)
+{
+	struct rq_wb *rwb = (struct rq_wb *) data;
+	unsigned int inflight = wbt_inflight(rwb);
+	int status;
+
+	status = latency_exceeded(rwb);
+
+	trace_wbt_timer(rwb->bdi, status, rwb->scale_step, inflight);
+
+	/*
+	 * If we exceeded the latency target, step down. If we did not,
+	 * step one level up. If we don't know enough to say either exceeded
+	 * or ok, then don't do anything.
+	 */
+	switch (status) {
+	case LAT_EXCEEDED:
+		scale_down(rwb, true);
+		break;
+	case LAT_OK:
+		scale_up(rwb);
+		break;
+	case LAT_UNKNOWN_WRITES:
+		scale_up(rwb);
+		break;
+	case LAT_UNKNOWN:
+		if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
+			break;
+		/*
+		 * We get here for two reasons:
+		 *
+		 * 1) We previously scaled down (reduced depth), and we
+		 *    currently don't have a valid read/write sample. For that
+		 *    case, slowly return to the center state (step == 0).
+		 * 2) We started at the center step, but don't have a valid
+		 *    read/write sample, but we do have writes going on.
+		 *    Allow step to go negative, to increase write perf.
+		 */
+		if (rwb->scale_step > 0)
+			scale_up(rwb);
+		else if (rwb->scale_step < 0)
+			scale_down(rwb, false);
+		break;
+	default:
+		break;
+	}
+
+	/*
+	 * Re-arm timer, if we have IO in flight
+	 */
+	if (rwb->scale_step || inflight)
+		rwb_arm_timer(rwb);
+}
+
+void wbt_update_limits(struct rq_wb *rwb)
+{
+	rwb->scale_step = 0;
+	rwb->scaled_max = false;
+	calc_wb_limits(rwb);
+
+	rwb_wake_all(rwb);
+}
+
+static bool close_io(struct rq_wb *rwb)
+{
+	const unsigned long now = jiffies;
+
+	return time_before(now, rwb->last_issue + HZ / 10) ||
+		time_before(now, rwb->last_comp + HZ / 10);
+}
+
+#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)
+
+static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
+{
+	unsigned int limit;
+
+	/*
+	 * At this point we know it's a buffered write. If this is
+	 * kswapd trying to free memory, or REQ_SYNC is set, then
+	 * it's WB_SYNC_ALL writeback, and we'll use the max limit for
+	 * that. If the write is marked as a background write, then use
+	 * the idle limit, or go to normal if we haven't had competing
+	 * IO for a bit.
+	 */
+	if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
+		limit = rwb->wb_max;
+	else if ((rw & REQ_BG) || close_io(rwb)) {
+		/*
+		 * If less than 100ms since we completed unrelated IO,
+		 * limit us to half the depth for background writeback.
+		 */
+		limit = rwb->wb_background;
+	} else
+		limit = rwb->wb_normal;
+
+	return limit;
+}
+
+static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
+			     unsigned long rw)
+{
+	/*
+	 * inc it here even if disabled, since we'll dec it at completion.
+	 * this only happens if the task was sleeping in __wbt_wait(),
+	 * and someone turned it off at the same time.
+	 */
+	if (!rwb_enabled(rwb)) {
+		atomic_inc(&rqw->inflight);
+		return true;
+	}
+
+	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
+}
+
+/*
+ * Block if we will exceed our limit, or if we are currently waiting for
+ * the timer to kick off queuing again.
+ */
+static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
+{
+	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
+	DEFINE_WAIT(wait);
+
+	if (may_queue(rwb, rqw, rw))
+		return;
+
+	do {
+		prepare_to_wait_exclusive(&rqw->wait, &wait,
+						TASK_UNINTERRUPTIBLE);
+
+		if (may_queue(rwb, rqw, rw))
+			break;
+
+		if (lock)
+			spin_unlock_irq(lock);
+
+		io_schedule();
+
+		if (lock)
+			spin_lock_irq(lock);
+	} while (1);
+
+	finish_wait(&rqw->wait, &wait);
+}
+
+static inline bool wbt_should_throttle(struct rq_wb *rwb, unsigned int rw)
+{
+	const int op = rw >> BIO_OP_SHIFT;
+
+	/*
+	 * If not a WRITE (or a discard), do nothing
+	 */
+	if (!(op == REQ_OP_WRITE || op == REQ_OP_DISCARD))
+		return false;
+
+	/*
+	 * Don't throttle WRITE_ODIRECT
+	 */
+	if ((rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC)
+		return false;
+
+	return true;
+}
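+
+/*
+ * Note on the check above: at the time of this patch, O_DIRECT writes
+ * (WRITE_ODIRECT) carry REQ_SYNC alone, while WB_SYNC_ALL writeback writes
+ * (WRITE_SYNC) carry REQ_SYNC | REQ_NOIDLE. Only the former matches
+ * "(rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC" and is left unthrottled.
+ */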
+
+/*
+ * Returns true if the IO request should be accounted, false if not.
+ * May sleep, if we have exceeded the writeback limits. Caller can pass
+ * in an irq held spinlock, if it holds one when calling this function.
+ * If we do sleep, we'll release and re-grab it.
+ */
+unsigned int wbt_wait(struct rq_wb *rwb, unsigned int rw, spinlock_t *lock)
+{
+	unsigned int ret = 0;
+
+	if (!rwb_enabled(rwb))
+		return 0;
+
+	if ((rw >> BIO_OP_SHIFT) == REQ_OP_READ)
+		ret = WBT_READ;
+
+	if (!wbt_should_throttle(rwb, rw)) {
+		if (ret & WBT_READ)
+			wb_timestamp(rwb, &rwb->last_issue);
+		return ret;
+	}
+
+	__wbt_wait(rwb, rw, lock);
+
+	if (!timer_pending(&rwb->window_timer))
+		rwb_arm_timer(rwb);
+
+	if (current_is_kswapd())
+		ret |= WBT_KSWAPD;
+
+	return ret | WBT_TRACKED;
+}
+
+void wbt_issue(struct rq_wb *rwb, struct wb_issue_stat *stat)
+{
+	if (!rwb_enabled(rwb))
+		return;
+
+	wbt_issue_stat_set_time(stat);
+
+	/*
+	 * Track sync issue, in case it takes a long time to complete. Allows
+	 * us to react quicker, if a sync IO takes a long time to complete.
+	 * Note that this is just a hint. 'stat' can go away when the
+	 * request completes, so it's important we never dereference it. We
+	 * only use the address to compare with, which is why we store the
+	 * sync_issue time locally.
+	 */
+	if (wbt_is_read(stat) && !rwb->sync_issue) {
+		rwb->sync_cookie = stat;
+		rwb->sync_issue = wbt_issue_stat_get_time(stat);
+	}
+}
+
+void wbt_requeue(struct rq_wb *rwb, struct wb_issue_stat *stat)
+{
+	if (!rwb_enabled(rwb))
+		return;
+	if (stat == rwb->sync_cookie) {
+		rwb->sync_issue = 0;
+		rwb->sync_cookie = NULL;
+	}
+}
+
+void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
+{
+	if (rwb) {
+		rwb->queue_depth = depth;
+		wbt_update_limits(rwb);
+	}
+}
+
+void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
+{
+	if (rwb)
+		rwb->wc = write_cache_on;
+}
+
+void wbt_disable(struct rq_wb *rwb)
+{
+	if (rwb) {
+		del_timer_sync(&rwb->window_timer);
+		rwb->win_nsec = rwb->min_lat_nsec = 0;
+		wbt_update_limits(rwb);
+	}
+}
+EXPORT_SYMBOL_GPL(wbt_disable);
+
+struct rq_wb *wbt_init(struct backing_dev_info *bdi, struct wb_stat_ops *ops,
+		       void *ops_data)
+{
+	struct rq_wb *rwb;
+	int i;
+
+	if (!ops->get || !ops->is_current || !ops->clear)
+		return ERR_PTR(-EINVAL);
+
+	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+	if (!rwb)
+		return ERR_PTR(-ENOMEM);
+
+	for (i = 0; i < WBT_NUM_RWQ; i++) {
+		atomic_set(&rwb->rq_wait[i].inflight, 0);
+		init_waitqueue_head(&rwb->rq_wait[i].wait);
+	}
+
+	setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
+	rwb->wc = 1;
+	rwb->queue_depth = RWB_DEF_DEPTH;
+	rwb->last_comp = rwb->last_issue = jiffies;
+	rwb->bdi = bdi;
+	rwb->win_nsec = RWB_WINDOW_NSEC;
+	rwb->stat_ops = ops;
+	rwb->ops_data = ops_data;
+	wbt_update_limits(rwb);
+	return rwb;
+}
+
+void wbt_exit(struct rq_wb *rwb)
+{
+	if (rwb) {
+		del_timer_sync(&rwb->window_timer);
+		kfree(rwb);
+	}
+}
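+
+/*
+ * Rough usage sketch (illustrative only, not part of this file): a block
+ * layer user is expected to create one rq_wb per queue and bracket buffered
+ * writes with wbt_wait()/wbt_issue()/wbt_requeue(), along the lines of:
+ *
+ *	rwb = wbt_init(bdi, &my_stat_ops, my_ops_data);
+ *	if (IS_ERR(rwb))
+ *		return PTR_ERR(rwb);
+ *	wbt_set_queue_depth(rwb, depth);
+ *
+ *	wb_acct = wbt_wait(rwb, rw_flags, q->queue_lock);	/* may sleep */
+ *	wbt_issue(rwb, &issue_stat);				/* at dispatch */
+ *	...
+ *	wbt_exit(rwb);						/* queue teardown */
+ *
+ * The names my_stat_ops, my_ops_data, depth, rw_flags and issue_stat are
+ * placeholders; the real call sites live in the block core part of this
+ * patch.
+ */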
diff --git a/mm/Kconfig b/mm/Kconfig
index be0ee11..64fd3bc 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -340,6 +340,32 @@ config KSM
 	  See Documentation/vm/ksm.txt for more information: KSM is inactive
 	  until a program has madvised that an area is MADV_MERGEABLE, and
 	  root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set).
+choice
+	prompt "Choose UKSM/KSM strategy"
+	default UKSM
+	depends on KSM
+	help
+	  This option allows you to select a UKSM/KSM strategy.
+
+config UKSM
+	bool "Ultra-KSM for page merging"
+	depends on KSM
+	help
+	UKSM is inspired by the Linux kernel project KSM (Kernel Samepage
+	Merging), but with a fundamentally rewritten core algorithm. With
+	this advanced algorithm, UKSM can transparently scan all anonymously
+	mapped user space applications with significantly improved scan speed
+	and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit
+	from UKSM. UKSM now has its first stable release and its first real
+	world enterprise user. For more information, please go to its project
+	page (www.kerneldedup.org).
+
+config KSM_LEGACY
+	bool "Legacy KSM implementation"
+	depends on KSM
+	help
+	The legacy KSM implementation from Red Hat.
+endchoice
 
 config DEFAULT_MMAP_MIN_ADDR
         int "Low address space to protect from user allocation"
diff --git a/mm/Makefile b/mm/Makefile
index 295bd7a..ed472c5 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@ obj-y			:= filemap.o mempool.o oom_kill.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o vmacache.o \
 			   interval_tree.o list_lru.o workingset.o \
-			   debug.o $(mmu-y)
+			   prfile.o debug.o $(mmu-y)
 
 obj-y += init-mm.o
 
@@ -63,7 +63,8 @@ obj-$(CONFIG_SPARSEMEM)	+= sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
-obj-$(CONFIG_KSM) += ksm.o
+obj-$(CONFIG_KSM_LEGACY) += ksm.o
+obj-$(CONFIG_UKSM) += uksm.o
 obj-$(CONFIG_PAGE_POISONING) += page_poison.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8fde443..3bfed5a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -310,6 +310,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
 	spin_lock_init(&wb->work_lock);
 	INIT_LIST_HEAD(&wb->work_list);
 	INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
+	wb->dirty_sleep = jiffies;
 
 	wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
 	if (!wb->congested)
diff --git a/mm/filemap.c b/mm/filemap.c
index ced9ef6..ea47a7d 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2301,7 +2301,7 @@ int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	int ret = VM_FAULT_LOCKED;
 
 	sb_start_pagefault(inode->i_sb);
-	file_update_time(vma->vm_file);
+	vma_file_update_time(vma);
 	lock_page(page);
 	if (page->mapping != inode->i_mapping) {
 		unlock_page(page);
diff --git a/mm/memory.c b/mm/memory.c
index 793fe0f..3eeb8e2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -124,6 +124,25 @@ unsigned long highest_memmap_pfn __read_mostly;
 
 EXPORT_SYMBOL(zero_pfn);
 
+#ifdef CONFIG_UKSM
+unsigned long uksm_zero_pfn __read_mostly;
+EXPORT_SYMBOL_GPL(uksm_zero_pfn);
+struct page *empty_uksm_zero_page;
+
+static int __init setup_uksm_zero_page(void)
+{
+	empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0);
+	if (!empty_uksm_zero_page)
+		panic("Oh boy, that early out of memory?");
+
+	SetPageReserved(empty_uksm_zero_page);
+	uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page);
+
+	return 0;
+}
+core_initcall(setup_uksm_zero_page);
+#endif
+
 /*
  * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init()
  */
@@ -135,6 +154,7 @@ static int __init init_zero_pfn(void)
 core_initcall(init_zero_pfn);
 
 
+
 #if defined(SPLIT_RSS_COUNTING)
 
 void sync_mm_rss(struct mm_struct *mm)
@@ -914,6 +934,11 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		get_page(page);
 		page_dup_rmap(page, false);
 		rss[mm_counter(page)]++;
+
+		/* Should return NULL in vm_normal_page() */
+		uksm_bugon_zeropage(pte);
+	} else {
+		uksm_map_zero_page(pte);
 	}
 
 out_set_pte:
@@ -1148,8 +1173,10 @@ again:
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
 			tlb_remove_tlb_entry(tlb, pte, addr);
-			if (unlikely(!page))
+			if (unlikely(!page)) {
+				uksm_unmap_zero_page(ptent);
 				continue;
+			}
 
 			if (!PageAnon(page)) {
 				if (pte_dirty(ptent)) {
@@ -2010,8 +2037,10 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 			clear_page(kaddr);
 		kunmap_atomic(kaddr);
 		flush_dcache_page(dst);
-	} else
+	} else {
 		copy_user_highpage(dst, src, va, vma);
+		uksm_cow_page(vma, src);
+	}
 }
 
 static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
@@ -2113,7 +2142,7 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
 		}
 
 		if (!page_mkwrite)
-			file_update_time(vma->vm_file);
+			vma_file_update_time(vma);
 	}
 
 	return VM_FAULT_WRITE;
@@ -2154,6 +2183,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
 		new_page = alloc_zeroed_user_highpage_movable(vma, fe->address);
 		if (!new_page)
 			goto oom;
+		uksm_cow_pte(vma, orig_pte);
 	} else {
 		new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
 				fe->address);
@@ -2180,7 +2210,9 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
 						mm_counter_file(old_page));
 				inc_mm_counter_fast(mm, MM_ANONPAGES);
 			}
+			uksm_bugon_zeropage(orig_pte);
 		} else {
+			uksm_unmap_zero_page(orig_pte);
 			inc_mm_counter_fast(mm, MM_ANONPAGES);
 		}
 		flush_cache_page(vma, fe->address, pte_pfn(orig_pte));
diff --git a/mm/mmap.c b/mm/mmap.c
index ca9d91b..1f631e0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -44,6 +44,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/moduleparam.h>
 #include <linux/pkeys.h>
+#include <linux/ksm.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -163,8 +164,9 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
 	if (vma->vm_file)
-		fput(vma->vm_file);
+		vma_fput(vma);
 	mpol_put(vma_policy(vma));
+	uksm_remove_vma(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 	return next;
 }
@@ -629,9 +631,16 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	long adjust_next = 0;
 	int remove_next = 0;
 
+/*
+ * To avoid deadlock, uksm_remove_vma() must be done before any spinlock is
+ * acquired.
+ */
+	uksm_remove_vma(vma);
+
 	if (next && !insert) {
 		struct vm_area_struct *exporter = NULL, *importer = NULL;
 
+		uksm_remove_vma(next);
 		if (end >= next->vm_end) {
 			/*
 			 * vma expands, overlapping all the next, and
@@ -733,6 +742,7 @@ again:
 		end_changed = true;
 	}
 	vma->vm_pgoff = pgoff;
+
 	if (adjust_next) {
 		next->vm_start += adjust_next << PAGE_SHIFT;
 		next->vm_pgoff += adjust_next;
@@ -790,7 +800,7 @@ again:
 	if (remove_next) {
 		if (file) {
 			uprobe_munmap(next, next->vm_start, next->vm_end);
-			fput(file);
+			vma_fput(vma);
 		}
 		if (next->anon_vma)
 			anon_vma_merge(vma, next);
@@ -806,16 +816,21 @@ again:
 		if (remove_next == 2) {
 			remove_next = 1;
 			end = next->vm_end;
+			uksm_remove_vma(next);
 			goto again;
-		}
-		else if (next)
+		} else if (next) {
 			vma_gap_update(next);
-		else
+		} else {
 			mm->highest_vm_end = end;
+		}
+	} else {
+		if (next && !insert)
+			uksm_vma_add_new(next);
 	}
 	if (insert && file)
 		uprobe_mmap(insert);
 
+	uksm_vma_add_new(vma);
 	validate_mm(mm);
 
 	return 0;
@@ -1207,6 +1222,9 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 	vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
 			mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
+	/* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */
+	uksm_vm_flags_mod(&vm_flags);
+
 	if (flags & MAP_LOCKED)
 		if (!can_do_mlock())
 			return -EPERM;
@@ -1545,6 +1563,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 			allow_write_access(file);
 	}
 	file = vma->vm_file;
+	uksm_vma_add_new(vma);
 out:
 	perf_event_mmap(vma);
 
@@ -1574,8 +1593,8 @@ out:
 	return addr;
 
 unmap_and_free_vma:
+	vma_fput(vma);
 	vma->vm_file = NULL;
-	fput(file);
 
 	/* Undo any partial mapping done by a device driver. */
 	unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
@@ -1586,6 +1605,7 @@ allow_write_and_free_vma:
 	if (vm_flags & VM_DENYWRITE)
 		allow_write_access(file);
 free_vma:
+	uksm_remove_vma(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 unacct_error:
 	if (charged)
@@ -2380,7 +2400,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_free_mpol;
 
 	if (new->vm_file)
-		get_file(new->vm_file);
+		vma_get_file(new);
 
 	if (new->vm_ops && new->vm_ops->open)
 		new->vm_ops->open(new);
@@ -2391,6 +2411,8 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	else
 		err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
 
+	uksm_vma_add_new(new);
+
 	/* Success. */
 	if (!err)
 		return 0;
@@ -2399,7 +2421,7 @@ static int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (new->vm_ops && new->vm_ops->close)
 		new->vm_ops->close(new);
 	if (new->vm_file)
-		fput(new->vm_file);
+		vma_fput(new);
 	unlink_anon_vmas(new);
  out_free_mpol:
 	mpol_put(vma_policy(new));
@@ -2550,7 +2572,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	struct vm_area_struct *vma;
 	unsigned long populate = 0;
 	unsigned long ret = -EINVAL;
-	struct file *file;
+	struct file *file, *prfile;
 
 	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. See Documentation/vm/remap_file_pages.txt.\n",
 		     current->comm, current->pid);
@@ -2625,10 +2647,27 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 		}
 	}
 
-	file = get_file(vma->vm_file);
+	vma_get_file(vma);
+	file = vma->vm_file;
+	prfile = vma->vm_prfile;
 	ret = do_mmap_pgoff(vma->vm_file, start, size,
 			prot, flags, pgoff, &populate);
+	if (!IS_ERR_VALUE(ret) && file && prfile) {
+		struct vm_area_struct *new_vma;
+
+		new_vma = find_vma(mm, ret);
+		if (!new_vma->vm_prfile)
+			new_vma->vm_prfile = prfile;
+		if (new_vma != vma)
+			get_file(prfile);
+	}
+	/*
+	 * Two fput()s instead of vma_fput(vma),
+	 * because vma may not be available anymore.
+	 */
 	fput(file);
+	if (prfile)
+		fput(prfile);
 out:
 	up_write(&mm->mmap_sem);
 	if (populate)
@@ -2669,6 +2708,7 @@ static int do_brk(unsigned long addr, unsigned long request)
 		return 0;
 
 	flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
+	uksm_vm_flags_mod(&flags);
 
 	error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
 	if (offset_in_page(error))
@@ -2726,6 +2766,7 @@ static int do_brk(unsigned long addr, unsigned long request)
 	vma->vm_flags = flags;
 	vma->vm_page_prot = vm_get_page_prot(flags);
 	vma_link(mm, vma, prev, rb_link, rb_parent);
+	uksm_vma_add_new(vma);
 out:
 	perf_event_mmap(vma);
 	mm->total_vm += len >> PAGE_SHIFT;
@@ -2764,6 +2805,12 @@ void exit_mmap(struct mm_struct *mm)
 	/* mm's last user has gone, and its about to be pulled down */
 	mmu_notifier_release(mm);
 
+	/*
+	 * Taking write lock on mmap_sem does not harm others,
+	 * but it's crucial for uksm to avoid races.
+	 */
+	down_write(&mm->mmap_sem);
+
 	if (mm->locked_vm) {
 		vma = mm->mmap;
 		while (vma) {
@@ -2799,6 +2846,11 @@ void exit_mmap(struct mm_struct *mm)
 		vma = remove_vma(vma);
 	}
 	vm_unacct_memory(nr_accounted);
+
+	mm->mmap = NULL;
+	mm->mm_rb = RB_ROOT;
+	vmacache_invalidate(mm);
+	up_write(&mm->mmap_sem);
 }
 
 /* Insert vm structure into process list sorted by address
@@ -2903,11 +2955,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		if (anon_vma_clone(new_vma, vma))
 			goto out_free_mempol;
 		if (new_vma->vm_file)
-			get_file(new_vma->vm_file);
+			vma_get_file(new_vma);
 		if (new_vma->vm_ops && new_vma->vm_ops->open)
 			new_vma->vm_ops->open(new_vma);
 		vma_link(mm, new_vma, prev, rb_link, rb_parent);
 		*need_rmap_locks = false;
+		uksm_vma_add_new(new_vma);
 	}
 	return new_vma;
 
@@ -3055,6 +3108,7 @@ static struct vm_area_struct *__install_special_mapping(
 	vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT);
 
 	perf_event_mmap(vma);
+	uksm_vma_add_new(vma);
 
 	return vma;
 
diff --git a/mm/nommu.c b/mm/nommu.c
index 95daf81..5086a29 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -644,7 +644,7 @@ static void __put_nommu_region(struct vm_region *region)
 		up_write(&nommu_region_sem);
 
 		if (region->vm_file)
-			fput(region->vm_file);
+			vmr_fput(region);
 
 		/* IO memory and memory shared directly out of the pagecache
 		 * from ramfs/tmpfs mustn't be released here */
@@ -802,7 +802,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
 	if (vma->vm_file)
-		fput(vma->vm_file);
+		vma_fput(vma);
 	put_nommu_region(vma->vm_region);
 	kmem_cache_free(vm_area_cachep, vma);
 }
@@ -1328,7 +1328,7 @@ unsigned long do_mmap(struct file *file,
 					goto error_just_free;
 				}
 			}
-			fput(region->vm_file);
+			vmr_fput(region);
 			kmem_cache_free(vm_region_jar, region);
 			region = pregion;
 			result = start;
@@ -1403,10 +1403,10 @@ error_just_free:
 	up_write(&nommu_region_sem);
 error:
 	if (region->vm_file)
-		fput(region->vm_file);
+		vmr_fput(region);
 	kmem_cache_free(vm_region_jar, region);
 	if (vma->vm_file)
-		fput(vma->vm_file);
+		vma_fput(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 	return ret;
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f4cd7d8..56be152 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -70,7 +70,11 @@ static long ratelimit_pages = 32;
 /*
  * Start background writeback (via writeback threads) at this percentage
  */
+#ifdef CONFIG_PCK_INTERACTIVE
+int dirty_background_ratio = 20;
+#else
 int dirty_background_ratio = 10;
+#endif
 
 /*
  * dirty_background_bytes starts at 0 (disabled) so that it is a function of
@@ -87,7 +91,11 @@ int vm_highmem_is_dirtyable;
 /*
  * The generator of dirty data starts writeback at this percentage
  */
+#ifdef CONFIG_PCK_INTERACTIVE
+int vm_dirty_ratio = 50;
+#else
 int vm_dirty_ratio = 20;
+#endif
 
 /*
  * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
@@ -1778,6 +1786,7 @@ pause:
 					  pause,
 					  start_time);
 		__set_current_state(TASK_KILLABLE);
+		wb->dirty_sleep = now;
 		io_schedule_timeout(pause);
 
 		current->dirty_paused_when = now + pause;
diff --git b/mm/prfile.c b/mm/prfile.c
new file mode 100644
index 0000000..b323b8a
--- /dev/null
+++ b/mm/prfile.c
@@ -0,0 +1,86 @@
+/*
+ * Mainly for aufs, which mmap(2)s a different file and wants to print a
+ * different path in /proc/PID/maps.
+ * Call these functions via macros defined in linux/mm.h.
+ *
+ * See Documentation/filesystems/aufs/design/06mmap.txt
+ *
+ * Copyright (c) 2014 Junjiro R. Okajima
+ * Copyright (c) 2014 Ian Campbell
+ */
+
+#include <linux/mm.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+
+/* #define PRFILE_TRACE */
+static inline void prfile_trace(struct file *f, struct file *pr,
+			      const char func[], int line, const char func2[])
+{
+#ifdef PRFILE_TRACE
+	if (pr)
+		pr_info("%s:%d: %s, %s\n", func, line, func2,
+			f ? (char *)f->f_path.dentry->d_name.name : "(null)");
+#endif
+}
+
+void vma_do_file_update_time(struct vm_area_struct *vma, const char func[],
+			     int line)
+{
+	struct file *f = vma->vm_file, *pr = vma->vm_prfile;
+
+	prfile_trace(f, pr, func, line, __func__);
+	file_update_time(f);
+	if (f && pr)
+		file_update_time(pr);
+}
+
+struct file *vma_do_pr_or_file(struct vm_area_struct *vma, const char func[],
+			       int line)
+{
+	struct file *f = vma->vm_file, *pr = vma->vm_prfile;
+
+	prfile_trace(f, pr, func, line, __func__);
+	return (f && pr) ? pr : f;
+}
+
+void vma_do_get_file(struct vm_area_struct *vma, const char func[], int line)
+{
+	struct file *f = vma->vm_file, *pr = vma->vm_prfile;
+
+	prfile_trace(f, pr, func, line, __func__);
+	get_file(f);
+	if (f && pr)
+		get_file(pr);
+}
+
+void vma_do_fput(struct vm_area_struct *vma, const char func[], int line)
+{
+	struct file *f = vma->vm_file, *pr = vma->vm_prfile;
+
+	prfile_trace(f, pr, func, line, __func__);
+	fput(f);
+	if (f && pr)
+		fput(pr);
+}
+
+#ifndef CONFIG_MMU
+struct file *vmr_do_pr_or_file(struct vm_region *region, const char func[],
+			       int line)
+{
+	struct file *f = region->vm_file, *pr = region->vm_prfile;
+
+	prfile_trace(f, pr, func, line, __func__);
+	return (f && pr) ? pr : f;
+}
+
+void vmr_do_fput(struct vm_region *region, const char func[], int line)
+{
+	struct file *f = region->vm_file, *pr = region->vm_prfile;
+
+	prfile_trace(f, pr, func, line, __func__);
+	fput(f);
+	if (f && pr)
+		fput(pr);
+}
+#endif /* !CONFIG_MMU */
diff --git a/mm/rmap.c b/mm/rmap.c
index 1ef3640..1c40463 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1110,9 +1110,9 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
 
 /**
  * __page_set_anon_rmap - set up new anonymous rmap
- * @page:	Page to add to rmap	
+ * @page:	Page to add to rmap
  * @vma:	VM area to add page to.
- * @address:	User virtual address of the mapping	
+ * @address:	User virtual address of the mapping
  * @exclusive:	the page is exclusively owned by the current process
  */
 static void __page_set_anon_rmap(struct page *page,
diff --git b/mm/uksm.c b/mm/uksm.c
new file mode 100644
index 0000000..bef4a50
--- /dev/null
+++ b/mm/uksm.c
@@ -0,0 +1,5522 @@
+/*
+ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia
+ *
+ * This is an improvement upon KSM. Some basic data structures and routines
+ * are borrowed from ksm.c .
+ *
+ * Its new features:
+ * 1. Full system scan:
+ *      It automatically scans all user processes' anonymous VMAs. Kernel-user
+ *      interaction to submit a memory area to KSM is no longer needed.
+ *
+ * 2. Rich area detection:
+ *      It automatically detects rich areas containing abundant duplicated
+ *      pages. Rich areas are given a full scan speed. Poor areas are
+ *      sampled at a reasonable speed with very low CPU consumption.
+ *
+ * 3. Ultra Per-page scan speed improvement:
+ *      A new hash algorithm is proposed. As a result, on a machine with
+ *      Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHz DDR2 main memory, it
+ *      can scan memory areas that do not contain duplicated pages at a speed of
+ *      627MB/sec ~ 2445MB/sec and can merge duplicated areas at a speed of
+ *      477MB/sec ~ 923MB/sec.
+ *
+ * 4. Thrashing area avoidance:
+ *      A thrashing area (a VMA that has frequent KSM page break-outs) can be
+ *      filtered out. My benchmark shows it's more efficient than KSM's per-page
+ *      hash value based volatile page detection.
+ *
+ *
+ * 5. Misc changes upon KSM:
+ *      * It has a fully x86-optimized memcmp dedicated to 4-byte-aligned page
+ *        comparison. It's much faster than the default C version on x86.
+ *      * rmap_item now has a struct page * member to loosely cache an
+ *        address-->page mapping, which avoids many time-costly calls to
+ *        follow_page().
+ *      * The VMA creation/exit procedures are hooked to let the Ultra KSM know.
+ *      * try_to_merge_two_pages() can now revert a pte if it fails. No
+ *        break_ksm is needed in this case.
+ *
+ * 6. Full zero page consideration (contributed by Figo Zhang)
+ *    uksmd now considers full zero pages as special pages and merges them into
+ *    a special unswappable uksm zero page.
+ */
+
+#include <linux/errno.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/mman.h>
+#include <linux/sched.h>
+#include <linux/rwsem.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/spinlock.h>
+#include <linux/jhash.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/wait.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/memory.h>
+#include <linux/mmu_notifier.h>
+#include <linux/swap.h>
+#include <linux/ksm.h>
+#include <linux/crypto.h>
+#include <linux/scatterlist.h>
+#include <crypto/hash.h>
+#include <linux/random.h>
+#include <linux/math64.h>
+#include <linux/gcd.h>
+#include <linux/freezer.h>
+#include <linux/sradix-tree.h>
+
+#include <asm/tlbflush.h>
+#include "internal.h"
+
+#ifdef CONFIG_X86
+#undef memcmp
+
+#ifdef CONFIG_X86_32
+#define memcmp memcmpx86_32
+/*
+ * Compare 4-byte-aligned address s1 and s2, with length n
+ */
+int memcmpx86_32(void *s1, void *s2, size_t n)
+{
+	size_t num = n / 4;
+	register int res;
+
+	__asm__ __volatile__
+	(
+	 "testl %3,%3\n\t"
+	 "repe; cmpsd\n\t"
+	 "je        1f\n\t"
+	 "sbbl      %0,%0\n\t"
+	 "orl       $1,%0\n"
+	 "1:"
+	 : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
+	 : "0" (0)
+	 : "cc");
+
+	return res;
+}
+
+/*
+ * Check whether the page is all zeros.
+ */
+static int is_full_zero(const void *s1, size_t len)
+{
+	unsigned char same;
+
+	len /= 4;
+
+	__asm__ __volatile__
+	("repe; scasl;"
+	 "sete %0"
+	 : "=qm" (same), "+D" (s1), "+c" (len)
+	 : "a" (0)
+	 : "cc");
+
+	return same;
+}
+
+
+#elif defined(CONFIG_X86_64)
+#define memcmp memcmpx86_64
+/*
+ * Compare 8-byte-aligned address s1 and s2, with length n
+ */
+int memcmpx86_64(void *s1, void *s2, size_t n)
+{
+	size_t num = n / 8;
+	register int res;
+
+	__asm__ __volatile__
+	(
+	 "testq %q3,%q3\n\t"
+	 "repe; cmpsq\n\t"
+	 "je        1f\n\t"
+	 "sbbq      %q0,%q0\n\t"
+	 "orq       $1,%q0\n"
+	 "1:"
+	 : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num)
+	 : "0" (0)
+	 : "cc");
+
+	return res;
+}
+
+static int is_full_zero(const void *s1, size_t len)
+{
+	unsigned char same;
+
+	len /= 8;
+
+	__asm__ __volatile__
+	("repe; scasq;"
+	 "sete %0"
+	 : "=qm" (same), "+D" (s1), "+c" (len)
+	 : "a" (0)
+	 : "cc");
+
+	return same;
+}
+
+#endif
+#else
+static int is_full_zero(const void *s1, size_t len)
+{
+	const unsigned long *src = s1;
+	size_t i;
+
+	len /= sizeof(*src);
+
+	for (i = 0; i < len; i++) {
+		if (src[i])
+			return 0;
+	}
+
+	return 1;
+}
+#endif
+
+#define UKSM_RUNG_ROUND_FINISHED  (1 << 0)
+#define TIME_RATIO_SCALE	10000
+
+#define SLOT_TREE_NODE_SHIFT	8
+#define SLOT_TREE_NODE_STORE_SIZE	(1UL << SLOT_TREE_NODE_SHIFT)
+struct slot_tree_node {
+	unsigned long size;
+	struct sradix_tree_node snode;
+	void *stores[SLOT_TREE_NODE_STORE_SIZE];
+};
+
+static struct kmem_cache *slot_tree_node_cachep;
+
+static struct sradix_tree_node *slot_tree_node_alloc(void)
+{
+	struct slot_tree_node *p;
+	p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL |
+			      __GFP_NORETRY | __GFP_NOWARN);
+	if (!p)
+		return NULL;
+
+	return &p->snode;
+}
+
+static void slot_tree_node_free(struct sradix_tree_node *node)
+{
+	struct slot_tree_node *p;
+
+	p = container_of(node, struct slot_tree_node, snode);
+	kmem_cache_free(slot_tree_node_cachep, p);
+}
+
+static void slot_tree_node_extend(struct sradix_tree_node *parent,
+				  struct sradix_tree_node *child)
+{
+	struct slot_tree_node *p, *c;
+
+	p = container_of(parent, struct slot_tree_node, snode);
+	c = container_of(child, struct slot_tree_node, snode);
+
+	p->size += c->size;
+}
+
+void slot_tree_node_assign(struct sradix_tree_node *node,
+			   unsigned index, void *item)
+{
+	struct vma_slot *slot = item;
+	struct slot_tree_node *cur;
+
+	slot->snode = node;
+	slot->sindex = index;
+
+	while (node) {
+		cur = container_of(node, struct slot_tree_node, snode);
+		cur->size += slot->pages;
+		node = node->parent;
+	}
+}
+
+void slot_tree_node_rm(struct sradix_tree_node *node, unsigned offset)
+{
+	struct vma_slot *slot;
+	struct slot_tree_node *cur;
+	unsigned long pages;
+
+	if (node->height == 1) {
+		slot = node->stores[offset];
+		pages = slot->pages;
+	} else {
+		cur = container_of(node->stores[offset],
+				   struct slot_tree_node, snode);
+		pages = cur->size;
+	}
+
+	while (node) {
+		cur = container_of(node, struct slot_tree_node, snode);
+		cur->size -= pages;
+		node = node->parent;
+	}
+}
+
+unsigned long slot_iter_index;
+int slot_iter(void *item,  unsigned long height)
+{
+	struct slot_tree_node *node;
+	struct vma_slot *slot;
+
+	if (height == 1) {
+		slot = item;
+		if (slot_iter_index < slot->pages) {
+			/*in this one*/
+			return 1;
+		} else {
+			slot_iter_index -= slot->pages;
+			return 0;
+		}
+
+	} else {
+		node = container_of(item, struct slot_tree_node, snode);
+		if (slot_iter_index < node->size) {
+			/*in this one*/
+			return 1;
+		} else {
+			slot_iter_index -= node->size;
+			return 0;
+		}
+	}
+}
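+
+/*
+ * In short, slot_iter() is the lookup callback for the slot sradix tree:
+ * given a target page index in slot_iter_index, it returns 1 when the index
+ * falls inside the current slot (or subtree) and otherwise subtracts that
+ * slot's (or subtree's) page count and lets the search move on, so a slot
+ * can be selected by a rung-wide page index.
+ */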
+
+
+static inline void slot_tree_init_root(struct sradix_tree_root *root)
+{
+	init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT);
+	root->alloc = slot_tree_node_alloc;
+	root->free = slot_tree_node_free;
+	root->extend = slot_tree_node_extend;
+	root->assign = slot_tree_node_assign;
+	root->rm = slot_tree_node_rm;
+}
+
+void slot_tree_init(void)
+{
+	slot_tree_node_cachep = kmem_cache_create("slot_tree_node",
+				sizeof(struct slot_tree_node), 0,
+				SLAB_PANIC | SLAB_RECLAIM_ACCOUNT,
+				NULL);
+}
+
+
+/* Each rung of this ladder is a list of VMAs having the same scan ratio */
+struct scan_rung {
+	//struct list_head scanned_list;
+	struct sradix_tree_root vma_root;
+	struct sradix_tree_root vma_root2;
+
+	struct vma_slot *current_scan;
+	unsigned long current_offset;
+
+	/*
+	 * The initial value for current_offset; it should loop over
+	 * [0, step - 1] to let every slot have a chance to be scanned.
+	 */
+	unsigned long offset_init;
+	unsigned long step; /* dynamic step for current_offset */
+	unsigned int flags;
+	unsigned long pages_to_scan;
+	//unsigned long fully_scanned_slots;
+	/*
+	 * A little bit tricky: if cpu_time_ratio > 0, then the value is the
+	 * cpu time ratio it can spend in rung_i for every scan
+	 * period. If < 0, then it is the cpu time ratio relative to the
+	 * max cpu percentage the user specified. Both are in units of
+	 * 1/TIME_RATIO_SCALE.
+	 */
+	int cpu_ratio;
+
+	/*
+	 * How long will it take for all slots in this rung to be fully
+	 * scanned? If it's zero, we don't care about the cover time:
+	 * it's considered fully scanned.
+	 */
+	unsigned int cover_msecs;
+	//unsigned long vma_num;
+	//unsigned long pages; /* Sum of all slot's pages in rung */
+};
+
+/**
+ * node of either the stable or unstable rbtree
+ *
+ */
+struct tree_node {
+	struct rb_node node; /* link in the main (un)stable rbtree */
+	struct rb_root sub_root; /* rb_root for sublevel collision rbtree */
+	u32 hash;
+	unsigned long count; /* TODO: merged with sub_root */
+	struct list_head all_list; /* all tree nodes in stable/unstable tree */
+};
+
+/**
+ * struct stable_node - node of the stable rbtree
+ * @node: rb node of this ksm page in the stable tree
+ * @hlist: hlist head of rmap_items using this ksm page
+ * @kpfn: page frame number of this ksm page
+ */
+struct stable_node {
+	struct rb_node node; /* link in sub-rbtree */
+	struct tree_node *tree_node; /* its tree_node root in the stable tree, NULL if it's in the hell list */
+	struct hlist_head hlist;
+	unsigned long kpfn;
+	u32 hash_max; /* if ==0 then it's not been calculated yet */
+	struct list_head all_list; /* in a list for all stable nodes */
+};
+
+/**
+ * struct node_vma - group rmap_items linked in a same stable
+ * node together.
+ */
+struct node_vma {
+	union {
+		struct vma_slot *slot;
+		unsigned long key;  /* slot is used as key sorted on hlist */
+	};
+	struct hlist_node hlist;
+	struct hlist_head rmap_hlist;
+	struct stable_node *head;
+};
+
+/**
+ * struct rmap_item - reverse mapping item for virtual addresses
+ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list
+ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree
+ * @mm: the memory structure this rmap_item is pointing into
+ * @address: the virtual address this rmap_item tracks (+ flags in low bits)
+ * @node: rb node of this rmap_item in the unstable tree
+ * @head: pointer to stable_node heading this list in the stable tree
+ * @hlist: link into hlist of rmap_items hanging off that stable_node
+ */
+struct rmap_item {
+	struct vma_slot *slot;
+	struct page *page;
+	unsigned long address;	/* + low bits used for flags below */
+	unsigned long hash_round;
+	unsigned long entry_index;
+	union {
+		struct {/* when in unstable tree */
+			struct rb_node node;
+			struct tree_node *tree_node;
+			u32 hash_max;
+		};
+		struct { /* when in stable tree */
+			struct node_vma *head;
+			struct hlist_node hlist;
+			struct anon_vma *anon_vma;
+		};
+	};
+} __attribute__((aligned(4)));
+
+struct rmap_list_entry {
+	union {
+		struct rmap_item *item;
+		unsigned long addr;
+	};
+	/* lowest bit is used for is_addr tag */
+} __attribute__((aligned(4))); /* 4 aligned to fit in to pages*/
+
+
+/* Basic data structure definition ends */
+
+
+/*
+ * Flags for rmap_item to judge if it's listed in the stable/unstable tree.
+ * The flags use the low bits of rmap_item.address
+ */
+#define UNSTABLE_FLAG	0x1
+#define STABLE_FLAG	0x2
+#define get_rmap_addr(x)	((x)->address & PAGE_MASK)
+
+/*
+ * rmap_list_entry helpers
+ */
+#define IS_ADDR_FLAG	1
+#define is_addr(ptr)		((unsigned long)(ptr) & IS_ADDR_FLAG)
+#define set_is_addr(ptr)	((ptr) |= IS_ADDR_FLAG)
+#define get_clean_addr(ptr)	(((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG))
+
+
+/*
+ * High speed caches for frequently allocated and freed structs
+ */
+static struct kmem_cache *rmap_item_cache;
+static struct kmem_cache *stable_node_cache;
+static struct kmem_cache *node_vma_cache;
+static struct kmem_cache *vma_slot_cache;
+static struct kmem_cache *tree_node_cache;
+#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\
+		sizeof(struct __struct), __alignof__(struct __struct),\
+		(__flags), NULL)
+
+/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */
+#define SCAN_LADDER_SIZE 4
+static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE];
+
+/* The evaluation rounds uksmd has finished */
+static unsigned long long uksm_eval_round = 1;
+
+/*
+ * we add 1 to this var when we consider we should rebuild the whole
+ * unstable tree.
+ */
+static unsigned long uksm_hash_round = 1;
+
+/*
+ * How many times the whole memory is scanned.
+ */
+static unsigned long long fully_scanned_round = 1;
+
+/* The total number of virtual pages of all vma slots */
+static u64 uksm_pages_total;
+
+/* The number of pages has been scanned since the start up */
+static u64 uksm_pages_scanned;
+
+static u64 scanned_virtual_pages;
+
+/* The number of pages has been scanned since last encode_benefit call */
+static u64 uksm_pages_scanned_last;
+
+/* If the scanned number is too large, we encode it here */
+static u64 pages_scanned_stored;
+
+static unsigned long pages_scanned_base;
+
+/* The number of nodes in the stable tree */
+static unsigned long uksm_pages_shared;
+
+/* The number of page slots additionally sharing those nodes */
+static unsigned long uksm_pages_sharing;
+
+/* The number of nodes in the unstable tree */
+static unsigned long uksm_pages_unshared;
+
+/*
+ * Milliseconds ksmd should sleep between scans,
+ * >= 100ms to be consistent with
+ * scan_time_to_sleep_msec()
+ */
+static unsigned int uksm_sleep_jiffies;
+
+/* The real value for the uksmd next sleep */
+static unsigned int uksm_sleep_real;
+
+/* Saved value for user input uksm_sleep_jiffies when it's enlarged */
+static unsigned int uksm_sleep_saved;
+
+/* Max percentage of cpu utilization ksmd can take to scan in one batch */
+static unsigned int uksm_max_cpu_percentage;
+
+static int uksm_cpu_governor;
+
+static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" };
+
+struct uksm_cpu_preset_s {
+	int cpu_ratio[SCAN_LADDER_SIZE];
+	unsigned int cover_msecs[SCAN_LADDER_SIZE];
+	unsigned int max_cpu; /* percentage */
+};
+
+struct uksm_cpu_preset_s uksm_cpu_preset[4] = {
+	{ {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95},
+	{ {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50},
+	{ {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20},
+	{ {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1},
+};
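+
+/*
+ * Reading the presets above (cpu_ratio is in units of 1/TIME_RATIO_SCALE;
+ * negative values are relative to max_cpu, see struct scan_rung): in the
+ * first ("full") preset, rungs 0 and 1 get 0.2% and 0.4% of CPU time per
+ * scan period, while rungs 2 and 3 get 25% and 100% of the 95% max_cpu
+ * budget, respectively.
+ */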
+
+/* The default value for uksm_ema_page_time if it's not initialized */
+#define UKSM_PAGE_TIME_DEFAULT	500
+
+/* Cost to scan one page, by exponential moving average, in nsecs */
+static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
+
+/* The exponential moving average alpha weight, in percentage. */
+#define EMA_ALPHA	20
+
+/*
+ * The threshold used to filter out thrashing areas.
+ * If it == 0, filtering is disabled; otherwise it's the percentage upper bound
+ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio
+ * will be considered as having a zero duplication ratio.
+ */
+static unsigned int uksm_thrash_threshold = 50;
+
+/* How much dedup ratio is considered to be abundant */
+static unsigned int uksm_abundant_threshold = 10;
+
+/* All slots having merged pages in this eval round. */
+struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup);
+
+/* How many times the ksmd has slept since startup */
+static unsigned long long uksm_sleep_times;
+
+#define UKSM_RUN_STOP	0
+#define UKSM_RUN_MERGE	1
+static unsigned int uksm_run = 1;
+
+static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait);
+static DEFINE_MUTEX(uksm_thread_mutex);
+
+/*
+ * List vma_slot_new is for newly created vma_slot waiting to be added by
+ * ksmd. If one cannot be added (e.g. because it's too small), it's moved to
+ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding
+ * VMA has been removed/freed.
+ */
+struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new);
+struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd);
+struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del);
+static DEFINE_SPINLOCK(vma_slot_list_lock);
+
+/* The unstable tree heads */
+static struct rb_root root_unstable_tree = RB_ROOT;
+
+/*
+ * All tree_nodes are in a list to be freed at once when unstable tree is
+ * freed after each scan round.
+ */
+static struct list_head unstable_tree_node_list =
+				LIST_HEAD_INIT(unstable_tree_node_list);
+
+/* List contains all stable nodes */
+static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list);
+
+/*
+ * When the hash strength is changed, the stable tree must be delta_hashed and
+ * re-structured. We use two set of below structs to speed up the
+ * re-structuring of stable tree.
+ */
+static struct list_head
+stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]),
+			    LIST_HEAD_INIT(stable_tree_node_list[1])};
+
+static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0];
+static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT};
+static struct rb_root *root_stable_treep = &root_stable_tree[0];
+static unsigned long stable_tree_index;
+
+/* The hash strength needed to hash a full page */
+#define HASH_STRENGTH_FULL		(PAGE_SIZE / sizeof(u32))
+
+/* The hash strength needed for loop-back hashing */
+#define HASH_STRENGTH_MAX		(HASH_STRENGTH_FULL + 10)
+
+/* The random offsets in a page */
+static u32 *random_nums;
+
+/* The hash strength */
+static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4;
+
+/* The delta value each time the hash strength increases or decreases */
+static unsigned long hash_strength_delta;
+#define HASH_STRENGTH_DELTA_MAX	5
+
+/* The time we have saved due to random_sample_hash */
+static u64 rshash_pos;
+
+/* The time we have wasted due to hash collision */
+static u64 rshash_neg;
+
+struct uksm_benefit {
+	u64 pos;
+	u64 neg;
+	u64 scanned;
+	unsigned long base;
+} benefit;
+
+/*
+ * The relative cost of memcmp, compared to 1 time unit of random sample
+ * hash, this value is tested when ksm module is initialized
+ */
+static unsigned long memcmp_cost;
+
+static unsigned long  rshash_neg_cont_zero;
+static unsigned long  rshash_cont_obscure;
+
+/* The possible states of hash strength adjustment heuristic */
+enum rshash_states {
+		RSHASH_STILL,
+		RSHASH_TRYUP,
+		RSHASH_TRYDOWN,
+		RSHASH_NEW,
+		RSHASH_PRE_STILL,
+};
+
+/* The possible direction we are about to adjust hash strength */
+enum rshash_direct {
+	GO_UP,
+	GO_DOWN,
+	OBSCURE,
+	STILL,
+};
+
+/* random sampling hash state machine */
+static struct {
+	enum rshash_states state;
+	enum rshash_direct pre_direct;
+	u8 below_count;
+	/* Keep a lookup window of size 5; if above_count/below_count > 3
+	 * in this window, we stop trying.
+	 */
+	u8 lookup_window_index;
+	u64 stable_benefit;
+	unsigned long turn_point_down;
+	unsigned long turn_benefit_down;
+	unsigned long turn_point_up;
+	unsigned long turn_benefit_up;
+	unsigned long stable_point;
+} rshash_state;
+
+/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/
+static u32 *zero_hash_table;
+
+static inline struct node_vma *alloc_node_vma(void)
+{
+	struct node_vma *node_vma;
+	node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL |
+				     __GFP_NORETRY | __GFP_NOWARN);
+	if (node_vma) {
+		INIT_HLIST_HEAD(&node_vma->rmap_hlist);
+		INIT_HLIST_NODE(&node_vma->hlist);
+	}
+	return node_vma;
+}
+
+static inline void free_node_vma(struct node_vma *node_vma)
+{
+	kmem_cache_free(node_vma_cache, node_vma);
+}
+
+
+static inline struct vma_slot *alloc_vma_slot(void)
+{
+	struct vma_slot *slot;
+
+	/*
+	 * In case ksm is not initialized by now.
+	 * Oops, we need to consider the call site of uksm_init() in the future.
+	 */
+	if (!vma_slot_cache)
+		return NULL;
+
+	slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL |
+				 __GFP_NORETRY | __GFP_NOWARN);
+	if (slot) {
+		INIT_LIST_HEAD(&slot->slot_list);
+		INIT_LIST_HEAD(&slot->dedup_list);
+		slot->flags |= UKSM_SLOT_NEED_RERAND;
+	}
+	return slot;
+}
+
+static inline void free_vma_slot(struct vma_slot *vma_slot)
+{
+	kmem_cache_free(vma_slot_cache, vma_slot);
+}
+
+
+
+static inline struct rmap_item *alloc_rmap_item(void)
+{
+	struct rmap_item *rmap_item;
+
+	rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL |
+				      __GFP_NORETRY | __GFP_NOWARN);
+	if (rmap_item) {
+		/* BUG if the lowest bit is not clear; it's used as a flag */
+		BUG_ON(is_addr(rmap_item));
+	}
+	return rmap_item;
+}
+
+static inline void free_rmap_item(struct rmap_item *rmap_item)
+{
+	rmap_item->slot = NULL;	/* debug safety */
+	kmem_cache_free(rmap_item_cache, rmap_item);
+}
+
+static inline struct stable_node *alloc_stable_node(void)
+{
+	struct stable_node *node;
+	node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | GFP_ATOMIC);
+	if (!node)
+		return NULL;
+
+	INIT_HLIST_HEAD(&node->hlist);
+	list_add(&node->all_list, &stable_node_list);
+	return node;
+}
+
+static inline void free_stable_node(struct stable_node *stable_node)
+{
+	list_del(&stable_node->all_list);
+	kmem_cache_free(stable_node_cache, stable_node);
+}
+
+static inline struct tree_node *alloc_tree_node(struct list_head *list)
+{
+	struct tree_node *node;
+	node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | GFP_ATOMIC);
+	if (!node)
+		return NULL;
+
+	list_add(&node->all_list, list);
+	return node;
+}
+
+static inline void free_tree_node(struct tree_node *node)
+{
+	list_del(&node->all_list);
+	kmem_cache_free(tree_node_cache, node);
+}
+
+static void uksm_drop_anon_vma(struct rmap_item *rmap_item)
+{
+	struct anon_vma *anon_vma = rmap_item->anon_vma;
+
+	put_anon_vma(anon_vma);
+}
+
+
+/**
+ * Remove a stable node from stable_tree, may unlink from its tree_node and
+ * may remove its parent tree_node if no other stable node is pending.
+ *
+ * @stable_node		The node to be removed
+ * @unlink_rb		Will this node be unlinked from the rbtree?
+ * @remove_tree_node	Will its tree_node be removed if empty?
+ */
+static void remove_node_from_stable_tree(struct stable_node *stable_node,
+					 int unlink_rb,  int remove_tree_node)
+{
+	struct node_vma *node_vma;
+	struct rmap_item *rmap_item;
+	struct hlist_node *n;
+
+	if (!hlist_empty(&stable_node->hlist)) {
+		hlist_for_each_entry_safe(node_vma, n,
+					  &stable_node->hlist, hlist) {
+			hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
+				uksm_pages_sharing--;
+
+				uksm_drop_anon_vma(rmap_item);
+				rmap_item->address &= PAGE_MASK;
+			}
+			free_node_vma(node_vma);
+			cond_resched();
+		}
+
+		/* the last one is counted as shared */
+		uksm_pages_shared--;
+		uksm_pages_sharing++;
+	}
+
+	if (stable_node->tree_node && unlink_rb) {
+		rb_erase(&stable_node->node,
+			 &stable_node->tree_node->sub_root);
+
+		if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) &&
+		    remove_tree_node) {
+			rb_erase(&stable_node->tree_node->node,
+				 root_stable_treep);
+			free_tree_node(stable_node->tree_node);
+		} else {
+			stable_node->tree_node->count--;
+		}
+	}
+
+	free_stable_node(stable_node);
+}
+
+
+/*
+ * get_uksm_page: checks if the page indicated by the stable node
+ * is still its ksm page, despite having held no reference to it.
+ * In which case we can trust the content of the page, and it
+ * returns the gotten page; but if the page has now been zapped,
+ * remove the stale node from the stable tree and return NULL.
+ *
+ * You would expect the stable_node to hold a reference to the ksm page.
+ * But if it increments the page's count, swapping out has to wait for
+ * ksmd to come around again before it can free the page, which may take
+ * seconds or even minutes: much too unresponsive.  So instead we use a
+ * "keyhole reference": access to the ksm page from the stable node peeps
+ * out through its keyhole to see if that page still holds the right key,
+ * pointing back to this stable node.  This relies on freeing a PageAnon
+ * page to reset its page->mapping to NULL, and relies on no other use of
+ * a page to put something that might look like our key in page->mapping.
+ *
+ * include/linux/pagemap.h page_cache_get_speculative() is a good reference,
+ * but this is different - made simpler by uksm_thread_mutex being held, but
+ * interesting for assuming that no other use of the struct page could ever
+ * put our expected_mapping into page->mapping (or a field of the union which
+ * coincides with page->mapping).  The RCU calls are not for KSM at all, but
+ * to keep the page_count protocol described with page_cache_get_speculative.
+ *
+ * Note: it is possible that get_uksm_page() will return NULL one moment,
+ * then page the next, if the page is in between page_freeze_refs() and
+ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page
+ * is on its way to being freed; but it is an anomaly to bear in mind.
+ *
+ * @unlink_rb: 		if the removal of this node will firstly unlink from
+ * its rbtree. stable_node_reinsert will prevent this when restructuring the
+ * node from its old tree.
+ *
+ * @remove_tree_node:	if this is the last one of its tree_node, will the
+ * tree_node be freed ? If we are inserting stable node, this tree_node may
+ * be reused, so don't free it.
+ */
+static struct page *get_uksm_page(struct stable_node *stable_node,
+				 int unlink_rb, int remove_tree_node)
+{
+	struct page *page;
+	void *expected_mapping;
+
+	page = pfn_to_page(stable_node->kpfn);
+	expected_mapping = (void *)((unsigned long)stable_node |
+				    PAGE_MAPPING_KSM);
+	rcu_read_lock();
+	if (page->mapping != expected_mapping)
+		goto stale;
+	if (!get_page_unless_zero(page))
+		goto stale;
+	if (page->mapping != expected_mapping) {
+		put_page(page);
+		goto stale;
+	}
+	rcu_read_unlock();
+	return page;
+stale:
+	rcu_read_unlock();
+	remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node);
+
+	return NULL;
+}
+
+/*
+ * Removing rmap_item from stable or unstable tree.
+ * This function will clean the information from the stable/unstable tree.
+ */
+static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
+{
+	if (rmap_item->address & STABLE_FLAG) {
+		struct stable_node *stable_node;
+		struct node_vma *node_vma;
+		struct page *page;
+
+		node_vma = rmap_item->head;
+		stable_node = node_vma->head;
+		page = get_uksm_page(stable_node, 1, 1);
+		if (!page)
+			goto out;
+
+		/*
+		 * page lock is needed because it's racing with
+		 * try_to_unmap_ksm(), etc.
+		 */
+		lock_page(page);
+		hlist_del(&rmap_item->hlist);
+
+		if (hlist_empty(&node_vma->rmap_hlist)) {
+			hlist_del(&node_vma->hlist);
+			free_node_vma(node_vma);
+		}
+		unlock_page(page);
+
+		put_page(page);
+		if (hlist_empty(&stable_node->hlist)) {
+			/* do NOT call remove_node_from_stable_tree() here,
+			 * it's possible for a forked rmap_item not to be in
+			 * the stable tree while the in-tree rmap_items were
+			 * deleted.
+			 */
+			uksm_pages_shared--;
+		} else
+			uksm_pages_sharing--;
+
+
+		uksm_drop_anon_vma(rmap_item);
+	} else if (rmap_item->address & UNSTABLE_FLAG) {
+		if (rmap_item->hash_round == uksm_hash_round) {
+
+			rb_erase(&rmap_item->node,
+				 &rmap_item->tree_node->sub_root);
+			if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) {
+				rb_erase(&rmap_item->tree_node->node,
+					 &root_unstable_tree);
+
+				free_tree_node(rmap_item->tree_node);
+			} else
+				rmap_item->tree_node->count--;
+		}
+		uksm_pages_unshared--;
+	}
+
+	rmap_item->address &= PAGE_MASK;
+	rmap_item->hash_max = 0;
+
+out:
+	cond_resched();		/* we're called from many long loops */
+}
+
+static inline int slot_in_uksm(struct vma_slot *slot)
+{
+	return list_empty(&slot->slot_list);
+}
+
+/*
+ * Test if the mm is exiting
+ */
+static inline bool uksm_test_exit(struct mm_struct *mm)
+{
+	return atomic_read(&mm->mm_users) == 0;
+}
+
+static inline unsigned long vma_pool_size(struct vma_slot *slot)
+{
+	return round_up(sizeof(struct rmap_list_entry) * slot->pages,
+			PAGE_SIZE) >> PAGE_SHIFT;
+}
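+
+/*
+ * Example (assuming an 8-byte struct rmap_list_entry on a 64-bit build with
+ * 4KB pages): a 1024-page (4MB) VMA needs 8KB of rmap_list_entry storage,
+ * so vma_pool_size() returns 2 pool pages.
+ */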
+
+#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta))
+
+/* must be done with sem locked */
+static int slot_pool_alloc(struct vma_slot *slot)
+{
+	unsigned long pool_size;
+
+	if (slot->rmap_list_pool)
+		return 0;
+
+	pool_size = vma_pool_size(slot);
+	slot->rmap_list_pool = kzalloc(sizeof(struct page *) *
+				       pool_size, GFP_KERNEL);
+	if (!slot->rmap_list_pool)
+		return -ENOMEM;
+
+	slot->pool_counts = kzalloc(sizeof(unsigned int) * pool_size,
+				    GFP_KERNEL);
+	if (!slot->pool_counts) {
+		kfree(slot->rmap_list_pool);
+		return -ENOMEM;
+	}
+
+	slot->pool_size = pool_size;
+	BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages));
+	slot->flags |= UKSM_SLOT_IN_UKSM;
+	uksm_pages_total += slot->pages;
+
+	return 0;
+}
+
+/*
+ * Called after vma is unlinked from its mm
+ */
+void uksm_remove_vma(struct vm_area_struct *vma)
+{
+	struct vma_slot *slot;
+
+	if (!vma->uksm_vma_slot)
+		return;
+
+	spin_lock(&vma_slot_list_lock);
+	slot = vma->uksm_vma_slot;
+	if (!slot)
+		goto out;
+
+	if (slot_in_uksm(slot)) {
+		/**
+		 * This slot has been added by ksmd, so move to the del list
+		 * waiting for ksmd to free it.
+		 */
+		list_add_tail(&slot->slot_list, &vma_slot_del);
+	} else {
+		/**
+		 * It's still on new list. It's ok to free slot directly.
+		 */
+		list_del(&slot->slot_list);
+		free_vma_slot(slot);
+	}
+out:
+	vma->uksm_vma_slot = NULL;
+	spin_unlock(&vma_slot_list_lock);
+}
+
+/**
+ * Need to do two things:
+ * 1. check if slot was moved to del list
+ * 2. make sure the mmap_sem is manipulated while the vma is still valid.
+ *
+ * My concern here is that in some cases, this may make
+ * vma_slot_list_lock() waiters be serialized further by some
+ * sem->wait_lock; can this really be expensive?
+ *
+ *
+ * @return
+ * 0: if successfully locked mmap_sem
+ * -ENOENT: this slot was moved to del list
+ * -EBUSY: vma lock failed
+ */
+static int try_down_read_slot_mmap_sem(struct vma_slot *slot)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	struct rw_semaphore *sem;
+
+	spin_lock(&vma_slot_list_lock);
+
+	/* The slot_list was re-initialized when the slot entered uksm's list.
+	 * If it's not empty now, it must have been moved to the del list.
+	 */
+	if (!slot_in_uksm(slot)) {
+		spin_unlock(&vma_slot_list_lock);
+		return -ENOENT;
+	}
+
+	BUG_ON(slot->pages != vma_pages(slot->vma));
+	/* Ok, vma still valid */
+	vma = slot->vma;
+	mm = vma->vm_mm;
+	sem = &mm->mmap_sem;
+
+	if (uksm_test_exit(mm)) {
+		spin_unlock(&vma_slot_list_lock);
+		return -ENOENT;
+	}
+
+	if (down_read_trylock(sem)) {
+		spin_unlock(&vma_slot_list_lock);
+		if (slot_pool_alloc(slot)) {
+			uksm_remove_vma(vma);
+			up_read(sem);
+			return -ENOENT;
+		}
+		return 0;
+	}
+
+	spin_unlock(&vma_slot_list_lock);
+	return -EBUSY;
+}
+
+static inline unsigned long
+vma_page_address(struct page *page, struct vm_area_struct *vma)
+{
+	pgoff_t pgoff = page->index;
+	unsigned long address;
+
+	address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+	if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
+		/* page should be within @vma mapping range */
+		return -EFAULT;
+	}
+	return address;
+}
+
+
+/* return 0 on success with the item's mmap_sem locked */
+static inline int get_mergeable_page_lock_mmap(struct rmap_item *item)
+{
+	struct mm_struct *mm;
+	struct vma_slot *slot = item->slot;
+	int err = -EINVAL;
+
+	struct page *page;
+
+	/*
+	 * try_down_read_slot_mmap_sem() returns non-zero if the slot
+	 * has been removed by uksm_remove_vma() or if its mmap_sem
+	 * could not be taken without blocking.
+	 */
+	if (try_down_read_slot_mmap_sem(slot))
+		return -EBUSY;
+
+	mm = slot->vma->vm_mm;
+
+	if (uksm_test_exit(mm))
+		goto failout_up;
+
+	page = item->page;
+	rcu_read_lock();
+	if (!get_page_unless_zero(page)) {
+		rcu_read_unlock();
+		goto failout_up;
+	}
+
+	/* No need to consider huge page here. */
+	if (item->slot->vma->anon_vma != page_anon_vma(page) ||
+	    vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) {
+		/*
+		 * TODO:
+		 * should we release this item because of its stale page
+		 * mapping?
+		 */
+		put_page(page);
+		rcu_read_unlock();
+		goto failout_up;
+	}
+	rcu_read_unlock();
+	return 0;
+
+failout_up:
+	up_read(&mm->mmap_sem);
+	return err;
+}
+
+/*
+ * What kind of VMA is considered?
+ */
+static inline int vma_can_enter(struct vm_area_struct *vma)
+{
+	return uksm_flags_can_scan(vma->vm_flags);
+}
+
+/*
+ * Called whenever a fresh new vma is created. A new vma_slot
+ * is created and inserted into a global list. Must be called
+ * after the vma is inserted into its mm.
+ */
+void uksm_vma_add_new(struct vm_area_struct *vma)
+{
+	struct vma_slot *slot;
+
+	if (!vma_can_enter(vma)) {
+		vma->uksm_vma_slot = NULL;
+		return;
+	}
+
+	slot = alloc_vma_slot();
+	if (!slot) {
+		vma->uksm_vma_slot = NULL;
+		return;
+	}
+
+	vma->uksm_vma_slot = slot;
+	vma->vm_flags |= VM_MERGEABLE;
+	slot->vma = vma;
+	slot->mm = vma->vm_mm;
+	slot->ctime_j = jiffies;
+	slot->pages = vma_pages(vma);
+	spin_lock(&vma_slot_list_lock);
+	list_add_tail(&slot->slot_list, &vma_slot_new);
+	spin_unlock(&vma_slot_list_lock);
+}
+
+/*   32/3 < they < 32/2 */
+#define shiftl	8
+#define shiftr	12
+
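+/*
+ * Mix the randomly sampled 32-bit words key[random_nums[index]], for
+ * index in [from, to), into 'hash' with an add/shift/xor step per word.
+ */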
+#define HASH_FROM_TO(from, to) 				\
+for (index = from; index < to; index++) {		\
+	pos = random_nums[index];			\
+	hash += key[pos];				\
+	hash += (hash << shiftl);			\
+	hash ^= (hash >> shiftr);			\
+}
+
+
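+/*
+ * Reverse counterpart of HASH_FROM_TO: walk index from 'from - 1' down to
+ * 'to', undoing the mixing step and subtracting the sampled word, so that
+ * delta_hash() can lower the hash strength without rehashing from scratch.
+ */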
+#define HASH_FROM_DOWN_TO(from, to) 			\
+for (index = from - 1; index >= to; index--) {		\
+	hash ^= (hash >> shiftr);			\
+	hash ^= (hash >> (shiftr*2));			\
+	hash -= (hash << shiftl);			\
+	hash += (hash << (shiftl*2));			\
+	pos = random_nums[index];			\
+	hash -= key[pos];				\
+}
+
+/*
+ * The main random sample hash function.
+ */
+static u32 random_sample_hash(void *addr, u32 hash_strength)
+{
+	u32 hash = 0xdeadbeef;
+	int index, pos, loop = hash_strength;
+	u32 *key = (u32 *)addr;
+
+	if (loop > HASH_STRENGTH_FULL)
+		loop = HASH_STRENGTH_FULL;
+
+	HASH_FROM_TO(0, loop);
+
+	if (hash_strength > HASH_STRENGTH_FULL) {
+		loop = hash_strength - HASH_STRENGTH_FULL;
+		HASH_FROM_TO(0, loop);
+	}
+
+	return hash;
+}
+
+
+/**
+ * Recompute a page's hash when the hash strength is adjusted.
+ *
+ * @addr The page's mapped address
+ * @from The original hash strength
+ * @to   The hash strength to change to
+ * @hash The hash value generated at the "from" hash strength
+ *
+ * return the hash value at the "to" hash strength
+ */
+static u32 delta_hash(void *addr, int from, int to, u32 hash)
+{
+	u32 *key = (u32 *)addr;
+	int index, pos; /* make sure they are int type */
+
+	if (to > from) {
+		if (from >= HASH_STRENGTH_FULL) {
+			from -= HASH_STRENGTH_FULL;
+			to -= HASH_STRENGTH_FULL;
+			HASH_FROM_TO(from, to);
+		} else if (to <= HASH_STRENGTH_FULL) {
+			HASH_FROM_TO(from, to);
+		} else {
+			HASH_FROM_TO(from, HASH_STRENGTH_FULL);
+			HASH_FROM_TO(0, to - HASH_STRENGTH_FULL);
+		}
+	} else {
+		if (from <= HASH_STRENGTH_FULL) {
+			HASH_FROM_DOWN_TO(from, to);
+		} else if (to >= HASH_STRENGTH_FULL) {
+			from -= HASH_STRENGTH_FULL;
+			to -= HASH_STRENGTH_FULL;
+			HASH_FROM_DOWN_TO(from, to);
+		} else {
+			HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0);
+			HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to);
+		}
+	}
+
+	return hash;
+}
+
+/**
+ * Called when rshash_pos or rshash_neg is about to overflow, or when a scan
+ * round has finished.
+ *
+ * return 0 if no page has been scanned since the last call, 1 otherwise.
+ */
+static inline int encode_benefit(void)
+{
+	u64 scanned_delta, pos_delta, neg_delta;
+	unsigned long base = benefit.base;
+
+	scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last;
+
+	if (!scanned_delta)
+		return 0;
+
+	scanned_delta >>= base;
+	pos_delta = rshash_pos >> base;
+	neg_delta = rshash_neg >> base;
+
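+	/*
+	 * On imminent overflow, halve both the accumulators and the new
+	 * deltas, and remember the lost precision by bumping benefit.base.
+	 */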
+	if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) ||
+	    CAN_OVERFLOW_U64(benefit.neg, neg_delta) ||
+	    CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) {
+		benefit.scanned >>= 1;
+		benefit.neg >>= 1;
+		benefit.pos >>= 1;
+		benefit.base++;
+		scanned_delta >>= 1;
+		pos_delta >>= 1;
+		neg_delta >>= 1;
+	}
+
+	benefit.pos += pos_delta;
+	benefit.neg += neg_delta;
+	benefit.scanned += scanned_delta;
+
+	BUG_ON(!benefit.scanned);
+
+	rshash_pos = rshash_neg = 0;
+	uksm_pages_scanned_last = uksm_pages_scanned;
+
+	return 1;
+}
+
+static inline void reset_benefit(void)
+{
+	benefit.pos = 0;
+	benefit.neg = 0;
+	benefit.base = 0;
+	benefit.scanned = 0;
+}
+
+static inline void inc_rshash_pos(unsigned long delta)
+{
+	if (CAN_OVERFLOW_U64(rshash_pos, delta))
+		encode_benefit();
+
+	rshash_pos += delta;
+}
+
+static inline void inc_rshash_neg(unsigned long delta)
+{
+	if (CAN_OVERFLOW_U64(rshash_neg, delta))
+		encode_benefit();
+
+	rshash_neg += delta;
+}
+
+
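+/*
+ * Hash one page at the given strength. With cost_accounting, the sampling
+ * work saved relative to a full-strength hash is credited to rshash_pos.
+ */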
+static inline u32 page_hash(struct page *page, unsigned long hash_strength,
+			    int cost_accounting)
+{
+	u32 val;
+	unsigned long delta;
+
+	void *addr = kmap_atomic(page);
+
+	val = random_sample_hash(addr, hash_strength);
+	kunmap_atomic(addr);
+
+	if (cost_accounting) {
+		if (HASH_STRENGTH_FULL > hash_strength)
+			delta = HASH_STRENGTH_FULL - hash_strength;
+		else
+			delta = 0;
+
+		inc_rshash_pos(delta);
+	}
+
+	return val;
+}
+
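+/*
+ * Byte-compare two pages through temporary kernel mappings. With
+ * cost_accounting, the fixed memcmp_cost is charged to rshash_neg.
+ */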
+static int memcmp_pages(struct page *page1, struct page *page2,
+			int cost_accounting)
+{
+	char *addr1, *addr2;
+	int ret;
+
+	addr1 = kmap_atomic(page1);
+	addr2 = kmap_atomic(page2);
+	ret = memcmp(addr1, addr2, PAGE_SIZE);
+	kunmap_atomic(addr2);
+	kunmap_atomic(addr1);
+
+	if (cost_accounting)
+		inc_rshash_neg(memcmp_cost);
+
+	return ret;
+}
+
+static inline int pages_identical(struct page *page1, struct page *page2)
+{
+	return !memcmp_pages(page1, page2, 0);
+}
+
+static inline int is_page_full_zero(struct page *page)
+{
+	char *addr;
+	int ret;
+
+	addr = kmap_atomic(page);
+	ret = is_full_zero(addr, PAGE_SIZE);
+	kunmap_atomic(addr);
+
+	return ret;
+}
+
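+/*
+ * Make the pte mapping @page in @vma clean and write-protected so the page
+ * content cannot change under us. Returns 0 on success, -EFAULT if the pte
+ * has gone away or extra references (e.g. O_DIRECT) are detected.
+ */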
+static int write_protect_page(struct vm_area_struct *vma, struct page *page,
+			      pte_t *orig_pte, pte_t *old_pte)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr;
+	pte_t *ptep;
+	spinlock_t *ptl;
+	int swapped;
+	int err = -EFAULT;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	addr = page_address_in_vma(page, vma);
+	if (addr == -EFAULT)
+		goto out;
+
+	BUG_ON(PageTransCompound(page));
+
+	mmun_start = addr;
+	mmun_end   = addr + PAGE_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+	ptep = page_check_address(page, mm, addr, &ptl, 0);
+	if (!ptep)
+		goto out_mn;
+
+	if (old_pte)
+		*old_pte = *ptep;
+
+	if (pte_write(*ptep) || pte_dirty(*ptep)) {
+		pte_t entry;
+
+		swapped = PageSwapCache(page);
+		flush_cache_page(vma, addr, page_to_pfn(page));
+		/*
+		 * OK, this is tricky: when get_user_pages_fast() runs it does
+		 * not take any lock, therefore the check we are about to make
+		 * of the page count against the map count is racy and
+		 * O_DIRECT can happen right after the check.
+		 * So we clear the pte and flush the TLB before the check;
+		 * this assures us that no O_DIRECT can start after the check
+		 * or in the middle of the check.
+		 */
+		entry = ptep_clear_flush_notify(vma, addr, ptep);
+		/*
+		 * Check that no O_DIRECT or similar I/O is in progress on the
+		 * page
+		 */
+		if (page_mapcount(page) + 1 + swapped != page_count(page)) {
+			set_pte_at(mm, addr, ptep, entry);
+			goto out_unlock;
+		}
+		if (pte_dirty(entry))
+			set_page_dirty(page);
+		entry = pte_mkclean(pte_wrprotect(entry));
+		set_pte_at_notify(mm, addr, ptep, entry);
+	}
+	*orig_pte = *ptep;
+	err = 0;
+
+out_unlock:
+	pte_unmap_unlock(ptep, ptl);
+out_mn:
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+out:
+	return err;
+}
+
+#define MERGE_ERR_PGERR		1 /* the page is invalid, cannot continue */
+#define MERGE_ERR_COLLI		2 /* there is a collision */
+#define MERGE_ERR_COLLI_MAX	3 /* collision at the max hash strength */
+#define MERGE_ERR_CHANGED	4 /* the page has changed since last hash */
+
+
+/**
+ * replace_page - replace page in vma by new ksm page
+ * @vma:      vma that holds the pte pointing to page
+ * @page:     the page we are replacing by kpage
+ * @kpage:    the ksm page we replace page by
+ * @orig_pte: the original value of the pte
+ *
+ * Returns 0 on success, MERGE_ERR_PGERR on failure.
+ */
+static int replace_page(struct vm_area_struct *vma, struct page *page,
+			struct page *kpage, pte_t orig_pte)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep;
+	spinlock_t *ptl;
+	pte_t entry;
+
+	unsigned long addr;
+	int err = MERGE_ERR_PGERR;
+	unsigned long mmun_start;	/* For mmu_notifiers */
+	unsigned long mmun_end;		/* For mmu_notifiers */
+
+	addr = page_address_in_vma(page, vma);
+	if (addr == -EFAULT)
+		goto out;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, addr);
+	BUG_ON(pmd_trans_huge(*pmd));
+	if (!pmd_present(*pmd))
+		goto out;
+
+	mmun_start = addr;
+	mmun_end   = addr + PAGE_SIZE;
+	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+
+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!pte_same(*ptep, orig_pte)) {
+		pte_unmap_unlock(ptep, ptl);
+		goto out_mn;
+	}
+
+	flush_cache_page(vma, addr, pte_pfn(*ptep));
+	ptep_clear_flush_notify(vma, addr, ptep);
+	entry = mk_pte(kpage, vma->vm_page_prot);
+
+	/* special treatment is needed for zero_page */
+	if ((page_to_pfn(kpage) == uksm_zero_pfn) ||
+				(page_to_pfn(kpage) == zero_pfn)) {
+		entry = pte_mkspecial(entry);
+		dec_mm_counter(mm, MM_ANONPAGES);
+		inc_zone_page_state(page, NR_UKSM_ZERO_PAGES);
+	} else {
+		get_page(kpage);
+		page_add_anon_rmap(kpage, vma, addr, false);
+	}
+
+	set_pte_at_notify(mm, addr, ptep, entry);
+
+	page_remove_rmap(page, false);
+	if (!page_mapped(page))
+		try_to_free_swap(page);
+	put_page(page);
+
+	pte_unmap_unlock(ptep, ptl);
+	err = 0;
+out_mn:
+	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+out:
+	return err;
+}
+
+
+/**
+ * Fully hash a page at HASH_STRENGTH_MAX and return a non-zero hash value.
+ * A zero hash value at HASH_STRENGTH_MAX is used to indicate that its
+ * hash_max member has not been calculated yet.
+ *
+ * @page The page to be hashed
+ * @hash_old The hash value calculated at the current hash strength
+ *
+ * return the new hash value calculated at HASH_STRENGTH_MAX
+ */
+static inline u32 page_hash_max(struct page *page, u32 hash_old)
+{
+	u32 hash_max = 0;
+	void *addr;
+
+	addr = kmap_atomic(page);
+	hash_max = delta_hash(addr, hash_strength,
+			      HASH_STRENGTH_MAX, hash_old);
+
+	kunmap_atomic(addr);
+
+	if (!hash_max)
+		hash_max = 1;
+
+	inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
+	return hash_max;
+}
+
+/*
+ * We compare the hash again, to ensure that it is really a hash collision
+ * rather than the result of the page having been written to.
+ */
+static inline int check_collision(struct rmap_item *rmap_item,
+				  u32 hash)
+{
+	int err;
+	struct page *page = rmap_item->page;
+
+	/* If this rmap_item has already been hash_maxed, then the collision
+	 * must appear in the second-level rbtree search. In this case we check
+	 * whether its hash_max value has changed. Otherwise, the collision
+	 * happened in the first-level rbtree search, so we check against its
+	 * current hash value.
+	 */
+	if (rmap_item->hash_max) {
+		inc_rshash_neg(memcmp_cost);
+		inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength);
+
+		if (rmap_item->hash_max == page_hash_max(page, hash))
+			err = MERGE_ERR_COLLI;
+		else
+			err = MERGE_ERR_CHANGED;
+	} else {
+		inc_rshash_neg(memcmp_cost + hash_strength);
+
+		if (page_hash(page, hash_strength, 0) == hash)
+			err = MERGE_ERR_COLLI;
+		else
+			err = MERGE_ERR_CHANGED;
+	}
+
+	return err;
+}
+
+/**
+ * Try to merge an rmap_item->page with a kpage in a stable node. kpage must
+ * already be a ksm page.
+ *
+ * @return 0 if the pages were merged, a MERGE_ERR_* code otherwise.
+ */
+static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item,
+				      struct page *kpage, u32 hash)
+{
+	struct vm_area_struct *vma = rmap_item->slot->vma;
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t orig_pte = __pte(0);
+	int err = MERGE_ERR_PGERR;
+	struct page *page;
+
+	if (uksm_test_exit(mm))
+		goto out;
+
+	page = rmap_item->page;
+
+	if (page == kpage) { /* ksm page forked */
+		err = 0;
+		goto out;
+	}
+
+	/*
+	 * We need the page lock to read a stable PageSwapCache in
+	 * write_protect_page().  We use trylock_page() instead of
+	 * lock_page() because we don't want to wait here - we
+	 * prefer to continue scanning and merging different pages,
+	 * then come back to this page when it is unlocked.
+	 */
+	if (!trylock_page(page))
+		goto out;
+
+	if (!PageAnon(page) || !PageKsm(kpage))
+		goto out_unlock;
+
+	if (PageTransCompound(page)) {
+		err = split_huge_page(page);
+		if (err)
+			goto out_unlock;
+	}
+
+	/*
+	 * If this anonymous page is mapped only here, its pte may need
+	 * to be write-protected.  If it's mapped elsewhere, all of its
+	 * ptes are necessarily already write-protected.  But in either
+	 * case, we need to lock and check page_count is not raised.
+	 */
+	if (write_protect_page(vma, page, &orig_pte, NULL) == 0) {
+		if (pages_identical(page, kpage))
+			err = replace_page(vma, page, kpage, orig_pte);
+		else
+			err = check_collision(rmap_item, hash);
+	}
+
+	if ((vma->vm_flags & VM_LOCKED) && kpage && !err) {
+		munlock_vma_page(page);
+		if (!PageMlocked(kpage)) {
+			unlock_page(page);
+			lock_page(kpage);
+			mlock_vma_page(kpage);
+			page = kpage;		/* for final unlock */
+		}
+	}
+
+out_unlock:
+	unlock_page(page);
+out:
+	return err;
+}
+
+
+
+/**
+ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance
+ * to restore a page mapping that has been changed in try_to_merge_two_pages.
+ *
+ * @return 0 on success.
+ */
+static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr,
+			     pte_t orig_pte, pte_t wprt_pte)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *ptep;
+	spinlock_t *ptl;
+
+	int err = -EFAULT;
+
+	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+		goto out;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+		goto out;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		goto out;
+
+	ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
+	if (!pte_same(*ptep, wprt_pte)) {
+		/* already copied, let it be */
+		pte_unmap_unlock(ptep, ptl);
+		goto out;
+	}
+
+	/*
+	 * Still here: as long as we hold a reference to the ksm page, it does
+	 * not return to the free page pool, so there is no way a pte could
+	 * have been changed to another page and then back to this one. Also
+	 * remember that ksm pages are not reused in do_wp_page(). So it is
+	 * safe to restore the original pte.
+	 */
+	flush_cache_page(vma, addr, pte_pfn(*ptep));
+	ptep_clear_flush_notify(vma, addr, ptep);
+	set_pte_at_notify(mm, addr, ptep, orig_pte);
+
+	pte_unmap_unlock(ptep, ptl);
+	err = 0;
+out:
+	return err;
+}
+
+/**
+ * try_to_merge_two_pages() - take two identical pages and prepare
+ * them to be merged into one page (rmap_item->page)
+ *
+ * @return 0 if we successfully merged two identical pages into
+ *         one ksm page. MERGE_ERR_COLLI if it was only a hash collision
+ *         found during the rbtree search. MERGE_ERR_CHANGED if rmap_item
+ *         has changed since it was hashed. MERGE_ERR_PGERR otherwise.
+ */
+static int try_to_merge_two_pages(struct rmap_item *rmap_item,
+				  struct rmap_item *tree_rmap_item,
+				  u32 hash)
+{
+	pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0);
+	pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0);
+	struct vm_area_struct *vma1 = rmap_item->slot->vma;
+	struct vm_area_struct *vma2 = tree_rmap_item->slot->vma;
+	struct page *page = rmap_item->page;
+	struct page *tree_page = tree_rmap_item->page;
+	int err = MERGE_ERR_PGERR;
+	struct address_space *saved_mapping;
+
+
+	if (rmap_item->page == tree_rmap_item->page)
+		goto out;
+
+	if (!trylock_page(page))
+		goto out;
+
+	if (!PageAnon(page))
+		goto out_unlock;
+
+	if (PageTransCompound(page)) {
+		err = split_huge_page(page);
+		if (err)
+			goto out_unlock;
+	}
+
+	if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) {
+		unlock_page(page);
+		goto out;
+	}
+
+	/*
+	 * While we hold page lock, upgrade page from
+	 * PageAnon+anon_vma to PageKsm+NULL stable_node:
+	 * stable_tree_insert() will update stable_node.
+	 */
+	saved_mapping = page->mapping;
+	set_page_stable_node(page, NULL);
+	mark_page_accessed(page);
+	if (!PageDirty(page))
+		SetPageDirty(page);
+
+	unlock_page(page);
+
+	if (!trylock_page(tree_page))
+		goto restore_out;
+
+	if (!PageAnon(tree_page)) {
+		unlock_page(tree_page);
+		goto restore_out;
+	}
+
+	if (PageTransCompound(tree_page)) {
+		err = split_huge_page(tree_page);
+		if (err) {
+			unlock_page(tree_page);
+			goto restore_out;
+		}
+	}
+
+	if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) {
+		unlock_page(tree_page);
+		goto restore_out;
+	}
+
+	if (pages_identical(page, tree_page)) {
+		err = replace_page(vma2, tree_page, page, wprt_pte2);
+		if (err) {
+			unlock_page(tree_page);
+			goto restore_out;
+		}
+
+		if ((vma2->vm_flags & VM_LOCKED)) {
+			munlock_vma_page(tree_page);
+			if (!PageMlocked(page)) {
+				unlock_page(tree_page);
+				lock_page(page);
+				mlock_vma_page(page);
+				tree_page = page; /* for final unlock */
+			}
+		}
+
+		unlock_page(tree_page);
+
+		goto out; /* success */
+
+	} else {
+		if (tree_rmap_item->hash_max &&
+		    tree_rmap_item->hash_max == rmap_item->hash_max) {
+			err = MERGE_ERR_COLLI_MAX;
+		} else if (page_hash(page, hash_strength, 0) ==
+		    page_hash(tree_page, hash_strength, 0)) {
+			inc_rshash_neg(memcmp_cost + hash_strength * 2);
+			err = MERGE_ERR_COLLI;
+		} else {
+			err = MERGE_ERR_CHANGED;
+		}
+
+		unlock_page(tree_page);
+	}
+
+restore_out:
+	lock_page(page);
+	if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item),
+				  orig_pte1, wprt_pte1))
+		page->mapping = saved_mapping;
+
+out_unlock:
+	unlock_page(page);
+out:
+	return err;
+}
+
+static inline int hash_cmp(u32 new_val, u32 node_val)
+{
+	if (new_val > node_val)
+		return 1;
+	else if (new_val < node_val)
+		return -1;
+	else
+		return 0;
+}
+
+static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash)
+{
+	u32 hash_max = item->hash_max;
+
+	if (!hash_max) {
+		hash_max = page_hash_max(item->page, hash);
+
+		item->hash_max = hash_max;
+	}
+
+	return hash_max;
+}
+
+
+
+/**
+ * stable_tree_search() - search the stable tree for a page
+ *
+ * @item: 	the rmap_item we are comparing with
+ * @hash: 	the hash value of this item->page already calculated
+ *
+ * @return 	the page we have found, NULL otherwise. A reference to the
+ *         	returned page has been taken.
+ */
+static struct page *stable_tree_search(struct rmap_item *item, u32 hash)
+{
+	struct rb_node *node = root_stable_treep->rb_node;
+	struct tree_node *tree_node;
+	unsigned long hash_max;
+	struct page *page = item->page;
+	struct stable_node *stable_node;
+
+	stable_node = page_stable_node(page);
+	if (stable_node) {
+		/* The ksm page was forked, that is
+		 * if (PageKsm(page) && !in_stable_tree(rmap_item));
+		 * a reference to it has already been taken outside.
+		 */
+		get_page(page);
+		return page;
+	}
+
+	while (node) {
+		int cmp;
+
+		tree_node = rb_entry(node, struct tree_node, node);
+
+		cmp = hash_cmp(hash, tree_node->hash);
+
+		if (cmp < 0)
+			node = node->rb_left;
+		else if (cmp > 0)
+			node = node->rb_right;
+		else
+			break;
+	}
+
+	if (!node)
+		return NULL;
+
+	if (tree_node->count == 1) {
+		stable_node = rb_entry(tree_node->sub_root.rb_node,
+				       struct stable_node, node);
+		BUG_ON(!stable_node);
+
+		goto get_page_out;
+	}
+
+	/*
+	 * OK, we have to search the second-level subtree;
+	 * hash the page at full strength.
+	 */
+	node = tree_node->sub_root.rb_node;
+	BUG_ON(!node);
+	hash_max = rmap_item_hash_max(item, hash);
+
+	while (node) {
+		int cmp;
+
+		stable_node = rb_entry(node, struct stable_node, node);
+
+		cmp = hash_cmp(hash_max, stable_node->hash_max);
+
+		if (cmp < 0)
+			node = node->rb_left;
+		else if (cmp > 0)
+			node = node->rb_right;
+		else
+			goto get_page_out;
+	}
+
+	return NULL;
+
+get_page_out:
+	page = get_uksm_page(stable_node, 1, 1);
+	return page;
+}
+
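+/*
+ * Switch @item's pte from @kpage to @tree_page, provided the pte still maps
+ * kpage and is still write-protected. Returns 1 on success, 0 if the mapping
+ * has changed.
+ */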
+static int try_merge_rmap_item(struct rmap_item *item,
+			       struct page *kpage,
+			       struct page *tree_page)
+{
+	spinlock_t *ptl;
+	pte_t *ptep;
+	unsigned long addr;
+	struct vm_area_struct *vma = item->slot->vma;
+
+	addr = get_rmap_addr(item);
+	ptep = page_check_address(kpage, vma->vm_mm, addr, &ptl, 0);
+	if (!ptep)
+		return 0;
+
+	if (pte_write(*ptep)) {
+		/* has changed, abort! */
+		pte_unmap_unlock(ptep, ptl);
+		return 0;
+	}
+
+	get_page(tree_page);
+	page_add_anon_rmap(tree_page, vma, addr, false);
+
+	flush_cache_page(vma, addr, pte_pfn(*ptep));
+	ptep_clear_flush_notify(vma, addr, ptep);
+	set_pte_at_notify(vma->vm_mm, addr, ptep,
+			  mk_pte(tree_page, vma->vm_page_prot));
+
+	page_remove_rmap(kpage, false);
+	put_page(kpage);
+
+	pte_unmap_unlock(ptep, ptl);
+
+	return 1;
+}
+
+/**
+ * try_merge_with_stable() - when two rmap_items need to be inserted
+ * into the stable tree and their page is found to be identical to a stable
+ * ksm page, this is the last chance to merge them into one.
+ *
+ * @item1:	the rmap_item holding the page we wanted to insert
+ *       	into the stable tree
+ * @item2:	the other rmap_item we found during the unstable tree search
+ * @kpage:	the page currently mapped by the two rmap_items
+ * @tree_page:	the identical page we found in the stable tree node
+ * @success1:	set if item1 is successfully merged
+ * @success2:	set if item2 is successfully merged
+ */
+static void try_merge_with_stable(struct rmap_item *item1,
+				  struct rmap_item *item2,
+				  struct page **kpage,
+				  struct page *tree_page,
+				  int *success1, int *success2)
+{
+	struct vm_area_struct *vma1 = item1->slot->vma;
+	struct vm_area_struct *vma2 = item2->slot->vma;
+	*success1 = 0;
+	*success2 = 0;
+
+	if (unlikely(*kpage == tree_page)) {
+		/* I don't think this can really happen */
+		printk(KERN_WARNING "UKSM: unexpected condition detected in "
+			"try_merge_with_stable() -- *kpage == tree_page !\n");
+		*success1 = 1;
+		*success2 = 1;
+		return;
+	}
+
+	if (!PageAnon(*kpage) || !PageKsm(*kpage))
+		goto failed;
+
+	if (!trylock_page(tree_page))
+		goto failed;
+
+	/* If the old page is still a ksm page, still pointed
+	 * to from the right place, and still write-protected,
+	 * we are confident it has not changed, so there is no
+	 * need to memcmp again.
+	 * Beware: we cannot take nested pte locks here, as that
+	 * would risk deadlock.
+	 */
+	if (!try_merge_rmap_item(item1, *kpage, tree_page))
+		goto unlock_failed;
+
+	/* OK, now vma2; remember that pte1 is already set */
+	if (!try_merge_rmap_item(item2, *kpage, tree_page))
+		goto success_1;
+
+	*success2 = 1;
+success_1:
+	*success1 = 1;
+
+
+	if ((*success1 && vma1->vm_flags & VM_LOCKED) ||
+	    (*success2 && vma2->vm_flags & VM_LOCKED)) {
+		munlock_vma_page(*kpage);
+		if (!PageMlocked(tree_page))
+			mlock_vma_page(tree_page);
+	}
+
+	/*
+	 * We do not need the old page any more in the caller, so we can
+	 * release its lock now.
+	 */
+	unlock_page(*kpage);
+	*kpage = tree_page; /* Get unlocked outside. */
+	return;
+
+unlock_failed:
+	unlock_page(tree_page);
+failed:
+	return;
+}
+
+static inline void stable_node_hash_max(struct stable_node *node,
+					 struct page *page, u32 hash)
+{
+	u32 hash_max = node->hash_max;
+
+	if (!hash_max) {
+		hash_max = page_hash_max(page, hash);
+		node->hash_max = hash_max;
+	}
+}
+
+static inline
+struct stable_node *new_stable_node(struct tree_node *tree_node,
+				    struct page *kpage, u32 hash_max)
+{
+	struct stable_node *new_stable_node;
+
+	new_stable_node = alloc_stable_node();
+	if (!new_stable_node)
+		return NULL;
+
+	new_stable_node->kpfn = page_to_pfn(kpage);
+	new_stable_node->hash_max = hash_max;
+	new_stable_node->tree_node = tree_node;
+	set_page_stable_node(kpage, new_stable_node);
+
+	return new_stable_node;
+}
+
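+/*
+ * The tree_node currently holds a single stable_node: either merge with it,
+ * promote it into a hash_max-keyed subtree when the contents differ, or
+ * reuse the tree_node if its only stable_node has gone away.
+ */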
+static inline
+struct stable_node *first_level_insert(struct tree_node *tree_node,
+				       struct rmap_item *rmap_item,
+				       struct rmap_item *tree_rmap_item,
+				       struct page **kpage, u32 hash,
+				       int *success1, int *success2)
+{
+	int cmp;
+	struct page *tree_page;
+	u32 hash_max = 0;
+	struct stable_node *stable_node, *new_snode;
+	struct rb_node *parent = NULL, **new;
+
+	/* this tree node contains no sub-tree yet */
+	stable_node = rb_entry(tree_node->sub_root.rb_node,
+			       struct stable_node, node);
+
+	tree_page = get_uksm_page(stable_node, 1, 0);
+	if (tree_page) {
+		cmp = memcmp_pages(*kpage, tree_page, 1);
+		if (!cmp) {
+			try_merge_with_stable(rmap_item, tree_rmap_item, kpage,
+					      tree_page, success1, success2);
+			put_page(tree_page);
+			if (!*success1 && !*success2)
+				goto failed;
+
+			return stable_node;
+
+		} else {
+			/*
+			 * Collision at the first level; try to create a
+			 * subtree. A new node needs to be created.
+			 */
+			put_page(tree_page);
+
+			stable_node_hash_max(stable_node, tree_page,
+					     tree_node->hash);
+			hash_max = rmap_item_hash_max(rmap_item, hash);
+			cmp = hash_cmp(hash_max, stable_node->hash_max);
+
+			parent = &stable_node->node;
+			if (cmp < 0) {
+				new = &parent->rb_left;
+			} else if (cmp > 0) {
+				new = &parent->rb_right;
+			} else {
+				goto failed;
+			}
+		}
+
+	} else {
+		/* The only stable_node was deleted; we reuse its tree_node. */
+		parent = NULL;
+		new = &tree_node->sub_root.rb_node;
+	}
+
+	new_snode = new_stable_node(tree_node, *kpage, hash_max);
+	if (!new_snode)
+		goto failed;
+
+	rb_link_node(&new_snode->node, parent, new);
+	rb_insert_color(&new_snode->node, &tree_node->sub_root);
+	tree_node->count++;
+	*success1 = *success2 = 1;
+
+	return new_snode;
+
+failed:
+	return NULL;
+}
+
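+/*
+ * Walk the hash_max-keyed subtree under tree_node: merge on an identical
+ * page, re-search if the subtree was restructured under us, or link a new
+ * stable_node at the insertion point.
+ */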
+static inline
+struct stable_node *stable_subtree_insert(struct tree_node *tree_node,
+					  struct rmap_item *rmap_item,
+					  struct rmap_item *tree_rmap_item,
+					  struct page **kpage, u32 hash,
+					  int *success1, int *success2)
+{
+	struct page *tree_page;
+	u32 hash_max;
+	struct stable_node *stable_node, *new_snode;
+	struct rb_node *parent, **new;
+
+research:
+	parent = NULL;
+	new = &tree_node->sub_root.rb_node;
+	BUG_ON(!*new);
+	hash_max = rmap_item_hash_max(rmap_item, hash);
+	while (*new) {
+		int cmp;
+
+		stable_node = rb_entry(*new, struct stable_node, node);
+
+		cmp = hash_cmp(hash_max, stable_node->hash_max);
+
+		if (cmp < 0) {
+			parent = *new;
+			new = &parent->rb_left;
+		} else if (cmp > 0) {
+			parent = *new;
+			new = &parent->rb_right;
+		} else {
+			tree_page = get_uksm_page(stable_node, 1, 0);
+			if (tree_page) {
+				cmp = memcmp_pages(*kpage, tree_page, 1);
+				if (!cmp) {
+					try_merge_with_stable(rmap_item,
+						tree_rmap_item, kpage,
+						tree_page, success1, success2);
+
+					put_page(tree_page);
+					if (!*success1 && !*success2)
+						goto failed;
+					/*
+					 * successfully merged with a stable
+					 * node
+					 */
+					return stable_node;
+				} else {
+					put_page(tree_page);
+					goto failed;
+				}
+			} else {
+				/*
+				 * The stable node may have been deleted
+				 * and the subtree restructured; we cannot
+				 * continue, so re-search it.
+				 */
+				if (tree_node->count) {
+					goto research;
+				} else {
+					/* reuse the tree node */
+					parent = NULL;
+					new = &tree_node->sub_root.rb_node;
+				}
+			}
+		}
+	}
+
+	new_snode = new_stable_node(tree_node, *kpage, hash_max);
+	if (!new_snode)
+		goto failed;
+
+	rb_link_node(&new_snode->node, parent, new);
+	rb_insert_color(&new_snode->node, &tree_node->sub_root);
+	tree_node->count++;
+	*success1 = *success2 = 1;
+
+	return new_snode;
+
+failed:
+	return NULL;
+}
+
+
+/**
+ * stable_tree_insert() - try to insert a merged page in unstable tree to
+ * the stable tree
+ *
+ * @kpage:		the page to be inserted
+ * @hash:		the current hash of this page
+ * @rmap_item:		the rmap_item being scanned
+ * @tree_rmap_item:	the rmap_item found on unstable tree
+ * @success1:		return if rmap_item is merged
+ * @success2:		return if tree_rmap_item is merged
+ *
+ * @return 		the stable_node on stable tree if at least one
+ *      		rmap_item is inserted into stable tree, NULL
+ *      		otherwise.
+ */
+static struct stable_node *
+stable_tree_insert(struct page **kpage, u32 hash,
+		   struct rmap_item *rmap_item,
+		   struct rmap_item *tree_rmap_item,
+		   int *success1, int *success2)
+{
+	struct rb_node **new = &root_stable_treep->rb_node;
+	struct rb_node *parent = NULL;
+	struct stable_node *stable_node;
+	struct tree_node *tree_node;
+	u32 hash_max = 0;
+
+	*success1 = *success2 = 0;
+
+	while (*new) {
+		int cmp;
+
+		tree_node = rb_entry(*new, struct tree_node, node);
+
+		cmp = hash_cmp(hash, tree_node->hash);
+
+		if (cmp < 0) {
+			parent = *new;
+			new = &parent->rb_left;
+		} else if (cmp > 0) {
+			parent = *new;
+			new = &parent->rb_right;
+		} else
+			break;
+	}
+
+	if (*new) {
+		if (tree_node->count == 1) {
+			stable_node = first_level_insert(tree_node, rmap_item,
+						tree_rmap_item, kpage,
+						hash, success1, success2);
+		} else {
+			stable_node = stable_subtree_insert(tree_node,
+					rmap_item, tree_rmap_item, kpage,
+					hash, success1, success2);
+		}
+	} else {
+
+		/* no tree node found */
+		tree_node = alloc_tree_node(stable_tree_node_listp);
+		if (!tree_node) {
+			stable_node = NULL;
+			goto out;
+		}
+
+		stable_node = new_stable_node(tree_node, *kpage, hash_max);
+		if (!stable_node) {
+			free_tree_node(tree_node);
+			goto out;
+		}
+
+		tree_node->hash = hash;
+		rb_link_node(&tree_node->node, parent, new);
+		rb_insert_color(&tree_node->node, root_stable_treep);
+		parent = NULL;
+		new = &tree_node->sub_root.rb_node;
+
+		rb_link_node(&stable_node->node, parent, new);
+		rb_insert_color(&stable_node->node, &tree_node->sub_root);
+		tree_node->count++;
+		*success1 = *success2 = 1;
+	}
+
+out:
+	return stable_node;
+}
+
+
+/**
+ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem
+ *
+ * @return 	0 on success, -EBUSY if unable to lock the mmap_sem,
+ *         	-EINVAL if the page mapping has been changed.
+ */
+static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item)
+{
+	int err;
+
+	err = get_mergeable_page_lock_mmap(tree_rmap_item);
+
+	if (err == -EINVAL) {
+		/* its page map has been changed, remove it */
+		remove_rmap_item_from_tree(tree_rmap_item);
+	}
+
+	/* On success, the page has been gotten and the mmap_sem is locked. */
+	return err;
+}
+
+
+/**
+ * unstable_tree_search_insert() - search the unstable tree for an rmap_item
+ * with the same hash value. Get its page and trylock its mmap_sem.
+ */
+static inline
+struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
+					      u32 hash)
+
+{
+	struct rb_node **new = &root_unstable_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct tree_node *tree_node;
+	u32 hash_max;
+	struct rmap_item *tree_rmap_item;
+
+	while (*new) {
+		int cmp;
+
+		tree_node = rb_entry(*new, struct tree_node, node);
+
+		cmp = hash_cmp(hash, tree_node->hash);
+
+		if (cmp < 0) {
+			parent = *new;
+			new = &parent->rb_left;
+		} else if (cmp > 0) {
+			parent = *new;
+			new = &parent->rb_right;
+		} else
+			break;
+	}
+
+	if (*new) {
+		/* got the tree_node */
+		if (tree_node->count == 1) {
+			tree_rmap_item = rb_entry(tree_node->sub_root.rb_node,
+						  struct rmap_item, node);
+			BUG_ON(!tree_rmap_item);
+
+			goto get_page_out;
+		}
+
+		/* well, search the collision subtree */
+		new = &tree_node->sub_root.rb_node;
+		BUG_ON(!*new);
+		hash_max = rmap_item_hash_max(rmap_item, hash);
+
+		while (*new) {
+			int cmp;
+
+			tree_rmap_item = rb_entry(*new, struct rmap_item,
+						  node);
+
+			cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
+			parent = *new;
+			if (cmp < 0)
+				new = &parent->rb_left;
+			else if (cmp > 0)
+				new = &parent->rb_right;
+			else
+				goto get_page_out;
+		}
+	} else {
+		/* alloc a new tree_node */
+		tree_node = alloc_tree_node(&unstable_tree_node_list);
+		if (!tree_node)
+			return NULL;
+
+		tree_node->hash = hash;
+		rb_link_node(&tree_node->node, parent, new);
+		rb_insert_color(&tree_node->node, &root_unstable_tree);
+		parent = NULL;
+		new = &tree_node->sub_root.rb_node;
+	}
+
+	/* not found even in the sub-tree */
+	rmap_item->tree_node = tree_node;
+	rmap_item->address |= UNSTABLE_FLAG;
+	rmap_item->hash_round = uksm_hash_round;
+	rb_link_node(&rmap_item->node, parent, new);
+	rb_insert_color(&rmap_item->node, &tree_node->sub_root);
+
+	uksm_pages_unshared++;
+	return NULL;
+
+get_page_out:
+	if (tree_rmap_item->page == rmap_item->page)
+		return NULL;
+
+	if (get_tree_rmap_item_page(tree_rmap_item))
+		return NULL;
+
+	return tree_rmap_item;
+}
+
+static void hold_anon_vma(struct rmap_item *rmap_item,
+			  struct anon_vma *anon_vma)
+{
+	rmap_item->anon_vma = anon_vma;
+	get_anon_vma(anon_vma);
+}
+
+
+/**
+ * stable_tree_append() - append an rmap_item to a stable node. Deduplication
+ * ratio statistics are updated in this function.
+ */
+static void stable_tree_append(struct rmap_item *rmap_item,
+			       struct stable_node *stable_node, int logdedup)
+{
+	struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL;
+	unsigned long key = (unsigned long)rmap_item->slot;
+	unsigned long factor = rmap_item->slot->rung->step;
+
+	BUG_ON(!stable_node);
+	rmap_item->address |= STABLE_FLAG;
+
+	if (hlist_empty(&stable_node->hlist)) {
+		uksm_pages_shared++;
+		goto node_vma_new;
+	} else {
+		uksm_pages_sharing++;
+	}
+
+	hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
+		if (node_vma->key >= key)
+			break;
+
+		if (logdedup) {
+			node_vma->slot->pages_bemerged += factor;
+			if (list_empty(&node_vma->slot->dedup_list))
+				list_add(&node_vma->slot->dedup_list,
+					 &vma_slot_dedup);
+		}
+	}
+
+	if (node_vma) {
+		if (node_vma->key == key) {
+			node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist);
+			goto node_vma_ok;
+		} else if (node_vma->key > key) {
+			node_vma_cont = node_vma;
+		}
+	}
+
+node_vma_new:
+	/* no node_vma for this vma in the node yet; allocate a new one */
+	new_node_vma = alloc_node_vma();
+	BUG_ON(!new_node_vma);
+	new_node_vma->head = stable_node;
+	new_node_vma->slot = rmap_item->slot;
+
+	if (!node_vma) {
+		hlist_add_head(&new_node_vma->hlist, &stable_node->hlist);
+	} else if (node_vma->key != key) {
+		if (node_vma->key < key)
+			hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist);
+		else {
+			hlist_add_before(&new_node_vma->hlist,
+					 &node_vma->hlist);
+		}
+
+	}
+	node_vma = new_node_vma;
+
+node_vma_ok: /* ok, ready to add to the list */
+	rmap_item->head = node_vma;
+	hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist);
+	hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma);
+	if (logdedup) {
+		rmap_item->slot->pages_merged++;
+		if (node_vma_cont) {
+			node_vma = node_vma_cont;
+			hlist_for_each_entry_continue(node_vma, hlist) {
+				node_vma->slot->pages_bemerged += factor;
+				if (list_empty(&node_vma->slot->dedup_list))
+					list_add(&node_vma->slot->dedup_list,
+						 &vma_slot_dedup);
+			}
+		}
+	}
+}
+
+/*
+ * We use break_ksm to break COW on a ksm page: it's a stripped down
+ *
+ *	if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1)
+ *		put_page(page);
+ *
+ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma,
+ * in case the application has unmapped and remapped mm,addr meanwhile.
+ * Could a ksm page appear anywhere else?  Actually yes, in a VM_PFNMAP
+ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it.
+ */
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
+{
+	struct page *page;
+	int ret = 0;
+
+	do {
+		cond_resched();
+		page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
+		if (IS_ERR_OR_NULL(page))
+			break;
+		if (PageKsm(page)) {
+			ret = handle_mm_fault(vma, addr,
+					      FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
+		} else
+			ret = VM_FAULT_WRITE;
+		put_page(page);
+	} while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
+	/*
+	 * We must loop because handle_mm_fault() may back out if there's
+	 * any difficulty e.g. if pte accessed bit gets updated concurrently.
+	 *
+	 * VM_FAULT_WRITE is what we have been hoping for: it indicates that
+	 * COW has been broken, even if the vma does not permit VM_WRITE;
+	 * but note that a concurrent fault might break PageKsm for us.
+	 *
+	 * VM_FAULT_SIGBUS could occur if we race with truncation of the
+	 * backing file, which also invalidates anonymous pages: that's
+	 * okay, that truncation will have unmapped the PageKsm for us.
+	 *
+	 * VM_FAULT_OOM: at the time of writing (late July 2009), setting
+	 * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the
+	 * current task has TIF_MEMDIE set, and will be OOM killed on return
+	 * to user; and ksmd, having no mm, would never be chosen for that.
+	 *
+	 * But if the mm is in a limited mem_cgroup, then the fault may fail
+	 * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and
+	 * even ksmd can fail in this way - though it's usually breaking ksm
+	 * just to undo a merge it made a moment before, so unlikely to oom.
+	 *
+	 * That's a pity: we might therefore have more kernel pages allocated
+	 * than we're counting as nodes in the stable tree; but uksm_do_scan
+	 * will retry to break_cow on each pass, so should recover the page
+	 * in due course.  The important thing is to not let VM_MERGEABLE
+	 * be cleared while any such pages might remain in the area.
+	 */
+	return (ret & VM_FAULT_OOM) ? -ENOMEM : 0;
+}
+
+static void break_cow(struct rmap_item *rmap_item)
+{
+	struct vm_area_struct *vma = rmap_item->slot->vma;
+	struct mm_struct *mm = vma->vm_mm;
+	unsigned long addr = get_rmap_addr(rmap_item);
+
+	if (uksm_test_exit(mm))
+		goto out;
+
+	break_ksm(vma, addr);
+out:
+	return;
+}
+
+/*
+ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather
+ * than check every pte of a given vma, the locking doesn't quite work for
+ * that - an rmap_item is assigned to the stable tree after inserting ksm
+ * page and upping mmap_sem.  Nor does it fit with the way we skip dup'ing
+ * rmap_items from parent to child at fork time (so as not to waste time
+ * if exit comes before the next scan reaches it).
+ *
+ * Similarly, although we'd like to remove rmap_items (so updating counts
+ * and freeing memory) when unmerging an area, it's easier to leave that
+ * to the next pass of ksmd - consider, for example, how ksmd might be
+ * in cmp_and_merge_page on one of the rmap_items we would be removing.
+ */
+inline int unmerge_uksm_pages(struct vm_area_struct *vma,
+		      unsigned long start, unsigned long end)
+{
+	unsigned long addr;
+	int err = 0;
+
+	for (addr = start; addr < end && !err; addr += PAGE_SIZE) {
+		if (uksm_test_exit(vma->vm_mm))
+			break;
+		if (signal_pending(current))
+			err = -ERESTARTSYS;
+		else
+			err = break_ksm(vma, addr);
+	}
+	return err;
+}
+
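+/*
+ * Count one scanned page. When the 64-bit counter saturates, fold it into
+ * pages_scanned_stored (scaled by pages_scanned_base) and restart from zero.
+ */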
+static inline void inc_uksm_pages_scanned(void)
+{
+	u64 delta;
+
+
+	if (uksm_pages_scanned == U64_MAX) {
+		encode_benefit();
+
+		delta = uksm_pages_scanned >> pages_scanned_base;
+
+		if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) {
+			pages_scanned_stored >>= 1;
+			delta >>= 1;
+			pages_scanned_base++;
+		}
+
+		pages_scanned_stored += delta;
+
+		uksm_pages_scanned = uksm_pages_scanned_last = 0;
+	}
+
+	uksm_pages_scanned++;
+}
+
+static inline int find_zero_page_hash(int strength, u32 hash)
+{
+	return (zero_hash_table[strength] == hash);
+}
+
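+/*
+ * If @page turns out to be full of zeroes while write-protected, replace its
+ * mapping with the UKSM zero page. Returns 0 on success, non-zero otherwise.
+ */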
+static
+int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page)
+{
+	struct page *zero_page = empty_uksm_zero_page;
+	struct mm_struct *mm = vma->vm_mm;
+	pte_t orig_pte = __pte(0);
+	int err = -EFAULT;
+
+	if (uksm_test_exit(mm))
+		goto out;
+
+	if (!trylock_page(page))
+		goto out;
+
+	if (!PageAnon(page))
+		goto out_unlock;
+
+	if (PageTransCompound(page)) {
+		err = split_huge_page(page);
+		if (err)
+			goto out_unlock;
+	}
+
+	if (write_protect_page(vma, page, &orig_pte, 0) == 0) {
+		if (is_page_full_zero(page))
+			err = replace_page(vma, page, zero_page, orig_pte);
+	}
+
+out_unlock:
+	unlock_page(page);
+out:
+	return err;
+}
+
+/*
+ * cmp_and_merge_page() - first see if page can be merged into the stable
+ * tree; if not, compare hash to previous and if it's the same, see if page
+ * can be inserted into the unstable tree, or merged with a page already there
+ * and both transferred to the stable tree.
+ *
+ * @page: the page for which we are searching an identical page.
+ * @rmap_item: the reverse mapping into the virtual address of this page
+ */
+static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash)
+{
+	struct rmap_item *tree_rmap_item;
+	struct page *page;
+	struct page *kpage = NULL;
+	u32 hash_max;
+	int err;
+	unsigned int success1, success2;
+	struct stable_node *snode;
+	int cmp;
+	struct rb_node *parent = NULL, **new;
+
+	remove_rmap_item_from_tree(rmap_item);
+	page = rmap_item->page;
+
+	/* We first start with searching the page inside the stable tree */
+	kpage = stable_tree_search(rmap_item, hash);
+	if (kpage) {
+		err = try_to_merge_with_uksm_page(rmap_item, kpage,
+						 hash);
+		if (!err) {
+			/*
+			 * The page was successfully merged, add
+			 * its rmap_item to the stable tree.
+			 * page lock is needed because it's
+			 * racing with try_to_unmap_ksm(), etc.
+			 */
+			lock_page(kpage);
+			snode = page_stable_node(kpage);
+			stable_tree_append(rmap_item, snode, 1);
+			unlock_page(kpage);
+			put_page(kpage);
+			return; /* success */
+		}
+		put_page(kpage);
+
+		/*
+		 * If it is a collision and the sub-rbtree has already been
+		 * searched (hash_max != 0), we want to abort, because if it
+		 * were merged into the unstable tree, the collision would
+		 * tend to happen again.
+		 */
+		if (err == MERGE_ERR_COLLI && rmap_item->hash_max)
+			return;
+	}
+
+	tree_rmap_item =
+		unstable_tree_search_insert(rmap_item, hash);
+	if (tree_rmap_item) {
+		err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash);
+		/*
+		 * As soon as we merge this page, we want to remove the
+		 * rmap_item of the page we have merged with from the unstable
+		 * tree, and insert it instead as new node in the stable tree.
+		 */
+		if (!err) {
+			kpage = page;
+			remove_rmap_item_from_tree(tree_rmap_item);
+			lock_page(kpage);
+			snode = stable_tree_insert(&kpage, hash,
+						   rmap_item, tree_rmap_item,
+						   &success1, &success2);
+
+			/*
+			 * Do not log dedup for tree item, it's not counted as
+			 * scanned in this round.
+			 */
+			if (success2)
+				stable_tree_append(tree_rmap_item, snode, 0);
+
+			/*
+			 * The order of these two stable_tree_append() calls is
+			 * important: rmap_item is the one being scanned.
+			 */
+			if (success1)
+				stable_tree_append(rmap_item, snode, 1);
+
+			/*
+			 * The original kpage may be unlocked inside
+			 * stable_tree_insert() already. This page
+			 * should be unlocked before doing
+			 * break_cow().
+			 */
+			unlock_page(kpage);
+
+			if (!success1)
+				break_cow(rmap_item);
+
+			if (!success2)
+				break_cow(tree_rmap_item);
+
+		} else if (err == MERGE_ERR_COLLI) {
+			BUG_ON(tree_rmap_item->tree_node->count > 1);
+
+			rmap_item_hash_max(tree_rmap_item,
+					   tree_rmap_item->tree_node->hash);
+
+			hash_max = rmap_item_hash_max(rmap_item, hash);
+			cmp = hash_cmp(hash_max, tree_rmap_item->hash_max);
+			parent = &tree_rmap_item->node;
+			if (cmp < 0)
+				new = &parent->rb_left;
+			else if (cmp > 0)
+				new = &parent->rb_right;
+			else
+				goto put_up_out;
+
+			rmap_item->tree_node = tree_rmap_item->tree_node;
+			rmap_item->address |= UNSTABLE_FLAG;
+			rmap_item->hash_round = uksm_hash_round;
+			rb_link_node(&rmap_item->node, parent, new);
+			rb_insert_color(&rmap_item->node,
+					&tree_rmap_item->tree_node->sub_root);
+			rmap_item->tree_node->count++;
+		} else {
+			/*
+			 * Either one of the pages has changed or they collide
+			 * at the max hash strength; we consider them ill items.
+			 */
+			remove_rmap_item_from_tree(tree_rmap_item);
+		}
+put_up_out:
+		put_page(tree_rmap_item->page);
+		up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem);
+	}
+}
+
+
+
+
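+/*
+ * The per-vma rmap_list lives in an array of lazily allocated pool pages
+ * (slot->rmap_list_pool); get_pool_index() maps a logical entry index to
+ * the pool page that holds its rmap_list_entry.
+ */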
+static inline unsigned long get_pool_index(struct vma_slot *slot,
+					   unsigned long index)
+{
+	unsigned long pool_index;
+
+	pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT;
+	if (pool_index >= slot->pool_size)
+		BUG();
+	return pool_index;
+}
+
+static inline unsigned long index_page_offset(unsigned long index)
+{
+	return offset_in_page(sizeof(struct rmap_list_entry *) * index);
+}
+
+static inline
+struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot,
+					    unsigned long index, int need_alloc)
+{
+	unsigned long pool_index;
+	struct page *page;
+	void *addr;
+
+
+	pool_index = get_pool_index(slot, index);
+	if (!slot->rmap_list_pool[pool_index]) {
+		if (!need_alloc)
+			return NULL;
+
+		page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN);
+		if (!page)
+			return NULL;
+
+		slot->rmap_list_pool[pool_index] = page;
+	}
+
+	addr = kmap(slot->rmap_list_pool[pool_index]);
+	addr += index_page_offset(index);
+
+	return addr;
+}
+
+static inline void put_rmap_list_entry(struct vma_slot *slot,
+				       unsigned long index)
+{
+	unsigned long pool_index;
+
+	pool_index = get_pool_index(slot, index);
+	BUG_ON(!slot->rmap_list_pool[pool_index]);
+	kunmap(slot->rmap_list_pool[pool_index]);
+}
+
+static inline int entry_is_new(struct rmap_list_entry *entry)
+{
+	return !entry->item;
+}
+
+static inline unsigned long get_index_orig_addr(struct vma_slot *slot,
+						unsigned long index)
+{
+	return slot->vma->vm_start + (index << PAGE_SHIFT);
+}
+
+static inline unsigned long get_entry_address(struct rmap_list_entry *entry)
+{
+	unsigned long addr;
+
+	if (is_addr(entry->addr))
+		addr = get_clean_addr(entry->addr);
+	else if (entry->item)
+		addr = get_rmap_addr(entry->item);
+	else
+		BUG();
+
+	return addr;
+}
+
+static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry)
+{
+	if (is_addr(entry->addr))
+		return NULL;
+
+	return entry->item;
+}
+
+static inline void inc_rmap_list_pool_count(struct vma_slot *slot,
+					    unsigned long index)
+{
+	unsigned long pool_index;
+
+	pool_index = get_pool_index(slot, index);
+	BUG_ON(!slot->rmap_list_pool[pool_index]);
+	slot->pool_counts[pool_index]++;
+}
+
+static inline void dec_rmap_list_pool_count(struct vma_slot *slot,
+					    unsigned long index)
+{
+	unsigned long pool_index;
+
+	pool_index = get_pool_index(slot, index);
+	BUG_ON(!slot->rmap_list_pool[pool_index]);
+	BUG_ON(!slot->pool_counts[pool_index]);
+	slot->pool_counts[pool_index]--;
+}
+
+static inline int entry_has_rmap(struct rmap_list_entry *entry)
+{
+	return !is_addr(entry->addr) && entry->item;
+}
+
+static inline void swap_entries(struct rmap_list_entry *entry1,
+				unsigned long index1,
+				struct rmap_list_entry *entry2,
+				unsigned long index2)
+{
+	struct rmap_list_entry tmp;
+
+	/* swapping two new entries is meaningless */
+	BUG_ON(entry_is_new(entry1) && entry_is_new(entry2));
+
+	tmp = *entry1;
+	*entry1 = *entry2;
+	*entry2 = tmp;
+
+	if (entry_has_rmap(entry1))
+		entry1->item->entry_index = index1;
+
+	if (entry_has_rmap(entry2))
+		entry2->item->entry_index = index2;
+
+	if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) {
+		inc_rmap_list_pool_count(entry1->item->slot, index1);
+		dec_rmap_list_pool_count(entry1->item->slot, index2);
+	} else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) {
+		inc_rmap_list_pool_count(entry2->item->slot, index2);
+		dec_rmap_list_pool_count(entry2->item->slot, index1);
+	}
+}
+
+static inline void free_entry_item(struct rmap_list_entry *entry)
+{
+	unsigned long index;
+	struct rmap_item *item;
+
+	if (!is_addr(entry->addr)) {
+		BUG_ON(!entry->item);
+		item = entry->item;
+		entry->addr = get_rmap_addr(item);
+		set_is_addr(entry->addr);
+		index = item->entry_index;
+		remove_rmap_item_from_tree(item);
+		dec_rmap_list_pool_count(item->slot, index);
+		free_rmap_item(item);
+	}
+}
+
+static inline int pool_entry_boundary(unsigned long index)
+{
+	unsigned long linear_addr;
+
+	linear_addr = sizeof(struct rmap_list_entry *) * index;
+	return index && !offset_in_page(linear_addr);
+}
+
+static inline void try_free_last_pool(struct vma_slot *slot,
+				      unsigned long index)
+{
+	unsigned long pool_index;
+
+	pool_index = get_pool_index(slot, index);
+	if (slot->rmap_list_pool[pool_index] &&
+	    !slot->pool_counts[pool_index]) {
+		__free_page(slot->rmap_list_pool[pool_index]);
+		slot->rmap_list_pool[pool_index] = NULL;
+		slot->flags |= UKSM_SLOT_NEED_SORT;
+	}
+
+}
+
+static inline unsigned long vma_item_index(struct vm_area_struct *vma,
+					   struct rmap_item *item)
+{
+	return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT;
+}
+
+static int within_same_pool(struct vma_slot *slot,
+			    unsigned long i, unsigned long j)
+{
+	unsigned long pool_i, pool_j;
+
+	pool_i = get_pool_index(slot, i);
+	pool_j = get_pool_index(slot, j);
+
+	return (pool_i == pool_j);
+}
+
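+/*
+ * Cycle-sort the rmap_list in place so that entry i again describes page i
+ * of the vma; address-only entries are cleared, and pool pages left holding
+ * no rmap_item are freed afterwards.
+ */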
+static void sort_rmap_entry_list(struct vma_slot *slot)
+{
+	unsigned long i, j;
+	struct rmap_list_entry *entry, *swap_entry;
+
+	entry = get_rmap_list_entry(slot, 0, 0);
+	for (i = 0; i < slot->pages; ) {
+
+		if (!entry)
+			goto skip_whole_pool;
+
+		if (entry_is_new(entry))
+			goto next_entry;
+
+		if (is_addr(entry->addr)) {
+			entry->addr = 0;
+			goto next_entry;
+		}
+
+		j = vma_item_index(slot->vma, entry->item);
+		if (j == i)
+			goto next_entry;
+
+		if (within_same_pool(slot, i, j))
+			swap_entry = entry + j - i;
+		else
+			swap_entry = get_rmap_list_entry(slot, j, 1);
+
+		swap_entries(entry, i, swap_entry, j);
+		if (!within_same_pool(slot, i, j))
+			put_rmap_list_entry(slot, j);
+		continue;
+
+skip_whole_pool:
+		i += PAGE_SIZE / sizeof(*entry);
+		if (i < slot->pages)
+			entry = get_rmap_list_entry(slot, i, 0);
+		continue;
+
+next_entry:
+		if (i >= slot->pages - 1 ||
+		    !within_same_pool(slot, i, i + 1)) {
+			put_rmap_list_entry(slot, i);
+			if (i + 1 < slot->pages)
+				entry = get_rmap_list_entry(slot, i + 1, 0);
+		} else
+			entry++;
+		i++;
+		continue;
+	}
+
+	/* free empty pool entries which contain no rmap_item */
+	/* TODO: this can be simplified to rely only on pool_counts
+	 * once it is known to be bug-free.
+	 */
+	for (i = 0; i < slot->pool_size; i++) {
+		unsigned char has_rmap;
+		void *addr;
+
+		if (!slot->rmap_list_pool[i])
+			continue;
+
+		has_rmap = 0;
+		addr = kmap(slot->rmap_list_pool[i]);
+		BUG_ON(!addr);
+		for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
+			entry = (struct rmap_list_entry *)addr + j;
+			if (is_addr(entry->addr))
+				continue;
+			if (!entry->item)
+				continue;
+			has_rmap = 1;
+		}
+		kunmap(slot->rmap_list_pool[i]);
+		if (!has_rmap) {
+			BUG_ON(slot->pool_counts[i]);
+			__free_page(slot->rmap_list_pool[i]);
+			slot->rmap_list_pool[i] = NULL;
+		}
+	}
+
+	slot->flags &= ~UKSM_SLOT_NEED_SORT;
+}
+
+/*
+ * vma_fully_scanned() - return whether all the pages in this slot have been scanned.
+ */
+static inline int vma_fully_scanned(struct vma_slot *slot)
+{
+	return slot->pages_scanned == slot->pages;
+}
+
+/**
+ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to
+ * its random permutation. The random permutation index management code
+ * is embedded in this function.
+ */
+static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash)
+{
+	unsigned long rand_range, addr, swap_index, scan_index;
+	struct rmap_item *item = NULL;
+	struct rmap_list_entry *scan_entry, *swap_entry = NULL;
+	struct page *page;
+
+	scan_index = swap_index = slot->pages_scanned % slot->pages;
+
+	if (pool_entry_boundary(scan_index))
+		try_free_last_pool(slot, scan_index - 1);
+
+	if (vma_fully_scanned(slot)) {
+		if (slot->flags & UKSM_SLOT_NEED_SORT)
+			slot->flags |= UKSM_SLOT_NEED_RERAND;
+		else
+			slot->flags &= ~UKSM_SLOT_NEED_RERAND;
+		if (slot->flags & UKSM_SLOT_NEED_SORT)
+			sort_rmap_entry_list(slot);
+	}
+
+	scan_entry = get_rmap_list_entry(slot, scan_index, 1);
+	if (!scan_entry)
+		return NULL;
+
+	if (entry_is_new(scan_entry)) {
+		scan_entry->addr = get_index_orig_addr(slot, scan_index);
+		set_is_addr(scan_entry->addr);
+	}
+
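+	/*
+	 * When re-randomizing, pick a random index among the not-yet-scanned
+	 * entries and swap it into the current scan position (Fisher-Yates
+	 * style).
+	 */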
+	if (slot->flags & UKSM_SLOT_NEED_RERAND) {
+		rand_range = slot->pages - scan_index;
+		BUG_ON(!rand_range);
+		swap_index = scan_index + (prandom_u32() % rand_range);
+	}
+
+	if (swap_index != scan_index) {
+		swap_entry = get_rmap_list_entry(slot, swap_index, 1);
+		if (entry_is_new(swap_entry)) {
+			swap_entry->addr = get_index_orig_addr(slot,
+							       swap_index);
+			set_is_addr(swap_entry->addr);
+		}
+		swap_entries(scan_entry, scan_index, swap_entry, swap_index);
+	}
+
+	addr = get_entry_address(scan_entry);
+	item = get_entry_item(scan_entry);
+	BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start);
+
+	page = follow_page(slot->vma, addr, FOLL_GET);
+	if (IS_ERR_OR_NULL(page))
+		goto nopage;
+
+	if (!PageAnon(page))
+		goto putpage;
+
+	/* check whether this is the zero_page pfn or the uksm_zero_page */
+	if ((page_to_pfn(page) == zero_pfn)
+			|| (page_to_pfn(page) == uksm_zero_pfn))
+		goto putpage;
+
+	flush_anon_page(slot->vma, page, addr);
+	flush_dcache_page(page);
+
+
+	*hash = page_hash(page, hash_strength, 1);
+	inc_uksm_pages_scanned();
+	/* if the page content is all zero, re-map it to the zero page */
+	if (find_zero_page_hash(hash_strength, *hash)) {
+		if (!cmp_and_merge_zero_page(slot->vma, page)) {
+			slot->pages_merged++;
+
+			/* For full-zero pages, no need to create rmap item */
+			goto putpage;
+		} else {
+			inc_rshash_neg(memcmp_cost / 2);
+		}
+	}
+
+	if (!item) {
+		item = alloc_rmap_item();
+		if (item) {
+			/* It has already been zeroed */
+			item->slot = slot;
+			item->address = addr;
+			item->entry_index = scan_index;
+			scan_entry->item = item;
+			inc_rmap_list_pool_count(slot, scan_index);
+		} else
+			goto putpage;
+	}
+
+	BUG_ON(item->slot != slot);
+	/* the page may have changed */
+	item->page = page;
+	put_rmap_list_entry(slot, scan_index);
+	if (swap_entry)
+		put_rmap_list_entry(slot, swap_index);
+	return item;
+
+putpage:
+	put_page(page);
+	page = NULL;
+nopage:
+	/* no page, store addr back and free rmap_item if possible */
+	free_entry_item(scan_entry);
+	put_rmap_list_entry(slot, scan_index);
+	if (swap_entry)
+		put_rmap_list_entry(slot, swap_index);
+	return NULL;
+}
+
+static inline int in_stable_tree(struct rmap_item *rmap_item)
+{
+	return rmap_item->address & STABLE_FLAG;
+}
+
+/**
+ * scan_vma_one_page() - scan the next page in a vma_slot. Called with
+ * mmap_sem locked.
+ */
+static noinline void scan_vma_one_page(struct vma_slot *slot)
+{
+	u32 hash;
+	struct mm_struct *mm;
+	struct rmap_item *rmap_item = NULL;
+	struct vm_area_struct *vma = slot->vma;
+
+	mm = vma->vm_mm;
+	BUG_ON(!mm);
+	BUG_ON(!slot);
+
+	rmap_item = get_next_rmap_item(slot, &hash);
+	if (!rmap_item)
+		goto out1;
+
+	if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item))
+		goto out2;
+
+	cmp_and_merge_page(rmap_item, hash);
+out2:
+	put_page(rmap_item->page);
+out1:
+	slot->pages_scanned++;
+	slot->this_sampled++;
+	if (slot->fully_scanned_round != fully_scanned_round)
+		scanned_virtual_pages++;
+
+	if (vma_fully_scanned(slot))
+		slot->fully_scanned_round = fully_scanned_round;
+}
+
+static inline unsigned long rung_get_pages(struct scan_rung *rung)
+{
+	struct slot_tree_node *node;
+
+	if (!rung->vma_root.rnode)
+		return 0;
+
+	node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode);
+
+	return node->size;
+}
+
+#define RUNG_SAMPLED_MIN	3
+
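+/*
+ * Choose the sampling stride for a rung so that one pass over it touches
+ * roughly as many pages as can be scanned within cover_msecs, given the
+ * rung's cpu ratio and the measured per-page scan time.
+ */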
+static inline
+void uksm_calc_rung_step(struct scan_rung *rung,
+			 unsigned long page_time, unsigned long ratio)
+{
+	unsigned long sampled, pages;
+
+	/* will it be fully scanned? */
+	if (!rung->cover_msecs) {
+		rung->step = 1;
+		return;
+	}
+
+	sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE)
+		  * ratio / page_time;
+
+	/*
+	 * Before we finish a scan round and its expensive per-round jobs,
+	 * we need a chance to estimate the per-page time. So the sampled
+	 * number cannot be too small.
+	 */
+	if (sampled < RUNG_SAMPLED_MIN)
+		sampled = RUNG_SAMPLED_MIN;
+
+	pages = rung_get_pages(rung);
+	if (likely(pages > sampled))
+		rung->step = pages / sampled;
+	else
+		rung->step = 1;
+}
+
+static inline int step_need_recalc(struct scan_rung *rung)
+{
+	unsigned long pages, stepmax;
+
+	pages = rung_get_pages(rung);
+	stepmax = pages / RUNG_SAMPLED_MIN;
+
+	return pages && (rung->step > pages ||
+			 (stepmax && rung->step > stepmax));
+}
+
+static inline
+void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc)
+{
+	struct vma_slot *slot;
+
+	if (finished)
+		rung->flags |= UKSM_RUNG_ROUND_FINISHED;
+
+	if (step_recalc || step_need_recalc(rung)) {
+		uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
+		BUG_ON(step_need_recalc(rung));
+	}
+
+	slot_iter_index = prandom_u32() % rung->step;
+	BUG_ON(!rung->vma_root.rnode);
+	slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter);
+	BUG_ON(!slot);
+
+	rung->current_scan = slot;
+	rung->current_offset = slot_iter_index;
+}
+
+static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot)
+{
+	return &slot->rung->vma_root;
+}
+
+/*
+ * Return 1 if the scan position wrapped around and was reset, 0 otherwise.
+ */
+static int advance_current_scan(struct scan_rung *rung)
+{
+	unsigned short n;
+	struct vma_slot *slot, *next = NULL;
+
+	BUG_ON(!rung->vma_root.num);
+
+	slot = rung->current_scan;
+	n = (slot->pages - rung->current_offset) % rung->step;
+	slot_iter_index = rung->step - n;
+	next = sradix_tree_next(&rung->vma_root, slot->snode,
+				slot->sindex, slot_iter);
+
+	if (next) {
+		rung->current_offset = slot_iter_index;
+		rung->current_scan = next;
+		return 0;
+	} else {
+		reset_current_scan(rung, 1, 0);
+		return 1;
+	}
+}
+
+static inline void rung_rm_slot(struct vma_slot *slot)
+{
+	struct scan_rung *rung = slot->rung;
+	struct sradix_tree_root *root;
+
+	if (rung->current_scan == slot)
+		advance_current_scan(rung);
+
+	root = slot_get_root(slot);
+	sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex);
+	slot->snode = NULL;
+	if (step_need_recalc(rung)) {
+		uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio);
+		BUG_ON(step_need_recalc(rung));
+	}
+
+	/* In case advance_current_scan() loops back to this slot again */
+	if (rung->vma_root.num && rung->current_scan == slot)
+		reset_current_scan(slot->rung, 1, 0);
+}
+
+static inline void rung_add_new_slots(struct scan_rung *rung,
+			struct vma_slot **slots, unsigned long num)
+{
+	int err;
+	struct vma_slot *slot;
+	unsigned long i;
+	struct sradix_tree_root *root = &rung->vma_root;
+
+	err = sradix_tree_enter(root, (void **)slots, num);
+	BUG_ON(err);
+
+	for (i = 0; i < num; i++) {
+		slot = slots[i];
+		slot->rung = rung;
+		BUG_ON(vma_fully_scanned(slot));
+	}
+
+	if (rung->vma_root.num == num)
+		reset_current_scan(rung, 0, 1);
+}
+
+static inline int rung_add_one_slot(struct scan_rung *rung,
+				     struct vma_slot *slot)
+{
+	int err;
+
+	err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1);
+	if (err)
+		return err;
+
+	slot->rung = rung;
+	if (rung->vma_root.num == 1)
+		reset_current_scan(rung, 0, 1);
+
+	return 0;
+}
+
+/*
+ * Return true if the slot is deleted from its rung.
+ */
+static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung)
+{
+	struct scan_rung *old_rung = slot->rung;
+	int err;
+
+	if (old_rung == rung)
+		return 0;
+
+	rung_rm_slot(slot);
+	err = rung_add_one_slot(rung, slot);
+	if (err) {
+		err = rung_add_one_slot(old_rung, slot);
+		WARN_ON(err); /* OOPS, we are badly OOMed; we lost this slot */
+	}
+
+	return 1;
+}
+
+static inline int vma_rung_up(struct vma_slot *slot)
+{
+	struct scan_rung *rung;
+
+	rung = slot->rung;
+	if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1])
+		rung++;
+
+	return vma_rung_enter(slot, rung);
+}
+
+static inline int vma_rung_down(struct vma_slot *slot)
+{
+	struct scan_rung *rung;
+
+	rung = slot->rung;
+	if (slot->rung != &uksm_scan_ladder[0])
+		rung--;
+
+	return vma_rung_enter(slot, rung);
+}
+
+/**
+ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot.
+ */
+static unsigned long cal_dedup_ratio(struct vma_slot *slot)
+{
+	unsigned long ret;
+	unsigned long pages;
+
+	pages = slot->this_sampled;
+	if (!pages)
+		return 0;
+
+	BUG_ON(slot->pages_scanned == slot->last_scanned);
+
+	ret = slot->pages_merged;
+
+	/* Thrashing area filtering */
+	if (ret && uksm_thrash_threshold) {
+		if (slot->pages_cowed * 100 / slot->pages_merged
+		    > uksm_thrash_threshold) {
+			ret = 0;
+		} else {
+			ret = slot->pages_merged - slot->pages_cowed;
+		}
+	}
+
+	return ret * 100 / pages;
+}
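+
+/*
+ * For example (numbers purely for illustration): with this_sampled = 200,
+ * pages_merged = 50, pages_cowed = 10 and uksm_thrash_threshold = 20, the
+ * cowed share is 10 * 100 / 50 = 20, which does not exceed the threshold,
+ * so ret = 50 - 10 = 40 and the reported ratio is 40 * 100 / 200 = 20.
+ */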
+
+/**
+ * cal_dedup_ratio_old() - Calculate the deduplication ratio for this slot
+ * from the pages merged into it by others (pages_bemerged).
+ */
+static unsigned long cal_dedup_ratio_old(struct vma_slot *slot)
+{
+	unsigned long ret;
+	unsigned long pages;
+
+	pages = slot->pages;
+	if (!pages)
+		return 0;
+
+	ret = slot->pages_bemerged;
+
+	/* Thrashing area filtering */
+	if (ret && uksm_thrash_threshold) {
+		if (slot->pages_cowed * 100 / slot->pages_bemerged
+		    > uksm_thrash_threshold) {
+			ret = 0;
+		} else {
+			ret = slot->pages_bemerged - slot->pages_cowed;
+		}
+	}
+
+	return ret * 100 / pages;
+}
+
+/**
+ * stable_node_reinsert() - When the hash_strength has been adjusted, the
+ * stable tree needs to be restructured; this function re-inserts a stable
+ * node into the rebuilt tree.
+ */
+static inline void stable_node_reinsert(struct stable_node *new_node,
+					struct page *page,
+					struct rb_root *root_treep,
+					struct list_head *tree_node_listp,
+					u32 hash)
+{
+	struct rb_node **new = &root_treep->rb_node;
+	struct rb_node *parent = NULL;
+	struct stable_node *stable_node;
+	struct tree_node *tree_node;
+	struct page *tree_page;
+	int cmp;
+
+	while (*new) {
+		int cmp;
+
+		tree_node = rb_entry(*new, struct tree_node, node);
+
+		cmp = hash_cmp(hash, tree_node->hash);
+
+		if (cmp < 0) {
+			parent = *new;
+			new = &parent->rb_left;
+		} else if (cmp > 0) {
+			parent = *new;
+			new = &parent->rb_right;
+		} else
+			break;
+	}
+
+	if (*new) {
+		/* found a stable tree node with the same first-level hash value */
+		stable_node_hash_max(new_node, page, hash);
+		if (tree_node->count == 1) {
+			stable_node = rb_entry(tree_node->sub_root.rb_node,
+					       struct stable_node, node);
+			tree_page = get_uksm_page(stable_node, 1, 0);
+			if (tree_page) {
+				stable_node_hash_max(stable_node,
+						      tree_page, hash);
+				put_page(tree_page);
+
+				/* prepare for stable node insertion */
+
+				cmp = hash_cmp(new_node->hash_max,
+						   stable_node->hash_max);
+				parent = &stable_node->node;
+				if (cmp < 0)
+					new = &parent->rb_left;
+				else if (cmp > 0)
+					new = &parent->rb_right;
+				else
+					goto failed;
+
+				goto add_node;
+			} else {
+				/* The only stable_node was deleted, but the
+				 * tree_node itself was not.
+				 */
+				goto tree_node_reuse;
+			}
+		}
+
+		/* well, search the collision subtree */
+		new = &tree_node->sub_root.rb_node;
+		parent = NULL;
+		BUG_ON(!*new);
+		while (*new) {
+			int cmp;
+
+			stable_node = rb_entry(*new, struct stable_node, node);
+
+			cmp = hash_cmp(new_node->hash_max,
+					   stable_node->hash_max);
+
+			if (cmp < 0) {
+				parent = *new;
+				new = &parent->rb_left;
+			} else if (cmp > 0) {
+				parent = *new;
+				new = &parent->rb_right;
+			} else {
+				/* oh, no, still a collision */
+				goto failed;
+			}
+		}
+
+		goto add_node;
+	}
+
+	/* no tree node found */
+	tree_node = alloc_tree_node(tree_node_listp);
+	if (!tree_node) {
+		printk(KERN_ERR "UKSM: memory allocation error!\n");
+		goto failed;
+	} else {
+		tree_node->hash = hash;
+		rb_link_node(&tree_node->node, parent, new);
+		rb_insert_color(&tree_node->node, root_treep);
+
+tree_node_reuse:
+		/* prepare for stable node insertion */
+		parent = NULL;
+		new = &tree_node->sub_root.rb_node;
+	}
+
+add_node:
+	rb_link_node(&new_node->node, parent, new);
+	rb_insert_color(&new_node->node, &tree_node->sub_root);
+	new_node->tree_node = tree_node;
+	tree_node->count++;
+	return;
+
+failed:
+	/* This can only happen when two nodes have collided
+	 * at both hash levels.
+	 */
+	new_node->tree_node = NULL;
+	return;
+}
+
+static inline void free_all_tree_nodes(struct list_head *list)
+{
+	struct tree_node *node, *tmp;
+
+	list_for_each_entry_safe(node, tmp, list, all_list) {
+		free_tree_node(node);
+	}
+}
+
+/**
+ * stable_tree_delta_hash() - Delta-hash the stable tree from the previous
+ * hash strength to the current hash_strength. It re-structures the whole
+ * tree.
+ */
+static inline void stable_tree_delta_hash(u32 prev_hash_strength)
+{
+	struct stable_node *node, *tmp;
+	struct rb_root *root_new_treep;
+	struct list_head *new_tree_node_listp;
+
+	stable_tree_index = (stable_tree_index + 1) % 2;
+	root_new_treep = &root_stable_tree[stable_tree_index];
+	new_tree_node_listp = &stable_tree_node_list[stable_tree_index];
+	*root_new_treep = RB_ROOT;
+	BUG_ON(!list_empty(new_tree_node_listp));
+
+	/*
+	 * We must iterate safely: the node could be removed by get_uksm_page().
+	 */
+	list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) {
+		void *addr;
+		struct page *node_page;
+		u32 hash;
+
+		/*
+		 * We are completely re-structuring the stable nodes into a new
+		 * stable tree. We don't bother unlinking them from the old tree
+		 * or touching the old tree_nodes; the old tree_nodes will all
+		 * be freed at once.
+		 */
+		node_page = get_uksm_page(node, 0, 0);
+		if (!node_page)
+			continue;
+
+		if (node->tree_node) {
+			hash = node->tree_node->hash;
+
+			addr = kmap_atomic(node_page);
+
+			hash = delta_hash(addr, prev_hash_strength,
+					  hash_strength, hash);
+			kunmap_atomic(addr);
+		} else {
+			/*
+			 * It was not inserted into the rbtree due to a
+			 * collision in the last scan round.
+			 */
+			hash = page_hash(node_page, hash_strength, 0);
+		}
+
+		stable_node_reinsert(node, node_page, root_new_treep,
+				     new_tree_node_listp, hash);
+		put_page(node_page);
+	}
+
+	root_stable_treep = root_new_treep;
+	free_all_tree_nodes(stable_tree_node_listp);
+	BUG_ON(!list_empty(stable_tree_node_listp));
+	stable_tree_node_listp = new_tree_node_listp;
+}
+
+static inline void inc_hash_strength(unsigned long delta)
+{
+	hash_strength += 1 << delta;
+	if (hash_strength > HASH_STRENGTH_MAX)
+		hash_strength = HASH_STRENGTH_MAX;
+}
+
+static inline void dec_hash_strength(unsigned long delta)
+{
+	unsigned long change = 1 << delta;
+
+	if (hash_strength <= change + 1)
+		hash_strength = 1;
+	else
+		hash_strength -= change;
+}
+
+static inline void inc_hash_strength_delta(void)
+{
+	hash_strength_delta++;
+	if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX)
+		hash_strength_delta = HASH_STRENGTH_DELTA_MAX;
+}
+
+/*
+static inline unsigned long get_current_neg_ratio(void)
+{
+	if (!rshash_pos || rshash_neg > rshash_pos)
+		return 100;
+
+	return div64_u64(100 * rshash_neg , rshash_pos);
+}
+*/
+
+static inline unsigned long get_current_neg_ratio(void)
+{
+	u64 pos = benefit.pos;
+	u64 neg = benefit.neg;
+
+	if (!neg)
+		return 0;
+
+	if (!pos || neg > pos)
+		return 100;
+
+	if (neg > div64_u64(U64_MAX, 100))
+		pos = div64_u64(pos, 100);
+	else
+		neg *= 100;
+
+	return div64_u64(neg, pos);
+}
+
+static inline unsigned long get_current_benefit(void)
+{
+	u64 pos = benefit.pos;
+	u64 neg = benefit.neg;
+	u64 scanned = benefit.scanned;
+
+	if (neg > pos)
+		return 0;
+
+	return div64_u64((pos - neg), scanned);
+}
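+
+/*
+ * As a rough illustration: if the accumulated positive (dedup) benefit of
+ * this round is 3000, the negative (wasted hashing) cost is 600 and 1200
+ * pages were scanned, the negative ratio is 600 * 100 / 3000 = 20 and the
+ * per-scanned-page benefit is (3000 - 600) / 1200 = 2.
+ */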
+
+static inline int judge_rshash_direction(void)
+{
+	u64 current_neg_ratio, stable_benefit;
+	u64 current_benefit, delta = 0;
+	int ret = STILL;
+
+	/* Try to probe a value shortly after boot, and again in case the
+	 * system stays idle for a long time. */
+	if ((fully_scanned_round & 0xFFULL) == 10) {
+		ret = OBSCURE;
+		goto out;
+	}
+
+	current_neg_ratio = get_current_neg_ratio();
+
+	if (current_neg_ratio == 0) {
+		rshash_neg_cont_zero++;
+		if (rshash_neg_cont_zero > 2)
+			return GO_DOWN;
+		else
+			return STILL;
+	}
+	rshash_neg_cont_zero = 0;
+
+	if (current_neg_ratio > 90) {
+		ret = GO_UP;
+		goto out;
+	}
+
+	current_benefit = get_current_benefit();
+	stable_benefit = rshash_state.stable_benefit;
+
+	if (!stable_benefit) {
+		ret = OBSCURE;
+		goto out;
+	}
+
+	if (current_benefit > stable_benefit)
+		delta = current_benefit - stable_benefit;
+	else if (current_benefit < stable_benefit)
+		delta = stable_benefit - current_benefit;
+
+	delta = div64_u64(100 * delta, stable_benefit);
+
+	if (delta > 50) {
+		rshash_cont_obscure++;
+		if (rshash_cont_obscure > 2)
+			return OBSCURE;
+		else
+			return STILL;
+	}
+
+out:
+	rshash_cont_obscure = 0;
+	return ret;
+}
+
+/**
+ * rshash_adjust() - The main function controlling the random sampling state
+ * machine that adapts the hash strength.
+ *
+ * Return true if hash_strength has changed.
+ */
+static inline int rshash_adjust(void)
+{
+	unsigned long prev_hash_strength = hash_strength;
+
+	if (!encode_benefit())
+		return 0;
+
+	switch (rshash_state.state) {
+	case RSHASH_STILL:
+		switch (judge_rshash_direction()) {
+		case GO_UP:
+			if (rshash_state.pre_direct == GO_DOWN)
+				hash_strength_delta = 0;
+
+			inc_hash_strength(hash_strength_delta);
+			inc_hash_strength_delta();
+			rshash_state.stable_benefit = get_current_benefit();
+			rshash_state.pre_direct = GO_UP;
+			break;
+
+		case GO_DOWN:
+			if (rshash_state.pre_direct == GO_UP)
+				hash_strength_delta = 0;
+
+			dec_hash_strength(hash_strength_delta);
+			inc_hash_strength_delta();
+			rshash_state.stable_benefit = get_current_benefit();
+			rshash_state.pre_direct = GO_DOWN;
+			break;
+
+		case OBSCURE:
+			rshash_state.stable_point = hash_strength;
+			rshash_state.turn_point_down = hash_strength;
+			rshash_state.turn_point_up = hash_strength;
+			rshash_state.turn_benefit_down = get_current_benefit();
+			rshash_state.turn_benefit_up = get_current_benefit();
+			rshash_state.lookup_window_index = 0;
+			rshash_state.state = RSHASH_TRYDOWN;
+			dec_hash_strength(hash_strength_delta);
+			inc_hash_strength_delta();
+			break;
+
+		case STILL:
+			break;
+		default:
+			BUG();
+		}
+		break;
+
+	case RSHASH_TRYDOWN:
+		if (rshash_state.lookup_window_index++ % 5 == 0)
+			rshash_state.below_count = 0;
+
+		if (get_current_benefit() < rshash_state.stable_benefit)
+			rshash_state.below_count++;
+		else if (get_current_benefit() >
+			 rshash_state.turn_benefit_down) {
+			rshash_state.turn_point_down = hash_strength;
+			rshash_state.turn_benefit_down = get_current_benefit();
+		}
+
+		if (rshash_state.below_count >= 3 ||
+		    judge_rshash_direction() == GO_UP ||
+		    hash_strength == 1) {
+			hash_strength = rshash_state.stable_point;
+			hash_strength_delta = 0;
+			inc_hash_strength(hash_strength_delta);
+			inc_hash_strength_delta();
+			rshash_state.lookup_window_index = 0;
+			rshash_state.state = RSHASH_TRYUP;
+			hash_strength_delta = 0;
+		} else {
+			dec_hash_strength(hash_strength_delta);
+			inc_hash_strength_delta();
+		}
+		break;
+
+	case RSHASH_TRYUP:
+		if (rshash_state.lookup_window_index++ % 5 == 0)
+			rshash_state.below_count = 0;
+
+		if (get_current_benefit() < rshash_state.turn_benefit_down)
+			rshash_state.below_count++;
+		else if (get_current_benefit() > rshash_state.turn_benefit_up) {
+			rshash_state.turn_point_up = hash_strength;
+			rshash_state.turn_benefit_up = get_current_benefit();
+		}
+
+		if (rshash_state.below_count >= 3 ||
+		    judge_rshash_direction() == GO_DOWN ||
+		    hash_strength == HASH_STRENGTH_MAX) {
+			hash_strength = rshash_state.turn_benefit_up >
+				rshash_state.turn_benefit_down ?
+				rshash_state.turn_point_up :
+				rshash_state.turn_point_down;
+
+			rshash_state.state = RSHASH_PRE_STILL;
+		} else {
+			inc_hash_strength(hash_strength_delta);
+			inc_hash_strength_delta();
+		}
+
+		break;
+
+	case RSHASH_NEW:
+	case RSHASH_PRE_STILL:
+		rshash_state.stable_benefit = get_current_benefit();
+		rshash_state.state = RSHASH_STILL;
+		hash_strength_delta = 0;
+		break;
+	default:
+		BUG();
+	}
+
+	/* rshash_neg = rshash_pos = 0; */
+	reset_benefit();
+
+	if (prev_hash_strength != hash_strength)
+		stable_tree_delta_hash(prev_hash_strength);
+
+	return prev_hash_strength != hash_strength;
+}
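+
+/*
+ * In short, the sampling state machine cycles through RSHASH_STILL ->
+ * RSHASH_TRYDOWN -> RSHASH_TRYUP -> RSHASH_PRE_STILL -> RSHASH_STILL: from
+ * a stable point it first probes weaker hash strengths, then stronger
+ * ones, and finally settles on whichever turning point yielded the better
+ * benefit.
+ */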
+
+/**
+ * round_update_ladder() - Update all the per-rung adjustments whenever a
+ * scan round is finished.
+ */
+static noinline void round_update_ladder(void)
+{
+	int i;
+	unsigned long dedup;
+	struct vma_slot *slot, *tmp_slot;
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED;
+	}
+
+	list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) {
+
+		/* slot may be rung_rm_slot() when mm exits */
+		if (slot->snode) {
+			dedup = cal_dedup_ratio_old(slot);
+			if (dedup && dedup >= uksm_abundant_threshold)
+				vma_rung_up(slot);
+		}
+
+		slot->pages_bemerged = 0;
+		slot->pages_cowed = 0;
+
+		list_del_init(&slot->dedup_list);
+	}
+}
+
+static void uksm_del_vma_slot(struct vma_slot *slot)
+{
+	int i, j;
+	struct rmap_list_entry *entry;
+
+	if (slot->snode) {
+		/*
+		 * Only needed when the slot actually entered a rung; it may
+		 * have failed while entering, in which case removal is
+		 * unnecessary.
+		 */
+		rung_rm_slot(slot);
+	}
+
+	if (!list_empty(&slot->dedup_list))
+		list_del(&slot->dedup_list);
+
+	if (!slot->rmap_list_pool || !slot->pool_counts) {
+		/* In case it OOMed in uksm_vma_enter() */
+		goto out;
+	}
+
+	for (i = 0; i < slot->pool_size; i++) {
+		void *addr;
+
+		if (!slot->rmap_list_pool[i])
+			continue;
+
+		addr = kmap(slot->rmap_list_pool[i]);
+		for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) {
+			entry = (struct rmap_list_entry *)addr + j;
+			if (is_addr(entry->addr))
+				continue;
+			if (!entry->item)
+				continue;
+
+			remove_rmap_item_from_tree(entry->item);
+			free_rmap_item(entry->item);
+			slot->pool_counts[i]--;
+		}
+		BUG_ON(slot->pool_counts[i]);
+		kunmap(slot->rmap_list_pool[i]);
+		__free_page(slot->rmap_list_pool[i]);
+	}
+	kfree(slot->rmap_list_pool);
+	kfree(slot->pool_counts);
+
+out:
+	slot->rung = NULL;
+	if (slot->flags & UKSM_SLOT_IN_UKSM) {
+		BUG_ON(uksm_pages_total < slot->pages);
+		uksm_pages_total -= slot->pages;
+	}
+
+	if (slot->fully_scanned_round == fully_scanned_round)
+		scanned_virtual_pages -= slot->pages;
+	else
+		scanned_virtual_pages -= slot->pages_scanned;
+	free_vma_slot(slot);
+}
+
+
+#define SPIN_LOCK_PERIOD	32
+static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD];
+static inline void cleanup_vma_slots(void)
+{
+	struct vma_slot *slot;
+	int i;
+
+	i = 0;
+	spin_lock(&vma_slot_list_lock);
+	while (!list_empty(&vma_slot_del)) {
+		slot = list_entry(vma_slot_del.next,
+				  struct vma_slot, slot_list);
+		list_del(&slot->slot_list);
+		cleanup_slots[i++] = slot;
+		if (i == SPIN_LOCK_PERIOD) {
+			spin_unlock(&vma_slot_list_lock);
+			while (--i >= 0)
+				uksm_del_vma_slot(cleanup_slots[i]);
+			i = 0;
+			spin_lock(&vma_slot_list_lock);
+		}
+	}
+	spin_unlock(&vma_slot_list_lock);
+
+	while (--i >= 0)
+		uksm_del_vma_slot(cleanup_slots[i]);
+}
+
+/*
+ * Exponential moving average formula.
+ */
+static inline unsigned long ema(unsigned long curr, unsigned long last_ema)
+{
+	/*
+	 * For a very high burst, even the EMA cannot work well: a falsely
+	 * high per-page time estimate feeds back into very high context
+	 * switch and rung update overhead, which inflates the per-page time
+	 * further, so the estimate may never converge.
+	 *
+	 * Instead, we approach such a value in a binary manner.
+	 */
+	if (curr > last_ema * 10)
+		return last_ema * 2;
+
+	return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100;
+}
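+
+/*
+ * For instance, with EMA_ALPHA at, say, 20, a new sample of 300 against a
+ * previous average of 100 yields (20 * 300 + 80 * 100) / 100 = 140, while
+ * a burst sample larger than 10x the previous average (e.g. 5000) is
+ * damped to just 2 * 100 = 200.
+ */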
+
+/*
+ * Convert a cpu ratio (in units of 1/TIME_RATIO_SCALE, as configured by the
+ * user) to a per-period CPU budget in nanoseconds, based on the current
+ * uksm_sleep_jiffies.
+ */
+static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio)
+{
+	return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) /
+		(TIME_RATIO_SCALE - ratio) * ratio;
+}
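+
+/*
+ * For example, if uksm_sleep_jiffies corresponds to 20 ms and ratio is a
+ * quarter of TIME_RATIO_SCALE, the budget is 20 ms * (1/4) / (3/4), i.e.
+ * roughly 6.7 ms of scanning time per sleep period.
+ */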
+
+
+static inline unsigned long rung_real_ratio(int cpu_time_ratio)
+{
+	unsigned long ret;
+
+	BUG_ON(!cpu_time_ratio);
+
+	if (cpu_time_ratio > 0)
+		ret = cpu_time_ratio;
+	else
+		ret = (unsigned long)(-cpu_time_ratio) *
+			uksm_max_cpu_percentage / 100UL;
+
+	return ret ? ret : 1;
+}
+
+static noinline void uksm_calc_scan_pages(void)
+{
+	struct scan_rung *ladder = uksm_scan_ladder;
+	unsigned long sleep_usecs, nsecs;
+	unsigned long ratio;
+	int i;
+	unsigned long per_page;
+
+	if (uksm_ema_page_time > 100000 ||
+	    (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL))
+		uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT;
+
+	per_page = uksm_ema_page_time;
+	BUG_ON(!per_page);
+
+	/*
+	 * Every 8 eval rounds, we try to re-probe a uksm_sleep_jiffies value
+	 * based on the saved user input.
+	 */
+	if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL)
+		uksm_sleep_jiffies = uksm_sleep_saved;
+
+	/* We require each rung to scan at least one page per period. */
+	nsecs = per_page;
+	ratio = rung_real_ratio(ladder[0].cpu_ratio);
+	if (cpu_ratio_to_nsec(ratio) < nsecs) {
+		sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio
+				/ NSEC_PER_USEC;
+		uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1;
+	}
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		ratio = rung_real_ratio(ladder[i].cpu_ratio);
+		ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) /
+					per_page;
+		BUG_ON(!ladder[i].pages_to_scan);
+		uksm_calc_rung_step(&ladder[i], per_page, ratio);
+	}
+}
+
+/*
+ * Convert the scan time of this round (in ns) to the next expected minimum
+ * sleep time (in ms), being careful of possible overflows. ratio is taken
+ * from rung_real_ratio().
+ */
+static inline
+unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio)
+{
+	scan_time >>= 20; /* to msec level now */
+	BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE));
+
+	return (unsigned int) ((unsigned long) scan_time *
+			       (TIME_RATIO_SCALE - ratio) / ratio);
+}
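+
+/*
+ * E.g. with the same quarter ratio as above, a round that took 12 ms of
+ * CPU time maps to a minimum sleep of about 12 * (3/4) / (1/4) = 36 ms,
+ * which keeps the scanner close to its configured CPU share.
+ */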
+
+#define __round_mask(x, y) ((__typeof__(x))((y)-1))
+#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
+
+static void uksm_vma_enter(struct vma_slot **slots, unsigned long num)
+{
+	struct scan_rung *rung;
+
+	rung = &uksm_scan_ladder[0];
+	rung_add_new_slots(rung, slots, num);
+}
+
+static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE];
+
+static void uksm_enter_all_slots(void)
+{
+	struct vma_slot *slot;
+	unsigned long index;
+	struct list_head empty_vma_list;
+	int i;
+
+	i = 0;
+	index = 0;
+	INIT_LIST_HEAD(&empty_vma_list);
+
+	spin_lock(&vma_slot_list_lock);
+	while (!list_empty(&vma_slot_new)) {
+		slot = list_entry(vma_slot_new.next,
+				  struct vma_slot, slot_list);
+
+		if (!slot->vma->anon_vma) {
+			list_move(&slot->slot_list, &empty_vma_list);
+		} else if (vma_can_enter(slot->vma)) {
+			batch_slots[index++] = slot;
+			list_del_init(&slot->slot_list);
+		} else {
+			list_move(&slot->slot_list, &vma_slot_noadd);
+		}
+
+		if (++i == SPIN_LOCK_PERIOD ||
+		    (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) {
+			spin_unlock(&vma_slot_list_lock);
+
+			if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) {
+				uksm_vma_enter(batch_slots, index);
+				index = 0;
+			}
+			i = 0;
+			cond_resched();
+			spin_lock(&vma_slot_list_lock);
+		}
+	}
+
+	list_splice(&empty_vma_list, &vma_slot_new);
+
+	spin_unlock(&vma_slot_list_lock);
+
+	if (index)
+		uksm_vma_enter(batch_slots, index);
+
+}
+
+static inline int rung_round_finished(struct scan_rung *rung)
+{
+	return rung->flags & UKSM_RUNG_ROUND_FINISHED;
+}
+
+static inline void judge_slot(struct vma_slot *slot)
+{
+	struct scan_rung *rung = slot->rung;
+	unsigned long dedup;
+	int deleted;
+
+	dedup = cal_dedup_ratio(slot);
+	if (vma_fully_scanned(slot) && uksm_thrash_threshold)
+		deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]);
+	else if (dedup && dedup >= uksm_abundant_threshold)
+		deleted = vma_rung_up(slot);
+	else
+		deleted = vma_rung_down(slot);
+
+	slot->pages_merged = 0;
+	slot->pages_cowed = 0;
+	slot->this_sampled = 0;
+
+	if (vma_fully_scanned(slot)) {
+		slot->pages_scanned = 0;
+	}
+
+	slot->last_scanned = slot->pages_scanned;
+
+	/* If it was deleted above, the rung has already been advanced. */
+	if (!deleted)
+		advance_current_scan(rung);
+}
+
+
+static inline int hash_round_finished(void)
+{
+	if (scanned_virtual_pages > (uksm_pages_total >> 2)) {
+		scanned_virtual_pages = 0;
+		if (uksm_pages_scanned)
+			fully_scanned_round++;
+
+		return 1;
+	} else {
+		return 0;
+	}
+}
+
+#define UKSM_MMSEM_BATCH	5
+#define BUSY_RETRY		100
+
+/**
+ * uksm_do_scan() - the main worker function.
+ */
+static noinline void uksm_do_scan(void)
+{
+	struct vma_slot *slot, *iter;
+	struct mm_struct *busy_mm;
+	unsigned char round_finished, all_rungs_empty;
+	int i, err, mmsem_batch;
+	unsigned long pcost;
+	long long delta_exec;
+	unsigned long vpages, max_cpu_ratio;
+	unsigned long long start_time, end_time, scan_time;
+	unsigned int expected_jiffies;
+
+	might_sleep();
+
+	vpages = 0;
+
+	start_time = task_sched_runtime(current);
+	max_cpu_ratio = 0;
+	mmsem_batch = 0;
+
+	for (i = 0; i < SCAN_LADDER_SIZE;) {
+		struct scan_rung *rung = &uksm_scan_ladder[i];
+		unsigned long ratio;
+		int busy_retry;
+
+		if (!rung->pages_to_scan) {
+			i++;
+			continue;
+		}
+
+		if (!rung->vma_root.num) {
+			rung->pages_to_scan = 0;
+			i++;
+			continue;
+		}
+
+		ratio = rung_real_ratio(rung->cpu_ratio);
+		if (ratio > max_cpu_ratio)
+			max_cpu_ratio = ratio;
+
+		busy_retry = BUSY_RETRY;
+		/*
+		 * Do not consider rung_round_finished() here, just use up the
+		 * rung->pages_to_scan quota.
+		 */
+		while (rung->pages_to_scan && rung->vma_root.num &&
+		       likely(!freezing(current))) {
+			int reset = 0;
+
+			slot = rung->current_scan;
+
+			BUG_ON(vma_fully_scanned(slot));
+
+			if (mmsem_batch) {
+				err = 0;
+			} else {
+				err = try_down_read_slot_mmap_sem(slot);
+			}
+
+			if (err == -ENOENT) {
+rm_slot:
+				rung_rm_slot(slot);
+				continue;
+			}
+
+			busy_mm = slot->mm;
+
+			if (err == -EBUSY) {
+				/* skip other vmas on the same mm */
+				do {
+					reset = advance_current_scan(rung);
+					iter = rung->current_scan;
+					busy_retry--;
+					if (iter->vma->vm_mm != busy_mm ||
+					    !busy_retry || reset)
+						break;
+				} while (1);
+
+				if (iter->vma->vm_mm != busy_mm) {
+					continue;
+				} else {
+					/* scan round finished */
+					break;
+				}
+			}
+
+			BUG_ON(!vma_can_enter(slot->vma));
+			if (uksm_test_exit(slot->vma->vm_mm)) {
+				mmsem_batch = 0;
+				up_read(&slot->vma->vm_mm->mmap_sem);
+				goto rm_slot;
+			}
+
+			if (mmsem_batch)
+				mmsem_batch--;
+			else
+				mmsem_batch = UKSM_MMSEM_BATCH;
+
+			/* OK, we have taken the mmap_sem, ready to scan. */
+			scan_vma_one_page(slot);
+			rung->pages_to_scan--;
+			vpages++;
+
+			if (rung->current_offset + rung->step > slot->pages - 1
+			    || vma_fully_scanned(slot)) {
+				up_read(&slot->vma->vm_mm->mmap_sem);
+				judge_slot(slot);
+				mmsem_batch = 0;
+			} else {
+				rung->current_offset += rung->step;
+				if (!mmsem_batch)
+					up_read(&slot->vma->vm_mm->mmap_sem);
+			}
+
+			busy_retry = BUSY_RETRY;
+			cond_resched();
+		}
+
+		if (mmsem_batch) {
+			up_read(&slot->vma->vm_mm->mmap_sem);
+			mmsem_batch = 0;
+		}
+
+		if (freezing(current))
+			break;
+
+		cond_resched();
+	}
+	end_time = task_sched_runtime(current);
+	delta_exec = end_time - start_time;
+
+	if (freezing(current))
+		return;
+
+	cleanup_vma_slots();
+	uksm_enter_all_slots();
+
+	round_finished = 1;
+	all_rungs_empty = 1;
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		struct scan_rung *rung = &uksm_scan_ladder[i];
+
+		if (rung->vma_root.num) {
+			all_rungs_empty = 0;
+			if (!rung_round_finished(rung))
+				round_finished = 0;
+		}
+	}
+
+	if (all_rungs_empty)
+		round_finished = 0;
+
+	if (round_finished) {
+		round_update_ladder();
+		uksm_eval_round++;
+
+		if (hash_round_finished() && rshash_adjust()) {
+			/* Reset the unstable root iff hash strength changed */
+			uksm_hash_round++;
+			root_unstable_tree = RB_ROOT;
+			free_all_tree_nodes(&unstable_tree_node_list);
+		}
+
+		/*
+		 * A number of pages can hang around indefinitely on per-cpu
+		 * pagevecs, raised page count preventing write_protect_page
+		 * from merging them.  Though it doesn't really matter much,
+		 * it is puzzling to see some stuck in pages_volatile until
+		 * other activity jostles them out, and they also prevented
+		 * LTP's KSM test from succeeding deterministically; so drain
+		 * them here (here rather than on entry to uksm_do_scan(),
+		 * so we don't IPI too often when pages_to_scan is set low).
+		 */
+		lru_add_drain_all();
+	}
+
+
+	if (vpages && delta_exec > 0) {
+		pcost = (unsigned long) delta_exec / vpages;
+		if (likely(uksm_ema_page_time))
+			uksm_ema_page_time = ema(pcost, uksm_ema_page_time);
+		else
+			uksm_ema_page_time = pcost;
+	}
+
+	uksm_calc_scan_pages();
+	uksm_sleep_real = uksm_sleep_jiffies;
+	/* in case of radical cpu bursts, apply the upper bound */
+	end_time = task_sched_runtime(current);
+	if (max_cpu_ratio && end_time > start_time) {
+		scan_time = end_time - start_time;
+		expected_jiffies = msecs_to_jiffies(
+			scan_time_to_sleep(scan_time, max_cpu_ratio));
+
+		if (expected_jiffies > uksm_sleep_real)
+			uksm_sleep_real = expected_jiffies;
+
+		/* We have a 1 second upper bound for responsiveness. */
+		if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC)
+			uksm_sleep_real = msecs_to_jiffies(1000);
+	}
+
+	return;
+}
+
+static int ksmd_should_run(void)
+{
+	return uksm_run & UKSM_RUN_MERGE;
+}
+
+static int uksm_scan_thread(void *nothing)
+{
+	set_freezable();
+	set_user_nice(current, 5);
+
+	while (!kthread_should_stop()) {
+		mutex_lock(&uksm_thread_mutex);
+		if (ksmd_should_run()) {
+			uksm_do_scan();
+		}
+		mutex_unlock(&uksm_thread_mutex);
+
+		try_to_freeze();
+
+		if (ksmd_should_run()) {
+			schedule_timeout_interruptible(uksm_sleep_real);
+			uksm_sleep_times++;
+		} else {
+			wait_event_freezable(uksm_thread_wait,
+				ksmd_should_run() || kthread_should_stop());
+		}
+	}
+	return 0;
+}
+
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
+{
+	struct stable_node *stable_node;
+	struct node_vma *node_vma;
+	struct rmap_item *rmap_item;
+	int ret = SWAP_AGAIN;
+	int search_new_forks = 0;
+	unsigned long address;
+
+	VM_BUG_ON_PAGE(!PageKsm(page), page);
+	VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+	stable_node = page_stable_node(page);
+	if (!stable_node)
+		return ret;
+again:
+	hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) {
+		hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) {
+			struct anon_vma *anon_vma = rmap_item->anon_vma;
+			struct anon_vma_chain *vmac;
+			struct vm_area_struct *vma;
+
+			cond_resched();
+			anon_vma_lock_read(anon_vma);
+			anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root,
+						       0, ULONG_MAX) {
+				cond_resched();
+				vma = vmac->vma;
+				address = get_rmap_addr(rmap_item);
+
+				if (address < vma->vm_start ||
+				    address >= vma->vm_end)
+					continue;
+
+				if ((rmap_item->slot->vma == vma) ==
+				    search_new_forks)
+					continue;
+
+				if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+					continue;
+
+				ret = rwc->rmap_one(page, vma, address, rwc->arg);
+				if (ret != SWAP_AGAIN) {
+					anon_vma_unlock_read(anon_vma);
+					goto out;
+				}
+
+				if (rwc->done && rwc->done(page)) {
+					anon_vma_unlock_read(anon_vma);
+					goto out;
+				}
+			}
+			anon_vma_unlock_read(anon_vma);
+		}
+	}
+	if (!search_new_forks++)
+		goto again;
+out:
+	return ret;
+}
+
+#ifdef CONFIG_MIGRATION
+/* Common ksm interface but may be specific to uksm */
+void ksm_migrate_page(struct page *newpage, struct page *oldpage)
+{
+	struct stable_node *stable_node;
+
+	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+	VM_BUG_ON(newpage->mapping != oldpage->mapping);
+
+	stable_node = page_stable_node(newpage);
+	if (stable_node) {
+		VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage));
+		stable_node->kpfn = page_to_pfn(newpage);
+	}
+}
+#endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn,
+						 unsigned long end_pfn)
+{
+	struct rb_node *node;
+
+	for (node = rb_first(root_stable_treep); node; node = rb_next(node)) {
+		struct stable_node *stable_node;
+
+		stable_node = rb_entry(node, struct stable_node, node);
+		if (stable_node->kpfn >= start_pfn &&
+		    stable_node->kpfn < end_pfn)
+			return stable_node;
+	}
+	return NULL;
+}
+
+static int uksm_memory_callback(struct notifier_block *self,
+			       unsigned long action, void *arg)
+{
+	struct memory_notify *mn = arg;
+	struct stable_node *stable_node;
+
+	switch (action) {
+	case MEM_GOING_OFFLINE:
+		/*
+		 * Keep it very simple for now: just lock out ksmd and
+		 * MADV_UNMERGEABLE while any memory is going offline.
+		 * mutex_lock_nested() is necessary because lockdep was alarmed
+		 * that here we take uksm_thread_mutex inside notifier chain
+		 * mutex, and later take notifier chain mutex inside
+		 * uksm_thread_mutex to unlock it.   But that's safe because both
+		 * are inside mem_hotplug_mutex.
+		 */
+		mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING);
+		break;
+
+	case MEM_OFFLINE:
+		/*
+		 * Most of the work is done by page migration; but there might
+		 * be a few stable_nodes left over, still pointing to struct
+		 * pages which have been offlined: prune those from the tree.
+		 */
+		while ((stable_node = uksm_check_stable_tree(mn->start_pfn,
+					mn->start_pfn + mn->nr_pages)) != NULL)
+			remove_node_from_stable_tree(stable_node, 1, 1);
+		/* fallthrough */
+
+	case MEM_CANCEL_OFFLINE:
+		mutex_unlock(&uksm_thread_mutex);
+		break;
+	}
+	return NOTIFY_OK;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
+#ifdef CONFIG_SYSFS
+/*
+ * This all compiles without CONFIG_SYSFS, but is a waste of space.
+ */
+
+#define UKSM_ATTR_RO(_name) \
+	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+#define UKSM_ATTR(_name) \
+	static struct kobj_attribute _name##_attr = \
+		__ATTR(_name, 0644, _name##_show, _name##_store)
+
+static ssize_t max_cpu_percentage_show(struct kobject *kobj,
+				    struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", uksm_max_cpu_percentage);
+}
+
+static ssize_t max_cpu_percentage_store(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	unsigned long max_cpu_percentage;
+	int err;
+
+	err = kstrtoul(buf, 10, &max_cpu_percentage);
+	if (err || max_cpu_percentage > 100)
+		return -EINVAL;
+
+	if (max_cpu_percentage == 100)
+		max_cpu_percentage = 99;
+	else if (max_cpu_percentage < 10)
+		max_cpu_percentage = 10;
+
+	uksm_max_cpu_percentage = max_cpu_percentage;
+
+	return count;
+}
+UKSM_ATTR(max_cpu_percentage);
+
+static ssize_t sleep_millisecs_show(struct kobject *kobj,
+				    struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies));
+}
+
+static ssize_t sleep_millisecs_store(struct kobject *kobj,
+				     struct kobj_attribute *attr,
+				     const char *buf, size_t count)
+{
+	unsigned long msecs;
+	int err;
+
+	err = kstrtoul(buf, 10, &msecs);
+	if (err || msecs > MSEC_PER_SEC)
+		return -EINVAL;
+
+	uksm_sleep_jiffies = msecs_to_jiffies(msecs);
+	uksm_sleep_saved = uksm_sleep_jiffies;
+
+	return count;
+}
+UKSM_ATTR(sleep_millisecs);
+
+
+static ssize_t cpu_governor_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
+	int i;
+
+	buf[0] = '\0';
+	for (i = 0; i < n ; i++) {
+		if (uksm_cpu_governor == i)
+			strcat(buf, "[");
+
+		strcat(buf, uksm_cpu_governor_str[i]);
+
+		if (uksm_cpu_governor == i)
+			strcat(buf, "]");
+
+		strcat(buf, " ");
+	}
+	strcat(buf, "\n");
+
+	return strlen(buf);
+}
+
+static inline void init_performance_values(void)
+{
+	int i;
+	struct scan_rung *rung;
+	struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor;
+
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		rung = uksm_scan_ladder + i;
+		rung->cpu_ratio = preset->cpu_ratio[i];
+		rung->cover_msecs = preset->cover_msecs[i];
+	}
+
+	uksm_max_cpu_percentage = preset->max_cpu;
+}
+
+static ssize_t cpu_governor_store(struct kobject *kobj,
+				   struct kobj_attribute *attr,
+				   const char *buf, size_t count)
+{
+	int n = sizeof(uksm_cpu_governor_str) / sizeof(char *);
+
+	for (n--; n >= 0; n--) {
+		if (!strncmp(buf, uksm_cpu_governor_str[n],
+			     strlen(uksm_cpu_governor_str[n])))
+			break;
+	}
+
+	if (n < 0)
+		return -EINVAL;
+	else
+		uksm_cpu_governor = n;
+
+	init_performance_values();
+
+	return count;
+}
+UKSM_ATTR(cpu_governor);
+
+static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
+			char *buf)
+{
+	return sprintf(buf, "%u\n", uksm_run);
+}
+
+static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
+			 const char *buf, size_t count)
+{
+	int err;
+	unsigned long flags;
+
+	err = kstrtoul(buf, 10, &flags);
+	if (err || flags > UINT_MAX)
+		return -EINVAL;
+	if (flags > UKSM_RUN_MERGE)
+		return -EINVAL;
+
+	mutex_lock(&uksm_thread_mutex);
+	if (uksm_run != flags) {
+		uksm_run = flags;
+	}
+	mutex_unlock(&uksm_thread_mutex);
+
+	if (flags & UKSM_RUN_MERGE)
+		wake_up_interruptible(&uksm_thread_wait);
+
+	return count;
+}
+UKSM_ATTR(run);
+
+static ssize_t abundant_threshold_show(struct kobject *kobj,
+				     struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", uksm_abundant_threshold);
+}
+
+static ssize_t abundant_threshold_store(struct kobject *kobj,
+				      struct kobj_attribute *attr,
+				      const char *buf, size_t count)
+{
+	int err;
+	unsigned long flags;
+
+	err = kstrtoul(buf, 10, &flags);
+	if (err || flags > 99)
+		return -EINVAL;
+
+	uksm_abundant_threshold = flags;
+
+	return count;
+}
+UKSM_ATTR(abundant_threshold);
+
+static ssize_t thrash_threshold_show(struct kobject *kobj,
+				     struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%u\n", uksm_thrash_threshold);
+}
+
+static ssize_t thrash_threshold_store(struct kobject *kobj,
+				      struct kobj_attribute *attr,
+				      const char *buf, size_t count)
+{
+	int err;
+	unsigned long flags;
+
+	err = kstrtoul(buf, 10, &flags);
+	if (err || flags > 99)
+		return -EINVAL;
+
+	uksm_thrash_threshold = flags;
+
+	return count;
+}
+UKSM_ATTR(thrash_threshold);
+
+static ssize_t cpu_ratios_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	int i, size;
+	struct scan_rung *rung;
+	char *p = buf;
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		rung = &uksm_scan_ladder[i];
+
+		if (rung->cpu_ratio > 0)
+			size = sprintf(p, "%d ", rung->cpu_ratio);
+		else
+			size = sprintf(p, "MAX/%d ",
+					TIME_RATIO_SCALE / -rung->cpu_ratio);
+
+		p += size;
+	}
+
+	*p++ = '\n';
+	*p = '\0';
+
+	return p - buf;
+}
+
+static ssize_t cpu_ratios_store(struct kobject *kobj,
+				      struct kobj_attribute *attr,
+				      const char *buf, size_t count)
+{
+	int i, cpuratios[SCAN_LADDER_SIZE], err;
+	unsigned long value;
+	struct scan_rung *rung;
+	char *buffer, *p, *end = NULL;
+	ssize_t ret = count;
+
+	/* Copy into a NUL-terminated scratch buffer we can modify and free. */
+	buffer = kzalloc(count + 2, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	memcpy(buffer, buf, count);
+	p = buffer;
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		if (i != SCAN_LADDER_SIZE - 1) {
+			end = strchr(p, ' ');
+			if (!end) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			*end = '\0';
+		}
+
+		if (strstr(p, "MAX/")) {
+			p = strchr(p, '/') + 1;
+			err = kstrtoul(p, 10, &value);
+			if (err || value > TIME_RATIO_SCALE || !value) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			cpuratios[i] = -(int) (TIME_RATIO_SCALE / value);
+		} else {
+			err = kstrtoul(p, 10, &value);
+			if (err || value > TIME_RATIO_SCALE || !value) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			cpuratios[i] = value;
+		}
+
+		p = end + 1;
+	}
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		rung = &uksm_scan_ladder[i];
+
+		rung->cpu_ratio = cpuratios[i];
+	}
+
+out:
+	kfree(buffer);
+	return ret;
+}
+UKSM_ATTR(cpu_ratios);
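+
+/*
+ * A plausible usage sketch (values are examples only, assuming
+ * SCAN_LADDER_SIZE is 4): writing "20 40 MAX/4 MAX/1" to
+ * /sys/kernel/mm/uksm/cpu_ratios sets the two lower rungs to fixed ratios
+ * of 20 and 40 (in 1/TIME_RATIO_SCALE units of the sleep period) and the
+ * two upper rungs to a quarter of and the full max_cpu_percentage.
+ */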
+
+static ssize_t eval_intervals_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	int i, size;
+	struct scan_rung *rung;
+	char *p = buf;
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		rung = &uksm_scan_ladder[i];
+		size = sprintf(p, "%u ", rung->cover_msecs);
+		p += size;
+	}
+
+	*p++ = '\n';
+	*p = '\0';
+
+	return p - buf;
+}
+
+static ssize_t eval_intervals_store(struct kobject *kobj,
+				      struct kobj_attribute *attr,
+				      const char *buf, size_t count)
+{
+	int i, err;
+	unsigned long values[SCAN_LADDER_SIZE];
+	struct scan_rung *rung;
+	char *buffer, *p, *end = NULL;
+	ssize_t ret = count;
+
+	buffer = kzalloc(count + 2, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	memcpy(buffer, buf, count);
+	p = buffer;
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		if (i != SCAN_LADDER_SIZE - 1) {
+			end = strchr(p, ' ');
+			if (!end) {
+				ret = -EINVAL;
+				goto out;
+			}
+
+			*end = '\0';
+		}
+
+		err = kstrtoul(p, 10, &values[i]);
+		if (err) {
+			ret = -EINVAL;
+			goto out;
+		}
+
+		p = end + 1;
+	}
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		rung = &uksm_scan_ladder[i];
+
+		rung->cover_msecs = values[i];
+	}
+
+out:
+	/* Free the original allocation; p has been advanced past it. */
+	kfree(buffer);
+	return ret;
+}
+UKSM_ATTR(eval_intervals);
+
+static ssize_t ema_per_page_time_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", uksm_ema_page_time);
+}
+UKSM_ATTR_RO(ema_per_page_time);
+
+static ssize_t pages_shared_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", uksm_pages_shared);
+}
+UKSM_ATTR_RO(pages_shared);
+
+static ssize_t pages_sharing_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", uksm_pages_sharing);
+}
+UKSM_ATTR_RO(pages_sharing);
+
+static ssize_t pages_unshared_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", uksm_pages_unshared);
+}
+UKSM_ATTR_RO(pages_unshared);
+
+static ssize_t full_scans_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%llu\n", fully_scanned_round);
+}
+UKSM_ATTR_RO(full_scans);
+
+static ssize_t pages_scanned_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	unsigned long base = 0;
+	u64 delta, ret;
+
+	if (pages_scanned_stored) {
+		base = pages_scanned_base;
+		ret = pages_scanned_stored;
+		delta = uksm_pages_scanned >> base;
+		if (CAN_OVERFLOW_U64(ret, delta)) {
+			ret >>= 1;
+			delta >>= 1;
+			base++;
+			ret += delta;
+		}
+	} else {
+		ret = uksm_pages_scanned;
+	}
+
+	while (ret > ULONG_MAX) {
+		ret >>= 1;
+		base++;
+	}
+
+	if (base)
+		return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base);
+	else
+		return sprintf(buf, "%lu\n", (unsigned long)ret);
+}
+UKSM_ATTR_RO(pages_scanned);
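+
+/*
+ * Example of the output format: once the accumulated count has been folded
+ * to avoid overflow, the file reports a scaled value such as
+ * "123456 * 2^8", i.e. roughly 123456 * 256 pages scanned in total.
+ */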
+
+static ssize_t hash_strength_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%lu\n", hash_strength);
+}
+UKSM_ATTR_RO(hash_strength);
+
+static ssize_t sleep_times_show(struct kobject *kobj,
+				  struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%llu\n", uksm_sleep_times);
+}
+UKSM_ATTR_RO(sleep_times);
+
+
+static struct attribute *uksm_attrs[] = {
+	&max_cpu_percentage_attr.attr,
+	&sleep_millisecs_attr.attr,
+	&cpu_governor_attr.attr,
+	&run_attr.attr,
+	&ema_per_page_time_attr.attr,
+	&pages_shared_attr.attr,
+	&pages_sharing_attr.attr,
+	&pages_unshared_attr.attr,
+	&full_scans_attr.attr,
+	&pages_scanned_attr.attr,
+	&hash_strength_attr.attr,
+	&sleep_times_attr.attr,
+	&thrash_threshold_attr.attr,
+	&abundant_threshold_attr.attr,
+	&cpu_ratios_attr.attr,
+	&eval_intervals_attr.attr,
+	NULL,
+};
+
+static struct attribute_group uksm_attr_group = {
+	.attrs = uksm_attrs,
+	.name = "uksm",
+};
+#endif /* CONFIG_SYSFS */
+
+static inline void init_scan_ladder(void)
+{
+	int i;
+	struct scan_rung *rung;
+
+	for (i = 0; i < SCAN_LADDER_SIZE; i++) {
+		rung = uksm_scan_ladder + i;
+		slot_tree_init_root(&rung->vma_root);
+	}
+
+	init_performance_values();
+	uksm_calc_scan_pages();
+}
+
+static inline int cal_positive_negative_costs(void)
+{
+	struct page *p1, *p2;
+	unsigned char *addr1, *addr2;
+	unsigned long i, time_start, hash_cost;
+	unsigned long loopnum = 0;
+
+	/* IMPORTANT: volatile is needed to prevent over-optimization by gcc. */
+	volatile u32 hash;
+	volatile int ret;
+
+	p1 = alloc_page(GFP_KERNEL);
+	if (!p1)
+		return -ENOMEM;
+
+	p2 = alloc_page(GFP_KERNEL);
+	if (!p2) {
+		__free_page(p1);
+		return -ENOMEM;
+	}
+
+	addr1 = kmap_atomic(p1);
+	addr2 = kmap_atomic(p2);
+	memset(addr1, prandom_u32(), PAGE_SIZE);
+	memcpy(addr2, addr1, PAGE_SIZE);
+
+	/* make sure that the two pages differ in last byte */
+	addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1];
+	kunmap_atomic(addr2);
+	kunmap_atomic(addr1);
+
+	time_start = jiffies;
+	while (jiffies - time_start < 100) {
+		for (i = 0; i < 100; i++)
+			hash = page_hash(p1, HASH_STRENGTH_FULL, 0);
+		loopnum += 100;
+	}
+	hash_cost = (jiffies - time_start);
+
+	time_start = jiffies;
+	for (i = 0; i < loopnum; i++)
+		ret = pages_identical(p1, p2);
+	memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start);
+	memcmp_cost /= hash_cost;
+	printk(KERN_INFO "UKSM: relative memcmp_cost = %lu "
+			 "hash=%u cmp_ret=%d.\n",
+	       memcmp_cost, hash, ret);
+
+	__free_page(p1);
+	__free_page(p2);
+	return 0;
+}
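+
+/*
+ * memcmp_cost is thus expressed in "hash sample" units: for example, if a
+ * full-strength page hash takes about three times as long as comparing two
+ * pages, memcmp_cost comes out near HASH_STRENGTH_FULL / 3, so the benefit
+ * accounting can weigh comparisons against partial hashing consistently.
+ */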
+
+static int init_zeropage_hash_table(void)
+{
+	struct page *page;
+	char *addr;
+	int i;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	addr = kmap_atomic(page);
+	memset(addr, 0, PAGE_SIZE);
+	kunmap_atomic(addr);
+
+	zero_hash_table = kmalloc(HASH_STRENGTH_MAX * sizeof(u32),
+				  GFP_KERNEL);
+	if (!zero_hash_table) {
+		__free_page(page);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < HASH_STRENGTH_MAX; i++)
+		zero_hash_table[i] = page_hash(page, i, 0);
+
+	__free_page(page);
+
+	return 0;
+}
+
+static inline int init_random_sampling(void)
+{
+	unsigned long i;
+	random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!random_nums)
+		return -ENOMEM;
+
+	for (i = 0; i < HASH_STRENGTH_FULL; i++)
+		random_nums[i] = i;
+
+	for (i = 0; i < HASH_STRENGTH_FULL; i++) {
+		unsigned long rand_range, swap_index, tmp;
+
+		rand_range = HASH_STRENGTH_FULL - i;
+		swap_index = i + prandom_u32() % rand_range;
+		tmp = random_nums[i];
+		random_nums[i] = random_nums[swap_index];
+		random_nums[swap_index] = tmp;
+	}
+
+	rshash_state.state = RSHASH_NEW;
+	rshash_state.below_count = 0;
+	rshash_state.lookup_window_index = 0;
+
+	return cal_positive_negative_costs();
+}
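+
+/*
+ * The loop above performs a Fisher-Yates shuffle of random_nums, so
+ * partial-strength hashing can sample the page in a uniformly random
+ * order.
+ */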
+
+static int __init uksm_slab_init(void)
+{
+	rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0);
+	if (!rmap_item_cache)
+		goto out;
+
+	stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0);
+	if (!stable_node_cache)
+		goto out_free1;
+
+	node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0);
+	if (!node_vma_cache)
+		goto out_free2;
+
+	vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0);
+	if (!vma_slot_cache)
+		goto out_free3;
+
+	tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0);
+	if (!tree_node_cache)
+		goto out_free4;
+
+	return 0;
+
+out_free4:
+	kmem_cache_destroy(vma_slot_cache);
+out_free3:
+	kmem_cache_destroy(node_vma_cache);
+out_free2:
+	kmem_cache_destroy(stable_node_cache);
+out_free1:
+	kmem_cache_destroy(rmap_item_cache);
+out:
+	return -ENOMEM;
+}
+
+static void __init uksm_slab_free(void)
+{
+	kmem_cache_destroy(stable_node_cache);
+	kmem_cache_destroy(rmap_item_cache);
+	kmem_cache_destroy(node_vma_cache);
+	kmem_cache_destroy(vma_slot_cache);
+	kmem_cache_destroy(tree_node_cache);
+}
+
+/* Common interface to ksm, but the behaviour differs from ksm's. */
+int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
+		unsigned long end, int advice, unsigned long *vm_flags)
+{
+	int err;
+
+	switch (advice) {
+	case MADV_MERGEABLE:
+		return 0;		/* just ignore the advice */
+
+	case MADV_UNMERGEABLE:
+		if (!(*vm_flags & VM_MERGEABLE))
+			return 0;		/* just ignore the advice */
+
+		if (vma->anon_vma) {
+			err = unmerge_uksm_pages(vma, start, end);
+			if (err)
+				return err;
+		}
+
+		uksm_remove_vma(vma);
+		*vm_flags &= ~VM_MERGEABLE;
+		break;
+	}
+
+	return 0;
+}
+
+/* Common interface to ksm, actually the same. */
+struct page *ksm_might_need_to_copy(struct page *page,
+			struct vm_area_struct *vma, unsigned long address)
+{
+	struct anon_vma *anon_vma = page_anon_vma(page);
+	struct page *new_page;
+
+	if (PageKsm(page)) {
+		if (page_stable_node(page))
+			return page;	/* no need to copy it */
+	} else if (!anon_vma) {
+		return page;		/* no need to copy it */
+	} else if (anon_vma->root == vma->anon_vma->root &&
+		 page->index == linear_page_index(vma, address)) {
+		return page;		/* still no need to copy it */
+	}
+	if (!PageUptodate(page))
+		return page;		/* let do_swap_page report the error */
+
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+	if (new_page) {
+		copy_user_highpage(new_page, page, address, vma);
+
+		SetPageDirty(new_page);
+		__SetPageUptodate(new_page);
+		__SetPageLocked(new_page);
+	}
+
+	return new_page;
+}
+
+static int __init uksm_init(void)
+{
+	struct task_struct *uksm_thread;
+	int err;
+
+	uksm_sleep_jiffies = msecs_to_jiffies(100);
+	uksm_sleep_saved = uksm_sleep_jiffies;
+
+	slot_tree_init();
+	init_scan_ladder();
+
+
+	err = init_random_sampling();
+	if (err)
+		goto out_free2;
+
+	err = uksm_slab_init();
+	if (err)
+		goto out_free1;
+
+	err = init_zeropage_hash_table();
+	if (err)
+		goto out_free0;
+
+	uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd");
+	if (IS_ERR(uksm_thread)) {
+		printk(KERN_ERR "uksm: creating kthread failed\n");
+		err = PTR_ERR(uksm_thread);
+		goto out_free;
+	}
+
+#ifdef CONFIG_SYSFS
+	err = sysfs_create_group(mm_kobj, &uksm_attr_group);
+	if (err) {
+		printk(KERN_ERR "uksm: register sysfs failed\n");
+		kthread_stop(uksm_thread);
+		goto out_free;
+	}
+#else
+	uksm_run = UKSM_RUN_MERGE;	/* no way for user to start it */
+
+#endif /* CONFIG_SYSFS */
+
+#ifdef CONFIG_MEMORY_HOTREMOVE
+	/*
+	 * Choose a high priority since the callback takes uksm_thread_mutex:
+	 * later callbacks could only be taking locks which nest within that.
+	 */
+	hotplug_memory_notifier(uksm_memory_callback, 100);
+#endif
+	return 0;
+
+out_free:
+	kfree(zero_hash_table);
+out_free0:
+	uksm_slab_free();
+out_free1:
+	kfree(random_nums);
+out_free2:
+	kfree(uksm_scan_ladder);
+	return err;
+}
+
+#ifdef MODULE
+subsys_initcall(uksm_init);
+#else
+late_initcall(uksm_init);
+#endif
+
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 89cec42..188ce43 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -974,6 +974,9 @@ const char * const vmstat_text[] = {
 	"nr_dirtied",
 	"nr_written",
 
+#ifdef CONFIG_UKSM
+	"nr_uksm_zero_pages",
+#endif
 	/* enum writeback_stat_item counters */
 	"nr_dirty_threshold",
 	"nr_dirty_background_threshold",
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index fd3ce46..2d49cee 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -8,7 +8,11 @@
 #include <linux/ktime.h>
 #include <linux/string.h>
 #include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
 
+#include <net/tcp.h>
 #include <net/secure_seq.h>
 
 #if IS_ENABLED(CONFIG_IPV6) || IS_ENABLED(CONFIG_INET)
@@ -39,6 +43,102 @@ static u32 seq_scale(u32 seq)
 }
 #endif
 
+#ifdef CONFIG_TCP_STEALTH
+u32 tcp_stealth_sequence_number(struct sock *sk, __be32 *daddr,
+				u32 daddr_size, __be16 dport)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_md5sig_key *md5;
+
+	__u32 sec[MD5_MESSAGE_BYTES / sizeof(__u32)];
+	__u32 i;
+	__u32 tsval = 0;
+
+	__be32 iv[MD5_DIGEST_WORDS] = { 0 };
+	__be32 isn;
+
+	memcpy(iv, daddr, (daddr_size > sizeof(iv)) ? sizeof(iv) : daddr_size);
+
+#ifdef CONFIG_TCP_MD5SIG
+	md5 = tp->af_specific->md5_lookup(sk, sk);
+#else
+	md5 = NULL;
+#endif
+	if (likely(sysctl_tcp_timestamps && !md5) || tp->stealth.saw_tsval)
+		tsval = tp->stealth.mstamp.stamp_jiffies;
+
+	((__be16 *)iv)[2] ^= cpu_to_be16(tp->stealth.integrity_hash);
+	iv[2] ^= cpu_to_be32(tsval);
+	((__be16 *)iv)[6] ^= dport;
+
+	for (i = 0; i < MD5_DIGEST_WORDS; i++)
+		iv[i] = le32_to_cpu(iv[i]);
+	for (i = 0; i < MD5_MESSAGE_BYTES / sizeof(__le32); i++)
+		sec[i] = le32_to_cpu(((__le32 *)tp->stealth.secret)[i]);
+
+	md5_transform(iv, sec);
+
+	isn = cpu_to_be32(iv[0]) ^ cpu_to_be32(iv[1]) ^
+	      cpu_to_be32(iv[2]) ^ cpu_to_be32(iv[3]);
+
+	if (tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY)
+		be32_isn_to_be16_ih(isn) =
+			cpu_to_be16(tp->stealth.integrity_hash);
+
+	return be32_to_cpu(isn);
+}
+EXPORT_SYMBOL(tcp_stealth_sequence_number);
+
+u32 tcp_stealth_do_auth(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcphdr *th = tcp_hdr(skb);
+	__be32 isn = th->seq;
+	__be32 hash;
+	__be32 *daddr;
+	u32 daddr_size;
+
+	tp->stealth.saw_tsval =
+		tcp_parse_tsval_option(&tp->stealth.mstamp.stamp_jiffies, th);
+
+	if (tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN)
+		tp->stealth.integrity_hash =
+			be16_to_cpu(be32_isn_to_be16_ih(isn));
+
+	switch (tp->inet_conn.icsk_inet.sk.sk_family) {
+#if IS_ENABLED(CONFIG_IPV6)
+	case PF_INET6:
+		daddr_size = sizeof(ipv6_hdr(skb)->daddr.s6_addr32);
+		daddr = ipv6_hdr(skb)->daddr.s6_addr32;
+	break;
+#endif
+	case PF_INET:
+		daddr_size = sizeof(ip_hdr(skb)->daddr);
+		daddr = &ip_hdr(skb)->daddr;
+	break;
+	default:
+		pr_err("TCP Stealth: Unknown network layer protocol, stop!\n");
+		return 1;
+	}
+
+	hash = tcp_stealth_sequence_number(sk, daddr, daddr_size, th->dest);
+	cpu_to_be32s(&hash);
+
+	if (tp->stealth.mode & TCP_STEALTH_MODE_AUTH &&
+	    tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN &&
+	    be32_isn_to_be16_av(isn) == be32_isn_to_be16_av(hash))
+		return 0;
+
+	if (tp->stealth.mode & TCP_STEALTH_MODE_AUTH &&
+	    !(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) &&
+	    isn == hash)
+		return 0;
+
+	return 1;
+}
+EXPORT_SYMBOL(tcp_stealth_do_auth);
+#endif
+
 #if IS_ENABLED(CONFIG_IPV6)
 __u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
 				   __be16 sport, __be16 dport)
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 50d6a9b..c4ed105 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -662,6 +662,9 @@ choice
 	config DEFAULT_VEGAS
 		bool "Vegas" if TCP_CONG_VEGAS=y
 
+	config DEFAULT_YEAH
+		bool "YeAH" if TCP_CONG_YEAH=y
+
 	config DEFAULT_VENO
 		bool "Veno" if TCP_CONG_VENO=y
 
@@ -692,6 +695,7 @@ config DEFAULT_TCP_CONG
 	default "htcp" if DEFAULT_HTCP
 	default "hybla" if DEFAULT_HYBLA
 	default "vegas" if DEFAULT_VEGAS
+	default "yeah" if DEFAULT_YEAH
 	default "westwood" if DEFAULT_WESTWOOD
 	default "veno" if DEFAULT_VENO
 	default "reno" if DEFAULT_RENO
@@ -709,3 +713,13 @@ config TCP_MD5SIG
 	  on the Internet.
 
 	  If unsure, say N.
+
+config TCP_STEALTH
+	bool "TCP: Stealth TCP socket support"
+	default n
+	---help---
+	  This option enables support for stealth TCP sockets. If you do not
+	  know what this means, you do not need it.
+
+	  If unsure, say N.
+
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index c876f5d..ccf8390 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -269,6 +269,7 @@
 #include <linux/err.h>
 #include <linux/time.h>
 #include <linux/slab.h>
+#include <linux/vmalloc.h>
 
 #include <net/icmp.h>
 #include <net/inet_common.h>
@@ -2358,6 +2359,49 @@ static int tcp_repair_options_est(struct tcp_sock *tp,
 	return 0;
 }
 
+#ifdef CONFIG_TCP_STEALTH
+int tcp_stealth_integrity(__be16 *hash, u8 *secret, u8 *payload, int len)
+{
+	struct scatterlist sg[2];
+	struct crypto_ahash *tfm;
+	struct ahash_request *req;
+	__be16 h[MD5_DIGEST_WORDS * 2];
+	int i;
+	int err = 0;
+
+	tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
+	if (IS_ERR(tfm))
+		return PTR_ERR(tfm);
+
+	req = ahash_request_alloc(tfm, GFP_ATOMIC);
+	if (!req) {
+		crypto_free_ahash(tfm);
+		return -ENOMEM;
+	}
+
+	sg_init_table(sg, 2);
+	sg_set_buf(&sg[0], secret, MD5_MESSAGE_BYTES);
+	sg_set_buf(&sg[1], payload, len);
+
+	ahash_request_set_callback(req, 0, NULL, NULL);
+	ahash_request_set_crypt(req, sg, (u8 *)h, MD5_MESSAGE_BYTES + len);
+
+	if (crypto_ahash_digest(req)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	*hash = be16_to_cpu(h[0]);
+	for (i = 1; i < MD5_DIGEST_WORDS * 2; i++)
+		*hash ^= be16_to_cpu(h[i]);
+
+out:
+	ahash_request_free(req);
+	crypto_free_ahash(tfm);
+	return err;
+}
+#endif
+
 /*
  *	Socket option code for TCP.
  */
@@ -2389,6 +2433,66 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		release_sock(sk);
 		return err;
 	}
+#ifdef CONFIG_TCP_STEALTH
+	case TCP_STEALTH: {
+		u8 secret[MD5_MESSAGE_BYTES] = { 0 };
+
+		val = copy_from_user(secret, optval,
+				     min_t(unsigned int, optlen,
+					   MD5_MESSAGE_BYTES));
+		if (val != 0)
+			return -EFAULT;
+
+		lock_sock(sk);
+		memcpy(tp->stealth.secret, secret, MD5_MESSAGE_BYTES);
+		tp->stealth.mode = TCP_STEALTH_MODE_AUTH;
+		tp->stealth.mstamp.v64 = 0;
+		tp->stealth.saw_tsval = false;
+		release_sock(sk);
+		return err;
+	}
+	case TCP_STEALTH_INTEGRITY: {
+		u8 *payload;
+
+		lock_sock(sk);
+
+		if (!(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) {
+			err = -EOPNOTSUPP;
+			goto stealth_integrity_out_1;
+		}
+
+		if (optlen < 1 || optlen > USHRT_MAX) {
+			err = -EINVAL;
+			goto stealth_integrity_out_1;
+		}
+
+		payload = vmalloc(optlen);
+		if (!payload) {
+			err = -ENOMEM;
+			goto stealth_integrity_out_1;
+		}
+
+		val = copy_from_user(payload, optval, optlen);
+		if (val != 0) {
+			err = -EFAULT;
+			goto stealth_integrity_out_2;
+		}
+
+		err = tcp_stealth_integrity(&tp->stealth.integrity_hash,
+					    tp->stealth.secret, payload,
+					    optlen);
+		if (err)
+			goto stealth_integrity_out_2;
+
+		tp->stealth.mode |= TCP_STEALTH_MODE_INTEGRITY;
+
+stealth_integrity_out_2:
+		vfree(payload);
+stealth_integrity_out_1:
+		release_sock(sk);
+		return err;
+	}
+#endif
 	default:
 		/* fallthru */
 		break;
@@ -2643,6 +2747,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		tp->notsent_lowat = val;
 		sk->sk_write_space(sk);
 		break;
+#ifdef CONFIG_TCP_STEALTH
+	case TCP_STEALTH_INTEGRITY_LEN:
+		if (!(tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) {
+			err = -EOPNOTSUPP;
+		} else if (val < 1 || val > USHRT_MAX) {
+			err = -EINVAL;
+		} else {
+			tp->stealth.integrity_len = val;
+			tp->stealth.mode |= TCP_STEALTH_MODE_INTEGRITY_LEN;
+		}
+		break;
+#endif
 	default:
 		err = -ENOPROTOOPT;
 		break;
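
The two integrity options added above are complementary: TCP_STEALTH_INTEGRITY stores the folded MD5 of a caller-supplied payload prefix in tp->stealth.integrity_hash, while TCP_STEALTH_INTEGRITY_LEN arms the receive-side check added in the tcp_input.c hunks below, which verifies that many leading payload bytes against the peer's ISN. A hedged sketch of how user space might pair them, inferred from these hunks and assuming the TCP_STEALTH* constants come from the patched uapi headers: the side that sends the payload sets the hash, the side that receives it sets the length.

	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/tcp.h>

	/* Active side: install the shared secret, then bind the first `len`
	 * bytes of the payload it will send to its ISN. Both options require
	 * TCP_STEALTH to be set first; call before connect().
	 */
	static int stealth_client_opts(int fd, const char key[64],
				       const void *payload, unsigned short len)
	{
		if (setsockopt(fd, IPPROTO_TCP, TCP_STEALTH, key, 64) < 0)
			return -1;
		return setsockopt(fd, IPPROTO_TCP, TCP_STEALTH_INTEGRITY,
				  payload, len);
	}

	/* Passive side: install the same secret and tell the kernel how many
	 * leading payload bytes to verify; call before listen().
	 */
	static int stealth_server_opts(int fd, const char key[64], int len)
	{
		if (setsockopt(fd, IPPROTO_TCP, TCP_STEALTH, key, 64) < 0)
			return -1;
		return setsockopt(fd, IPPROTO_TCP, TCP_STEALTH_INTEGRITY_LEN,
				  &len, sizeof(len));
	}
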
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index a756b87..9d8e4f7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -77,6 +77,9 @@
 #include <linux/errqueue.h>
 
 int sysctl_tcp_timestamps __read_mostly = 1;
+#ifdef CONFIG_TCP_STEALTH
+EXPORT_SYMBOL(sysctl_tcp_timestamps);
+#endif
 int sysctl_tcp_window_scaling __read_mostly = 1;
 int sysctl_tcp_sack __read_mostly = 1;
 int sysctl_tcp_fack __read_mostly = 1;
@@ -3907,6 +3910,47 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb,
 	return true;
 }
 
+#ifdef CONFIG_TCP_STEALTH
+/* Parse only the TSVal field of the TCP Timestamp option header.
+ */
+const bool tcp_parse_tsval_option(u32 *tsval, const struct tcphdr *th)
+{
+	int length = (th->doff << 2) - sizeof(*th);
+	const u8 *ptr = (const u8 *)(th + 1);
+
+	/* If the TCP option is too short, we can short cut */
+	if (length < TCPOLEN_TIMESTAMP)
+		return false;
+
+	while (length > 0) {
+		int opcode = *ptr++;
+		int opsize;
+
+		switch (opcode) {
+		case TCPOPT_EOL:
+			return false;
+		case TCPOPT_NOP:
+			length--;
+			continue;
+		case TCPOPT_TIMESTAMP:
+			opsize = *ptr++;
+			if (opsize != TCPOLEN_TIMESTAMP || opsize > length)
+				return false;
+			*tsval = get_unaligned_be32(ptr);
+			return true;
+		default:
+			opsize = *ptr++;
+			if (opsize < 2 || opsize > length)
+				return false;
+		}
+		ptr += opsize - 2;
+		length -= opsize;
+	}
+	return false;
+}
+EXPORT_SYMBOL(tcp_parse_tsval_option);
+#endif
+
 #ifdef CONFIG_TCP_MD5SIG
 /*
  * Parse MD5 Signature option
@@ -4595,6 +4639,31 @@ err:
 
 }
 
+#ifdef CONFIG_TCP_STEALTH
+static int __tcp_stealth_integrity_check(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcphdr *th = tcp_hdr(skb);
+	struct tcp_sock *tp = tcp_sk(sk);
+	u16 hash;
+	__be32 seq = cpu_to_be32(TCP_SKB_CB(skb)->seq - 1);
+	char *data = skb->data + th->doff * 4;
+	int len = skb->len - th->doff * 4;
+
+	if (len < tp->stealth.integrity_len)
+		return 1;
+
+	if (tcp_stealth_integrity(&hash, tp->stealth.secret, data,
+				  tp->stealth.integrity_len))
+		return 1;
+
+	if (be32_isn_to_be16_ih(seq) != cpu_to_be16(hash))
+		return 1;
+
+	tp->stealth.mode &= ~TCP_STEALTH_MODE_INTEGRITY_LEN;
+	return 0;
+}
+#endif
+
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
@@ -4605,6 +4674,15 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 		__kfree_skb(skb);
 		return;
 	}
+
+#ifdef CONFIG_TCP_STEALTH
+	if (unlikely(tp->stealth.mode & TCP_STEALTH_MODE_INTEGRITY_LEN) &&
+	    __tcp_stealth_integrity_check(sk, skb)) {
+		tcp_reset(sk);
+		goto drop;
+	}
+#endif
+
 	skb_dst_drop(skb);
 	__skb_pull(skb, tcp_hdr(skb)->doff * 4);
 
@@ -5371,6 +5449,15 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 			int eaten = 0;
 			bool fragstolen = false;
 
+#ifdef CONFIG_TCP_STEALTH
+			if (unlikely(tp->stealth.mode &
+				     TCP_STEALTH_MODE_INTEGRITY_LEN) &&
+			    __tcp_stealth_integrity_check(sk, skb)) {
+				tcp_reset(sk);
+				goto discard;
+			}
+#endif
+
 			if (tp->ucopy.task == current &&
 			    tp->copied_seq == tp->rcv_nxt &&
 			    len - tcp_header_len <= tp->ucopy.len &&
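
__tcp_stealth_integrity_check() above recomputes tcp_stealth_integrity() over the first integrity_len bytes of the incoming data and compares the result with 16 bits extracted (by be32_isn_to_be16_ih(), defined elsewhere in this patch) from the sequence number just before the data, i.e. effectively the peer's ISN. The 16-bit value itself is the MD5 digest of the 64-byte secret followed by the payload prefix, XOR-folded into one half-word. A user-space tool wanting to precompute the same value could fold a digest like this (a minimal sketch; computing the MD5 digest and the exact placement inside the ISN are handled elsewhere and not reproduced here):

	#include <stdint.h>

	/* Fold a 16-byte MD5 digest into the 16-bit integrity value, mirroring
	 * the XOR loop in tcp_stealth_integrity(): the eight big-endian
	 * half-words of the digest are XORed together. The digest must be
	 * computed over the 64-byte secret followed by the payload prefix.
	 */
	static uint16_t stealth_fold_digest(const unsigned char digest[16])
	{
		uint16_t hash = 0;
		int i;

		for (i = 0; i < 16; i += 2)
			hash ^= (uint16_t)((digest[i] << 8) | digest[i + 1]);
		return hash;
	}
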
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 7b235fa..349ac2a 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -74,6 +74,7 @@
 #include <net/xfrm.h>
 #include <net/secure_seq.h>
 #include <net/busy_poll.h>
+#include <net/secure_seq.h>
 
 #include <linux/inet.h>
 #include <linux/ipv6.h>
@@ -234,6 +235,21 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	sk->sk_gso_type = SKB_GSO_TCPV4;
 	sk_setup_caps(sk, &rt->dst);
 
+#ifdef CONFIG_TCP_STEALTH
+	/* If CONFIG_TCP_STEALTH is defined, we need to know the timestamp as
+	 * early as possible, so the snapshot of tcp_time_stamp is already
+	 * taken here.
+	 */
+	skb_mstamp_get(&tp->stealth.mstamp);
+
+	if (!tp->write_seq && likely(!tp->repair) &&
+	    unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH))
+		tp->write_seq = tcp_stealth_sequence_number(sk,
+					&inet->inet_daddr,
+					sizeof(inet->inet_daddr),
+					usin->sin_port);
+#endif
+
 	if (!tp->write_seq && likely(!tp->repair))
 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 							   inet->inet_daddr,
@@ -1383,6 +1399,8 @@ static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
  */
 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcphdr *th = tcp_hdr(skb);
 	struct sock *rsk;
 
 	if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
@@ -1404,6 +1422,15 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (tcp_checksum_complete(skb))
 		goto csum_err;
 
+#ifdef CONFIG_TCP_STEALTH
+	if (sk->sk_state == TCP_LISTEN && th->syn && !th->fin &&
+	    unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH) &&
+	    tcp_stealth_do_auth(sk, skb)) {
+		rsk = sk;
+		goto reset;
+	}
+#endif
+
 	if (sk->sk_state == TCP_LISTEN) {
 		struct sock *nsk = tcp_v4_cookie_check(sk, skb);
 
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index d48d557..ac98740 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -931,6 +931,13 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
 	tcb = TCP_SKB_CB(skb);
 	memset(&opts, 0, sizeof(opts));
 
+#ifdef CONFIG_TCP_STEALTH
+	if (unlikely(tcb->tcp_flags & TCPHDR_SYN &&
+		     tp->stealth.mode & TCP_STEALTH_MODE_AUTH)) {
+		skb->skb_mstamp = tp->stealth.mstamp;
+	}
+#endif
+
 	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
 		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
 	else
@@ -3307,7 +3314,15 @@ int tcp_connect(struct sock *sk)
 		return -ENOBUFS;
 
 	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
+#ifdef CONFIG_TCP_STEALTH
+	/* The timestamp was already taken at the time the ISN was generated,
+	 * as we need to know its value in the tcp_stealth_sequence_number()
+	 * function.
+	 */
+	tp->retrans_stamp = tp->stealth.mstamp.stamp_jiffies;
+#else
 	tp->retrans_stamp = tcp_time_stamp;
+#endif
 	tcp_connect_queue_skb(sk, buff);
 	tcp_ecn_send_syn(sk, buff);
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 7370ad2..2772004 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -447,8 +447,10 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info,
 
 	if (__ipv6_addr_needs_scope_id(addr_type))
 		iif = skb->dev->ifindex;
-	else
-		iif = l3mdev_master_ifindex(skb_dst(skb)->dev);
+	else {
+		dst = skb_dst(skb);
+		iif = l3mdev_master_ifindex(dst ? dst->dev : skb->dev);
+	}
 
 	/*
 	 *	Must not send error if the source does not uniquely
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index af6a09e..1620440 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -62,6 +62,7 @@
 #include <net/inet_common.h>
 #include <net/secure_seq.h>
 #include <net/busy_poll.h>
+#include <net/secure_seq.h>
 
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
@@ -278,6 +279,21 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 
 	sk_set_txhash(sk);
 
+#ifdef CONFIG_TCP_STEALTH
+	/* If CONFIG_TCP_STEALTH is defined, we need to know the timestamp as
+	 * early as possible, so the snapshot of tcp_time_stamp is already
+	 * taken here.
+	 */
+	skb_mstamp_get(&tp->stealth.mstamp);
+
+	if (!tp->write_seq && likely(!tp->repair) &&
+	    unlikely(tp->stealth.mode & TCP_STEALTH_MODE_AUTH))
+		tp->write_seq = tcp_stealth_sequence_number(sk,
+					sk->sk_v6_daddr.s6_addr32,
+					sizeof(sk->sk_v6_daddr),
+					inet->inet_dport);
+#endif
+
 	if (!tp->write_seq && likely(!tp->repair))
 		tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32,
 							     sk->sk_v6_daddr.s6_addr32,
@@ -1214,7 +1230,8 @@ static void tcp_v6_restore_cb(struct sk_buff *skb)
 static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
-	struct tcp_sock *tp;
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcphdr *th = tcp_hdr(skb);
 	struct sk_buff *opt_skb = NULL;
 
 	/* Imagine: socket is IPv6. IPv4 packet arrives,
@@ -1274,6 +1291,13 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (tcp_checksum_complete(skb))
 		goto csum_err;
 
+#ifdef CONFIG_TCP_STEALTH
+	if (sk->sk_state == TCP_LISTEN && th->syn && !th->fin &&
+	    tp->stealth.mode & TCP_STEALTH_MODE_AUTH &&
+	    tcp_stealth_do_auth(sk, skb))
+		goto reset;
+#endif
+
 	if (sk->sk_state == TCP_LISTEN) {
 		struct sock *nsk = tcp_v6_cookie_check(sk, skb);
 
diff --git a/scripts/mkcompile_h b/scripts/mkcompile_h
index 6fdc97e..20739e4 100755
--- a/scripts/mkcompile_h
+++ b/scripts/mkcompile_h
@@ -54,8 +54,8 @@ else
 fi
 
 UTS_VERSION="#$VERSION"
-CONFIG_FLAGS=""
-if [ -n "$SMP" ] ; then CONFIG_FLAGS="SMP"; fi
+CONFIG_FLAGS="PCK"
+if [ -n "$SMP" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS SMP"; fi
 if [ -n "$PREEMPT" ] ; then CONFIG_FLAGS="$CONFIG_FLAGS PREEMPT"; fi
 UTS_VERSION="$UTS_VERSION $CONFIG_FLAGS $TIMESTAMP"
 
diff --git a/security/commoncap.c b/security/commoncap.c
index 14540bd..4e3b242 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -1066,12 +1066,14 @@ int cap_mmap_addr(unsigned long addr)
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(cap_mmap_addr);
 
 int cap_mmap_file(struct file *file, unsigned long reqprot,
 		  unsigned long prot, unsigned long flags)
 {
 	return 0;
 }
+EXPORT_SYMBOL_GPL(cap_mmap_file);
 
 #ifdef CONFIG_SECURITY
 
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index 03c1652..f88c84b 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -7,6 +7,7 @@
 #include <linux/device_cgroup.h>
 #include <linux/cgroup.h>
 #include <linux/ctype.h>
+#include <linux/export.h>
 #include <linux/list.h>
 #include <linux/uaccess.h>
 #include <linux/seq_file.h>
@@ -849,6 +850,7 @@ int __devcgroup_inode_permission(struct inode *inode, int mask)
 	return __devcgroup_check_permission(type, imajor(inode), iminor(inode),
 			access);
 }
+EXPORT_SYMBOL_GPL(__devcgroup_inode_permission);
 
 int devcgroup_inode_mknod(int mode, dev_t dev)
 {
diff --git a/security/security.c b/security/security.c
index 4838e7f..36c741e 100644
--- a/security/security.c
+++ b/security/security.c
@@ -434,6 +434,7 @@ int security_path_rmdir(const struct path *dir, struct dentry *dentry)
 		return 0;
 	return call_int_hook(path_rmdir, 0, dir, dentry);
 }
+EXPORT_SYMBOL_GPL(security_path_rmdir);
 
 int security_path_unlink(const struct path *dir, struct dentry *dentry)
 {
@@ -450,6 +451,7 @@ int security_path_symlink(const struct path *dir, struct dentry *dentry,
 		return 0;
 	return call_int_hook(path_symlink, 0, dir, dentry, old_name);
 }
+EXPORT_SYMBOL_GPL(security_path_symlink);
 
 int security_path_link(struct dentry *old_dentry, const struct path *new_dir,
 		       struct dentry *new_dentry)
@@ -458,6 +460,7 @@ int security_path_link(struct dentry *old_dentry, const struct path *new_dir,
 		return 0;
 	return call_int_hook(path_link, 0, old_dentry, new_dir, new_dentry);
 }
+EXPORT_SYMBOL_GPL(security_path_link);
 
 int security_path_rename(const struct path *old_dir, struct dentry *old_dentry,
 			 const struct path *new_dir, struct dentry *new_dentry,
@@ -485,6 +488,7 @@ int security_path_truncate(const struct path *path)
 		return 0;
 	return call_int_hook(path_truncate, 0, path);
 }
+EXPORT_SYMBOL_GPL(security_path_truncate);
 
 int security_path_chmod(const struct path *path, umode_t mode)
 {
@@ -492,6 +496,7 @@ int security_path_chmod(const struct path *path, umode_t mode)
 		return 0;
 	return call_int_hook(path_chmod, 0, path, mode);
 }
+EXPORT_SYMBOL_GPL(security_path_chmod);
 
 int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
 {
@@ -499,6 +504,7 @@ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
 		return 0;
 	return call_int_hook(path_chown, 0, path, uid, gid);
 }
+EXPORT_SYMBOL_GPL(security_path_chown);
 
 int security_path_chroot(const struct path *path)
 {
@@ -584,6 +590,7 @@ int security_inode_readlink(struct dentry *dentry)
 		return 0;
 	return call_int_hook(inode_readlink, 0, dentry);
 }
+EXPORT_SYMBOL_GPL(security_inode_readlink);
 
 int security_inode_follow_link(struct dentry *dentry, struct inode *inode,
 			       bool rcu)
@@ -599,6 +606,7 @@ int security_inode_permission(struct inode *inode, int mask)
 		return 0;
 	return call_int_hook(inode_permission, 0, inode, mask);
 }
+EXPORT_SYMBOL_GPL(security_inode_permission);
 
 int security_inode_setattr(struct dentry *dentry, struct iattr *attr)
 {
@@ -758,6 +766,7 @@ int security_file_permission(struct file *file, int mask)
 
 	return fsnotify_perm(file, mask);
 }
+EXPORT_SYMBOL_GPL(security_file_permission);
 
 int security_file_alloc(struct file *file)
 {
@@ -817,6 +826,7 @@ int security_mmap_file(struct file *file, unsigned long prot,
 		return ret;
 	return ima_file_mmap(file, prot);
 }
+EXPORT_SYMBOL_GPL(security_mmap_file);
 
 int security_mmap_addr(unsigned long addr)
 {