]> git.karo-electronics.de Git - karo-tx-linux.git/blobdiff - net/ceph/crush/mapper.c
crush: clarify numrep vs endpos
[karo-tx-linux.git] / net / ceph / crush / mapper.c
index cbd06a91941c15f3e88b9dd6671694e2fe76dbe2..125dbd04f2b6a6e58e2dde4ae072f6e254250d65 100644 (file)
@@ -189,7 +189,7 @@ static int terminal(int x)
 static int bucket_tree_choose(struct crush_bucket_tree *bucket,
                              int x, int r)
 {
-       int n, l;
+       int n;
        __u32 w;
        __u64 t;
 
@@ -197,6 +197,7 @@ static int bucket_tree_choose(struct crush_bucket_tree *bucket,
        n = bucket->num_nodes >> 1;
 
        while (!terminal(n)) {
+               int l;
                /* pick point in [0, w) */
                w = bucket->node_weights[n];
                t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
@@ -264,8 +265,12 @@ static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
  * true if device is marked "out" (failed, fully offloaded)
  * of the cluster
  */
-static int is_out(const struct crush_map *map, const __u32 *weight, int item, int x)
+static int is_out(const struct crush_map *map,
+                 const __u32 *weight, int weight_max,
+                 int item, int x)
 {
+       if (item >= weight_max)
+               return 1;
        if (weight[item] >= 0x10000)
                return 0;
        if (weight[item] == 0)
@@ -277,7 +282,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
 }
 
 /**
- * crush_choose - choose numrep distinct items of given type
+ * crush_choose_firstn - choose numrep distinct items of given type
  * @map: the crush_map
  * @bucket: the bucket we are choose an item from
  * @x: crush input value
@@ -285,18 +290,17 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
  * @type: the type of item to choose
  * @out: pointer to output vector
  * @outpos: our position in that vector
- * @firstn: true if choosing "first n" items, false if choosing "indep"
  * @recurse_to_leaf: true if we want one device under each item of given type
  * @descend_once: true if we should only try one descent before giving up
  * @out2: second output vector for leaf items (if @recurse_to_leaf)
  */
-static int crush_choose(const struct crush_map *map,
-                       struct crush_bucket *bucket,
-                       const __u32 *weight,
-                       int x, int numrep, int type,
-                       int *out, int outpos,
-                       int firstn, int recurse_to_leaf,
-                       int descend_once, int *out2)
+static int crush_choose_firstn(const struct crush_map *map,
+                              struct crush_bucket *bucket,
+                              const __u32 *weight, int weight_max,
+                              int x, int numrep, int type,
+                              int *out, int outpos,
+                              int recurse_to_leaf,
+                              int descend_once, int *out2)
 {
        int rep;
        unsigned int ftotal, flocal;
@@ -325,26 +329,8 @@ static int crush_choose(const struct crush_map *map,
                                collide = 0;
                                retry_bucket = 0;
                                r = rep;
-                               if (in->alg == CRUSH_BUCKET_UNIFORM) {
-                                       /* be careful */
-                                       if (firstn || (__u32)numrep >= in->size)
-                                               /* r' = r + f_total */
-                                               r += ftotal;
-                                       else if (in->size % numrep == 0)
-                                               /* r'=r+(n+1)*f_local */
-                                               r += (numrep+1) *
-                                                       (flocal+ftotal);
-                                       else
-                                               /* r' = r + n*f_local */
-                                               r += numrep * (flocal+ftotal);
-                               } else {
-                                       if (firstn)
-                                               /* r' = r + f_total */
-                                               r += ftotal;
-                                       else
-                                               /* r' = r + n*f_local */
-                                               r += numrep * (flocal+ftotal);
-                               }
+                               /* r' = r + f_total */
+                               r += ftotal;
 
                                /* bucket choose */
                                if (in->size == 0) {
@@ -394,12 +380,12 @@ static int crush_choose(const struct crush_map *map,
                                reject = 0;
                                if (!collide && recurse_to_leaf) {
                                        if (item < 0) {
-                                               if (crush_choose(map,
+                                               if (crush_choose_firstn(map,
                                                         map->buckets[-1-item],
-                                                        weight,
+                                                        weight, weight_max,
                                                         x, outpos+1, 0,
                                                         out2, outpos,
-                                                        firstn, 0,
+                                                        0,
                                                         map->chooseleaf_descend_once,
                                                         NULL) <= outpos)
                                                        /* didn't get leaf */
@@ -414,6 +400,7 @@ static int crush_choose(const struct crush_map *map,
                                        /* out? */
                                        if (itemtype == 0)
                                                reject = is_out(map, weight,
+                                                               weight_max,
                                                                item, x);
                                        else
                                                reject = 0;
@@ -463,6 +450,156 @@ reject:
 }
 
 
+/**
+ * crush_choose_indep: alternative breadth-first positionally stable mapping
+ * Fills out[outpos..outpos+left); unfilled slots become CRUSH_ITEM_NONE.
+ */
+static void crush_choose_indep(const struct crush_map *map,
+                             struct crush_bucket *bucket,
+                             const __u32 *weight, int weight_max,
+                              int x, int left, int numrep, int type,
+                             int *out, int outpos,
+                             int recurse_to_leaf,
+                             int *out2)
+{
+       struct crush_bucket *in = bucket;
+       int endpos = outpos + left;
+       int rep;
+       unsigned int ftotal;
+       int r;
+       int i;
+       int item = 0;
+       int itemtype;
+       int collide;
+
+       dprintk("CHOOSE%s INDEP bucket %d x %d outpos %d numrep %d\n", recurse_to_leaf ? "_LEAF" : "",
+               bucket->id, x, outpos, numrep);
+
+       /* initially my result is undefined */
+       for (rep = outpos; rep < endpos; rep++) {
+               out[rep] = CRUSH_ITEM_UNDEF;
+               if (out2)
+                       out2[rep] = CRUSH_ITEM_UNDEF;
+       }
+
+       for (ftotal = 0; left > 0 && ftotal < map->choose_total_tries; ftotal++) {
+               for (rep = outpos; rep < endpos; rep++) {
+                       if (out[rep] != CRUSH_ITEM_UNDEF)
+                               continue;
+
+                       in = bucket;  /* initial bucket */
+
+                       /* choose through intervening buckets */
+                       for (;;) {
+                               /* note: we base the choice on the position
+                                * even in the nested call.  that means that
+                                * if the first layer chooses the same bucket
+                                * in a different position, we will tend to
+                                * choose a different item in that bucket.
+                                * this will involve more devices in data
+                                * movement and tend to distribute the load.
+                                */
+                               r = rep;
+
+                               /* be careful: pick the retry stride */
+                               if (in->alg == CRUSH_BUCKET_UNIFORM &&
+                                   in->size % numrep == 0)
+                                       /* r'=r+(n+1)*f_total */
+                                       r += (numrep+1) * ftotal;
+                               else
+                                       /* r' = r + n*f_total */
+                                       r += numrep * ftotal;
+
+                               /* bucket choose (empty: slot stays undef) */
+                               if (in->size == 0) {
+                                       dprintk("   empty bucket\n");
+                                       break;
+                               }
+
+                               item = crush_bucket_choose(in, x, r);
+                               if (item >= map->max_devices) {
+                                       dprintk("   bad item %d\n", item);
+                                       out[rep] = CRUSH_ITEM_NONE;
+                                       if (out2)
+                                               out2[rep] = CRUSH_ITEM_NONE;
+                                       left--;
+                                       break;
+                               }
+
+                               /* desired type? */
+                               if (item < 0)
+                                       itemtype = map->buckets[-1-item]->type;
+                               else
+                                       itemtype = 0;
+                               dprintk("  item %d type %d\n", item, itemtype);
+
+                               /* keep going? descend until type matches */
+                               if (itemtype != type) {
+                                       if (item >= 0 ||
+                                           (-1-item) >= map->max_buckets) {
+                                               dprintk("   bad item type %d\n", type);
+                                               out[rep] = CRUSH_ITEM_NONE;
+                                               if (out2)
+                                                       out2[rep] =
+                                                               CRUSH_ITEM_NONE;
+                                               left--;
+                                               break;
+                                       }
+                                       in = map->buckets[-1-item];
+                                       continue;
+                               }
+
+                               /* collision? already in out[outpos..endpos) */
+                               collide = 0;
+                               for (i = outpos; i < endpos; i++) {
+                                       if (out[i] == item) {
+                                               collide = 1;
+                                               break;
+                                       }
+                               }
+                               if (collide)
+                                       break;
+
+                               if (recurse_to_leaf) {
+                                       if (item < 0) {
+                                               crush_choose_indep(map,
+                                                                  map->buckets[-1-item],
+                                                                  weight, weight_max,
+                                                                  x, 1, numrep, 0,
+                                                                  out2, rep,
+                                                                  0, NULL);
+                                               if (out2[rep] == CRUSH_ITEM_NONE) {
+                                                       /* placed nothing; no leaf */
+                                                       break;
+                                               }
+                                       } else {
+                                               /* we already have a leaf! */
+                                               out2[rep] = item;
+                                       }
+                               }
+
+                               /* out? leave out[rep] undef for a retry */
+                               if (itemtype == 0 &&
+                                   is_out(map, weight, weight_max, item, x))
+                                       break;
+
+                               /* yay! */
+                               out[rep] = item;
+                               left--;
+                               break;
+                       }
+               }
+       }
+       for (rep = outpos; rep < endpos; rep++) {
+               if (out[rep] == CRUSH_ITEM_UNDEF) {
+                       out[rep] = CRUSH_ITEM_NONE;
+               }
+               if (out2 && out2[rep] == CRUSH_ITEM_UNDEF) {
+                       out2[rep] = CRUSH_ITEM_NONE;
+               }
+       }
+}
+
 /**
  * crush_do_rule - calculate a mapping with the given input and rule
  * @map: the crush_map
@@ -470,15 +607,19 @@ reject:
  * @x: hash input
  * @result: pointer to result vector
  * @result_max: maximum result size
+ * @weight: weight vector (for map leaves)
+ * @weight_max: size of weight vector
+ * @scratch: scratch vector for private use; must be >= 3 * result_max
  */
 int crush_do_rule(const struct crush_map *map,
                  int ruleno, int x, int *result, int result_max,
-                 const __u32 *weight)
+                 const __u32 *weight, int weight_max,
+                 int *scratch)
 {
        int result_len;
-       int a[CRUSH_MAX_SET];
-       int b[CRUSH_MAX_SET];
-       int c[CRUSH_MAX_SET];
+       int *a = scratch;
+       int *b = scratch + result_max;
+       int *c = scratch + result_max*2;
        int recurse_to_leaf;
        int *w;
        int wsize = 0;
@@ -489,7 +630,6 @@ int crush_do_rule(const struct crush_map *map,
        __u32 step;
        int i, j;
        int numrep;
-       int firstn;
        const int descend_once = 0;
 
        if ((__u32)ruleno >= map->max_rules) {
@@ -503,9 +643,9 @@ int crush_do_rule(const struct crush_map *map,
        o = b;
 
        for (step = 0; step < rule->len; step++) {
+               int firstn = 0;
                struct crush_rule_step *curstep = &rule->steps[step];
 
-               firstn = 0;
                switch (curstep->op) {
                case CRUSH_RULE_TAKE:
                        w[0] = curstep->arg1;
@@ -543,22 +683,35 @@ int crush_do_rule(const struct crush_map *map,
                                                continue;
                                }
                                j = 0;
-                               osize += crush_choose(map,
-                                                     map->buckets[-1-w[i]],
-                                                     weight,
-                                                     x, numrep,
-                                                     curstep->arg2,
-                                                     o+osize, j,
-                                                     firstn,
-                                                     recurse_to_leaf,
-                                                     descend_once, c+osize);
+                               if (firstn) {
+                                       osize += crush_choose_firstn(
+                                               map,
+                                               map->buckets[-1-w[i]],
+                                               weight, weight_max,
+                                               x, numrep,
+                                               curstep->arg2,
+                                               o+osize, j,
+                                               recurse_to_leaf,
+                                               descend_once, c+osize);
+                               } else {
+                                       crush_choose_indep(
+                                               map,
+                                               map->buckets[-1-w[i]],
+                                               weight, weight_max,
+                                               x, numrep, numrep,
+                                               curstep->arg2,
+                                               o+osize, j,
+                                               recurse_to_leaf,
+                                               c+osize);
+                                       osize += numrep;
+                               }
                        }
 
                        if (recurse_to_leaf)
                                /* copy final _leaf_ values to output set */
                                memcpy(o, c, osize*sizeof(*o));
 
-                       /* swap t and w arrays */
+                       /* swap o and w arrays */
                        tmp = o;
                        o = w;
                        w = tmp;