All the mail mirrored from lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] xfs_repair: fix rebuilding btree node block less than minrecs
@ 2020-06-09 11:40 Gao Xiang
  2020-06-09 14:35 ` Gao Xiang
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Gao Xiang @ 2020-06-09 11:40 UTC (permalink / raw
  To: linux-xfs; +Cc: Gao Xiang, Darrick J. Wong, Dave Chinner, Eric Sandeen

In production, we found that sometimes xfs_repair phase 5
rebuilds freespace node block with pointers less than minrecs
and if we trigger xfs_repair again it would report such
the following message:

bad btree nrecs (39, min=40, max=80) in btbno block 0/7882

The background is that xfs_repair starts to rebuild AGFL
after the freespace btree is settled in phase 5 so we may
need to leave necessary room in advance for each btree
leaves in order to avoid freespace btree split and then
result in AGFL rebuild fails. The old mathematics uses
ceil(num_extents / maxrecs) to decide the number of node
blocks. That would be fine without leaving extra space
since minrecs = maxrecs / 2 but if some slack was decreased
from maxrecs, the result would be larger than what is
expected and cause num_recs_pb less than minrecs, i.e:

num_extents = 79, adj_maxrecs = 80 - 2 (slack) = 78

so we'd get

num_blocks = ceil(79 / 78) = 2,
num_recs_pb = 79 / 2 = 39, which is less than
minrecs = 80 / 2 = 40

OTOH, btree bulk loading code behaves in a different way.
As in xfs_btree_bload_level_geometry it wrote

num_blocks = floor(num_extents / maxrecs)

which will never go below minrecs. And when it goes
above maxrecs, just increment num_blocks and recalculate
so we can get the reasonable results.

In the long term, btree bulk loader will replace the current
repair code as well as to resolve AGFL dependency issue.
But we may still want to look for a backportable solution
for stable versions. Hence, use the same logic to avoid the
freespace btree minrecs underflow for now.

Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Eric Sandeen <sandeen@sandeen.net>
Fixes: 9851fd79bfb1 ("repair: AGFL rebuild fails if btree split required")
Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
---
not heavy tested yet..

 repair/phase5.c | 101 +++++++++++++++++++++---------------------------
 1 file changed, 45 insertions(+), 56 deletions(-)

diff --git a/repair/phase5.c b/repair/phase5.c
index abae8a08..997804a5 100644
--- a/repair/phase5.c
+++ b/repair/phase5.c
@@ -348,11 +348,29 @@ finish_cursor(bt_status_t *curs)
  * failure at runtime. Hence leave a couple of records slack space in
  * each block to allow immediate modification of the tree without
  * requiring splits to be done.
- *
- * XXX(hch): any reason we don't just look at mp->m_alloc_mxr?
  */
-#define XR_ALLOC_BLOCK_MAXRECS(mp, level) \
-	(libxfs_allocbt_maxrecs((mp), (mp)->m_sb.sb_blocksize, (level) == 0) - 2)
+static void
+compute_level_geometry(xfs_mount_t *mp, bt_stat_level_t *lptr,
+		       uint64_t nr_this_level, bool leaf)
+{
+	unsigned int		maxrecs = mp->m_alloc_mxr[!leaf];
+	int			slack = leaf ? 2 : 0;
+	unsigned int		desired_npb;
+
+	desired_npb = max(mp->m_alloc_mnr[!leaf], maxrecs - slack);
+	lptr->num_recs_tot = nr_this_level;
+	lptr->num_blocks = max(1ULL, nr_this_level / desired_npb);
+
+	lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
+	lptr->modulo = nr_this_level % lptr->num_blocks;
+	if (lptr->num_recs_pb > maxrecs || (lptr->num_recs_pb == maxrecs &&
+			lptr->modulo)) {
+		lptr->num_blocks++;
+
+		lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
+		lptr->modulo = nr_this_level % lptr->num_blocks;
+	}
+}
 
 /*
  * this calculates a freespace cursor for an ag.
@@ -370,6 +388,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	int			i;
 	int			extents_used;
 	int			extra_blocks;
+	uint64_t		old_blocks;
 	bt_stat_level_t		*lptr;
 	bt_stat_level_t		*p_lptr;
 	extent_tree_node_t	*ext_ptr;
@@ -388,10 +407,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	 * of the tree and set up the cursor for the leaf level
 	 * (note that the same code is duplicated further down)
 	 */
-	lptr->num_blocks = howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0));
-	lptr->num_recs_pb = num_extents / lptr->num_blocks;
-	lptr->modulo = num_extents % lptr->num_blocks;
-	lptr->num_recs_tot = num_extents;
+	compute_level_geometry(mp, lptr, num_extents, true);
 	level = 1;
 
 #ifdef XR_BLD_FREE_TRACE
@@ -406,18 +422,12 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	 * per level is the # of blocks in the level below it
 	 */
 	if (lptr->num_blocks > 1)  {
-		for (; btree_curs->level[level - 1].num_blocks > 1
-				&& level < XFS_BTREE_MAXLEVELS;
-				level++)  {
+		do {
+			p_lptr = lptr;
 			lptr = &btree_curs->level[level];
-			p_lptr = &btree_curs->level[level - 1];
-			lptr->num_blocks = howmany(p_lptr->num_blocks,
-					XR_ALLOC_BLOCK_MAXRECS(mp, level));
-			lptr->modulo = p_lptr->num_blocks
-					% lptr->num_blocks;
-			lptr->num_recs_pb = p_lptr->num_blocks
-					/ lptr->num_blocks;
-			lptr->num_recs_tot = p_lptr->num_blocks;
+
+			compute_level_geometry(mp, lptr,
+					p_lptr->num_blocks, false);
 #ifdef XR_BLD_FREE_TRACE
 			fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
 					level,
@@ -426,7 +436,9 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 					lptr->modulo,
 					lptr->num_recs_tot);
 #endif
-		}
+			level++;
+		} while (lptr->num_blocks > 1);
+		ASSERT (level < XFS_BTREE_MAXLEVELS);
 	}
 
 	ASSERT(lptr->num_blocks == 1);
@@ -496,8 +508,11 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	 * see if the number of leaf blocks will change as a result
 	 * of the number of extents changing
 	 */
-	if (howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0))
-			!= btree_curs->level[0].num_blocks)  {
+	old_blocks = btree_curs->level[0].num_blocks;
+	compute_level_geometry(mp, &btree_curs->level[0], num_extents, true);
+	extra_blocks = 0;
+
+	if (old_blocks != btree_curs->level[0].num_blocks)  {
 		/*
 		 * yes -- recalculate the cursor.  If the number of
 		 * excess (overallocated) blocks is < xfs_agfl_size/2, we're ok.
@@ -553,30 +568,20 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 		}
 
 		lptr = &btree_curs->level[0];
-		lptr->num_blocks = howmany(num_extents,
-					XR_ALLOC_BLOCK_MAXRECS(mp, 0));
-		lptr->num_recs_pb = num_extents / lptr->num_blocks;
-		lptr->modulo = num_extents % lptr->num_blocks;
-		lptr->num_recs_tot = num_extents;
 		level = 1;
 
 		/*
 		 * if we need more levels, set them up
 		 */
 		if (lptr->num_blocks > 1)  {
-			for (level = 1; btree_curs->level[level-1].num_blocks
-					> 1 && level < XFS_BTREE_MAXLEVELS;
-					level++)  {
-				lptr = &btree_curs->level[level];
-				p_lptr = &btree_curs->level[level-1];
-				lptr->num_blocks = howmany(p_lptr->num_blocks,
-					XR_ALLOC_BLOCK_MAXRECS(mp, level));
-				lptr->modulo = p_lptr->num_blocks
-						% lptr->num_blocks;
-				lptr->num_recs_pb = p_lptr->num_blocks
-						/ lptr->num_blocks;
-				lptr->num_recs_tot = p_lptr->num_blocks;
-			}
+			do {
+				p_lptr = lptr;
+				lptr = &btree_curs->level[level++];
+
+				compute_level_geometry(mp, lptr,
+						p_lptr->num_blocks, false);
+			} while (lptr->num_blocks > 1);
+			ASSERT (level < XFS_BTREE_MAXLEVELS);
 		}
 		ASSERT(lptr->num_blocks == 1);
 		btree_curs->num_levels = level;
@@ -591,22 +596,6 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 
 		ASSERT(blocks_allocated_total >= blocks_needed);
 		extra_blocks = blocks_allocated_total - blocks_needed;
-	} else  {
-		if (extents_used > 0) {
-			/*
-			 * reset the leaf level geometry to account
-			 * for consumed extents.  we can leave the
-			 * rest of the cursor alone since the number
-			 * of leaf blocks hasn't changed.
-			 */
-			lptr = &btree_curs->level[0];
-
-			lptr->num_recs_pb = num_extents / lptr->num_blocks;
-			lptr->modulo = num_extents % lptr->num_blocks;
-			lptr->num_recs_tot = num_extents;
-		}
-
-		extra_blocks = 0;
 	}
 
 	btree_curs->num_tot_blocks = blocks_allocated_pt;
-- 
2.18.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] xfs_repair: fix rebuilding btree node block less than minrecs
  2020-06-09 11:40 [RFC PATCH] xfs_repair: fix rebuilding btree node block less than minrecs Gao Xiang
@ 2020-06-09 14:35 ` Gao Xiang
  2020-06-09 20:14 ` Darrick J. Wong
  2020-06-10  3:58 ` [RFC PATCH v2] xfs_repair: fix rebuilding btree " Gao Xiang
  2 siblings, 0 replies; 9+ messages in thread
From: Gao Xiang @ 2020-06-09 14:35 UTC (permalink / raw
  To: linux-xfs; +Cc: Darrick J. Wong, Dave Chinner, Eric Sandeen

On Tue, Jun 09, 2020 at 07:40:53PM +0800, Gao Xiang wrote:
> In production, we found that sometimes xfs_repair phase 5
> rebuilds freespace node block with pointers less than minrecs
> and if we trigger xfs_repair again it would report such
> the following message:
> 
> bad btree nrecs (39, min=40, max=80) in btbno block 0/7882
> 
> The background is that xfs_repair starts to rebuild AGFL
> after the freespace btree is settled in phase 5 so we may
> need to leave necessary room in advance for each btree
> leaves in order to avoid freespace btree split and then
> result in AGFL rebuild fails. The old mathematics uses
> ceil(num_extents / maxrecs) to decide the number of node
> blocks. That would be fine without leaving extra space
> since minrecs = maxrecs / 2 but if some slack was decreased
> from maxrecs, the result would be larger than what is
> expected and cause num_recs_pb less than minrecs, i.e:
> 
> num_extents = 79, adj_maxrecs = 80 - 2 (slack) = 78
> 
> so we'd get
> 
> num_blocks = ceil(79 / 78) = 2,
> num_recs_pb = 79 / 2 = 39, which is less than
> minrecs = 80 / 2 = 40
> 
> OTOH, btree bulk loading code behaves in a different way.
> As in xfs_btree_bload_level_geometry it wrote
> 
> num_blocks = floor(num_extents / maxrecs)
> 
> which will never go below minrecs. And when it goes
> above maxrecs, just increment num_blocks and recalculate
> so we can get the reasonable results.
> 
> In the long term, btree bulk loader will replace the current
> repair code as well as to resolve AGFL dependency issue.
> But we may still want to look for a backportable solution
> for stable versions. Hence, use the same logic to avoid the
> freespace btree minrecs underflow for now.
> 
> Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> Cc: Dave Chinner <dchinner@redhat.com>
> Cc: Eric Sandeen <sandeen@sandeen.net>
> Fixes: 9851fd79bfb1 ("repair: AGFL rebuild fails if btree split required")
> Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> ---
> not heavy tested yet..
> 
>  repair/phase5.c | 101 +++++++++++++++++++++---------------------------
>  1 file changed, 45 insertions(+), 56 deletions(-)
> 
> diff --git a/repair/phase5.c b/repair/phase5.c
> index abae8a08..997804a5 100644
> --- a/repair/phase5.c
> +++ b/repair/phase5.c
> @@ -348,11 +348,29 @@ finish_cursor(bt_status_t *curs)
>   * failure at runtime. Hence leave a couple of records slack space in
>   * each block to allow immediate modification of the tree without
>   * requiring splits to be done.
> - *
> - * XXX(hch): any reason we don't just look at mp->m_alloc_mxr?
>   */
> -#define XR_ALLOC_BLOCK_MAXRECS(mp, level) \
> -	(libxfs_allocbt_maxrecs((mp), (mp)->m_sb.sb_blocksize, (level) == 0) - 2)
> +static void
> +compute_level_geometry(xfs_mount_t *mp, bt_stat_level_t *lptr,
> +		       uint64_t nr_this_level, bool leaf)
> +{
> +	unsigned int		maxrecs = mp->m_alloc_mxr[!leaf];
> +	int			slack = leaf ? 2 : 0;
> +	unsigned int		desired_npb;
> +
> +	desired_npb = max(mp->m_alloc_mnr[!leaf], maxrecs - slack);
> +	lptr->num_recs_tot = nr_this_level;
> +	lptr->num_blocks = max(1ULL, nr_this_level / desired_npb);
> +
> +	lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
> +	lptr->modulo = nr_this_level % lptr->num_blocks;
> +	if (lptr->num_recs_pb > maxrecs || (lptr->num_recs_pb == maxrecs &&
> +			lptr->modulo)) {
> +		lptr->num_blocks++;
> +
> +		lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
> +		lptr->modulo = nr_this_level % lptr->num_blocks;
> +	}
> +}

side note: alternatively, maybe we could also adjust (by decreasing)
           num_blocks and recalculate for the original approach.
           Although for both ways we could not make 2 extra leaves
           room for the above 79 of 80 case...


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] xfs_repair: fix rebuilding btree node block less than minrecs
  2020-06-09 11:40 [RFC PATCH] xfs_repair: fix rebuilding btree node block less than minrecs Gao Xiang
  2020-06-09 14:35 ` Gao Xiang
@ 2020-06-09 20:14 ` Darrick J. Wong
  2020-06-09 22:12   ` Darrick J. Wong
  2020-06-10  3:58 ` [RFC PATCH v2] xfs_repair: fix rebuilding btree " Gao Xiang
  2 siblings, 1 reply; 9+ messages in thread
From: Darrick J. Wong @ 2020-06-09 20:14 UTC (permalink / raw
  To: Gao Xiang; +Cc: linux-xfs, Dave Chinner, Eric Sandeen

On Tue, Jun 09, 2020 at 07:40:53PM +0800, Gao Xiang wrote:
> In production, we found that sometimes xfs_repair phase 5
> rebuilds freespace node block with pointers less than minrecs
> and if we trigger xfs_repair again it would report such
> the following message:
> 
> bad btree nrecs (39, min=40, max=80) in btbno block 0/7882
> 
> The background is that xfs_repair starts to rebuild AGFL
> after the freespace btree is settled in phase 5 so we may
> need to leave necessary room in advance for each btree
> leaves in order to avoid freespace btree split and then
> result in AGFL rebuild fails. The old mathematics uses
> ceil(num_extents / maxrecs) to decide the number of node
> blocks. That would be fine without leaving extra space
> since minrecs = maxrecs / 2 but if some slack was decreased
> from maxrecs, the result would be larger than what is
> expected and cause num_recs_pb less than minrecs, i.e:
> 
> num_extents = 79, adj_maxrecs = 80 - 2 (slack) = 78
> 
> so we'd get
> 
> num_blocks = ceil(79 / 78) = 2,
> num_recs_pb = 79 / 2 = 39, which is less than
> minrecs = 80 / 2 = 40
> 
> OTOH, btree bulk loading code behaves in a different way.
> As in xfs_btree_bload_level_geometry it wrote
> 
> num_blocks = floor(num_extents / maxrecs)
> 
> which will never go below minrecs. And when it goes
> above maxrecs, just increment num_blocks and recalculate
> so we can get the reasonable results.
> 
> In the long term, btree bulk loader will replace the current
> repair code as well as to resolve AGFL dependency issue.
> But we may still want to look for a backportable solution
> for stable versions. Hence, use the same logic to avoid the
> freespace btree minrecs underflow for now.
> 
> Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> Cc: Dave Chinner <dchinner@redhat.com>
> Cc: Eric Sandeen <sandeen@sandeen.net>
> Fixes: 9851fd79bfb1 ("repair: AGFL rebuild fails if btree split required")
> Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> ---
> not heavy tested yet..
> 
>  repair/phase5.c | 101 +++++++++++++++++++++---------------------------
>  1 file changed, 45 insertions(+), 56 deletions(-)
> 
> diff --git a/repair/phase5.c b/repair/phase5.c
> index abae8a08..997804a5 100644
> --- a/repair/phase5.c
> +++ b/repair/phase5.c
> @@ -348,11 +348,29 @@ finish_cursor(bt_status_t *curs)
>   * failure at runtime. Hence leave a couple of records slack space in
>   * each block to allow immediate modification of the tree without
>   * requiring splits to be done.
> - *
> - * XXX(hch): any reason we don't just look at mp->m_alloc_mxr?
>   */
> -#define XR_ALLOC_BLOCK_MAXRECS(mp, level) \
> -	(libxfs_allocbt_maxrecs((mp), (mp)->m_sb.sb_blocksize, (level) == 0) - 2)
> +static void
> +compute_level_geometry(xfs_mount_t *mp, bt_stat_level_t *lptr,
> +		       uint64_t nr_this_level, bool leaf)

Please don't use structure typedefs here.

Also, please indent the argument list like the rest of xfs code, e.g.

static void
compute_bnobt_geometry(
	struct xfs_mount	*mp,
	struct bt_stat_level	*lptr,

(etc)

> +{
> +	unsigned int		maxrecs = mp->m_alloc_mxr[!leaf];
> +	int			slack = leaf ? 2 : 0;
> +	unsigned int		desired_npb;
> +
> +	desired_npb = max(mp->m_alloc_mnr[!leaf], maxrecs - slack);
> +	lptr->num_recs_tot = nr_this_level;
> +	lptr->num_blocks = max(1ULL, nr_this_level / desired_npb);
> +
> +	lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
> +	lptr->modulo = nr_this_level % lptr->num_blocks;
> +	if (lptr->num_recs_pb > maxrecs || (lptr->num_recs_pb == maxrecs &&
> +			lptr->modulo)) {

Indentation....

	if (lptr->num_recs_pb > maxrecs ||
	    (lptr->num_recs_pb == maxrecs && lptr->modulo)) {
		lptr->num_blocks++;
		...
	}

> +		lptr->num_blocks++;
> +
> +		lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
> +		lptr->modulo = nr_this_level % lptr->num_blocks;
> +	}
> +}
>  
>  /*
>   * this calculates a freespace cursor for an ag.
> @@ -370,6 +388,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
>  	int			i;
>  	int			extents_used;
>  	int			extra_blocks;
> +	uint64_t		old_blocks;
>  	bt_stat_level_t		*lptr;
>  	bt_stat_level_t		*p_lptr;
>  	extent_tree_node_t	*ext_ptr;
> @@ -388,10 +407,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
>  	 * of the tree and set up the cursor for the leaf level
>  	 * (note that the same code is duplicated further down)
>  	 */
> -	lptr->num_blocks = howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0));
> -	lptr->num_recs_pb = num_extents / lptr->num_blocks;
> -	lptr->modulo = num_extents % lptr->num_blocks;
> -	lptr->num_recs_tot = num_extents;
> +	compute_level_geometry(mp, lptr, num_extents, true);
>  	level = 1;
>  
>  #ifdef XR_BLD_FREE_TRACE
> @@ -406,18 +422,12 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
>  	 * per level is the # of blocks in the level below it
>  	 */
>  	if (lptr->num_blocks > 1)  {
> -		for (; btree_curs->level[level - 1].num_blocks > 1
> -				&& level < XFS_BTREE_MAXLEVELS;
> -				level++)  {
> +		do {
> +			p_lptr = lptr;
>  			lptr = &btree_curs->level[level];
> -			p_lptr = &btree_curs->level[level - 1];
> -			lptr->num_blocks = howmany(p_lptr->num_blocks,
> -					XR_ALLOC_BLOCK_MAXRECS(mp, level));
> -			lptr->modulo = p_lptr->num_blocks
> -					% lptr->num_blocks;
> -			lptr->num_recs_pb = p_lptr->num_blocks
> -					/ lptr->num_blocks;
> -			lptr->num_recs_tot = p_lptr->num_blocks;
> +
> +			compute_level_geometry(mp, lptr,
> +					p_lptr->num_blocks, false);
>  #ifdef XR_BLD_FREE_TRACE
>  			fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
>  					level,
> @@ -426,7 +436,9 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
>  					lptr->modulo,
>  					lptr->num_recs_tot);
>  #endif
> -		}
> +			level++;
> +		} while (lptr->num_blocks > 1);
> +		ASSERT (level < XFS_BTREE_MAXLEVELS);
>  	}
>  
>  	ASSERT(lptr->num_blocks == 1);
> @@ -496,8 +508,11 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
>  	 * see if the number of leaf blocks will change as a result
>  	 * of the number of extents changing
>  	 */
> -	if (howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0))
> -			!= btree_curs->level[0].num_blocks)  {
> +	old_blocks = btree_curs->level[0].num_blocks;
> +	compute_level_geometry(mp, &btree_curs->level[0], num_extents, true);
> +	extra_blocks = 0;
> +
> +	if (old_blocks != btree_curs->level[0].num_blocks)  {
>  		/*
>  		 * yes -- recalculate the cursor.  If the number of
>  		 * excess (overallocated) blocks is < xfs_agfl_size/2, we're ok.
> @@ -553,30 +568,20 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
>  		}
>  
>  		lptr = &btree_curs->level[0];
> -		lptr->num_blocks = howmany(num_extents,
> -					XR_ALLOC_BLOCK_MAXRECS(mp, 0));
> -		lptr->num_recs_pb = num_extents / lptr->num_blocks;
> -		lptr->modulo = num_extents % lptr->num_blocks;
> -		lptr->num_recs_tot = num_extents;
>  		level = 1;
>  
>  		/*
>  		 * if we need more levels, set them up
>  		 */
>  		if (lptr->num_blocks > 1)  {
> -			for (level = 1; btree_curs->level[level-1].num_blocks
> -					> 1 && level < XFS_BTREE_MAXLEVELS;
> -					level++)  {
> -				lptr = &btree_curs->level[level];
> -				p_lptr = &btree_curs->level[level-1];
> -				lptr->num_blocks = howmany(p_lptr->num_blocks,
> -					XR_ALLOC_BLOCK_MAXRECS(mp, level));
> -				lptr->modulo = p_lptr->num_blocks
> -						% lptr->num_blocks;
> -				lptr->num_recs_pb = p_lptr->num_blocks
> -						/ lptr->num_blocks;
> -				lptr->num_recs_tot = p_lptr->num_blocks;
> -			}
> +			do {
> +				p_lptr = lptr;
> +				lptr = &btree_curs->level[level++];
> +
> +				compute_level_geometry(mp, lptr,
> +						p_lptr->num_blocks, false);
> +			} while (lptr->num_blocks > 1);

/me wonders why it's necessary to wrap the do...while in an if test,
as opposed to using a while loop with the if test up front?

--D

> +			ASSERT (level < XFS_BTREE_MAXLEVELS);
>  		}
>  		ASSERT(lptr->num_blocks == 1);
>  		btree_curs->num_levels = level;
> @@ -591,22 +596,6 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
>  
>  		ASSERT(blocks_allocated_total >= blocks_needed);
>  		extra_blocks = blocks_allocated_total - blocks_needed;
> -	} else  {
> -		if (extents_used > 0) {
> -			/*
> -			 * reset the leaf level geometry to account
> -			 * for consumed extents.  we can leave the
> -			 * rest of the cursor alone since the number
> -			 * of leaf blocks hasn't changed.
> -			 */
> -			lptr = &btree_curs->level[0];
> -
> -			lptr->num_recs_pb = num_extents / lptr->num_blocks;
> -			lptr->modulo = num_extents % lptr->num_blocks;
> -			lptr->num_recs_tot = num_extents;
> -		}
> -
> -		extra_blocks = 0;
>  	}
>  
>  	btree_curs->num_tot_blocks = blocks_allocated_pt;
> -- 
> 2.18.1
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] xfs_repair: fix rebuilding btree node block less than minrecs
  2020-06-09 20:14 ` Darrick J. Wong
@ 2020-06-09 22:12   ` Darrick J. Wong
  2020-06-10  1:04     ` Gao Xiang
  0 siblings, 1 reply; 9+ messages in thread
From: Darrick J. Wong @ 2020-06-09 22:12 UTC (permalink / raw
  To: Gao Xiang; +Cc: linux-xfs, Dave Chinner, Eric Sandeen

On Tue, Jun 09, 2020 at 01:14:23PM -0700, Darrick J. Wong wrote:
> On Tue, Jun 09, 2020 at 07:40:53PM +0800, Gao Xiang wrote:
> > In production, we found that sometimes xfs_repair phase 5
> > rebuilds freespace node block with pointers less than minrecs
> > and if we trigger xfs_repair again it would report such
> > the following message:
> > 
> > bad btree nrecs (39, min=40, max=80) in btbno block 0/7882
> > 
> > The background is that xfs_repair starts to rebuild AGFL
> > after the freespace btree is settled in phase 5 so we may
> > need to leave necessary room in advance for each btree
> > leaves in order to avoid freespace btree split and then
> > result in AGFL rebuild fails. The old mathematics uses
> > ceil(num_extents / maxrecs) to decide the number of node
> > blocks. That would be fine without leaving extra space
> > since minrecs = maxrecs / 2 but if some slack was decreased
> > from maxrecs, the result would be larger than what is
> > expected and cause num_recs_pb less than minrecs, i.e:
> > 
> > num_extents = 79, adj_maxrecs = 80 - 2 (slack) = 78
> > 
> > so we'd get
> > 
> > num_blocks = ceil(79 / 78) = 2,
> > num_recs_pb = 79 / 2 = 39, which is less than
> > minrecs = 80 / 2 = 40
> > 
> > OTOH, btree bulk loading code behaves in a different way.
> > As in xfs_btree_bload_level_geometry it wrote
> > 
> > num_blocks = floor(num_extents / maxrecs)
> > 
> > which will never go below minrecs. And when it goes
> > above maxrecs, just increment num_blocks and recalculate
> > so we can get the reasonable results.
> > 
> > In the long term, btree bulk loader will replace the current
> > repair code as well as to resolve AGFL dependency issue.
> > But we may still want to look for a backportable solution
> > for stable versions. Hence, use the same logic to avoid the
> > freespace btree minrecs underflow for now.
> > 
> > Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> > Cc: Dave Chinner <dchinner@redhat.com>
> > Cc: Eric Sandeen <sandeen@sandeen.net>
> > Fixes: 9851fd79bfb1 ("repair: AGFL rebuild fails if btree split required")
> > Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> > ---
> > not heavy tested yet..
> > 
> >  repair/phase5.c | 101 +++++++++++++++++++++---------------------------
> >  1 file changed, 45 insertions(+), 56 deletions(-)
> > 
> > diff --git a/repair/phase5.c b/repair/phase5.c
> > index abae8a08..997804a5 100644
> > --- a/repair/phase5.c
> > +++ b/repair/phase5.c
> > @@ -348,11 +348,29 @@ finish_cursor(bt_status_t *curs)
> >   * failure at runtime. Hence leave a couple of records slack space in
> >   * each block to allow immediate modification of the tree without
> >   * requiring splits to be done.
> > - *
> > - * XXX(hch): any reason we don't just look at mp->m_alloc_mxr?
> >   */
> > -#define XR_ALLOC_BLOCK_MAXRECS(mp, level) \
> > -	(libxfs_allocbt_maxrecs((mp), (mp)->m_sb.sb_blocksize, (level) == 0) - 2)
> > +static void
> > +compute_level_geometry(xfs_mount_t *mp, bt_stat_level_t *lptr,
> > +		       uint64_t nr_this_level, bool leaf)
> 
> Please don't use structure typedefs here.
> 
> Also, please indent the argument list like the rest of xfs code, e.g.
> 
> static void
> compute_bnobt_geometry(
> 	struct xfs_mount	*mp,
> 	struct bt_stat_level	*lptr,
> 
> (etc)
> 
> > +{
> > +	unsigned int		maxrecs = mp->m_alloc_mxr[!leaf];
> > +	int			slack = leaf ? 2 : 0;
> > +	unsigned int		desired_npb;
> > +
> > +	desired_npb = max(mp->m_alloc_mnr[!leaf], maxrecs - slack);
> > +	lptr->num_recs_tot = nr_this_level;
> > +	lptr->num_blocks = max(1ULL, nr_this_level / desired_npb);
> > +
> > +	lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
> > +	lptr->modulo = nr_this_level % lptr->num_blocks;
> > +	if (lptr->num_recs_pb > maxrecs || (lptr->num_recs_pb == maxrecs &&
> > +			lptr->modulo)) {
> 
> Indentation....
> 
> 	if (lptr->num_recs_pb > maxrecs ||
> 	    (lptr->num_recs_pb == maxrecs && lptr->modulo)) {
> 		lptr->num_blocks++;
> 		...
> 	}
> 
> > +		lptr->num_blocks++;
> > +
> > +		lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
> > +		lptr->modulo = nr_this_level % lptr->num_blocks;

Oops, I hit send too quickly; the computation looks ok to me.

Granted that's probably due to having written another btree geometry
calculation function. :D

Also, I think init_rmapbt_cursor does a similar trick and therefore
suffers from a similar bug.  See the comment "Leave enough slack in the
rmapbt that we can insert..." around line 1412 or so?

--D

> > +	}
> > +}
> >  
> >  /*
> >   * this calculates a freespace cursor for an ag.
> > @@ -370,6 +388,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
> >  	int			i;
> >  	int			extents_used;
> >  	int			extra_blocks;
> > +	uint64_t		old_blocks;
> >  	bt_stat_level_t		*lptr;
> >  	bt_stat_level_t		*p_lptr;
> >  	extent_tree_node_t	*ext_ptr;
> > @@ -388,10 +407,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
> >  	 * of the tree and set up the cursor for the leaf level
> >  	 * (note that the same code is duplicated further down)
> >  	 */
> > -	lptr->num_blocks = howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0));
> > -	lptr->num_recs_pb = num_extents / lptr->num_blocks;
> > -	lptr->modulo = num_extents % lptr->num_blocks;
> > -	lptr->num_recs_tot = num_extents;
> > +	compute_level_geometry(mp, lptr, num_extents, true);
> >  	level = 1;
> >  
> >  #ifdef XR_BLD_FREE_TRACE
> > @@ -406,18 +422,12 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
> >  	 * per level is the # of blocks in the level below it
> >  	 */
> >  	if (lptr->num_blocks > 1)  {
> > -		for (; btree_curs->level[level - 1].num_blocks > 1
> > -				&& level < XFS_BTREE_MAXLEVELS;
> > -				level++)  {
> > +		do {
> > +			p_lptr = lptr;
> >  			lptr = &btree_curs->level[level];
> > -			p_lptr = &btree_curs->level[level - 1];
> > -			lptr->num_blocks = howmany(p_lptr->num_blocks,
> > -					XR_ALLOC_BLOCK_MAXRECS(mp, level));
> > -			lptr->modulo = p_lptr->num_blocks
> > -					% lptr->num_blocks;
> > -			lptr->num_recs_pb = p_lptr->num_blocks
> > -					/ lptr->num_blocks;
> > -			lptr->num_recs_tot = p_lptr->num_blocks;
> > +
> > +			compute_level_geometry(mp, lptr,
> > +					p_lptr->num_blocks, false);
> >  #ifdef XR_BLD_FREE_TRACE
> >  			fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
> >  					level,
> > @@ -426,7 +436,9 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
> >  					lptr->modulo,
> >  					lptr->num_recs_tot);
> >  #endif
> > -		}
> > +			level++;
> > +		} while (lptr->num_blocks > 1);
> > +		ASSERT (level < XFS_BTREE_MAXLEVELS);
> >  	}
> >  
> >  	ASSERT(lptr->num_blocks == 1);
> > @@ -496,8 +508,11 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
> >  	 * see if the number of leaf blocks will change as a result
> >  	 * of the number of extents changing
> >  	 */
> > -	if (howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0))
> > -			!= btree_curs->level[0].num_blocks)  {
> > +	old_blocks = btree_curs->level[0].num_blocks;
> > +	compute_level_geometry(mp, &btree_curs->level[0], num_extents, true);
> > +	extra_blocks = 0;
> > +
> > +	if (old_blocks != btree_curs->level[0].num_blocks)  {
> >  		/*
> >  		 * yes -- recalculate the cursor.  If the number of
> >  		 * excess (overallocated) blocks is < xfs_agfl_size/2, we're ok.
> > @@ -553,30 +568,20 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
> >  		}
> >  
> >  		lptr = &btree_curs->level[0];
> > -		lptr->num_blocks = howmany(num_extents,
> > -					XR_ALLOC_BLOCK_MAXRECS(mp, 0));
> > -		lptr->num_recs_pb = num_extents / lptr->num_blocks;
> > -		lptr->modulo = num_extents % lptr->num_blocks;
> > -		lptr->num_recs_tot = num_extents;
> >  		level = 1;
> >  
> >  		/*
> >  		 * if we need more levels, set them up
> >  		 */
> >  		if (lptr->num_blocks > 1)  {
> > -			for (level = 1; btree_curs->level[level-1].num_blocks
> > -					> 1 && level < XFS_BTREE_MAXLEVELS;
> > -					level++)  {
> > -				lptr = &btree_curs->level[level];
> > -				p_lptr = &btree_curs->level[level-1];
> > -				lptr->num_blocks = howmany(p_lptr->num_blocks,
> > -					XR_ALLOC_BLOCK_MAXRECS(mp, level));
> > -				lptr->modulo = p_lptr->num_blocks
> > -						% lptr->num_blocks;
> > -				lptr->num_recs_pb = p_lptr->num_blocks
> > -						/ lptr->num_blocks;
> > -				lptr->num_recs_tot = p_lptr->num_blocks;
> > -			}
> > +			do {
> > +				p_lptr = lptr;
> > +				lptr = &btree_curs->level[level++];
> > +
> > +				compute_level_geometry(mp, lptr,
> > +						p_lptr->num_blocks, false);
> > +			} while (lptr->num_blocks > 1);
> 
> /me wonders why it's necessary to wrap the do...while in an if test,
> as opposed to using a while loop with the if test up front?
> 
> --D
> 
> > +			ASSERT (level < XFS_BTREE_MAXLEVELS);
> >  		}
> >  		ASSERT(lptr->num_blocks == 1);
> >  		btree_curs->num_levels = level;
> > @@ -591,22 +596,6 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
> >  
> >  		ASSERT(blocks_allocated_total >= blocks_needed);
> >  		extra_blocks = blocks_allocated_total - blocks_needed;
> > -	} else  {
> > -		if (extents_used > 0) {
> > -			/*
> > -			 * reset the leaf level geometry to account
> > -			 * for consumed extents.  we can leave the
> > -			 * rest of the cursor alone since the number
> > -			 * of leaf blocks hasn't changed.
> > -			 */
> > -			lptr = &btree_curs->level[0];
> > -
> > -			lptr->num_recs_pb = num_extents / lptr->num_blocks;
> > -			lptr->modulo = num_extents % lptr->num_blocks;
> > -			lptr->num_recs_tot = num_extents;
> > -		}
> > -
> > -		extra_blocks = 0;
> >  	}
> >  
> >  	btree_curs->num_tot_blocks = blocks_allocated_pt;
> > -- 
> > 2.18.1
> > 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH] xfs_repair: fix rebuilding btree node block less than minrecs
  2020-06-09 22:12   ` Darrick J. Wong
@ 2020-06-10  1:04     ` Gao Xiang
  0 siblings, 0 replies; 9+ messages in thread
From: Gao Xiang @ 2020-06-10  1:04 UTC (permalink / raw
  To: Darrick J. Wong; +Cc: linux-xfs, Dave Chinner, Eric Sandeen

Hi Darrick,

On Tue, Jun 09, 2020 at 03:12:39PM -0700, Darrick J. Wong wrote:

...

> 
> Oops, I hit send too quickly; the computation looks ok to me.
> 
> Granted that's probably due to having written another btree geometry
> calculation function. :D
> 
> Also, I think init_rmapbt_cursor does a similar trick and therefore
> suffers from a similar bug.  See the comment "Leave enough slack in the
> rmapbt that we can insert..." around line 1412 or so?

These comments are fine :) I will try to fix them all in the next version.

Thanks,
Gao Xiang


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [RFC PATCH v2] xfs_repair: fix rebuilding btree block less than minrecs
  2020-06-09 11:40 [RFC PATCH] xfs_repair: fix rebuilding btree node block less than minrecs Gao Xiang
  2020-06-09 14:35 ` Gao Xiang
  2020-06-09 20:14 ` Darrick J. Wong
@ 2020-06-10  3:58 ` Gao Xiang
  2020-06-10  5:26   ` [RFC PATCH v3] " Gao Xiang
  2 siblings, 1 reply; 9+ messages in thread
From: Gao Xiang @ 2020-06-10  3:58 UTC (permalink / raw
  To: linux-xfs; +Cc: Gao Xiang, Darrick J. Wong, Dave Chinner, Eric Sandeen

In production, we found that sometimes xfs_repair phase 5
rebuilds freespace node block with pointers less than minrecs
and if we trigger xfs_repair again it would report such
the following message:

bad btree nrecs (39, min=40, max=80) in btbno block 0/7882

The background is that xfs_repair starts to rebuild AGFL
after the freespace btree is settled in phase 5 so we may
need to leave necessary room in advance for each btree
leaves in order to avoid freespace btree split and then
result in AGFL rebuild fails. The old mathematics uses
ceil(num_extents / maxrecs) to decide the number of node
blocks. That would be fine without leaving extra space
since minrecs = maxrecs / 2 but if some slack was decreased
from maxrecs, the result would be larger than what is
expected and cause num_recs_pb less than minrecs, i.e:

num_extents = 79, adj_maxrecs = 80 - 2 (slack) = 78

so we'd get

num_blocks = ceil(79 / 78) = 2,
num_recs_pb = 79 / 2 = 39, which is less than
minrecs = 80 / 2 = 40

OTOH, btree bulk loading code behaves in a different way.
As in xfs_btree_bload_level_geometry it wrote

num_blocks = floor(num_extents / maxrecs)

which will never go below minrecs. And when it goes above
maxrecs, just increment num_blocks and recalculate so we
can get the reasonable results.

Later, btree bulk loader will replace the current repair
code. But we may still want to look for a backportable
solution for stable versions. Hense, keep the same logic
to avoid the freespace as well as rmap btree minrecs
underflow for now.

Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Eric Sandeen <sandeen@sandeen.net>
Fixes: 9851fd79bfb1 ("repair: AGFL rebuild fails if btree split required")
Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
---
changes since v1:
 - fix indentation, typedefs, etc code styling problem
   pointed out by Darrick;

 - adapt init_rmapbt_cursor to the new algorithm since
   it's similar pointed out by Darrick; thus the function
   name remains the origin compute_level_geometry...
   and hence, adjust the subject a bit as well.

 repair/phase5.c | 151 ++++++++++++++++++++----------------------------
 1 file changed, 63 insertions(+), 88 deletions(-)

diff --git a/repair/phase5.c b/repair/phase5.c
index abae8a08..278de7a4 100644
--- a/repair/phase5.c
+++ b/repair/phase5.c
@@ -348,11 +348,32 @@ finish_cursor(bt_status_t *curs)
  * failure at runtime. Hence leave a couple of records slack space in
  * each block to allow immediate modification of the tree without
  * requiring splits to be done.
- *
- * XXX(hch): any reason we don't just look at mp->m_alloc_mxr?
  */
-#define XR_ALLOC_BLOCK_MAXRECS(mp, level) \
-	(libxfs_allocbt_maxrecs((mp), (mp)->m_sb.sb_blocksize, (level) == 0) - 2)
+static void
+compute_level_geometry(
+	struct xfs_mount *	mp,
+	struct bt_stat_level *	lptr,
+	uint64_t		nr_this_level,
+	int			slack,
+	bool			leaf)
+{
+	unsigned int		maxrecs = mp->m_alloc_mxr[!leaf];
+	unsigned int		desired_npb;
+
+	desired_npb = max(mp->m_alloc_mnr[!leaf], maxrecs - slack);
+	lptr->num_recs_tot = nr_this_level;
+	lptr->num_blocks = max(1ULL, nr_this_level / desired_npb);
+
+	lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
+	lptr->modulo = nr_this_level % lptr->num_blocks;
+	if (lptr->num_recs_pb > maxrecs ||
+	    (lptr->num_recs_pb == maxrecs && lptr->modulo)) {
+		lptr->num_blocks++;
+
+		lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
+		lptr->modulo = nr_this_level % lptr->num_blocks;
+	}
+}
 
 /*
  * this calculates a freespace cursor for an ag.
@@ -370,6 +391,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	int			i;
 	int			extents_used;
 	int			extra_blocks;
+	uint64_t		old_blocks;
 	bt_stat_level_t		*lptr;
 	bt_stat_level_t		*p_lptr;
 	extent_tree_node_t	*ext_ptr;
@@ -388,10 +410,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	 * of the tree and set up the cursor for the leaf level
 	 * (note that the same code is duplicated further down)
 	 */
-	lptr->num_blocks = howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0));
-	lptr->num_recs_pb = num_extents / lptr->num_blocks;
-	lptr->modulo = num_extents % lptr->num_blocks;
-	lptr->num_recs_tot = num_extents;
+	compute_level_geometry(mp, lptr, num_extents, 2, true);
 	level = 1;
 
 #ifdef XR_BLD_FREE_TRACE
@@ -405,29 +424,23 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	 * if we need more levels, set them up.  # of records
 	 * per level is the # of blocks in the level below it
 	 */
-	if (lptr->num_blocks > 1)  {
-		for (; btree_curs->level[level - 1].num_blocks > 1
-				&& level < XFS_BTREE_MAXLEVELS;
-				level++)  {
-			lptr = &btree_curs->level[level];
-			p_lptr = &btree_curs->level[level - 1];
-			lptr->num_blocks = howmany(p_lptr->num_blocks,
-					XR_ALLOC_BLOCK_MAXRECS(mp, level));
-			lptr->modulo = p_lptr->num_blocks
-					% lptr->num_blocks;
-			lptr->num_recs_pb = p_lptr->num_blocks
-					/ lptr->num_blocks;
-			lptr->num_recs_tot = p_lptr->num_blocks;
+	while (lptr->num_blocks > 1) {
+		p_lptr = lptr;
+		lptr = &btree_curs->level[level];
+
+		compute_level_geometry(mp, lptr,
+				p_lptr->num_blocks, 0, false);
 #ifdef XR_BLD_FREE_TRACE
-			fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
-					level,
-					lptr->num_blocks,
-					lptr->num_recs_pb,
-					lptr->modulo,
-					lptr->num_recs_tot);
+		fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
+				level,
+				lptr->num_blocks,
+				lptr->num_recs_pb,
+				lptr->modulo,
+				lptr->num_recs_tot);
 #endif
-		}
+		level++;
 	}
+	ASSERT (level < XFS_BTREE_MAXLEVELS);
 
 	ASSERT(lptr->num_blocks == 1);
 	btree_curs->num_levels = level;
@@ -496,8 +509,11 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	 * see if the number of leaf blocks will change as a result
 	 * of the number of extents changing
 	 */
-	if (howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0))
-			!= btree_curs->level[0].num_blocks)  {
+	old_blocks = btree_curs->level[0].num_blocks;
+	compute_level_geometry(mp, &btree_curs->level[0], num_extents, 2, true);
+	extra_blocks = 0;
+
+	if (old_blocks != btree_curs->level[0].num_blocks)  {
 		/*
 		 * yes -- recalculate the cursor.  If the number of
 		 * excess (overallocated) blocks is < xfs_agfl_size/2, we're ok.
@@ -553,31 +569,19 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 		}
 
 		lptr = &btree_curs->level[0];
-		lptr->num_blocks = howmany(num_extents,
-					XR_ALLOC_BLOCK_MAXRECS(mp, 0));
-		lptr->num_recs_pb = num_extents / lptr->num_blocks;
-		lptr->modulo = num_extents % lptr->num_blocks;
-		lptr->num_recs_tot = num_extents;
 		level = 1;
 
 		/*
 		 * if we need more levels, set them up
 		 */
-		if (lptr->num_blocks > 1)  {
-			for (level = 1; btree_curs->level[level-1].num_blocks
-					> 1 && level < XFS_BTREE_MAXLEVELS;
-					level++)  {
-				lptr = &btree_curs->level[level];
-				p_lptr = &btree_curs->level[level-1];
-				lptr->num_blocks = howmany(p_lptr->num_blocks,
-					XR_ALLOC_BLOCK_MAXRECS(mp, level));
-				lptr->modulo = p_lptr->num_blocks
-						% lptr->num_blocks;
-				lptr->num_recs_pb = p_lptr->num_blocks
-						/ lptr->num_blocks;
-				lptr->num_recs_tot = p_lptr->num_blocks;
-			}
+		while (lptr->num_blocks > 1) {
+			p_lptr = lptr;
+			lptr = &btree_curs->level[level++];
+
+			compute_level_geometry(mp, lptr,
+					p_lptr->num_blocks, 0, false);
 		}
+		ASSERT (level < XFS_BTREE_MAXLEVELS);
 		ASSERT(lptr->num_blocks == 1);
 		btree_curs->num_levels = level;
 
@@ -591,22 +595,6 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 
 		ASSERT(blocks_allocated_total >= blocks_needed);
 		extra_blocks = blocks_allocated_total - blocks_needed;
-	} else  {
-		if (extents_used > 0) {
-			/*
-			 * reset the leaf level geometry to account
-			 * for consumed extents.  we can leave the
-			 * rest of the cursor alone since the number
-			 * of leaf blocks hasn't changed.
-			 */
-			lptr = &btree_curs->level[0];
-
-			lptr->num_recs_pb = num_extents / lptr->num_blocks;
-			lptr->modulo = num_extents % lptr->num_blocks;
-			lptr->num_recs_tot = num_extents;
-		}
-
-		extra_blocks = 0;
 	}
 
 	btree_curs->num_tot_blocks = blocks_allocated_pt;
@@ -1376,7 +1364,6 @@ init_rmapbt_cursor(
 	struct bt_stat_level	*lptr;
 	struct bt_stat_level	*p_lptr;
 	xfs_extlen_t		blocks_allocated;
-	int			maxrecs;
 
 	if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) {
 		memset(btree_curs, 0, sizeof(struct bt_status));
@@ -1412,32 +1399,20 @@ init_rmapbt_cursor(
 	 * Leave enough slack in the rmapbt that we can insert the
 	 * metadata AG entries without too many splits.
 	 */
-	maxrecs = mp->m_rmap_mxr[0];
-	if (num_recs > maxrecs)
-		maxrecs -= 10;
-	blocks_allocated = lptr->num_blocks = howmany(num_recs, maxrecs);
-
-	lptr->modulo = num_recs % lptr->num_blocks;
-	lptr->num_recs_pb = num_recs / lptr->num_blocks;
-	lptr->num_recs_tot = num_recs;
+	compute_level_geometry(mp, lptr, num_recs,
+			num_recs > mp->m_rmap_mxr[0] ? 10 : 0, true);
+	blocks_allocated = lptr->num_blocks;
 	level = 1;
 
-	if (lptr->num_blocks > 1)  {
-		for (; btree_curs->level[level-1].num_blocks > 1
-				&& level < XFS_BTREE_MAXLEVELS;
-				level++)  {
-			lptr = &btree_curs->level[level];
-			p_lptr = &btree_curs->level[level - 1];
-			lptr->num_blocks = howmany(p_lptr->num_blocks,
-				mp->m_rmap_mxr[1]);
-			lptr->modulo = p_lptr->num_blocks % lptr->num_blocks;
-			lptr->num_recs_pb = p_lptr->num_blocks
-					/ lptr->num_blocks;
-			lptr->num_recs_tot = p_lptr->num_blocks;
+	while (lptr->num_blocks > 1) {
+		p_lptr = lptr;
+		lptr = &btree_curs->level[level++];
 
-			blocks_allocated += lptr->num_blocks;
-		}
+		compute_level_geometry(mp, lptr,
+				p_lptr->num_blocks, 0, false);
+		blocks_allocated += lptr->num_blocks;
 	}
+	ASSERT(level < XFS_BTREE_MAXLEVELS);
 	ASSERT(lptr->num_blocks == 1);
 	btree_curs->num_levels = level;
 
-- 
2.18.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [RFC PATCH v3] xfs_repair: fix rebuilding btree block less than minrecs
  2020-06-10  3:58 ` [RFC PATCH v2] xfs_repair: fix rebuilding btree " Gao Xiang
@ 2020-06-10  5:26   ` Gao Xiang
  2020-06-16 16:11     ` Darrick J. Wong
  0 siblings, 1 reply; 9+ messages in thread
From: Gao Xiang @ 2020-06-10  5:26 UTC (permalink / raw
  To: linux-xfs; +Cc: Gao Xiang, Darrick J. Wong, Dave Chinner, Eric Sandeen

In production, we found that sometimes xfs_repair phase 5
rebuilds freespace node block with pointers less than minrecs
and if we trigger xfs_repair again it would report such
the following message:

bad btree nrecs (39, min=40, max=80) in btbno block 0/7882

The background is that xfs_repair starts to rebuild AGFL
after the freespace btree is settled in phase 5 so we may
need to leave necessary room in advance for each btree
leaves in order to avoid freespace btree split and then
result in AGFL rebuild fails. The old mathematics uses
ceil(num_extents / maxrecs) to decide the number of node
blocks. That would be fine without leaving extra space
since minrecs = maxrecs / 2 but if some slack was decreased
from maxrecs, the result would be larger than what is
expected and cause num_recs_pb less than minrecs, i.e:

num_extents = 79, adj_maxrecs = 80 - 2 (slack) = 78

so we'd get

num_blocks = ceil(79 / 78) = 2,
num_recs_pb = 79 / 2 = 39, which is less than
minrecs = 80 / 2 = 40

OTOH, btree bulk loading code behaves in a different way.
As in xfs_btree_bload_level_geometry it wrote

num_blocks = floor(num_extents / maxrecs)

which will never go below minrecs. And when it goes above
maxrecs, just increment num_blocks and recalculate so we
can get the reasonable results.

Later, btree bulk loader will replace the current repair code.
But we may still want to look for a backportable solution
for stable versions. Hence, keep the same logic to avoid
the freespace as well as rmap btree minrecs underflow for now.

Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Eric Sandeen <sandeen@sandeen.net>
Fixes: 9851fd79bfb1 ("repair: AGFL rebuild fails if btree split required")
Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
---
changes since v2:
 still some minor styling fix (ASSERT, args)..

changes since v1:
 - fix indentation, typedefs, etc code styling problem
   pointed out by Darrick;

 - adapt init_rmapbt_cursor to the new algorithm since
   it's similar pointed out by Darrick; thus the function
   name remains the origin compute_level_geometry...
   and hence, adjust the subject a bit as well.

 repair/phase5.c | 152 ++++++++++++++++++++----------------------------
 1 file changed, 63 insertions(+), 89 deletions(-)

diff --git a/repair/phase5.c b/repair/phase5.c
index abae8a08..d30d32b2 100644
--- a/repair/phase5.c
+++ b/repair/phase5.c
@@ -348,11 +348,32 @@ finish_cursor(bt_status_t *curs)
  * failure at runtime. Hence leave a couple of records slack space in
  * each block to allow immediate modification of the tree without
  * requiring splits to be done.
- *
- * XXX(hch): any reason we don't just look at mp->m_alloc_mxr?
  */
-#define XR_ALLOC_BLOCK_MAXRECS(mp, level) \
-	(libxfs_allocbt_maxrecs((mp), (mp)->m_sb.sb_blocksize, (level) == 0) - 2)
+static void
+compute_level_geometry(
+	struct xfs_mount	*mp,
+	struct bt_stat_level	*lptr,
+	uint64_t		nr_this_level,
+	int			slack,
+	bool			leaf)
+{
+	unsigned int		maxrecs = mp->m_alloc_mxr[!leaf];
+	unsigned int		desired_npb;
+
+	desired_npb = max(mp->m_alloc_mnr[!leaf], maxrecs - slack);
+	lptr->num_recs_tot = nr_this_level;
+	lptr->num_blocks = max(1ULL, nr_this_level / desired_npb);
+
+	lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
+	lptr->modulo = nr_this_level % lptr->num_blocks;
+	if (lptr->num_recs_pb > maxrecs ||
+	    (lptr->num_recs_pb == maxrecs && lptr->modulo)) {
+		lptr->num_blocks++;
+
+		lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
+		lptr->modulo = nr_this_level % lptr->num_blocks;
+	}
+}
 
 /*
  * this calculates a freespace cursor for an ag.
@@ -370,6 +391,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	int			i;
 	int			extents_used;
 	int			extra_blocks;
+	uint64_t		old_blocks;
 	bt_stat_level_t		*lptr;
 	bt_stat_level_t		*p_lptr;
 	extent_tree_node_t	*ext_ptr;
@@ -388,10 +410,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	 * of the tree and set up the cursor for the leaf level
 	 * (note that the same code is duplicated further down)
 	 */
-	lptr->num_blocks = howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0));
-	lptr->num_recs_pb = num_extents / lptr->num_blocks;
-	lptr->modulo = num_extents % lptr->num_blocks;
-	lptr->num_recs_tot = num_extents;
+	compute_level_geometry(mp, lptr, num_extents, 2, true);
 	level = 1;
 
 #ifdef XR_BLD_FREE_TRACE
@@ -405,30 +424,23 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	 * if we need more levels, set them up.  # of records
 	 * per level is the # of blocks in the level below it
 	 */
-	if (lptr->num_blocks > 1)  {
-		for (; btree_curs->level[level - 1].num_blocks > 1
-				&& level < XFS_BTREE_MAXLEVELS;
-				level++)  {
-			lptr = &btree_curs->level[level];
-			p_lptr = &btree_curs->level[level - 1];
-			lptr->num_blocks = howmany(p_lptr->num_blocks,
-					XR_ALLOC_BLOCK_MAXRECS(mp, level));
-			lptr->modulo = p_lptr->num_blocks
-					% lptr->num_blocks;
-			lptr->num_recs_pb = p_lptr->num_blocks
-					/ lptr->num_blocks;
-			lptr->num_recs_tot = p_lptr->num_blocks;
+	while (lptr->num_blocks > 1) {
+		p_lptr = lptr;
+		lptr = &btree_curs->level[level];
+
+		compute_level_geometry(mp, lptr,
+				p_lptr->num_blocks, 0, false);
 #ifdef XR_BLD_FREE_TRACE
-			fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
-					level,
-					lptr->num_blocks,
-					lptr->num_recs_pb,
-					lptr->modulo,
-					lptr->num_recs_tot);
+		fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
+				level,
+				lptr->num_blocks,
+				lptr->num_recs_pb,
+				lptr->modulo,
+				lptr->num_recs_tot);
 #endif
-		}
+		level++;
 	}
-
+	ASSERT(level < XFS_BTREE_MAXLEVELS);
 	ASSERT(lptr->num_blocks == 1);
 	btree_curs->num_levels = level;
 
@@ -496,8 +508,11 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 	 * see if the number of leaf blocks will change as a result
 	 * of the number of extents changing
 	 */
-	if (howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0))
-			!= btree_curs->level[0].num_blocks)  {
+	old_blocks = btree_curs->level[0].num_blocks;
+	compute_level_geometry(mp, &btree_curs->level[0], num_extents, 2, true);
+	extra_blocks = 0;
+
+	if (old_blocks != btree_curs->level[0].num_blocks)  {
 		/*
 		 * yes -- recalculate the cursor.  If the number of
 		 * excess (overallocated) blocks is < xfs_agfl_size/2, we're ok.
@@ -553,31 +568,19 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 		}
 
 		lptr = &btree_curs->level[0];
-		lptr->num_blocks = howmany(num_extents,
-					XR_ALLOC_BLOCK_MAXRECS(mp, 0));
-		lptr->num_recs_pb = num_extents / lptr->num_blocks;
-		lptr->modulo = num_extents % lptr->num_blocks;
-		lptr->num_recs_tot = num_extents;
 		level = 1;
 
 		/*
 		 * if we need more levels, set them up
 		 */
-		if (lptr->num_blocks > 1)  {
-			for (level = 1; btree_curs->level[level-1].num_blocks
-					> 1 && level < XFS_BTREE_MAXLEVELS;
-					level++)  {
-				lptr = &btree_curs->level[level];
-				p_lptr = &btree_curs->level[level-1];
-				lptr->num_blocks = howmany(p_lptr->num_blocks,
-					XR_ALLOC_BLOCK_MAXRECS(mp, level));
-				lptr->modulo = p_lptr->num_blocks
-						% lptr->num_blocks;
-				lptr->num_recs_pb = p_lptr->num_blocks
-						/ lptr->num_blocks;
-				lptr->num_recs_tot = p_lptr->num_blocks;
-			}
+		while (lptr->num_blocks > 1) {
+			p_lptr = lptr;
+			lptr = &btree_curs->level[level++];
+
+			compute_level_geometry(mp, lptr,
+					p_lptr->num_blocks, 0, false);
 		}
+		ASSERT(level < XFS_BTREE_MAXLEVELS);
 		ASSERT(lptr->num_blocks == 1);
 		btree_curs->num_levels = level;
 
@@ -591,22 +594,6 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
 
 		ASSERT(blocks_allocated_total >= blocks_needed);
 		extra_blocks = blocks_allocated_total - blocks_needed;
-	} else  {
-		if (extents_used > 0) {
-			/*
-			 * reset the leaf level geometry to account
-			 * for consumed extents.  we can leave the
-			 * rest of the cursor alone since the number
-			 * of leaf blocks hasn't changed.
-			 */
-			lptr = &btree_curs->level[0];
-
-			lptr->num_recs_pb = num_extents / lptr->num_blocks;
-			lptr->modulo = num_extents % lptr->num_blocks;
-			lptr->num_recs_tot = num_extents;
-		}
-
-		extra_blocks = 0;
 	}
 
 	btree_curs->num_tot_blocks = blocks_allocated_pt;
@@ -1376,7 +1363,6 @@ init_rmapbt_cursor(
 	struct bt_stat_level	*lptr;
 	struct bt_stat_level	*p_lptr;
 	xfs_extlen_t		blocks_allocated;
-	int			maxrecs;
 
 	if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) {
 		memset(btree_curs, 0, sizeof(struct bt_status));
@@ -1412,32 +1398,20 @@ init_rmapbt_cursor(
 	 * Leave enough slack in the rmapbt that we can insert the
 	 * metadata AG entries without too many splits.
 	 */
-	maxrecs = mp->m_rmap_mxr[0];
-	if (num_recs > maxrecs)
-		maxrecs -= 10;
-	blocks_allocated = lptr->num_blocks = howmany(num_recs, maxrecs);
-
-	lptr->modulo = num_recs % lptr->num_blocks;
-	lptr->num_recs_pb = num_recs / lptr->num_blocks;
-	lptr->num_recs_tot = num_recs;
+	compute_level_geometry(mp, lptr, num_recs,
+			num_recs > mp->m_rmap_mxr[0] ? 10 : 0, true);
+	blocks_allocated = lptr->num_blocks;
 	level = 1;
 
-	if (lptr->num_blocks > 1)  {
-		for (; btree_curs->level[level-1].num_blocks > 1
-				&& level < XFS_BTREE_MAXLEVELS;
-				level++)  {
-			lptr = &btree_curs->level[level];
-			p_lptr = &btree_curs->level[level - 1];
-			lptr->num_blocks = howmany(p_lptr->num_blocks,
-				mp->m_rmap_mxr[1]);
-			lptr->modulo = p_lptr->num_blocks % lptr->num_blocks;
-			lptr->num_recs_pb = p_lptr->num_blocks
-					/ lptr->num_blocks;
-			lptr->num_recs_tot = p_lptr->num_blocks;
+	while (lptr->num_blocks > 1) {
+		p_lptr = lptr;
+		lptr = &btree_curs->level[level++];
 
-			blocks_allocated += lptr->num_blocks;
-		}
+		compute_level_geometry(mp, lptr,
+				p_lptr->num_blocks, 0, false);
+		blocks_allocated += lptr->num_blocks;
 	}
+	ASSERT(level < XFS_BTREE_MAXLEVELS);
 	ASSERT(lptr->num_blocks == 1);
 	btree_curs->num_levels = level;
 
-- 
2.18.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH v3] xfs_repair: fix rebuilding btree block less than minrecs
  2020-06-10  5:26   ` [RFC PATCH v3] " Gao Xiang
@ 2020-06-16 16:11     ` Darrick J. Wong
  2020-06-16 17:26       ` Gao Xiang
  0 siblings, 1 reply; 9+ messages in thread
From: Darrick J. Wong @ 2020-06-16 16:11 UTC (permalink / raw
  To: Gao Xiang; +Cc: linux-xfs, Dave Chinner, Eric Sandeen

On Wed, Jun 10, 2020 at 01:26:24PM +0800, Gao Xiang wrote:
> In production, we found that sometimes xfs_repair phase 5
> rebuilds freespace node block with pointers less than minrecs
> and if we trigger xfs_repair again it would report such
> the following message:
> 
> bad btree nrecs (39, min=40, max=80) in btbno block 0/7882
> 
> The background is that xfs_repair starts to rebuild AGFL
> after the freespace btree is settled in phase 5 so we may
> need to leave necessary room in advance for each btree
> leaves in order to avoid freespace btree split and then
> result in AGFL rebuild fails. The old mathematics uses
> ceil(num_extents / maxrecs) to decide the number of node
> blocks. That would be fine without leaving extra space
> since minrecs = maxrecs / 2 but if some slack was decreased
> from maxrecs, the result would be larger than what is
> expected and cause num_recs_pb less than minrecs, i.e:
> 
> num_extents = 79, adj_maxrecs = 80 - 2 (slack) = 78
> 
> so we'd get
> 
> num_blocks = ceil(79 / 78) = 2,
> num_recs_pb = 79 / 2 = 39, which is less than
> minrecs = 80 / 2 = 40
> 
> OTOH, btree bulk loading code behaves in a different way.
> As in xfs_btree_bload_level_geometry it wrote
> 
> num_blocks = floor(num_extents / maxrecs)
> 
> which will never go below minrecs. And when it goes above
> maxrecs, just increment num_blocks and recalculate so we
> can get the reasonable results.
> 
> Later, btree bulk loader will replace the current repair code.
> But we may still want to look for a backportable solution
> for stable versions. Hence, keep the same logic to avoid
> the freespace as well as rmap btree minrecs underflow for now.
> 
> Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> Cc: Dave Chinner <dchinner@redhat.com>
> Cc: Eric Sandeen <sandeen@sandeen.net>
> Fixes: 9851fd79bfb1 ("repair: AGFL rebuild fails if btree split required")
> Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> ---
> changes since v2:
>  still some minor styling fix (ASSERT, args)..
> 
> changes since v1:
>  - fix indentation, typedefs, etc code styling problem
>    pointed out by Darrick;
> 
>  - adapt init_rmapbt_cursor to the new algorithm since
>    it's similar pointed out by Darrick; thus the function
>    name remains the origin compute_level_geometry...
>    and hence, adjust the subject a bit as well.
> 
>  repair/phase5.c | 152 ++++++++++++++++++++----------------------------
>  1 file changed, 63 insertions(+), 89 deletions(-)
> 
> diff --git a/repair/phase5.c b/repair/phase5.c
> index abae8a08..d30d32b2 100644
> --- a/repair/phase5.c
> +++ b/repair/phase5.c
> @@ -348,11 +348,32 @@ finish_cursor(bt_status_t *curs)
>   * failure at runtime. Hence leave a couple of records slack space in
>   * each block to allow immediate modification of the tree without
>   * requiring splits to be done.
> - *
> - * XXX(hch): any reason we don't just look at mp->m_alloc_mxr?
>   */
> -#define XR_ALLOC_BLOCK_MAXRECS(mp, level) \
> -	(libxfs_allocbt_maxrecs((mp), (mp)->m_sb.sb_blocksize, (level) == 0) - 2)
> +static void
> +compute_level_geometry(
> +	struct xfs_mount	*mp,
> +	struct bt_stat_level	*lptr,
> +	uint64_t		nr_this_level,

Probably didn't need a u64 here, but <shrug> that's probably just my
kernel-coloured glasses. :)

> +	int			slack,
> +	bool			leaf)
> +{
> +	unsigned int		maxrecs = mp->m_alloc_mxr[!leaf];
> +	unsigned int		desired_npb;
> +
> +	desired_npb = max(mp->m_alloc_mnr[!leaf], maxrecs - slack);
> +	lptr->num_recs_tot = nr_this_level;
> +	lptr->num_blocks = max(1ULL, nr_this_level / desired_npb);
> +
> +	lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
> +	lptr->modulo = nr_this_level % lptr->num_blocks;
> +	if (lptr->num_recs_pb > maxrecs ||
> +	    (lptr->num_recs_pb == maxrecs && lptr->modulo)) {
> +		lptr->num_blocks++;
> +
> +		lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
> +		lptr->modulo = nr_this_level % lptr->num_blocks;
> +	}

Seems to be more or less the same solution that I (half unknowingly)
coded into the btree bulkload geometry calculator, so:

Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

(Still working on adapting the new phase5 code to try to fill the AGFL
as part of rebuilding the free space btrees, fwiw.)

--D

> +}
>  
>  /*
>   * this calculates a freespace cursor for an ag.
> @@ -370,6 +391,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
>  	int			i;
>  	int			extents_used;
>  	int			extra_blocks;
> +	uint64_t		old_blocks;
>  	bt_stat_level_t		*lptr;
>  	bt_stat_level_t		*p_lptr;
>  	extent_tree_node_t	*ext_ptr;
> @@ -388,10 +410,7 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
>  	 * of the tree and set up the cursor for the leaf level
>  	 * (note that the same code is duplicated further down)
>  	 */
> -	lptr->num_blocks = howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0));
> -	lptr->num_recs_pb = num_extents / lptr->num_blocks;
> -	lptr->modulo = num_extents % lptr->num_blocks;
> -	lptr->num_recs_tot = num_extents;
> +	compute_level_geometry(mp, lptr, num_extents, 2, true);
>  	level = 1;
>  
>  #ifdef XR_BLD_FREE_TRACE
> @@ -405,30 +424,23 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
>  	 * if we need more levels, set them up.  # of records
>  	 * per level is the # of blocks in the level below it
>  	 */
> -	if (lptr->num_blocks > 1)  {
> -		for (; btree_curs->level[level - 1].num_blocks > 1
> -				&& level < XFS_BTREE_MAXLEVELS;
> -				level++)  {
> -			lptr = &btree_curs->level[level];
> -			p_lptr = &btree_curs->level[level - 1];
> -			lptr->num_blocks = howmany(p_lptr->num_blocks,
> -					XR_ALLOC_BLOCK_MAXRECS(mp, level));
> -			lptr->modulo = p_lptr->num_blocks
> -					% lptr->num_blocks;
> -			lptr->num_recs_pb = p_lptr->num_blocks
> -					/ lptr->num_blocks;
> -			lptr->num_recs_tot = p_lptr->num_blocks;
> +	while (lptr->num_blocks > 1) {
> +		p_lptr = lptr;
> +		lptr = &btree_curs->level[level];
> +
> +		compute_level_geometry(mp, lptr,
> +				p_lptr->num_blocks, 0, false);
>  #ifdef XR_BLD_FREE_TRACE
> -			fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
> -					level,
> -					lptr->num_blocks,
> -					lptr->num_recs_pb,
> -					lptr->modulo,
> -					lptr->num_recs_tot);
> +		fprintf(stderr, "%s %d %d %d %d %d\n", __func__,
> +				level,
> +				lptr->num_blocks,
> +				lptr->num_recs_pb,
> +				lptr->modulo,
> +				lptr->num_recs_tot);
>  #endif
> -		}
> +		level++;
>  	}
> -
> +	ASSERT(level < XFS_BTREE_MAXLEVELS);
>  	ASSERT(lptr->num_blocks == 1);
>  	btree_curs->num_levels = level;
>  
> @@ -496,8 +508,11 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
>  	 * see if the number of leaf blocks will change as a result
>  	 * of the number of extents changing
>  	 */
> -	if (howmany(num_extents, XR_ALLOC_BLOCK_MAXRECS(mp, 0))
> -			!= btree_curs->level[0].num_blocks)  {
> +	old_blocks = btree_curs->level[0].num_blocks;
> +	compute_level_geometry(mp, &btree_curs->level[0], num_extents, 2, true);
> +	extra_blocks = 0;
> +
> +	if (old_blocks != btree_curs->level[0].num_blocks)  {
>  		/*
>  		 * yes -- recalculate the cursor.  If the number of
>  		 * excess (overallocated) blocks is < xfs_agfl_size/2, we're ok.
> @@ -553,31 +568,19 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
>  		}
>  
>  		lptr = &btree_curs->level[0];
> -		lptr->num_blocks = howmany(num_extents,
> -					XR_ALLOC_BLOCK_MAXRECS(mp, 0));
> -		lptr->num_recs_pb = num_extents / lptr->num_blocks;
> -		lptr->modulo = num_extents % lptr->num_blocks;
> -		lptr->num_recs_tot = num_extents;
>  		level = 1;
>  
>  		/*
>  		 * if we need more levels, set them up
>  		 */
> -		if (lptr->num_blocks > 1)  {
> -			for (level = 1; btree_curs->level[level-1].num_blocks
> -					> 1 && level < XFS_BTREE_MAXLEVELS;
> -					level++)  {
> -				lptr = &btree_curs->level[level];
> -				p_lptr = &btree_curs->level[level-1];
> -				lptr->num_blocks = howmany(p_lptr->num_blocks,
> -					XR_ALLOC_BLOCK_MAXRECS(mp, level));
> -				lptr->modulo = p_lptr->num_blocks
> -						% lptr->num_blocks;
> -				lptr->num_recs_pb = p_lptr->num_blocks
> -						/ lptr->num_blocks;
> -				lptr->num_recs_tot = p_lptr->num_blocks;
> -			}
> +		while (lptr->num_blocks > 1) {
> +			p_lptr = lptr;
> +			lptr = &btree_curs->level[level++];
> +
> +			compute_level_geometry(mp, lptr,
> +					p_lptr->num_blocks, 0, false);
>  		}
> +		ASSERT(level < XFS_BTREE_MAXLEVELS);
>  		ASSERT(lptr->num_blocks == 1);
>  		btree_curs->num_levels = level;
>  
> @@ -591,22 +594,6 @@ calculate_freespace_cursor(xfs_mount_t *mp, xfs_agnumber_t agno,
>  
>  		ASSERT(blocks_allocated_total >= blocks_needed);
>  		extra_blocks = blocks_allocated_total - blocks_needed;
> -	} else  {
> -		if (extents_used > 0) {
> -			/*
> -			 * reset the leaf level geometry to account
> -			 * for consumed extents.  we can leave the
> -			 * rest of the cursor alone since the number
> -			 * of leaf blocks hasn't changed.
> -			 */
> -			lptr = &btree_curs->level[0];
> -
> -			lptr->num_recs_pb = num_extents / lptr->num_blocks;
> -			lptr->modulo = num_extents % lptr->num_blocks;
> -			lptr->num_recs_tot = num_extents;
> -		}
> -
> -		extra_blocks = 0;
>  	}
>  
>  	btree_curs->num_tot_blocks = blocks_allocated_pt;
> @@ -1376,7 +1363,6 @@ init_rmapbt_cursor(
>  	struct bt_stat_level	*lptr;
>  	struct bt_stat_level	*p_lptr;
>  	xfs_extlen_t		blocks_allocated;
> -	int			maxrecs;
>  
>  	if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) {
>  		memset(btree_curs, 0, sizeof(struct bt_status));
> @@ -1412,32 +1398,20 @@ init_rmapbt_cursor(
>  	 * Leave enough slack in the rmapbt that we can insert the
>  	 * metadata AG entries without too many splits.
>  	 */
> -	maxrecs = mp->m_rmap_mxr[0];
> -	if (num_recs > maxrecs)
> -		maxrecs -= 10;
> -	blocks_allocated = lptr->num_blocks = howmany(num_recs, maxrecs);
> -
> -	lptr->modulo = num_recs % lptr->num_blocks;
> -	lptr->num_recs_pb = num_recs / lptr->num_blocks;
> -	lptr->num_recs_tot = num_recs;
> +	compute_level_geometry(mp, lptr, num_recs,
> +			num_recs > mp->m_rmap_mxr[0] ? 10 : 0, true);
> +	blocks_allocated = lptr->num_blocks;
>  	level = 1;
>  
> -	if (lptr->num_blocks > 1)  {
> -		for (; btree_curs->level[level-1].num_blocks > 1
> -				&& level < XFS_BTREE_MAXLEVELS;
> -				level++)  {
> -			lptr = &btree_curs->level[level];
> -			p_lptr = &btree_curs->level[level - 1];
> -			lptr->num_blocks = howmany(p_lptr->num_blocks,
> -				mp->m_rmap_mxr[1]);
> -			lptr->modulo = p_lptr->num_blocks % lptr->num_blocks;
> -			lptr->num_recs_pb = p_lptr->num_blocks
> -					/ lptr->num_blocks;
> -			lptr->num_recs_tot = p_lptr->num_blocks;
> +	while (lptr->num_blocks > 1) {
> +		p_lptr = lptr;
> +		lptr = &btree_curs->level[level++];
>  
> -			blocks_allocated += lptr->num_blocks;
> -		}
> +		compute_level_geometry(mp, lptr,
> +				p_lptr->num_blocks, 0, false);
> +		blocks_allocated += lptr->num_blocks;
>  	}
> +	ASSERT(level < XFS_BTREE_MAXLEVELS);
>  	ASSERT(lptr->num_blocks == 1);
>  	btree_curs->num_levels = level;
>  
> -- 
> 2.18.1
> 

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [RFC PATCH v3] xfs_repair: fix rebuilding btree block less than minrecs
  2020-06-16 16:11     ` Darrick J. Wong
@ 2020-06-16 17:26       ` Gao Xiang
  0 siblings, 0 replies; 9+ messages in thread
From: Gao Xiang @ 2020-06-16 17:26 UTC (permalink / raw
  To: Darrick J. Wong; +Cc: linux-xfs, Dave Chinner, Eric Sandeen

Hi Darrick,

On Tue, Jun 16, 2020 at 09:11:43AM -0700, Darrick J. Wong wrote:
> On Wed, Jun 10, 2020 at 01:26:24PM +0800, Gao Xiang wrote:
> > In production, we found that sometimes xfs_repair phase 5
> > rebuilds freespace node block with pointers less than minrecs
> > and if we trigger xfs_repair again it would report such
> > the following message:
> > 
> > bad btree nrecs (39, min=40, max=80) in btbno block 0/7882
> > 
> > The background is that xfs_repair starts to rebuild AGFL
> > after the freespace btree is settled in phase 5 so we may
> > need to leave necessary room in advance for each btree
> > leaves in order to avoid freespace btree split and then
> > result in AGFL rebuild fails. The old mathematics uses
> > ceil(num_extents / maxrecs) to decide the number of node
> > blocks. That would be fine without leaving extra space
> > since minrecs = maxrecs / 2 but if some slack was decreased
> > from maxrecs, the result would be larger than what is
> > expected and cause num_recs_pb less than minrecs, i.e:
> > 
> > num_extents = 79, adj_maxrecs = 80 - 2 (slack) = 78
> > 
> > so we'd get
> > 
> > num_blocks = ceil(79 / 78) = 2,
> > num_recs_pb = 79 / 2 = 39, which is less than
> > minrecs = 80 / 2 = 40
> > 
> > OTOH, btree bulk loading code behaves in a different way.
> > As in xfs_btree_bload_level_geometry it wrote
> > 
> > num_blocks = floor(num_extents / maxrecs)
> > 
> > which will never go below minrecs. And when it goes above
> > maxrecs, just increment num_blocks and recalculate so we
> > can get the reasonable results.
> > 
> > Later, btree bulk loader will replace the current repair code.
> > But we may still want to look for a backportable solution
> > for stable versions. Hence, keep the same logic to avoid
> > the freespace as well as rmap btree minrecs underflow for now.
> > 
> > Cc: "Darrick J. Wong" <darrick.wong@oracle.com>
> > Cc: Dave Chinner <dchinner@redhat.com>
> > Cc: Eric Sandeen <sandeen@sandeen.net>
> > Fixes: 9851fd79bfb1 ("repair: AGFL rebuild fails if btree split required")
> > Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
> > ---
> > changes since v2:
> >  still some minor styling fix (ASSERT, args)..
> > 
> > changes since v1:
> >  - fix indentation, typedefs, etc code styling problem
> >    pointed out by Darrick;
> > 
> >  - adapt init_rmapbt_cursor to the new algorithm since
> >    it's similar pointed out by Darrick; thus the function
> >    name remains the origin compute_level_geometry...
> >    and hence, adjust the subject a bit as well.
> > 
> >  repair/phase5.c | 152 ++++++++++++++++++++----------------------------
> >  1 file changed, 63 insertions(+), 89 deletions(-)
> > 
> > diff --git a/repair/phase5.c b/repair/phase5.c
> > index abae8a08..d30d32b2 100644
> > --- a/repair/phase5.c
> > +++ b/repair/phase5.c
> > @@ -348,11 +348,32 @@ finish_cursor(bt_status_t *curs)
> >   * failure at runtime. Hence leave a couple of records slack space in
> >   * each block to allow immediate modification of the tree without
> >   * requiring splits to be done.
> > - *
> > - * XXX(hch): any reason we don't just look at mp->m_alloc_mxr?
> >   */
> > -#define XR_ALLOC_BLOCK_MAXRECS(mp, level) \
> > -	(libxfs_allocbt_maxrecs((mp), (mp)->m_sb.sb_blocksize, (level) == 0) - 2)
> > +static void
> > +compute_level_geometry(
> > +	struct xfs_mount	*mp,
> > +	struct bt_stat_level	*lptr,
> > +	uint64_t		nr_this_level,
> 
> Probably didn't need a u64 here, but <shrug> that's probably just my
> kernel-coloured glasses. :)

Yeah, I personally tend to use kernel type u64 in my own projects, but I'm not
sure what's preferred here...

> 
> > +	int			slack,
> > +	bool			leaf)
> > +{
> > +	unsigned int		maxrecs = mp->m_alloc_mxr[!leaf];
> > +	unsigned int		desired_npb;
> > +
> > +	desired_npb = max(mp->m_alloc_mnr[!leaf], maxrecs - slack);
> > +	lptr->num_recs_tot = nr_this_level;
> > +	lptr->num_blocks = max(1ULL, nr_this_level / desired_npb);
> > +
> > +	lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
> > +	lptr->modulo = nr_this_level % lptr->num_blocks;
> > +	if (lptr->num_recs_pb > maxrecs ||
> > +	    (lptr->num_recs_pb == maxrecs && lptr->modulo)) {
> > +		lptr->num_blocks++;
> > +
> > +		lptr->num_recs_pb = nr_this_level / lptr->num_blocks;
> > +		lptr->modulo = nr_this_level % lptr->num_blocks;
> > +	}
> 
> Seems to be more or less the same solution that I (half unknowingly)
> coded into the btree bulkload geometry calculator, so:
> 
> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>

Thanks for the review... And I checked all xfs-repair related fstests
and it seems no noticable strange...

> 
> (Still working on adapting the new phase5 code to try to fill the AGFL
> as part of rebuilding the free space btrees, fwiw.)

Good news... although I still have limited knowledge to the whole XFS
(now stuggling in reading XFS logging system...)

Thanks,
Gao Xiang

> 
> --D
> 


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2020-06-16 17:26 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-06-09 11:40 [RFC PATCH] xfs_repair: fix rebuilding btree node block less than minrecs Gao Xiang
2020-06-09 14:35 ` Gao Xiang
2020-06-09 20:14 ` Darrick J. Wong
2020-06-09 22:12   ` Darrick J. Wong
2020-06-10  1:04     ` Gao Xiang
2020-06-10  3:58 ` [RFC PATCH v2] xfs_repair: fix rebuilding btree " Gao Xiang
2020-06-10  5:26   ` [RFC PATCH v3] " Gao Xiang
2020-06-16 16:11     ` Darrick J. Wong
2020-06-16 17:26       ` Gao Xiang

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.