Linux-EROFS Archive mirror
 help / color / mirror / Atom feed
From: Sandeep Dhavale via Linux-erofs <linux-erofs@lists.ozlabs.org>
To: linux-erofs@lists.ozlabs.org
Cc: hsiangkao@linux.alibaba.com, kernel-team@android.com
Subject: [PATCH v3] erofs-utils: lib: treat data blocks filled with 0s as a hole
Date: Wed, 17 Apr 2024 16:48:44 -0700	[thread overview]
Message-ID: <20240417234845.2758882-1-dhavale@google.com> (raw)

Add an optimization to treat data blocks filled with 0s as a hole.
Even though the disk space savings are comparable to chunk-based or
dedupe, having no block assigned saves us redundant disk I/Os during reads.

To detect blocks filled with zeros during chunking, we insert a block
filled with zeros (zerochunk) into the hashmap. If we detect a possible
dedupe against it, we map it to a hole so that no physical block is assigned.

Signed-off-by: Sandeep Dhavale <dhavale@google.com>
---
Changes since v2:
	- Fix memory leak for zeros buffer
	- Add and use helper erofs_blob_can_merge() as suggested by Gao
	- Return erofs_holechunk directly from erofs_blob_getchunk()
Changes since v1:
	- Instead of checking every block for 0s word by word,
	  add a zerochunk in blobs during init, so that we effectively
	  detect the zero blocks by comparing the hash.
 include/erofs/blobchunk.h |  2 +-
 lib/blobchunk.c           | 74 +++++++++++++++++++++++++++++++++------
 mkfs/main.c               |  2 +-
 3 files changed, 65 insertions(+), 13 deletions(-)

diff --git a/include/erofs/blobchunk.h b/include/erofs/blobchunk.h
index a674640..ebe2efe 100644
--- a/include/erofs/blobchunk.h
+++ b/include/erofs/blobchunk.h
@@ -23,7 +23,7 @@ int erofs_write_zero_inode(struct erofs_inode *inode);
 int tarerofs_write_chunkes(struct erofs_inode *inode, erofs_off_t data_offset);
 int erofs_mkfs_dump_blobs(struct erofs_sb_info *sbi);
 void erofs_blob_exit(void);
-int erofs_blob_init(const char *blobfile_path);
+int erofs_blob_init(const char *blobfile_path, erofs_off_t chunksize);
 int erofs_mkfs_init_devices(struct erofs_sb_info *sbi, unsigned int devices);
 
 #ifdef __cplusplus
diff --git a/lib/blobchunk.c b/lib/blobchunk.c
index 641e3d4..5830498 100644
--- a/lib/blobchunk.c
+++ b/lib/blobchunk.c
@@ -69,6 +69,10 @@ static struct erofs_blobchunk *erofs_blob_getchunk(struct erofs_sb_info *sbi,
 	chunk = hashmap_get_from_hash(&blob_hashmap, hash, sha256);
 	if (chunk) {
 		DBG_BUGON(chunksize != chunk->chunksize);
+
+		if (chunk->blkaddr == erofs_holechunk.blkaddr)
+			chunk = &erofs_holechunk;
+
 		sbi->saved_by_deduplication += chunksize;
 		erofs_dbg("Found duplicated chunk at %u", chunk->blkaddr);
 		return chunk;
@@ -231,7 +235,21 @@ static void erofs_update_minextblks(struct erofs_sb_info *sbi,
 	if (lb && lb < *minextblks)
 		*minextblks = lb;
 }
-
+static bool erofs_blob_can_merge(struct erofs_sb_info *sbi,
+				 struct erofs_blobchunk *lastch,
+				 struct erofs_blobchunk *chunk)
+{
+	if (!lastch)
+		return true;
+	if (lastch == &erofs_holechunk && chunk == &erofs_holechunk)
+		return true;
+	if (lastch->device_id == chunk->device_id &&
+		erofs_pos(sbi, lastch->blkaddr) + lastch->chunksize ==
+		erofs_pos(sbi, chunk->blkaddr))
+		return true;
+
+	return false;
+}
 int erofs_blob_write_chunked_file(struct erofs_inode *inode, int fd,
 				  erofs_off_t startoff)
 {
@@ -303,16 +321,19 @@ int erofs_blob_write_chunked_file(struct erofs_inode *inode, int fd,
 		}
 
 		if (offset > pos) {
-			len = 0;
-			erofs_update_minextblks(sbi, interval_start, pos,
-						&minextblks);
+			if (!erofs_blob_can_merge(sbi, lastch,
+							&erofs_holechunk)) {
+				erofs_update_minextblks(sbi, interval_start,
+							pos, &minextblks);
+				interval_start = pos;
+			}
 			do {
 				*(void **)idx++ = &erofs_holechunk;
 				pos += chunksize;
 			} while (pos < offset);
 			DBG_BUGON(pos != offset);
-			lastch = NULL;
-			interval_start = pos;
+			lastch = &erofs_holechunk;
+			len = 0;
 			continue;
 		}
 #endif
@@ -330,9 +351,7 @@ int erofs_blob_write_chunked_file(struct erofs_inode *inode, int fd,
 			goto err;
 		}
 
-		if (lastch && (lastch->device_id != chunk->device_id ||
-		    erofs_pos(sbi, lastch->blkaddr) + lastch->chunksize !=
-		    erofs_pos(sbi, chunk->blkaddr))) {
+		if (!erofs_blob_can_merge(sbi, lastch, chunk)) {
 			erofs_update_minextblks(sbi, interval_start, pos,
 						&minextblks);
 			interval_start = pos;
@@ -540,7 +559,40 @@ void erofs_blob_exit(void)
 	}
 }
 
-int erofs_blob_init(const char *blobfile_path)
+static int erofs_insert_zerochunk(erofs_off_t chunksize)
+{
+	u8 *zeros;
+	struct erofs_blobchunk *chunk;
+	u8 sha256[32];
+	unsigned int hash;
+	int ret = 0;
+
+	zeros = calloc(1, chunksize);
+	if (!zeros)
+		return -ENOMEM;
+
+	erofs_sha256(zeros, chunksize, sha256);
+	hash = memhash(sha256, sizeof(sha256));
+	chunk = malloc(sizeof(struct erofs_blobchunk));
+	if (!chunk) {
+		ret = -ENOMEM;
+		goto out_free_zeros;
+	}
+
+	chunk->chunksize = chunksize;
+	/* treat chunk filled with zeros as hole */
+	chunk->blkaddr = erofs_holechunk.blkaddr;
+	memcpy(chunk->sha256, sha256, sizeof(sha256));
+
+	hashmap_entry_init(&chunk->ent, hash);
+	hashmap_add(&blob_hashmap, chunk);
+
+out_free_zeros:
+	free(zeros);
+	return ret;
+}
+
+int erofs_blob_init(const char *blobfile_path, erofs_off_t chunksize)
 {
 	if (!blobfile_path) {
 #ifdef HAVE_TMPFILE64
@@ -557,7 +609,7 @@ int erofs_blob_init(const char *blobfile_path)
 		return -EACCES;
 
 	hashmap_init(&blob_hashmap, erofs_blob_hashmap_cmp, 0);
-	return 0;
+	return erofs_insert_zerochunk(chunksize);
 }
 
 int erofs_mkfs_init_devices(struct erofs_sb_info *sbi, unsigned int devices)
diff --git a/mkfs/main.c b/mkfs/main.c
index 2fb4a57..d632f74 100644
--- a/mkfs/main.c
+++ b/mkfs/main.c
@@ -1255,7 +1255,7 @@ int main(int argc, char **argv)
 	}
 
 	if (cfg.c_chunkbits) {
-		err = erofs_blob_init(cfg.c_blobdev_path);
+		err = erofs_blob_init(cfg.c_blobdev_path, 1 << cfg.c_chunkbits);
 		if (err)
 			return 1;
 	}
-- 
2.44.0.683.g7961c838ac-goog


             reply	other threads:[~2024-04-17 23:49 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-04-17 23:48 Sandeep Dhavale via Linux-erofs [this message]
2024-04-18  5:05 ` [PATCH v3] erofs-utils: lib: treat data blocks filled with 0s as a hole Gao Xiang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240417234845.2758882-1-dhavale@google.com \
    --to=linux-erofs@lists.ozlabs.org \
    --cc=dhavale@google.com \
    --cc=hsiangkao@linux.alibaba.com \
    --cc=kernel-team@android.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).