Re: [f2fs-dev] [PATCH 1/2] f2fs: support large folio for immutable non-compressed case
From: Chao Yu <ch...@ke...> - 2025-11-25 01:38:48
On 11/22/2025 9:17 AM, Jaegeuk Kim wrote:
> On 11/21, Chao Yu wrote:
>> On 11/21/2025 7:54 AM, Jaegeuk Kim via Linux-f2fs-devel wrote:
>>> This patch enables large folio for limited case where we can get the high-order
>>> memory allocation. It supports the encrypted and fsverity files, which are
>>> essential for Android environment.
>>>
>>> How to test:
>>> - dd if=/dev/zero of=/mnt/test/test bs=1G count=4
>>> - f2fs_io setflags immutable /mnt/test/test
>>> - echo 3 > /proc/sys/vm/drop_caches
>>> : to reload inode with large folio
>>> - f2fs_io read 32 0 1024 mmap 0 0 /mnt/test/test
>>>
>>> Signed-off-by: Jaegeuk Kim <ja...@ke...>
>>> ---
>>> fs/f2fs/data.c | 245 ++++++++++++++++++++++++++++++++++++++++++++++--
>>> fs/f2fs/f2fs.h | 16 ++++
>>> fs/f2fs/inode.c | 6 +-
>>> 3 files changed, 257 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
>>> index 48c20386f031..8f433677c49d 100644
>>> --- a/fs/f2fs/data.c
>>> +++ b/fs/f2fs/data.c
>>> @@ -31,9 +31,15 @@
>>> static struct kmem_cache *bio_post_read_ctx_cache;
>>> static struct kmem_cache *bio_entry_slab;
>>> +static struct kmem_cache *ffs_entry_slab;
>>> static mempool_t *bio_post_read_ctx_pool;
>>> static struct bio_set f2fs_bioset;
>>> +struct f2fs_folio_state {
>>> + spinlock_t state_lock;
>>> + unsigned int read_pages_pending;
>>> +};
>>> +
>>> #define F2FS_BIO_POOL_SIZE NR_CURSEG_TYPE
>>> int __init f2fs_init_bioset(void)
>>> @@ -138,11 +144,15 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
>>> {
>>> struct folio_iter fi;
>>> struct bio_post_read_ctx *ctx = bio->bi_private;
>>> + unsigned long flags;
>>> bio_for_each_folio_all(fi, bio) {
>>> struct folio *folio = fi.folio;
>>> + unsigned nr_pages = fi.length >> PAGE_SHIFT;
>>> + bool finished = true;
>>> - if (f2fs_is_compressed_page(folio)) {
>>> + if (!folio_test_large(folio) &&
>>> + f2fs_is_compressed_page(folio)) {
>>> if (ctx && !ctx->decompression_attempted)
>>> f2fs_end_read_compressed_page(folio, true, 0,
>>> in_task);
>>> @@ -156,8 +166,20 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task)
>>> bio->bi_status = BLK_STS_IOERR;
>>> }
>>> - dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
>>> - folio_end_read(folio, bio->bi_status == BLK_STS_OK);
>>> + if (folio_test_large(folio)) {
>>> + struct f2fs_folio_state *ffs = folio->private;
>>> +
>>> + spin_lock_irqsave(&ffs->state_lock, flags);
>>> + ffs->read_pages_pending -= nr_pages;
>>> + finished = !ffs->read_pages_pending;
>>> + spin_unlock_irqrestore(&ffs->state_lock, flags);
>>> + }
>>> +
>>> + while (nr_pages--)
>>> + dec_page_count(F2FS_F_SB(folio), __read_io_type(folio));
>>> +
>>> + if (finished)
>>> + folio_end_read(folio, bio->bi_status == BLK_STS_OK);
>>> }
>>> if (ctx)
>>> @@ -518,6 +540,9 @@ static bool f2fs_crypt_mergeable_bio(struct bio *bio, const struct inode *inode,
>>> void f2fs_submit_read_bio(struct f2fs_sb_info *sbi, struct bio *bio,
>>> enum page_type type)
>>> {
>>> + if (!bio)
>>> + return;
>>> +
>>> WARN_ON_ONCE(!is_read_io(bio_op(bio)));
>>> trace_f2fs_submit_read_bio(sbi->sb, type, bio);
>>> @@ -1209,11 +1234,21 @@ struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index,
>>> struct dnode_of_data dn;
>>> struct folio *folio;
>>> int err;
>>> -
>>> +retry:
>>> folio = f2fs_grab_cache_folio(mapping, index, for_write);
>>> if (IS_ERR(folio))
>>> return folio;
>>> + if (folio_test_large(folio)) {
>>> + pgoff_t folio_index = mapping_align_index(mapping, index);
>>> +
>>> + f2fs_folio_put(folio, true);
>>> + invalidate_inode_pages2_range(mapping, folio_index,
>>> + folio_index + folio_nr_pages(folio) - 1);
>>> + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT);
>>> + goto retry;
>>> + }
>>
>> Do we need to move the above check into f2fs_grab_cache_folio(), as we call
>> f2fs_grab_cache_folio() in a lot of places?
>
> We're okay with high-order allocation in other paths, but I think this is
> the only problematic path since it is used by GC writes.
Oh, right.
>
>>
>>> +
>>> if (f2fs_lookup_read_extent_cache_block(inode, index,
>>> &dn.data_blkaddr)) {
>>> if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
>>> @@ -2341,6 +2376,177 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
>>> }
>>> #endif
>>> +static struct f2fs_folio_state *ffs_find_or_alloc(struct folio *folio)
>>> +{
>>> + struct f2fs_folio_state *ffs = folio->private;
>>> +
>>> + if (ffs)
>>> + return ffs;
>>> +
>>> + ffs = f2fs_kmem_cache_alloc(ffs_entry_slab, GFP_NOIO, true, NULL);
>>> +
>>> + spin_lock_init(&ffs->state_lock);
>>> + folio_attach_private(folio, ffs);
>>> + return ffs;
>>> +}
>>> +
>>> +static void ffs_detach_free(struct folio *folio)
>>> +{
>>> + struct f2fs_folio_state *ffs;
>>> +
>>> + if (!folio_test_large(folio)) {
>>> + folio_detach_private(folio);
>>> + return;
>>> + }
>>> +
>>> + ffs = folio_detach_private(folio);
>>> + if (!ffs)
>>> + return;
>>> +
>>> + WARN_ON_ONCE(ffs->read_pages_pending != 0);
>>> + kmem_cache_free(ffs_entry_slab, ffs);
>>> +}
>>> +
>>> +static int f2fs_read_data_large_folio(struct inode *inode,
>>> + struct readahead_control *rac, struct folio *folio)
>>> +{
>>> + struct bio *bio = NULL;
>>> + sector_t last_block_in_bio = 0;
>>> + struct f2fs_map_blocks map;
>>> + pgoff_t index, offset;
>>> + unsigned max_nr_pages = rac ? readahead_count(rac) :
>>> + folio_nr_pages(folio);
>>> + unsigned nrpages;
>>> + struct f2fs_folio_state *ffs;
>>> + int ret = 0;
>>> +
>>> + if (f2fs_compressed_file(inode))
>>> + return -EOPNOTSUPP;
>>
>> if (!IS_IMMUTABLE(inode))
>> return -EOPNOTSUPP;
>>
>> We can still configure the inode after this check? Can we add some sanity check
>> to prevent enabling compress/immutable/quota if the inode has already enabled large folio?
>
> I think immutable will prevent most of the changes?
Someone can drop the immutable flag in parallel after the above check?
Do we need to cover read() w/ inode_lock_shared() to prevent f2fs_fileattr_set()
clearing the immutable flag concurrently?
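
Or, if we'd rather guard it on the fileattr side, maybe something like the below
in f2fs_fileattr_set()? Just a rough, untested sketch assuming fa is the fileattr
argument there; the error code is arbitrary:

	/* Sketch only: once the mapping has been set up for large folios,
	 * refuse to clear the immutable flag so the large-folio read path
	 * can't race with the flag change. */
	if (mapping_large_folio_support(inode->i_mapping) &&
	    !(fa->flags & FS_IMMUTABLE_FL))
		return -EBUSY;
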
Thanks,
>
>>
>>> +
>>> + memset(&map, 0, sizeof(map));
>>
>> Can be replaced w/ struct f2fs_map_blocks map = {0, };
>>
>>> + map.m_seg_type = NO_CHECK_TYPE;
>>> +
>>> + if (rac)
>>> + folio = readahead_folio(rac);
>>> +next_folio:
>>> + if (!folio)
>>> + goto out;
>>> +
>>> + index = folio->index;
>>> + offset = 0;
>>> + ffs = NULL;
>>> + nrpages = folio_nr_pages(folio);
>>> +
>>> + for (; nrpages; nrpages--) {
>>> + sector_t block_nr;
>>> + /*
>>> + * Map blocks using the previous result first.
>>> + */
>>> + if ((map.m_flags & F2FS_MAP_MAPPED) &&
>>> + index > map.m_lblk &&
>>> + index < (map.m_lblk + map.m_len))
>>> + goto got_it;
>>> +
>>> + /*
>>> + * Then do more f2fs_map_blocks() calls until we are
>>> + * done with this page.
>>> + */
>>> + memset(&map, 0, sizeof(map));
>>> + map.m_seg_type = NO_CHECK_TYPE;
>>> + map.m_lblk = index;
>>> + map.m_len = max_nr_pages;
>>> +
>>> + ret = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DEFAULT);
>>> + if (ret)
>>> + goto err_out;
>>> +got_it:
>>> + if ((map.m_flags & F2FS_MAP_MAPPED)) {
>>> + block_nr = map.m_pblk + index - map.m_lblk;
>>> + if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
>>> + DATA_GENERIC_ENHANCE_READ)) {
>>> + ret = -EFSCORRUPTED;
>>> + goto err_out;
>>> + }
>>> + } else {
>>> + folio_zero_range(folio, offset << PAGE_SHIFT, PAGE_SIZE);
>>> + if (f2fs_need_verity(inode, index) &&
>>> + !fsverity_verify_page(folio_file_page(folio,
>>> + index))) {
>>> + ret = -EIO;
>>> + goto err_out;
>>> + }
>>> + continue;
>>> + }
>>> +
>>> + /*
>>> + * This page will go to BIO. Do we need to send this
>>> + * BIO off first?
>>> + */
>>> + if (bio && (!page_is_mergeable(F2FS_I_SB(inode), bio,
>>> + last_block_in_bio, block_nr) ||
>>> + !f2fs_crypt_mergeable_bio(bio, inode, index, NULL))) {
>>> +submit_and_realloc:
>>> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
>>> + bio = NULL;
>>> + }
>>> + if (bio == NULL)
>>> + bio = f2fs_grab_read_bio(inode, block_nr,
>>> + max_nr_pages,
>>> + f2fs_ra_op_flags(rac),
>>> + index, false);
>>> +
>>> + /*
>>> + * If the page is under writeback, we need to wait for
>>> + * its completion to see the correct decrypted data.
>>> + */
>>> + f2fs_wait_on_block_writeback(inode, block_nr);
>>> +
>>> + if (!bio_add_folio(bio, folio, F2FS_BLKSIZE,
>>> + offset << PAGE_SHIFT))
>>> + goto submit_and_realloc;
>>> +
>>> + if (folio_test_large(folio)) {
>>> + ffs = ffs_find_or_alloc(folio);
>>> +
>>> + /* set the bitmap to wait */
>>> + spin_lock_irq(&ffs->state_lock);
>>> + ffs->read_pages_pending++;
>>> + spin_unlock_irq(&ffs->state_lock);
>>> + }
>>> +
>>> + inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
>>> + f2fs_update_iostat(F2FS_I_SB(inode), NULL, FS_DATA_READ_IO,
>>> + F2FS_BLKSIZE);
>>> + last_block_in_bio = block_nr;
>>> + index++;
>>> + offset++;
>>> + }
>>> + if (rac) {
>>> + folio = readahead_folio(rac);
>>> + goto next_folio;
>>> + }
>>> +err_out:
>>> + /* Nothing was submitted. */
>>> + if (!bio) {
>>> + if (!ret)
>>> + folio_mark_uptodate(folio);
>>> + folio_unlock(folio);
>>> + return ret;
>>> + }
>>> +
>>> + if (ret) {
>>> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
>>> +
>>> + /* Wait bios and clear uptodate. */
>>> + folio_lock(folio);
>>> + folio_clear_uptodate(folio);
>>> + folio_unlock(folio);
>>> + }
>>> +out:
>>> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
>>> + return ret;
>>> +}
>>> +
>>> /*
>>> * This function was originally taken from fs/mpage.c, and customized for f2fs.
>>> * Major change was from block_size == page_size in f2fs by default.
>>> @@ -2366,9 +2572,13 @@ static int f2fs_mpage_readpages(struct inode *inode,
>>> pgoff_t index;
>>> #endif
>>> unsigned nr_pages = rac ? readahead_count(rac) : 1;
>>> + struct address_space *mapping = rac ? rac->mapping : folio->mapping;
>>> unsigned max_nr_pages = nr_pages;
>>> int ret = 0;
>>> + if (mapping_large_folio_support(mapping))
>>> + return f2fs_read_data_large_folio(inode, rac, folio);
>>> +
>>> #ifdef CONFIG_F2FS_FS_COMPRESSION
>>> if (f2fs_compressed_file(inode)) {
>>> index = rac ? readahead_index(rac) : folio->index;
>>> @@ -2459,8 +2669,7 @@ static int f2fs_mpage_readpages(struct inode *inode,
>>> }
>>> #endif
>>> }
>>> - if (bio)
>>> - f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
>>> + f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA);
>>> return ret;
>>> }
>>> @@ -3747,7 +3956,12 @@ void f2fs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
>>> f2fs_remove_dirty_inode(inode);
>>> }
>>> }
>>> - folio_detach_private(folio);
>>> +
>>> + if (offset || length != folio_size(folio))
>>> + return;
>>> +
>>> + folio_cancel_dirty(folio);
>>> + ffs_detach_free(folio);
>>> }
>>> bool f2fs_release_folio(struct folio *folio, gfp_t wait)
>>> @@ -3756,7 +3970,7 @@ bool f2fs_release_folio(struct folio *folio, gfp_t wait)
>>> if (folio_test_dirty(folio))
>>> return false;
>>> - folio_detach_private(folio);
>>> + ffs_detach_free(folio);
>>> return true;
>>> }
>>> @@ -4162,12 +4376,25 @@ int __init f2fs_init_bio_entry_cache(void)
>>> {
>>> bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
>>> sizeof(struct bio_entry));
>>> - return bio_entry_slab ? 0 : -ENOMEM;
>>> +
>>> + if (!bio_entry_slab)
>>> + return -ENOMEM;
>>> +
>>> + ffs_entry_slab = f2fs_kmem_cache_create("f2fs_ffs_slab",
>>> + sizeof(struct f2fs_folio_state));
>>> +
>>> + if (!ffs_entry_slab) {
>>> + kmem_cache_destroy(bio_entry_slab);
>>> + return -ENOMEM;
>>> + }
>>> +
>>> + return 0;
>>> }
>>> void f2fs_destroy_bio_entry_cache(void)
>>> {
>>> kmem_cache_destroy(bio_entry_slab);
>>> + kmem_cache_destroy(ffs_entry_slab);
>>> }
>>> static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
>>> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
>>> index dffe8958b580..3340db04a7c2 100644
>>> --- a/fs/f2fs/f2fs.h
>>> +++ b/fs/f2fs/f2fs.h
>>> @@ -4916,6 +4916,22 @@ static inline bool is_journalled_quota(struct f2fs_sb_info *sbi)
>>> return false;
>>> }
>>> +static inline bool f2fs_quota_file(struct inode *inode)
>>> +{
>>> +#ifdef CONFIG_QUOTA
>>> + int i;
>>> +
>>> + if (!f2fs_sb_has_quota_ino(F2FS_I_SB(inode)))
>>> + return false;
>>> +
>>> + for (i = 0; i < MAXQUOTAS; i++) {
>>> + if (f2fs_qf_ino(F2FS_I_SB(inode)->sb, i) == inode->i_ino)
>>> + return true;
>>> + }
>>> +#endif
>>> + return false;
>>> +}
>>> +
>>> static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi)
>>> {
>>> return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK;
>>> diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
>>> index e2405b79b3cc..9162154d5211 100644
>>> --- a/fs/f2fs/inode.c
>>> +++ b/fs/f2fs/inode.c
>>> @@ -597,6 +597,8 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
>>> if (ret)
>>> goto bad_inode;
>>> make_now:
>>> + f2fs_set_inode_flags(inode);
>>> +
>>> if (ino == F2FS_NODE_INO(sbi)) {
>>> inode->i_mapping->a_ops = &f2fs_node_aops;
>>> mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
>>> @@ -618,6 +620,9 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
>>> inode->i_op = &f2fs_file_inode_operations;
>>> inode->i_fop = &f2fs_file_operations;
>>> inode->i_mapping->a_ops = &f2fs_dblock_aops;
>>> + if (IS_IMMUTABLE(inode) && !f2fs_compressed_file(inode) &&
>>> + !f2fs_quota_file(inode))
>>> + mapping_set_folio_min_order(inode->i_mapping, 0);
>>> } else if (S_ISDIR(inode->i_mode)) {
>>> inode->i_op = &f2fs_dir_inode_operations;
>>> inode->i_fop = &f2fs_dir_operations;
>>> @@ -638,7 +643,6 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
>>> ret = -EIO;
>>> goto bad_inode;
>>> }
>>> - f2fs_set_inode_flags(inode);
>>> unlock_new_inode(inode);
>>> trace_f2fs_iget(inode);