-
Notifications
You must be signed in to change notification settings - Fork 305
feat: add multi node disk cache #1218
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,7 +6,7 @@ | |
| from typing import List, Optional | ||
|
|
||
| import torch | ||
| from lightllm.utils.envs_utils import get_unique_server_name | ||
| from lightllm.utils.envs_utils import get_disk_cache_index_prefix, get_unique_server_name | ||
| from lightllm.utils.log_utils import init_logger | ||
| from .cpu_cache_client import CpuKvCacheClient | ||
|
|
||
|
|
@@ -36,15 +36,18 @@ def __init__( | |
| disk_cache_storage_size: float, | ||
| cpu_cache_client: CpuKvCacheClient, | ||
| disk_cache_dir: Optional[str] = None, | ||
| redis_endpoint: str = "", | ||
| num_node_in_disk_cache: int = 1, | ||
| ): | ||
| self.cpu_cache_client = cpu_cache_client | ||
| self._pages_all_idle = False | ||
|
|
||
| assert disk_cache_storage_size > 0 | ||
| storage_size = int(disk_cache_storage_size * (1024 ** 3)) | ||
| # num_shard与KVCACHE_MAX_BLOCK_SIZE相关,KVCACHE_MAX_BLOCK_SIZE默认64MB前提下, | ||
| # num_shard设置32, 能使disk cache的容量利用率达到90%,继续增大num_shard会导致容量利用率下降 | ||
| num_shard = 32 | ||
| if num_node_in_disk_cache <= 0: | ||
| raise ValueError(f"num_node_in_disk_cache must be >= 1, got {num_node_in_disk_cache}") | ||
| num_shard = 64 * num_node_in_disk_cache if redis_endpoint else 64 | ||
| num_worker = 48 | ||
| # 读写同时进行时,分配16线程用来写,32线程用来读 | ||
| max_concurrent_write_tasks = 16 | ||
|
|
@@ -64,6 +67,9 @@ def __init__( | |
| storage_size=storage_size, | ||
| num_shard=num_shard, | ||
| num_worker=num_worker, | ||
| index_endpoint=redis_endpoint, | ||
| index_prefix=get_disk_cache_index_prefix(), | ||
| bandwidth_log=True, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The |
||
| ) | ||
|
|
||
| logger.info( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The comment on lines 45-46 states that
`num_shard` is set to `32` for 90% utilization. However, the new logic on line 50 changes `num_shard` to `64 * num_node_in_disk_cache` or `64`. This change in the calculation of `num_shard` should be reflected in the comment to avoid confusion and explain the new rationale behind the value, especially the change from 32 to 64 as a base.