Improve readability of long-context prefill scheduling by extracting named helper functions

grimoire · grimoire · commit 4cf43bc85621 · 2026-06-23T14:27:48.000+08:00
diff --git a/lmdeploy/pytorch/engine/inputs_maker.py b/lmdeploy/pytorch/engine/inputs_maker.py
@@ -905,6 +905,53 @@ def __create_short_or_normal_prefill_turn():
                 self._short_prefill_turns_since_long_chunk += 1
             return result
 
+        def __is_empty_forward(forward_inputs: 'ModelInputs|None', forward_delta: 'ModelInputsDelta|None') -> bool:
+            return forward_inputs is None and forward_delta is None  # True only when both are None
+
+        def __try_active_long_context_chunk():
+            """Run the long-context chunk builder and flag an empty result."""
+            nonlocal attempted_long_work, active_long_chunk_blocked_by_kv
+            attempted_long_work = True
+            chunk_result = __create_inputs_long_context_chunk()
+            _, new_inputs, new_delta, _ = chunk_result
+            active_long_chunk_blocked_by_kv = __is_empty_forward(new_inputs, new_delta)
+            return chunk_result
+
+        def __should_try_short_prefill_before_active_chunk():
+            """Whether short/normal prefill may run before an active chunk.
+
+            Requires a non-final chunk, waiting requests, and no due chunk turn.
+            """
+            if self.long_context_chunker.is_last_chunk() or not scheduler.has_waiting():
+                return False
+            return not self._is_long_context_chunk_turn_due()
+
+        def __has_no_forward() -> bool:
+            return __is_empty_forward(inputs, delta)  # reads the enclosing scope's current inputs/delta
+
+        def __can_fallback_to_short_after_long_work():
+            """Whether a failed long-work attempt may fall back to short prefill.
+
+            False if the chunk was flagged KV-blocked or short prefill was attempted.
+            """
+            if not __has_no_forward() or not attempted_long_work:
+                return False
+            if active_long_chunk_blocked_by_kv or attempted_short_or_normal_prefill:
+                return False
+            return scheduler.has_waiting()
+
+        def __can_try_short_prefill_after_defer():
+            """Whether short prefill may run while the long chunk is deferred
+            and no chunk turn is due."""
+            if not (__has_no_forward() and deferred_long_context_chunk):
+                return False
+            if self._is_long_context_chunk_turn_due():
+                return False
+            return scheduler.has_waiting()
+
+        def __can_retry_deferred_active_chunk():  # no forward yet, chunk was deferred, chunking enabled
+            return __has_no_forward() and deferred_long_context_chunk and self.long_context_chunker.enabled()
+
         scheduler = self.scheduler
         logger.debug(f'Make forward inputs with prefill={prefill}, enable_empty={enable_empty}')
 
@@ -926,11 +973,9 @@ def __create_short_or_normal_prefill_turn():
         # long prefill through the scheduler.
         self.long_context_chunker.check_enable()
         if self.long_context_chunker.enabled():
-            # long context chunking
             if self._should_defer_long_context_chunk(prefill):
                 deferred_long_context_chunk = True
-            elif (not self.long_context_chunker.is_last_chunk() and scheduler.has_waiting()
-                  and not self._is_long_context_chunk_turn_due()):
+            elif __should_try_short_prefill_before_active_chunk():
                 # After a decode turn, keep the short/normal prefill quota in
                 # front of active long chunks; otherwise decode -> long can
                 # repeat and small waiting requests remain gated by the active
@@ -943,14 +988,10 @@ def __create_short_or_normal_prefill_turn():
                     swap_in_map,
                     swap_out_map,
                 ) = __create_short_or_normal_prefill_turn()
-                if inputs is None and delta is None:
-                    attempted_long_work = True
-                    running, inputs, delta, extra_inputs = __create_inputs_long_context_chunk()
-                    active_long_chunk_blocked_by_kv = inputs is None and delta is None
+                if __is_empty_forward(inputs, delta):
+                    running, inputs, delta, extra_inputs = __try_active_long_context_chunk()
             else:
-                attempted_long_work = True
-                running, inputs, delta, extra_inputs = __create_inputs_long_context_chunk()
-                active_long_chunk_blocked_by_kv = inputs is None and delta is None
+                running, inputs, delta, extra_inputs = __try_active_long_context_chunk()
         elif prefill:
             # prefill
             has_waiting_long_prefill = scheduler.has_waiting_long_prefill()
@@ -963,7 +1004,7 @@ def __create_short_or_normal_prefill_turn():
                     swap_in_map,
                     swap_out_map,
                 ) = __create_short_or_normal_prefill_turn()
-                if inputs is None and delta is None:
+                if __has_no_forward():
                     (
                         running,
                         inputs,
@@ -986,8 +1027,7 @@ def __create_short_or_normal_prefill_turn():
         # Waiting-long admission failure can still fall back to short prefills.
         # Active-long reservation failure means KV is pinned by running work;
         # admit decode only so existing requests can drain blocks.
-        if (inputs is None and delta is None and attempted_long_work and not active_long_chunk_blocked_by_kv
-                and scheduler.has_waiting() and not attempted_short_or_normal_prefill):
+        if __can_fallback_to_short_after_long_work():
             (
                 running,
                 inputs,
@@ -1004,8 +1044,7 @@ def __create_short_or_normal_prefill_turn():
             self.to_evict_seqs = invalid_seqs
             extra_inputs = None
 
-        if (inputs is None and delta is None and deferred_long_context_chunk and scheduler.has_waiting()
-                and not self._is_long_context_chunk_turn_due()):
+        if __can_try_short_prefill_after_defer():
             (
                 running,
                 inputs,
@@ -1015,8 +1054,8 @@ def __create_short_or_normal_prefill_turn():
                 swap_out_map,
             ) = __create_short_or_normal_prefill_turn()
 
-        if inputs is None and delta is None and deferred_long_context_chunk and self.long_context_chunker.enabled():
-            running, inputs, delta, extra_inputs = __create_inputs_long_context_chunk()
+        if __can_retry_deferred_active_chunk():
+            running, inputs, delta, extra_inputs = __try_active_long_context_chunk()
 
         # reset decode count when non-decoding inputs are produced
         if inputs is not None and not inputs.is_decoding:
diff --git a/lmdeploy/pytorch/paging/scheduler.py b/lmdeploy/pytorch/paging/scheduler.py
@@ -250,8 +250,8 @@ def _long_prefill_priority_key(self, seq: SchedulerSequence, now: float):
         estimated_chunks = self._long_prefill_estimated_chunks(seq)
         wait_age = max(0.0, now - seq.arrive_time)
         age_credit = int(wait_age // self._long_prefill_aging_seconds_per_chunk)
-        virtual_chunks = estimated_chunks - age_credit
-        return virtual_chunks, estimated_chunks, seq.arrive_time
+        age_adjusted_chunks = estimated_chunks - age_credit
+        return age_adjusted_chunks, estimated_chunks, seq.arrive_time
 
     def _prepare_prefill_allocation(self, seq: SchedulerSequence, prealloc_size: int):
         """Apply chunk KV limit and return the effective prealloc size."""
@@ -423,44 +423,59 @@ def __prepare_and_evict(seq: SchedulerSequence, waiting):
             seq.kv_token_limit = None
             return False, alloc_prealloc_size
 
+        def _split_waiting_by_prefill_kind(waiting: SeqList):
+            """Split waiting requests into normal/final and non-final long prefill.
+
+            A None `_prefill_kv_token_limit` marks a request as normal; order is kept.
+            """
+            normal_seqs: SeqList = []
+            long_seqs: SeqList = []
+            for seq in waiting:
+                bucket = normal_seqs if self._prefill_kv_token_limit(seq) is None else long_seqs
+                bucket.append(seq)
+            return normal_seqs, long_seqs
+
+        def _sort_normal_prefills(waiting: SeqList) -> SeqList:  # by admission token count, then arrival time
+            return sorted(waiting, key=lambda seq: (self._prefill_admission_token_count(seq), seq.arrive_time))
+
+        def _sort_long_prefills_for_long_turn(waiting: SeqList):  # input is returned as-is unless policy is 'size'
+            if self._long_prefill_policy == 'size':
+                now = time.perf_counter()  # one timestamp shared by every priority key
+                return sorted(waiting, key=lambda seq: self._long_prefill_priority_key(seq, now))
+            return waiting
+
+        def _reorder_waiting_for_long_turn(waiting: SeqList):
+            """Put the top-ranked long waiter first, then normal prefills, then
+            the other long waiters; return None when there is no long waiter."""
+            normal_seqs, long_seqs = _split_waiting_by_prefill_kind(waiting)
+            if not long_seqs:
+                return None
+
+            first_long, *other_longs = _sort_long_prefills_for_long_turn(long_seqs)
+            normal_seqs = _sort_normal_prefills(normal_seqs)
+            return [first_long, *normal_seqs, *other_longs]
+
+        def _reorder_waiting_for_short_turn(waiting: SeqList):
+            """Sorted normal/final prefills first; long waiters follow in their
+            given order."""
+            normal_seqs, long_seqs = _split_waiting_by_prefill_kind(waiting)
+            return [*_sort_normal_prefills(normal_seqs), *long_seqs]
+
         def _reorder_waiting():
             """Reorder waiting."""
             waiting = sorted(self.waiting, key=lambda seq: seq.arrive_time)
             if prefer_long_prefill:
                 # Long-work turns choose one long waiter first. The size policy
                 # only reorders this long lane; it is not global
                 # shortest-prefill-first admission.
-                long_waiting: SeqList = []
-                normal_waiting: SeqList = []
-                for seq in waiting:
-                    if self._prefill_kv_token_limit(seq) is None:
-                        normal_waiting.append(seq)
-                    else:
-                        long_waiting.append(seq)
-                if len(long_waiting) > 0:
-                    if self._long_prefill_policy == 'size':
-                        now = time.perf_counter()
-                        long_waiting = sorted(long_waiting,
-                                              key=lambda seq: self._long_prefill_priority_key(seq, now))
-                    normal_waiting = sorted(normal_waiting,
-                                            key=lambda seq: (self._prefill_admission_token_count(seq),
-                                                             seq.arrive_time))
-                    return [long_waiting[0]] + normal_waiting + long_waiting[1:]
+                long_turn_waiting = _reorder_waiting_for_long_turn(waiting)
+                if long_turn_waiting is not None:
+                    return long_turn_waiting
 
             if allow_long_prefill:
                 return waiting
 
-            normal_waiting: SeqList = []
-            long_waiting: SeqList = []
-            for seq in waiting:
-                if self._prefill_kv_token_limit(seq) is None:
-                    normal_waiting.append(seq)
-                else:
-                    long_waiting.append(seq)
-
-            normal_waiting = sorted(normal_waiting, key=lambda seq: (self._prefill_admission_token_count(seq),
-                                                                     seq.arrive_time))
-            return normal_waiting + long_waiting
+            return _reorder_waiting_for_short_turn(waiting)
 
         num_waiting = self.seq_manager.num_sequences(MessageStatus.WAITING)
         if (len(running) >= max_batches or num_waiting == 0):