Re: [PATCH 2/2] tuna: Add idle-state control functionality

Crystal Wood <crwood@xxxxxxxxxx> · Thu, 13 Feb 2025 17:09:18 -0600

On Mon, 2025-01-27 at 20:45 -0500, John B. Wyatt IV wrote:
> Allows Tuna to control cpu idle-state functionality on the system,
> including querying, enabling, disabling of cpu idle-states to control
> power usage or to test functionality.
> 
> This requires cpupower, a utility in the Linux kernel repository and
> the cpupower Python bindings added in Linux 6.12 to control cpu
> idle-states. If cpupower is missing Tuna as a whole will continue to
> function and idle-set functionality will error out.
> 
> Signed-off-by: John B. Wyatt IV <jwyatt@xxxxxxxxxx>
> Signed-off-by: John B. Wyatt IV <sageofredondo@xxxxxxxxx>
> ---
>  tuna-cmd.py      |  33 +++++++-
>  tuna/cpupower.py | 202 +++++++++++++++++++++++++++++++++++++++++++++++
>  2 files changed, 233 insertions(+), 2 deletions(-)
>  create mode 100755 tuna/cpupower.py
> 
> diff --git a/tuna-cmd.py b/tuna-cmd.py
> index d0323f5..81d0f48 100755
> --- a/tuna-cmd.py
> +++ b/tuna-cmd.py
> @@ -25,6 +25,7 @@ from tuna import tuna, sysfs, utils
>  import logging
>  import time
>  import shutil
> +import tuna.cpupower as cpw

>  def get_loglevel(level):
>      if level.isdigit() and int(level) in range(0,5):
> @@ -115,8 +116,12 @@ def gen_parser():
>              "disable_perf": dict(action='store_true', help="Explicitly disable usage of perf in GUI for process view"),
>              "refresh": dict(default=2500, metavar='MSEC', type=int, help="Refresh the GUI every MSEC milliseconds"),
>              "priority": dict(default=(None, None), metavar="POLICY:RTPRIO", type=tuna.get_policy_and_rtprio, help="Set thread scheduler tunables: POLICY and RTPRIO"),
> -            "background": dict(action='store_true', help="Run command as background task")
> -         }
> +            "background": dict(action='store_true', help="Run command as background task"),
> +            "idle_state_disabled_status": dict(dest='idle_state_disabled_status', metavar='IDLESTATEDISABLEDSTATUS', type=int, help='Print if cpu idle state of the cpus in CPU-LIST is enabled or disabled. If CPU-LIST is not specified, default to all cpus.'),
> +            "idle_info": dict(dest='idle_info', action='store_const', const=True, help='Print general idle information on cpus in CPU-LIST. If CPU-LIST is not specified, default to all cpus.'),
> +            "disable_idle_state": dict(dest='disable_idle_state', metavar='IDLESTATEINDEX', type=int, help='Disable cpus in CPU-LIST\'s cpu idle (cpu sleep state). If CPU-LIST is not specified, default to all cpus.'),
> +            "enable_idle_state": dict(dest='enable_idle_state', metavar='IDLESTATEINDEX', type=int, help='Enable cpus in CPU-LIST\'s cpu idle (cpu sleep state). If CPU-LIST is not specified, default to all cpus.')
> +    }
>  
>      parser = HelpMessageParser(description="tuna - Application Tuning Program")
>  
> @@ -147,6 +152,10 @@ def gen_parser():
>      show_irqs = subparser.add_parser('show_irqs', description='Show IRQ list', help='Show IRQ list')
>      show_configs = subparser.add_parser('show_configs', description='List preloaded profiles', help='List preloaded profiles')
>  
> +    idle_set = subparser.add_parser('idle-set',
> +                                    description='Query and set all idle states on a given CPU list. Requires libcpupower to be installed',
> +                                    help='Set all idle states on a given CPU-LIST.')

idle_state would be a better name (or idle-state, but underscores are
already used elsewhere...), since it can both set and get.

It also mostly operates on individual states (-i being the exception),
not all at once.  How about just:

Manage CPU idle state disabling (requires libcpupower)

Also, don't forget to update the man page.

> @@ -635,6 +651,19 @@ def main():
>          my_logger.addHandler(add_handler("DEBUG", tofile=False))
>          my_logger.info("Debug option set")
>  
> +    if args.command == 'idle-set':
> +        if not cpw.have_cpupower:
> +            print(f"Error: libcpupower bindings are not detected; need {cpw.cpupower_required_kernel} at a minimum.")
> +            sys.exit(1)
> +
> +        if not args.cpu_list or args.cpu_list == []:
> +            args.cpu_list = cpw.Cpupower().get_all_cpu_list()
> +
> +        my_cpupower = cpw.Cpupower(args.cpu_list)
> +        ret = my_cpupower.idle_state_handler(args)
> +        if ret > 0:
> +            sys.exit(ret)

Why not just pass in cpu_list as is, and have cpw understand what
an empty or absent list is?  And it looks like it already partially
does check for None...  If the user specifically does something
like --cpu '' should we really treat that as equivalent to not
specifing --cpu?  I don't see other commands doing this.

Especially if you're going to delegate all other option parsing...
Why is cpulist special?  Just do something like:

elif args.command == 'idle_state'
    cpw.idle_state(args)

>      if args.loglevel:
>          if not args.debug:
>              my_logger = setup_logging("my_logger")

Why did you put the new command before log handling, rather than with
all the other commands?

> diff --git a/tuna/cpupower.py b/tuna/cpupower.py
> new file mode 100755
> index 0000000..b09dc2f
> --- /dev/null
> +++ b/tuna/cpupower.py
> @@ -0,0 +1,202 @@
> +# Copyright (C) 2024 John B. Wyatt IV
> +# SPDX-License-Identifier: GPL-2.0-only
> +
> +from typing import List
> +import tuna.utils as utils
> +
> +cpupower_required_kernel = "6.12"
> +have_cpupower = None
> +
> +
> +import raw_pylibcpupower as cpw

This is a bit confusing since you import this module as cpw
elsewhere...

Also, I got this even without trying to use the new functionality:

$ ./tuna-cmd.py 
Traceback (most recent call last):
  File "/home/crwood/git/tuna/./tuna-cmd.py", line 28, in <module>
    import tuna.cpupower as cpw
  File "/home/crwood/git/tuna/tuna/cpupower.py", line 11, in <module>
    import raw_pylibcpupower as cpw
ModuleNotFoundError: No module named 'raw_pylibcpupower'

Maybe something like:

try:
    import raw_pylibcpupower as lcpw
    lcpw.cpufreq_get_available_frequencies(0)
except:
    lcpw = None

> +        You must use have_cpupower variable to determine if the bindings were
> +        detected in your code."""

Instead of doing to the class/module user what SWIG did to the binding
user, why not just throw an exception if the binding is missing?  This
will automatically happen if lcpw is None, though you may want a
friendlier error message in the main entry point.

> +        def __init__(self, cpulist=None):
> +            if cpulist == None:
> +                self.__cpulist = self.get_all_cpu_list()
> +            else:
> +                self.__cpulist = cpulist
> +
> +        @classmethod
> +        def get_all_cpu_list(cls):
> +            return list(range(cls.get_idle_info()["all_cpus"]))

Is this really idle-state-specific?  Maybe just something like this
in tuna/utils.py:

def get_all_cpu_list():
    return list(range(get_nr_cpus()))

We shouldn't need to get all the idle state information just to get a
cpu list.

And all these class methods make me wonder why this is a class to begin
with, rather than just module functions.

> +
> +        @classmethod
> +        def get_idle_info(cls, cpu=0):

Why is cpu 0 special?

> +            idle_states, idle_states_amt = cls.get_idle_states(cpu)
> +            idle_states_list = []
> +            for idle_state in range(0, len(idle_states)):
> +                idle_states_list.append(
> +                    {
> +                        "CPU ID": cpu,
> +                        "Idle State Name": idle_states[idle_state],
> +                        "Flags/Description": cpw.cpuidle_state_desc(cpu, idle_state),
> +                        "Latency": cpw.cpuidle_state_latency(cpu, idle_state),
> +                        "Usage": cpw.cpuidle_state_usage(cpu, idle_state),
> +                        "Duration": cpw.cpuidle_state_time(cpu, idle_state)
> +                    }
> +                )
> +            idle_info = {
> +                "all_cpus": utils.get_nr_cpus(),
> +                "CPUidle-driver": cpw.cpuidle_get_driver(),
> +                "CPUidle-governor": cpw.cpuidle_get_governor(),
> +                "idle-states-count": idle_states_amt,
> +                "available-idle-states": idle_states,
> +                "cpu-states": idle_states_list
> +            }
> +            return idle_info

This seems overly complicated.  Why do you bundle all this stuff up
just to extract it elsewhere?  The call to get the data is simpler than
the data structure lookup.

> +
> +        @classmethod
> +        def print_idle_info(cls, cpu_list=[0]):

The only thing that instantiating this class does is to store a cpu
list, and here you're passing it into a class method instead...

> +        def idle_state_handler(self, args) -> int:
> +            if args.idle_state_disabled_status != None:
> +                cstate_index = args.idle_state_disabled_status
> +                cstate_list, cstate_amt = self.get_idle_states(args.cpu_list[0]) # Assumption, that all cpus have the same idle state

The API doesn't make that assumption, so why should we?  Systems exist
with heterogeneous CPUs... could be a reason to support looking up states
by name, if and when there are systems we care about that actually have
different cpuidle states.

And you don't need this lookup anyay -- libcpupower already does the
bounds check, and you can get the name inside the loop.

> +                if cstate_index < 0 or cstate_index >= cstate_amt:
> +                    print(f"Invalid idle state range. Total for this cpu is {cstate_amt}")
> +                    return 1

"this cpu"?

> +                cstate_name = cstate_list[cstate_index]
> +                ret = self.is_disabled_idle_state(cstate_index)
> +                for i, e in enumerate(ret):
> +                    match e:
> +                        case 1:
> +                            print(f"CPU: {args.cpu_list[i]} Idle state \"{cstate_name}\" is disabled.")
> +                        case 0:
> +                            print(f"CPU: {args.cpu_list[i]} Idle state \"{cstate_name}\" is enabled.")
> +                        case -1:
> +                            print(f"Idlestate not available")
> +                        case -2:
> +                            print(f"Disabling is not supported by the kernel")
> +                        case _:
> +                            print(f"Not documented: {e}")
> +            elif args.idle_info != None:
> +                self.print_idle_info(args.cpu_list)
> +                return 0
> +            elif args.disable_idle_state != None:
> +                cstate_index = args.disable_idle_state
> +                cstate_list, cstate_amt = self.get_idle_states(args.cpu_list[0]) # Assumption, that all cpus have the same idle state
> +                if cstate_index < 0 or cstate_index >= cstate_amt:
> +                    print(f"Invalid idle state range. Total for this cpu is {cstate_amt}")
> +                    return 1
> +                cstate_name = cstate_list[cstate_index]
> +                ret = self.disable_idle_state(cstate_index, 1)
> +                for i, e in enumerate(ret):
> +                    match e:
> +                        case 0:
> +                            print(f"CPU: {args.cpu_list[i]} Idle state \"{cstate_name}\" is disabled.")
> +                        case -1:
> +                            print(f"Idlestate not available")
> +                        case -2:
> +                            print(f"Disabling is not supported by the kernel")
> +                        case -3:
> +                            print(f"No write access to disable/enable C-states: try using sudo")
> +                        case _:
> +                            print(f"Not documented: {e}")
> +            elif args.enable_idle_state != None:
> +                cstate_index = args.enable_idle_state
> +                cstate_list, cstate_amt = self.get_idle_states(args.cpu_list[0]) # Assumption, that all cpus have the same idle state
> +                if cstate_index < 0 or cstate_index >= cstate_amt:
> +                    print(f"Invalid idle state range. Total for this cpu is {cstate_amt}")
> +                    return 1
> +                cstate_name = cstate_list[cstate_index]
> +                ret = self.disable_idle_state(cstate_index, 0)
> +                for i, e in enumerate(ret):
> +                    match e:
> +                        case 0:
> +                            print(f"CPU: {args.cpu_list[i]} Idle state \"{cstate_name}\" is enabled.")
> +                        case -1:
> +                            print(f"Idlestate not available")
> +                        case -2:
> +                            print(f"Disabling is not supported by the kernel")
> +                        case -3:
> +                            print(f"No write access to disable/enable C-states: try using sudo")
> +                        case _:
> +                            print(f"Not documented: {e}")

Factor out the error messages so they're not duplicated between
suboperations.  Actually, pretty much the whole thing is duplicated between
enable/disable...

Do we want to print anything at all on success?  It looks like tuna
generally follows the Unix philosophy of not doing so.

And the error messages are a bit vague... imagine running a script and
something deep inside it spits out "Disabling is not supported by the
kernel".  Disabling what?

> +            else:
> +                print(args)
> +                print("idle-set error: you should not get here!")

"you should not get here" is not a useful error message... jut throw
an exception.

-Crystal