[remote_app] ; The application that is run by remote_app when it receives a job. app_path="./job_processor.sh" ; A relative or absolute pathname of the application that must be periodically ; run when the node is in idle mode (no jobs are being executed). ; ;idle_app_cmd="./idle_task.sh" ; Maximum application run time in seconds. ; If the job runs longer than that, remote_app terminates it. ; Default: 0 (no time limit) ; max_app_run_time=0 ; Time slice in seconds by which job expiration is extended while the job is ; running. ; ; This parameter is zero by default, which means that job lifetime is not ; extended while the job is running. For long running jobs, it is recommended ; to set this parameter to a non-zero value. ; keep_alive_period=60 ; Specifies what to do with the job when the job-processing application ; terminates with a non-zero exit code. ; ; Available actions: done - the job will be marked as done ; fail - the job will be failed with an error message ; return - the job will be returned back to the queue ; ; Default action is 'done' ; ;non_zero_exit_action = fail ; Specifies whether a separate temporary directory must be created for each ; run of the job-processing application. ; ; When the application terminates, this directory is automatically deleted ; unless 'remove_tmp_dir' (see below) is set to false. ; ; Default is false. ; run_in_separate_dir=true ; If 'run_in_separate_dir' is set to true, 'tmp_path' defines the location of ; the top-level temporary directory and, optionally, the format of the names ; of its per-job subdirectories. ; ; This parameter allows the following substitutions: ; * %q - NetSchedule queue name ; * %j - the ID of the job being processed ; * %t - the current UNIX Epoch time in seconds ; * %r - the current request number ; * %% - substituted with a single '%' character ; If none of the substitutions is given, the following default ; combination is used: ${tmp_path}/%q_%j_%t ; ;tmp_path=/tmp ; Synonym for 'tmp_path'. 
If both are set, 'tmp_dir' is used. ; ;tmp_dir=/tmp ; If 'run_in_separate_dir' is set to true, this parameter specifies whether the ; temporary directory created for each job must be deleted after the job has ; finished. ; ; Default is true. ; ;remove_tmp_dir = false ; Specifies a number of subsequent attempts to make asynchronously to remove ; a temporary directory (all attempts incl. first are done in a separate thread). ; Zero means a single immediate synchronous try (in the same thread). ; ; Default value is 60 attempts ; ;max_remove_tmp_attempts = 60 ; Specifies how often (not earlier than) those removing attempts occur. ; Cannot be less than 1 second (will be adjusted to 1 second otherwise). ; ; Default value is every 60 seconds ; ;sleep_between_remove_tmp_attempts = 60 ; If 'run_in_separate_dir' is set to true, specifies whether the standard ; output and standard error output must be cached in the working directory ; prior to sending them to NetCache. ; ; This prevents network timeouts for long-running jobs that produce ; a significant amount of data. ; ; Default is true. ; ;cache_std_out_err = false ; Specifies a job execution monitor (or watcher) application. This application ; can be used to check on the job-processing application or for progress ; report. ; ; The watcher runs periodically during the job-processing application ; execution (see 'monitor_period'). ; ; The following parameters are passed to the monitor program: ; -pid pid - real application process id ; -jid jid - job id which the real application is processing. ; -jwdir dir - the application's working directory ; ; The exit code of the monitor program is interpreted as follows: ; 0: The job is running as expected. The monitor's stdout is interpreted as ; a job progress message. The stderr goes to the log file if logging is ; enabled. ; 1: The monitor detected an inconsistency with the job run; the job must be ; returned back to the queue. 
The monitor's stderr goes to the log file ; regardless of whether logging is enabled or not. ; 2: The job must be failed. The monitor's stdout is interpreted as the ; error message; stderr goes to the log file regardless of whether ; logging is enabled or not. ; 3: There's a problem with the monitor application itself. The job ; continues to run and the monitor's stderr goes to the log file ; unconditionally. ;monitor_app_path=./monitor.sh ; Maximum monitor run time in seconds. If the monitor application does not ; finish within the specified time frame, it is terminated. ; ; The default is 5 seconds. ; ;max_monitor_running_time = 3 ; Specifies how often the monitor application must be run. ; ; Default period is every 5 seconds. ; ;monitor_period = 10 ; Wait time in seconds between the "soft" and the "hard" attempts to terminate ; the job processor. ; ; Default value is 1 second. ; ;kill_timeout = 5 ; Specifies number of attempts to be made to reap (wait for) a child process ; after that process failed to be killed. ; ; Default value is 60 attempts ; ;max_reap_attempts_after_kill = 60 ; Specifies how often (not earlier than) those reaping attempts occur ; ; Default value is every 60 seconds ; ;sleep_between_reap_attempts = 60 ; A command to request version of application that is run by remote_app ; ; Default value is equal to 'app_path' parameter value ; ;version_cmd = cat ; Space-separated command-line arguments to pass to 'version_cmd' command ; ; Default value is '-version' ; ;version_args = ./my_app_version.txt ; Specifies if application timeout is reported (in job's progress message). ; It's when an app does not finish in 'max_app_run_time' seconds (if set). ; ; Possible values: ; smart - report only if there is no progress message already reported earlier ; always - always report, even if it involves overriding ; an existing progress message (reported earlier) ; never - never report ; ; Default value is 'smart'. 
; ;progress_message_on_timeout = smart ; Specifies what exit codes force jobs to fail with no retries. ; ; The parameter must be in the following format (in ascending order): ; [!] R1, R2, ..., Rn ; ; Where: ; ! - negation, causes all provided ranges to be excluded ; (not included). ; R1 ... Rn - integer closed ranges specified either as ; FROM - TO (corresponds to [FROM, TO] range) or as ; NUMBER (corresponds to [NUMBER, NUMBER] range). ; Example: ; 4, 6 - 9, 16 - 40, 64 ; ;fail_no_retries_if_exit_code = ! 0 - 10 ; Environment variables handling: ; 1. Take the local environment. ; 2. Add or override all entries from [env_set] section. [env_set] ; LD_LIBRARY_PATH=. ; Generic worker node parameters [server] ; Maximum number of jobs that can be served simultaneously (each job occupies ; one thread). For computationally intensive algorithms, this value should ; not exceed the number of CPU cores. ; ; Default value is 'auto', which means that the value is automatically ; detected based on the number of CPU cores available in the system. ; ;max_threads=auto ; Initial number of worker threads. ; ; Default number is 1. ; ;init_threads=1 ; TCP port number for administrative access. This parameter can be specified ; as a range, in which case the worker node will choose the first available ; port from the given range. ; control_port=9300 ; Whether to enable verbose logging. ; log=true ; Whether to log progress message changes generated by the monitor ; application. False by default. ; ;log_progress=false ; Internal. ; Lower values give better response to the shutdown command when all ; working threads are occupied. ; thread_pool_timeout=5 ; Time in seconds between two consecutive attempts to get a job for execution. ; In between these attempts, the worker node waits on its UDP port for a ; notification from the server. If UDP is blocked by the firewall, this value ; may need to be lowered. ; ; Default value is 30 seconds. 
; job_wait_timeout=10 ; The maximum number of jobs the worker node is allowed to process before it must ; shut itself down. ; Restarting the node periodically is sometimes useful because of heap ; fragmentation, possible memory leaks, etc. ; ; The default value is 0, which means that worker node lifetime is not limited ; by the number of jobs it processes. ; ;max_total_jobs = 100 ; The maximum number of jobs the worker node is allowed to fail before it must ; shut itself down. ; ; The default value is 0, which means that worker node lifetime is not limited ; by the number of jobs it fails. ; ;max_failed_jobs = 3 ; Sets the maximum number of jobs with the same client IP address running in parallel. ; While this limit is reached for a client IP address, the worker node immediately returns ; all new jobs with the same client IP address back to the queue, also blacklisting them. ; Blacklisted jobs will not be given again to the same worker node for some time (server configured). ; ; Default value is 0, which means that jobs per client IP address are not limited ; ;max_jobs_per_client_ip = 10 ; Sets the maximum number of jobs with the same session ID running in parallel. ; While this limit is reached for a session ID, the worker node immediately returns ; all new jobs with the same session ID back to the queue, also blacklisting them. ; Blacklisted jobs will not be given again to the same worker node for some time (server configured). ; ; Default value is 0, which means that jobs per session ID are not limited ; ;max_jobs_per_session_id = 10 ; Sets the maximum limit for total memory consumption by this worker node. ; When this limit is reached, the worker node shuts down. ; The value can contain multiple-byte units: G(giga-), MB(mega-), KiB (kibi-) etc. ; ; Default value is 0, which means that memory consumption is not limited ; ;total_memory_limit = 1.5GB ; Sets the maximum limit for total runtime of this worker node (in seconds). 
; When this limit is reached, the worker node shuts down. ; ; Default value is 0, which means that runtime is not limited ; ;total_time_limit = 3600 ; Allows this worker node to detect infinite loops in job execution. ; If a job is being executed for more than the specified time, ; it is assumed to be stuck in an infinite loop. ; If this happens, the worker node enters shutdown mode: ; After all other running jobs are either done or also assumed stuck, ; the worker node shuts down (may end up killing itself). ; ; Default value is 0, which means that the node will not detect infinite loops. ; ;infinite_loop_time = 600 ; (POSIX systems only) Whether the worker node must daemonize after startup. ; ;daemon=true ; The list of worker nodes that this node must check before attempting ; to retrieve a job from the NetSchedule queue. If at least one of these ; worker nodes has an idle thread, this node will not connect to the queue ; to get a job. This node and all nodes from the given list must be connected ; to the same NetSchedule service and the same queue. ; ;master_nodes = server1:9300, server2:9300 ; List of network hosts from which administrative access to this worker node ; is allowed. Note that 'localhost' must be explicitly added to this list if ; needed. ; ; By default, administrative access from any host is granted. ; ;admin_hosts = localhost service1 service2 service3 ; Timeout in seconds before the node enters idle mode and the idle task is ; executed. The node is considered idle when all jobs are done and there are ; no new jobs in the queue. ; ; Default value is 30 sec. ; ;idle_run_delay = 30 ; Whether and when the worker node must shut itself down if it's idle. ; If the node is idle longer than the specified number of seconds, it will ; automatically shut down. 
; ; Default: 0 (no automatic shutdown) ; ;auto_shutdown_if_idle = 0 ; Specifies that the worker node must immediately exit as soon as ; it receives a shutdown request, without waiting for the worker ; threads to finish running their jobs. ;force_exit = false ; Specifies if the framework must reuse an instance of the job class. Setting ; this parameter to true means that only one instance of the job class will be ; created for each execution thread. False means that a new instance of the job ; class will be created for each incoming job. ; ; Default value is false. ; reuse_job_object = true ; Specifies how often the node must check the status of a running job. ; This parameter affects how soon the job-processing application will be ; terminated if the job is canceled. ; check_status_period = 4 ; Default timeout before the job is terminated in case of pullback. ; This value can be overridden by the '--timeout' option specified ; with 'grid_cli suspend --pullback'. ; default_pullback_timeout = 0 ; Specifies how often (in seconds) the worker node retries to commit a job after communication errors. ; The worker node gives up retrying either after it takes more than queue timeout (server configured) ; or if server successfully receives corresponding commit request but responds with an error. ; ; Minimum allowed value is 1 second, default is 2 seconds. ; ;commit_job_interval = 5 ; If set to true, the worker node forks at start. ; Parent process is only used to clear the node on servers on exit and child process does everything else. ; Thus, the node is reliably cleared even if child process crashes/is killed (for UNIX only). ; ; Default value is false (no forking). ; ;reliable_cleanup = true ; NetSchedule client configuration ; [netschedule_api] ; Specifies how the node must introduce itself to the NetSchedule servers. ; client_name= ; Specifies NetSchedule service. It can be either an LBSM service or a ; host:port address. 
; service=NS_test ; The name of the job queue, which can be either allocated by a Grid ; administrator or created dynamically using 'grid_cli createqueue'. ; queue_name= ; How often to query LBSM (time in seconds). ; rebalance_time=10 ; How often to query LBSM (number of internal LBSM requests before ; rebalancing). ; rebalance_requests=100 ; Instructs the worker node to use the internal NetSchedule storage. The size ; of this internal storage is queue-specific, but usually is about 256Kb. ; If job output does not fit there, it will be saved to NetCache. ; ; Default value is false. ; use_embedded_storage = true ; Use affinity information when requesting jobs. ; ; Default value is false. ; use_affinities = false ; Initial set of preferred affinities. ; Initial (comma/space separated) list of preferred affinities. ; Example: job_type_a, job_type_b ; ; Default value is empty. ; ;affinity_list = ; A prioritized list of affinities, which overrides the default ; job processing order. ; Cannot be used with affinity_list. ; ; Example: high_priority_job, mid_priority_job, low_priority_job ; ; Default value is empty. ; ;affinity_ladder = ; Use affinity information and accept new affinities automatically. ; Cannot be used with affinity_ladder. ; ; Default value is false. ; claim_new_affinities = false ; Allow the worker node to process jobs without affinities as well as ; jobs with "non-preferred" affinities. ; Cannot be used in combination with 'claim_new_affinities'. ; ; Default value is false. ; process_any_job = false ; Communication timeout (in seconds) for connections to the NetSchedule ; servers. ; ; Default is 12 seconds. ; ;communication_timeout = 20 ; Sets a communication timeout (in seconds) for accessing the first server ; in a service while submitting a job. If the first server does not reply ; within the specified amount of time, the next server will be tried, but ; the second and all subsequent servers will be given the full ; communication_timeout to reply. 
If LBSM services are not used or there's ; only one server in the service, this parameter does not apply. ; ; Default is [netschedule_api]/communication_timeout if defined, ; or 300ms (0.3s) ; ;first_server_timeout = 1 ; Network storage (NetCache) configuration ; [netcache_api] ; NetCache client identification -- must match [netschedule_api].client_name. ; client_name= ; NetCache service to use. It can be either an LBSM service or a ; host:port address. ; service=NC_test ; When true, job input data will be read from NetCache and stored as a file on ; a local file system before the job is run. This is to avoid possible ; connection timeouts when the job processor takes too much time to read its ; input. ; ; Default is false ; cache_input = false ; When true, job output will be collected in a local file prior to sending it ; to NetCache when the job has finished. This is to avoid possible connection ; timeouts when the job is long running and produces bursts of data with long ; intervals in between. ; ; Default is false ; cache_output = true ; If either 'cache_input' or 'cache_output' is enabled, this parameter ; defines a temporary directory where the cached input/output is stored. ; ; Default: . ; ;tmp_dir=/tmp ; Synonym for 'tmp_dir'. If both are set, 'tmp_dir' is used. ; ;tmp_path=/tmp ; How often to query LBSM (time interval in seconds). ; rebalance_time=10 ; How often to query LBSM (number of internal LBSM requests before ; rebalancing). ; rebalance_requests=100 ; Communication timeout (in seconds) for connections to the NetCache ; servers. ; ; Default is 12 seconds. ; ;communication_timeout = 20 ; Sets a communication timeout (in seconds) for accessing the first server ; in a service while creating a blob or performing any other operation with ; a mirrored blob. 
If the first server does not reply within the specified ; amount of time, the next server will be tried, but the second and all ; subsequent servers will be given the full communication_timeout to reply. ; If LBSM services are not used or there's only one server in the service, ; this parameter does not apply. ; ; Default is [netcache_api]/communication_timeout if defined, ; or 300ms (0.3s) ; ;first_server_timeout = 1 ; Throttling parameters. Described here for NetCache, but the same ; set of parameters can be defined in the [netschedule_api] section. ; If this many attempts to connect to a server fail in a row, ; then requests to the server will be throttled. ; Setting this parameter to zero disables this throttling criterion. ; Default = 0 ; ;throttle_by_subsequent_connection_failures = 3 ; If at least "5" of the last "20" attempts to connect to a server did not ; succeed, then requests to the server will be throttled. ; Setting any of the parameters to zero disables this throttling criterion. ; Default = 0/0 ; ;throttle_by_connection_error_rate = 5/20 ; Seconds to wait before the server can be accessed again (after requests to it ; have been throttled). ; Setting this parameter to zero or a negative value will disable the throttling ; algorithm (and all other "throttle_*" parameters will be ignored). ; Default = 0 ; ;throttle_relaxation_period = 12 ; If connecting to a server (with all retries and delays) takes more than the ; specified amount of time, the connection will be failed. ; Setting this parameter to zero or a negative value removes the restriction. ; Default = 0.0 ; ;max_connection_time = 23.4 ; If the NC server pool is load-balanced, then do not unthrottle server ; unless (and until) it is present in the LBSM service table. ; Default = false ; ;throttle_hold_until_active_in_lb = true