cctools
work_queue.h
Go to the documentation of this file.
1 /*
2 Copyright (C) 2008- The University of Notre Dame
3 This software is distributed under the GNU General Public License.
4 See the file COPYING for details.
5 */
6 
7 #ifndef WORK_QUEUE_H
8 #define WORK_QUEUE_H
9 
20 #include <sys/types.h>
21 #include "timestamp.h"
22 #include "rmsummary.h"
23 
24 #define WORK_QUEUE_DEFAULT_PORT 9123
25 #define WORK_QUEUE_RANDOM_PORT 0
26 #define WORK_QUEUE_WAITFORTASK -1
28 #define WORK_QUEUE_SCHEDULE_UNSET 0
29 #define WORK_QUEUE_SCHEDULE_FCFS 1
30 #define WORK_QUEUE_SCHEDULE_FILES 2
31 #define WORK_QUEUE_SCHEDULE_TIME 3
32 #define WORK_QUEUE_SCHEDULE_RAND 4
33 #define WORK_QUEUE_SCHEDULE_WORST 5
35 #define WORK_QUEUE_INPUT 0
36 #define WORK_QUEUE_OUTPUT 1
38 #define WORK_QUEUE_NOCACHE 0
39 #define WORK_QUEUE_CACHE 1
40 #define WORK_QUEUE_SYMLINK 2 /* Create a symlink to the file rather than copying it, if possible. */
41 #define WORK_QUEUE_PREEXIST 4 /* If the filename already exists on the host, use it in place. */
42 #define WORK_QUEUE_THIRDGET 8 /* Access the file on the client from a shared filesystem */
43 #define WORK_QUEUE_THIRDPUT 8 /* Access the file on the client from a shared filesystem (included for readability) */
44 #define WORK_QUEUE_WATCH 16
46 #define WORK_QUEUE_RESET_ALL 0
47 #define WORK_QUEUE_RESET_KEEP_TASKS 1
49 #define WORK_QUEUE_DEFAULT_KEEPALIVE_INTERVAL 300
50 #define WORK_QUEUE_DEFAULT_KEEPALIVE_TIMEOUT 30
52 #define WORK_QUEUE_RESULT_SUCCESS 0
53 #define WORK_QUEUE_RESULT_INPUT_MISSING 1
54 #define WORK_QUEUE_RESULT_OUTPUT_MISSING 2
55 #define WORK_QUEUE_RESULT_STDOUT_MISSING 4
56 #define WORK_QUEUE_RESULT_SIGNAL 8
57 #define WORK_QUEUE_RESULT_RESOURCE_EXHAUSTION 16
58 #define WORK_QUEUE_RESULT_TASK_TIMEOUT 32
61 #define WORK_QUEUE_TASK_UNKNOWN 0
62 #define WORK_QUEUE_TASK_READY 1
63 #define WORK_QUEUE_TASK_RUNNING 2
64 #define WORK_QUEUE_TASK_WAITING_RETRIEVAL 3
65 #define WORK_QUEUE_TASK_RETRIEVED 4
66 #define WORK_QUEUE_TASK_DONE 5
67 #define WORK_QUEUE_TASK_CANCELED 6
69 extern double wq_option_fast_abort_multiplier;
71 extern int wq_option_scheduler;
75 struct work_queue_task {
76  char *tag;
77  char *command_line;
79  char *output;
80  struct list *input_files;
81  struct list *output_files;
82  struct list *env_list;
83  int taskid;
85  int result;
86  char *host;
87  char *hostname;
110  int64_t maximum_end_time;
111  int64_t memory;
112  int64_t disk;
113  int cores;
114  int gpus;
115  int unlabeled;
116 
117  double priority;
123 };
124 
154  double efficiency;
156  int capacity;
158  double bandwidth;
159  int64_t total_cores;
160  int64_t total_memory;
161  int64_t total_disk;
162  int64_t total_gpus;
163  int64_t committed_cores;
165  int64_t committed_disk;
166  int64_t committed_gpus;
167  int64_t min_cores;
168  int64_t max_cores;
169  int64_t min_memory;
170  int64_t max_memory;
171  int64_t min_disk;
172  int64_t max_disk;
173  int64_t min_gpus;
174  int64_t max_gpus;
175  int port;
176  int priority;
181 };
182 
183 
187 
195 struct work_queue_task *work_queue_task_create(const char *full_command);
196 
202 struct work_queue_task *work_queue_task_clone(const struct work_queue_task *task);
203 
208 void work_queue_task_specify_command( struct work_queue_task *t, const char *cmd );
209 
226 int work_queue_task_specify_file(struct work_queue_task *t, const char *local_name, const char *remote_name, int type, int flags);
227 
242 int work_queue_task_specify_file_piece(struct work_queue_task *t, const char *local_name, const char *remote_name, off_t start_byte, off_t end_byte, int type, int flags);
243 
254 int work_queue_task_specify_buffer(struct work_queue_task *t, const char *data, int length, const char *remote_name, int flags);
255 
269 int work_queue_task_specify_directory(struct work_queue_task *t, const char *local_name, const char *remote_name, int type, int flags, int recursive);
270 
276 void work_queue_task_specify_memory( struct work_queue_task *t, int64_t memory );
277 
283 void work_queue_task_specify_disk( struct work_queue_task *t, int64_t disk );
284 
290 void work_queue_task_specify_cores( struct work_queue_task *t, int cores );
291 
297 void work_queue_task_specify_gpus( struct work_queue_task *t, int gpus );
298 
304 void work_queue_task_specify_end_time( struct work_queue_task *t, int64_t seconds );
305 
312 void work_queue_task_specify_tag(struct work_queue_task *t, const char *tag);
313 
321 
328 void work_queue_task_specify_env( struct work_queue_task *t, const char *name, const char *value );
329 
339 void work_queue_task_specify_algorithm(struct work_queue_task *t, int algo );
340 
346 
348 
352 
369 struct work_queue *work_queue_create(int port);
370 
378 int work_queue_enable_monitoring(struct work_queue *q, char *monitor_summary_file);
379 
388 int work_queue_submit(struct work_queue *q, struct work_queue_task *t);
389 
394 void work_queue_blacklist_add(struct work_queue *q, const char *hostname);
395 
396 
401 void work_queue_blacklist_remove(struct work_queue *q, const char *hostname);
402 
403 
407 void work_queue_blacklist_clear(struct work_queue *q);
408 
423 struct work_queue_task *work_queue_wait(struct work_queue *q, int timeout);
424 
436 int work_queue_hungry(struct work_queue *q);
437 
445 int work_queue_empty(struct work_queue *q);
446 
453 int work_queue_port(struct work_queue *q);
454 
459 void work_queue_get_stats(struct work_queue *q, struct work_queue_stats *s);
460 
465 void work_queue_get_stats_hierarchy(struct work_queue *q, struct work_queue_stats *s);
466 
467 
473 int work_queue_task_state(struct work_queue *q, int taskid);
474 
479 void work_queue_set_bandwidth_limit(struct work_queue *q, const char *bandwidth);
480 
485 double work_queue_get_effective_bandwidth(struct work_queue *q);
486 
493 char * work_queue_get_worker_summary( struct work_queue *q );
494 
500 int work_queue_activate_fast_abort(struct work_queue *q, double multiplier);
501 
502 
506 int work_queue_send_receive_ratio(struct work_queue *q, double ratio);
507 
517 void work_queue_specify_algorithm(struct work_queue *q, int algo);
518 
523 const char *work_queue_name(struct work_queue *q);
524 
529 void work_queue_specify_name(struct work_queue *q, const char *name);
530 
535 void work_queue_specify_priority(struct work_queue *q, int priority);
536 
542 void work_queue_specify_catalog_server(struct work_queue *q, const char *hostname, int port);
543 
549 struct work_queue_task *work_queue_cancel_by_taskid(struct work_queue *q, int id);
550 
556 struct work_queue_task *work_queue_cancel_by_tasktag(struct work_queue *q, const char *tag);
557 
562 struct list * work_queue_cancel_all_tasks(struct work_queue *q);
563 
568 int work_queue_shut_down_workers(struct work_queue *q, int n);
569 
574 void work_queue_delete(struct work_queue *q);
575 
581 int work_queue_specify_log(struct work_queue *q, const char *logfile);
582 
588 void work_queue_specify_password( struct work_queue *q, const char *password );
589 
596 int work_queue_specify_password_file( struct work_queue *q, const char *file );
597 
602 void work_queue_specify_keepalive_interval(struct work_queue *q, int interval);
603 
608 void work_queue_specify_keepalive_timeout(struct work_queue *q, int timeout);
609 
610 
626 int work_queue_tune(struct work_queue *q, const char *name, double value);
627 
629 
633 
634 #define WORK_QUEUE_TASK_ORDER_FIFO 0
635 #define WORK_QUEUE_TASK_ORDER_LIFO 1
643 void work_queue_specify_task_order(struct work_queue *q, int order);
644 
645 
646 #define WORK_QUEUE_MASTER_MODE_STANDALONE 0
647 #define WORK_QUEUE_MASTER_MODE_CATALOG 1
656 void work_queue_specify_master_mode(struct work_queue *q, int mode);
657 
663 void work_queue_specify_estimate_capacity_on(struct work_queue *q, int estimate_capacity_on);
664 
673 int work_queue_task_specify_input_buf(struct work_queue_task *t, const char *buf, int length, const char *rname);
674 
682 int work_queue_task_specify_input_file(struct work_queue_task *t, const char *fname, const char *rname);
683 
691 int work_queue_task_specify_input_file_do_not_cache(struct work_queue_task *t, const char *fname, const char *rname);
692 
700 int work_queue_task_specify_output_file(struct work_queue_task *t, const char *rname, const char *fname);
701 
709 int work_queue_task_specify_output_file_do_not_cache(struct work_queue_task *t, const char *rname, const char *fname);
710 
712 
713 /* Experimental feature - intentionally left undocumented.
714 This feature exists to simplify performance evaluation and is not recommended
715 for production use since it delays execution of the workload.
716 Force the master to wait for the given number of workers to connect before
717 starting to dispatch tasks.
718 @param q A work queue object.
719 @param resources The number of workers to wait for before tasks are dispatched.*/
720 void work_queue_activate_worker_waiting(struct work_queue *q, int resources);
721 
722 #endif
int64_t total_memory
Total memory in MB aggregated across the connected workers.
Definition: work_queue.h:160
int64_t committed_gpus
Committed number of GPUs aggregated across the connected workers.
Definition: work_queue.h:166
int work_queue_task_specify_buffer(struct work_queue_task *t, const char *data, int length, const char *remote_name, int flags)
Add an input buffer to a task.
void work_queue_task_specify_env(struct work_queue_task *t, const char *name, const char *value)
Specify an environment variable to be added to the task.
void work_queue_specify_estimate_capacity_on(struct work_queue *q, int estimate_capacity_on)
Change whether to estimate master capacity for a given queue.
int work_queue_send_receive_ratio(struct work_queue *q, double ratio)
Change the preference to send or receive tasks.
int64_t committed_disk
Committed disk space in MB aggregated across the connected workers.
Definition: work_queue.h:165
int64_t min_disk
The smallest disk space in MB observed among the connected workers.
Definition: work_queue.h:171
A task description.
Definition: work_queue.h:75
void work_queue_task_specify_cores(struct work_queue_task *t, int cores)
Specify the number of cores required by a task.
timestamp_t total_good_execute_time
Total time in microseconds workers spent executing successful tasks.
Definition: work_queue.h:149
timestamp_t time_send_input_start
The time at which it started to transfer input files.
Definition: work_queue.h:93
struct list * work_queue_cancel_all_tasks(struct work_queue *q)
Cancel all submitted tasks and remove them from the queue.
timestamp_t start_time
Absolute time at which the master started.
Definition: work_queue.h:143
timestamp_t cmd_execution_time
Time spent in microseconds for executing the command on the worker.
Definition: work_queue.h:106
int tasks_running
Number of tasks currently running.
Definition: work_queue.h:136
int work_queue_task_specify_input_file_do_not_cache(struct work_queue_task *t, const char *fname, const char *rname)
Add an input file to a task, without caching.
int total_workers_joined
Total number of worker connections that were established to the master.
Definition: work_queue.h:132
int work_queue_task_specify_output_file(struct work_queue_task *t, const char *rname, const char *fname)
Add an output file to a task.
timestamp_t time_receive_output_start
The time at which it started to transfer output files.
Definition: work_queue.h:99
timestamp_t total_cmd_execution_time
Time spent in microseconds for executing the command on any worker, including resubmissions of the ta...
Definition: work_queue.h:108
void work_queue_get_stats(struct work_queue *q, struct work_queue_stats *s)
Get queue statistics (only from master).
struct work_queue_task * work_queue_cancel_by_taskid(struct work_queue *q, int id)
Cancel a submitted task using its task id and remove it from queue.
int workers_full
Definition: work_queue.h:178
int total_tasks_dispatched
Total number of tasks dispatched to workers.
Definition: work_queue.h:138
struct work_queue * work_queue_create(int port)
Create a new work queue.
void work_queue_specify_algorithm(struct work_queue *q, int algo)
Change the worker selection algorithm.
timestamp_t total_good_transfer_time
Total time in microseconds spent in sending and receiving data to workers for tasks with result WQ_RE...
Definition: work_queue.h:146
int workers_init
Number of workers initializing.
Definition: work_queue.h:129
int work_queue_task_specify_output_file_do_not_cache(struct work_queue_task *t, const char *rname, const char *fname)
Add an output file to a task without caching.
timestamp_t time_send_input_finish
The time at which it finished transferring input files.
Definition: work_queue.h:94
void work_queue_get_stats_hierarchy(struct work_queue *q, struct work_queue_stats *s)
Get statistics of the master queue together with foremen information.
timestamp_t time_app_delay
Definition: work_queue.h:121
void work_queue_task_specify_command(struct work_queue_task *t, const char *cmd)
Indicate the command to be executed.
int tasks_waiting
Number of tasks waiting to be run.
Definition: work_queue.h:135
int total_submissions
The number of times the task has been submitted.
Definition: work_queue.h:107
char * hostname
The name of the host on which it ran.
Definition: work_queue.h:87
Portable routines for high resolution timing.
double work_queue_get_effective_bandwidth(struct work_queue *q)
Get current queue bandwidth.
int64_t total_gpus
Total number of GPUs aggregated across the connected workers.
Definition: work_queue.h:162
int64_t total_bytes_sent
Total number of file bytes (not including protocol control msg bytes) sent out to the workers by the ...
Definition: work_queue.h:152
struct list * output_files
The output files (other than the standard output stream) created by the program expected to be retrie...
Definition: work_queue.h:81
int64_t total_bytes_transferred
Number of bytes transferred since task has last started transferring input data.
Definition: work_queue.h:104
void work_queue_delete(struct work_queue *q)
Delete a work queue.
UINT64_T timestamp_t
A type to hold the current time, in microseconds since January 1st, 1970.
Definition: timestamp.h:20
struct list * env_list
Environment variables applied to the task.
Definition: work_queue.h:82
int tasks_complete
Number of tasks waiting to be returned to user.
Definition: work_queue.h:137
int64_t min_gpus
The lowest number of GPUs observed among the connected workers.
Definition: work_queue.h:173
struct list * input_files
The files to transfer to the worker and place in the executing directory.
Definition: work_queue.h:80
double bandwidth
Average network bandwidth in MB/S observed by the master when transferring to workers.
Definition: work_queue.h:158
void work_queue_task_specify_algorithm(struct work_queue_task *t, int algo)
Select the scheduling algorithm for a single task.
struct work_queue_task * work_queue_cancel_by_tasktag(struct work_queue *q, const char *tag)
Cancel a submitted task using its tag and remove it from queue.
int total_tasks_complete
Total number of tasks completed and returned to user.
Definition: work_queue.h:139
struct work_queue_task * work_queue_wait(struct work_queue *q, int timeout)
Wait for a task to complete.
void work_queue_task_specify_memory(struct work_queue_task *t, int64_t memory)
Specify the amount of memory required by a task.
char * command_line
The program(s) to execute, as a shell command line.
Definition: work_queue.h:77
int capacity
The estimated number of workers that this master can effectively support.
Definition: work_queue.h:156
int64_t committed_memory
Committed memory in MB aggregated across the connected workers.
Definition: work_queue.h:164
int total_workers_removed
Total number of worker connections that were terminated by the master.
Definition: work_queue.h:133
int work_queue_specify_log(struct work_queue *q, const char *logfile)
Add a log file that records the states of the connected workers and submitted tasks.
timestamp_t time_receive_output_finish
The time at which it finished transferring output files.
Definition: work_queue.h:100
timestamp_t time_task_submit
The time at which this task was submitted.
Definition: work_queue.h:91
timestamp_t time_receive_result_finish
The time at which it finished transferring the results.
Definition: work_queue.h:98
int work_queue_task_specify_input_buf(struct work_queue_task *t, const char *buf, int length, const char *rname)
Add an input buffer to a task.
int64_t committed_cores
Committed number of cores aggregated across the connected workers.
Definition: work_queue.h:163
void work_queue_specify_catalog_server(struct work_queue *q, const char *hostname, int port)
Specify the catalog server the master should report to.
char * host
The address and port of the host on which it ran.
Definition: work_queue.h:86
int work_queue_port(struct work_queue *q)
Get the listening port of the queue.
int work_queue_specify_password_file(struct work_queue *q, const char *file)
Add a mandatory password file that each worker must present.
int taskid
A unique task id number.
Definition: work_queue.h:83
int avg_capacity
Definition: work_queue.h:180
int workers_busy
Number of workers that are running at least one task.
Definition: work_queue.h:131
struct work_queue_task * work_queue_task_create(const char *full_command)
Create a new task object.
void work_queue_specify_password(struct work_queue *q, const char *password)
Add a mandatory password that each worker must present.
struct rmsummary * resources_measured
When monitoring is enabled, it points to the measured resources used by the task. ...
Definition: work_queue.h:119
timestamp_t time_committed
The time at which a task was committed to a worker.
Definition: work_queue.h:89
int work_queue_task_specify_file(struct work_queue_task *t, const char *local_name, const char *remote_name, int type, int flags)
Add a file to a task.
int64_t total_bytes_received
Total number of file bytes (not including protocol control msg bytes) received from the workers by th...
Definition: work_queue.h:153
void work_queue_task_specify_tag(struct work_queue_task *t, const char *tag)
Attach a user defined string tag to the task.
int total_tasks_cancelled
Total number of tasks cancelled.
Definition: work_queue.h:141
void work_queue_task_specify_priority(struct work_queue_task *t, double priority)
Specify the priority of this task relative to others in the queue.
void work_queue_task_specify_end_time(struct work_queue_task *t, int64_t seconds)
Specify the maximum end time allowed for the task (in seconds since the Epoch).
void work_queue_blacklist_add(struct work_queue *q, const char *hostname)
Blacklist host from a queue.
struct work_queue_task * work_queue_task_clone(const struct work_queue_task *task)
Create a copy of a task. Create a functionally identical copy of a work_queue_task that can be re-sub...
int work_queue_empty(struct work_queue *q)
Determine whether the queue is empty.
void work_queue_task_specify_disk(struct work_queue_task *t, int64_t disk)
Specify the amount of disk space required by a task.
int work_queue_shut_down_workers(struct work_queue *q, int n)
Shut down workers connected to the work_queue system.
int64_t min_memory
The smallest memory size in MB observed among the connected workers.
Definition: work_queue.h:169
void work_queue_set_bandwidth_limit(struct work_queue *q, const char *bandwidth)
Limit the queue bandwidth when transferring files to and from workers.
const char * work_queue_name(struct work_queue *q)
Get the project name of the queue.
double priority
The priority of this task relative to others in the queue: higher numbers run earlier.
Definition: work_queue.h:117
void work_queue_specify_keepalive_timeout(struct work_queue *q, int timeout)
Change the keepalive timeout for identifying dead workers for a given queue.
void work_queue_blacklist_clear(struct work_queue *q)
Clear blacklist of a queue.
Statistics describing a work queue.
Definition: work_queue.h:127
Definition: rmsummary.h:24
int work_queue_task_specify_file_piece(struct work_queue_task *t, const char *local_name, const char *remote_name, off_t start_byte, off_t end_byte, int type, int flags)
Add a file piece to a task.
timestamp_t time_receive_result_start
The time at which it started to transfer the results.
Definition: work_queue.h:97
int return_status
The exit code of the command line.
Definition: work_queue.h:84
int64_t min_cores
The lowest number of cores observed among the connected workers.
Definition: work_queue.h:167
int64_t max_gpus
The highest number of GPUs observed among the connected workers.
Definition: work_queue.h:174
void work_queue_task_specify_gpus(struct work_queue_task *t, int gpus)
Specify the number of gpus required by a task.
int total_workers_connected
Total number of workers currently connected to the master.
Definition: work_queue.h:128
timestamp_t total_receive_time
Total time in microseconds spent in receiving data from workers.
Definition: work_queue.h:145
int work_queue_hungry(struct work_queue *q)
Determine whether the queue is 'hungry' for more tasks.
int work_queue_tune(struct work_queue *q, const char *name, double value)
Tune advanced parameters for work queue.
int64_t total_cores
Total number of cores aggregated across the connected workers.
Definition: work_queue.h:159
Definition: list.h:49
int work_queue_task_state(struct work_queue *q, int taskid)
Get the current state of the task.
int result
The result of the task (successful, failed return_status, missing input file, missing output file)...
Definition: work_queue.h:85
int total_worker_slots
Definition: work_queue.h:179
int64_t max_disk
The largest disk space in MB observed among the connected workers.
Definition: work_queue.h:172
timestamp_t total_transfer_time
Time consumed in microseconds for transferring total_bytes_transferred.
Definition: work_queue.h:105
double idle_percentage
The fraction of time that the master is idle waiting for workers to respond.
Definition: work_queue.h:155
int work_queue_activate_fast_abort(struct work_queue *q, double multiplier)
Turn on or off fast abort functionality for a given queue.
int64_t total_disk
Total disk space in MB aggregated across the connected workers.
Definition: work_queue.h:161
char * tag
An optional user-defined logical name for the task.
Definition: work_queue.h:76
timestamp_t total_execute_time
Total time in microseconds workers spent executing completed tasks.
Definition: work_queue.h:148
int work_queue_submit(struct work_queue *q, struct work_queue_task *t)
Submit a task to a queue.
int worker_selection_algorithm
How to choose worker to run the task.
Definition: work_queue.h:78
timestamp_t time_execute_cmd_finish
The time at which the task finished (discovered by the master).
Definition: work_queue.h:96
double efficiency
Parallel efficiency of the system, sum(task execution times) / sum(worker lifetimes) ...
Definition: work_queue.h:154
void work_queue_specify_keepalive_interval(struct work_queue *q, int interval)
Change the keepalive interval for a given queue.
timestamp_t time_task_finish
The time at which this task was finished.
Definition: work_queue.h:92
int work_queue_task_specify_input_file(struct work_queue_task *t, const char *fname, const char *rname)
Add an input file to a task.
timestamp_t time_execute_cmd_start
The time at which the task began.
Definition: work_queue.h:95
int workers_idle
Number of workers that are not running a task.
Definition: work_queue.h:130
int64_t total_bytes_sent
Number of bytes sent since task has last started sending input data.
Definition: work_queue.h:103
timestamp_t total_send_time
Total time in microseconds spent in sending data to workers.
Definition: work_queue.h:144
char * work_queue_get_worker_summary(struct work_queue *q)
Summarize workers.
void work_queue_specify_priority(struct work_queue *q, int priority)
Change the priority for a given queue.
int64_t total_bytes_received
Number of bytes received since task has last started receiving input data.
Definition: work_queue.h:102
int total_tasks_failed
Total number of tasks completed and returned to user with result other than WQ_RESULT_SUCCESS.
Definition: work_queue.h:140
int work_queue_enable_monitoring(struct work_queue *q, char *monitor_summary_file)
Enables resource monitoring on the given work queue.
int64_t max_memory
The largest memory size in MB observed among the connected workers.
Definition: work_queue.h:170
void work_queue_task_delete(struct work_queue_task *t)
Delete a task.
char * output
The standard output of the task.
Definition: work_queue.h:79
void work_queue_blacklist_remove(struct work_queue *q, const char *hostname)
Unblacklist host from a queue.
void work_queue_specify_name(struct work_queue *q, const char *name)
Change the project name for a given queue.
int workers_ready
Definition: work_queue.h:177
int work_queue_task_specify_directory(struct work_queue_task *t, const char *local_name, const char *remote_name, int type, int flags, int recursive)
Add a directory to a task.
int64_t max_cores
The highest number of cores observed among the connected workers.
Definition: work_queue.h:168