cctools
work_queue.h
Go to the documentation of this file.
1 /*
2 Copyright (C) 2008- The University of Notre Dame
3 This software is distributed under the GNU General Public License.
4 See the file COPYING for details.
5 */
6 
7 #ifndef WORK_QUEUE_H
8 #define WORK_QUEUE_H
9 
20 #include <sys/types.h>
21 #include "timestamp.h"
22 #include "rmsummary.h"
23 
24 #define WORK_QUEUE_DEFAULT_PORT 9123
25 #define WORK_QUEUE_RANDOM_PORT 0
26 #define WORK_QUEUE_WAITFORTASK -1
28 #define WORK_QUEUE_SCHEDULE_UNSET 0
29 #define WORK_QUEUE_SCHEDULE_FCFS 1
30 #define WORK_QUEUE_SCHEDULE_FILES 2
31 #define WORK_QUEUE_SCHEDULE_TIME 3
32 #define WORK_QUEUE_SCHEDULE_RAND 4
34 #define WORK_QUEUE_INPUT 0
35 #define WORK_QUEUE_OUTPUT 1
37 #define WORK_QUEUE_NOCACHE 0
38 #define WORK_QUEUE_CACHE 1
39 #define WORK_QUEUE_SYMLINK 2 /* Create a symlink to the file rather than copying it, if possible. */
40 #define WORK_QUEUE_PREEXIST 4 /* If the filename already exists on the host, use it in place. */
41 #define WORK_QUEUE_THIRDGET 8 /* Access the file on the client from a shared filesystem */
42 #define WORK_QUEUE_THIRDPUT 8 /* Access the file on the client from a shared filesystem (included for readability) */
43 #define WORK_QUEUE_WATCH 16
45 #define WORK_QUEUE_RESET_ALL 0
46 #define WORK_QUEUE_RESET_KEEP_TASKS 1
48 #define WORK_QUEUE_DEFAULT_KEEPALIVE_INTERVAL 300
49 #define WORK_QUEUE_DEFAULT_KEEPALIVE_TIMEOUT 30
51 #define WORK_QUEUE_RESULT_SUCCESS 0
52 #define WORK_QUEUE_RESULT_INPUT_MISSING 1
53 #define WORK_QUEUE_RESULT_OUTPUT_MISSING 2
54 #define WORK_QUEUE_RESULT_STDOUT_MISSING 4
55 #define WORK_QUEUE_RESULT_SIGNAL 8
56 #define WORK_QUEUE_RESULT_RESOURCE_EXHAUSTION 16
57 #define WORK_QUEUE_RESULT_TASK_TIMEOUT 32
59 extern double wq_option_fast_abort_multiplier;
61 extern int wq_option_scheduler;
65 struct work_queue_task {
66  char *tag;
67  char *command_line;
69  char *output;
70  struct list *input_files;
71  struct list *output_files;
72  int taskid;
74  int result;
75  char *host;
76  char *hostname;
99  int64_t maximum_end_time;
100  int64_t memory;
101  int64_t disk;
102  int cores;
103  int gpus;
104  int unlabeled;
107  double priority;
111 };
112 
142  double efficiency;
144  int capacity;
146  double bandwidth;
147  int64_t total_cores;
148  int64_t total_memory;
149  int64_t total_disk;
150  int64_t total_gpus;
151  int64_t committed_cores;
153  int64_t committed_disk;
154  int64_t committed_gpus;
155  int64_t min_cores;
156  int64_t max_cores;
157  int64_t min_memory;
158  int64_t max_memory;
159  int64_t min_disk;
160  int64_t max_disk;
161  int64_t min_gpus;
162  int64_t max_gpus;
163  int port;
164  int priority;
169 };
170 
171 
175 
183 struct work_queue_task *work_queue_task_create(const char *full_command);
184 
190 struct work_queue_task *work_queue_task_clone(const struct work_queue_task *task);
191 
196 void work_queue_task_specify_command( struct work_queue_task *t, const char *cmd );
197 
214 int work_queue_task_specify_file(struct work_queue_task *t, const char *local_name, const char *remote_name, int type, int flags);
215 
230 int work_queue_task_specify_file_piece(struct work_queue_task *t, const char *local_name, const char *remote_name, off_t start_byte, off_t end_byte, int type, int flags);
231 
242 int work_queue_task_specify_buffer(struct work_queue_task *t, const char *data, int length, const char *remote_name, int flags);
243 
257 int work_queue_task_specify_directory(struct work_queue_task *t, const char *local_name, const char *remote_name, int type, int flags, int recursive);
258 
264 void work_queue_task_specify_memory( struct work_queue_task *t, int64_t memory );
265 
271 void work_queue_task_specify_disk( struct work_queue_task *t, int64_t disk );
272 
278 void work_queue_task_specify_cores( struct work_queue_task *t, int cores );
279 
285 void work_queue_task_specify_gpus( struct work_queue_task *t, int gpus );
286 
292 void work_queue_task_specify_end_time( struct work_queue_task *t, int64_t seconds );
293 
300 void work_queue_task_specify_tag(struct work_queue_task *t, const char *tag);
301 
309 
319 void work_queue_task_specify_algorithm(struct work_queue_task *t, int algo );
320 
326 
328 
332 
349 struct work_queue *work_queue_create(int port);
350 
358 int work_queue_enable_monitoring(struct work_queue *q, char *monitor_summary_file);
359 
368 int work_queue_submit(struct work_queue *q, struct work_queue_task *t);
369 
374 void work_queue_blacklist_add(struct work_queue *q, const char *hostname);
375 
376 
381 void work_queue_blacklist_remove(struct work_queue *q, const char *hostname);
382 
383 
387 void work_queue_blacklist_clear(struct work_queue *q);
388 
403 struct work_queue_task *work_queue_wait(struct work_queue *q, int timeout);
404 
416 int work_queue_hungry(struct work_queue *q);
417 
425 int work_queue_empty(struct work_queue *q);
426 
433 int work_queue_port(struct work_queue *q);
434 
439 void work_queue_get_stats(struct work_queue *q, struct work_queue_stats *s);
440 
445 void work_queue_get_stats_hierarchy(struct work_queue *q, struct work_queue_stats *s);
446 
451 void work_queue_set_bandwidth_limit(struct work_queue *q, const char *bandwidth);
452 
457 double work_queue_get_effective_bandwidth(struct work_queue *q);
458 
465 char * work_queue_get_worker_summary( struct work_queue *q );
466 
472 int work_queue_activate_fast_abort(struct work_queue *q, double multiplier);
473 
474 
478 int work_queue_send_receive_ratio(struct work_queue *q, double ratio);
479 
489 void work_queue_specify_algorithm(struct work_queue *q, int algo);
490 
495 const char *work_queue_name(struct work_queue *q);
496 
501 void work_queue_specify_name(struct work_queue *q, const char *name);
502 
507 void work_queue_specify_priority(struct work_queue *q, int priority);
508 
514 void work_queue_specify_catalog_server(struct work_queue *q, const char *hostname, int port);
515 
521 struct work_queue_task *work_queue_cancel_by_taskid(struct work_queue *q, int id);
522 
528 struct work_queue_task *work_queue_cancel_by_tasktag(struct work_queue *q, const char *tag);
529 
534 struct list * work_queue_cancel_all_tasks(struct work_queue *q);
535 
540 int work_queue_shut_down_workers(struct work_queue *q, int n);
541 
546 void work_queue_delete(struct work_queue *q);
547 
553 int work_queue_specify_log(struct work_queue *q, const char *logfile);
554 
560 void work_queue_specify_password( struct work_queue *q, const char *password );
561 
568 int work_queue_specify_password_file( struct work_queue *q, const char *file );
569 
574 void work_queue_specify_keepalive_interval(struct work_queue *q, int interval);
575 
580 void work_queue_specify_keepalive_timeout(struct work_queue *q, int timeout);
581 
582 
598 int work_queue_tune(struct work_queue *q, const char *name, double value);
599 
601 
605 
606 #define WORK_QUEUE_TASK_ORDER_FIFO 0
607 #define WORK_QUEUE_TASK_ORDER_LIFO 1
615 void work_queue_specify_task_order(struct work_queue *q, int order);
616 
617 
618 #define WORK_QUEUE_MASTER_MODE_STANDALONE 0
619 #define WORK_QUEUE_MASTER_MODE_CATALOG 1
628 void work_queue_specify_master_mode(struct work_queue *q, int mode);
629 
635 void work_queue_specify_estimate_capacity_on(struct work_queue *q, int estimate_capacity_on);
636 
645 int work_queue_task_specify_input_buf(struct work_queue_task *t, const char *buf, int length, const char *rname);
646 
654 int work_queue_task_specify_input_file(struct work_queue_task *t, const char *fname, const char *rname);
655 
663 int work_queue_task_specify_input_file_do_not_cache(struct work_queue_task *t, const char *fname, const char *rname);
664 
672 int work_queue_task_specify_output_file(struct work_queue_task *t, const char *rname, const char *fname);
673 
681 int work_queue_task_specify_output_file_do_not_cache(struct work_queue_task *t, const char *rname, const char *fname);
682 
684 
685 /* Experimental feature - intentionally left undocumented.
686 This feature exists to simplify performance evaluation and is not recommended
687 for production use since it delays execution of the workload.
688 Force the master to wait for the given number of workers to connect before
689 starting to dispatch tasks.
690 @param q A work queue object.
691 @param resources The number of workers to wait for before tasks are dispatched.*/
692 void work_queue_activate_worker_waiting(struct work_queue *q, int resources);
693 
694 #endif
int64_t total_memory
Total memory in MB aggregated across the connected workers.
Definition: work_queue.h:148
int64_t committed_gpus
Committed number of GPUs aggregated across the connected workers.
Definition: work_queue.h:154
int work_queue_task_specify_buffer(struct work_queue_task *t, const char *data, int length, const char *remote_name, int flags)
Add an input buffer to a task.
void work_queue_specify_estimate_capacity_on(struct work_queue *q, int estimate_capacity_on)
Change whether to estimate master capacity for a given queue.
int work_queue_send_receive_ratio(struct work_queue *q, double ratio)
Change the preference to send or receive tasks.
int64_t committed_disk
Committed disk space in MB aggregated across the connected workers.
Definition: work_queue.h:153
int64_t min_disk
The smallest disk space in MB observed among the connected workers.
Definition: work_queue.h:159
A task description.
Definition: work_queue.h:65
void work_queue_task_specify_cores(struct work_queue_task *t, int cores)
Specify the number of cores required by a task.
timestamp_t total_good_execute_time
Total time in microseconds workers spent executing successful tasks.
Definition: work_queue.h:137
timestamp_t time_send_input_start
The time at which it started to transfer input files.
Definition: work_queue.h:82
struct list * work_queue_cancel_all_tasks(struct work_queue *q)
Cancel all submitted tasks and remove them from the queue.
timestamp_t start_time
Absolute time at which the master started.
Definition: work_queue.h:131
timestamp_t cmd_execution_time
Time spent in microseconds for executing the command on the worker.
Definition: work_queue.h:95
int tasks_running
Number of tasks currently running.
Definition: work_queue.h:124
int work_queue_task_specify_input_file_do_not_cache(struct work_queue_task *t, const char *fname, const char *rname)
Add an input file to a task, without caching.
int total_workers_joined
Total number of worker connections that were established to the master.
Definition: work_queue.h:120
int work_queue_task_specify_output_file(struct work_queue_task *t, const char *rname, const char *fname)
Add an output file to a task.
timestamp_t time_receive_output_start
The time at which it started to transfer output files.
Definition: work_queue.h:88
timestamp_t total_cmd_execution_time
Time spent in microseconds for executing the command on any worker, including resubmissions of the ta...
Definition: work_queue.h:97
void work_queue_get_stats(struct work_queue *q, struct work_queue_stats *s)
Get queue statistics (only from master).
struct work_queue_task * work_queue_cancel_by_taskid(struct work_queue *q, int id)
Cancel a submitted task using its task id and remove it from queue.
int workers_full
Definition: work_queue.h:166
int total_tasks_dispatched
Total number of tasks dispatched to workers.
Definition: work_queue.h:126
struct work_queue * work_queue_create(int port)
Create a new work queue.
void work_queue_specify_algorithm(struct work_queue *q, int algo)
Change the worker selection algorithm.
timestamp_t total_good_transfer_time
Total time in microseconds spent in sending and receiving data to workers for tasks with result WQ_RE...
Definition: work_queue.h:134
int workers_init
Number of workers initializing.
Definition: work_queue.h:117
int work_queue_task_specify_output_file_do_not_cache(struct work_queue_task *t, const char *rname, const char *fname)
Add an output file to a task without caching.
timestamp_t time_send_input_finish
The time at which it finished transferring input files.
Definition: work_queue.h:83
void work_queue_get_stats_hierarchy(struct work_queue *q, struct work_queue_stats *s)
Get statistics of the master queue together with foremen information.
timestamp_t time_app_delay
Definition: work_queue.h:109
void work_queue_task_specify_command(struct work_queue_task *t, const char *cmd)
Indicate the command to be executed.
int tasks_waiting
Number of tasks waiting to be run.
Definition: work_queue.h:123
int total_submissions
The number of times the task has been submitted.
Definition: work_queue.h:96
char * hostname
The name of the host on which it ran.
Definition: work_queue.h:76
Portable routines for high resolution timing.
double work_queue_get_effective_bandwidth(struct work_queue *q)
Get current queue bandwidth.
int64_t total_gpus
Total number of GPUs aggregated across the connected workers.
Definition: work_queue.h:150
int64_t total_bytes_sent
Total number of file bytes (not including protocol control msg bytes) sent out to the workers by the ...
Definition: work_queue.h:140
struct list * output_files
The output files (other than the standard output stream) created by the program expected to be retrie...
Definition: work_queue.h:71
int64_t total_bytes_transferred
Number of bytes transferred since task has last started transferring input data.
Definition: work_queue.h:93
void work_queue_delete(struct work_queue *q)
Delete a work queue.
UINT64_T timestamp_t
A type to hold the current time, in microseconds since January 1st, 1970.
Definition: timestamp.h:20
int tasks_complete
Number of tasks waiting to be returned to user.
Definition: work_queue.h:125
int64_t min_gpus
The lowest number of GPUs observed among the connected workers.
Definition: work_queue.h:161
struct list * input_files
The files to transfer to the worker and place in the executing directory.
Definition: work_queue.h:70
double bandwidth
Average network bandwidth in MB/S observed by the master when transferring to workers.
Definition: work_queue.h:146
void work_queue_task_specify_algorithm(struct work_queue_task *t, int algo)
Select the scheduling algorithm for a single task.
struct work_queue_task * work_queue_cancel_by_tasktag(struct work_queue *q, const char *tag)
Cancel a submitted task using its tag and remove it from queue.
int total_tasks_complete
Total number of tasks completed and returned to user.
Definition: work_queue.h:127
struct work_queue_task * work_queue_wait(struct work_queue *q, int timeout)
Wait for a task to complete.
void work_queue_task_specify_memory(struct work_queue_task *t, int64_t memory)
Specify the amount of memory required by a task.
char * command_line
The program(s) to execute, as a shell command line.
Definition: work_queue.h:67
int capacity
The estimated number of workers that this master can effectively support.
Definition: work_queue.h:144
int64_t committed_memory
Committed memory in MB aggregated across the connected workers.
Definition: work_queue.h:152
int total_workers_removed
Total number of worker connections that were terminated by the master.
Definition: work_queue.h:121
int work_queue_specify_log(struct work_queue *q, const char *logfile)
Add a log file that records the states of the connected workers and submitted tasks.
timestamp_t time_receive_output_finish
The time at which it finished transferring output files.
Definition: work_queue.h:89
timestamp_t time_task_submit
The time at which this task was submitted.
Definition: work_queue.h:80
timestamp_t time_receive_result_finish
The time at which it finished transferring the results.
Definition: work_queue.h:87
int work_queue_task_specify_input_buf(struct work_queue_task *t, const char *buf, int length, const char *rname)
Add an input buffer to a task.
int64_t committed_cores
Committed number of cores aggregated across the connected workers.
Definition: work_queue.h:151
void work_queue_specify_catalog_server(struct work_queue *q, const char *hostname, int port)
Specify the catalog server the master should report to.
char * host
The address and port of the host on which it ran.
Definition: work_queue.h:75
int work_queue_port(struct work_queue *q)
Get the listening port of the queue.
int work_queue_specify_password_file(struct work_queue *q, const char *file)
Add a mandatory password file that each worker must present.
int taskid
A unique task id number.
Definition: work_queue.h:72
int avg_capacity
Definition: work_queue.h:168
int workers_busy
Number of workers that are running at least one task.
Definition: work_queue.h:119
struct work_queue_task * work_queue_task_create(const char *full_command)
Create a new task object.
void work_queue_specify_password(struct work_queue *q, const char *password)
Add a mandatory password that each worker must present.
struct rmsummary * resources_measured
When monitoring is enabled, it points to the measured resources used by the task. ...
Definition: work_queue.h:105
timestamp_t time_committed
The time at which a task was committed to a worker.
Definition: work_queue.h:78
int work_queue_task_specify_file(struct work_queue_task *t, const char *local_name, const char *remote_name, int type, int flags)
Add a file to a task.
int64_t total_bytes_received
Total number of file bytes (not including protocol control msg bytes) received from the workers by th...
Definition: work_queue.h:141
void work_queue_task_specify_tag(struct work_queue_task *t, const char *tag)
Attach a user defined string tag to the task.
int total_tasks_cancelled
Total number of tasks cancelled.
Definition: work_queue.h:129
void work_queue_task_specify_priority(struct work_queue_task *t, double priority)
Specify the priority of this task relative to others in the queue.
void work_queue_task_specify_end_time(struct work_queue_task *t, int64_t seconds)
Specify the maximum end time allowed for the task (in seconds since the Epoch).
void work_queue_blacklist_add(struct work_queue *q, const char *hostname)
Blacklist host from a queue.
struct work_queue_task * work_queue_task_clone(const struct work_queue_task *task)
Create a copy of a task. Create a functionally identical copy of a work_queue_task that can be re-sub...
int work_queue_empty(struct work_queue *q)
Determine whether the queue is empty.
void work_queue_task_specify_disk(struct work_queue_task *t, int64_t disk)
Specify the amount of disk space required by a task.
int work_queue_shut_down_workers(struct work_queue *q, int n)
Shut down workers connected to the work_queue system.
int64_t min_memory
The smallest memory size in MB observed among the connected workers.
Definition: work_queue.h:157
void work_queue_set_bandwidth_limit(struct work_queue *q, const char *bandwidth)
Limit the queue bandwidth when transferring files to and from workers.
const char * work_queue_name(struct work_queue *q)
Get the project name of the queue.
double priority
The priority of this task relative to others in the queue: higher numbers run earlier.
Definition: work_queue.h:107
void work_queue_specify_keepalive_timeout(struct work_queue *q, int timeout)
Change the keepalive timeout for identifying dead workers for a given queue.
void work_queue_blacklist_clear(struct work_queue *q)
Clear blacklist of a queue.
Statistics describing a work queue.
Definition: work_queue.h:115
Definition: rmsummary.h:24
int work_queue_task_specify_file_piece(struct work_queue_task *t, const char *local_name, const char *remote_name, off_t start_byte, off_t end_byte, int type, int flags)
Add a file piece to a task.
timestamp_t time_receive_result_start
The time at which it started to transfer the results.
Definition: work_queue.h:86
int return_status
The exit code of the command line.
Definition: work_queue.h:73
int64_t min_cores
The lowest number of cores observed among the connected workers.
Definition: work_queue.h:155
int64_t max_gpus
The highest number of GPUs observed among the connected workers.
Definition: work_queue.h:162
void work_queue_task_specify_gpus(struct work_queue_task *t, int gpus)
Specify the number of gpus required by a task.
int total_workers_connected
Total number of workers currently connected to the master.
Definition: work_queue.h:116
timestamp_t total_receive_time
Total time in microseconds spent in receiving data from workers.
Definition: work_queue.h:133
int work_queue_hungry(struct work_queue *q)
Determine whether the queue is 'hungry' for more tasks.
int work_queue_tune(struct work_queue *q, const char *name, double value)
Tune advanced parameters for work queue.
int64_t total_cores
Total number of cores aggregated across the connected workers.
Definition: work_queue.h:147
Definition: list.h:49
int result
The result of the task (successful, failed return_status, missing input file, missing output file)...
Definition: work_queue.h:74
int total_worker_slots
Definition: work_queue.h:167
int64_t max_disk
The largest disk space in MB observed among the connected workers.
Definition: work_queue.h:160
timestamp_t total_transfer_time
Time consumed in microseconds for transferring total_bytes_transferred.
Definition: work_queue.h:94
double idle_percentage
The fraction of time that the master is idle waiting for workers to respond.
Definition: work_queue.h:143
int work_queue_activate_fast_abort(struct work_queue *q, double multiplier)
Turn on or off fast abort functionality for a given queue.
int64_t total_disk
Total disk space in MB aggregated across the connected workers.
Definition: work_queue.h:149
char * tag
An optional user-defined logical name for the task.
Definition: work_queue.h:66
timestamp_t total_execute_time
Total time in microseconds workers spent executing completed tasks.
Definition: work_queue.h:136
int work_queue_submit(struct work_queue *q, struct work_queue_task *t)
Submit a task to a queue.
int worker_selection_algorithm
How to choose worker to run the task.
Definition: work_queue.h:68
timestamp_t time_execute_cmd_finish
The time at which the task finished (discovered by the master).
Definition: work_queue.h:85
double efficiency
Parallel efficiency of the system, sum(task execution times) / sum(worker lifetimes) ...
Definition: work_queue.h:142
void work_queue_specify_keepalive_interval(struct work_queue *q, int interval)
Change the keepalive interval for a given queue.
timestamp_t time_task_finish
The time at which this task was finished.
Definition: work_queue.h:81
int work_queue_task_specify_input_file(struct work_queue_task *t, const char *fname, const char *rname)
Add an input file to a task.
timestamp_t time_execute_cmd_start
The time at which the task began.
Definition: work_queue.h:84
int workers_idle
Number of workers that are not running a task.
Definition: work_queue.h:118
int64_t total_bytes_sent
Number of bytes sent since task has last started sending input data.
Definition: work_queue.h:92
timestamp_t total_send_time
Total time in microseconds spent in sending data to workers.
Definition: work_queue.h:132
char * work_queue_get_worker_summary(struct work_queue *q)
Summarize workers.
void work_queue_specify_priority(struct work_queue *q, int priority)
Change the priority for a given queue.
int64_t total_bytes_received
Number of bytes received since task has last started receiving input data.
Definition: work_queue.h:91
int total_tasks_failed
Total number of tasks completed and returned to user with result other than WORK_QUEUE_RESULT_SUCCESS.
Definition: work_queue.h:128
int work_queue_enable_monitoring(struct work_queue *q, char *monitor_summary_file)
Enables resource monitoring on the given work queue.
int64_t max_memory
The largest memory size in MB observed among the connected workers.
Definition: work_queue.h:158
void work_queue_task_delete(struct work_queue_task *t)
Delete a task.
char * output
The standard output of the task.
Definition: work_queue.h:69
void work_queue_blacklist_remove(struct work_queue *q, const char *hostname)
Unblacklist host from a queue.
void work_queue_specify_name(struct work_queue *q, const char *name)
Change the project name for a given queue.
int workers_ready
Definition: work_queue.h:165
int work_queue_task_specify_directory(struct work_queue_task *t, const char *local_name, const char *remote_name, int type, int flags, int recursive)
Add a directory to a task.
int64_t max_cores
The highest number of cores observed among the connected workers.
Definition: work_queue.h:156