cctools
work_queue.h
Go to the documentation of this file.
1 /*
2 Copyright (C) 2008- The University of Notre Dame
3 This software is distributed under the GNU General Public License.
4 See the file COPYING for details.
5 */
6 
7 #ifndef WORK_QUEUE_H
8 #define WORK_QUEUE_H
9 
20 #include <sys/types.h>
21 #include "timestamp.h"
22 #include "rmsummary.h"
23 
24 #define WORK_QUEUE_DEFAULT_PORT 9123
25 #define WORK_QUEUE_RANDOM_PORT 0
26 #define WORK_QUEUE_WAITFORTASK -1
28 #define WORK_QUEUE_SCHEDULE_UNSET 0
29 #define WORK_QUEUE_SCHEDULE_FCFS 1
30 #define WORK_QUEUE_SCHEDULE_FILES 2
31 #define WORK_QUEUE_SCHEDULE_TIME 3
32 #define WORK_QUEUE_SCHEDULE_RAND 4
34 #define WORK_QUEUE_INPUT 0
35 #define WORK_QUEUE_OUTPUT 1
37 #define WORK_QUEUE_NOCACHE 0
38 #define WORK_QUEUE_CACHE 1
39 #define WORK_QUEUE_SYMLINK 2 /* Create a symlink to the file rather than copying it, if possible. */
40 #define WORK_QUEUE_PREEXIST 4 /* If the filename already exists on the host, use it in place. */
41 #define WORK_QUEUE_THIRDGET 8 /* Access the file on the client from a shared filesystem */
42 #define WORK_QUEUE_THIRDPUT 8 /* Access the file on the client from a shared filesystem (included for readability) */
43 #define WORK_QUEUE_WATCH 16
45 #define WORK_QUEUE_RESET_ALL 0
46 #define WORK_QUEUE_RESET_KEEP_TASKS 1
48 #define WORK_QUEUE_DEFAULT_KEEPALIVE_INTERVAL 300
49 #define WORK_QUEUE_DEFAULT_KEEPALIVE_TIMEOUT 30
51 #define WORK_QUEUE_RESULT_SUCCESS 0
52 #define WORK_QUEUE_RESULT_INPUT_MISSING 1
53 #define WORK_QUEUE_RESULT_OUTPUT_MISSING 2
54 #define WORK_QUEUE_RESULT_STDOUT_MISSING 4
55 #define WORK_QUEUE_RESULT_SIGNAL 8
56 #define WORK_QUEUE_RESULT_RESOURCE_EXHAUSTION 16
57 #define WORK_QUEUE_RESULT_TASK_TIMEOUT 32
60 #define WORK_QUEUE_TASK_UNKNOWN 0
61 #define WORK_QUEUE_TASK_READY 1
62 #define WORK_QUEUE_TASK_RUNNING 2
63 #define WORK_QUEUE_TASK_WAITING_RETRIEVAL 3
64 #define WORK_QUEUE_TASK_RETRIEVED 4
65 #define WORK_QUEUE_TASK_DONE 5
66 #define WORK_QUEUE_TASK_CANCELED 6
68 extern double wq_option_fast_abort_multiplier;
70 extern int wq_option_scheduler;
74 struct work_queue_task { /* A task description. NOTE(review): this rendered listing appears to omit some members (embedded line numbers skip 77, 82, 87-106, 114, 118) - confirm against the real header. */
75  char *tag; /* An optional user-defined logical name for the task. */
76  char *command_line; /* The program(s) to execute, as a shell command line. */
78  char *output; /* The standard output of the task. */
79  struct list *input_files; /* The files to transfer to the worker and place in the executing directory. */
80  struct list *output_files; /* The output files (other than the standard output stream) created by the program. */
81  int taskid; /* A unique task id number. */
83  int result; /* The result of the task (successful, failed return_status, missing input file, missing output file). */
84  char *host; /* The address and port of the host on which it ran. */
85  char *hostname; /* The name of the host on which it ran. */
108  int64_t maximum_end_time; /* Presumably the limit set by work_queue_task_specify_end_time() (seconds since the Epoch) - TODO confirm. */
109  int64_t memory; /* Presumably the amount set by work_queue_task_specify_memory(); units not shown here - TODO confirm. */
110  int64_t disk; /* Presumably the amount set by work_queue_task_specify_disk(); units not shown here - TODO confirm. */
111  int cores; /* Presumably the count set by work_queue_task_specify_cores() - TODO confirm. */
112  int gpus; /* Presumably the count set by work_queue_task_specify_gpus() - TODO confirm. */
113  int unlabeled; /* NOTE(review): meaning is not documented in this listing - confirm. */
116  double priority; /* The priority of this task relative to others in the queue: higher numbers run earlier. */
120 };
121 
151  double efficiency;
153  int capacity;
155  double bandwidth;
156  int64_t total_cores;
157  int64_t total_memory;
158  int64_t total_disk;
159  int64_t total_gpus;
160  int64_t committed_cores;
162  int64_t committed_disk;
163  int64_t committed_gpus;
164  int64_t min_cores;
165  int64_t max_cores;
166  int64_t min_memory;
167  int64_t max_memory;
168  int64_t min_disk;
169  int64_t max_disk;
170  int64_t min_gpus;
171  int64_t max_gpus;
172  int port;
173  int priority;
178 };
179 
180 
184 
192 struct work_queue_task *work_queue_task_create(const char *full_command);
193 
199 struct work_queue_task *work_queue_task_clone(const struct work_queue_task *task);
200 
205 void work_queue_task_specify_command( struct work_queue_task *t, const char *cmd );
206 
223 int work_queue_task_specify_file(struct work_queue_task *t, const char *local_name, const char *remote_name, int type, int flags);
224 
239 int work_queue_task_specify_file_piece(struct work_queue_task *t, const char *local_name, const char *remote_name, off_t start_byte, off_t end_byte, int type, int flags);
240 
251 int work_queue_task_specify_buffer(struct work_queue_task *t, const char *data, int length, const char *remote_name, int flags);
252 
266 int work_queue_task_specify_directory(struct work_queue_task *t, const char *local_name, const char *remote_name, int type, int flags, int recursive);
267 
273 void work_queue_task_specify_memory( struct work_queue_task *t, int64_t memory );
274 
280 void work_queue_task_specify_disk( struct work_queue_task *t, int64_t disk );
281 
287 void work_queue_task_specify_cores( struct work_queue_task *t, int cores );
288 
294 void work_queue_task_specify_gpus( struct work_queue_task *t, int gpus );
295 
301 void work_queue_task_specify_end_time( struct work_queue_task *t, int64_t seconds );
302 
309 void work_queue_task_specify_tag(struct work_queue_task *t, const char *tag);
310 
318 
328 void work_queue_task_specify_algorithm(struct work_queue_task *t, int algo );
329 
335 
337 
341 
358 struct work_queue *work_queue_create(int port);
359 
367 int work_queue_enable_monitoring(struct work_queue *q, char *monitor_summary_file);
368 
377 int work_queue_submit(struct work_queue *q, struct work_queue_task *t);
378 
383 void work_queue_blacklist_add(struct work_queue *q, const char *hostname);
384 
385 
390 void work_queue_blacklist_remove(struct work_queue *q, const char *hostname);
391 
392 
396 void work_queue_blacklist_clear(struct work_queue *q);
397 
412 struct work_queue_task *work_queue_wait(struct work_queue *q, int timeout);
413 
425 int work_queue_hungry(struct work_queue *q);
426 
434 int work_queue_empty(struct work_queue *q);
435 
442 int work_queue_port(struct work_queue *q);
443 
448 void work_queue_get_stats(struct work_queue *q, struct work_queue_stats *s);
449 
454 void work_queue_get_stats_hierarchy(struct work_queue *q, struct work_queue_stats *s);
455 
456 
462 int work_queue_task_state(struct work_queue *q, int taskid);
463 
468 void work_queue_set_bandwidth_limit(struct work_queue *q, const char *bandwidth);
469 
474 double work_queue_get_effective_bandwidth(struct work_queue *q);
475 
482 char * work_queue_get_worker_summary( struct work_queue *q );
483 
489 int work_queue_activate_fast_abort(struct work_queue *q, double multiplier);
490 
491 
495 int work_queue_send_receive_ratio(struct work_queue *q, double ratio);
496 
506 void work_queue_specify_algorithm(struct work_queue *q, int algo);
507 
512 const char *work_queue_name(struct work_queue *q);
513 
518 void work_queue_specify_name(struct work_queue *q, const char *name);
519 
524 void work_queue_specify_priority(struct work_queue *q, int priority);
525 
531 void work_queue_specify_catalog_server(struct work_queue *q, const char *hostname, int port);
532 
538 struct work_queue_task *work_queue_cancel_by_taskid(struct work_queue *q, int id);
539 
545 struct work_queue_task *work_queue_cancel_by_tasktag(struct work_queue *q, const char *tag);
546 
551 struct list * work_queue_cancel_all_tasks(struct work_queue *q);
552 
557 int work_queue_shut_down_workers(struct work_queue *q, int n);
558 
563 void work_queue_delete(struct work_queue *q);
564 
570 int work_queue_specify_log(struct work_queue *q, const char *logfile);
571 
577 void work_queue_specify_password( struct work_queue *q, const char *password );
578 
585 int work_queue_specify_password_file( struct work_queue *q, const char *file );
586 
591 void work_queue_specify_keepalive_interval(struct work_queue *q, int interval);
592 
597 void work_queue_specify_keepalive_timeout(struct work_queue *q, int timeout);
598 
599 
615 int work_queue_tune(struct work_queue *q, const char *name, double value);
616 
618 
622 
623 #define WORK_QUEUE_TASK_ORDER_FIFO 0
624 #define WORK_QUEUE_TASK_ORDER_LIFO 1
632 void work_queue_specify_task_order(struct work_queue *q, int order);
633 
634 
635 #define WORK_QUEUE_MASTER_MODE_STANDALONE 0
636 #define WORK_QUEUE_MASTER_MODE_CATALOG 1
645 void work_queue_specify_master_mode(struct work_queue *q, int mode);
646 
652 void work_queue_specify_estimate_capacity_on(struct work_queue *q, int estimate_capacity_on);
653 
662 int work_queue_task_specify_input_buf(struct work_queue_task *t, const char *buf, int length, const char *rname);
663 
671 int work_queue_task_specify_input_file(struct work_queue_task *t, const char *fname, const char *rname);
672 
680 int work_queue_task_specify_input_file_do_not_cache(struct work_queue_task *t, const char *fname, const char *rname);
681 
689 int work_queue_task_specify_output_file(struct work_queue_task *t, const char *rname, const char *fname);
690 
698 int work_queue_task_specify_output_file_do_not_cache(struct work_queue_task *t, const char *rname, const char *fname);
699 
701 
702 /* Experimental feature - intentionally left undocumented.
703 This feature exists to simplify performance evaluation and is not recommended
704 for production use since it delays execution of the workload.
705 Force the master to wait for the given number of workers to connect before
706 starting to dispatch tasks.
707 @param q A work queue object.
708 @param resources The number of workers to wait for before tasks are dispatched.*/
709 void work_queue_activate_worker_waiting(struct work_queue *q, int resources);
710 
711 #endif
int64_t total_memory
Total memory in MB aggregated across the connected workers.
Definition: work_queue.h:157
int64_t committed_gpus
Committed number of GPUs aggregated across the connected workers.
Definition: work_queue.h:163
int work_queue_task_specify_buffer(struct work_queue_task *t, const char *data, int length, const char *remote_name, int flags)
Add an input buffer to a task.
void work_queue_specify_estimate_capacity_on(struct work_queue *q, int estimate_capacity_on)
Change whether to estimate master capacity for a given queue.
int work_queue_send_receive_ratio(struct work_queue *q, double ratio)
Change the preference to send or receive tasks.
int64_t committed_disk
Committed disk space in MB aggregated across the connected workers.
Definition: work_queue.h:162
int64_t min_disk
The smallest disk space in MB observed among the connected workers.
Definition: work_queue.h:168
A task description.
Definition: work_queue.h:74
void work_queue_task_specify_cores(struct work_queue_task *t, int cores)
Specify the number of cores required by a task.
timestamp_t total_good_execute_time
Total time in microseconds workers spent executing successful tasks.
Definition: work_queue.h:146
timestamp_t time_send_input_start
The time at which it started to transfer input files.
Definition: work_queue.h:91
struct list * work_queue_cancel_all_tasks(struct work_queue *q)
Cancel all submitted tasks and remove them from the queue.
timestamp_t start_time
Absolute time at which the master started.
Definition: work_queue.h:140
timestamp_t cmd_execution_time
Time spent in microseconds for executing the command on the worker.
Definition: work_queue.h:104
int tasks_running
Number of tasks currently running.
Definition: work_queue.h:133
int work_queue_task_specify_input_file_do_not_cache(struct work_queue_task *t, const char *fname, const char *rname)
Add an input file to a task, without caching.
int total_workers_joined
Total number of worker connections that were established to the master.
Definition: work_queue.h:129
int work_queue_task_specify_output_file(struct work_queue_task *t, const char *rname, const char *fname)
Add an output file to a task.
timestamp_t time_receive_output_start
The time at which it started to transfer output files.
Definition: work_queue.h:97
timestamp_t total_cmd_execution_time
Time spent in microseconds for executing the command on any worker, including resubmissions of the ta...
Definition: work_queue.h:106
void work_queue_get_stats(struct work_queue *q, struct work_queue_stats *s)
Get queue statistics (only from master).
struct work_queue_task * work_queue_cancel_by_taskid(struct work_queue *q, int id)
Cancel a submitted task using its task id and remove it from queue.
int workers_full
Definition: work_queue.h:175
int total_tasks_dispatched
Total number of tasks dispatched to workers.
Definition: work_queue.h:135
struct work_queue * work_queue_create(int port)
Create a new work queue.
void work_queue_specify_algorithm(struct work_queue *q, int algo)
Change the worker selection algorithm.
timestamp_t total_good_transfer_time
Total time in microseconds spent in sending and receiving data to workers for tasks with result WQ_RE...
Definition: work_queue.h:143
int workers_init
Number of workers initializing.
Definition: work_queue.h:126
int work_queue_task_specify_output_file_do_not_cache(struct work_queue_task *t, const char *rname, const char *fname)
Add an output file to a task without caching.
timestamp_t time_send_input_finish
The time at which it finished transferring input files.
Definition: work_queue.h:92
void work_queue_get_stats_hierarchy(struct work_queue *q, struct work_queue_stats *s)
Get statistics of the master queue together with foremen information.
timestamp_t time_app_delay
Definition: work_queue.h:118
void work_queue_task_specify_command(struct work_queue_task *t, const char *cmd)
Indicate the command to be executed.
int tasks_waiting
Number of tasks waiting to be run.
Definition: work_queue.h:132
int total_submissions
The number of times the task has been submitted.
Definition: work_queue.h:105
char * hostname
The name of the host on which it ran.
Definition: work_queue.h:85
Portable routines for high resolution timing.
double work_queue_get_effective_bandwidth(struct work_queue *q)
Get current queue bandwidth.
int64_t total_gpus
Total number of GPUs aggregated across the connected workers.
Definition: work_queue.h:159
int64_t total_bytes_sent
Total number of file bytes (not including protocol control msg bytes) sent out to the workers by the ...
Definition: work_queue.h:149
struct list * output_files
The output files (other than the standard output stream) created by the program expected to be retrie...
Definition: work_queue.h:80
int64_t total_bytes_transferred
Number of bytes transferred since task has last started transferring input data.
Definition: work_queue.h:102
void work_queue_delete(struct work_queue *q)
Delete a work queue.
UINT64_T timestamp_t
A type to hold the current time, in microseconds since January 1st, 1970.
Definition: timestamp.h:20
int tasks_complete
Number of tasks waiting to be returned to user.
Definition: work_queue.h:134
int64_t min_gpus
The lowest number of GPUs observed among the connected workers.
Definition: work_queue.h:170
struct list * input_files
The files to transfer to the worker and place in the executing directory.
Definition: work_queue.h:79
double bandwidth
Average network bandwidth in MB/S observed by the master when transferring to workers.
Definition: work_queue.h:155
void work_queue_task_specify_algorithm(struct work_queue_task *t, int algo)
Select the scheduling algorithm for a single task.
struct work_queue_task * work_queue_cancel_by_tasktag(struct work_queue *q, const char *tag)
Cancel a submitted task using its tag and remove it from queue.
int total_tasks_complete
Total number of tasks completed and returned to user.
Definition: work_queue.h:136
struct work_queue_task * work_queue_wait(struct work_queue *q, int timeout)
Wait for a task to complete.
void work_queue_task_specify_memory(struct work_queue_task *t, int64_t memory)
Specify the amount of memory required by a task.
char * command_line
The program(s) to execute, as a shell command line.
Definition: work_queue.h:76
int capacity
The estimated number of workers that this master can effectively support.
Definition: work_queue.h:153
int64_t committed_memory
Committed memory in MB aggregated across the connected workers.
Definition: work_queue.h:161
int total_workers_removed
Total number of worker connections that were terminated by the master.
Definition: work_queue.h:130
int work_queue_specify_log(struct work_queue *q, const char *logfile)
Add a log file that records the states of the connected workers and submitted tasks.
timestamp_t time_receive_output_finish
The time at which it finished transferring output files.
Definition: work_queue.h:98
timestamp_t time_task_submit
The time at which this task was submitted.
Definition: work_queue.h:89
timestamp_t time_receive_result_finish
The time at which it finished transferring the results.
Definition: work_queue.h:96
int work_queue_task_specify_input_buf(struct work_queue_task *t, const char *buf, int length, const char *rname)
Add an input buffer to a task.
int64_t committed_cores
Committed number of cores aggregated across the connected workers.
Definition: work_queue.h:160
void work_queue_specify_catalog_server(struct work_queue *q, const char *hostname, int port)
Specify the catalog server the master should report to.
char * host
The address and port of the host on which it ran.
Definition: work_queue.h:84
int work_queue_port(struct work_queue *q)
Get the listening port of the queue.
int work_queue_specify_password_file(struct work_queue *q, const char *file)
Add a mandatory password file that each worker must present.
int taskid
A unique task id number.
Definition: work_queue.h:81
int avg_capacity
Definition: work_queue.h:177
int workers_busy
Number of workers that are running at least one task.
Definition: work_queue.h:128
struct work_queue_task * work_queue_task_create(const char *full_command)
Create a new task object.
void work_queue_specify_password(struct work_queue *q, const char *password)
Add a mandatory password that each worker must present.
struct rmsummary * resources_measured
When monitoring is enabled, it points to the measured resources used by the task. ...
Definition: work_queue.h:114
timestamp_t time_committed
The time at which a task was committed to a worker.
Definition: work_queue.h:87
int work_queue_task_specify_file(struct work_queue_task *t, const char *local_name, const char *remote_name, int type, int flags)
Add a file to a task.
int64_t total_bytes_received
Total number of file bytes (not including protocol control msg bytes) received from the workers by th...
Definition: work_queue.h:150
void work_queue_task_specify_tag(struct work_queue_task *t, const char *tag)
Attach a user defined string tag to the task.
int total_tasks_cancelled
Total number of tasks cancelled.
Definition: work_queue.h:138
void work_queue_task_specify_priority(struct work_queue_task *t, double priority)
Specify the priority of this task relative to others in the queue.
void work_queue_task_specify_end_time(struct work_queue_task *t, int64_t seconds)
Specify the maximum end time allowed for the task (in seconds since the Epoch).
void work_queue_blacklist_add(struct work_queue *q, const char *hostname)
Blacklist host from a queue.
struct work_queue_task * work_queue_task_clone(const struct work_queue_task *task)
Create a copy of a task. Create a functionally identical copy of a work_queue_task that can be re-sub...
int work_queue_empty(struct work_queue *q)
Determine whether the queue is empty.
void work_queue_task_specify_disk(struct work_queue_task *t, int64_t disk)
Specify the amount of disk space required by a task.
int work_queue_shut_down_workers(struct work_queue *q, int n)
Shut down workers connected to the work_queue system.
int64_t min_memory
The smallest memory size in MB observed among the connected workers.
Definition: work_queue.h:166
void work_queue_set_bandwidth_limit(struct work_queue *q, const char *bandwidth)
Limit the queue bandwidth when transferring files to and from workers.
const char * work_queue_name(struct work_queue *q)
Get the project name of the queue.
double priority
The priority of this task relative to others in the queue: higher numbers run earlier.
Definition: work_queue.h:116
void work_queue_specify_keepalive_timeout(struct work_queue *q, int timeout)
Change the keepalive timeout for identifying dead workers for a given queue.
void work_queue_blacklist_clear(struct work_queue *q)
Clear blacklist of a queue.
Statistics describing a work queue.
Definition: work_queue.h:124
Definition: rmsummary.h:24
int work_queue_task_specify_file_piece(struct work_queue_task *t, const char *local_name, const char *remote_name, off_t start_byte, off_t end_byte, int type, int flags)
Add a file piece to a task.
timestamp_t time_receive_result_start
The time at which it started to transfer the results.
Definition: work_queue.h:95
int return_status
The exit code of the command line.
Definition: work_queue.h:82
int64_t min_cores
The lowest number of cores observed among the connected workers.
Definition: work_queue.h:164
int64_t max_gpus
The highest number of GPUs observed among the connected workers.
Definition: work_queue.h:171
void work_queue_task_specify_gpus(struct work_queue_task *t, int gpus)
Specify the number of gpus required by a task.
int total_workers_connected
Total number of workers currently connected to the master.
Definition: work_queue.h:125
timestamp_t total_receive_time
Total time in microseconds spent in receiving data from workers.
Definition: work_queue.h:142
int work_queue_hungry(struct work_queue *q)
Determine whether the queue is 'hungry' for more tasks.
int work_queue_tune(struct work_queue *q, const char *name, double value)
Tune advanced parameters for work queue.
int64_t total_cores
Total number of cores aggregated across the connected workers.
Definition: work_queue.h:156
Definition: list.h:49
int work_queue_task_state(struct work_queue *q, int taskid)
Get the current state of the task.
int result
The result of the task (successful, failed return_status, missing input file, missing output file)...
Definition: work_queue.h:83
int total_worker_slots
Definition: work_queue.h:176
int64_t max_disk
The largest disk space in MB observed among the connected workers.
Definition: work_queue.h:169
timestamp_t total_transfer_time
Time consumed in microseconds for transferring total_bytes_transferred.
Definition: work_queue.h:103
double idle_percentage
The fraction of time that the master is idle waiting for workers to respond.
Definition: work_queue.h:152
int work_queue_activate_fast_abort(struct work_queue *q, double multiplier)
Turn on or off fast abort functionality for a given queue.
int64_t total_disk
Total disk space in MB aggregated across the connected workers.
Definition: work_queue.h:158
char * tag
An optional user-defined logical name for the task.
Definition: work_queue.h:75
timestamp_t total_execute_time
Total time in microseconds workers spent executing completed tasks.
Definition: work_queue.h:145
int work_queue_submit(struct work_queue *q, struct work_queue_task *t)
Submit a task to a queue.
int worker_selection_algorithm
How to choose worker to run the task.
Definition: work_queue.h:77
timestamp_t time_execute_cmd_finish
The time at which the task finished (discovered by the master).
Definition: work_queue.h:94
double efficiency
Parallel efficiency of the system, sum(task execution times) / sum(worker lifetimes) ...
Definition: work_queue.h:151
void work_queue_specify_keepalive_interval(struct work_queue *q, int interval)
Change the keepalive interval for a given queue.
timestamp_t time_task_finish
The time at which this task was finished.
Definition: work_queue.h:90
int work_queue_task_specify_input_file(struct work_queue_task *t, const char *fname, const char *rname)
Add an input file to a task.
timestamp_t time_execute_cmd_start
The time at which the task began.
Definition: work_queue.h:93
int workers_idle
Number of workers that are not running a task.
Definition: work_queue.h:127
int64_t total_bytes_sent
Number of bytes sent since task has last started sending input data.
Definition: work_queue.h:101
timestamp_t total_send_time
Total time in microseconds spent in sending data to workers.
Definition: work_queue.h:141
char * work_queue_get_worker_summary(struct work_queue *q)
Summarize workers.
void work_queue_specify_priority(struct work_queue *q, int priority)
Change the priority for a given queue.
int64_t total_bytes_received
Number of bytes received since task has last started receiving input data.
Definition: work_queue.h:100
int total_tasks_failed
Total number of tasks completed and returned to user with result other than WQ_RESULT_SUCCESS.
Definition: work_queue.h:137
int work_queue_enable_monitoring(struct work_queue *q, char *monitor_summary_file)
Enables resource monitoring on the given work queue.
int64_t max_memory
The largest memory size in MB observed among the connected workers.
Definition: work_queue.h:167
void work_queue_task_delete(struct work_queue_task *t)
Delete a task.
char * output
The standard output of the task.
Definition: work_queue.h:78
void work_queue_blacklist_remove(struct work_queue *q, const char *hostname)
Unblacklist host from a queue.
void work_queue_specify_name(struct work_queue *q, const char *name)
Change the project name for a given queue.
int workers_ready
Definition: work_queue.h:174
int work_queue_task_specify_directory(struct work_queue_task *t, const char *local_name, const char *remote_name, int type, int flags, int recursive)
Add a directory to a task.
int64_t max_cores
The highest number of cores observed among the connected workers.
Definition: work_queue.h:165