Halide 14.0.0
Halide compiler and libraries
FunctionDAG.h
Go to the documentation of this file.
1/** This file defines the class FunctionDAG, which is our
2 * representation of a Halide pipeline, and contains methods for using
3 * Halide's bounds tools to query properties of it. */
4
5#ifndef FUNCTION_DAG_H
6#define FUNCTION_DAG_H
7
8#include <algorithm>
9#include <cstdint>
10#include <map>
11#include <string>
12#include <utility>
13
14#include <vector>
15
16#include "Errors.h"
17#include "Featurization.h"
18#include "Halide.h"
19
20namespace Halide {
21namespace Internal {
22namespace Autoscheduler {
23
24using std::map;
25using std::pair;
26using std::string;
27using std::unique_ptr;
28using std::vector;
29
30// First we have various utility classes.
31
32// An optional rational type used when analyzing memory dependencies.
34 bool exists = false;
36
37 OptionalRational() = default;
39 : exists(e), numerator(n), denominator(d) {
40 }
41
// Accumulate another rational into this one. If either operand does
// not exist, the sum does not exist either.
42 void operator+=(const OptionalRational &other) {
43 if (!exists || !other.exists) {
44 exists = false;
45 return;
46 }
// Fast path: denominators already agree, just add numerators.
47 if (denominator == other.denominator) {
48 numerator += other.numerator;
49 return;
50 }
51
// NOTE(review): the extraction dropped original lines 52-53 and 56
// here (the numbering jumps 51 -> 54 -> 55 -> 57). They evidently
// declared `l` (a common denominator, with this->numerator rescaled
// to it) and `g` (a divisor used to reduce the result) -- this header
// declares gcd()/lcm() helpers per the index at the bottom of the
// page. Confirm against the original source before editing this body.
54 denominator = l;
55 numerator += other.numerator * (l / other.denominator);
57 numerator /= g;
58 denominator /= g;
59 }
60
62 if ((*this) == 0) {
63 return *this;
64 }
65 if (other == 0) {
66 return other;
67 }
68 int64_t num = numerator * other.numerator;
69 int64_t den = denominator * other.denominator;
70 bool e = exists && other.exists;
71 return OptionalRational{e, num, den};
72 }
73
74 // Because this type is optional (exists may be false), we don't
75 // have a total ordering. These methods all return false when the
76 // operators are not comparable, so a < b is not the same as !(a
77 // >= b).
78 bool operator<(int x) const {
79 if (!exists) {
80 return false;
81 }
82 if (denominator > 0) {
83 return numerator < x * denominator;
84 } else {
85 return numerator > x * denominator;
86 }
87 }
88
89 bool operator<=(int x) const {
90 if (!exists) {
91 return false;
92 }
93 if (denominator > 0) {
94 return numerator <= x * denominator;
95 } else {
96 return numerator >= x * denominator;
97 }
98 }
99
100 bool operator>(int x) const {
101 if (!exists) {
102 return false;
103 }
104 return !((*this) <= x);
105 }
106
107 bool operator>=(int x) const {
108 if (!exists) {
109 return false;
110 }
111 return !((*this) < x);
112 }
113
114 bool operator==(int x) const {
115 return exists && (numerator == (x * denominator));
116 }
117
118 bool operator==(const OptionalRational &other) const {
119 return (exists == other.exists) && (numerator * other.denominator == denominator * other.numerator);
120 }
121};
122
123// A LoadJacobian records the derivative of the coordinate accessed in
124// some producer w.r.t the loops of the consumer.
126 vector<vector<OptionalRational>> coeffs;
127 int64_t c;
128
129public:
130 LoadJacobian(vector<vector<OptionalRational>> &&matrix, int64_t c = 1)
131 : coeffs(matrix), c(c) {
132 }
133
134 size_t producer_storage_dims() const {
135 return coeffs.size();
136 }
137
138 size_t consumer_loop_dims() const {
139 if (coeffs.empty() || coeffs[0].empty()) {
140 // The producer is scalar, and we don't know how
141 // many consumer loops there are.
142 return 0;
143 }
144 return coeffs[0].size();
145 }
146
147 OptionalRational operator()(int producer_storage_dim, int consumer_loop_dim) const {
148 if (coeffs.empty()) {
149 // The producer is scalar, so all strides are zero.
150 return {true, 0, 1};
151 }
152 internal_assert(producer_storage_dim < (int)coeffs.size());
153 const auto &p = coeffs[producer_storage_dim];
154 if (p.empty()) {
155 // The consumer is scalar, so all strides are zero.
156 return {true, 0, 1};
157 }
158 internal_assert(consumer_loop_dim < (int)p.size());
159 return p[consumer_loop_dim];
160 }
161
162 // To avoid redundantly re-recording copies of the same
163 // load Jacobian, we keep a count of how many times a
164 // load with this Jacobian occurs.
165 int64_t count() const {
166 return c;
167 }
168
169 // Try to merge another LoadJacobian into this one, increasing the
170 // count if the coefficients match.
171 bool merge(const LoadJacobian &other) {
172 if (other.coeffs.size() != coeffs.size()) {
173 return false;
174 }
175 for (size_t i = 0; i < coeffs.size(); i++) {
176 if (other.coeffs[i].size() != coeffs[i].size()) {
177 return false;
178 }
179 for (size_t j = 0; j < coeffs[i].size(); j++) {
180 if (!(other.coeffs[i][j] == coeffs[i][j])) {
181 return false;
182 }
183 }
184 }
185 c += other.count();
186 return true;
187 }
188
189 // Multiply Jacobians, used to look at memory dependencies through
190 // inlined functions.
// Compose two Jacobians (a matrix product). Used to track memory
// dependencies through inlined Funcs: the product maps the other
// Jacobian's consumer loops to this Jacobian's producer storage
// dimensions. Occurrence counts multiply.
191 LoadJacobian operator*(const LoadJacobian &other) const {
192 vector<vector<OptionalRational>> matrix;
// NOTE(review): original line 193 is missing from this extraction
// (the numbering jumps 192 -> 194); confirm against the original
// source what it contained before modifying this function.
194 matrix.resize(producer_storage_dims());
195 for (size_t i = 0; i < producer_storage_dims(); i++) {
196 matrix[i].resize(other.consumer_loop_dims());
197 for (size_t j = 0; j < other.consumer_loop_dims(); j++) {
// Start each entry at an exact zero, then accumulate the dot
// product of this row with the other Jacobian's column.
198 matrix[i][j] = OptionalRational{true, 0, 1};
199 for (size_t k = 0; k < consumer_loop_dims(); k++) {
200 matrix[i][j] += (*this)(i, k) * other(k, j);
201 }
202 }
203 }
204 LoadJacobian result(std::move(matrix), count() * other.count());
205 return result;
206 }
207
208 void dump(const char *prefix) const;
209};
210
211// Classes to represent a concrete set of bounds for a Func. A Span is
212// single-dimensional, and a Bound is a multi-dimensional box. For
213// each dimension we track the estimated size, and also whether or not
214// the size is known to be constant at compile-time. For each Func we
215// track three different types of bounds:
216
217// 1) The region required by consumers of the Func, which determines
218// 2) The region actually computed, which in turn determines
219// 3) The min and max of all loops in the loop nest.
220
221// 3 in turn determines the region required of the inputs to a Func,
222// which determines their region computed, and hence their loop nest,
223// and so on back up the Function DAG from outputs back to inputs.
224
// A single-dimensional concrete bound: an inclusive [min_, max_]
// interval, plus a flag recording whether the extent is known to be
// a compile-time constant.
225class Span {
226 int64_t min_, max_;
227 bool constant_extent_;
228
229public:
230 int64_t min() const {
231 return min_;
232 }
233 int64_t max() const {
234 return max_;
235 }
// The interval is inclusive, so the extent is max - min + 1.
236 int64_t extent() const {
237 return max_ - min_ + 1;
238 }
239 bool constant_extent() const {
240 return constant_extent_;
241 }
242
// Grow this span to also cover `other`. The union's extent is only
// constant if both inputs' extents were constant.
243 void union_with(const Span &other) {
244 min_ = std::min(min_, other.min());
245 max_ = std::max(max_, other.max());
246 constant_extent_ = constant_extent_ && other.constant_extent();
247 }
248
// NOTE(review): the method signature on original line 249 is missing
// from this extraction; the surviving body sets max_ from min_ and an
// extent-like parameter e. Confirm against the original source.
250 max_ = min_ + e - 1;
251 }
252
// NOTE(review): the method signature on original line 253 is also
// missing; the surviving body shifts both endpoints by a parameter x.
254 min_ += x;
255 max_ += x;
256 }
257
258 Span(int64_t a, int64_t b, bool c)
259 : min_(a), max_(b), constant_extent_(c) {
260 }
261 Span() = default;
262 Span(const Span &other) = default;
// The identity for union_with: min > max, so any union with it
// yields the other operand.
263 static Span empty_span() {
264 return Span(INT64_MAX, INT64_MIN, true);
265 }
266};
267
268// Bounds objects are created and destroyed very frequently while
269// exploring scheduling options, so we have a custom allocator and
270// memory pool. Much like IR nodes, we treat them as immutable once
271// created and wrapped in a Bound object so that they can be shared
272// safely across scheduling alternatives.
275
276 class Layout;
277 const Layout *layout = nullptr;
278
279 Span *data() const {
280 // This struct is a header
281 return (Span *)(const_cast<BoundContents *>(this) + 1);
282 }
283
285 return data()[i];
286 }
287
289 return data()[i + layout->computed_offset];
290 }
291
292 Span &loops(int i, int j) {
293 return data()[j + layout->loop_offset[i]];
294 }
295
296 const Span &region_required(int i) const {
297 return data()[i];
298 }
299
300 const Span &region_computed(int i) const {
301 return data()[i + layout->computed_offset];
302 }
303
304 const Span &loops(int i, int j) const {
305 return data()[j + layout->loop_offset[i]];
306 }
307
309 auto *b = layout->make();
310 size_t bytes = sizeof(data()[0]) * layout->total_size;
311 memcpy(b->data(), data(), bytes);
312 return b;
313 }
314
315 void validate() const;
316
317 // We're frequently going to need to make these concrete bounds
318 // arrays. It makes things more efficient if we figure out the
319 // memory layout of those data structures once ahead of time, and
320 // make each individual instance just use that. Note that this is
321 // not thread-safe.
// Precomputed memory layout for BoundContents payloads, with a
// free-list pool so Bounds can be created and destroyed cheaply.
// Not thread-safe (see the note above this class).
322 class Layout {
// A memory pool of free BoundContent objects with this layout
323 // A memory pool of free BoundContent objects with this layout
324 mutable std::vector<BoundContents *> pool;
325
326 // All the blocks of memory allocated
327 mutable std::vector<void *> blocks;
328
// Number of BoundContents currently handed out and not yet released.
329 mutable size_t num_live = 0;
330
// Grows `blocks` and refills `pool` when it runs dry.
331 void allocate_some_more() const;
332
333 public:
334 // number of Span to allocate
// NOTE(review): the declaration on original line 335 is missing from
// this extraction -- make_copy() above reads `layout->total_size`, so
// a `total_size` member belongs here. Confirm against the original.
336
337 // region_required has size func->dimensions() and comes first in the memory layout
338
339 // region_computed comes next at the following index
// NOTE(review): the declaration on original line 340 is missing --
// the accessors above read `layout->computed_offset`. Confirm.
341
342 // the loop for each stage starts at the following index
343 std::vector<int> loop_offset;
344
345 Layout() = default;
// NOTE(review): original line 346 is missing (likely the destructor
// declaration). Confirm against the original source.
347
// Layouts are pinned: pooled BoundContents point back at them, so
// copying or moving a Layout would dangle those pointers.
348 Layout(const Layout &) = delete;
349 void operator=(const Layout &) = delete;
350 Layout(Layout &&) = delete;
351 void operator=(Layout &&) = delete;
352
353 // Make a BoundContents object with this layout
// NOTE(review): the make() declaration on original line 354 is
// missing -- it is called via `layout->make()` above. Confirm.
355
356 // Release a BoundContents object with this layout back to the pool
357 void release(const BoundContents *b) const;
358 };
359};
360
362
363// A representation of the function DAG. The nodes and edges are both
364// in reverse realization order, so if you want to walk backwards up
365// the DAG, just iterate the nodes or edges in-order.
367
368 // An edge is a producer-consumer relationship
369 struct Edge;
370
374 };
375
376 // A Node represents a single Func
377 struct Node {
378 // A pointer back to the owning DAG
380
381 // The Halide Func this represents
383
384 // The number of bytes per point stored.
386
387 // The min/max variables used to denote a symbolic region of
388 // this Func. Used in the cost above, and in the Edges below.
389 vector<SymbolicInterval> region_required;
390
391 // A concrete region required from a bounds estimate. Only
392 // defined for outputs.
394
395 // The region computed of a Func, in terms of the region
396 // required. For simple Funcs this is identical to the
397 // region_required. However, in some Funcs computing one
398 // output requires computing other outputs too. You can't
399 // really ask for a single output pixel from something blurred
400 // with an IIR without computing the others, for example.
402 // The min and max in their full symbolic glory. We use
403 // these in the general case.
406
407 // Analysis used to accelerate common cases
410 };
411 vector<RegionComputedInfo> region_computed;
413
414 // Expand a region required into a region computed, using the
415 // symbolic intervals above.
416 void required_to_computed(const Span *required, Span *computed) const;
417
418 // Metadata about one symbolic loop in a Func's default loop nest.
419 struct Loop {
420 string var;
421 bool pure, rvar;
423
424 // Which pure dimension does this loop correspond to? Invalid if it's an rvar
426
427 // Precomputed metadata to accelerate common cases:
428
429 // If true, the loop bounds are just the region computed in the given dimension
432
433 // If true, the loop bounds are a constant with the given min and max
436
437 // A persistent fragment of source for getting this Var
438 // from its owner Func. Used for printing source code
439 // equivalent to a computed schedule.
440 string accessor;
441 };
442
443 // Get the loop nest shape as a function of the region computed
444 void loop_nest_for_region(int stage_idx, const Span *computed, Span *loop) const;
445
446 // One stage of a Func
447 struct Stage {
448 // The owning Node
450
451 // Which stage of the Func is this. 0 = pure.
452 int index;
453
454 // The loop nest that computes this stage, from innermost out.
455 vector<Loop> loop;
457
458 // The vectorization width that will be used for
459 // compute. Corresponds to the natural width for the
460 // narrowest type used.
462
463 // The featurization of the compute done
465
466 // The actual Halide front-end stage object
468
469 // The name for scheduling (e.g. "foo.update(3)")
470 string name;
471
472 // Ids for perfect hashing on stages.
473 int id, max_id;
474
475 vector<Edge *> incoming_edges;
476
// Per-node dependency bitmap, indexed by Node::id (presumably the
// transitive producer set of this stage -- confirm where it is
// filled in, outside this view).
477 vector<bool> dependencies;
// True if this stage depends on values produced by Node n.
478 bool downstream_of(const Node &n) const {
479 return dependencies[n.id];
480 };
481
483 : stage(std::move(s)) {
484 }
485 };
486 vector<Stage> stages;
487
488 vector<Edge *> outgoing_edges;
489
490 // Max vector size across the stages
492
493 // A unique ID for this node, allocated consecutively starting
494 // at zero for each pipeline.
495 int id, max_id;
496
497 // Just func->dimensions(), but we ask for it so many times
498 // that's it's worth avoiding the function call into
499 // libHalide.
501
502 // Is a single pointwise call to another Func
504
505 // We represent the input buffers as node, though we do not attempt to schedule them.
507
508 // Is one of the pipeline outputs
510
511 // Only uses pointwise calls
513
514 // Only uses pointwise calls + clamping on all indices
516
517 std::unique_ptr<BoundContents::Layout> bounds_memory_layout;
518
520 return bounds_memory_layout->make();
521 }
522 };
523
524 // A representation of a producer-consumer relationship
525 struct Edge {
526 struct BoundInfo {
527 // The symbolic expression for the bound in this dimension
529
530 // Fields below are the results of additional analysis
531 // used to evaluate this bound more quickly.
535
536 BoundInfo(const Expr &e, const Node::Stage &consumer, bool dependent);
537 };
538
539 // Memory footprint on producer required by consumer.
540 vector<pair<BoundInfo, BoundInfo>> bounds;
541
544
545 // The number of calls the consumer makes to the producer, per
546 // point in the loop nest of the consumer.
547 int calls;
548
550
551 vector<LoadJacobian> load_jacobians;
552
554
555 // Given a loop nest of the consumer stage, expand a region
556 // required of the producer to be large enough to include all
557 // points required.
558 void expand_footprint(const Span *consumer_loop, Span *producer_required) const;
559 };
560
561 vector<Node> nodes;
562 vector<Edge> edges;
563
564 // Create the function DAG, and do all the dependency and cost
565 // analysis. This is done once up-front before the tree search.
566 FunctionDAG(const vector<Function> &outputs, const MachineParams &params, const Target &target);
567
568 void dump() const;
569 std::ostream &dump(std::ostream &os) const;
570
571private:
572 // Compute the featurization for the entire DAG
573 void featurize();
574
575 template<typename OS>
576 void dump_internal(OS &os) const;
577
578public:
579 // This class uses a lot of internal pointers, so we'll make it uncopyable/unmovable.
580 FunctionDAG(const FunctionDAG &other) = delete;
581 FunctionDAG &operator=(const FunctionDAG &other) = delete;
582 FunctionDAG(FunctionDAG &&other) = delete;
584};
585
586} // namespace Autoscheduler
587} // namespace Internal
588} // namespace Halide
589
590#endif // FUNCTION_DAG_H
#define internal_assert(c)
Definition: Errors.h:19
void release(const BoundContents *b) const
OptionalRational operator()(int producer_storage_dim, int consumer_loop_dim) const
Definition: FunctionDAG.h:147
LoadJacobian operator*(const LoadJacobian &other) const
Definition: FunctionDAG.h:191
bool merge(const LoadJacobian &other)
Definition: FunctionDAG.h:171
LoadJacobian(vector< vector< OptionalRational > > &&matrix, int64_t c=1)
Definition: FunctionDAG.h:130
void dump(const char *prefix) const
Span(int64_t a, int64_t b, bool c)
Definition: FunctionDAG.h:258
Span(const Span &other)=default
void union_with(const Span &other)
Definition: FunctionDAG.h:243
A reference-counted handle to Halide's internal representation of a function.
Definition: Function.h:38
A class representing a reference count to be used with IntrusivePtr.
Definition: IntrusivePtr.h:19
A single definition of a Func.
Definition: Func.h:70
A Halide variable, to be used when defining functions.
Definition: Var.h:19
int64_t gcd(int64_t, int64_t)
The greatest common divisor of two integers.
int64_t lcm(int64_t, int64_t)
The least common multiple of two integers.
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
Expr min(const FuncRef &a, const FuncRef &b)
Explicit overloads of min and max for FuncRef.
Definition: Func.h:600
Expr max(const FuncRef &a, const FuncRef &b)
Definition: Func.h:603
signed __INT64_TYPE__ int64_t
void * memcpy(void *s1, const void *s2, size_t n)
A fragment of Halide syntax.
Definition: Expr.h:256
const Span & loops(int i, int j) const
Definition: FunctionDAG.h:304
BoundInfo(const Expr &e, const Node::Stage &consumer, bool dependent)
void expand_footprint(const Span *consumer_loop, Span *producer_required) const
vector< pair< BoundInfo, BoundInfo > > bounds
Definition: FunctionDAG.h:540
std::unique_ptr< BoundContents::Layout > bounds_memory_layout
Definition: FunctionDAG.h:517
void loop_nest_for_region(int stage_idx, const Span *computed, Span *loop) const
vector< RegionComputedInfo > region_computed
Definition: FunctionDAG.h:411
void required_to_computed(const Span *required, Span *computed) const
FunctionDAG(const vector< Function > &outputs, const MachineParams &params, const Target &target)
FunctionDAG(FunctionDAG &&other)=delete
FunctionDAG(const FunctionDAG &other)=delete
std::ostream & dump(std::ostream &os) const
FunctionDAG & operator=(FunctionDAG &&other)=delete
FunctionDAG & operator=(const FunctionDAG &other)=delete
void operator+=(const OptionalRational &other)
Definition: FunctionDAG.h:42
OptionalRational(bool e, int64_t n, int64_t d)
Definition: FunctionDAG.h:38
bool operator==(const OptionalRational &other) const
Definition: FunctionDAG.h:118
OptionalRational operator*(const OptionalRational &other) const
Definition: FunctionDAG.h:61
A class to represent ranges of Exprs.
Definition: Interval.h:14
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.
Definition: IntrusivePtr.h:68
A struct representing the machine parameters to generate the auto-scheduled code for.
Definition: Pipeline.h:33
A struct representing a target machine and os to generate code for.
Definition: Target.h:19