Halide 14.0.0
Halide compiler and libraries
LoopNest.h
Go to the documentation of this file.
1/** This file defines the LoopNest, which is our
2 * representation of a Halide schedule, and contains methods to
3 * generate candidates for scheduling as well as extract a
4 * featurization that can be used to cost each candidate. */
5
6#ifndef LOOP_NEST_H
7#define LOOP_NEST_H
8
9#include "FunctionDAG.h"
10#include "PerfectHashMap.h"
11#include <map>
12#include <set>
13#include <utility>
14#include <vector>
15
16namespace Halide {
17namespace Internal {
18namespace Autoscheduler {
19
20template<typename T>
22
23template<typename T>
25
27
28// Given a multi-dimensional box of dimensionality d, generate a list
29// of candidate tile sizes for it, logarithmically spacing the sizes
30// using the given factor. If 'allow_splits' is false, every dimension
31// must either be one, or the full extent of the box. This function is
32// used to generate candidate tilings when tiling for
33// producer-consumer fusion, or tiling for parallelism.
34std::vector<std::vector<int64_t>> generate_tilings(const vector<int64_t> &s, int d, int factor, bool allow_splits);
35
36struct LoopNest {
38
39 // The extents of this loop. Put another way, the number of tiles,
40 // not the size of each tile.
41 std::vector<int64_t> size;
42
43 // The nodes inside the loop body
44 std::vector<IntrusivePtr<const LoopNest>> children;
45
46 // Funcs inlined into this inner loop, and the number of times
47 // each is called. Only valid if children is empty.
49
50 // Funcs stored inside this loop
51 std::set<const FunctionDAG::Node *> store_at;
52
53 // The total bounds required of any given Func over all iterations
54 // of this loop. In the paper, this is represented using the
55 // little boxes to the left of the loop nest tree figures.
57
58 // The Func this loop nest belongs to
59 const FunctionDAG::Node *node = nullptr;
60
61 // The stage of the Func
63
64 // Is this the innermost loop of this func (the SIMD loop)?
65 bool innermost = false;
66
67 // Are we permitted to tile this loop?
68 bool tileable = false;
69
70 // Is this the parallel outer loop?
71 bool parallel = false;
72
73 // What dimension is this Func vectorized over, in terms of the pure args of the Func?
74 int vector_dim = -1;
75
76 // Which loop corresponds to the innermost storage dimension and will be vectorized. -1 means none of them.
78
79 void copy_from(const LoopNest &n);
80
81 static void hash_combine(uint64_t &h, uint64_t next) {
82 // From boost
83 h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));
84 }
85
86 // Hash the loop structure and sizes up to a fixed depth. This is
87 // used as the hash function for the coarse-to-fine beam search in
88 // the paper.
89 void structural_hash(uint64_t &h, int depth) const;
90
91 // How many funcs are scheduled inside this loop level. Used in
92 // the structural hash.
94 size_t count = inlined.size() + store_at.size();
95 for (const auto &c : children) {
96 count += c->funcs_realized_or_inlined();
97 }
98 return count;
99 }
100
101 // All of a stage's interesting locations in the loop nest. Used to help compute the featurization of a stage.
102 struct Sites {
103 const LoopNest *compute = nullptr; // Its containing compute_at site
104 const LoopNest *store = nullptr; // Its containing store_at site
105 const LoopNest *produce = nullptr; // Its own outermost node
106 const LoopNest *innermost = nullptr; // Its innermost node - usually a SIMD loop
107 const LoopNest *task = nullptr; // The parallel for loop it belongs to
108 bool inlined = false; // Is the Func inlined?
109
110 // Used for caching features/feature intermediates.
112 };
113
114 // Compute all the sites of interest for each pipeline stage
116 const LoopNest *task = nullptr,
117 const LoopNest *parent = nullptr) const;
118
119 // A helper for the working_set_at_task feature. Most features are
120 // computed in the recursive pass 'compute_features' below, but
121 // this one must be done in a second separate recursive pass.
123 StageMap<ScheduleFeatures> *features) const {
124 for (const auto &c : children) {
125 c->set_working_set_at_task_feature(working_set, features);
126 features->get(c->stage).working_set_at_task = working_set;
127 }
128 }
129
130 // Do a recursive walk over the loop nest computing features to feed the cost model.
132 const MachineParams &params,
133 const StageMap<Sites> &sites,
134 int64_t instances,
135 int64_t parallelism,
136 const LoopNest *parent,
137 const LoopNest *grandparent,
138 const LoopNest &root,
139 int64_t *working_set,
141 bool use_cached_features) const;
142
143 bool is_root() const {
144 // The root is the sole node without a Func associated with
145 // it.
146 return node == nullptr;
147 }
148
149 // Set the region required of a Func at this site.
150 const Bound &set_bounds(const FunctionDAG::Node *f, BoundContents *b) const {
151 return bounds.emplace(f, b);
152 }
153
154 // Get the region required of a Func at this site, from which we
155 // know what region would be computed if it were scheduled here,
156 // and what its loop nest would be.
157 const Bound &get_bounds(const FunctionDAG::Node *f) const;
158
159 // Recursively print a loop nest representation to stderr
160 void dump(string prefix, const LoopNest *parent) const;
161
162 // Does this loop nest access the given Func
163 bool calls(const FunctionDAG::Node *f) const;
164
165 // What is the maximum number of inlined calls to a Func that
166 // occur within this loop. Used to prune states that would
167 // generate too much code.
169
170 // Does this loop nest access an input buffer? Used to select
171 // trail strategies when splitting loops. We don't want to read
172 // out of bounds on inputs, even if we don't intend to use the
173 // values read. It could create annoying assertion failures for
174 // the user. It's OK to read out of range of the values computed
175 // on internal Funcs though. Allocation bounds inference just pads
176 // out the bounds so that it won't fault.
178
179 // Does this loop nest contain a computation of the given Func.
180 bool computes(const FunctionDAG::Node *f) const;
181
182 // Above here most methods query the loop nest. Below we have
183 // methods that mutate the loop nest.
184
185 // Inline a Func into all consumers within this loop.
187
188 // Compute a Func at this site.
189 void compute_here(const FunctionDAG::Node *f, bool tileable, int v);
190
191 // Parallelize this loop according to the given tiling.
193 const vector<int64_t> &tiling,
194 const LoopNest *parent) const;
195
196 // Return all possible ways to compute f in tiles somewhere within
197 // this loop nest.
198 std::vector<IntrusivePtr<const LoopNest>> compute_in_tiles(const FunctionDAG::Node *f,
199 const LoopNest *parent,
200 const MachineParams &params,
201 int v,
202 bool in_realization) const;
203
204 // Below here we have methods that apply a schedule to a Halide pipeline.
205
206 // A model of the state of the loop nest of a Func while applying
207 // Halide's scheduling directives.
208
209 // Note that StageScheduleState is movable-but-not-copyable thanks
210 // to its ostringstream member.
212 // How much parallelism do we need to exploit with this Func?
213 double num_cores = 0;
214
215 // Which storage dimension is vectorized? We need to reorder it innermost
216 int vector_dim = -1;
218
219 // The various Vars and RVars used for scheduling a Func.
220 struct FuncVar {
221 // The top-level var or rvar this was split off from
223
224 // This var.
226
227 // Source code to access this Var/RVar. Used for printing
228 // valid Halide source for this schedule.
229 string accessor;
230
231 // Our estimate of the extent of this var. This is exact
232 // when constant_extent flag is true.
234
235 // Which index in the symbolic loop nest does this var
236 // belong to.
237 size_t index = 0;
238
239 // Some flags.
240 bool innermost_pure_dim = false,
241 outermost = false,
242 parallel = false,
243 exists = false,
244 pure = false,
247 : orig(Var()), var(Var()) {
248 }
249 };
250
251 // In order from innermost to outermost. Each group of d is one tiling level.
252 std::vector<FuncVar> vars;
253
254 std::ostringstream schedule_source;
255 };
256
257 // Apply the schedule represented by this loop nest to a Halide pipeline.
258 void apply(LoopLevel here,
259 StageMap<std::unique_ptr<StageScheduleState>> &state_map,
260 double num_cores,
261 int depth,
262 const LoopNest *parent,
263 const LoopNest *compute_site) const;
264
265 // The below are two feature caches.
266 // hash of producers -> StageMap
267 mutable std::map<uint64_t, StageMap<StageMap<FeatureIntermediates>>> feature_intermediates_cache;
268 // hash of producers -> StageMap
269 mutable std::map<uint64_t, StageMap<ScheduleFeatures>> features_cache;
270
271 // Same as copy_from (above) but also copies the two caches.
273
274 // Loops through inlined funcs and caches the pcm found in features, into memoized_features.
276 const StageMap<ScheduleFeatures> *features) const;
277
278 // Merges features_to_insert into memoized_features if it does not already exist there.
280 const StageMap<ScheduleFeatures> *features_to_insert) const;
281
282 // Recalculates working_set from cached features
284 const StageMap<ScheduleFeatures> *features) const;
285
286 // Features need to be recomputed for inlined Funcs
288 StageMap<ScheduleFeatures> *features) const;
289
290 // Create a (hopefully) unique hash of the producers.
292
293 // Gather all stages that are producers for any Func in this LoopNest.
294 std::vector<std::pair<int, int>> collect_producers(const StageMap<Sites> &sites) const;
295
296 // Collect all stages referenced in this LoopNest.
297 void collect_stages(std::set<const FunctionDAG::Node::Stage *> &stages) const;
298};
299
300// Find the deepest common ancestor of `a` and `b`.
301// `parents` is a map from loop nest to (parent, depth) tuples.
302// Assumes that `a` and `b` are found in `parents`, otherwise errors.
303const LoopNest *deepest_common_ancestor(const std::map<const LoopNest *, std::pair<const LoopNest *, int>> &parents,
304 const LoopNest *a, const LoopNest *b);
305
306// Compute the parent and depth of every loop nest node.
307// Stores in `parents` the children of `here` (keys) to tuples of (here, depth).
308// Recurses on all children of `here`.
309void compute_loop_nest_parents(std::map<const LoopNest *, std::pair<const LoopNest *, int>> &parents,
310 const LoopNest *here, int depth);
311
312} // namespace Autoscheduler
313} // namespace Internal
314} // namespace Halide
315
316#endif // LOOP_NEST_H
A class representing a reference count to be used with IntrusivePtr.
Definition: IntrusivePtr.h:19
A reference to a site in a Halide statement at the top of the body of a particular for loop.
Definition: Schedule.h:176
A Halide variable, to be used when defining functions.
Definition: Var.h:19
const T & get(const K *n) const
std::vector< std::vector< int64_t > > generate_tilings(const vector< int64_t > &s, int d, int factor, bool allow_splits)
const LoopNest * deepest_common_ancestor(const std::map< const LoopNest *, std::pair< const LoopNest *, int > > &parents, const LoopNest *a, const LoopNest *b)
void compute_loop_nest_parents(std::map< const LoopNest *, std::pair< const LoopNest *, int > > &parents, const LoopNest *here, int depth)
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
unsigned __INT64_TYPE__ uint64_t
signed __INT64_TYPE__ int64_t
const FunctionDAG::Node::Stage * stage
Definition: LoopNest.h:62
const FunctionDAG::Node * node
Definition: LoopNest.h:59
void recompute_inlined_features(const StageMap< Sites > &sites, StageMap< ScheduleFeatures > *features) const
void inline_func(const FunctionDAG::Node *f)
void dump(string prefix, const LoopNest *parent) const
void collect_stages(std::set< const FunctionDAG::Node::Stage * > &stages) const
void get_sites(StageMap< Sites > &sites, const LoopNest *task=nullptr, const LoopNest *parent=nullptr) const
const Bound & get_bounds(const FunctionDAG::Node *f) const
std::set< const FunctionDAG::Node * > store_at
Definition: LoopNest.h:51
void memoize_features(StageMap< ScheduleFeatures > &memoized_features, const StageMap< ScheduleFeatures > *features_to_insert) const
std::map< uint64_t, StageMap< ScheduleFeatures > > features_cache
Definition: LoopNest.h:269
bool computes(const FunctionDAG::Node *f) const
std::vector< IntrusivePtr< const LoopNest > > children
Definition: LoopNest.h:44
void set_working_set_at_task_feature(int64_t working_set, StageMap< ScheduleFeatures > *features) const
Definition: LoopNest.h:122
std::vector< std::pair< int, int > > collect_producers(const StageMap< Sites > &sites) const
const Bound & set_bounds(const FunctionDAG::Node *f, BoundContents *b) const
Definition: LoopNest.h:150
std::vector< IntrusivePtr< const LoopNest > > compute_in_tiles(const FunctionDAG::Node *f, const LoopNest *parent, const MachineParams &params, int v, bool in_realization) const
void memoize_points_computed_minimum(StageMap< ScheduleFeatures > &memoized_features, const StageMap< ScheduleFeatures > *features) const
IntrusivePtr< const LoopNest > parallelize_in_tiles(const MachineParams &params, const vector< int64_t > &tiling, const LoopNest *parent) const
static void hash_combine(uint64_t &h, uint64_t next)
Definition: LoopNest.h:81
void compute_features(const FunctionDAG &dag, const MachineParams &params, const StageMap< Sites > &sites, int64_t instances, int64_t parallelism, const LoopNest *parent, const LoopNest *grandparent, const LoopNest &root, int64_t *working_set, StageMap< ScheduleFeatures > *features, bool use_cached_features) const
uint64_t compute_hash_of_producers_stored_at_root(const StageMap< Sites > &sites) const
bool calls(const FunctionDAG::Node *f) const
void copy_from_including_features(const LoopNest &n)
void structural_hash(uint64_t &h, int depth) const
void compute_here(const FunctionDAG::Node *f, bool tileable, int v)
std::map< uint64_t, StageMap< StageMap< FeatureIntermediates > > > feature_intermediates_cache
Definition: LoopNest.h:267
void apply(LoopLevel here, StageMap< std::unique_ptr< StageScheduleState > > &state_map, double num_cores, int depth, const LoopNest *parent, const LoopNest *compute_site) const
void compute_working_set_from_features(int64_t *working_set, const StageMap< ScheduleFeatures > *features) const
Intrusive shared pointers have a reference count (a RefCount object) stored in the class itself.
Definition: IntrusivePtr.h:68
A struct representing the machine parameters to generate the auto-scheduled code for.
Definition: Pipeline.h:33
A class that can represent Vars or RVars.
Definition: Func.h:30