In [9]:
import loopy as lp
import numpy as np

In [13]:
# Baseline (untransformed) kernel: for every e in [0, 9] and i in [0, 2],
# add a[e] onto out[e, i]. Later cells derive transformed variants from this
# object, so it is kept under its own name.
orig_knl = lp.make_kernel(
    domains="{[e,i]: 0<=e<=9 and 0<=i<=2}",
    instructions="""
    for e
        for i
           out[e, i] = out[e, i] + a[e]
        end
    end
    """,
    # Generate code for the plain C target.
    target=lp.CTarget(),
)

In [14]:
# Always derive from the pristine kernel so re-running this cell gives the
# same result instead of stacking transformations.

# Tag the inner iname "i" with "unr"; in the printed code the three i
# iterations show up as three explicit statements rather than a loop.
knl = lp.tag_inames(orig_knl, "i:unr")

# Slab increments of (1, 1) for "e": the printed code splits off e = 0 and
# e = 9 as separate initial/final slabs and keeps e = 1..8 as the bulk loop.
knl = knl.copy(iname_slab_increments={"e": (1, 1)})

# Request the nesting order e (outer), i (inner).
knl = lp.prioritize_loops(knl, "e,i")

# Declare both arrays as float32 and let loopy infer any remaining dtypes.
knl = lp.add_and_infer_dtypes(knl, {"out,a": np.float32})

# print() rather than a bare expression so the generated source is shown with
# real line breaks instead of an escaped string repr.
print(lp.generate_code_v2(knl).device_code())


void loopy_kernel(float const *__restrict__ a, float *__restrict__ out)
{
  /* bulk slab for 'e' */
  for (int e = 1; e <= 8; ++e)
  {
    out[3 * e] = out[3 * e] + a[e];
    out[3 * e + 1] = out[3 * e + 1] + a[e];
    out[3 * e + 2] = out[3 * e + 2] + a[e];
  }
  /* initial slab for 'e' */
  {
    int const e = 0;

    out[0] = out[0] + a[0];
    out[1] = out[1] + a[0];
    out[2] = out[2] + a[0];
  }
  /* final slab for 'e' */
  {
    int const e = 9;

    out[27] = out[27] + a[9];
    out[27 + 1] = out[27 + 1] + a[9];
    out[27 + 2] = out[27 + 2] + a[9];
  }
}
