I don't think this is a good idea overall. The way simpler solution would be to disable tiling on the z axis for 3d images so that we don't hurt the most common case, 2d images. And that's what I was seeing nvidia doing anyway. So with that we would end up adding a bunch of instructions hurting the 2d image case, just to support something no user will care about anyway. On Mon, Oct 14, 2019 at 7:22 AM Ilia Mirkin <imirkin@xxxxxxxxxxxx> wrote: > > Unfortuantely we don't know if a particular load is a real 2d image (as > would be a cube face or 2d array element), or a layer of a 3d image. > Since we pass in the TIC reference, the instruction's type has to match > what's in the TIC (experimentally). In order to properly support > bindless images, this also can't be done by looking at the current > bindings and generating appropriate code. > > As a result all plain 2d loads are converted into a pair of 2d/3d loads, > with appropriate predicates to ensure only one of those actually > executes, and the values are all merged in. > > This goes somewhat against the current flow, so for GM107 we do the OOB > handling directly in the surface processing logic. Perhaps the other > gens should do something similar, but that is left to another change. > > This fixes dEQP tests like image_load_store.3d.*_single_layer and GL-CTS > tests like shader_image_load_store.non-layered_binding without breaking > anything else. > > Signed-off-by: Ilia Mirkin <imirkin@xxxxxxxxxxxx> > --- > > OK, first of all -- to whoever thought that binding single layers of a 3d > image and telling the shader they were regular 2d images was a good idea -- > I disagree. > > This change feels super super dirty, but I honestly don't see a materially > cleaner way of handling it. Instead of being able to reuse the OOB > handling, it's put in with the coord processing (!), and the surface > conversion function is seriously hacked up. > > But splitting it up is harder, since a lot of information has to flow > from stage to stage, like when to do what kind of access, and cloning > the surface op is much easier in the coord processing stage. > > .../nouveau/codegen/nv50_ir_emit_gm107.cpp | 34 ++- > .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 206 +++++++++++++----- > .../nouveau/codegen/nv50_ir_lowering_nvc0.h | 4 +- > src/gallium/drivers/nouveau/nvc0/nvc0_tex.c | 10 +- > 4 files changed, 201 insertions(+), 53 deletions(-) > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp > index 6eefe8f0025..e244bd0d610 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp > @@ -122,6 +122,8 @@ private: > void emitSAM(); > void emitRAM(); > > + void emitPSETP(); > + > void emitMOV(); > void emitS2R(); > void emitCS2R(); > @@ -690,6 +692,31 @@ CodeEmitterGM107::emitRAM() > * predicate/cc > ******************************************************************************/ > > +void > +CodeEmitterGM107::emitPSETP() > +{ > + > + emitInsn(0x50900000); > + > + switch (insn->op) { > + case OP_AND: emitField(0x18, 3, 0); break; > + case OP_OR: emitField(0x18, 3, 1); break; > + case OP_XOR: emitField(0x18, 3, 2); break; > + default: > + assert(!"unexpected operation"); > + break; > + } > + > + // emitINV (0x2a); > + emitPRED(0x27); // TODO: support 3-arg > + emitINV (0x20, insn->src(1)); > + emitPRED(0x1d, insn->src(1)); > + emitINV (0x0f, insn->src(0)); > + emitPRED(0x0c, insn->src(0)); > + emitPRED(0x03, insn->def(0)); > + emitPRED(0x00); > +} > + > /******************************************************************************* > * movement / conversion > ******************************************************************************/ > @@ -3557,7 +3584,12 @@ CodeEmitterGM107::emitInstruction(Instruction *i) > case OP_AND: > case OP_OR: > case OP_XOR: > - emitLOP(); > + switch (insn->def(0).getFile()) { > + case FILE_GPR: emitLOP(); break; > + case FILE_PREDICATE: emitPSETP(); break; > + default: > + assert(!"invalid bool op"); > + } > break; > case OP_NOT: > emitNOT(); > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > index 1f702a987d8..0f68a9a229f 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp > @@ -1802,6 +1802,9 @@ NVC0LoweringPass::loadSuInfo32(Value *ptr, int slot, uint32_t off, bool bindless > { > uint32_t base = slot * NVC0_SU_INFO__STRIDE; > > + // We don't upload surface info for bindless for GM107+ > + assert(!bindless || targ->getChipset() < NVISA_GM107_CHIPSET); > + > if (ptr) { > ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(slot)); > if (bindless) > @@ -2204,7 +2207,7 @@ getDestType(const ImgType type) { > } > > void > -NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su) > +NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su, Instruction **loaded) > { > const TexInstruction::ImgFormatDesc *format = su->tex.format; > int width = format->bits[0] + format->bits[1] + > @@ -2223,21 +2226,38 @@ NVC0LoweringPass::convertSurfaceFormat(TexInstruction *su) > if (width < 32) > untypedDst[0] = bld.getSSA(); > > - for (int i = 0; i < 4; i++) { > - typedDst[i] = su->getDef(i); > + if (loaded && loaded[0]) { > + for (int i = 0; i < 4; i++) { > + if (loaded[i]) > + typedDst[i] = loaded[i]->getDef(0); > + } > + } else { > + for (int i = 0; i < 4; i++) { > + typedDst[i] = su->getDef(i); > + } > } > > // Set the untyped dsts as the su's destinations > - for (int i = 0; i < 4; i++) > - su->setDef(i, untypedDst[i]); > + if (loaded && loaded[0]) { > + for (int i = 0; i < 4; i++) > + if (loaded[i]) > + loaded[i]->setDef(0, untypedDst[i]); > + } else { > + for (int i = 0; i < 4; i++) > + su->setDef(i, untypedDst[i]); > > - bld.setPosition(su, true); > + bld.setPosition(su, true); > + } > > // Unpack each component into the typed dsts > int bits = 0; > for (int i = 0; i < 4; bits += format->bits[i], i++) { > if (!typedDst[i]) > continue; > + > + if (loaded && loaded[0]) > + bld.setPosition(loaded[i], true); > + > if (i >= format->components) { > if (format->type == FLOAT || > format->type == UNORM || > @@ -2308,7 +2328,7 @@ NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su) > processSurfaceCoordsNVE4(su); > > if (su->op == OP_SULDP) { > - convertSurfaceFormat(su); > + convertSurfaceFormat(su, NULL); > insertOOBSurfaceOpResult(su); > } > > @@ -2421,7 +2441,7 @@ NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su) > processSurfaceCoordsNVC0(su); > > if (su->op == OP_SULDP) { > - convertSurfaceFormat(su); > + convertSurfaceFormat(su, NULL); > insertOOBSurfaceOpResult(su); > } > > @@ -2463,14 +2483,16 @@ NVC0LoweringPass::handleSurfaceOpNVC0(TexInstruction *su) > } > } > > -void > -NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su) > +TexInstruction * > +NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su, Instruction *ret[4]) > { > const int slot = su->tex.r; > const int dim = su->tex.target.getDim(); > - const int arg = dim + (su->tex.target.isArray() || su->tex.target.isCube()); > + const bool array = su->tex.target.isArray() || su->tex.target.isCube(); > + const int arg = dim + array; > Value *ind = su->getIndirectR(); > Value *handle; > + Instruction *pred = NULL, *pred2d = NULL; > int pos = 0; > > bld.setPosition(su, false); > @@ -2489,67 +2511,153 @@ NVC0LoweringPass::processSurfaceCoordsGM107(TexInstruction *su) > assert(pos == 0); > break; > } > + > + if (dim == 2 && !array) { > + // This might be a 2d slice of a 3d texture, try to load the z > + // coordinate in. > + Value *v; > + if (!su->tex.bindless) > + v = loadSuInfo32(ind, slot, NVC0_SU_INFO_UNK1C, su->tex.bindless); > + else > + v = bld.mkOp2v(OP_SHR, TYPE_U32, bld.getSSA(), ind, bld.mkImm(11)); > + Value *is_3d = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), v, bld.mkImm(1)); > + pred2d = bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), > + TYPE_U32, bld.mkImm(0), is_3d); > + > + bld.mkOp2(OP_SHR, TYPE_U32, v, v, bld.loadImm(NULL, 16)); > + su->moveSources(dim, 1); > + su->setSrc(dim, v); > + su->tex.target = nv50_ir::TEX_TARGET_3D; > + pos++; > + } > + > if (su->tex.bindless) > handle = ind; > else > handle = loadTexHandle(ind, slot + 32); > + > su->setSrc(arg + pos, handle); > > // The address check doesn't make sense here. The format check could make > // sense but it's a bit of a pain. > - if (su->tex.bindless) > - return; > + if (!su->tex.bindless) { > + // prevent read fault when the image is not actually bound > + pred = > + bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), > + TYPE_U32, bld.mkImm(0), > + loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless)); > + if (su->op != OP_SUSTP && su->tex.format) { > + const TexInstruction::ImgFormatDesc *format = su->tex.format; > + int blockwidth = format->bits[0] + format->bits[1] + > + format->bits[2] + format->bits[3]; > + > + assert(format->components != 0); > + // make sure that the format doesn't mismatch when it's not FMT_NONE > + bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0), > + TYPE_U32, bld.loadImm(NULL, blockwidth / 8), > + loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless), > + pred->getDef(0)); > + } > + } > > - // prevent read fault when the image is not actually bound > - CmpInstruction *pred = > - bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE), > - TYPE_U32, bld.mkImm(0), > - loadSuInfo32(ind, slot, NVC0_SU_INFO_ADDR, su->tex.bindless)); > - if (su->op != OP_SUSTP && su->tex.format) { > - const TexInstruction::ImgFormatDesc *format = su->tex.format; > - int blockwidth = format->bits[0] + format->bits[1] + > - format->bits[2] + format->bits[3]; > + // Now we have "pred" which (optionally) contains whether to do the surface > + // op at all, and a "pred2d" which indicates that, in case of doing the > + // surface op, we have to create a 2d and 3d version, conditioned on pred2d. > + TexInstruction *su2d = NULL; > + if (pred2d) { > + su2d = cloneForward(func, su)->asTex(); > + for (unsigned i = 0; su->defExists(i); ++i) > + su2d->setDef(i, bld.getSSA()); > + su2d->moveSources(dim + 1, -1); > + su2d->tex.target = nv50_ir::TEX_TARGET_2D; > + } > + if (pred2d && pred) { > + Instruction *pred3d = bld.mkOp2(OP_AND, TYPE_U8, > + bld.getSSA(1, FILE_PREDICATE), > + pred->getDef(0), pred2d->getDef(0)); > + pred3d->src(0).mod = Modifier(NV50_IR_MOD_NOT); > + pred3d->src(1).mod = Modifier(NV50_IR_MOD_NOT); > + su->setPredicate(CC_P, pred3d->getDef(0)); > + pred2d = bld.mkOp2(OP_AND, TYPE_U8, bld.getSSA(1, FILE_PREDICATE), > + pred->getDef(0), pred2d->getDef(0)); > + pred2d->src(0).mod = Modifier(NV50_IR_MOD_NOT); > + } else if (pred) { > + su->setPredicate(CC_NOT_P, pred->getDef(0)); > + } else if (pred2d) { > + su->setPredicate(CC_NOT_P, pred2d->getDef(0)); > + } > + if (su2d) { > + su2d->setPredicate(CC_P, pred2d->getDef(0)); > + bld.insert(su2d); > + > + // Create a UNION so that RA assigns the same registers > + bld.setPosition(su, true); > + for (unsigned i = 0; su->defExists(i); ++i) { > + assert(i < 4); > > - assert(format->components != 0); > - // make sure that the format doesn't mismatch when it's not FMT_NONE > - bld.mkCmp(OP_SET_OR, CC_NE, TYPE_U32, pred->getDef(0), > - TYPE_U32, bld.loadImm(NULL, blockwidth / 8), > - loadSuInfo32(ind, slot, NVC0_SU_INFO_BSIZE, su->tex.bindless), > - pred->getDef(0)); > + ValueDef &def = su->def(i); > + ValueDef &def2 = su2d->def(i); > + Instruction *mov = NULL; > + > + if (pred) { > + mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0)); > + mov->setPredicate(CC_P, pred->getDef(0)); > + } > + > + Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32, > + bld.getSSA(), > + NULL, def2.get()); > + def.replace(uni->getDef(0), false); > + uni->setSrc(0, def.get()); > + if (mov) > + uni->setSrc(2, mov->getDef(0)); > + } > + } else if (pred) { > + // Create a UNION so that RA assigns the same registers > + bld.setPosition(su, true); > + for (unsigned i = 0; su->defExists(i); ++i) { > + assert(i < 4); > + > + ValueDef &def = su->def(i); > + > + Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0)); > + mov->setPredicate(CC_P, pred->getDef(0)); > + > + Instruction *uni = ret[i] = bld.mkOp2(OP_UNION, TYPE_U32, > + bld.getSSA(), > + NULL, mov->getDef(0)); > + def.replace(uni->getDef(0), false); > + uni->setSrc(0, def.get()); > + } > } > - su->setPredicate(CC_NOT_P, pred->getDef(0)); > + > + return su2d; > } > > void > NVC0LoweringPass::handleSurfaceOpGM107(TexInstruction *su) > { > - processSurfaceCoordsGM107(su); > + // processSurfaceCoords also takes care of fixing up the outputs and > + // union'ing them with 0 as necessary. Additionally it may create a second > + // surface which needs some of the similar fixups. > + > + Instruction *loaded[4] = {}; > + TexInstruction *su2 = processSurfaceCoordsGM107(su, loaded); > > if (su->op == OP_SULDP) { > - convertSurfaceFormat(su); > - insertOOBSurfaceOpResult(su); > + convertSurfaceFormat(su, loaded); > } > > if (su->op == OP_SUREDP) { > - Value *def = su->getDef(0); > - > su->op = OP_SUREDB; > + } > > - // There may not be a predicate in the bindless case. > - if (su->getPredicate()) { > - su->setDef(0, bld.getSSA()); > - > - bld.setPosition(su, true); > - > - // make sure to initialize dst value when the atomic operation is not > - // performed > - Instruction *mov = bld.mkMov(bld.getSSA(), bld.loadImm(NULL, 0)); > - > - assert(su->cc == CC_NOT_P); > - mov->setPredicate(CC_P, su->getPredicate()); > - > - bld.mkOp2(OP_UNION, TYPE_U32, def, su->getDef(0), mov->getDef(0)); > - } > + // If we fixed up the type of the regular surface load instruction, we also > + // have to fix up the copy. > + if (su2) { > + su2->op = su->op; > + su2->dType = su->dType; > + su2->sType = su->sType; > } > } > > diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h > index 0ce2a4b80f8..b4c405a9ea5 100644 > --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h > +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h > @@ -171,10 +171,10 @@ private: > Value *loadMsInfo32(Value *ptr, uint32_t off); > > void adjustCoordinatesMS(TexInstruction *); > - void processSurfaceCoordsGM107(TexInstruction *); > + TexInstruction *processSurfaceCoordsGM107(TexInstruction *, Instruction *[4]); > void processSurfaceCoordsNVE4(TexInstruction *); > void processSurfaceCoordsNVC0(TexInstruction *); > - void convertSurfaceFormat(TexInstruction *); > + void convertSurfaceFormat(TexInstruction *, Instruction **); > void insertOOBSurfaceOpResult(TexInstruction *); > Value *calculateSampleOffset(Value *sampleID); > > diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c > index f62e508258b..4948a8f4cea 100644 > --- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c > +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c > @@ -1433,7 +1433,15 @@ gm107_create_image_handle(struct pipe_context *pipe, > > nvc0->screen->tic.lock[tic->id / 32] |= 1 << (tic->id % 32); > > - return 0x100000000ULL | tic->id; > + // Compute handle. This will include the TIC as well as some additional > + // info regarding the bound 3d surface layer, if applicable. > + uint64_t handle = 0x100000000ULL | tic->id; > + struct nv04_resource *res = nv04_resource(view->resource); > + if (res->base.target == PIPE_TEXTURE_3D) { > + handle |= 1 << 11; > + handle |= view->u.tex.first_layer << (11 + 16); > + } > + return handle; > > fail: > FREE(tic); > -- > 2.21.0 > > _______________________________________________ > Nouveau mailing list > Nouveau@xxxxxxxxxxxxxxxxxxxxx > https://lists.freedesktop.org/mailman/listinfo/nouveau _______________________________________________ Nouveau mailing list Nouveau@xxxxxxxxxxxxxxxxxxxxx https://lists.freedesktop.org/mailman/listinfo/nouveau