DIF being such an evergreen, I tried to develop on the nice heatmap we have in dexter. Using the dichotomized version of the verbal aggression data, the current version produces the following:

library(dexter)
dich = dexter::verbAggrRules             
dich$item_score[dich$item_score==2] = 1  # dichotomize the items
db = dexter::start_new_project(dich, ":memory:",
                               person_properties=list(gender="unknown"))
dexter::add_booklet(db, dexter::verbAggrData, "agg")
## no column `person_id` provided, automatically generating unique person id's
## $items
##  [1] "S1DoCurse"   "S1DoScold"   "S1DoShout"   "S1WantCurse" "S1WantScold"
##  [6] "S1WantShout" "S2DoCurse"   "S2DoScold"   "S2DoShout"   "S2WantCurse"
## [11] "S2WantScold" "S2WantShout" "S3DoCurse"   "S3DoScold"   "S3DoShout"  
## [16] "S3WantCurse" "S3WantScold" "S3WantShout" "S4DoCurse"   "S4DoScold"  
## [21] "S4DoShout"   "S4WantCurse" "S4WantScold" "S4WantShout"
## 
## $person_properties
## [1] "gender"
## 
## $columns_ignored
## [1] "anger"
d = dexter::DIF(db, 'gender')
plot(d)

A detailed explanation is given in the Psycho paper. The plot has two limitations: it does not have an option to cluster rows and columns, and it can look crisp or blurred depending on where and how it is reproduced. The first of these limitations will most probably be removed in the next version of dexter. For crisp images, I rather like package heatmap but adding it to dexter would add to an already largish number of dependencies. As a compromise, I have written this little function:

DIFplot = function(d, 
  what=c('distances','statistics','pvalues','significance'),
  pam=c("none", "holm", "hochberg", "hommel", "bonferroni", "BH", "BY",
     "fdr"), cluster = TRUE, alpha=0.05, ...) {
  what = match.arg(what)
  pam = match.arg(pam)
  alpha = alpha/2
  dist = abs(d$Delta_R)
  o = hclust(dist(dist))$order
  lbl = d$items$item_id
  stat = abs(d$DIF_pair)
  outl = lbl[o]
  if (cluster) {stat=stat[o,o]; lbl=lbl[o]}
  pval = 1 - pnorm(stat)
  u0 = pval[lower.tri(pval)]
  u1 = p.adjust(u0, method=pam)
  pval[lower.tri(pval)] = u1

  if(what=='distances') {
    rownames(dist) = colnames(dist) = d$items$item_id
    if (cluster) {
        pheatmap::pheatmap(dist, main='PDIF: raw differences', ...)
      } else {
        pheatmap::pheatmap(dist, main='PDIF: raw differences', cluster_rows=FALSE, cluster_cols=FALSE)
      }
  }
  if(what=='statistics') {
    rownames(stat) = colnames(stat) = lbl
    diag(stat) = NA
    pheatmap::pheatmap(stat, main='PDIF: standardized differences', cluster_rows=FALSE, cluster_cols=FALSE)
  } 
  if(what=='pvalues') {
    rownames(pval) = colnames(pval) = lbl
    diag(pval) = NA
    ttl = 'PDIF: p-values'
    if (pam != 'none') ttl=paste0(ttl, ' (below diagonal adjusted by ',pam,')')
    pheatmap::pheatmap(pval,
      main = ttl,
      cluster_rows=FALSE, cluster_cols=FALSE,
      color = colorRampPalette(RColorBrewer::brewer.pal(n=7, name="RdYlBu"))(100))
  }
  if(what=='significance') {
    ttl = paste0('PDIF: significance at alpha=',2*alpha)
    if (pam != 'none') ttl=paste0(ttl, ' (b/d adjusted by ',pam,')')
    v = (0 + (pval<alpha))
    rownames(v) = colnames(v) = lbl
    diag(v) = NA
    pheatmap::pheatmap(v, main=ttl, cluster_rows=FALSE, cluster_cols=FALSE, legend=FALSE)
  }
  return(outl)
}

The original plot (except for the clever coloring scheme) can be reproduced with:

DIFplot(d, what='statistics', cluster=FALSE)

##  [1] "S2WantShout" "S1DoShout"   "S3WantCurse" "S1WantScold" "S1WantShout"
##  [6] "S4DoShout"   "S1WantCurse" "S2WantScold" "S4WantCurse" "S3WantShout"
## [11] "S2WantCurse" "S4WantShout" "S2DoCurse"   "S2DoScold"   "S1DoScold"  
## [16] "S3DoCurse"   "S3DoScold"   "S3DoShout"   "S3WantScold" "S4DoCurse"  
## [21] "S4DoScold"   "S1DoCurse"   "S2DoShout"   "S4WantScold"

and the alternative plot in the Psycho paper with:

pl = DIFplot(d, what='distances', cluster=TRUE)

As argued in the paper, it is more logical to do the clustering on the raw differences (called Delta_R in the dexter object and distances here) than on the standardized differences (DIF_pair in the dexter object and statistics here). However, we can easily transfer the row and column ordering produced by clustering Delta_R to any other plot. Since this ordering is returned by the DIF_plot function, we can use it to easily rearrange the original heatmap:

plot(d, itemsX=pl, itemsY=pl)

Along the same logic, we can produce clustered (and, of course, unclustered) heatmaps of the corresponding p-values. In the following plot, the p-values are shown ‘as is’ above the main diagonal, and adjusted for multiple comparisons by the Bonferroni method below the main diagonal:

DIFplot(d, what='pvalues', pam='bonferroni', cluster=TRUE)

##  [1] "S2WantShout" "S1DoShout"   "S3WantCurse" "S1WantScold" "S1WantShout"
##  [6] "S4DoShout"   "S1WantCurse" "S2WantScold" "S4WantCurse" "S3WantShout"
## [11] "S2WantCurse" "S4WantShout" "S2DoCurse"   "S2DoScold"   "S1DoScold"  
## [16] "S3DoCurse"   "S3DoScold"   "S3DoShout"   "S3WantScold" "S4DoCurse"  
## [21] "S4DoScold"   "S1DoCurse"   "S2DoShout"   "S4WantScold"

pam stands for ‘p-value adjustment method’ and simply passes its value to function p.adjust. We can also see which adjusted and non-adjusted p-values are below a certain threshold:

DIFplot(d, what='significance', pam='hochberg', alpha=0.05)

##  [1] "S2WantShout" "S1DoShout"   "S3WantCurse" "S1WantScold" "S1WantShout"
##  [6] "S4DoShout"   "S1WantCurse" "S2WantScold" "S4WantCurse" "S3WantShout"
## [11] "S2WantCurse" "S4WantShout" "S2DoCurse"   "S2DoScold"   "S1DoScold"  
## [16] "S3DoCurse"   "S3DoScold"   "S3DoShout"   "S3WantScold" "S4DoCurse"  
## [21] "S4DoScold"   "S1DoCurse"   "S2DoShout"   "S4WantScold"

I am not quite sure whether it is necessary to adjust the p-values for multiple comparisons given the presence of an overall DIF test. However, since the overall test almost invariably turns out significant, any reasonably objective method to sort out the large deviations from the more trivial ones may be welcome in practice. I leave it to the reader to decide which adjustment method is most appropriate, bearing in mind that the p-values are not necessarily independent.