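Several programs can be run concurrently within a single job, each bound to its own set of CPU cores and its own GPU, as follows.
In this example, a.out uses CPU cores 0-6 and GPU 0, b.out uses CPU cores 7-13 and GPU 1, c.out uses CPU cores 14-20 and GPU 2, and d.out uses CPU cores 21-27 and GPU 3.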
#!/bin/bash
#$ -cwd
#$ -V
#$ -l f_node=1
#$ -l h_rt=00:30:00

# Run four programs concurrently, each bound to its own block of seven CPU
# cores and to a single GPU:
#   program i -> CPU cores 7*i .. 7*i+6, GPU i
# bash (not plain sh) is required because the program list is an array.

programs=(./a.out ./b.out ./c.out ./d.out)
cores_per_program=7

pids=()
for i in "${!programs[@]}"; do
  first_core=$((i * cores_per_program))
  last_core=$((first_core + cores_per_program - 1))
  # Set CUDA_VISIBLE_DEVICES for this one command only, so the value does not
  # leak into the script's own environment or into later commands.
  CUDA_VISIBLE_DEVICES="$i" \
    numactl -C "${first_core}-${last_core}" "${programs[i]}" &
  pids+=("$!")
done

# Wait for each program individually: a bare `wait` always returns 0 and
# would hide a program that failed or could not be started.
status=0
for pid in "${pids[@]}"; do
  wait "$pid" || status=$?
done

# The script's exit status is that of its last command, so this makes the job
# finish with a non-zero status if any program failed.
[ "$status" -eq 0 ]