#!/usr/bin/awk -f
BEGIN {
FS = "[_.]"
}
function display() {
if (length(gene_ids) > 1)
for (j=0; j <= i; j++)
print a[j]
}
{
if (/^>Cluster /) {
display()
delete a
delete gene_ids
a[i=0] = $0
} else {
a[++i] = $0
gene_ids[$7] = 1
}
}
END {
display()
}>Cluster 0
0 3843aa, >9606_9d1c13f4f2796e1bc5d9c034d256608e_ENSP00000478752_3843_318_ENST00000621744_ENSG00000286185... *
1 3843aa, >9606_9d1c13f4f2796e1bc5d9c034d256608e_ENSP00000498781_3843_318_ENST00000651566_ENSG00000271383... at 1:3843:1:3843/100.00%
>Cluster 17
0 1388aa, >9606_e3f5b4b466cd2bae95842b586d4d5ff5_ENSP00000419786_1388_4_ENST00000465301_ENSG00000243978... *
1 1388aa, >9606_e3f5b4b466cd2bae95842b586d4d5ff5_ENSP00000441452_1388_4_ENST00000540313_ENSG00000243978... at 1:1388:1:1388/100.00%
>Cluster 34
0 1150aa, >9606_c6fca1c116a00dbb0d2e8930f4056625_ENSP00000353655_1150_26_ENST00000360468_ENSG00000196547... *
1 1150aa, >9606_c6fca1c116a00dbb0d2e8930f4056625_ENSP00000452948_1150_26_ENST00000559717_ENSG00000196547... at 1:1150:1:1150/100.00%
>Cluster 39
0 1072aa, >9606_64cead9c681fd594c83c17cc06748bb6_ENSP00000315112_1072_50_ENST00000324103_ENSG00000092098... *
1 1072aa, >9606_64cead9c681fd594c83c17cc06748bb6_ENSP00000457512_1072_50_ENST00000558468_ENSG00000259529... at 1:1072:1:1072/100.00%
>Cluster 271
0 551aa, >9606_95dbfd3f219d32f1cc1074a79bfc576d_ENSP00000415200_551_42_ENST00000429354_ENSG00000268500... *
1 551aa, >9606_95dbfd3f219d32f1cc1074a79bfc576d_ENSP00000470259_551_42_ENST00000599649_ENSG00000268500... at 1:551:1:551/100.00%
2 551aa, >9606_95dbfd3f219d32f1cc1074a79bfc576d_ENSP00000473238_551_42_ENST00000534261_ENSG00000105501... at 1:551:1:551/100.00%
>Cluster 284
0 547aa, >9606_8ed59e1e16a1229b55495ff661b5aa66_ENSP00000354675_547_9_ENST00000361229_ENSG00000198908... *
1 547aa, >9606_8ed59e1e16a1229b55495ff661b5aa66_ENSP00000361820_547_9_ENST00000372735_ENSG00000198908... at 1:547:1:547/100.00%
2 547aa, >9606_8ed59e1e16a1229b55495ff661b5aa66_ENSP00000391722_547_9_ENST00000448867_ENSG00000198908... at 1:547:1:547/100.00%
3 547aa, >9606_8ed59e1e16a1229b55495ff661b5aa66_ENSP00000403226_547_9_ENST00000457056_ENSG00000198908... at 1:547:1:547/100.00%
4 547aa, >9606_8ed59e1e16a1229b55495ff661b5aa66_ENSP00000405893_547_9_ENST00000447531_ENSG00000198908... at 1:547:1:547/100.00%>Cluster 0
0 3843aa, >9606_9d1c13f4f2796e1bc5d9c034d256608e_ENSP00000478752_3843_318_ENST00000621744_ENSG00000286185... *
1 3843aa, >9606_9d1c13f4f2796e1bc5d9c034d256608e_ENSP00000498781_3843_318_ENST00000651566_ENSG00000271383... at 1:3843:1:3843/100.00%
>Cluster 39
0 1072aa, >9606_64cead9c681fd594c83c17cc06748bb6_ENSP00000315112_1072_50_ENST00000324103_ENSG00000092098... *
1 1072aa, >9606_64cead9c681fd594c83c17cc06748bb6_ENSP00000457512_1072_50_ENST00000558468_ENSG00000259529... at 1:1072:1:1072/100.00%
>Cluster 271
0 551aa, >9606_95dbfd3f219d32f1cc1074a79bfc576d_ENSP00000415200_551_42_ENST00000429354_ENSG00000268500... *
1 551aa, >9606_95dbfd3f219d32f1cc1074a79bfc576d_ENSP00000470259_551_42_ENST00000599649_ENSG00000268500... at 1:551:1:551/100.00%
2 551aa, >9606_95dbfd3f219d32f1cc1074a79bfc576d_ENSP00000473238_551_42_ENST00000534261_ENSG00000105501... at 1:551:1:551/100.00%(FILENAME=test_cluster FNR=1) fatal: attempt to use scalar `gene_ids' as an arraylength(array)相关:awk 'BEGIN{a[1]=10;a[2]=20;print length(a)}'如建议的那样这里
posix变量的状态:set -o | grep posix发布于 2021-08-31 19:33:03
length(gene_ids)将gene_ids声明为标量,如果gene_ids以前未使用,因为过去length()只用于字符串(在即将发布的gawk版本中,这种行为将发生变化,如果以前未设置,length()将不会设置参数的类型)。
将delete gene_ids添加到BEGIN部分,将其声明为数组,而不管脚本中现有行的命中顺序如何,这是由输入数据驱动的:
$ awk 'BEGIN{ length(gene_ids); gene_ids[1] }'
awk: cmd. line:1: fatal: attempt to use scalar `gene_ids' as an array
$ awk 'BEGIN{ delete gene_ids; length(gene_ids); gene_ids[1] }'
$https://stackoverflow.com/questions/69004344
复制相似问题