我目前正在编写一个API来处理生物信息SAM记录。下面是一个例子:
SBL_XSBF463_ID:3230017:BCR1:GCATAA:BCR2:CATATA/1:vpe 97 hs07 38253395 3 30M = 38330420 77055 TTGTTCCACTGCCAAAGAGTTTCTTATAAT EEEEEEEEEEEEAEEEEEEEEEEEEEEEEE PG:Z:novoalign AS:i:0 UQ:i:0 NM:i:0 MD:Z:30 ZS:Z:R NH:i:2 HI:i:1 IH:i:1
由制表符分隔的每一段信息都是它自己的字段,并对应于某种类型的数据。
现在,需要注意的是,这些文件会变得非常大(数十GB),因此,如果把每条记录都完整解析并实例化为某种POJO,效率会很低。
因此,我决定创建一个具有延迟加载机制的对象。只有原始字符串被存储,直到某个调用代码请求其中一个字段。这将使创建对象时完成的工作量最小化,并将对象占用的内存量降到最低。
以下是我的尝试:
/** Class for storing and working with sam formatted DNA sequence.
*
* Upon construction, only the String record is stored.
* All querying of fields is done on demand, to save time.
*
*/
public class SamRecord implements Record {
/** Raw, unparsed record line; the only state populated by the constructor. */
private final String read;

// Lazily filled caches, one per field of the record.
// Reference-typed caches start as null and int caches start as -1; both mean "not parsed yet".
private String id;
private int flag = -1;
private String referenceName;
private int pos = -1;
private int mappingQuality = -1;
private String cigar;
private String mateReferenceName;
private int matePosition = -1;
private int templateLength = -1;
private String sequence;
private String quality;
private String variableTerms;

// Marker substrings searched for anywhere in the raw record by the is*() predicates.
private static final String REPEAT_TERM = "ZS:Z:R";
private static final String MATCH_TERM = "ZS:Z:NM";
private static final String QUALITY_CHECK_TERM = "ZS:Z:QC";
/**
 * Wraps a raw record line without parsing it.
 *
 * @param read the complete, tab-delimited record text; stored as-is
 */
public SamRecord(String read) {
    this.read = read;
}
/**
 * Returns the raw record exactly as it was passed to the constructor.
 *
 * @return the unparsed record text
 */
public String getRead() {
    return this.read;
}
/**
 * {@inheritDoc}
 *
 * <p>Extracted from the raw record on the first call and cached afterwards.
 */
@Override
public String getId() {
    if (id != null) {
        return id;
    }
    id = XsamReadQueries.findID(read);
    return id;
}
/**
 * {@inheritDoc}
 *
 * <p>Parsed from the second tab-delimited field (index 1) on the first call and cached.
 *
 * @throws NumberFormatException if that field is not a valid integer
 */
@Override
public int getFlag() throws NumberFormatException {
    // -1 marks "not parsed yet".
    if (flag != -1) {
        return flag;
    }
    final String raw = XsamReadQueries.findElement(read, 1);
    flag = Integer.parseInt(raw);
    return flag;
}
/**
 * {@inheritDoc}
 *
 * <p>Resolved through {@code XsamReadQueries.findReferneceName} on the first call and cached.
 */
@Override
public String getReferenceName() {
    if (referenceName != null) {
        return referenceName;
    }
    referenceName = XsamReadQueries.findReferneceName(read);
    return referenceName;
}
/**
 * {@inheritDoc}
 *
 * <p>Parsed from the tab-delimited field at index 3 on the first call and cached.
 *
 * @throws NumberFormatException if that field is not a valid integer
 */
@Override
public int getPos() throws NumberFormatException {
    // -1 marks "not parsed yet".
    if (pos != -1) {
        return pos;
    }
    final String raw = XsamReadQueries.findElement(read, 3);
    pos = Integer.parseInt(raw);
    return pos;
}
/**
 * {@inheritDoc}
 *
 * <p>Parsed from the tab-delimited field at index 4 on the first call and cached.
 *
 * @throws NumberFormatException if that field is not a valid integer
 */
@Override
public int getMappingQuality() throws NumberFormatException {
    // -1 marks "not parsed yet".
    if (mappingQuality != -1) {
        return mappingQuality;
    }
    final String raw = XsamReadQueries.findElement(read, 4);
    mappingQuality = Integer.parseInt(raw);
    return mappingQuality;
}
/**
 * {@inheritDoc}
 *
 * <p>Extracted via {@code XsamReadQueries.findCigar} on the first call and cached.
 */
@Override
public String getCigar() {
    if (cigar != null) {
        return cigar;
    }
    cigar = XsamReadQueries.findCigar(read);
    return cigar;
}
/**
 * {@inheritDoc}
 *
 * <p>Taken from the tab-delimited field at index 6 on the first call and cached.
 */
@Override
public String getMateReferenceName() {
    if (mateReferenceName != null) {
        return mateReferenceName;
    }
    mateReferenceName = XsamReadQueries.findElement(read, 6);
    return mateReferenceName;
}
/**
 * {@inheritDoc}
 *
 * <p>Parsed from the tab-delimited field at index 7 on the first call and cached.
 *
 * @throws NumberFormatException if that field is not a valid integer
 */
@Override
public int getMatePosition() throws NumberFormatException {
    // -1 marks "not parsed yet".
    if (matePosition != -1) {
        return matePosition;
    }
    final String raw = XsamReadQueries.findElement(read, 7);
    matePosition = Integer.parseInt(raw);
    return matePosition;
}
/**
 * {@inheritDoc}
 *
 * <p>Parsed from the tab-delimited field at index 8 on the first call and cached.
 *
 * @throws NumberFormatException if that field is not a valid integer
 */
@Override
public int getTemplateLength() throws NumberFormatException {
    // -1 marks "not parsed yet"; a record whose real value is -1 is therefore
    // re-parsed on every call (still correct, just never cached).
    if (templateLength != -1) {
        return templateLength;
    }
    final String raw = XsamReadQueries.findElement(read, 8);
    templateLength = Integer.parseInt(raw);
    return templateLength;
}
/**
 * {@inheritDoc}
 *
 * <p>Extracted via {@code XsamReadQueries.findBaseSequence} on the first call and cached.
 */
@Override
public String getSequence() {
    if (sequence != null) {
        return sequence;
    }
    sequence = XsamReadQueries.findBaseSequence(read);
    return sequence;
}
/**
 * {@inheritDoc}
 *
 * <p>Extracted on the first call and cached. Goes through the dedicated
 * {@code XsamReadQueries.findPhred} helper (field index 10), matching how the
 * id, cigar and sequence getters use their dedicated helpers.
 */
@Override
public String getQuality() {
    if (quality == null) {
        quality = XsamReadQueries.findPhred(read);
    }
    return quality;
}
/**
 * {@inheritDoc}
 *
 * <p>True when the raw record contains the text {@code ZS:Z:R} anywhere.
 */
@Override
public boolean isRepeat() {
    return read.indexOf(REPEAT_TERM) >= 0;
}
/**
 * {@inheritDoc}
 *
 * <p>True when the raw record does not contain the text {@code ZS:Z:NM} anywhere.
 */
@Override
public boolean isMapped() {
    return read.indexOf(MATCH_TERM) < 0;
}
/**
 * {@inheritDoc}
 *
 * <p>Extracted via {@code XsamReadQueries.findVariableRegionSequence} on the first call
 * and cached.
 */
@Override
public String getVariableTerms() {
    if (variableTerms != null) {
        return variableTerms;
    }
    variableTerms = XsamReadQueries.findVariableRegionSequence(read);
    return variableTerms;
}
/**
 * {@inheritDoc}
 *
 * <p>True when the raw record contains the text {@code ZS:Z:QC} anywhere.
 */
@Override
public boolean isQualityFailed() {
    return read.indexOf(QUALITY_CHECK_TERM) >= 0;
}
/**
 * Two records are equal when they are of exactly the same class and wrap equal raw text.
 * Lazily cached fields take no part in the comparison.
 */
@Override
public boolean equals(Object o) {
    if (o == this) {
        return true;
    }
    if (o == null) {
        return false;
    }
    if (getClass() != o.getClass()) {
        return false;
    }
    final SamRecord other = (SamRecord) o;
    return Objects.equals(this.read, other.read);
}
/**
 * Hash derived from the raw record text only, consistent with {@link #equals(Object)}.
 */
@Override
public int hashCode() {
    // Same value as Objects.hash(read): 31 * 1 + hash of the single element.
    final int readHash = (read == null) ? 0 : read.hashCode();
    return 31 + readHash;
}
/**
 * Returns the raw record text unchanged.
 */
@Override
public String toString() {
    return this.read;
}
}这些字段由助手类中的静态方法返回,该方法通过查看制表符的位置来检索它们。即flag = Integer.parseInt(XsamReadQueries.findElement(read, 1));
下面是XsamReadQueries类
/**
* Non-instantiable utility class for working with Xsam reads
*/
public final class XsamReadQueries {
/** Not instantiable: this class only hosts static helpers. Always throws. */
private XsamReadQueries() {
    throw new AssertionError();
}
/**
 * Locates where the variable region of a record begins.
 *
 * <p>Tabs are counted from the start of the record. From the 11th tab onwards, the first
 * tab whose following character exists and is neither {@code 'x'} nor another tab marks
 * the start; fields beginning with {@code 'x'} and empty fields are skipped.
 *
 * @param read whole sam or Xsam read to search
 * @return index of the first character <em>after</em> that tab, or -1 if there is none
 */
public static int findVariableRegionStart(String read) {
    final int length = read.length();
    int tabsSeen = 0;
    int tab = read.indexOf('\t');
    while (tab >= 0) {
        tabsSeen++;
        final int next = tab + 1;
        if (tabsSeen >= 11 && next < length) {
            final char first = read.charAt(next);
            // Skip 'x'-prefixed fields and guard against double tabs.
            if (first != 'x' && first != '\t') {
                return next;
            }
        }
        tab = read.indexOf('\t', next);
    }
    return -1;
}
/**
 * Extracts the library name from an SBL read identifier of the form
 * {@code SBL_LibraryName_ID:XXXXX}.
 *
 * <p>The library name is the text between the first underscore and the next underscore
 * or tab. If it ends with a lower-case ASCII letter, that single letter is removed.
 *
 * <p>If the input starts with {@code "SBL"} but no non-empty, properly terminated library
 * name can be found, the text before the first tab (or the whole input when it contains
 * no tab) is returned instead.
 *
 * @param ID identifier, or a whole record beginning with the identifier
 * @return the library name; the leading identifier text for malformed SBL input; or
 *         {@code ""} when the input does not start with {@code "SBL"}
 * @throws NullPointerException if {@code ID} is null
 */
public static String findLibraryName(String ID) {
    if (!ID.startsWith("SBL")) {
        return "";
    }
    final int length = ID.length();
    final int start = ID.indexOf('_') + 1; // 0 when there is no underscore at all
    if (start > 0) {
        int end = start;
        while (end < length && ID.charAt(end) != '_' && ID.charAt(end) != '\t') {
            end++;
        }
        // A usable name is non-empty and explicitly terminated by '_' or a tab.
        if (end > start && end < length) {
            final char last = ID.charAt(end - 1);
            final boolean lowerCaseSuffix = last >= 'a' && last <= 'z';
            return ID.substring(start, lowerCaseSuffix ? end - 1 : end);
        }
    }
    // Malformed identifier: fall back to everything before the first tab.
    final int tab = ID.indexOf('\t');
    return tab < 0 ? ID : ID.substring(0, tab);
}
/**
 * Returns the read ID, i.e. the first tab-delimited field (index 0) of the record.
 *
 * @param sample Xsam read
 * @return ID
 */
public static String findID(String sample) {
    final int idColumn = 0;
    return findElement(sample, idColumn);
}
/**
 * Returns the phred quality string, i.e. the tab-delimited field at index 10.
 *
 * @param sample Xsam read
 * @return phred string
 */
public static String findPhred(String sample) {
    final int phredColumn = 10;
    return findElement(sample, phredColumn);
}
/**
 * Returns the cigar string, i.e. the tab-delimited field at index 5.
 *
 * @param sample read
 * @return cigar string
 */
public static String findCigar(String sample) {
    final int cigarColumn = 5;
    return findElement(sample, cigarColumn);
}
/**
 * Returns the base sequence, i.e. the tab-delimited field at index 9.
 *
 * @param sample read
 * @return base string
 */
public static String findBaseSequence(String sample) {
    final int sequenceColumn = 9;
    return findElement(sample, sequenceColumn);
}
/**
 * Finds the n'th element (0-indexed) of a tab-delimited sample,
 * e.g. {@code findElement("one\ttwo", 0)} returns {@code "one"}.
 *
 * <p>The last element does not need a trailing tab, and an empty element (two adjacent
 * tabs, or a leading tab) is returned as {@code ""}.
 *
 * @param sample  String to search
 * @param element 0-based index of the element to find
 * @return the element text, or {@code ""} if {@code element} is negative or the sample
 *         has fewer than {@code element + 1} elements
 * @throws NullPointerException if {@code sample} is null
 */
public static String findElement(String sample, int element) {
    if (element < 0) {
        return "";
    }
    // Hop over exactly `element` tabs; `start` then points at the wanted element.
    int start = 0;
    for (int skipped = 0; skipped < element; skipped++) {
        final int tab = sample.indexOf('\t', start);
        if (tab < 0) {
            return ""; // not enough elements
        }
        start = tab + 1;
    }
    final int end = sample.indexOf('\t', start);
    return end < 0 ? sample.substring(start) : sample.substring(start, end);
}
/**
 * Returns the variable region of a record: everything from the index reported by
 * {@link #findVariableRegionStart(String)} to the end of the string.
 *
 * @param sample sam or Xsam record string
 * @return the variable region text, or {@code ""} if no variable region start is found
 */
public static String findVariableRegionSequence(String sample) {
    // Scan once and reuse the result instead of locating the start a second time.
    final int start = findVariableRegionStart(sample);
    return start == -1 ? "" : sample.substring(start);
}
/**
 * Reads the integer value of the {@code xL:i:} field.
 *
 * <p>The field is located by the first occurrence of {@code "\txL:i:"}; its value runs
 * up to the next tab, or to the end of the record when the field is the last one.
 *
 * @param sample String to search
 * @return the parsed value, or -1 if the field is absent or nothing follows the marker
 * @throws NumberFormatException if the field value is not a valid integer
 */
public static int findxLField(String sample) {
    final String marker = "\txL:i:";
    final int markerAt = sample.indexOf(marker);
    final int valueStart = markerAt + marker.length();
    if (markerAt < 0 || valueStart >= sample.length()) {
        return -1; // not found
    }
    int valueEnd = sample.indexOf('\t', valueStart);
    if (valueEnd < 0) {
        valueEnd = sample.length(); // last field: no trailing tab
    }
    return Integer.parseInt(sample.substring(valueStart, valueEnd));
}
/**
 * Reads the integer value of the {@code xR:i:} field.
 *
 * <p>The field is located by the first occurrence of {@code "\txR:i:"}; its value runs
 * up to the next tab, or to the end of the record when the field is the last one.
 *
 * <p>NOTE(review): "not found" is reported as 0 (historically written {@code '\0'}),
 * which cannot be told apart from a real value of 0. Kept for existing callers.
 *
 * @param sample String to search
 * @return the parsed value, or 0 if the field is absent or nothing follows the marker
 * @throws NumberFormatException if the field value is not a valid integer
 */
public static int findxRField(String sample) {
    final String marker = "\txR:i:";
    final int markerAt = sample.indexOf(marker);
    final int valueStart = markerAt + marker.length();
    if (markerAt < 0 || valueStart >= sample.length()) {
        return 0; // not found
    }
    int valueEnd = sample.indexOf('\t', valueStart);
    if (valueEnd < 0) {
        valueEnd = sample.length(); // last field: no trailing tab
    }
    return Integer.parseInt(sample.substring(valueStart, valueEnd));
}
/**
 * Reads the value of the {@code xLseq:i:} field.
 *
 * <p>The field is located by the first occurrence of {@code "\txLseq:i:"}; its value
 * runs up to the next tab, or to the end of the record when the field is the last one.
 *
 * @param sample String to search
 * @return the field value, or {@link Optional#empty()} if the field is absent or nothing
 *         follows the marker
 */
public static Optional<String> findxLSeqField(String sample) {
    final String marker = "\txLseq:i:";
    final int markerAt = sample.indexOf(marker);
    final int valueStart = markerAt + marker.length();
    if (markerAt < 0 || valueStart >= sample.length()) {
        return Optional.empty(); // not found
    }
    int valueEnd = sample.indexOf('\t', valueStart);
    if (valueEnd < 0) {
        valueEnd = sample.length(); // last field: no trailing tab
    }
    return Optional.of(sample.substring(valueStart, valueEnd));
}
/**
 * Finds the reference name, i.e. the field between the second and third tab.
 *
 * <p>If the field contains {@code '/'} (a path-like name), only the last non-empty
 * segment, as produced by {@code String.split("/")}, is returned. A missing third tab is
 * tolerated: the field then runs to the end of the record.
 *
 * <p>The misspelled method name is kept because callers depend on it.
 *
 * @param sample String to search
 * @return the reference name, or {@code ""} if the record has fewer than two tabs or the
 *         field consists only of slashes
 * @throws NullPointerException if {@code sample} is null
 */
public static String findReferneceName(String sample) {
    final int firstTab = sample.indexOf('\t');
    if (firstTab < 0) {
        return "";
    }
    final int secondTab = sample.indexOf('\t', firstTab + 1);
    if (secondTab < 0) {
        return "";
    }
    final int thirdTab = sample.indexOf('\t', secondTab + 1);
    final String field = thirdTab < 0
            ? sample.substring(secondTab + 1)
            : sample.substring(secondTab + 1, thirdTab);
    if (!field.contains("/")) {
        return field;
    }
    // split() drops trailing empty segments, so "a/b/" yields "b" and "/" yields nothing.
    final String[] segments = field.split("/");
    return segments.length == 0 ? "" : segments[segments.length - 1];
}
/**
 * Finds the first occurrence of the needle in the haystack and returns the index of the
 * character immediately following it.
 *
 * <p>A match only counts when at least one character follows the needle; a needle that
 * sits at the very end of the haystack is reported as not found.
 *
 * @param haystack The string to search
 * @param needle   String field to search on.
 * @return index just past the end of the needle, or -1 if there is no such match
 */
private static int findPosAfter(String haystack, String needle) {
    final int matchStart = haystack.indexOf(needle);
    if (matchStart < 0) {
        return -1; // Not found
    }
    final int after = matchStart + needle.length();
    return after < haystack.length() ? after : -1;
}
}我的问题是,这种做法是否有任何缺点?或者其他更有效的方法?
提前谢谢你,
Sam
编辑:
实例化代码:
public interface RecordFactory<T extends Record> {
T createRecord(String recordString);
}它的实施方式如下:
private RecordFactory<SamRecord> samRecordFactory = SamRecord::new发布于 2019-04-23 14:13:27
有一件事,我相信可以提高您的应用程序的性能。
您经常调用findElement,它每次都要通过SAM记录。
通过加载记录,您非常肯定至少会访问它一次。
在某个时候,也许在创建类时,或者当第一次访问第一个属性时,应该“索引”您的SAM记录。
在整条记录上扫描一次,并保存各个制表符所在位置的数组。这样,如果您的代码最终调用:
XsamReadQueries.findElement(read, 1)
XsamReadQueries.findElement(read, 2)
XsamReadQueries.findElement(read, 3)对第二个和第三个方法的调用将比现在快得多。
为此,您可以向XsamReadQueries类添加一个方法(例如indexTabs),它返回一个int数组。
如果你想对如何做到这一点有更多的洞察力,你可以写一个评论,我会添加更多的信息,但我很肯定这会对你有所帮助。
在您的代码中,有两件事在清晰性和未来维护方面困扰着我。
您有名为findPhred的方法,它调用findElement,但在SamRecord中,有时您调用findElement和某个特定的find*,这基本上是相同的代码。您应该决定一种方法,要么对XsamReadQueries中的每个字段都有特定的方法,要么只保留findElement方法。
最后,可以考虑让findElement方法的element参数使用enum类型。
https://codereview.stackexchange.com/questions/217955
复制相似问题