先创建一个读对象:
fasta_open = pysam.Fastafile(fasta_file)
读出来的DNA序列是字符串的格式:
seq_dna = fasta_open.fetch('chr1', 0, 100)
将序列转为one-hot编码:
'''
A: [1, 0, 0, 0]
C: [0, 1, 0, 0]
G: [0, 0, 1, 0]
T: [0, 0, 0, 1]
'''
seq_1hot = dna_1hot(seq_dna, n_uniform=False, n_sample=False).astype('float32')
'''
output:
array([[1., 0., 0., 0.],
[1., 0., 0., 0.],
[1., 0., 0., 0.],
[0., 0., 0., 1.],
[0., 1., 0., 0.],
[0., 0., 1., 0.]], dtype=float32)
'''
def dna_1hot(seq, seq_len=None, n_uniform=False, n_sample=False):
""" dna_1hot
Args:
seq: nucleotide sequence.
seq_len: length to extend/trim sequences to.
n_uniform: represent N's as 0.25, forcing float16,
n_sample: sample ACGT for N
Returns:
seq_code: length by nucleotides array representation.
"""
if seq_len is None:
seq_len = len(seq)
seq_start = 0
else:
if seq_len <= len(seq):
# trim the sequence
seq_trim = (len(seq) - seq_len) // 2
seq = seq[seq_trim:seq_trim + seq_len]
seq_start = 0
else:
seq_start = (seq_len - len(seq)) // 2
seq = seq.upper()
# map nt's to a matrix len(seq)x4 of 0's and 1's.
if n_uniform:
seq_code = np.zeros((seq_len, 4), dtype='float16')
else:
seq_code = np.zeros((seq_len, 4), dtype='bool')
for i in range(seq_len):
if i >= seq_start and i - seq_start < len(seq):
nt = seq[i - seq_start]
if nt == 'A':
seq_code[i, 0] = 1
elif nt == 'C':
seq_code[i, 1] = 1
elif nt == 'G':
seq_code[i, 2] = 1
elif nt == 'T':
seq_code[i, 3] = 1
else:
if n_uniform:
seq_code[i, :] = 0.25
elif n_sample:
ni = random.randint(0,3)
seq_code[i, ni] = 1
return seq_code
|