# Open-source code I found while processing DrugBank data; it solved my problem.
# The project page and the test files are both available at:
# https://github.com/Deshan-Zhou/deal_DrugBank
from xml.sax.handler import ContentHandler
from xml.sax import parse
import pandas as pd
"""
Abbreviations:
dbid : DrugBank id
dbname : DrugBank name
chid : ChEMBL id
ptid : protein id
"""
class ExtractData(ContentHandler):
    """SAX handler that collects per-drug lookup tables from a DrugBank XML file.

    While the document is streamed, four dictionaries keyed by the primary
    DrugBank id of the current drug are filled:

    * ``dbid_dbname`` -- drug name.
    * ``dbid_dbid``   -- set of DrugBank ids listed under ``drug-interactions``.
    * ``dbid_ptid``   -- set of ``polypeptide`` ``id`` attributes under ``targets``.
    * ``dbid_chid``   -- identifier that follows a ``resource`` whose text is
      exactly ``ChEMBL``.

    ``endDocument`` writes three of them to CSV files in the current working
    directory.

    ``self.limit`` is a small state machine:

    ==  ==========================================================
     0  idle
     1  a ``drug`` element was opened; waiting for the primary id
     2  capturing the primary ``drugbank-id`` text
     3  primary id known; waiting for ``name``
     4  capturing the ``name`` text
     5  inside ``drug-interactions``; waiting for a ``drugbank-id``
     6  capturing an interacting ``drugbank-id``
     7  inside ``targets``
     8  capturing a ``resource`` text (outside ``targets``)
     9  the resource was ``ChEMBL``; waiting for ``identifier``
    10  capturing the ChEMBL ``identifier`` text
    ==  ==========================================================
    """

    # Capturing states mapped to the element whose end tag closes the capture.
    _CAPTURE_END = {
        2: "drugbank-id",
        4: "name",
        6: "drugbank-id",
        8: "resource",
        10: "identifier",
    }

    def __init__(self):
        super().__init__()
        self.dbid_chid = {}
        self.dbid_dbname = {}
        self.dbid_dbid = {}
        self.dbid_ptid = {}
        self.curr_id = ""
        self.limit = 0
        # Text chunks of the element currently being captured.
        self._text = []

    def _begin_capture(self, state):
        """Switch to capturing *state* and start with an empty text buffer."""
        self.limit = state
        self._text = []

    def _finish_capture(self):
        """Commit the buffered text for the capturing state that just ended."""
        text = "".join(self._text)
        self._text = []
        state = self.limit
        if state == 2:
            self.curr_id = text
            self.limit = 3
        elif state == 4:
            self.dbid_dbname[self.curr_id] = text
            self.limit = 0
        elif state == 6:
            self.dbid_dbid.setdefault(self.curr_id, set()).add(text)
            self.limit = 5
        elif state == 8:
            # Only a ChEMBL resource arms the following <identifier>.
            self.limit = 9 if text == "ChEMBL" else 0
        elif state == 10:
            self.dbid_chid[self.curr_id] = text
            self.limit = 0

    def characters(self, content):
        """Buffer character data while a capturing state is active.

        A SAX parser may deliver one text node in several calls, so the text
        is only interpreted once the element is closed (see ``endElement``).
        """
        if self.limit in self._CAPTURE_END:
            self._text.append(content)

    def startElement(self, name, attrs):
        """Advance the state machine on an opening tag."""
        if name == "drug":
            self.limit = 1
        if self.limit == 1 and name == "drugbank-id" and attrs:
            # .get() so an id element with other attributes does not raise.
            if attrs.get("primary") == "true":
                self._begin_capture(2)
        elif self.limit == 3 and name == "name":
            self._begin_capture(4)
        elif name == "drug-interactions":
            self.limit = 5
        elif self.limit == 5 and name == "drugbank-id":
            self._begin_capture(6)
        elif name == "targets":
            self.limit = 7
        elif self.limit == 7 and name == "polypeptide":
            self.dbid_ptid.setdefault(self.curr_id, set()).add(attrs["id"])
        elif name == "resource" and self.limit != 7:
            # <resource> elements inside <targets> are deliberately skipped.
            self._begin_capture(8)
        elif self.limit == 9 and name == "identifier":
            self._begin_capture(10)

    def endElement(self, name):
        """Commit captured text, or leave a section, on a closing tag."""
        if self._CAPTURE_END.get(self.limit) == name:
            self._finish_capture()
        elif name == "drug-interactions":
            self.limit = 0
        elif name == "targets":
            self.limit = 0

    def endDocument(self):
        """Write the collected tables to CSV files in the working directory.

        Produces ``dbid_chid.csv``, ``dbid_dbname.csv`` and ``dbid_ptid.csv``.
        """
        # NOTE(review): self.dbid_dbid is collected but not exported here --
        # confirm whether a drug-drug interaction CSV is expected as well.
        chembl_table = pd.DataFrame(
            index=list(self.dbid_chid),
            columns=["ChEMBL_id"],
            data=list(self.dbid_chid.values()),
        )
        chembl_table.to_csv('dbid_chid.csv')

        name_table = pd.DataFrame(
            index=list(self.dbid_dbname),
            columns=["Drug_name"],
            data=list(self.dbid_dbname.values()),
        )
        name_table.to_csv('dbid_dbname.csv')

        # One row per drug; rows may have different lengths. Sorting gives a
        # reproducible column order (plain set iteration order is not stable
        # between runs).
        target_table = pd.DataFrame(
            index=list(self.dbid_ptid),
            data=[sorted(ids) for ids in self.dbid_ptid.values()],
        )
        target_table.to_csv('dbid_ptid.csv')
# Stream the DrugBank XML export through the handler; the CSV files are
# written by the handler's endDocument() once parsing finishes.
parse(source='full database.xml', handler=ExtractData())
|