import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
# Accession of the protein record to request from NCBI.
cds_id = "NP_001339842.1"
fasta_url = f"https://www.ncbi.nlm.nih.gov/protein/{cds_id}/?report=fasta"
# requests waits indefinitely unless a timeout is given; bound the wait so a
# stalled connection cannot hang the script.
fasta_html = requests.get(fasta_url, timeout=30)
# Fail fast on an HTTP 4xx/5xx answer instead of parsing an error page.
fasta_html.raise_for_status()
# NOTE(review): per the discussion in this document, the sequence text is
# served by a separate viewer.fcgi request, so a <pre> lookup on this page's
# HTML is expected to come back empty.
soup = BeautifulSoup(fasta_html.text, "html.parser")
print(soup.select('div > pre > #text'))
我希望解析来自 NCBI 的序列数据，但是当我搜索 <pre> 标签时，返回的总是 None。我也试过 find 和 findAll，但结果是一样的。



发布于 2022-10-07 07:00:01
这里只提供一个思路：例如，先从页面中提取以下参数，再调用实际提供数据的 API。
...
# Parse the landing page fetched earlier (fasta_html comes from the
# preceding, elided part of the snippet).
soup = BeautifulSoup(fasta_html.text, "html.parser")
# Locate the two elements whose "name" attribute carries the request
# parameters, then read each one's "content" attribute.
phid_tag = soup.select_one('[name="ncbi_phid"]')
phid = phid_tag.get('content')
uid_tag = soup.select_one('[name="ncbi_uidlist"]')
# The name `id` is kept because the request URL that follows interpolates it.
id = uid_tag.get('content')
requests.get(f'https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?id={id}&db=protein&report=fasta&extrafeat=null&conwithfeat=on&hide-cdd=on&retmode=html&ncbi_phid={phid}&withmarkup=on&tool=portal&log$=seqview&maxdownloadsize=1000000').text
输出：
>NP_001339842.1 serine/threonine-protein phosphatase 2A 56 kDa regulatory subunit gamma isoform isoform g [Homo sapiens]\nMPNKNKKEKESPKAGKSGKSSKEGQDTVESEQISVRKNSLVAVPSTVSAKIKVPVSQPIVKKDKRQNSSR\nFSASNNRELQKLPSLKDVPPADQEKLFIQKLRQCCVLFDFVSDPLSDLKWKEVKRAALSEMVEYITHNRN\nVITEPIYPEVVHMFAVNMFRTLPPSSNPTGAEFDPEEDEPTLEAAWPHLQLVYEFFLRFLESPDFQPNIA\nKKYIDQKFVLQLLELFDSEDPRERDFLKTTLHRIYGKFLGLRAYIRKQINNIFYRFIYETEHHNGIAELL\nEILGSIINGFALPLKEEHKIFLLKVLLPLHKVKSLSVYHPQLAYCVVQFLEKDSTLTEPVVMALLKYWPK\nTHSPKEVMFLNELEEILDVIEPSEFVKIMEPLFRQLAKCVSSPHFQVAERALYYWNNEYIMSLISDNAAK\nILPIMFPSLYRNSKTHWNKTIHGLIYNALKLFMEMNQKLFDDCTQQFKAEKLKEKLKMKEREEAWVKIEN\nLAKANPQYTVYSQASTMSIPVAMETDGPLFEDVQMLRKTVKDEAHQAQKDPKKDRPLARRKSELPQDPHT\nKKALEAHCRADELASQDGR\n\n
发布于 2022-10-07 07:06:05
您需要先从页面中获取几个 meta 字段，然后就能得到所需的信息。
# Accession of the protein record to request from NCBI.
cds_id = 'NP_001339842.1'
url = f'https://www.ncbi.nlm.nih.gov/protein/{cds_id}/?report=fasta'
# requests waits indefinitely unless a timeout is given; bound the wait and
# stop on an HTTP error before trying to parse the page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
# The landing page carries the parameters for the follow-up request in
# <meta name="..." content="..."> tags.
ncbi_uid = soup.find('meta', {'name': 'ncbi_uidlist'}).get('content')
ncbi_db = soup.find('meta', {'name': 'ncbi_db'}).get('content')
ncbi_phid = soup.find('meta', {'name': 'ncbi_phid'}).get('content')
# Second request: ask the sequence viewer endpoint for the FASTA report,
# using the identifiers read from the landing page.
response = requests.get(
    'https://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?'
    f'id={ncbi_uid}&db={ncbi_db}&report=fasta&extrafeat=null&conwithfeat=on&hide-cdd=on&'
    f'retmode=html&ncbi_phid={ncbi_phid}&withmarkup=on&tool=portal'
    '&log$=seqview&maxdownloadsize=1000000',
    timeout=30,
)
response.raise_for_status()
print(response.text)
输出：
>NP_001339842.1 serine/threonine-protein phosphatase 2A 56 kDa regulatory subunit gamma isoform isoform g [Homo sapiens]
MPNKNKKEKESPKAGKSGKSSKEGQDTVESEQISVRKNSLVAVPSTVSAKIKVPVSQPIVKKDKRQNSSR
FSASNNRELQKLPSLKDVPPADQEKLFIQKLRQCCVLFDFVSDPLSDLKWKEVKRAALSEMVEYITHNRN
VITEPIYPEVVHMFAVNMFRTLPPSSNPTGAEFDPEEDEPTLEAAWPHLQLVYEFFLRFLESPDFQPNIA
KKYIDQKFVLQLLELFDSEDPRERDFLKTTLHRIYGKFLGLRAYIRKQINNIFYRFIYETEHHNGIAELL
EILGSIINGFALPLKEEHKIFLLKVLLPLHKVKSLSVYHPQLAYCVVQFLEKDSTLTEPVVMALLKYWPK
THSPKEVMFLNELEEILDVIEPSEFVKIMEPLFRQLAKCVSSPHFQVAERALYYWNNEYIMSLISDNAAK
ILPIMFPSLYRNSKTHWNKTIHGLIYNALKLFMEMNQKLFDDCTQQFKAEKLKEKLKMKEREEAWVKIEN
LAKANPQYTVYSQASTMSIPVAMETDGPLFEDVQMLRKTVKDEAHQAQKDPKKDRPLARRKSELPQDPHT
KKALEAHCRADELASQDGR
https://stackoverflow.com/questions/73983308
复制相似问题