基于区块链的简易论文加密设计

本次实验的要求是为给定的一篇论文设计一个简易的加密原型，使得可以通过查询区块链上的对应信息获得论文的基本信息以及上链的时间。以下我将整个操作拆分为5个步骤进行说明。

Step 1

由于论文的格式是PDF格式，第一步我们需要先对PDF文档进行处理，使用Python处理PDF文档的模块实现论文基本信息的提取，包括题目，作者，联系方式，出处，摘要等几个方面，并将该信息采用JSON格式存储。


#首先安装pdfminer库并调用其中的方法，以便对pdf格式文件进行操作。
from pdfminer.pdfparser import PDFParser,PDFDocument 
from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter,PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal,LAParams,LTTextLineHorizontal,LTFigure,LTRect,LTLine,LTCurve
import json
import codecs
# Open the source PDF (binary) and the scratch text file that collects the extracted text.
# NOTE: `file` is intentionally left open here; the snippet that follows rewinds and reads it.
pd_file = open(r"C:\Users\zz\Desktop\bitcoin.pdf", "rb")
file = open(r'C:\Users\zz\Desktop\linshichucun.txt','a+',encoding='utf-8')
try:
    parser = PDFParser(pd_file)  # parser object bound to the PDF file

    # Wire the parser and the document object to each other.
    document = PDFDocument()
    parser.set_document(document)
    document.set_parser(parser)

    document.initialize()  # initialise the document (no password is passed)
    if document.is_extractable:
        print(True)
    else:
        raise PDFTextExtractionNotAllowed

    src = PDFResourceManager()  # resource manager handed to the device and the interpreter
    device = PDFPageAggregator(src, laparams=LAParams())  # device whose get_result() yields the page layout
    inter = PDFPageInterpreter(src, device)  # interpreter that renders each page onto the device
    pages = document.get_pages()
    for page in pages:
        inter.process_page(page)
        layout = device.get_result()

        for x in layout:
            # Only horizontal text boxes carry the text we want to keep.
            if isinstance(x, LTTextBoxHorizontal):
                text = str(x.get_text())
                print(text)
                file.write(text)
finally:
    # Close the PDF once, after every page has been processed. The original closed it
    # inside the inner loop, i.e. at the first text box, while pages were still pending.
    pd_file.close()

# The text extracted from the PDF has now been written to the scratch text file,
# C:\Users\zz\Desktop\linshichucun.txt.

# The scratch file holds the text of the whole paper, which is more than the key
# information we want. The handle was used for writing, so rewind it to the start and
# keep only the first lines (title, author, contact, source, abstract).
HEADER_LINE_COUNT = 18  # number of leading lines that make up the key information
try:
    file.seek(0)
    content = file.readlines()[0:HEADER_LINE_COUNT]
finally:
    file.close()

# Store those lines in C:/Users/zz/Desktop/bitcoinlinshi.json.
filename = 'C:/Users/zz/Desktop/bitcoinlinshi.json'
# Mode 'w' instead of 'a': appending a second JSON array on a re-run would leave a file
# that is no longer a single valid JSON document.
with open(filename, 'w') as f_obj:
    json.dump(content, f_obj)
#打开文件，发现信息储存是这样的：
#["Bitcoin: A Peer-to-Peer Electronic Cash System\n", "Satoshi Nakamoto\n", "satoshin@gmx.com\n", "www.bitcoin.org\n", "Abstract.  A purely peer-to-peer version of electronic cash would allow online \n", "payments to be sent directly from one party to another without going through a \n", "financial institution.  Digital signatures provide part of the solution, but the main \n", "benefits are lost if a trusted third party is still required to prevent double-spending. \n", "We propose a solution to the double-spending problem using a peer-to-peer network. \n", "The network timestamps transactions by hashing them into an ongoing chain of \n", "hash-based proof-of-work, forming a record that cannot be changed without redoing \n", "the proof-of-work.  The longest chain not only serves as proof of the sequence of \n", "events witnessed, but proof that it came from the largest pool of CPU power.  As \n", "long as a majority of CPU power is controlled by nodes that are not cooperating to \n", "attack the network, they'll generate the longest chain and outpace attackers.  The \n", "network itself requires minimal structure.  Messages are broadcast on a best effort \n", "basis, and nodes can leave and rejoin the network at will, accepting the longest \n", "proof-of-work chain as proof of what happened while they were gone.\n"]
#这是一个列表，并且其中有很多无效字符，比如换行符以及很多不该有的空格，引号。因此，我们要对获得的数据进行提纯。

# Turn the list of raw lines stored in bitcoinlinshi.json into one clean string and
# store it, JSON-encoded, in bitcoin.json.
readfile = 'C:/Users/zz/Desktop/bitcoin.json'
with open(r'C:/Users/zz/Desktop/bitcoinlinshi.json', 'r') as s:
    raw = s.read()

# Decode the JSON properly instead of deleting '[', ']' and '"' character by character:
# that also removed such characters from the content itself and left JSON escape
# sequences undecoded. raw_decode() parses the first JSON value only, so a file that an
# earlier run appended to is still readable.
lines, _ = json.JSONDecoder().raw_decode(raw.lstrip())

b = ", ".join(lines)
c = b.replace(" \n,", "")   # a line ending in a space continues the same sentence
d = c.replace("\n", "")     # remaining line breaks: keep the ", " separator
print(d)

# The original referenced the undefined name `realfile` here (NameError); the target
# path is `readfile`. Mode 'w' keeps the output a single valid JSON document on re-runs.
with open(readfile, 'w') as refile:
    json.dump(d, refile)

#新建一个文件，也就是C:/Users/zz/Desktop/bitcoin.json。
#对bitcoinlinshi.json中的信息进行进一步处理，我们最终得到了包含关键信息的json文件bitcoin.json，内容如下："Bitcoin: A Peer-to-Peer Electronic Cash System, Satoshi Nakamoto, satoshin@gmx.com, www.bitcoin.org, Abstract.  A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution.  Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work.  The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power.  As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers.  The network itself requires minimal structure.  Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone."

Step 2

第二步是对文件进行加密，如添加数字指纹，进行数字签名。我选择使用非对称加密算法生成一对私钥和公钥，私钥进行签名，公钥进行签名验证。

import codecs 
#在这里使用非对称加密算法，即椭圆曲线算法。
from ecdsa import SigningKey,SECP256k1 
#安装完成后先导入算法库：from ecdsa import SigningKey,SECP256k1
from pdfminer.pdfparser import PDFParser,PDFDocument
from pdfminer.pdfinterp import PDFResourceManager,PDFPageInterpreter,PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal,LAParams,LTTextLineHorizontal,LTFigure,LTRect,LTLine,LTCurve
import json

# Extract the text of the paper again, this time collecting it in memory so that the
# whole text can be signed.
pd_file = open(r"C:\Users\zz\Desktop\bitcoin.pdf", "rb")
text_parts = []
try:
    parser = PDFParser(pd_file)
    document = PDFDocument()
    parser.set_document(document)
    document.set_parser(parser)
    document.initialize()
    if document.is_extractable:
        print(True)
    else:
        raise PDFTextExtractionNotAllowed
    src = PDFResourceManager()
    device = PDFPageAggregator(src, laparams=LAParams())
    inter = PDFPageInterpreter(src, device)
    pages = document.get_pages()
    for page in pages:
        inter.process_page(page)
        layout = device.get_result()

        for x in layout:
            if isinstance(x, LTTextBoxHorizontal):
                text = str(x.get_text())
                print(text)
                text_parts.append(text)
finally:
    # Close the PDF once, after all pages are processed (the original closed it inside
    # the inner loop, at the first text box).
    pd_file.close()

# Sign the complete extracted text. The original signed `x`, the last layout object left
# over from the loop, which covers at most one text box and may not be a text box at all.
message = "".join(text_parts).encode('utf-8')

# SigningKey.generate() creates a private key; the matching public (verifying) key is
# derived from it.
sk = SigningKey.generate(curve=SECP256k1)
vk = sk.get_verifying_key()
# The private key signs the paper text; the public key checks that signature.
signature = sk.sign(message)
# Print the verification result so success is visible: python-ecdsa's verify() returns
# True for a valid signature and raises an exception otherwise.
print(vk.verify(signature, message))

Step 3

第三步是设计一条区块链，利用加密技术，将论文信息添加入区块中。
其中，代码的前一部分是构造一个区块和区块链，后半部分是将论文信息写入区块中。

import hashlib
from datetime import datetime

class Block:
    """
    A single block of the chain.

    Attributes:
        prev_hash:    hash of the parent block
        data:         payload stored in the block
        timestamp:    creation time as a "YYYY-MM-DD HH:MM:SS" string
        hash:         SHA-256 hex digest over prev_hash, data and timestamp
    """
    def __init__(self, data, prev_hash):
        created_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        self.prev_hash = prev_hash
        self.data = data
        self.timestamp = created_at

        # Feed the three fields to the digest in a fixed order:
        # parent hash, payload, creation time (each as UTF-8 text).
        digest = hashlib.sha256()
        for field in (self.prev_hash, self.data, self.timestamp):
            digest.update(str(field).encode('utf-8'))
        self.hash = digest.hexdigest()
#为了构造一个区块链，首先我们需要构造一个区块。
#因此，我们先定义一个Block类，其中构造方法def __init__(self, data, prev_hash)用于获取参数构造新的区块。
#区块的结构包括父区块哈希值，区块内容，区块创建时间以及区块哈希值。

class BlockChain:
    """
    Minimal in-memory container for a chain of blocks.

    Attributes:
        blocks:        list of the blocks in the chain, in the order they were added
    """
    def __init__(self):
        # Every chain starts out empty and owns its own list.
        self.blocks = list()

    def add_block(self, block):
        """
        Append *block* to the end of the chain.
        """
        self.blocks.append(block)
#此外我们还定义了BlockChain类，这是构造区块链的时候使用的，利用该类中定义的方法可以实现将区块写入区块链的操作。 
       
# Load the paper information stored in bitcoin.json and put it into the genesis block.
import json  # local to this snippet: only hashlib and datetime are imported above it

with open(r'C:\Users\zz\Desktop\bitcoin.json', 'r') as json_file:
    raw = json_file.read()

# Decode the JSON string instead of deleting '[', ']' and '"' character by character,
# which also stripped those characters from the paper information itself.
# raw_decode() parses the first JSON value only, so a file that an earlier run appended
# to is still readable.
b, _ = json.JSONDecoder().raw_decode(raw.lstrip())
# Same line-break normalisation as before, applied to the decoded text.
c = b.replace(" \n,", "")
d = c.replace("\n", "")
print(d)

# The first block of a chain has no parent, hence the empty parent hash.
genesis_block = Block(data=d, prev_hash="")

blockchain = BlockChain()
blockchain.add_block(genesis_block)

print('区块链包含区块个数: %d\n' % len(blockchain.blocks))
for block in blockchain.blocks:
    print("父区块区块哈希：%s" % block.prev_hash)
    print("区块内容：%s" % block.data)
    print("区块哈希：%s" % block.hash)
    print("\n")

#输出的结果为:
#区块链包含区块个数:1

#父区块区块哈希:
#(父区块区块哈希为空是因为该区块是构建的第一个区块，也就是创世区块，没有父区块。)
#区块内容：Bitcoin: A Peer-to-Peer Electronic Cash System, Satoshi Nakamoto, satoshin@gmx.com, www.bitcoin.org, Abstract.  A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution.  Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work.  The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power.  As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers.  The network itself requires minimal structure.  Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.
#区块哈希：921837a42e7157eb6eefaa4c7b083e15bbeb13d97d63f0de561a00ff942c3776

Step 4

第四步是将论文在区块中的地址写到原PDF文档中，即将上面生成的区块哈希921837a42e7157eb6eefaa4c7b083e15bbeb13d97d63f0de561a00ff942c3776写入文档。

import PyPDF2 #引入一个新库PyPDF2
# Hash of the block created in Step 3; it serves as the paper's address on the chain.
BLOCK_HASH = '921837a42e7157eb6eefaa4c7b083e15bbeb13d97d63f0de561a00ff942c3776'

with open(r'C:\Users\zz\Desktop\bitcoin.pdf', mode='rb') as mypdf:
    pdfdoc = PyPDF2.PdfFileReader(mypdf)

    # Use ONE writer for the whole document. The original created a new writer on every
    # iteration, so only the last page ended up in the output file.
    newbitcoin = PyPDF2.PdfFileWriter()
    for i in range(pdfdoc.numPages):
        newbitcoin.addPage(pdfdoc.getPage(i))

    # Record the block hash in the copy's document information (metadata); the original
    # code never wrote the hash anywhere. Custom metadata keys must start with '/'.
    newbitcoin.addMetadata({'/BlockHash': BLOCK_HASH})

    # Write the copy while the source PDF is still open, as the original did.
    with open(r'C:\Users\zz\Desktop\newbitcoin.pdf', 'wb') as pdfout:
        newbitcoin.write(pdfout)

# The result is a new PDF file, C:\Users\zz\Desktop\newbitcoin.pdf, that contains all
# pages of the paper plus the block hash in its metadata.
# The source file is opened read-only and is not modified.

Step 5

最后一步是通过区块链信息直接访问该论文信息。

import base64
# Read the stored paper information, encode it with Base64 and decode it again.
with open(r'C:\Users\zz\Desktop\bitcoin.json', 'r') as file:
    read = file.readlines()

# Drop only a trailing line break. The original copied all but the last character, which
# cut off the closing quote of the JSON string because the file has no trailing newline.
b = read[0].rstrip("\n")

result = base64.b64encode(b.encode('utf-8'))
print(result)
text = base64.b64decode(result)
print(text.decode('utf-8'))
#输出结果为："Bitcoin: A Peer-to-Peer Electronic Cash System, Satoshi Nakamoto, satoshin@gmx.com, www.bitcoin.org, Abstract.  A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution.  Digital signatures provide part of the solution, but the main benefits are lost if a trusted third party is still required to prevent double-spending. We propose a solution to the double-spending problem using a peer-to-peer network. The network timestamps transactions by hashing them into an ongoing chain of hash-based proof-of-work, forming a record that cannot be changed without redoing the proof-of-work.  The longest chain not only serves as proof of the sequence of events witnessed, but proof that it came from the largest pool of CPU power.  As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers.  The network itself requires minimal structure.  Messages are broadcast on a best effort basis, and nodes can leave and rejoin the network at will, accepting the longest proof-of-work chain as proof of what happened while they were gone.