|
import kiss
import urllib.request
import signal
import gzip
import hashlib
import requests
from bs4 import BeautifulSoup
# Callsign of this station, as bytes; only packets addressed to it are served.
call = "VA1QLE".encode()
# Single-byte protocol control codes used to frame traffic on the radio link.
ESC = b'\x05'                # escape prefix for control bytes inside payloads
ACK = b'\x06'                # client acknowledges the last packet
PAGE_REQUEST = b'\x07'       # client asks this server for a URL
PAGE_RESPONSE = b'\x08'      # server returns one chunk of the page
PAGE_RESPONSE_END = b'\x09'  # terminator appended after the final chunk
CHECKSUM = b'\x10' #Splits md5 checksum from HTML
RESEND_PACKET = b'\x11'      # client asks for the last chunk to be resent
Max_Packet_Length = 750      # payload bytes per PAGE_RESPONSE packet
Max_Connect_Retries = 5      # send attempts before giving up on a new client
Max_Retries = 10 #Retries after a connection is established
# KISS TNC (e.g. Direwolf) reachable over TCP; the link is opened once at startup.
k = kiss.TCPKISS(host='localhost', port=8001)
k.start()
def escapeData(data):
    """Prefix every protocol control byte found in *data* with ESC.

    This lets payload bytes that happen to collide with the framing
    markers travel over the link without being read as protocol codes.
    NOTE(review): a literal ESC byte in *data* is not itself escaped —
    this matches the existing wire format; confirm the client tolerates it.
    """
    escaped = data
    for marker in (ACK, PAGE_REQUEST, PAGE_RESPONSE,
                   PAGE_RESPONSE_END, CHECKSUM, RESEND_PACKET):
        escaped = escaped.replace(marker, ESC + marker)
    return escaped
def unescapeData(data):
    """Reverse escapeData: strip the ESC prefix from escaped control bytes.

    Bug fix: indexing a ``bytes`` object yields an ``int``, so the original
    comparisons ``data[i] == ESC`` and ``data[i + 1] in (ACK, ...)`` (int vs
    one-byte bytes constants) were always False, and ``out +=
    str(bytearray(...))`` raised TypeError (str concatenated onto bytes).
    Working with one-byte slices (``data[i:i+1]``) keeps every value in the
    bytes domain and makes the function actually invert escapeData.
    """
    specials = (ACK, PAGE_REQUEST, PAGE_RESPONSE,
                PAGE_RESPONSE_END, CHECKSUM, RESEND_PACKET)
    out = b''
    i = 0
    while i < len(data):
        byte = data[i:i + 1]
        nxt = data[i + 1:i + 2]  # b'' at end of input, which matches nothing
        if byte == ESC and nxt in specials:
            out += nxt  # emit the protected byte without its ESC prefix
            i += 2      # skip both the ESC and the byte it protected
        else:
            out += byte  # ordinary byte, or an unpaired/trailing ESC
            i += 1
    return out
def signal_handler(signum, frame):
    """SIGALRM handler: surface the alarm as a TimeoutError.

    Raising here lets the send loop treat a missed ack as an ordinary
    exception instead of implementing its own timing logic.
    """
    raise TimeoutError()

# Route SIGALRM (armed via signal.alarm in the main loop) through the handler.
signal.signal(signal.SIGALRM, signal_handler)
def decode_packet(pkt, detectorByte):
    """Split a received frame into (src, dest, body).

    *pkt* is the list of byte chunks returned by the TNC; the leading byte
    (KISS command byte) is discarded.  The frame layout expected is
    ``src>dest<detectorByte>body``.  Returns (None, None, None) when the
    frame does not contain the delimiters.
    """
    raw = b''.join(pkt)[1:]
    try:
        calls, body = raw.split(detectorByte, 1)
        src, dest = calls.split(b'>', 1)
    except ValueError:
        # Either split found no delimiter, so the unpack failed.
        return (None, None, None)
    return (src, dest, body)
def downloadPage(url, timeout=30):
    """Fetch *url* and return ``(status_code, page_bytes)``.

    Strips <script> and <img> tags — makes the page ugly, but far smaller
    for the slow radio link.  On any fetch failure returns a synthetic
    ``(503, b"Failed to connect to website")``.

    Bug fixes: ``requests.get`` previously had no timeout, so a hung
    remote site stalled the whole server forever (SIGALRM is only armed
    in the send loop, not here); and only ``requests.ConnectionError``
    was caught, so timeouts, DNS failures, invalid URLs, etc. crashed
    the main loop.  ``timeout`` is a new keyword with a default, so
    existing callers are unaffected.
    """
    try:
        # timeout bounds both connect and read so a dead site can't hang us.
        page = requests.get(url, timeout=timeout)
        code = page.status_code
        soup = BeautifulSoup(page.text, "html5lib")
        #Remove script tags
        for item in soup.findAll('script'):
            item.extract()
        #Remove img tags
        for item in soup.findAll('img'):
            item.extract()
        return (code, soup.encode())
    except requests.RequestException:
        # Base class of ConnectionError, Timeout, InvalidURL, ... —
        # report every fetch problem to the client as a 503.
        return (503, b"Failed to connect to website")
# --- Main server loop --------------------------------------------------------
# Wait for PAGE_REQUEST frames addressed to this station, download the page,
# then stream it back as checksummed PAGE_RESPONSE chunks with ack/resend
# handling and a 10-second per-packet timeout.
while True:
    # Bug fix: the frame was previously unpacked with a bare
    # ``frame.split(PAGE_REQUEST, 1)``, so any stray or corrupt frame that
    # lacked the delimiter raised ValueError and killed the server.  Reusing
    # decode_packet returns (None, None, None) instead, which simply fails
    # the address check below.
    (src, dest, body) = decode_packet(k.read(readmode=False), PAGE_REQUEST)
    if dest == call: #It's for this server
        url = body.decode()
        print(url)
        (code, html) = downloadPage(url)
        # Prepend the 2-byte big-endian HTTP status code, gzip the lot,
        # escape the protocol control bytes, and terminate the stream with
        # a raw PAGE_RESPONSE_END marker (which cannot occur in escaped data).
        html = code.to_bytes(2, 'big') + html
        html = escapeData(gzip.compress(html, compresslevel=9)) + PAGE_RESPONSE_END #Okay, it's not HTML. But I'm already reusing the 'body' variable and HTML is good enough
        attempts = 0
        connected = False  # True once the client has acked at least one packet
        while html != b'':
            success = True
            signal.alarm(10)  # give the client 10 s to (n)ack before retrying
            try:
                # Send one chunk followed by CHECKSUM and its md5 hex digest
                # so the client can detect corruption and request a resend.
                k.write(call + b'>' + src + PAGE_RESPONSE + html[0:Max_Packet_Length] + CHECKSUM + hashlib.md5(html[0:Max_Packet_Length]).hexdigest().encode())
                print(html[0:Max_Packet_Length]) #DEBUGGING
                ack = False
                while not ack:
                    pkt = k.read(readmode=False)
                    (src_ack, dest_ack, garbage) = decode_packet(pkt, ACK)
                    resend = False
                    if src_ack is None:
                        # Not an ACK; maybe the client asked for a resend.
                        (src_ack, dest_ack, garbage) = decode_packet(pkt, RESEND_PACKET)
                        resend = True
                    if dest_ack == call:
                        ack = True
                        success = not resend  # RESEND means retransmit this chunk
            except TimeoutError:
                # SIGALRM fired: no reply within 10 s. Try again.
                success = False
            if success:
                connected = True
                html = html[Max_Packet_Length:]  # advance to the next chunk
                attempts = 0
            else:
                attempts += 1
            # Give up after a bounded number of failures; the threshold is
            # lower while we have never heard from the client at all.
            if (not connected) and attempts > Max_Connect_Retries:
                print("Failure connecting")
                break
            elif (connected) and attempts > Max_Retries:
                print("Failure sending packet")
                break
        signal.alarm(0) #Reset alarm
|