#coding=utf-8
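# Simple crawler for a GBK-encoded web novel site: fetch the chapter index,
# pull each chapter's text, and append everything to note.txt.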
import datetime
import time
import sys
import os
import urllib2
import urllib
sx = 'novel site URL'  # placeholder: base URL of the target novel site
type = sys.getfilesystemencoding()
user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }
fo = open("note.txt", "wb")
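
# Fetch a URL with a desktop User-Agent and return the page body re-encoded
# from GBK to UTF-8; on failure, print the HTTP code/reason and return None.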
def getHtml(url):
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request)
        data = response.read()
        # The site serves GBK pages; re-encode to UTF-8 for local processing.
        data = data.decode('gbk')
        data = data.encode('utf-8')
        print len(data)  # quick progress indicator: size of the fetched page
        return data
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print e.code
        if hasattr(e, "reason"):
            print e.reason
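
# Parse the chapter index page: the chapter list sits between the
# 'ChapterList_HengFu_1' and 'ChapterList_HengFu_2' banner markers, and each
# chapter link is pulled out with fixed offsets around 'href=' / '</a>'.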
def dealIndex(url):
    data = getHtml(url)
    # The +10/-10 offsets trim the marker tags themselves.
    bgnpos = data.index('ChapterList_HengFu_1') + 10
    endpos = data.index('ChapterList_HengFu_2') - 10
    print bgnpos
    print endpos
    achfx = data[bgnpos:endpos]
    pos = 0  # offsets below are relative to the achfx slice, not the full page
    while True:
        newpos = achfx.find('href=', pos)
        if newpos == -1:
            break
        # Fixed-width slices assume the site's anchor layout:
        # href="<chapter page>">chapter title</a>
        indexurl = achfx[newpos+6:newpos+19]
        titlepos = achfx.find('</a>', newpos+20)
        titlename = achfx[newpos+21:titlepos]
        pos = titlepos + 5
        dealContext(sx + indexurl, titlename)
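
# Download one chapter page, cut the body text out of the "content" element,
# strip basic HTML markup, and append the title plus text to note.txt.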
def dealContext(url, title):
    print url
    print title
    data = getHtml(url)
    # The chapter body sits inside the element named "content" and ends
    # before the "yuedu_bottom" block; offsets are tuned to that markup.
    bgnpos = data.find('name="content"', 10) + 15
    endpos = data.find('yuedu_bottom', bgnpos)
    endpos = data.find('</div>', endpos - 50)
    sContent = data[bgnpos:endpos]
    sContent = sContent.replace('&nbsp;', ' ')  # drop HTML non-breaking spaces
    sContent = sContent.replace('<br />', ' ')  # flatten line breaks
    sContent = title + " " + sContent
    fo.write(sContent)

dealIndex(sx)
fo.close()