# Scrapy default settings

BOT_NAME = 'scrapybot'

# CloseSpider extension thresholds; 0 disables the corresponding condition.
CLOSESPIDER_TIMEOUT = 0
CLOSESPIDER_PAGECOUNT = 0
CLOSESPIDER_ITEMCOUNT = 0
CLOSESPIDER_ERRORCOUNT = 0

# Module with additional project commands ('' means none).
COMMANDS_MODULE = ''

# Concurrency limits for item processing and outgoing requests.
CONCURRENT_ITEMS = 100
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8
CONCURRENT_REQUESTS_PER_IP = 0  # 0 = no per-IP limit

COOKIES_ENABLED = True
COOKIES_DEBUG = False

DEFAULT_ITEM_CLASS = 'scrapy.item.Item'

# Default HTTP headers sent with every Scrapy request.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

# Crawl-depth behaviour: 0 means unlimited depth.
DEPTH_LIMIT = 0
DEPTH_STATS = True
DEPTH_PRIORITY = 0

DNSCACHE_ENABLED = True

DOWNLOAD_DELAY = 0

# Per-scheme download handlers; user overrides (DOWNLOAD_HANDLERS) are
# merged over the base dict.
DOWNLOAD_HANDLERS = {}
DOWNLOAD_HANDLERS_BASE = {
    'file': 'scrapy.core.downloader.handlers.file.FileDownloadHandler',
    'http': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
    'https': 'scrapy.core.downloader.handlers.http.HttpDownloadHandler',
    's3': 'scrapy.core.downloader.handlers.s3.S3DownloadHandler',
}

DOWNLOAD_TIMEOUT = 180  # 3 minutes

DOWNLOADER_DEBUG = False

DOWNLOADER_HTTPCLIENTFACTORY = 'scrapy.core.downloader.webclient.ScrapyHTTPClientFactory'
DOWNLOADER_CLIENTCONTEXTFACTORY = 'scrapy.core.downloader.webclient.ScrapyClientContextFactory'

# Downloader middlewares; user overrides (DOWNLOADER_MIDDLEWARES) are merged
# over the base dict.  The number is the ordering: lower values sit closer to
# the engine, higher values closer to the downloader.
DOWNLOADER_MIDDLEWARES = {}
DOWNLOADER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.contrib.downloadermiddleware.robotstxt.RobotsTxtMiddleware': 100,
    'scrapy.contrib.downloadermiddleware.httpauth.HttpAuthMiddleware': 300,
    'scrapy.contrib.downloadermiddleware.downloadtimeout.DownloadTimeoutMiddleware': 350,
    'scrapy.contrib.downloadermiddleware.useragent.UserAgentMiddleware': 400,
    'scrapy.contrib.downloadermiddleware.retry.RetryMiddleware': 500,
    'scrapy.contrib.downloadermiddleware.defaultheaders.DefaultHeadersMiddleware': 550,
    'scrapy.contrib.downloadermiddleware.redirect.RedirectMiddleware': 600,
    'scrapy.contrib.downloadermiddleware.cookies.CookiesMiddleware': 700,
    'scrapy.contrib.downloadermiddleware.httpproxy.HttpProxyMiddleware': 750,
    'scrapy.contrib.downloadermiddleware.httpcompression.HttpCompressionMiddleware': 800,
    'scrapy.contrib.downloadermiddleware.chunked.ChunkedTransferMiddleware': 830,
    'scrapy.contrib.downloadermiddleware.stats.DownloaderStats': 850,
    'scrapy.contrib.downloadermiddleware.httpcache.HttpCacheMiddleware': 900,
    # Downloader side
}

DOWNLOADER_STATS = True

DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'

import os
import sys

# Editor used by the "scrapy edit" command: honour the EDITOR environment
# variable; otherwise fall back to IDLE on Windows or vi elsewhere.
try:
    EDITOR = os.environ['EDITOR']
except KeyError:
    if sys.platform == 'win32':
        EDITOR = '%s -m idlelib.idle'
    else:
        EDITOR = 'vi'

# Extensions; user overrides (EXTENSIONS) are merged over the base dict
# (the value is the load order).
EXTENSIONS = {}
EXTENSIONS_BASE = {
    'scrapy.contrib.corestats.CoreStats': 0,
    'scrapy.webservice.WebService': 0,
    'scrapy.telnet.TelnetConsole': 0,
    'scrapy.contrib.memusage.MemoryUsage': 0,
    'scrapy.contrib.memdebug.MemoryDebugger': 0,
    'scrapy.contrib.closespider.CloseSpider': 0,
    'scrapy.contrib.feedexport.FeedExporter': 0,
    'scrapy.contrib.logstats.LogStats': 0,
    'scrapy.contrib.spiderstate.SpiderState': 0,
    'scrapy.contrib.throttle.AutoThrottle': 0,
}

# Feed export defaults.
FEED_URI = None
FEED_URI_PARAMS = None  # a function to extend uri arguments
FEED_FORMAT = 'jsonlines'
FEED_STORE_EMPTY = False

# Feed storage backends keyed by URI scheme ('' = no scheme, local file).
FEED_STORAGES = {}
FEED_STORAGES_BASE = {
    '': 'scrapy.contrib.feedexport.FileFeedStorage',
    'file': 'scrapy.contrib.feedexport.FileFeedStorage',
    'stdout': 'scrapy.contrib.feedexport.StdoutFeedStorage',
    's3': 'scrapy.contrib.feedexport.S3FeedStorage',
    'ftp': 'scrapy.contrib.feedexport.FTPFeedStorage',
}

# Item exporters keyed by feed format.
FEED_EXPORTERS = {}
FEED_EXPORTERS_BASE = {
    'json': 'scrapy.contrib.exporter.JsonItemExporter',
    'jsonlines': 'scrapy.contrib.exporter.JsonLinesItemExporter',
    'csv': 'scrapy.contrib.exporter.CsvItemExporter',
    'xml': 'scrapy.contrib.exporter.XmlItemExporter',
    'marshal': 'scrapy.contrib.exporter.MarshalItemExporter',
    'pickle': 'scrapy.contrib.exporter.PickleItemExporter',
}

# HTTP cache defaults (disabled out of the box).
HTTPCACHE_ENABLED = False
HTTPCACHE_DIR = 'httpcache'
HTTPCACHE_IGNORE_MISSING = False
HTTPCACHE_STORAGE = 'scrapy.contrib.httpcache.DbmCacheStorage'
HTTPCACHE_EXPIRATION_SECS = 0  # 0 = cached responses never expire
HTTPCACHE_IGNORE_HTTP_CODES = []
HTTPCACHE_IGNORE_SCHEMES = ['file']
HTTPCACHE_DBM_MODULE = 'anydbm'

ITEM_PROCESSOR = 'scrapy.contrib.pipeline.ItemPipelineManager'

# Item pipelines are typically set in specific commands settings
ITEM_PIPELINES = []

# Logging defaults.
LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
LOG_FORMATTER = 'scrapy.logformatter.LogFormatter'
LOG_STDOUT = False
LOG_LEVEL = 'DEBUG'
LOG_FILE = None  # None = log to standard error
LOG_UNSERIALIZABLE_REQUESTS = False

LOGSTATS_INTERVAL = 60.0  # seconds between LogStats reports

# Mail (MailSender) defaults.
MAIL_DEBUG = False
MAIL_HOST = 'localhost'
MAIL_PORT = 25
MAIL_FROM = 'scrapy@localhost'
MAIL_PASS = None
MAIL_USER = None

MEMDEBUG_ENABLED = False  # enable memory debugging
MEMDEBUG_NOTIFY = []  # send memory debugging report by mail at engine shutdown

# Memory-usage extension: 0 disables the limit/warning thresholds.
MEMUSAGE_ENABLED = False
MEMUSAGE_LIMIT_MB = 0
MEMUSAGE_NOTIFY_MAIL = []
MEMUSAGE_REPORT = False
MEMUSAGE_WARNING_MB = 0

# Module where new spiders are created by "scrapy genspider" ('' = unset).
NEWSPIDER_MODULE = ''

RANDOMIZE_DOWNLOAD_DELAY = True

# Redirect middleware defaults.
REDIRECT_ENABLED = True
REDIRECT_MAX_METAREFRESH_DELAY = 100
REDIRECT_MAX_TIMES = 20  # uses Firefox default setting
REDIRECT_PRIORITY_ADJUST = +2

REFERER_ENABLED = True

# Retry middleware defaults.
RETRY_ENABLED = True
RETRY_TIMES = 2  # initial response + 2 retries = 3 requests
RETRY_HTTP_CODES = [500, 503, 504, 400, 408]
RETRY_PRIORITY_ADJUST = -1

ROBOTSTXT_OBEY = False

# Scheduler and its disk/memory queue implementations.
SCHEDULER = 'scrapy.core.scheduler.Scheduler'
SCHEDULER_DISK_QUEUE = 'scrapy.squeue.PickleLifoDiskQueue'
SCHEDULER_MEMORY_QUEUE = 'scrapy.squeue.LifoMemoryQueue'

SPIDER_MANAGER_CLASS = 'scrapy.spidermanager.SpiderManager'

# Spider middlewares; user overrides (SPIDER_MIDDLEWARES) are merged over
# the base dict.  Lower order = closer to the engine.
SPIDER_MIDDLEWARES = {}
SPIDER_MIDDLEWARES_BASE = {
    # Engine side
    'scrapy.contrib.spidermiddleware.httperror.HttpErrorMiddleware': 50,
    'scrapy.contrib.spidermiddleware.offsite.OffsiteMiddleware': 500,
    'scrapy.contrib.spidermiddleware.referer.RefererMiddleware': 700,
    'scrapy.contrib.spidermiddleware.urllength.UrlLengthMiddleware': 800,
    'scrapy.contrib.spidermiddleware.depth.DepthMiddleware': 900,
    # Spider side
}

SPIDER_MODULES = []

# Stats collection.
STATS_CLASS = 'scrapy.statscol.MemoryStatsCollector'
STATS_DUMP = True

STATSMAILER_RCPTS = []

from os.path import abspath, dirname, join

# Directory holding the project/spider templates shipped alongside this module.
TEMPLATES_DIR = abspath(join(dirname(__file__), '..', 'templates'))

URLLENGTH_LIMIT = 2083  # max URL length accepted by UrlLengthMiddleware

# Default User-Agent, e.g. "Scrapy/0.14 (+http://scrapy.org)".
USER_AGENT = 'Scrapy/%s (+http://scrapy.org)' % __import__('scrapy').__version__

# Telnet console; the port value is a [min, max] range to bind within.
TELNETCONSOLE_ENABLED = 1
TELNETCONSOLE_PORT = [6023, 6073]
TELNETCONSOLE_HOST = '0.0.0.0'

# Web service; the port value is a [min, max] range to bind within.
WEBSERVICE_ENABLED = True
WEBSERVICE_LOGFILE = None
WEBSERVICE_PORT = [6080, 7030]
WEBSERVICE_HOST = '0.0.0.0'

# Web service resources; user overrides merged over the base dict.
WEBSERVICE_RESOURCES = {}
WEBSERVICE_RESOURCES_BASE = {
    'scrapy.contrib.webservice.crawler.CrawlerResource': 1,
    'scrapy.contrib.webservice.enginestatus.EngineStatusResource': 1,
    'scrapy.contrib.webservice.stats.StatsResource': 1,
}

# Spider contracts (used by "scrapy check"); user overrides merged over
# the base dict.
SPIDER_CONTRACTS = {}
SPIDER_CONTRACTS_BASE = {
    'scrapy.contracts.default.UrlContract': 1,
    'scrapy.contracts.default.ReturnsContract': 2,
    'scrapy.contracts.default.ScrapesContract': 3,
}

# end of default settings