Fork me on GitHub
×

博客更新通知

感谢一直支持我的朋友,目前在做一个自己的博客项目。几个月后会有新版本的博客,新版本博客将主要涉及python,ruby,lisp,算法,devops,emacs,github开源项目等方面内容,敬请关注

前言

  1. 工作经常写一些东西发邮件,但是渐渐的已经用markdown写东西,每次很纠结,
  2. 而且还需要我打开邮箱,然后balabala,比如我还要在后面加入公司和自己的一些信息
  3. 经常邮件或者html都带有python的源码段,想要一个支持python语法的css显示效果

使用的模块

  • docopt Pythonic的命令行函数解析,只需要把显示的参数列表放在__doc__
  • schema Pythonic的数据结构验证,不需要那么多的异常处理
  • markdown
  • PyYAML 解析yaml文件
  • pygments 借用它对python语法的一些正则匹配
  • requests 我没有自己实现css,css可以本地自己自定义,也可以从网站下载,这里去爬网站的css文件

PS:安装这些可以

sudo easy_install schema docopt markdown pygments pyyaml

功能

  • 支持python语法
  • 支持本地有配置文件,不需要命令行balabala那么多(使用yaml)
  • 支持多种颜色方案,方案可选项: pygments-css
  • 支持本地自定义css(默认去这个网站爬回来)
  • 支持中文
  • 支持自定义html模板文件,比如我们公司邮件下部的联系方式等说明,可以放在模板邮件里面
  • 可以不发送邮件,只把加了css后的html保存到本地文件

使用举例

  1. 默认模式
python MarkPygments.py  --mailto mailto@qq.com,mailto2@qq.com  -s 标题 --mailserver smtp.exmail.qq.com -u youremailname
-p yourpassword   --cc cc@qq.com whatever.md --template template.html
  1. 使用本地yaml配置,配置如下。配置文件中没有的选项会从命令行选项中取;配置文件和命令行都有的选项,以配置文件为准。这是yaml文件的内容:
markemail:
    --theme: autumn
    --username: XX
    --password: YY
    --mailserver: smtp.exmail.qq.com
    --mailto: to1@qq.com,to2@qq.com
    --subject: '周报'

然后这样使用:

python MarkPygments.py --config ~/.config.yaml whatever.md --template template.html
  1. 使用本地css目录下的css, 不发送邮件只保存html到本地文件
python MarkPygments.py --config ~/.config.yaml whatever.md --template template.html
-o out.html --local pygments-css

这里是代码,或者你可以去看MarkPygments.py

# coding=utf-8
'''
Usage:
MarkPygments.py [options] MDFILE
MarkPygments.py [options] --local <cssdir> MDFILE
MarkPygments.py [options] --config <yamlfile> MDFILE

Arguments:
MDFILE the markdown file
-u --username user your email name
-p --password pass your email login password
-mt --mailto tolist mailto list
--theme theme css style for python syntax [default: monokai]
-s --subject subject email's subject
--mailserver server mail server [default: smtp.exmail.qq.com]

Options:
-h --help show this help message and exit
--version show version and exit
--config yamlfile config yaml file path (e.g. .config.yaml)
--local cssdir use local custom css dir
-o --output [outhtml] make output to html file
-c --cc list cc list
--template html template html

'''


import os
import re
import codecs
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart
import markdown
from docopt import docopt
from schema import Schema, And, Or, Use, SchemaError


def log(message, error=False):
    """Print *message* to the terminal in bold ANSI colour.

    Args:
        message: Anything ``str.format`` can render.
        error: When true the line is printed with colour code 31 (red),
            otherwise with 32 (green).
    """
    color = 31 if error else 32
    # The parenthesised single-argument form produces the same output under
    # the Python 2 print statement and the Python 3 print function.
    print('\x1B[1;{0}m * {1}\x1B[0m'.format(color, message))


def regex():
    """Return the regular expressions used to spot Python syntax on a line.

    Two entries are borrowed from pygments' ``PythonLexer`` token table and
    four are hand-written.  The keys ('ke', 'bu', 'fu', 'cl', 'fm', 'im')
    are the same keys ``colorClass()`` uses for the CSS class names.

    Returns:
        dict mapping the two-letter key to a pattern.
    """
    from pygments.lexers import PythonLexer
    tokens = PythonLexer().tokens
    patterns = {}
    for state in ('keywords', 'builtins'):
        # NOTE(review): presumably the pattern of the first rule of that
        # lexer state -- the table layout depends on the pygments version.
        patterns[state[:2]] = tokens[state][0][0]
    patterns['fu'] = r'.*(def)\W+(.*)\((.*)\)'
    patterns['cl'] = r'.*(?!<span)(class)(?!=)\W+(?!=)(.*)\((.*)\)'
    patterns['fm'] = r'.*(from)\W+(.*)\W+(import)\W+(.*)'
    # The previous pattern contained "\{,\4}": "\4" is a backreference to a
    # group that does not exist, so re.compile() raised on it.
    # NOTE(review): "up to four non-word characters" is the assumed intent.
    patterns['im'] = r'.*(import)\W{0,4}(.*)'
    return patterns


def sendMail(mailserver, username, password, tolist, subject, msg, cc=None):
    """Send *msg* as an HTML e-mail over SMTP (port 25).

    Failures are reported through ``log`` and not raised (best effort).

    Args:
        mailserver: SMTP host name.
        username: Login name; also used as the From address.
        password: Login password.
        tolist: List of recipient addresses.
        subject: Subject line.
        msg: HTML body of the mail.
        cc: Optional list of carbon-copy addresses.
    """
    # A None default avoids sharing one mutable list between calls.
    cc = list(cc) if cc else []

    def makeEmail(content):
        mail = MIMEMultipart()
        mail['Subject'] = subject
        mail['From'] = username
        mail['To'] = ','.join(tolist)
        if cc:
            mail['Cc'] = ','.join(cc)
        mail.attach(MIMEText(content, 'html', 'utf-8'))
        return mail

    smtp = smtplib.SMTP()
    try:
        log('Connect to {0}'.format(mailserver))
        smtp.connect(mailserver, 25)
        smtp.login(username, password)
        log('Login Success with {0}'.format(username))
        log('To send this Email...')
        # Cc recipients must be part of the envelope, not only the header.
        smtp.sendmail(username, tolist + cc, makeEmail(msg).as_string())
        log('Send Success')
    except Exception as e:
        log(e, error=True)
    finally:
        # Always release the connection, also after a failure.
        try:
            smtp.quit()
        except (smtplib.SMTPException, IOError):
            smtp.close()


def paserYaml(yamlfile):
    """Return the ``markemail`` section of the YAML config file.

    Args:
        yamlfile: Path of the YAML file.

    Returns:
        The mapping stored under ``markemail``; an empty dict when the file
        is empty or has no such section.
    """
    import yaml
    with open(yamlfile) as f:
        # safe_load only builds plain data and never instantiates arbitrary
        # Python objects named in the file.
        config = yaml.safe_load(f) or {}
    return config.get('markemail') or {}


def check_email(emails):
    """Check that every entry of a comma-separated string is an e-mail.

    Args:
        emails: Addresses joined with ',' (no surrounding whitespace).

    Returns:
        True when every entry matches the address pattern, else False.
    """
    # Adjacent literals keep the pattern on one logical line.  The former
    # triple-quoted string embedded a newline in the regex, so no ordinary
    # address could match.
    pattern = re.compile(
        r'^[_a-z0-9-]+(\.[a-z0-9-]+)*@[a-z0-9-]+'
        r'(\.[a-z0-9-]+)*(\.[a-z]{2,3})$')
    return all(pattern.match(addr) for addr in emails.split(','))


def colorClass():
    """Return the pygments-css class names for each syntax pattern key.

    Each list is ordered like the capture groups of the matching pattern.
    """
    mapping = {
        'cl': ['k', 'nc', 'nb'],
        'fu': ['k', 'nf', 'bp'],
        'fm': ['nd', 'vi', 'nd', 'vi'],
        'im': ['nd', 'mi'],
        'ke': ['kd'],
        'bu': ['vc'],
    }
    return mapping


class cssStyle(object):
    """Produce the ``<style>`` element that is prepended to the HTML.

    ``style`` is the name of the method to run ('local' or 'crawler') and
    ``args`` are the positional arguments passed to it by ``main()``.
    """

    def __init__(self, style, *args):
        self.style = style
        self.args = args

    def fusionCss(self, csshtml):
        """Wrap raw CSS text in a ``<style>`` element with the code border."""
        return ''.join([
            '<style type="text/css">',
            '.codehilite {border: 2px solid rgb(225, 225, 225)}',
            csshtml,
            '</style>',
        ])

    def local(self, cssdir, theme):
        """Read ``<cssdir>/<theme>.css`` from the local file system."""
        log('Fetch css from local dir:{0}'.format(cssdir))
        with open(os.path.join(cssdir, '{0}.css'.format(theme))) as f:
            css = f.read().strip()
        return self.fusionCss(css)

    def crawler(self, theme):
        """Download the theme's stylesheet from igniteflow.com.

        Raises:
            requests.RequestException: on network errors, a timeout, or a
                non-2xx response.
        """
        import requests
        log('Fetch css from site:igniteflow.com')
        r = requests.get(
            'http://igniteflow.com/static/css/pygments/{0}.css'.format(theme),
            timeout=30)
        # Without this check a 404/500 error page would be embedded as CSS.
        r.raise_for_status()
        return self.fusionCss(r.text.strip())

    def main(self):
        """Run the method named by ``self.style`` with the stored args."""
        return getattr(self, self.style)(*self.args)


class FabricHtml(object):
    """Turn a markdown file into HTML with CSS and Python syntax spans."""

    def __init__(self, md, css):
        """Store the ``<style>`` text and convert the file at path *md*."""
        self.css = css
        self.md_html = self.makeToHtml(md)

    def makeToHtml(self, md):
        """Read the UTF-8 markdown file *md* and return it as HTML."""
        log('Markdown converted into html')
        # The context manager closes the file; it used to stay open.
        with codecs.open(md, mode="r", encoding="utf-8") as input_file:
            text = input_file.read()
        return markdown.markdown(text)

    def AddCssToHtml(self, html, css_html):
        """Prefix *html* with *css_html* and wrap ``` fenced parts in divs.

        Text after an odd-numbered ``` marker is preceded by an opening
        ``<div class="codehilite">``, text after an even-numbered one by
        ``</div>``.
        """
        parts = html.split('```')
        out = [css_html, parts[0]]
        for idx, part in enumerate(parts[1:], 1):
            out.append('<div class="codehilite">' if idx % 2 else '</div>')
            out.append(part)
        return ''.join(out)

    def makeSpan(self, html, c):
        """Wrap *html* in a span with CSS class *c*; '' for empty input."""
        if not html:
            return ''

        return '<span class="{0}">{1}</span>'.format(c, html)

    def _compiledPatterns(self):
        """Pair each syntax pattern (compiled) with its CSS class list."""
        classes = colorClass()
        return [(classes[key], re.compile(r'%s' % pattern))
                for key, pattern in regex().items()]

    def markHtml(self, h, patterns=None):
        """Wrap the Python syntax elements found on line *h* in spans.

        Args:
            h: One line of HTML.
            patterns: Optional list of ``(css_classes, compiled_regex)``
                pairs; built from ``regex()`` / ``colorClass()`` when
                omitted.

        Returns:
            The line with the first occurrence of each captured group
            wrapped in its span.
        """
        if patterns is None:
            patterns = self._compiledPatterns()
        for classes, pattern in patterns:
            m = pattern.match(h)
            if not m:
                continue
            for text, css in zip(m.groups(), classes):
                if text:
                    # Literal replacement: the captured text is source code,
                    # not a regex ("*args" made re.sub raise).
                    h = h.replace(text, self.makeSpan(text, css), 1)
        return h

    def main(self, template=''):
        """Return the styled, highlighted HTML followed by *template*."""
        has_css_html = self.AddCssToHtml(self.md_html, self.css)
        return self.pygments(has_css_html) + template

    def pygments(self, html):
        """Run ``markHtml`` on every line; each line ends with a newline."""
        log('Mark span label with python syntax')
        # Build the lexer and compile the patterns once, not once per line.
        patterns = self._compiledPatterns()
        return ''.join(self.markHtml(line, patterns) + '\n'
                       for line in html.split('\n'))


def checkSchema(schemadict, args):
    """Validate *args* against *schemadict* with the ``schema`` package.

    Args:
        schemadict: Mapping of option name to validator.
        args: Mapping of option name to value.

    Returns:
        The validated (and possibly converted) mapping.

    Raises:
        SchemaError: when validation fails; the message is logged first.
    """
    schema = Schema(schemadict)
    try:
        return schema.validate(args)
    except SchemaError as e:
        # The log call used to sit after a bare raise and never ran.
        log(e, error=True)
        raise


def main():
    """Command-line entry point: render MDFILE, then save or mail it.

    Steps: parse the command line with docopt, merge the optional YAML
    config over it, build the stylesheet (local directory or download),
    validate the remaining options, render the HTML and either write it to
    ``--output`` or send it by e-mail.
    """
    args = docopt(__doc__, version='1.0r1')

    hasConfig = args.get('--config')
    if hasConfig:
        checkSchema({
            '--config': And(Use(str),
                            os.path.exists,
                            error='Invalid config format or not exists')
        }, {'--config': hasConfig}
        )
        # Values from the YAML file take precedence over the command line.
        args.update(paserYaml(hasConfig))
    args.pop('--config')

    # Read these only after the merge: reading them earlier silently ignored
    # a --theme / --local given in the YAML file.
    isLocal = args.get('--local')
    theme = args.get('--theme')

    if isLocal:
        checkSchema({
            '--local': And(Use(str), os.path.isdir,
                           lambda n: os.path.exists(
                               '{0}/{1}.css'.format(n, theme)),
                           error='Invalid custom css dir or hasnot this theme'),
        }, {'--local': isLocal}
        )
        css_dict = cssStyle('local', isLocal, theme).main()
    else:
        css_dict = cssStyle('crawler', theme).main()
    args.pop('--local')
    args.pop('--theme')
    args = checkSchema({
        'MDFILE': os.path.exists,
        '--mailserver': Use(str, error='Invalid server format'),
        '--mailto': And(Use(str), check_email,
                        error='Invalid email format'),
        '--subject': Or(Use(str), Use(unicode),
                        error='Invalid subject format'),
        '--password': Use(str, error='Invalid password format'),
        '--cc': Or(None, And(Use(str), check_email),
                   error='Invalid email format'),
        '--output': Or(False, lambda n: os.path.exists(
            os.path.dirname('{0}/{1}'.format(os.path.abspath('.'), n))),
            error='Dir must exists'),
        '--template': Or(None, os.path.exists, error='template must exists'),
        # And(..., check_email) validates and keeps the string.
        # Use(check_email) replaced the user name with True/False.
        '--username': And(Use(str), check_email,
                          error='Invalid username format'),
        '--help': Or(False, True),
        '--version': Or(False, True)
    }, args)
    do = FabricHtml(args['MDFILE'], css_dict)
    cc = args['--cc'].split(',') if args['--cc'] else []

    if args['--template']:
        with codecs.open(args['--template'], mode="r", encoding="utf-8") as f:
            html_content = do.main(f.read())
    else:
        html_content = do.main()
    if args['--output']:
        # Save-only mode: write the HTML and do not send any mail.
        with codecs.open(args['--output'], mode="w", encoding="utf-8") as f:
            f.write(html_content)
        return

    sendMail(
        args['--mailserver'],
        args['--username'],
        args['--password'],
        args['--mailto'].split(','),
        args['--subject'],
        html_content,
        cc
    )

if __name__ == '__main__':
    # Run the command-line entry point only when executed as a script.

    main()

故障描述

最近终于更新了下gentoo,重启发现我的eth0网卡启动失败:

 * Bringing up interface eth0
 *   ERROR: interface eth0 does not exist
 *   Ensure that you have loaded the correct kernel module for your hardware
 * ERROR: net.eth0 failed to start

而启动某些我常用的服务,比如mongodb,也报错:

~ # /etc/init.d/mongodb restart
 * Bringing up interface eth0
 *   ERROR: interface eth0 does not exist
 *   Ensure that you have loaded the correct kernel module for your hardware
 * ERROR: net.eth0 failed to start
 * ERROR: cannot start mongodb as net.eth0 would not start

竟然也需要启动网卡?

查看内核和dmesg:

查看内核模块已经选中,而且以前eth0也有,再看dmesg

dmesg |grep network 
[   74.261872] systemd-udevd[14259]: renamed network interface wlan0 to wlp2s0
[   74.391865] systemd-udevd[14259]: renamed network interface eth0 to enp0s4

原来被重命名了

为什么?

从udev-197开始,系统会自动分配可预测的接口名字,具体解释请看[PredictableNetworkInterfaceNames](http://www.freedesktop.org/wiki/Software/systemd/PredictableNetworkInterfaceNames)。

解决办法,有三种

  1. 临时办法,重启还是会失效

ifrename -i enp0s4 -n eth0 #修改网卡名字变成原来的eth0

  1. 使用新的名字
rm /etc/init.d/net.eth0 #删除不存在的引用

localhost ~ # rc-update delete net.eth0 default #删除不存在的开机启动
 * service net.eth0 removed from runlevel default
localhost ~ # rc-update add net.enp0s4 default #使用新名字
  1. 重置udev的rules,还是用原来的方法
ln -s /dev/null /etc/udev/rules.d/80-net-name-slot.rules

第二种,和第三种需要重启

启动应用为什么也需要启动应该启动的网卡

查看/etc/init.d/mongodb脚本,发现是因为depend,一般的初始化脚本结构是

#!/sbin/runscript

depend() {
  (依赖关系信息)
}

start() {
  (启动服务所必需的命令)
}

stop() {
  (停止服务所必需的命令)
}

restart() {
  (重启服务所必需的命令)
}

比如 mongodb 的依赖是

depend() {
  need net #需要依赖net.X
}

下次我专门研究一篇gentoo初始化脚本的文章

前言

octopress自带的markdown语法高亮代码,最后展示在页面上的效果比较不友好-不能复制粘贴代码,不高亮,还有很丑的行数提示。 我一直使用SHJS,还算比较喜欢,但是以前每次编辑markdown文章,在本该使用

```XX```

的时候,使用

<div class="bogus-wrapper"><notextile><figure class="code"><pre class="sh_python">
XXX
</pre></figure></notextile></div>

这样的苦逼方式,最近实在是受不了了,于是自定义octopress的解析过程

其实就是修改plugins/pygments_code.rb

require 'pygments'
require 'fileutils'
require 'digest/md5'

PYGMENTS_CACHE_DIR = File.expand_path('../../.pygments-cache', __FILE__)
FileUtils.mkdir_p(PYGMENTS_CACHE_DIR)

module HighlightCode
  # Highlight +str+ as +lang+ with Pygments and wrap it for SHJS.
  # Short language names (ru, m, pl, yml) are expanded first.
  def highlight(str, lang)
    lang = 'ruby' if lang == 'ru'
    lang = 'objc' if lang == 'm'
    lang = 'perl' if lang == 'pl'
    lang = 'yaml' if lang == 'yml'
    # Keep only what is inside <pre>…</pre> (drops the wrapping
    # <div class="highlight">).  String#[] yields nil instead of raising
    # when there is no match, e.g. for empty code.
    inner = pygments(str, lang)[/<pre>(.+)<\/pre>/m, 1].to_s
    tableize_code(inner.gsub(/ *$/, ''), lang)
  end

  # Return the Pygments HTML for +code+.  When PYGMENTS_CACHE_DIR is
  # defined the result is cached in a file named after the language and
  # the MD5 of the code.
  def pygments(code, lang)
    render = lambda do
      Pygments.highlight(code, :lexer => lang, :formatter => 'html', :options => {:encoding => 'utf-8'})
    end
    return render.call unless defined?(PYGMENTS_CACHE_DIR)

    path = File.join(PYGMENTS_CACHE_DIR, "#{lang}-#{Digest::MD5.hexdigest(code)}.html")
    return File.read(path) if File.exist?(path)

    highlighted_code = render.call
    File.open(path, 'w') { |f| f.print(highlighted_code) }
    highlighted_code
  end

  # Wrap every line of +str+ in a span inside <pre class='sh_LANG'>.
  # This is the method that was customised for SHJS.
  def tableize_code(str, lang = 'python')
    spans = str.lines.map { |line| "<span class='line'>#{line}</span>" }
    "<pre class='sh_#{lang}'>#{spans.join}</pre>"
  end
end

使用方法

和过去一样,在md的文章中使用:

```XX```

要是想指定某语言,需要先引用这个css,然后在md中 比如这里用bash语法(也是我的默认)

```bash
XX
\``` # 这里不能正常显示,加个反斜杠

前言

最近做自己开发用相关服务的一个checklist,就写了这个脚本,用来在跳板机去检查各个服务器上面的相关服务是否正常

思路

使用expect登录每个机器(因为安全问题,不能直接使用ssh信任),然后根据yaml文件的配置读取服务名字以及启动的进程数量 去检查每个服务是否正常 PS:难点是没有用端口转发也只有普通用户权限

checklist.py

#coding=utf-8
import sys
#因为我这个脚本要让很多人能运行,但是不能给他们看见我的密码算法,所以是pyc
#我这个脚本要给很多其他普通用户去用,是用我的ssh登录操作,不能放在我的home目录,所以放在tmp
sys.path.append('/tmp/local/lib/python2.6/site-packages/PyYAML-3.10-py2.6-linux-x86_64.egg') #依赖yaml
sys.path.append('/tmp/local/lib/python2.6/site-packages/pexpect-2.4-py2.6.egg') #依赖pexpect
import yaml
import pexpect
# Load the YAML configuration.  safe_load only builds plain data, and the
# with-block closes the file handle.
with open('/tmp/config.yaml') as _config_file:
    dataDict = yaml.safe_load(_config_file)

def myprint(color, mes):
    """Print *mes* in bold using an ANSI colour escape sequence.

    Args:
        color: One of 'r', 'g', 'gb', 'y', 'b', 'p', 'o' (ANSI codes 31,
            32, 36, 33, 34, 35, 37 respectively).
        mes: Text to print.

    Raises:
        KeyError: if *color* is not one of the keys above.
    """
    codes = dict(r=31, g=32, gb=36, y=33, b=34, p=35, o=37)
    prefix = "\x1B[%d;%dm" % (1, codes[color])
    # Parenthesised single-argument print works on Python 2 and 3 alike.
    print("%s%s\x1B[0m" % (prefix, mes))

def main():
    """Log in to every ``bj-*`` server of the config and check its services.

    For each server an ssh session is driven with pexpect; for each
    configured service the number of matching processes (``ps | grep | wc``)
    is compared with the configured count and one coloured line is printed.

    NOTE(review): the indentation of the published listing was lost; the
    nesting below is reconstructed from the data flow.  ``mypasswd`` is not
    defined in this listing; per the original comment it returns the
    password of the given host.
    """
    colors = ['g', 'b', 'y', 'gb', 'p']
    light = 0
    for k in dataDict:
        if not k.startswith('bj-'):
            continue
        color = colors[light % len(colors)]  # rotate the colour per server
        SERVER = dataDict[k]
        # -F selects a custom ssh config (default is ~/.ssh/config): without
        # root the hosts file cannot be edited, so the aliases used in
        # config.yaml are defined there.
        child = pexpect.spawn('ssh -F /tmp/sshconfig dongwm@{0}'.format(SERVER['host']))
        # A user who never connected to this server is first asked to
        # confirm the host key.
        f = child.expect(['Password: ', 'password: ', 'continue connecting (yes/no)?'])
        if f == 2:
            child.sendline('yes')
            child.expect('password:')
        # Answer either spelling of the prompt; the 'Password: ' variant
        # (index 0) used to be left unanswered.
        child.sendline('{0}'.format(mypasswd(SERVER['host'])))
        child.expect('~')
        for service in SERVER['service']:
            if isinstance(service, dict):
                # "name: count" entry: the process count must match exactly.
                service, num = list(service.items())[0]
                any_count = False
            else:
                # Bare name: any positive number of processes is fine.
                num = 0
                any_count = True
            # NOTE(review): order kept from the original; this waits for a
            # further '~' before each command -- confirm against the prompt.
            child.expect('~')
            child.sendline('ps -ef|grep {0}|grep -v grep|wc -l'.format(
                service))
            child.readline()  # presumably the echoed command line
            pro_num = child.readline().split('\r\n')[0]  # process count
            if any_count:
                ok = int(pro_num) > 0
            else:
                ok = int(pro_num) == num
            if ok:
                myprint(color, '[{0}]  [{1}]  [{2}]  [{3}]'.format(k.center(12),
                        SERVER['ip'].center(14), service.center(20), 'ok'.center(4)))
            else:
                myprint('r', '[{0}]  [{1}]  [{2}]  [{3}]  [{4}!={5}]'.format(k.center(12),
                        SERVER['ip'].center(14), service.center(20), 'fail',
                        pro_num, num))
        light += 1
        child.sendline('exit')

if __name__ == '__main__':
    main()

config.yaml 我这里只截取了其中一段

bj-2:
  host: s233 #这个s233在sshconfig指定
  ip: XXX.XXX.XXX.233 #只是为了显示出ip 好确认
  service: #服务load后是一个列表
  #给XX用
  - nginx: 5
  - uwsgi: 25
  - supervisord: 1
  #给本机XX提供mysql服务
  - mysql: 3 #django
  #给本机XX提供XX
  - celery: 12 
  #给本机XX提供XX
  - rabbitmq: 9
  - redis: 1
  - mongod: 2

前言

最近又开始研究expect的一些更深层次的东西,分享出来

字典

expect没有严格意义的字典,但是确实可以使用

创建字典:

set mydict [dict create tbj tbjpass server serverpass]
它表示创建一个字典叫做mydict,包含2个kv对:tbj & tbjpass 和server & serverpass

你也可以这样添加数据:

dict set mydict .dongwm dongwmpass
 表示添加一个键为.dongwm 值为dongwmpass的新数据到mydict

根据key获取值可以这样:

[dict get $mydict server]
表示从mydict获取server的值

NB的事,可以直接这样写,看我的片段:

expect "password:"
send "[dict get $mydict s70]\n"
也就是直接把这个看起来像列表的东东直接写到字符串里面

判断变量是否存在

if {[info exists serverpass]!=1} {
    puts 'sd'
}
表示如果serverpass这个变量要是不存在,就puts,但是注意的是,
假如上面你已经set 这个变量,不管有没有值,这个变量都已经被*定义*了

判断列表包含

一种使用switch结构,还有一种是if方式,将属于一类的放在一个列表, 看它是不是’in’:

set listserver "1 2 3 4 "
if {1 in $listserver} {puts 11}
当1在列表$listserver里面puts

switch多条件

假如有一些switch的结果,但是他们有一些需要做一样的操作, 那么就可以把他们放在一起

switch $port {
    100  -
    200  { puts 1}
    300  -
    400 {puts 2}
    }
这里表示当port是100,或者200会puts1,当port是300或者400,会puts2

前言

上周看了docopt, 感想很多。最近因为工作需要用opensuse,用ubuntu,个人pc用gentoo, 实在够折腾,每个系统都要相应的安装那些软件,搭建环境。早就想好好整理下思路,更geek的做这件事情。上段时间还看了个 laptop,觉得能力很一般,但是fork真不少。但是确实这个想法很不错,很有必要。 然后周末就构思了我的laptop

它的特性

  • 记录操作记录,当某处出现故障,下次会从这个位置继续执行,而不需要全部执行一遍
  • docopt启发,根据我特定的语法写配置文件,不需要修改初始化脚本initialize.sh
  • 只需要添加你要安装的软件包的安装命令(使用包管理器的就需要修改相应操作系统的install文件)
  • 支持对已安装软件的确认,不再安装而跳过
  • 根据特定语法打印安装过程的提示
  • 提供绿色,红色的asciilinux终端显示字体
  • 执行在没有git等环境下git clone项目安装

目前每个版本会安装那些软件?

  • gcc g++ automake
  • tmux htop dstat
  • ruby python python库 expect
  • zsh oh-my-zsh
  • easy_install pip
  • django tornado flask
  • nginx uwsgi
  • redis mongodb
  • mit-scheme commonlisp
  • emacs
  • gitflow
  • celery
  • colout
  • MySQLdb pymongo
  • taglist
  • gevent twisted
  • the_silver_searcher
  • 检查vim是否自带python/ruby支持,否则下载编译一个执行的版本
  • 我的常用脚本mytools,目前包含一个expect脚本和orzdba
  • 我的dotfiles

gentoo系统一些软件

  • gentoolkit module-rebuild genlop eix euses elogv
  • fcitx
  • iproute2 netkit-telnetd

使用方法

  1. 有git的情况:
shell>git clone https://github.com/orzrd/laptop
shell>cd laptop
shell>bash initialize.sh
  1. 没有git的情况:
shell>bash <(curl -s https://raw.github.com/orzrd/laptop/master/setup_laptop)
脚本语法

可操作文件

  1. common.install: 用户自定义的软件安装脚本,推荐非操作系统包管理器的都放在这里,注意逻辑顺序
  2. opensuse/ubuntu/gentoo.install: 相应系统的安装脚本,这个只需要修改,我的脚本会自动根据系统信息找到
  3. initialize.sh: 假如你想修改我的安装方法、添加功能等,就修改它,它是主入口
  4. setup_laptop: 当用户没有git环境不能git clone 直接远程curl我,主要是下载git,clone我的laptop

  5. ’#’ 以’#’开头的行表示这个信息会被安装过程以绿色字体打印,提示一下你要安装的东西等
  6. : 以’:’开头的行,表示后面的字符串是个命令,也就是检查这个软件包有没有被安装需要的,假如which找到了路径说明被安装
  7. ; 以’;’开头的行为注释
  8. 其它行就是要执行的语句,请不要当作shell注释等,因为他会把你写的东西当成要执行的命令

TODO

  • 文件下载后就不需要再下载而直接使用
  • 在执行某软件的安装过程中其他进程继续下载其他软件包(也就是实现shell版本的emerge)
  • 进度条或者python_koans的提示已完成/剩余,更多的异常处理等
  • 更多的异常处理
  • 打印彩色字体内容嵌其他颜色字体(比如提示出错,高亮错误的原因或者软件包)

注意我的项目地址,欢迎各种fork,pull request,issue

https://github.com/orzrd/laptop

前言

公司全部使用了无线网络,我也被‘逼’的开始研究gentoo的无线上网,看了网上很多文章,以及gentoo文档,但是感觉都让我很迷糊,以下是我使用wpa_supplicant的一些总结

总结

  • 查看本机的无线网卡
emerge pciutils #这样就有了lspci这个命令
localhost ~ # lspci |grep -i wire
02:00.0 Network controller: Atheros Communications Inc. AR9285 Wireless Network Adapter (PCI-Express) (rev 01)

可以发现,网卡是Atheros的AR9285

  • 安装wpa_supplicant
emerge wpa_supplicant
  • 生成一个配置配置文件
bzcat /usr/share/doc/wpa_supplicant-2.0/wpa_supplicant.conf.bz2 > /etc/wpa_supplicant/wpa_supplicant.conf
  • 配置,以下是我去掉注释行,空白行等剩下的配置,其中的psk的字符串这样生成:
localhost ~ # wpa_passphrase 我的ssid 我的key
network={
	ssid="我的ssid"
	#psk="我的key"
	psk=e596aa911775ed47e04f5b9a9540978203210874eb258208b87cf82b5cf72588
}

把这段加在配置文件中

localhost ~ # cat /etc/wpa_supplicant.conf 

ctrl_interface=/var/run/wpa_supplicant
eapol_version=1
ap_scan=1
fast_reauth=1
network={
	ssid="我的ssid"
	psk=e596aa911775ed47e04f5b9a9540978203210874eb258208b87cf82b5cf72588
	priority=2
}
  • 命令行启动wpa(如果想看详细的信息用于调试,加-d选项)
localhost ~ # wpa_supplicant -i wlan0 -c /etc/wpa_supplicant/wpa_supplicant.conf   
Successfully initialized wpa_supplicant
wlan0: Trying to associate with 20:dc:c6:61:ab:34 (SSID='我的ssid' freq=2437 MHz)
ioctl[SIOCSIWFREQ]: Device or resource busy
wlan0: Association request to the driver failed
wlan0: Associated with 20:dc:c6:61:ab:34
wlan0: WPA: Key negotiation completed with 20:dc:c6:61:ab:34 [PTK=CCMP GTK=CCMP]
wlan0: CTRL-EVENT-CONNECTED - Connection to 20:dc:c6:61:ab:34 completed [id=0 id_str=]

其中wlan0: Association request to the driver failed 没关系

  • 安装udhcpc
emerge udhcpc
  • 通过dhcp自动获得
localhost ~ # dhcpcd wlan0
dhcpcd[12395]: version 5.6.4 starting
dhcpcd[12395]: wlan0: waiting for carrier
dhcpcd[12395]: wlan0: carrier acquired
dhcpcd[12395]: wlan0: carrier lost
dhcpcd[12395]: wlan0: waiting for carrier
dhcpcd[12395]: wlan0: carrier acquired
dhcpcd[12395]: wlan0: sending IPv6 Router Solicitation
dhcpcd[12395]: wlan0: sendmsg: Cannot assign requested address
dhcpcd[12395]: wlan0: rebinding lease of 192.168.0.106
dhcpcd[12395]: wlan0: acknowledged 192.168.0.106 from 192.168.0.1 `�'
dhcpcd[12395]: wlan0: checking for 192.168.0.106
dhcpcd[12395]: wlan0: sending IPv6 Router Solicitation
dhcpcd[12395]: wlan0: leased 192.168.0.106 for 7200 seconds
dhcpcd[12462]: wlan0: wlan0: MTU set to 576
dhcpcd[12395]: forked to background, child pid 12479

看到了吧 获得了192.168.0.106这个地址

前言:

上段时间做了个demo, 使用了flask和mongodb,以及bootstrap, jquery,分享给大家当作入门flask的例子

启动程序代码

#!/usr/bin/env python2
# encoding=utf-8
# Version 2 by Dongwm 2012/12/18

import os
from pymongo import Connection

from flask import Flask, request, render_template, redirect, url_for, jsonify
from paginate import Pagination
import setting

def static(filename):
    """Return the URL of a static file with its mtime as the query string.

    The modification time changes whenever the file does, so a changed file
    gets a new URL.
    """
    static_dir = os.path.join(os.path.dirname(__file__), 'static')
    mtime = os.path.getmtime(os.path.join(static_dir, filename))
    base_url = url_for('.static', filename=filename)
    return base_url + '?' + '%d' % mtime

def create_app():
    """Create the Flask application and expose ``static`` to templates.

    Configuration is loaded from the ``setting`` module.
    """
    application = Flask(__name__)
    application.config.from_object(setting)

    def inject_static():
        # Makes static(...) callable from every template.
        return {'static': static}

    application.context_processor(inject_static)
    return application

def conMongo():
    """Return a pymongo connection to the MongoDB at 127.0.0.1:28012."""
    # Wrapped in a function because a connection is needed in many places.
    return Connection(host='127.0.0.1', port=28012)

app = create_app()

@app.route('/list')
def list():
    """Render one page (100 rows) of the listing for ``/list``.

    The page number comes from the ``page`` query argument, default 0.
    """
    pagesize = 100
    # type=int falls back to the default when ?page= is missing or not a
    # number; int(...) raised ValueError and produced a 500 response.
    page = request.args.get('page', 0, type=int)
    # get_list_MongoData is not part of this listing; it is used here as
    # returning (rows, total) for the given page and page size.
    data = get_list_MongoData(page, pagesize)
    pagination = Pagination(total=data[1], per_page=pagesize, page=page)
    # Template variables are passed as keyword arguments.
    return render_template("list.html", tables=data[0], pagination=pagination)

@app.route('/')
def index():
    """Redirect requests for the site root to the ``list`` view."""
    target = url_for('list')
    return redirect(target)

@app.route('/json')
def getJson():
    """Return the output of ``results`` as a JSON response."""
    # results() is defined outside this listing.
    payload = results(conMongo())
    return jsonify(payload)

@app.route('/dev')
def dev():
    """Render the chart page template."""
    page = render_template("dev.html")
    return page

if __name__ == '__main__':
    # Listen on the configured PORT; it was loaded into app.config but
    # never passed to run().
    app.run(host="0.0.0.0", port=app.config.get('PORT', 5000))

setting.py

DEBUG = True  #指定开启debug模式
PORT = 5000  #指定监听端口

dev.html




{% extends 'base.html' %}  //这里是先继承base.html模板
{% block title %}Dev{% endblock %}  //重新设定title块的内容
{%- block css %}  //重新设定css块  注意引用静态文件的方式
    <link rel="stylesheet" href="{{ static(filename='css/devstyle.css') }}" />
    <style>
	.col_content{ height:500px; }
	h2 {text-align:center;}
	</style>
{%- endblock %}
{%- block js %}
    <script type="text/javascript" src="{{ static(filename='js/amcharts.js') }}"></script>
    <script type="text/javascript" src="{{ static(filename='js/raphael.js') }}"></script>
    <script language="javascript" type="text/javascript">
		</script>
		{%- endblock %}
		{%- block diejs %}
			pie2html();  //这个js函数在core.js定义(base.html有引用)
		{%- endblock %}
		{%- block dev %}
	<div>
		<h2>服务器服务信息</h2>
		<div class="well col_content" id="webserver_content">
		Loading&#8230;&#8230;
		</div>
		<h2>服务器应用信息</h2>
		<div class="well col_content" id="webapp_content">
		Loading&#8230;&#8230;
		</div>
		<h2>Nginx服务具体版本</h2>
		<div class="well col_content" id="nginx_content">
		Loading&#8230;&#8230;
		</div>
		<h2>Apache服务具体版本</h2>
		<div class="well col_content" id="apache_content">
		Loading&#8230;&#8230;
		</div>
		<h2>Asp服务具体版本</h2>
		<div class="well col_content" id="asp_content">
		Loading&#8230;&#8230;
		</div>
		<h2>网站技术信息</h2>
		<div class="well col_content" id="tech_content">
		Loading&#8230;&#8230;
		</div>
		<h2>系统分类</h2>
		<div class="well col_content" id="system_content">
		Loading&#8230;&#8230;
		</div>
		<h2>系统分类</h2>
		<div class="well col_content" id="version_content">
		Loading&#8230;&#8230;
		</div>
		<h2>os</h2>
		<div class="well col_content" id="os_content">
		Loading&#8230;&#8230;
		</div>
	</div>
{%- endblock %}


base.html



<!doctype html>
<html>
  <head>
    <title> {% block title %}{% endblock %}</title>  //block会设置的一个块,每个模板文件要是重新定义会覆盖,否则继承它的值
    <link rel="stylesheet" href="{{ static(filename='css/bootstrap.css') }}" />
    <link rel="stylesheet" href="{{ static(filename='css/bootstrap-responsive.css') }}" />  
    <link rel="stylesheet" href="{{ static(filename='css/style.css') }}" />
    {%- block css %}
    {%- endblock %}
    <script type="text/javascript" src="{{ static(filename='js/jquery-1.8.0.min.js') }}"></script>
    <script type="text/javascript" src="{{ static(filename='js/core.js') }}"></script>
    {%- block js %}
    {%- endblock %}
	<script language="javascript" type="text/javascript">
		$(document).ready(function() {
		var i = 0;
		$('#control').click(function() {
				if(i%2 == 0) {
					$('#zt-user').slideDown(500);  // slide the search panel open
					$('#control').removeClass('bkg-control-down').addClass('control-up');
				}
				else {
					$('#zt-user').slideUp(500);
					$('#control').removeClass('bkg-control-up').addClass('control-down');
				}
				i++;
		});
		{%- block diejs %}  // child templates put their own on-ready code here, so each page needs only one $(document).ready
		{%- endblock %}
		});  // closes $(document).ready, which was left open
		</script>
</head>
	<body>
		<div id="zt-user">
			<div class="container">
				<div id="zt-user-inner" class="row-fluid">
					<div id="zt-top1" class="span12">
						<div class="zt-box-inside">
							<div class="moduletable">
								<div class="modulecontent">
						<form action="/" method="post">
							<div class="search">
								<input name="searchword" maxlength="20" class="inputbox" type="text" size="20" value="Start Searching ... " onblur="if (this.value=='') this.value='Start Searching ... ';" onfocus="if (this.value=='Start Searching ... ') this.value='';" /><input type="submit" value="Search" class="button" onclick="this.form.searchword.focus();" />
							</div>
						</form>
								</div>
							</div>
						</div>
					</div>																		
				</div>
			</div>
		</div>
		{%- block dev %}{%- endblock %}   //上面的dev.html重新声明了这个块,那么数据就会显示在这个位置
		{%- block top %}
		<div id="zt-top">
			<div class="container">						
				<div class="row-fluid">
					<div class="control-up span6" id="control"><span>Search</span></div>
				<ul id="zt-topright" class="pull-right">
				<li class="blue" target="_blank"><a title="Demo" href="/list">列表</a></li>
				<li class="green" target="_blank"><a title="Demo" href="/dev">画图</a></li>
				</ul>
					</div>
				</div>
			</div>
		{%- endblock %}
		{%- block ttable %}{%- endblock %}
		{%- block footer %}
		<div id="zt-footer">
			<div class="container">				
				<p id="copyright">
				Copyright &copy; 2009 - 2013 <a href="http://www.dongwm.com" title="dongwm">(C)dongwm</a>. All Rights Reserved
				</p>
			</div>						
		</div>
		{%- endblock %}
	</body>
</html>


前言

标题有点唬人,以前了解过研究gevent,twisted,scrapy(基于twisted)。最近有个想法:这些东西比如做爬虫,谁的效率更好呢? 我就写了以下程序(附件)测试然后用timeit(跑3次,每次10遍,时间有限)看效果

原理:

  1. 为了防止远程网络的问题,从一个网站爬下网页代码(html),页面下载本地放在了我的本机(gentoo+apache)
  2. 然后爬虫去分析这些页面上面的链接(开始是主页),再挖掘其他页面,抓取页面关键字(我这里就是个‘py’) 程序打包Crawler.tar.bz2

先看代码树:

dongwm@localhost ~ $ tree Crawler/
Crawler/
├── common_Crawler.py  #标准爬虫,里面只是多线程编程,抓取分析类在common.py
├── common.py  #共用函数,里面只是抓取页面分析页面关键字
├── common.pyc #你懂得
├── Crawler #scrapy和django框架差不多的用法
│   ├── __init__.py
│   ├── __init__.pyc
│   ├── items.py #不需要利用,默认
│   ├── pipelines.py
│   ├── settings.py
│   ├── settings.pyc
│   └── spiders #抓取脚本文件夹
│       ├── __init__.py
│       ├── __init__.pyc
│       ├── spiders.py #我做的分析页面,这个和多线程/gevent调用的抓取分析类不同,我使用了内置方法(大家可以修改共用函数改成scrapy的方式,这样三种效果就更准确了)
│       └── spiders.pyc
├── gevent_Crawler.py #gevent版本爬虫,效果和标准版一样,抓取分析类也是common.py 保证其他环节相同,只是一个多线程,一个用协程
├── scrapy.cfg
└── scrapy_Crawler.py #因为scrapy使用是命令行,我用subprocess封装了命令,然后使用timeit计算效果

2 directories, 16 files

实验前准备:

停掉我本机使用的耗费资源的进程 firefox,vmware,compiz等,直到负载保持相对平稳

测试程序:

  1. common.py
#!/usr/bin/python
#coding=utf-8

# Version 1 by Dongwm 2013/01/10
# 脚本作用:多线程抓取
# 方式: lxml + xpath + requests

import requests
from  cStringIO import StringIO
from lxml import etree

class Crawler(object):
    """Fetch pages from http://localhost/ and scan them for a keyword.

    Work is scheduled through ``app``, an object that provides
    ``add_job(callable, *args)`` and ``wait_for_complete()``.
    """

    def __init__(self, app):
        self.deep = 2  # crawl depth
        self.url = ''  # start page, relative to http://localhost/
        self.key = 'by'  # keyword to search for
        self.tp = app  # job pool used to schedule page fetches
        self.visitedUrl = []  # pages already fetched, to avoid repeats

    def _hasCrawler(self, url):
        """Return True when *url* has already been fetched."""
        return url in self.visitedUrl

    def getPageSource(self, url, key, deep):
        """Fetch *url*, queue its links and scan its text for *key*.

        Args:
            url: Page path relative to http://localhost/.
            key: Keyword, as byte string or text.
            deep: Remaining crawl depth; links are followed while > 1.

        Returns:
            True after processing, None when the page was seen before.
        """
        if self._hasCrawler(url):
            return
        self.visitedUrl.append(url)
        r = requests.get('http://localhost/%s' % url)
        # r.text is already decoded.  Re-encoding it as UTF-8 and decoding
        # with r.encoding garbled every non-UTF-8 page.
        result = r.text
        # Decode the keyword once, instead of relying on a TypeError retry
        # that could run _xpath a second time.
        if isinstance(key, bytes):
            key = key.decode('utf8')
        self._xpath(url, result, ['a'], key, deep)  # links and their text
        self._xpath(url, result, ['title', 'p', 'li', 'div'], key, deep)
        return True

    def _xpath(self, weburl, data, xpath, key, deep):
        """Queue the ``*.html`` links and match *key* in the listed tags.

        Args:
            weburl: Path of the page being processed (unused).
            data: HTML text of the page.
            xpath: Tag names to select with ``//tag``.
            key: Keyword inserted into the search pattern.
            deep: Remaining crawl depth.
        """
        # Local import: common.py's import block does not bring in re.
        import re
        page = etree.HTML(data)
        if page is None:  # nothing parseable, e.g. an empty document
            return
        matcher = re.compile(r'.*%s.*' % key)  # compiled once per page
        skipped_prefixes = ('java', '#', 'mailto')  # script, anchor, mail links
        for tag in xpath:
            nodes = page.xpath(u"//%s" % tag)
            if deep > 1:
                for node in nodes:
                    url = node.attrib.get('href', '')
                    if url.endswith('html') and \
                            not url.startswith(skipped_prefixes):
                        # Followed through the pool until depth runs out.
                        self.tp.add_job(self.getPageSource, url, key, deep - 1)
            for node in nodes:
                value = node.text
                if value:
                    # The result is not stored, as in the original listing.
                    matcher.match(value)

    def work(self):
        """Queue the start page and block until the pool has finished."""
        self.tp.add_job(self.getPageSource, self.url, self.key, self.deep)
        self.tp.wait_for_complete()
  1. common_Crawler.py
#!/usr/bin/python
#coding=utf-8

# Version 1 by Dongwm 2013/01/10
# 脚本作用:多线程



import time
import threading
import Queue
from common import Crawler

#lock = threading.Lock()   #设置线程锁


class MyThread(threading.Thread):
    '''Worker thread that runs ``(callable, args)`` jobs pulled from a shared queue.'''

    def __init__(self, workQueue, timeout=1, **kwargs):
        '''Store the queue and start the thread immediately.

        workQueue -- queue of ``(callable, args)`` tuples to execute.
        timeout   -- seconds to wait on an empty queue before the thread exits.
        kwargs    -- forwarded to ``threading.Thread`` as its ``kwargs`` argument.
        '''
        threading.Thread.__init__(self, kwargs=kwargs)
        self.timeout = timeout
        # Daemon thread: it does not keep the process alive once the main
        # thread has exited.
        self.daemon = True
        self.workQueue = workQueue
        self.start()  # the worker starts running as soon as it is constructed

    def run(self):
        '''Execute queued jobs until the queue stays empty for ``timeout`` seconds.

        A job that raises is reported on stderr and the worker moves on to the
        next job; previously the first failing job ended the thread silently,
        shrinking the pool.
        '''
        import traceback

        while True:
            try:
                # The queue does its own locking, so no extra lock is needed.
                func, args = self.workQueue.get(timeout=self.timeout)
            except Queue.Empty:
                break  # nothing left to do: let the thread finish
            try:
                func(*args)
            except Exception:
                traceback.print_exc()


class ThreadPool(object):
    '''Fixed-size pool of ``MyThread`` workers fed from one shared job queue.'''

    def __init__(self, num_of_threads):
        '''Create the job queue and start ``num_of_threads`` workers.'''
        self.workQueue = Queue.Queue()
        self.threads = []
        self.__createThreadPool(num_of_threads)

    def __createThreadPool(self, num_of_threads):
        '''Instantiate the workers; each ``MyThread`` starts itself on creation.'''
        for _ in range(num_of_threads):
            self.threads.append(MyThread(self.workQueue))

    def wait_for_complete(self):
        '''Join every worker; the thread list is empty afterwards.'''
        while self.threads:
            thread = self.threads.pop()
            # is_alive() is the supported spelling; the camelCase isAlive()
            # alias no longer exists on current Python versions.
            if thread.is_alive():
                thread.join()

    def add_job(self, callable, *args):
        '''Put ``(callable, args)`` on the queue for a worker to execute.'''
        self.workQueue.put((callable, args))
def main():
    '''Run one crawl on a pool of ten worker threads.'''
    Crawler(ThreadPool(10)).work()

if __name__ == '__main__':

    import timeit
    # Hand timeit the function object: a "main()" statement string is executed
    # in timeit's own namespace, where ``main`` is not defined.
    t = timeit.Timer(main)
    # repeat() returns the list of timings; print it so the run reports them.
    print(t.repeat(3, 10))
  1. gevent_Crawler.py
#!/usr/bin/python
#coding=utf-8

# Version 1 by Dongwm 2013/01/10
# 脚本作用:gevent

import gevent.monkey
gevent.monkey.patch_all()
from gevent.queue import Empty, Queue
import gevent
from common import Crawler

class GeventLine(object):
    '''Worker loop that executes ``(callable, args)`` jobs taken from a queue.'''

    def __init__(self, workQueue, timeout=1, **kwargs):
        '''Remember the queue and the idle timeout.

        workQueue -- queue of ``(callable, args)`` tuples to execute.
        timeout   -- seconds to wait on an empty queue before ``run`` returns.
        kwargs    -- accepted but not used.
        '''
        self.timeout = timeout
        self.workQueue = workQueue

    def run(self):
        '''Execute queued jobs until the queue stays empty for ``timeout`` seconds.

        Each job's result is printed. A job that raises has its error printed
        and the loop continues with the next job; previously the first failing
        job ended the worker, leaving queued work unprocessed.
        '''
        while True:
            try:
                func, args = self.workQueue.get(timeout=self.timeout)
            except Empty:
                break  # queue drained: stop this worker
            try:
                print(func(*args))
            except Exception as e:
                print(e)

class GeventPool(object):
    '''Pool of greenlets, each running a ``GeventLine`` loop over one shared queue.'''

    def __init__(self, num_of_threads):
        '''Create the job queue and spawn ``num_of_threads`` greenlets.'''
        self.workQueue = Queue()
        self.threads = []
        self.__createThreadPool(num_of_threads)

    def __createThreadPool(self, num_of_threads):
        '''Spawn one greenlet per worker and keep the handles in ``self.threads``.'''
        for _ in range(num_of_threads):
            worker = GeventLine(self.workQueue)
            self.threads.append(gevent.spawn(worker.run))

    def wait_for_complete(self):
        '''Join every spawned greenlet, then shut gevent down.'''
        while self.threads:
            self.threads.pop().join()
        # NOTE(review): gevent.shutdown() may not exist in newer gevent
        # releases -- confirm against the installed version.
        gevent.shutdown()

    def add_job(self, callable, *args):
        '''Put ``(callable, args)`` on the queue for a worker to execute.'''
        job = (callable, args)
        self.workQueue.put(job)

def main():
    '''Run one crawl on a pool of ten gevent workers.'''
    Crawler(GeventPool(10)).work()

if __name__ == '__main__':

    import timeit
    # Hand timeit the function object: a "main()" statement string is executed
    # in timeit's own namespace, where ``main`` is not defined.
    t = timeit.Timer(main)
    # repeat() returns the list of timings; print it so the run reports them.
    print(t.repeat(3, 10))

  1. Crawler/spiders/spiders.py
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.item import Item

class MySpider(CrawlSpider):
    '''Spider for ``http://localhost``; responses for matching links go to ``parse_item``.'''
    name = 'localhost'
    allowed_domains = ['localhost']
    start_urls = ['http://localhost']
    # Links matching http://localhost/... are extracted and handled by parse_item.
    rules = ( 
        Rule(SgmlLinkExtractor(allow=(r'http://localhost/.*')), callback="parse_item"),  
    )  
    def parse_item(self, response):
        '''Search the text nodes selected from ``response`` for the string ``py``.

        The matches are computed but neither returned nor stored.
        '''
        hxs = HtmlXPathSelector(response)
        # Stands in for the fetch-and-analyse step of common.py. common.py scans
        # five tag types in two passes, while this XPath is not limited to those
        # tags, so the comparison is not rigorous.
        hxs.select('//*[@*]/text()').re(r'py')

  1. scrapy_Crawler.py #时间有限,没有研究模块调用,也不够严谨

#!/usr/bin/python
#coding=utf-8

# Version 1 by Dongwm 2013/01/10
# 脚本作用:scrapy

from subprocess import call

def main():
    '''Run the ``localhost`` spider via the scrapy command line with ``--nolog``.'''
    command = 'scrapy crawl localhost --nolog'
    call(command, shell=True)

if __name__ == '__main__':

    import timeit
    # Hand timeit the function object: a "main()" statement string is executed
    # in timeit's own namespace, where ``main`` is not defined.
    t = timeit.Timer(main)
    # repeat() returns the list of timings; print it so the run reports them.
    print(t.repeat(3, 10))

实验过程

1. 同时启动三个终端,一起跑(手点回车,肯定有点延迟)
dongwm@localhost ~/Crawler $ python scrapy_Crawler.py
10000000 loops, best of 3: 0.024 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0223 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0223 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0223 usec per loop
10000000 loops, best of 3: 0.0222 usec per loop
10000000 loops, best of 3: 0.0223 usec per loop #他是最快跑完的,非常快~~  数据很稳定

dongwm@localhost ~/Crawler $ python gevent_Crawler.py
100000000 loops, best of 3: 0.0134 usec per loop
100000000 loops, best of 3: 0.0131 usec per loop
100000000 loops, best of 3: 0.0132 usec per loop
100000000 loops, best of 3: 0.0132 usec per loop
100000000 loops, best of 3: 0.0132 usec per loop
100000000 loops, best of 3: 0.0132 usec per loop
100000000 loops, best of 3: 0.0132 usec per loop
100000000 loops, best of 3: 0.0132 usec per loop
100000000 loops, best of 3: 0.0132 usec per loop
100000000 loops, best of 3: 0.0132 usec per loop
100000000 loops, best of 3: 0.0134 usec per loop
100000000 loops, best of 3: 0.0132 usec per loop
100000000 loops, best of 3: 0.0133 usec per loop
100000000 loops, best of 3: 0.0133 usec per loop
100000000 loops, best of 3: 0.0133 usec per loop
100000000 loops, best of 3: 0.0132 usec per loop
100000000 loops, best of 3: 0.0133 usec per loop
100000000 loops, best of 3: 0.0132 usec per loop
100000000 loops, best of 3: 0.0126 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0123 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0123 usec per loop  #跑得很慢,不知道是不是timeit的原因(或者调用的优先级太低,抢资源能力不行),很奇怪,但是它的数据最快,数据稳定在0.0123-0.0133


dongwm@localhost ~/Crawler $ python common_Crawler.py
100000000 loops, best of 3: 0.0274 usec per loop
10000000 loops, best of 3: 0.0245 usec per loop
10000000 loops, best of 3: 0.0252 usec per loop
10000000 loops, best of 3: 0.0239 usec per loop
10000000 loops, best of 3: 0.025 usec per loop
10000000 loops, best of 3: 0.0273 usec per loop
10000000 loops, best of 3: 0.0255 usec per loop
10000000 loops, best of 3: 0.0261 usec per loop
10000000 loops, best of 3: 0.0275 usec per loop
10000000 loops, best of 3: 0.0261 usec per loop
10000000 loops, best of 3: 0.0257 usec per loop
10000000 loops, best of 3: 0.0273 usec per loop
10000000 loops, best of 3: 0.0241 usec per loop
10000000 loops, best of 3: 0.0257 usec per loop
10000000 loops, best of 3: 0.0275 usec per loop
10000000 loops, best of 3: 0.0241 usec per loop
10000000 loops, best of 3: 0.0259 usec per loop
10000000 loops, best of 3: 0.0251 usec per loop
10000000 loops, best of 3: 0.0193 usec per loop
10000000 loops, best of 3: 0.0176 usec per loop
100000000 loops, best of 3: 0.0199 usec per loop
100000000 loops, best of 3: 0.0167 usec per loop
100000000 loops, best of 3: 0.018 usec per loop
10000000 loops, best of 3: 0.0179 usec per loop
100000000 loops, best of 3: 0.0173 usec per loop
100000000 loops, best of 3: 0.0172 usec per loop
100000000 loops, best of 3: 0.018 usec per loop
100000000 loops, best of 3: 0.0162 usec per loop
100000000 loops, best of 3: 0.0179 usec per loop
100000000 loops, best of 3: 0.0171 usec per loop  #第二跑得快,但是还是数据不稳定,时间在0.017-0.026之间

#####2. 挨个启动,待负载保持在一个相对平稳的波动范围后,再换另一个

dongwm@localhost ~/Crawler $ python scrapy_Crawler.py
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0122 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0123 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0123 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0122 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0123 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0122 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0122 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0126 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop   #数据很稳定,在0.0122-0.0126之间 机器负载在1.3左右,最高超过了1.4(闲暇0.6左右)
dongwm@localhost ~/Crawler $ python gevent_Crawler.py
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0126 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0126 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0126 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0126 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0126 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0126 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0126 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop
100000000 loops, best of 3: 0.0125 usec per loop
100000000 loops, best of 3: 0.0124 usec per loop  #数据很稳定,在0.0124-0.0126之间 机器负载在1.2左右(闲暇0.6左右)
dongwm@localhost ~/Crawler $ python common_Crawler.py
10000000 loops, best of 3: 0.0135 usec per loop
100000000 loops, best of 3: 0.0185 usec per loop
10000000 loops, best of 3: 0.0174 usec per loop
100000000 loops, best of 3: 0.019 usec per loop
10000000 loops, best of 3: 0.016 usec per loop
10000000 loops, best of 3: 0.0181 usec per loop
10000000 loops, best of 3: 0.0146 usec per loop
100000000 loops, best of 3: 0.0192 usec per loop
10000000 loops, best of 3: 0.0165 usec per loop
10000000 loops, best of 3: 0.0176 usec per loop
10000000 loops, best of 3: 0.0177 usec per loop
10000000 loops, best of 3: 0.0182 usec per loop
100000000 loops, best of 3: 0.0195 usec per loop
10000000 loops, best of 3: 0.0163 usec per loop
10000000 loops, best of 3: 0.0161 usec per loop
100000000 loops, best of 3: 0.0191 usec per loop
100000000 loops, best of 3: 0.0193 usec per loop
10000000 loops, best of 3: 0.0147 usec per loop
100000000 loops, best of 3: 0.0197 usec per loop
10000000 loops, best of 3: 0.0178 usec per loop
10000000 loops, best of 3: 0.0172 usec per loop
100000000 loops, best of 3: 0.022 usec per loop
100000000 loops, best of 3: 0.0191 usec per loop
10000000 loops, best of 3: 0.0208 usec per loop
10000000 loops, best of 3: 0.0144 usec per loop
10000000 loops, best of 3: 0.0201 usec per loop
100000000 loops, best of 3: 0.0195 usec per loop
100000000 loops, best of 3: 0.0231 usec per loop
10000000 loops, best of 3: 0.0149 usec per loop
100000000 loops, best of 3: 0.0211 usec per loop #数据有点不稳定,浮动较大,但是主要在0.016-0.019  机器负载曾经长时间在1.01,最高未超过1.1 (闲暇0.6左右)

一些我的看法

虽然我的实验有不够严谨的地方,我的代码能力也有限(希望有朋友看见代码能提供修改意见或更NB的版本),但是效果还是比较明显的,我总结下

  1. gevent确实性能很好,并且很稳定,占用io一般(据说长时间使用有内存泄露的问题?我不理解)
  2. scrapy这个框架把爬虫封装的很好,只需要最少的代码就能实现,性能也不差gevent
  3. 多线程编程确实有瓶颈,并且不稳定

前言

最近搞了一些关于flask和django的东西,尤其是django的模板和admin功能以及这些框架使用bootstrap的东西,没时间更新博客,先说一下flask和django分页吧

flask的bootstrap分页插件flask-paginate

其实安装很常规,他的思路就是根据你的数据量给每个页面加一个li前缀到最后返回的div里面。因为官网提供的说明很简单,我在这里仔细说说:

  1. 官网说给你的网站页面添加css:
/* Styles for the summary text that pagination.info renders. */
.pagination-page-info {
    padding: .6em;
    padding-left: 0;
    width: 40em;
    margin: .5em;
    margin-left: 0;
    font-size: 12px;
}
/* <b> elements inside the summary text. */
.pagination-page-info b {
    color: black;
    background: #6aa6ed;
    padding-left: 2px;
    /* The shorthand below sets all four sides, so it overrides padding-left above. */
    padding: .1em .25em;
    font-size: 150%;
}

其实这个是给你页面显示统计数据的方法pagination.info提供的样式,默认的class=’pagination’是bootstrap自带的,不需要你添加

  1. 官网的例子使用的是:Blueprint:

我们一般都是: ‘from flask import Flask’,其实Blueprint就是一个可定制的容器,一个应用可以有多个容器,他们都继承于flask.helpers._PackageBoundObject 可以看我的一个例子:

@app.route('/')
def index():
    '''Render ``index.html`` together with its pagination bar.'''
    per_page = 100  # number of entries shown on each page
    current = int(request.args.get('page', 0))  # page number from the query string
    # get_MongoData is the author's own helper: it filters the (very large)
    # data set down to the requested page. Element 1 of its result is used as
    # the total entry count.
    result = get_MongoData(current, per_page)
    # total = overall entry count, per_page = entries per page, page = current
    # page; the previous/next labels can also be customised (default << / >>).
    pager = Pagination(total=result[1], per_page=per_page, page=current)
    return render_template("index.html", pagination=pager)
  1. 我对他的一点修改:
    1. 我发现在我的程序里面,这个分页栏在后部会放不下而换行显示,我就直接把link_css制定的div改成了行内元素span
    2. 当我默认使用link_size,代码是这样:
    link_css = '<span class="pagination{0} green"><ul>'

    其实最后页面出来的效果是 '<span class="paginationNone green"><ul>',这样就没有符合的bootstrap类,所以我修改了links方法:

    @property
    def links(self):
        '''get all the pagination links'''
        if self.total_pages <= 1:
            return ''
        if not self.link_size:
            self.link_size = ''
        s = [link_css.format(self.link_size)]
        s.append(self.prev_page)
        for page in self.pages:
            s.append(self.single_page(page) if page else gap_marker)
        s.append(self.next_page)
        s.append('</ul></span>')
        return ''.join(s)

#####django的bootstrap分页插件django-bootstrap-pagination

django的插件比较复杂,它自己定义了中间件和标签,这样你需要在模板中load它提供的函数,并且很nb的使用了RequestContext去处理变量,可以看张沈鹏以前写的一个小文章:django 简化 view 函数的编写

  1. 先看我的后台方法:

def showlist(req):
    '''Render ``list.html`` with the sites whose name ends with the ``app`` query value.

    Reads ``type`` and ``app`` from the query string. Only ``type == 'v'``
    runs a MongoDB query; any other request renders the template with an
    empty result instead of failing on an unbound ``data``.
    '''
    kind = req.GET.get('type', None)
    suffix = req.GET.get('app', None)
    # Default result for requests that do not reach the query below.
    data = []
    if kind and suffix:
        db = getMongo('XXX.XXX.XXX.XXX:XX', 'dc2')
        if kind == 'v':
            # ``suffix`` comes straight from the request: escape it so it is
            # matched literally rather than interpreted as a regex.
            pattern = re.compile(r'.*%s$' % re.escape(suffix))
            data = db.site.find(
                {'modules.site.level': 'v4', 'site': {'$regex': pattern}},
                {'site': 1, '_id': 0, 'modules.site.links': 1, 'modules.site.keywords': 1}
            ).sort('modules.site.site.check_time')

    return render_to_response("list.html", {'data': data}, context_instance=RequestContext(req))

但是运行时候会报错:

Traceback:
File "/usr/local/lib/python2.6/dist-packages/django/core/handlers/base.py" in get_response
  111.                         response = callback(request, *callback_args, **callback_kwargs)
File "/home/dongwm/centerCon/views.py" in showlist
  68. 	return render_to_response("list.html", {'data':data}, context_instance=RequestContext(req))
File "/usr/local/lib/python2.6/dist-packages/django/shortcuts/__init__.py" in render_to_response
  20.     return HttpResponse(loader.render_to_string(*args, **kwargs), **httpresponse_kwargs)
File "/usr/local/lib/python2.6/dist-packages/django/template/loader.py" in render_to_string
  176.         return t.render(context_instance)
File "/usr/local/lib/python2.6/dist-packages/django/template/base.py" in render
  140.             return self._render(context)
File "/usr/local/lib/python2.6/dist-packages/django/template/base.py" in _render
  134.         return self.nodelist.render(context)
File "/usr/local/lib/python2.6/dist-packages/django/template/base.py" in render
  823.                 bit = self.render_node(node, context)
File "/usr/local/lib/python2.6/dist-packages/django/template/debug.py" in render_node
  74.             return node.render(context)
File "/home/dongwm/centerCon/templatetags/pagination_tags.py" in render 
  91.             page_obj = paginator.page(context['request'].page)
File "/usr/local/lib/python2.6/dist-packages/django/template/context.py" in __getitem__
  54.         raise KeyError(key)

Exception Type: KeyError at /showlist/
Exception Value: 'request'

不管你用哪个插件都会有这个报错。。。

后来发现原因是:

settings文件没有设置TEMPLATE_CONTEXT_PROCESSORS。理由:模板上下文处理器指定了哪些context processors总是默认被使用,这样就省去了每次使用RequestContext都指定processors的麻烦。在settings加入:

TEMPLATE_CONTEXT_PROCESSORS = (
    "django.core.context_processors.media",
    "django.core.context_processors.request",
)