服务器hang的检查

做运维的都知道,最怕的不是机器直接挂掉,而是怕机器hang在那里,能ping通但是又登录不上去。周末加班写了个检测脚本,发送icmp包进行ping的检查,如果有返回再继续做ssh端口的检查或者ssh登录的检查。python不像perl下直接有个很好用的Net::Ping,自己网上找了个python-ping,修改了一下放脚本里面直接用。


#!/usr/bin/env python2.7
import socket
import sys
import paramiko
import os
import select
import struct
import time
import threading
import Queue
import copy
import string
import hashlib
from collections import deque
ICMP_ECHO_REQUEST = 8 # Seems to be the same on Solaris.
class CheckHang:
    """
    Liveness probes for a single server: ICMP echo, TCP port and SSH login.

    Every check_* method returns 1 on success and 0 on failure.  The ICMP
    code is an adapted copy of python-ping; it opens a raw socket, which
    (per the error text in do_one) requires root privileges.
    """

    # ICMP message type of an echo reply (RFC 792).
    ICMP_ECHO_REPLY = 0

    def __init__(self, server):
        # Host name or IP address of the server to probe.
        self.server = server

    def check_ssh(self):
        """
        Return 1 when a full SSH login as root succeeds, 0 otherwise.

        Raises whatever paramiko raises when the private key file cannot
        be loaded; that is a local configuration problem, not a hung host.
        """
        ssh = paramiko.SSHClient()
        key = paramiko.RSAKey.from_private_key_file("/home/pm/keys/id_rsa")
        ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        try:
            ssh.connect(self.server, username="root", pkey=key, timeout=1)
            flag = 1
        except Exception:
            # Any connect/auth failure counts as "cannot log in".
            flag = 0
        finally:
            # Release the transport even when connect() failed half-way.
            ssh.close()
        return flag

    def check_ssh_port(self, port):
        """
        Return 1 when a TCP connection to `port` on the server can be
        opened within one second, 0 otherwise.
        """
        port_test = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            port_test.settimeout(1)
            port_test.connect((self.server, port))
            flag = 1
        except Exception:
            # Refused, timed out, unresolvable name, ...: all mean "not alive".
            flag = 0
        finally:
            # The original leaked the socket whenever connect() failed.
            port_test.close()
        return flag

    def checksum(self, source_string):
        """
        Return the 16-bit Internet checksum of `source_string` (a byte
        string), as a big-endian value like in_cksum in ping.c.
        """
        # bytearray yields integers on both Python 2 and Python 3.
        data = bytearray(source_string)
        total = 0
        even_length = (len(data) // 2) * 2
        for index in range(0, even_length, 2):
            # Words are summed low byte first; the final swap compensates.
            total += data[index + 1] * 256 + data[index]
            total &= 0xffffffff

        if even_length < len(data):
            # Odd trailing byte.
            total += data[-1]
            total &= 0xffffffff

        # Fold the carries back into the low 16 bits.
        total = (total >> 16) + (total & 0xffff)
        total += total >> 16
        answer = ~total & 0xffff

        # Swap bytes to turn the low-byte-first sum into a big-endian value.
        return (answer >> 8) | ((answer << 8) & 0xff00)

    def receive_one_ping(self, my_socket, id, timeout):
        """
        Wait up to `timeout` seconds for the echo reply carrying `id`.

        Returns the round-trip time in seconds, or None on timeout.
        Packets that are too short, are not echo replies, or carry another
        id are skipped.
        """
        timestamp_size = struct.calcsize("d")
        time_left = timeout
        while True:
            started_select = time.time()
            what_ready = select.select([my_socket], [], [], time_left)
            how_long_in_select = time.time() - started_select
            if not what_ready[0]:  # Timeout
                return None

            time_received = time.time()
            received_packet, addr = my_socket.recvfrom(1024)
            # Assumes a 20-byte IP header (no IP options), as the original did.
            if len(received_packet) >= 28 + timestamp_size:
                icmp_type, code, checksum, packet_id, sequence = struct.unpack(
                    "bbHHh", received_packet[20:28]
                )
                # Require an echo reply so that other ICMP traffic with a
                # matching id field is not mistaken for an answer.
                if icmp_type == self.ICMP_ECHO_REPLY and packet_id == id:
                    time_sent = struct.unpack(
                        "d", received_packet[28:28 + timestamp_size]
                    )[0]
                    return time_received - time_sent

            time_left = time_left - how_long_in_select
            if time_left <= 0:
                return None

    def send_one_ping(self, my_socket, dest_addr, id, psize):
        """
        Send one echo request of `psize` bytes (ICMP header included) to
        `dest_addr`; the payload starts with the send time as a double.
        """
        dest_addr = socket.gethostbyname(dest_addr)

        # Remove header size from packet size
        psize = psize - 8

        # Header is type (8), code (8), checksum (16), id (16), sequence (16).
        # Build it with a zero checksum first, then again with the real one.
        header = struct.pack("bbHHh", ICMP_ECHO_REQUEST, 0, 0, id, 1)
        timestamp_size = struct.calcsize("d")
        data = struct.pack("d", time.time()) + (psize - timestamp_size) * b"Q"

        my_checksum = self.checksum(header + data)
        header = struct.pack(
            "bbHHh", ICMP_ECHO_REQUEST, 0, socket.htons(my_checksum), id, 1
        )
        # The port number is irrelevant for ICMP; 1 is a placeholder.
        my_socket.sendto(header + data, (dest_addr, 1))

    def do_one(self, timeout, psize):
        """
        Send one ping and wait for its reply.

        Returns the delay in seconds, or None on timeout.  Raises
        socket.error when the raw socket cannot be created and
        socket.gaierror when the host name cannot be resolved.
        """
        icmp = socket.getprotobyname("icmp")
        try:
            my_socket = socket.socket(socket.AF_INET, socket.SOCK_RAW, icmp)
        except socket.error as err:
            if err.errno == 1:
                # Operation not permitted
                raise socket.error(
                    "%s - Note that ICMP messages can only be sent from"
                    " processes running as root." % err.strerror
                )
            raise  # raise the original error

        try:
            # The id is derived from the host name instead of the process id,
            # so concurrent probes of different hosts within one process use
            # different ids.  NOTE(review): two hosts can still collide in
            # the 16-bit id space.
            server_bytes = self.server
            if not isinstance(server_bytes, bytes):
                server_bytes = server_bytes.encode("utf-8")
            my_id = int(hashlib.md5(server_bytes).hexdigest(), 16) & 0xFFFF
            self.send_one_ping(my_socket, self.server, my_id, psize)
            return self.receive_one_ping(my_socket, my_id, timeout)
        finally:
            # Close the raw socket even when sending or resolving failed.
            my_socket.close()

    def check_ping(self, timeout=2, maxcount=4, psize=64):
        """
        Return 1 as soon as one echo reply arrives, 0 after `maxcount`
        attempts without a reply.  Errors count as a failed attempt.
        """
        for _ in range(maxcount):
            try:
                delay = self.do_one(timeout, psize)
            except Exception:
                continue
            # Compare with None: a measured delay of 0.0 is still a reply.
            if delay is not None:
                return 1
        return 0

    def verbose_ping(self, timeout=2, count=4, psize=64):
        """
        Send `count` pings of `psize` bytes to the server with the given
        `timeout` and print each result.
        """
        for _ in range(count):
            sys.stdout.write("ping %s with ... " % self.server)
            try:
                delay = self.do_one(timeout, psize)
            except socket.gaierror as err:
                sys.stdout.write(
                    "failed. (socket error: '%s')\n" % err.args[1]
                )
                break

            if delay is None:
                sys.stdout.write(
                    "failed. (timeout within %ssec.)\n" % timeout
                )
            else:
                sys.stdout.write("get ping in %0.4fms\n" % (delay * 1000))
        sys.stdout.write("\n")

class Muti_Check:
    """
    Check a list of servers with one worker thread per server and collect
    the ones that are down (no ping reply) or hung (ping ok, port 22 dead).
    """

    def __init__(self, servers):
        # Iterable of host names; entries may carry surrounding whitespace
        # such as the trailing newline left by readlines().
        self.servers = servers
        # deque.append is safe to call from several worker threads.
        self.downlist = deque()
        self.hanglist = deque()

    def server_check(self, ser):
        """
        Probe one server and record it in downlist or hanglist when it
        fails; healthy servers are not recorded anywhere.
        """
        ser = ser.strip()
        test = CheckHang(ser)
        if test.check_ping(timeout=1) != 1:
            self.downlist.append(ser)
        elif test.check_ssh_port(22) != 1:
            # Answers ping but the SSH port is dead.  test.check_ssh() is
            # the stricter alternative that performs a real login.
            self.hanglist.append(ser)

    def get_result(self):
        """Print one "Hang:" or "Down:" line per failing server."""
        for ser in self.hanglist:
            print("Hang: %s" % ser)
        for ser in self.downlist:
            print("Down: %s" % ser)

    def multi_check(self, concurrent_max):
        """
        Run server_check for every server, keeping at most
        `concurrent_max` (minimum 1) worker threads alive at a time, and
        return once all of them have finished.  Blank entries are skipped;
        self.servers itself is not modified.
        """
        pending = [name.strip() for name in self.servers]
        pending = [name for name in pending if name]
        # The original tested "<= concurrent_max" and so ran one thread
        # more than requested.
        limit = max(1, concurrent_max)
        running = set()
        while pending or running:
            # Reap workers that have finished.
            for worker in [job for job in running if not job.is_alive()]:
                worker.join()
                running.discard(worker)
            # Top up to the limit; pop() takes servers from the end of the
            # list, like the original.
            while pending and len(running) < limit:
                worker = threading.Thread(
                    target=self.server_check, args=(pending.pop(),)
                )
                running.add(worker)
                worker.start()
            time.sleep(0.01)

# Threaded entry point: argv[1] is a file with one server name per line.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s SERVER_LIST_FILE\n" % sys.argv[0])
        sys.exit(2)
    # "with" closes the file even when reading fails.
    with open(sys.argv[1]) as fd:
        servers = fd.readlines()
    cluster = Muti_Check(servers)
    cluster.multi_check(20)
    cluster.get_result()

# Sequential variant without threads.  The guard never matches as written;
# rename "__main2__" to "__main__" (and disable the block above) to use it.
if __name__ == "__main2__":
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s SERVER_LIST_FILE\n" % sys.argv[0])
        sys.exit(2)
    with open(sys.argv[1]) as fd:
        servers = fd.readlines()
    hang_list = set()
    down_list = set()
    for ser in servers:
        ser = ser.strip()
        test = CheckHang(ser)
        if test.check_ping(timeout=1) != 1:
            down_list.add(ser)
        elif test.check_ssh_port(22) != 1:
            # test.check_ssh() is the stricter alternative (real login).
            hang_list.add(ser)
    for ser in hang_list:
        print("Hang: %s" % ser)
    for ser in down_list:
        print("Down: %s" % ser)
    if not (hang_list | down_list):
        print("all %d server ok" % (len(servers)))

输入的列表就是机器名,使用多线程进行检测,线程数可以通过multi_check的参数传入。如果机器上不想用多线程的版本,那就用后面的那个直接简单的轮询。

另外说一下对hang的模拟,iptables放行icmp但是把22端口封掉,对于宕机的场景就是把这个检查服务器的IP直接封掉就OK。

发表在 OS, python | 服务器hang的检查已关闭评论

openwrt设置vlan tag

前面试过openwrt下把个别的端口单独划分到独立的vlan,使得连到这个端口的机器的IP是单独一个vlan的IP。今天测试了一下单个端口直接绑定到多个vlan,客户端机器自己使用vlan id来标记。


config interface 'loopback'
    option ifname 'lo'
    option proto 'static'
    option ipaddr '127.0.0.1'
    option netmask '255.0.0.0'

config interface 'lan'
    option ifname 'eth1.0'
    option type 'bridge'
    option proto 'static'
    option ipaddr '192.168.1.1'
    option netmask '255.255.255.0'
config interface 'lan1'
    option ifname 'eth1.1'
    option type 'bridge'
    option proto 'static'
    option ipaddr '192.168.100.1'
    option netmask '255.255.255.0'
config interface 'lan2'              
    option ifname 'eth1.2'      
    option type 'bridge'          
    option proto 'static'        
    option ipaddr '192.168.200.1'
    option netmask '255.255.255.0'
config interface 'wan'
    option ifname 'eth0'
    option proto 'dhcp'

config interface 'wwan'
    option proto 'dhcp'
config switch eth1
    option reset 1
    option enable_vlan 1
config switch_vlan
    option device eth1
    option vlan 0
    option ports '0 1 2t 5*'

config switch_vlan
    option device eth1
    option vlan 1
    option ports '2t 5*'

config switch_vlan
    option device eth1
    option vlan 2
    option ports '2t 5*'

以上就是把db120的LAN3口(port 2)除了放在默认的vlan 0内,还加到vlan1和vlan2里面(端口写成2t,即带tag),vlan1使用192.168.100.0/24的段,而vlan2使用192.168.200.0/24的段。
dnsmasq加上对应的dhcp配置


config dnsmasq
    option domainneeded 1
    option boguspriv    1
    option filterwin2k  0  # enable for dial on demand
    option localise_queries 1
    option rebind_protection 1  # disable if upstream must serve RFC1918 addresses
    option rebind_localhost 1  # enable for RBL checking and similar services
    #list rebind_domain example.lan  # whitelist RFC1918 responses for domains
    option local    '/lan/'
    option domain   'lan'
    option expandhosts  1
    option nonegcache   0
    option authoritative    1
    option readethers   1
    option leasefile    '/tmp/dhcp.leases'
    option resolvfile   '/tmp/resolv.conf.auto'
    #list server        '/mycompany.local/1.2.3.4'
    #option nonwildcard 1
    #list interface     br-lan
    #list notinterface  lo
    #list bogusnxdomain     '64.94.110.11'

config dhcp lan
    option interface    lan
    option start    100
    option limit    150
    option leasetime    12h
config dhcp lan1
        option interface        lan1
        option start    10
        option limit    20
        option leasetime        12h
config dhcp lan2                    
        option interface        lan2
        option start    20        
        option limit    30                
        option leasetime        12h                                
config dhcp wan
    option interface    wan
    option ignore   1

别忘记了修改lan1和lan2对应防火墙规则


config forwarding
    option src 'lan'
    option dest 'wan'
config forwarding                
        option src 'lan1'            
        option dest 'wan'
config forwarding                
        option src 'lan2'            
        option dest 'wan'
config zone
    option name 'lan'
    option network 'lan'
    option input 'ACCEPT'
    option output 'ACCEPT'
    option forward 'REJECT'
config zone
        option name 'lan1'
        option network 'lan1'
        option input 'ACCEPT'
        option output 'ACCEPT'
        option forward 'REJECT'
config zone
        option name 'lan2'
        option network 'lan2'
        option input 'ACCEPT'
        option output 'ACCEPT'
        option forward 'REJECT'

配置修改好后重启路由器或者是把network,dnsmasq,firewall都重启了。

然后自己电脑连上LAN3口后,可以修改/etc/network/interfaces带上vlan tag,此时获取到的IP是 192.168.100.x
auto eth0.1
iface eth0.1 inet dhcp
vlan-raw-device eth0
这样就带vlan tag1,同理可以改为带vlan tag2.此时获取到的IP是 192.168.200.x
auto eth0.2
iface eth0.2 inet dhcp
vlan-raw-device eth0
如果恢复默认的配置,则获取到的IP是 192.168.1.x。

参考:
https://wiki.debian.org/NetworkConfiguration
http://www.lanis.nl/twiki/bin/view/Main/CreatingVLANsInOpenWRT
http://wiki.openwrt.org/doc/uci/network/switch

发表在 System | openwrt设置vlan tag已关闭评论

虚拟机上LVM测试

LVM可以配合RAID方便灵活调整分区的大小。虚拟机简单试了下还是很方便
1. 创建一个RAID5
mdadm --create /dev/md1 --run --level=5 -n 3 /dev/sdc /dev/sdb /dev/sda

2.创建LVM
2.1 初始物理卷
create pv
pvcreate /dev/md1
pvdisplay /dev/md1

2.2 创建卷组
vgcreate raidlvm /dev/md1
vgchange -a y raidlvm
vgdisplay raidlvm
2.3 创建逻辑卷
lvcreate -L1G -nusrfs raidlvm
lvcreate -L1000M -nrootfs raidlvm
lvdisplay /dev/raidlvm/rootfs
lvdisplay /dev/raidlvm/usrfs
mkfs.ext4 /dev/raidlvm/usrfs
2.4 调整卷大小
umount /dev/mapper/raidlvm-rootfs
lvextend -L+1G /dev/mapper/raidlvm-rootfs
e2fsck -f /dev/mapper/raidlvm-rootfs
resize2fs /dev/mapper/raidlvm-rootfs
mount -a

主要参考:http://www.tldp.org/HOWTO/LVM-HOWTO/extendlv.html

发表在 fs | 虚拟机上LVM测试已关闭评论

LVS DR模式到TUN模式的平滑迁移。

一般情况下大家在使用LVS的时候都很喜欢直接用DR模式,觉得DR模式的效率是最高。不过实际上DR模式在很多时候给我们带来的约束也非常大,最明显的莫过于LVS机器需要和RS机器能有一张网卡共处在一个vlan下。机房环境比较复杂的时候还用DR模式经常会受到各种的约束,比如同一个VLAN的IP都被用光了、同一个交换机下机柜没有空位了,etc。所以实际上我们也经常使用TUN模式。最近遇到一个之前使用DR模式,现在不能扩容的情况,上午就尝试在测试环境测试了一下DR模式到TUN模式的切换,整体影响和LVS主备切换的时候差不多,影响可控。
实际的背景是现在LVS1和LVS2做互备给DNS1、DNS2做负载均衡。但是因为找不到机位能和LVS机器挂同一个VLAN下,所以我现在需要把LVS的模式修改为TUN模式,以便对RS直接扩容。
机器列表:


LVS:
192.168.100.16 LVS1-slave
192.168.100.17 LVS2-slave
VIP:192.168.100.8
DNS Server:
192.168.100.18 DNS1
192.168.100.22 DNS2
192.168.100.38 DNS3(NEW)
192.168.128.29 DNS4(NEW)

keepalived原来的配置文件:


vrrp_instance dns {
    !state MASTER
    state BACKUP
    interface bond0
    lvs_sync_daemon_interface bond0
    virtual_router_id 51
    priority 99
    advert_int 1
    nopreempt
    garp_master_delay 1
    authentication {
        auth_type PASS
        auth_pass 1111
    }
    virtual_ipaddress {
        192.168.100.8/22 dev bond0 label bond0:1
        #配置的时候注意掩码不要写错了
    }
}

virtual_server 192.168.100.8 53 {
    delay_loop 30
    lb_algo rr
    lb_kind DR
    ha_suspend
    persistence_timeout 0
    protocol TCP
    real_server 192.168.100.18 53 {
        weight 100
          TCP_CHECK {
                        connect_port    53
                        connect_timeout 3
                        nb_get_retry 3
                        delay_before_retry 10
                }

        }
    real_server 192.168.100.22 53 {
        weight 100
                  TCP_CHECK {
                        connect_port    53
                        connect_timeout 3
                        nb_get_retry 3
                        delay_before_retry 10
                }
    }

}
virtual_server 192.168.100.8 53 {
    delay_loop 30
    lb_algo rr
    lb_kind DR
    ha_suspend
    persistence_timeout 0
    omega
    protocol UDP
    real_server 192.168.100.18 53 {
        weight 100
          TCP_CHECK {
                        connect_port    53
                        connect_timeout 3
                        nb_get_retry 3
                        delay_before_retry 10
                }

        }
    real_server 192.168.100.22 53 {
        weight 100
                  TCP_CHECK {
                        connect_port    53
                        connect_timeout 3
                        nb_get_retry 3
                        delay_before_retry 10
                }
    }

}

基本的步骤:
1. 在RS服务器上把tun设备加上,直接绑定上VIP。有报错提示IP加重复了也没有关系。


#!/bin/bash
# Real-server side of an LVS TUN setup: bind the virtual IP(s) to the ipip
# tunnel device tunl0 and adjust the ARP / rp_filter sysctls.
# Usage: $0 {start|stop}

# One or more virtual IPs, separated by spaces.
VIP='192.168.100.8'

case "$1" in
start)
    # Reload the ipip module so that tunl0 starts from a clean state.
    modprobe -r ipip
    modprobe ipip
    ip link set tunl0 up
    ip link set tunl0 arp off

    # Counter for the tunl0:N labels; set explicitly so that a value
    # inherited from the environment cannot leak in.
    NO=0
    # $VIP is left unquoted on purpose: word splitting yields one IP per turn.
    for IP in $VIP
    do
        NO=$((NO+1))
        ip addr add "$IP/32" br "$IP" label "tunl0:$NO" dev tunl0
        ip route add "$IP/32" dev tunl0
    done

    # ARP settings intended to keep the real server from answering ARP for
    # the VIP; rp_filter is switched off on tunl0.
    # NOTE(review): exact semantics are in the kernel ip-sysctl docs - confirm.
    echo 1 > /proc/sys/net/ipv4/conf/tunl0/arp_ignore
    echo 2 > /proc/sys/net/ipv4/conf/tunl0/arp_announce
    echo 0 > /proc/sys/net/ipv4/conf/tunl0/rp_filter
    echo 1 > /proc/sys/net/ipv4/conf/all/arp_ignore
    echo 2 > /proc/sys/net/ipv4/conf/all/arp_announce
    ;;
stop)
    # Unloading ipip presumably removes tunl0 together with its addresses.
    modprobe -r ipip
    ;;
*)
    # "status" used to be advertised here but was never implemented.
    echo "$0: Usage: $0 {start|stop}"
    exit 1
    ;;
esac
  2. 先把目前做backup状态的LVS配置文件修改掉,改成使用TUN模式。重启好keepalived后把主的服务器停掉,观察主备切换后是否正常。不出问题就可以继续把停掉的这台的配置文件也修改好重启。

  3. 向keepalived里把其他要扩容的RS加入,分别重启


vrrp_instance test {
    !state MASTER
    state BACKUP
    interface bond0
    lvs_sync_daemon_interface bond0
    virtual_router_id 51
    priority 99
    advert_int 1
    nopreempt
    garp_master_delay 1
    authentication {
        auth_type PASS
        auth_pass 1111
    }
    virtual_ipaddress {
        192.168.100.8/22 dev bond0 label bond0:1
    }
}

virtual_server 192.168.100.8 53 {
    delay_loop 30
    lb_algo rr
    #lb_kind DR
    lb_kind TUN
    ha_suspend
    persistence_timeout 0
    protocol TCP
    real_server 192.168.100.18 53 {
        weight 100
          TCP_CHECK {
                        connect_port    53
                        connect_timeout 3
                        nb_get_retry 3
                        delay_before_retry 10
                }

        }
    real_server 192.168.100.22 53 {
        weight 100
                  TCP_CHECK {
                        connect_port    53
                        connect_timeout 3
                        nb_get_retry 3
                        delay_before_retry 10
                }
    }
    real_server 192.168.100.38 53 {
        weight 100
                  TCP_CHECK {
                        connect_port    53
                        connect_timeout 3
                        nb_get_retry 3
                        delay_before_retry 10
                }
    }

    real_server 192.168.128.29 53 {
        weight 100
                  TCP_CHECK {
                        connect_port    53
                        connect_timeout 3
                        nb_get_retry 3
                        delay_before_retry 10
                }
    }

}
virtual_server 192.168.100.8 53 {
    delay_loop 30
    lb_algo rr
    lb_kind TUN
    #lb_kind DR
    ha_suspend
    persistence_timeout 0
    omega
    protocol UDP
    real_server 192.168.100.18 53 {
        weight 100
          TCP_CHECK {
                        connect_port    53
                        connect_timeout 3
                        nb_get_retry 3
                        delay_before_retry 10
                }

        }
    real_server 192.168.100.22 53 {
        weight 100
                  TCP_CHECK {
                        connect_port    53
                        connect_timeout 3
                        nb_get_retry 3
                        delay_before_retry 10
                }
    }
    real_server 192.168.100.38 53 {
        weight 100
                  TCP_CHECK {
                        connect_port    53
                        connect_timeout 3
                        nb_get_retry 3
                        delay_before_retry 10
                }
    }

    real_server 192.168.128.29  53 {
        weight 100
                  TCP_CHECK {
                        connect_port    53
                        connect_timeout 3
                        nb_get_retry 3
                        delay_before_retry 10
                }
    }

}

IP Virtual Server version 1.2.1 (size=4096)
Prot LocalAddress:Port Scheduler Flags
  -> RemoteAddress:Port           Forward Weight ActiveConn InActConn
UDP  192.168.100.8:53 rr
  -> 192.168.128.29:53             Tunnel  100    0          223      
  -> 192.168.100.38:53             Tunnel  100    0          227      
  -> 192.168.100.22:53             Tunnel  100    0          224      
  -> 192.168.100.18:53             Tunnel  100    0          220      
TCP  192.168.100.8:53 rr
  -> 192.168.128.29:53             Tunnel  100    0          0        
  -> 192.168.100.38:53             Tunnel  100    0          0        
  -> 192.168.100.22:53             Tunnel  100    0          0        
  -> 192.168.100.18:53             Tunnel  100    0          0
发表在 lvs | LVS DR模式到TUN模式的平滑迁移。已关闭评论

openwrt划分多个vlan

原本是想把db120的eth1绑定的三个端口划分为3个vlan,每个vlan也单独配置一个SSID。结果比较悲剧的是b43的驱动不支持多个SSID。简单讲一下如何划分多个vlan。对于DB120来说,靠近电源的网口其实是eth0,原版的固件是把这个网卡和eth1桥接了,使得有4个lan口。刷了openwrt后LAN4就变成了WAN口(eth0),LAN1-3是eth1。可以单独配置一下把LAN3划到一个单独的VLAN


config interface 'loopback'
    option ifname 'lo'
    option proto 'static'
    option ipaddr '127.0.0.1'
    option netmask '255.0.0.0'

config interface 'lan'
    option ifname 'eth1.0'
    option type 'bridge'
    option proto 'static'
    option netmask '255.255.255.0'
    option ipaddr '10.10.10.1'

config interface 'lan2'
    option ifname 'eth1.1'
    option type 'bridge'
    option proto 'static'
    option netmask '255.255.255.0'
    option ipaddr '10.10.20.1'

config interface wan
    option ifname eth0
    option proto dhcp
config interface 'wwan'
    option _orig_ifname 'wlan0'
    option _orig_bridge 'false'
    option proto 'static'
    option ipaddr '192.168.3.2'
    option netmask '255.255.255.0'
    option gateway '192.168.3.1'
    option dns '192.168.3.1'
config 'switch' 'eth1'
    option 'reset' '1'
    option 'enable_vlan' '1'
config 'switch_vlan'
    option 'device' 'eth1'
    option 'vlan' '0'
    option 'ports' '0 1 5*'
config 'switch_vlan'            
    option 'device' 'eth1'          
    option 'vlan' '1'    
    option 'ports' '2 5*'

port 5是CPU,port 0-2分别对应的LAN1-3,以上就是把LAN3划到单独的VLAN,IP段是10.10.20.1/24,而LAN1-2的IP段是10.10.10.1/24。另外需要配置一下dhcp加一段


config dhcp 'lan'
        option interface 'lan2'
        option limit '100'
        option leasetime '12h'
        option start '10'

重启一下这样当我们网线插入到LAN3口的时候就能顺利获取到一个10.10.20.1/24段的IP了。另外防火墙的配置也需要单独加一点


#add for lan 2
config forwarding
    option src lan2
    option dest wan

config zone                          
        option name 'lan2'            
        option network 'lan2'        
        option input 'ACCEPT'        
        option output 'ACCEPT'        
        option forward 'REJECT'

DB120本来是个很好的折腾的路由器,BCM的CFE很方便刷机,只要不自己刷带了CFE的固件把CFE搞坏都是可以随便刷机的,撇开BCM6358的性能和发热量来说,b43驱动确实弱了点,不支持AP+STA,不支持多SSID,etc…

参考:
1. http://wiki.openwrt.org/doc/uci/network/switch

发表在 OpenWrt | openwrt划分多个vlan已关闭评论

linux做软RAID10

最近拿到的几个12块盘的服务器,都是没有RAID卡的。自己就简单做了一下软RAID。

  1. 用/dev/sd[b-l]创建一个raid10的/dev/md1,其中一个做备用盘。

mdadm --create /dev/md1 --run --level=10 -n 10  /dev/sd[b-l] -x 1
mkdir /mnt/data  ; mkfs.ext4 /dev/md1

2.把md1加到fstab
/dev/md1 /mnt/data ext4 defaults,noatime 0 0
3.把阵列信息加到/etc/mdadm.conf
mdadm --detail --scan >> /etc/mdadm.conf

状态查看可以


#cat /proc/mdstat
Personalities : [raid10]
md1 : active raid10 sdl[10](S) sdk[9] sdj[8] sdi[7] sdh[6] sdg[5] sdf[4] sde[3] sdd[2] sdc[1] sdb[0]
      9767564800 blocks super 1.2 512K chunks 2 near-copies [10/10] [UUUUUUUUUU]

unused devices: <none>

#mdadm --detail /dev/md1
/dev/md1:
        Version : 1.2
  Creation Time : Sat Jul 20 00:33:49 2013
     Raid Level : raid10
     Array Size : 9767564800 (9315.08 GiB 10001.99 GB)
  Used Dev Size : 1953512960 (1863.02 GiB 2000.40 GB)
   Raid Devices : 10
  Total Devices : 11
    Persistence : Superblock is persistent

    Update Time : Sat Jul 20 08:33:55 2013
          State : active
 Active Devices : 10
Working Devices : 11
 Failed Devices : 0
  Spare Devices : 1

         Layout : near=2
     Chunk Size : 512K

           Name : xxx
           UUID : xxx
         Events : 18

    Number   Major   Minor   RaidDevice State
       0       8       16        0      active sync   /dev/sdb
       1       8       32        1      active sync   /dev/sdc
       2       8       48        2      active sync   /dev/sdd
       3       8       64        3      active sync   /dev/sde
       4       8       80        4      active sync   /dev/sdf
       5       8       96        5      active sync   /dev/sdg
       6       8      112        6      active sync   /dev/sdh
       7       8      128        7      active sync   /dev/sdi
       8       8      144        8      active sync   /dev/sdj
       9       8      160        9      active sync   /dev/sdk

      10       8      176        -      spare   /dev/sdl

单个盘的操作
1.卸掉故障盘
mdadm /dev/md1 -f /dev/sdd
mdadm /dev/md1 -r /dev/sdd
2.恢复单个盘
mdadm --zero-superblock /dev/sdd
mdadm /dev/md1 -a /dev/sdd
3.停掉整个RAID
mdadm --stop /dev/md1
4.启动整个RAID
mdadm -A /dev/md1

简单测试了一下RAID的性能和单个盘的性能差别


RAID10:
#dd if=/dev/zero of=test1 bs=1024000 count=10240 oflag=direct ;sleep 5; dd if=test1 of=/dev/null bs=1024000 count=10240 iflag=direct
10240+0 records in
10240+0 records out
10485760000 bytes (10 GB) copied, 20.4646 s, 512 MB/s
10240+0 records in
10240+0 records out
10485760000 bytes (10 GB) copied, 15.0176 s, 698 MB/s



单个盘:
#dd if=/dev/zero of=test1 bs=1024000 count=10240 oflag=direct ;sleep 5; dd if=test1 of=/dev/null bs=1024000 count=10240 iflag=direct
10240+0 records in
10240+0 records out
10485760000 bytes (10 GB) copied, 62.761 s, 167 MB/s
10240+0 records in
10240+0 records out
10485760000 bytes (10 GB) copied, 61.865 s, 169 MB/s

做了RAID后读取基本是把SAS卡的6G的带宽跑满了,写入也有三倍左右的提升。

发表在 OS | 留下评论

搭建私有的apt源

内部搞了个apt源,简单记录一下,主要参考了:http://www.debian-administration.org/articles/286

1. 新建立目录结构
mkdir test/{conf,dists,incoming,indices,logs,pool,project,tmp}

2.新建配置文件

2.1 conf/distributions
Origin: GNUer
Label: GNUer’s repository
Codename: testing
Architectures: amd64
Components: main
Description: Description of repository you are creating
2.2 conf/options
verbose
ask-passphrase
basedir .

3.包的维护

导入包需要使用reprepro

reprepro -b /home/mirrors/test/ includedeb testing /home/test_deb/dropbox_1.6.0_amd64.deb
删除包
reprepro -v -b /home/mirrors/test/ remove testing dropbox

4. apt源的配置
新增
deb http://192.168.2.2/test testing main

发表在 System | 留下评论

二分法查找指定时间的日志

二分法是很基础的一个查询方法。试想一个场景,应用的访问量非常大,单天的日志单个文件上100G,要准实时地统计出TPM的大小。没有什么storm之类的高级玩意,就自己写脚本进行统计的话其实不太好搞。这个时候可以试试每次用二分法找出上一分钟的日志所在的偏移量,然后再顺序读入日志进行处理,可以比较高效地跳过大量的日志。python简单写了个
[python]
#!/usr/bin/env python
import re
import datetime
import sys
class logtools:
    """
    Count the log lines written during the previous minute (TPM).

    The log must be ordered by time and carry an "HH:MM:SS" timestamp that
    is preceded by whitespace.  A binary search over file offsets skips
    the bulk of a large file before lines are counted sequentially.
    """

    # Timestamp pattern; bytes, because the log is read in binary mode.
    _TIME_RE = re.compile(br"\s+(?P<hour>\d+):(?P<min>\d+):(?P<sec>\d+)")

    def __init__(self, filename="/xx/acess.log"):
        """Open `filename`; print a message and exit with status 1 on failure."""
        self.logname = filename
        try:
            # Binary mode keeps tell()/seek() offsets exact and allows
            # arbitrary byte offsets on both Python 2 and Python 3.
            self.fd = open(filename, "rb")
        except IOError:
            print("open log failed")
            sys.exit(1)

    def __del__(self):
        # fd is missing when open() failed in __init__; nothing to close.
        fd = getattr(self, "fd", None)
        if fd is None:
            return
        try:
            fd.close()
        except Exception:
            print("close fd failed")

    def _to_epoch(self, moment):
        """Convert a naive local datetime to integer epoch seconds."""
        # Portable replacement for the glibc-only strftime("%s").
        import time
        return int(time.mktime(moment.timetuple()))

    def get_last_min(self):
        """
        Return (epoch of now - 2 minutes, regex text matching the
        timestamps of the previous minute up to the seconds, "HH:MM" label
        of the previous minute).
        """
        # One clock reading, so that all three values describe the same instant.
        now = datetime.datetime.now()
        last = now - datetime.timedelta(minutes=2)
        qps_time = now - datetime.timedelta(minutes=1)
        t = r"\s+" + qps_time.strftime("%H:%M:")
        t2 = qps_time.strftime("%H:%M")
        return (self._to_epoch(last), t, t2)

    def get_current_min(self):
        """
        Read forward from the current offset to the next line that carries
        a valid timestamp and return it as epoch seconds (using today's
        date).  Returns None when the end of the file is reached first.
        """
        now = datetime.datetime.now()
        while True:
            line = self.fd.readline()
            if not line:
                return None
            match = self._TIME_RE.search(line)
            if not match:
                continue
            try:
                match_time = datetime.datetime(
                    year=now.year, month=now.month, day=now.day,
                    hour=int(match.group("hour")),
                    minute=int(match.group("min")),
                    second=int(match.group("sec")),
                )
            except ValueError:
                # Looked like a time but is out of range (e.g. 99:99:99).
                continue
            return self._to_epoch(match_time)

    def get_last_seek(self, last_time):
        """
        Return a file offset from which a sequential scan is guaranteed to
        meet every line stamped later than `last_time` (epoch seconds).

        The result is a lower bound found by bisecting byte offsets; it
        stops once the window is a few lines or about three seconds wide.
        """
        self.fd.seek(0, 0)
        start_seek = 0
        start_time = self.get_current_min()
        if start_time is None:
            # No timestamp in the whole file: nothing to count.
            self.fd.seek(0, 2)
            return self.fd.tell()

        first_end = self.fd.tell()
        # Twice the length of the following line approximates "two lines";
        # fall back to the first line's end when there is no second line.
        pos_off = len(self.fd.readline()) * 2 or first_end
        self.fd.seek(0, 2)
        end_seek = self.fd.tell()
        # Clamp so that short files do not seek before the beginning.
        tail_seek = max(end_seek - pos_off, 0)
        self.fd.seek(tail_seek, 0)
        end_time = self.get_current_min()
        if end_time is None:
            # The tail carries no timestamp; scanning from the start is
            # slow but cannot miss anything.
            return start_seek

        if last_time < start_time:
            # The whole log is newer than last_time: scan all of it.
            print("error last-time <start-time")
            return start_seek
        elif last_time > end_time:
            # Only the final lines after tail_seek can still be newer.
            print("error %d > %d" % (last_time, end_time))
            return tail_seek

        while end_seek - start_seek > 2 * pos_off and end_time - start_time > 3:
            half_seek = (end_seek + start_seek) // 2
            self.fd.seek(half_seek, 0)
            half_time = self.get_current_min()
            if half_time is None or last_time <= half_time:
                end_seek = half_seek
                if half_time is not None:
                    end_time = half_time
            else:
                start_seek = half_seek
                start_time = half_time
        # start_seek only ever moves to offsets whose next timestamp is
        # older than last_time, so it never overshoots the wanted lines.
        return start_seek

    def get_tpm(self):
        """Print "<HH:MM> qps <count>" for the previous minute."""
        time_pre, minute_regex, reg_time = self.get_last_min()
        regex = re.compile((minute_regex + r"\d{2}").encode("ascii"))
        pos = self.get_last_seek(time_pre)
        self.fd.seek(pos, 0)
        query = 0
        while True:
            line = self.fd.readline()
            if not line:
                break
            if regex.search(line):
                query += 1
        print("%s qps %d" % (str(reg_time), query))
# Script entry point: argv[1] is the log file to analyse.  The guard keeps
# the module importable without side effects.
if __name__ == "__main__":
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s LOGFILE\n" % sys.argv[0])
        sys.exit(2)
    a = logtools(filename=sys.argv[1])
    a.get_tpm()

[/python]

发表在 python | 留下评论

策略路由的配置

最近测试DNS服务器直接和交换机跑OSPF。2上联网卡分别接入2交换机,形成邻居。服务器不设置静态的默认路由,通过和上层路由器交换路由信息的时候学习默认路由。
另外的办公网接入的网卡只是绑定了IP。因为是在测试环境所以有个问题是上联的链路其实是不能访问外网的。我就单独设置了一下策略路由解决。需要达到的目的其实只是能从办公网络ssh登陆服务器,服务器上能访问部分外网(比如8.8.8.8进行DNS解析)。
配置其实比较简单:
1. 先新增策略路由表
[text]
#cat /etc/iproute2/rt_tables
#
# reserved values
#
255 local
254 main
253 default
0 unspec
#
# local
200 dns
[/text]

2.给table 200增加默认的路由
[text]
#cat route-eth0
table dns 192.1.159.0/24 via 192.1.159.254 dev eth0
table dns default via 192.1.159.254 dev eth0
[/text]
等同于
[bash]
ip route add 192.1.159.0/24 via 192.1.159.254 dev eth0 table dns
ip route add default via 192.1.159.254 dev eth0 table dns
[/bash]
3.增加策略路由规则
[text]
#cat rule-eth0
from 192.1.159.210 table dns
to 8.8.8.8 table dns
to 192.242.252.0/24 table dns
[/text]
等同于
[bash]
ip rule add to 8.8.8.8 table dns
ip rule add from 192.1.159.210 table dns
ip rule add to 192.242.252.0/24 table dns
[/bash]
备注:
本机IP:192.1.159.210
本机的办公网入口的网关:192.1.159.254
希望能走eth0访问的地址: 192.242.252.0/24, 8.8.8.8

发表在 net, System | 留下评论

gnome-shell下开机亮度的调节

gnome3下现在屏幕的默认亮度不能保存,网上搜了一圈简单的方式是在rc.local里加


echo 70 > /sys/class/backlight/acpi_video0/brightness
发表在 System | 留下评论