##################################################
#qq:316118740
#BLOG:http://hi.baidu.com/alalmn
# Purpose: extract the link addresses from a web page and check whether each one is an HTTP address
# Written by a beginner -- please excuse any rough edges
##################################################
def URL_STR(data):
    """Report whether *data* lacks a leading ``http://`` / ``https://`` scheme.

    Note the inverted sense of the result: a truthy value means the string
    is NOT an HTTP(S) address.

    Args:
        data: The address string to inspect.

    Returns:
        0 if ``data`` starts with ``http://`` or ``https://``, otherwise 1.
    """
    # The original tested ``data.find(a) and data.find(b)``, which only works
    # because find() returns 0 (falsy) for a match at index 0 and -1 or a
    # positive offset (both truthy) otherwise. startswith() states that
    # intent directly and yields the same results.
    if data.startswith(('http://', 'https://')):
        return 0
    return 1
##################################################
import urllib2, re
def URL_DZ(URL):
    """Fetch the page at *URL* and print the href of each anchor tag found.

    The downloaded page is scanned with regular expressions for
    ``<a ... href=...>...</a>`` tags (the patterns do not span line breaks).
    For each tag the first double-quoted ``href`` value is printed; values
    for which URL_STR() returns a truthy result (no leading ``http://`` or
    ``https://``) are printed followed by a warning text. Once more than
    1000 anchors have been handled a notice is printed and scanning stops.

    Args:
        URL: Address of the page to download; passed to urllib2.urlopen().

    Returns:
        None. All results are written to standard output.

    Raises:
        Nothing is caught here: any exception from urllib2.urlopen() or
        from reading the response propagates to the caller.
    """
    anchor_re = re.compile(r'<a.+?href=.+?>.+?</a>')
    href_re = re.compile(r'(?<=href\=\").*?(?=\")')

    response = urllib2.urlopen(URL)
    try:
        page = response.read()
    finally:
        # The original never closed the response; release the connection
        # even if read() fails.
        response.close()

    # NOTE(review): the two message literals below are reproduced exactly as
    # they appear in this file. They look mis-encoded (probably GBK text read
    # as UTF-8) -- confirm the intended wording before changing them.
    # The single-argument print(...) form behaves the same as the Python 2
    # print statement used by the rest of this file.
    handled = 0
    for anchor in anchor_re.findall(page):
        if handled > 1000:
            print("����1000��URL��ַ������������\n")
            break
        handled += 1

        hrefs = href_re.findall(anchor)
        if not hrefs:
            # No double-quoted href in this tag (e.g. single-quoted or
            # unquoted value): nothing to classify, so skip it instead of
            # handing an empty list to URL_STR().
            continue

        href = hrefs[0]
        if URL_STR(href):
            print("%s %s" % (href, "û���ҵ��Ƿ�HTTP��ַ"))
        else:
            print(href)
##################################################
# Module-level call: scans http://www.baidu.com every time this file is
# executed or imported (there is no __main__ guard).
URL_DZ("http://www.baidu.com")
# Tags: regular expressions, extracting web page addresses















