hax01tips
Note: this post will very likely be made private later.
The challenge:
hax01
Your mission is the following: Simply enter a URL into the box. The domain of the URL must be or end with 'nasa.gov'. The URL will be fetched right away. The content returned should contain the string: "2200178118" in the first 10 Kbytes of data. 404/403/etc error pages are not accepted. Remember, do not do anything illegal. Make sure you type the right URL, do not guess.
Hint: google is your friend.
http://google.com/search?q=site:nasa.gov
My idea at the time was to enumerate every domain ending in nasa.gov and then walk through those sites. I actually wrote a Python program for it: it pulled the first 1000 pages of Google results, extracted the domains, and saved them; after de-duplication a little over 500 remained. It then fetched each site's HTML and checked for the string. Along the way I hit a network-speed problem: whenever a request timed out, I often got redirected to China Telecom's 114 search page instead. After verifying 170-odd pages I realized I had misread the problem: it is not the URL that must end in nasa.gov, only the URL's domain. I was speechless; that means every page under those domains is a potential answer, and the workload is so enormous it is practically impossible. I shelved it for the time being.
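Distilled, the acceptance test every candidate URL has to pass is small. A minimal Python 3 sketch (the token and the 10 KB window come straight from the problem statement; the timeout value is my own guess):

import urllib.request

def passes(url, token='2200178118'):
    # urlopen raises HTTPError on 404/403, so error pages are rejected for free.
    with urllib.request.urlopen(url, timeout=10) as resp:
        head = resp.read(10240)  # inspect only the first 10 Kbytes, per the rules
    return token in head.decode('utf-8', errors='replace')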
My original code follows for reference; it was revised n times and probably no longer runs as-is.
Python 2.5 version:
from __future__ import with_statement  # 'with' needs this on Python 2.5
from urllib import FancyURLopener
import urllib2
import re
"""
# Earlier attempt, kept for reference: scrape Google for *.nasa.gov domains.
class MyOpener(FancyURLopener):
    # Spoof a browser User-Agent so Google serves normal result pages.
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

res = re.compile(r'(([a-zA-Z]+\.)+nasa\.gov)')  # subdomains of nasa.gov

myopener = MyOpener()
base = 'http://www.google.co.jp/search?&num=100&as_qdr=all&as_occt=any&as_sitesearch=nasa.gov'
li = []
for i in range(0, 10):
    url = base + '&start=' + str(i * 100)  # page through results, 100 at a time
    page = myopener.open(url)
    str1 = page.read()
    for aItem in res.findall(str1):
        if aItem[0] not in li:  # de-duplicate the domains
            li.append(aItem[0])
"""
with open('nasa.txt') as li:  # one http:// URL per line
    m = 0
    for a in li:
        m = m + 1
        print m  # progress counter
        url = a.strip()  # drop the trailing newline
        req = urllib2.Request(url)
        try:
            response = urllib2.urlopen(req)
            the_page = response.read()
            with open(url[7:] + '.txt', 'w') as nasa:  # dump page, named after the domain
                nasa.write(the_page)
            if the_page.find(r'daohang.118114.cn') != -1:
                print '114'  # the ISP hijacked a timed-out request to its 114 search page
            elif the_page.find('2200178118', 0, 10240) != -1:
                print url  # token found in the first 10 KB: a candidate answer
            else:
                print ''
        except urllib2.HTTPError, e:
            print e.code  # HTTP-level error page (404/403/...)
        except urllib2.URLError, e:
            print e.reason

"""
#gUrl = 'http://www.google.co.jp/search?hl=ja&source=hp&q=site%3Anasa.gov&lr=&aq=f&oq='
#google = urllib.urlopen(gUrl)
#str = google.read()
for str in open('sitenasa_gov.htm'):  # scan a saved results page instead
    for aItem in res.findall(str):
        print aItem[0]

#print str
str = 'www.xxx.nasa.gov/wwf.nasa.gov'
"""

#2200178118

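As written, the script expects a nasa.txt next to it with one http:// URL per line; the url[7:] slice that names the dump files assumes exactly that scheme prefix.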
Python 3.1 version:
from urllib.request import FancyURLopener
import urllib.request
import urllib.error
import re
import sys
"""
# Earlier attempt, kept for reference: scrape Google for *.nasa.gov domains.
class MyOpener(FancyURLopener):
    # Spoof a browser User-Agent so Google serves normal result pages.
    version = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11'

res = re.compile(r'(([a-zA-Z]+\.)+nasa\.gov)')  # subdomains of nasa.gov

myopener = MyOpener()
base = 'http://www.google.co.jp/search?&num=100&as_qdr=all&as_occt=any&as_sitesearch=nasa.gov'
li = []
for i in range(0, 10):
    url = base + '&start=' + str(i * 100)  # page through results, 100 at a time
    page = myopener.open(url)
    str1 = page.read()
    for aItem in res.findall(str1):
        if aItem[0] not in li:  # de-duplicate the domains
            li.append(aItem[0])
"""
fiPath = sys.argv[1]  # the domain list is now passed on the command line
with open(fiPath) as li:
    m = 0
    for a in li:
        m = m + 1
        url = a.strip()  # drop the trailing newline
        req = urllib.request.Request(url)
        try:
            response = urllib.request.urlopen(req)
            the_page = response.read()
            with open(url[7:] + '.txt', 'wb') as nasa:  # dump the raw bytes
                nasa.write(the_page)
            if the_page.decode('utf8').find(r'icc.qonc.com') != -1:
                print('114')  # the ISP hijack page again
            elif the_page.decode('utf8').find('2200178118', 0, 10240) != -1:
                print(url)  # token found in the first 10 KB: a candidate answer
            else:
                print('')
        except urllib.error.HTTPError as e:
            print(e.code)  # only HTTPError carries .code
        except urllib.error.URLError as e:
            print(e.reason)
        except UnicodeDecodeError as UDE:
            print(UDE)

"""
#gUrl = 'http://www.google.co.jp/search?hl=ja&source=hp&q=site%3Anasa.gov&lr=&aq=f&oq='
#google = urllib.urlopen(gUrl)
#str = google.read()
for str in open('sitenasa_gov.htm'):  # scan a saved results page instead
    for aItem in res.findall(str):
        print(aItem[0])

#print str
str = 'www.xxx.nasa.gov/wwf.nasa.gov'
"""

#2200178118

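Usage, assuming the same list file (the script name here is just a placeholder):

python3 hax01.py nasa.txt

Writing the dump in binary mode ('wb') sidesteps encoding until the explicit utf8 decode, which is why the UnicodeDecodeError handler exists.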
Then, after roughly a few months, I came back with a different approach and solved it. It was unexpectedly simple...
In fact, if you submit data to a server, the server will usually echo that data back onto the page. The final hint in the problem is not telling us to search that domain; it is showing us how to make a google.com page display whatever data we want. For example, the first 10 KB of
http://www.google.co.jp/search?q=2200178118
should contain the string. From there, all we need is to find a page on nasa.gov that accepts submitted data.
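So the last step reduces to pointing the same acceptance test at a nasa.gov page that reflects its query back. A sketch, with a deliberately made-up endpoint (finding a real one is the point of the challenge):

import urllib.request

# 'searchpage.nasa.gov/search?q=...' is a placeholder, NOT the actual answer:
# any nasa.gov page that echoes submitted data into its output would do.
candidate = 'http://searchpage.nasa.gov/search?q=2200178118'
with urllib.request.urlopen(candidate, timeout=10) as resp:
    print('2200178118' in resp.read(10240).decode('utf-8', errors='replace'))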
over