# 示例程序 (Example program)

from hdfs import InsecureClient
from collections import defaultdict

# Address passed to InsecureClient as its `url` argument (see createcli).
# NOTE(review): hard-coded host and port — presumably the cluster's NameNode
# HTTP (WebHDFS) address; confirm before running against another cluster.
hdurl = "http://172.16.115.212:50070"
# User name passed to InsecureClient as its `user` argument (see createcli).
hduser = "root"

def createcli() -> InsecureClient:
    """Build an HDFS client from the module-level ``hdurl`` and ``hduser``.

    Returns:
        A new ``InsecureClient`` constructed with ``url=hdurl`` and
        ``user=hduser``.
    """
    connection = {"url": hdurl, "user": hduser}
    return InsecureClient(**connection)

def deleteFile(cli: InsecureClient):
    """Delete ``/result/mr_alldata.csv`` through the given client.

    Args:
        cli: Client whose ``delete`` method is invoked.

    The value returned by ``cli.delete`` is discarded; this function
    returns ``None``.
    """
    result_path = "/result/mr_alldata.csv"
    cli.delete(result_path)
def mapper(line):
    """Map one CSV line to a ``(domain, 1)`` pair.

    Only the last comma-separated field of the line is examined. The domain
    is the text between the URL scheme (``https://`` or ``http://``) and the
    next ``/``.

    Args:
        line: One raw line of the CSV input. Leading and trailing whitespace
            (including the newline) is ignored.

    Returns:
        ``(domain, 1)`` when the last field contains an http(s) URL, or
        ``False`` when the line is the header row (last field equals
        ``"url"``) or the last field contains no ``http://`` / ``https://``.
    """
    field = line.strip().split(",")[-1]
    if field == "url":
        # Header row: nothing to count.
        return False

    # "https://" is tested first so that fields containing both schemes are
    # resolved the same way they always were.
    if "https://" in field:
        scheme = "https://"
    elif "http://" in field:
        scheme = "http://"
    else:
        # Blank or malformed rows used to fall through to
        # split("http://")[1] and raise IndexError, aborting the whole run.
        # Returning False lets the caller skip them like the header row.
        return False

    domain = field.split(scheme)[1].split("/")[0]
    return (domain, 1)

def reducer(cli: InsecureClient):
    """Count occurrences per domain in ``/logdata/alldata.csv`` and print the top ten.

    Every line of the file is passed to ``mapper``; lines for which it
    returns a falsy value are skipped, and the counts of the remaining
    ``(domain, count)`` pairs are summed per domain.

    Args:
        cli: Client used to open ``/logdata/alldata.csv`` for reading as
            UTF-8 text.

    Returns:
        ``None``. The ten highest-count ``(domain, total)`` tuples are
        printed to standard output, preceded by a ``testing results:`` line.
    """
    totals = defaultdict(int)
    with cli.read("/logdata/alldata.csv", encoding="utf-8") as reader:
        for raw_line in reader.readlines():
            pair = mapper(raw_line)
            if pair:
                domain, count = pair
                totals[domain] += count

    # sorted() is stable, so domains with equal totals keep the order in
    # which they were first seen in the input.
    ranked = sorted(totals.items(), key=lambda item: item[1], reverse=True)

    # NOTE(review): the previous version also concatenated every entry into
    # a "(domain,count)\n" string that was never written, returned or
    # printed. That dead code has been removed. If the ranking is meant to
    # be persisted (e.g. to the /result file the script deletes on start-up),
    # that write still needs to be added — confirm the intended destination.
    print("testing results:")
    for entry in ranked[:10]:
        print(entry)

# Script entry point: runs only when the file is executed directly.
if __name__ == "__main__":
    # One client is created and shared by both steps below.
    cli = createcli()
    # Delete /result/mr_alldata.csv before computing the counts.
    deleteFile(cli)
    # Aggregate per-domain counts and print the ten largest.
    reducer(cli)