很少用到perl,這次用了一把,特意記錄一下關於文件讀寫和hash統計頻數並排序的總結:
1.文件讀寫
Perl下讀寫文件非常簡單,首先是讀:
#打開文件
open(FILE_NAME, $_)||die "can't open part-m file";
一行一行讀出來並處理:
while (<FILE_NAME>)
{
chomp;
print $_,"\n";
###
# 處理(在這裏對每一行進行處理)
###
}
然後是寫:
#創建寫的文件:不存在就創建,存在就清空然後再寫;如果將">"改成">>"就是追加的寫
open(OUTFILE1_LIST, ">$resultpath.\\SpecificResult.txt")||die "can't open file SpecificResult.txt!";
#一行行寫文件
print OUTFILE1_LIST ("$lineTxt\n");
#寫完關閉
close(OUTFILE1_LIST);
2.哈希統計頻數並排序
#初始化
my %hash=();
#統計每個詞的個數
$hash{$lineTxt}++;
#按value值排序
my @keys = sort { $hash{$b} <=> $hash{$a} } keys %hash; #@keys裏存的是按哈希值(value)的數值從大到小排序後的鍵
#按key值排序
my @keys = sort { $b <=> $a} keys %hash; #@keys裏存的是按鍵(key)本身的數值從大到小排序後的鍵
foreach(@keys)
{
print OUTFILE2_LIST ("$_"."\t"."$hash{$_}"."\n");
}
下面是我寫的用來解析現網數據並排序的源代碼:
#!/usr/bin/perl -W
##################
# File:
# Author:
# License:
use strict;
use warnings;
use encoding 'gbk'; # 系統默認編碼爲GBK
use open IN=>':encoding(utf8)'; # 讀入文件時按UTF-8解碼(此處並非UTF-16,也不會根據BOM判斷LE/BE)
use Encode;
use File::Path;
use Tie::File;
#讀取外部傳入的待解析現網數據的存放目錄路徑
# Directory that holds the raw production data to parse. It must be supplied
# as the one and only command-line argument.
my $dirpath="";
unless (@ARGV == 1) {
    # Wrong argument count: show the usage line and stop.
    print "< .pl > <待解析現網數據的存放目錄路徑>\n";
    exit(0);
}
$dirpath = $ARGV[0];
print "dir path: ${dirpath}\n";
# Debugging aid: hard-code the directory instead of reading it from @ARGV.
#$dirpath="E:\\video_network_data";

# File-level state filled in by parse_env():
my @filearray=();     # path of every part-m file found
my $filecount = 0;    # number of part-m files found
###### 遍歷文件夾 #####
# parse_env($path)
#
# Recursively walks the directory $path and records every non-directory
# entry whose file name contains "part-m".
#
# Side effects:
#   - appends the path of each match to the file-level @filearray
#   - increments the file-level $filecount once per match
#
# Returns the current value of $filecount, which is cumulative across calls
# because the counter lives at file level.
#
# A path that is not a directory contributes nothing. A directory that
# cannot be opened is reported with warn() and skipped.
sub parse_env {
    my $path = $_[0];

    return $filecount unless -d $path;

    my $handle;
    unless (opendir($handle, $path)) {
        # Report and skip; closedir must not run on a handle that was
        # never opened.
        warn "can't open directory $path: $!\n";
        return $filecount;
    }

    # defined() keeps the loop going for an entry literally named "0".
    while (defined(my $subpath = readdir($handle))) {
        next if $subpath eq '.' or $subpath eq '..';

        my $p = $path."/$subpath";
        if (-d $p) {
            parse_env($p);
        }
        # Test the entry name, not the whole path: otherwise every file
        # below a directory named e.g. "part-m-archive" would be collected.
        elsif ($subpath =~ m/part-m/) {
            push(@filearray, $p);
            $filecount++;
        }
    }
    closedir($handle);

    return $filecount;
}
# ---------------------------------------------------------------------------
# Main driver.
#
# Collects the part-m files below $dirpath, takes the 4th tab-separated
# column of every line that has at least four columns, and writes:
#   <dirpath>/parse-result/SpecificResult.txt   - every extracted value, one per line
#   <dirpath>/parse-result/FrequencyResult.txt  - "value<TAB>count", most frequent first
# ---------------------------------------------------------------------------
use File::Spec;    # portable path joining; only this block needs it

my %hash=();       # extracted value => number of occurrences
my $filenum=parse_env($dirpath);
if($filenum > 0)   # at least one part-m file exists
{
    print "There are $filenum part-m files!!","\n";

    # Build the paths with File::Spec. The old string form embedded a
    # literal "." and backslashes in the name, which only resolved by
    # accident on Windows.
    my $resultpath    = File::Spec->catdir($dirpath, "parse-result");
    my $specificfile  = File::Spec->catfile($resultpath, "SpecificResult.txt");
    my $frequencyfile = File::Spec->catfile($resultpath, "FrequencyResult.txt");

    # Create the result directory if it is not there yet.
    unless (-d $resultpath) {
        mkdir($resultpath) or die "can't create directory $resultpath: $!";
    }

    open(my $specific_out, '>', $specificfile)
        or die "can't open file SpecificResult.txt! ($specificfile: $!)";
    open(my $frequency_out, '>', $frequencyfile)
        or die "can't open file FrequencyResult.txt! ($frequencyfile: $!)";

    foreach my $partfile (@filearray)
    {
        # Three-argument open: characters in the file name can no longer be
        # read as an open mode. No layer is named, so a file-wide
        # "use open IN => ..." default still applies.
        open(my $in, '<', $partfile)
            or die "can't open part-m file ($partfile: $!)";
        while (my $line = <$in>)
        {
            # Drop the line terminator so it cannot end up inside the value
            # when the 4th column is the last one on the line.
            $line =~ s/\r?\n\z//;

            # Limit -1 keeps trailing empty columns, so an empty 4th column
            # is still counted, as it was before the terminator was stripped.
            my @strlist = split(/\t/, $line, -1);
            next if @strlist < 4;

            my $lineTxt = $strlist[3];
            print {$specific_out} "$lineTxt\n";
            $hash{$lineTxt}++;
        }
        close($in);
    }

    # Highest count first; equal counts are ordered by value so the output
    # is the same on every run.
    my @keys = sort { $hash{$b} <=> $hash{$a} or $a cmp $b } keys %hash;
    foreach my $key (@keys)
    {
        print {$frequency_out} $key, "\t", $hash{$key}, "\n";
    }

    # Buffered write errors only show up at close, so check it.
    close($specific_out)  or die "can't close $specificfile: $!";
    close($frequency_out) or die "can't close $frequencyfile: $!";
    print "-------FINISH!!!--------\n";
}