今日頭條2017春招研發崗筆試題——Paragraph

(注:題解方法來自於“今日頭條校園”微信公衆號)


題意:給定一個英文段落(包含n個句子)和m次查詢,每次查詢給定一個句子,求段落中與該查詢句子相同單詞數量最多的句子。各個英文句子不包含標點,大小寫不敏感。

題解:一種簡單的做法是對原文中的每個英文句子,都預處理包含的單詞集合。對於每次查詢,枚舉句子中的單詞到各個set查找是否存在,隨後統計出現的次數取max即可。

Java代碼實現:

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;

public class Solution {

	/**
	 * Method 1: keep one set of distinct words per paragraph sentence and, for
	 * every query, count how many distinct query words each set contains.
	 *
	 * <p>Matching is case-insensitive and words are separated by whitespace.
	 * When several sentences share the best count, the one with the lowest
	 * index wins. When no sentence shares any word with a query, index 0 is
	 * reported for that query.
	 *
	 * @param paragraph the sentences of the paragraph
	 * @param query     the query sentences
	 * @return for each query, in order, the index into {@code paragraph} of the
	 *         sentence sharing the most distinct words with it
	 */
	public List<Integer> getResult(String[] paragraph, String[] query){
		// Pre-compute the distinct (lower-cased) words of every sentence once.
		List<HashSet<String>> paraSetList = new ArrayList<HashSet<String>>();
		for (String sentence : paragraph) {
			paraSetList.add(toWordSet(sentence));
		}

		List<Integer> result = new ArrayList<Integer>();
		for (String currentQuery : query) {
			// Query words are de-duplicated so a repeated word counts once.
			HashSet<String> queryWords = toWordSet(currentQuery);

			// Both trackers are reset per query; otherwise a query without
			// any match would inherit the previous query's answer.
			int maxIndex = 0;
			int maxCount = 0;
			for (int j = 0; j < paraSetList.size(); j++) {
				HashSet<String> sentenceWords = paraSetList.get(j);
				int count = 0;
				for (String word : queryWords) {
					if (sentenceWords.contains(word)) {
						count++;
					}
				}
				// Strict comparison keeps the earliest sentence on ties.
				if (count > maxCount) {
					maxCount = count;
					maxIndex = j;
				}
			}
			result.add(maxIndex);
		}
		return result;
	}

	// Splits a sentence on whitespace and returns its distinct lower-cased words.
	private static HashSet<String> toWordSet(String sentence) {
		HashSet<String> words = new HashSet<String>();
		// Locale.ROOT keeps the case folding independent of the default locale.
		String[] tokens = sentence.toLowerCase(java.util.Locale.ROOT).split("\\s+");
		for (String token : tokens) {
			// Leading whitespace yields an empty first token; it is not a word.
			if (!token.isEmpty()) {
				words.add(token);
			}
		}
		return words;
	}


	public static void main(String[] args) {
		String[] paragraph = {
				"A bad beginning makes a bad ending",
				"A fool may ask more questions in an hour than a wise man can answer in seven years",
				"A friend exaggerates a man virtue an enemy his crimes",
				"A good head and an industrious hand are worth gold in any land",
				"Always taking out of the meal and never putting in soon comes to the bottom"
		};
		String[] query = {
				"man of gold makes worth land seldom falls ending madness industrious",
				"An enemy idle youth exaggerates his friend a needy age",
				"bottom A poor man who taking a comes rich wife has never a ruler not a wife"
		};
		Solution solution = new Solution();
		List<Integer> result = solution.getResult(paragraph, query);
		// Print the best-matching sentence for each query, one per line.
		for (int index : result) {
			System.out.println(paragraph[index]);
		}
	}
}


題解:(方法二)一種更快的做法是對原文中出現的所有單詞,通過一個hash map維護它們分別出現在哪些原文句子中。在每次查詢中,枚舉句子中的單詞,給它在原文中出現過的句子進行計數,最後在所有計數中取max即可。

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

public class Solution {

	/**
	 * Method 2: build an inverted index mapping every paragraph word to the
	 * set of sentence indexes it occurs in, then, for each query, tally one
	 * hit per (distinct query word, sentence containing it) pair.
	 *
	 * <p>Matching is case-insensitive and words are separated by whitespace.
	 * When several sentences share the best count, the one with the lowest
	 * index wins. When no sentence shares any word with a query, index 0 is
	 * reported for that query.
	 *
	 * @param paragraph the sentences of the paragraph
	 * @param query     the query sentences
	 * @return for each query, in order, the index into {@code paragraph} of the
	 *         sentence sharing the most distinct words with it
	 */
	public List<Integer> getResult(String[] paragraph, String[] query) {
		// word -> indexes of the paragraph sentences that contain it
		Map<String, HashSet<Integer>> wordToSentences = new HashMap<String, HashSet<Integer>>();
		for (int i = 0; i < paragraph.length; i++) {
			for (String word : toWordSet(paragraph[i])) {
				HashSet<Integer> sentences = wordToSentences.get(word);
				if (sentences == null) {
					sentences = new HashSet<Integer>();
					wordToSentences.put(word, sentences);
				}
				sentences.add(i);
			}
		}

		List<Integer> result = new ArrayList<Integer>();
		for (String currentQuery : query) {
			// sentence index -> number of distinct query words it contains
			Map<Integer, Integer> hitCounts = new HashMap<Integer, Integer>();
			for (String word : toWordSet(currentQuery)) {
				HashSet<Integer> sentences = wordToSentences.get(word);
				if (sentences == null) {
					continue; // word does not occur anywhere in the paragraph
				}
				for (Integer sentenceIndex : sentences) {
					Integer previous = hitCounts.get(sentenceIndex);
					hitCounts.put(sentenceIndex, previous == null ? 1 : previous + 1);
				}
			}

			int max = 0;
			int index = 0;
			for (Map.Entry<Integer, Integer> entry : hitCounts.entrySet()) {
				int count = entry.getValue();
				int candidate = entry.getKey();
				// HashMap iteration order is unspecified, so ties are broken
				// explicitly in favour of the lowest sentence index.
				if (count > max || (count == max && candidate < index)) {
					max = count;
					index = candidate;
				}
			}
			result.add(index);
		}
		return result;
	}

	// Splits a sentence on whitespace and returns its distinct lower-cased words.
	private static HashSet<String> toWordSet(String sentence) {
		HashSet<String> words = new HashSet<String>();
		// Locale.ROOT keeps the case folding independent of the default locale.
		String[] tokens = sentence.toLowerCase(java.util.Locale.ROOT).split("\\s+");
		for (String token : tokens) {
			// Leading whitespace yields an empty first token; it is not a word.
			if (!token.isEmpty()) {
				words.add(token);
			}
		}
		return words;
	}

	public static void main(String[] args) {
		String[] paragraph = { "A bad beginning makes a bad ending",
				"A fool may ask more questions in an hour than a wise man can answer in seven years",
				"A friend exaggerates a man virtue an enemy his crimes",
				"A good head and an industrious hand are worth gold in any land",
				"Always taking out of the meal and never putting in soon comes to the bottom" };
		String[] query = { "man of gold makes worth land seldom falls ending madness industrious",
				"An enemy idle youth exaggerates his friend a needy age",
				"bottom A poor man who taking a comes rich wife has never a ruler not a wife" };
		Solution solution = new Solution();
		List<Integer> result = solution.getResult(paragraph, query);
		// Print the best-matching sentence for each query, one per line.
		for (int index : result) {
			System.out.println(paragraph[index]);
		}
	}
}

(方法二的代碼不知道寫得對不對……就我的代碼實現而言,感覺方法二的時間複雜度並不比方法一低;當然上述代碼還可以優化,比如在統計query中每個詞在哪些文章中出現時,可以使用數組下標作爲文章編號,值對應詞在文章中出現的次數)


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章