<?php
//include('retidy.php');
/**
 * Guess the language of an HTML document.
 *
 * $source is treated as markup when it contains "<body" or any character that
 * cannot appear in a file name (: * ? " < > |); otherwise it is treated as a
 * path. Five heuristics are tried in order and the first conclusive one wins:
 *   1. language markers in the path (paths only; the file is not read for this),
 *   2. lang attributes on the <html> tag,
 *   3. how often "é" occurs,
 *   4. presence of a set of accented French characters,
 *   5. a dictionary lookup on words sampled from the body (uses ReTidy and the
 *      word lists under abbr/, and prints its counts).
 *
 * @param string $source Markup, or the path of a file containing markup.
 * @return string "english", "french", "english_and_french" or "unknown".
 */
function getLanguage($source) {
	if(getLanguageSourceIsAFilename($source)) {
		// method 1 only needs the path, so it runs before any file access
		$language = getLanguageFromPath($source);
		if($language !== "unknown") {
			return $language;
		}
		$contents = getLanguageReadFile($source);
	} else {
		$contents = $source;
	}
	// method 2 to find the language of a file (look for lang attributes on the <html> tag)
	$language = getLanguageFromHtmlTag($contents);
	if($language !== "unknown") {
		return $language;
	}
	// method 3 to find the language of a file (look for french é density)
	$language = getLanguageFromAccentDensity($contents);
	if($language !== "unknown") {
		return $language;
	}
	// method 4 to find the language of a file (look for french characters)
	$language = getLanguageFromFrenchCharacters($contents);
	if($language !== "unknown") {
		return $language;
	}
	// method 5 uses a dictionary search
	return getLanguageFromDictionaries($contents);
}

/**
 * Decide whether $source should be read as a path rather than as markup.
 *
 * '/' and '\\' are deliberately allowed because $source may be a path, not just
 * a bare file name.
 *
 * @param string $source
 * @return bool
 */
function getLanguageSourceIsAFilename($source) {
	if(strpos($source, "<body") !== false) {
		return false;
	}
	return strpbrk($source, ':*?"<>|') === false;
}

/**
 * Read the file at $path, returning an empty string when it cannot be read.
 *
 * @param string $path
 * @return string
 */
function getLanguageReadFile($path) {
	if(!is_file($path) || !is_readable($path)) {
		return '';
	}
	$contents = file_get_contents($path);
	return $contents === false ? '' : $contents;
}

/**
 * Tell whether $haystack contains at least one of $needles (a hit at offset 0 counts).
 *
 * @param string $haystack
 * @param array $needles
 * @return bool
 */
function getLanguageContainsAny($haystack, $needles) {
	foreach($needles as $needle) {
		if(strpos($haystack, $needle) !== false) {
			return true;
		}
	}
	return false;
}

/**
 * Method 1: look for language markers anywhere in the (lower-cased) path.
 *
 * Directory markers have priority over file-name markers, and English is tested
 * before French at each level. The search is a substring search and is not
 * limited to the end of the name, so ".htm" also covers ".html" and ".asp" also
 * covers ".aspx".
 *
 * @param string $source
 * @return string
 */
function getLanguageFromPath($source) {
	$path = strtolower($source);
	if(getLanguageContainsAny($path, array("/eng/", "/english/"))) {
		return "english";
	}
	if(getLanguageContainsAny($path, array("/fra/", "/french/"))) {
		return "french";
	}
	$extensions = array("htm", "xml", "php", "asp");
	$markers_by_language = array(
		"english" => array("_e.", "-e.", "-en.", "_en.", "-eng.", "_eng.", " eng."),
		"french" => array("_f.", "-f.", "-fr.", "_fr.", "-fra.", "_fra.", " fre."),
	);
	foreach($markers_by_language as $language => $markers) {
		$needles = array();
		foreach($markers as $marker) {
			foreach($extensions as $extension) {
				$needles[] = $marker . $extension;
			}
		}
		if(getLanguageContainsAny($path, $needles)) {
			return $language;
		}
	}
	$bilingual_needles = array("-bil.html", "-english_and_french.html", "_bil.html", "_english_and_french.html");
	if(getLanguageContainsAny($path, $bilingual_needles)) {
		return "english_and_french";
	}
	return "unknown";
}

/**
 * Method 2: read the double-quoted lang attributes of the first <html> tag.
 *
 * Values are compared case-insensitively on their primary subtag, so "fr",
 * "FR" and "fr-CA" all mean French. The first attribute that names French or
 * English decides.
 *
 * @param string $contents
 * @return string
 */
function getLanguageFromHtmlTag($contents) {
	if(preg_match('/<html[^<>]*?>/is', $contents, $html_tag_matches) !== 1) {
		return "unknown"; // no <html> tag, nothing to inspect
	}
	preg_match_all('/lang="([^"]*?)"/is', $html_tag_matches[0], $lang_matches);
	foreach($lang_matches[1] as $lang_value) {
		$subtags = preg_split('/[-_]/', strtolower(trim($lang_value)));
		if($subtags[0] === "fr") {
			return "french";
		}
		if($subtags[0] === "en") {
			return "english";
		}
	}
	return "unknown";
}

/**
 * Method 3: call the document French when "é" is frequent enough.
 *
 * Documents longer than 5000 bytes are scanned window by window (more than 50
 * hits in a window is conclusive); documents longer than 1000 bytes are judged
 * on the 700 bytes starting at offset 300 (more than 10 hits is conclusive).
 * Shorter documents are not judged.
 *
 * @param string $contents
 * @return string "french" or "unknown"
 */
function getLanguageFromAccentDensity($contents) {
	$length = strlen($contents);
	if($length > 5000) {
		$window_count = (int) ($length / 1000);
		for($window_index = 0; $window_index < $window_count; $window_index++) {
			// NOTE(review): substr()'s third argument is a length, not an end offset, so
			// window N starts at N * 1000 and spans up to (N + 1) * 1000 bytes, i.e. later
			// windows are longer. Kept as found because the 50-hit threshold may have been
			// tuned against it; confirm the intended window size before changing it.
			$window = substr($contents, $window_index * 1000, ($window_index + 1) * 1000);
			if(preg_match_all('/é/i', $window, $unused) > 50) {
				return "french";
			}
		}
	} elseif($length > 1000) {
		$window = substr($contents, 300, 700);
		if(preg_match_all('/é/i', $window, $unused) > 10) {
			return "french";
		}
	}
	return "unknown";
}

/**
 * Method 4: call the document French when it contains every one of é É à À è ô.
 *
 * NOTE(review): only the literal characters are tested; entity spellings such as
 * &eacute; are not - confirm whether they should be.
 *
 * @param string $contents
 * @return string "french" or "unknown"
 */
function getLanguageFromFrenchCharacters($contents) {
	foreach(array("é", "É", "à", "À", "è", "ô") as $character) {
		if(strpos($contents, $character) === false) {
			return "unknown";
		}
	}
	return "french";
}

/**
 * Load a one-word-per-line list as a set (word => index) for constant-time lookups.
 *
 * Any of \r\n, \r or \n is accepted as the line separator. A missing or
 * unreadable file yields an empty set.
 *
 * @param string $words_file
 * @return array
 */
function getLanguageLoadWordSet($words_file) {
	if(!is_file($words_file) || !is_readable($words_file)) {
		return array();
	}
	$text = file_get_contents($words_file);
	if($text === false) {
		return array();
	}
	return array_flip(preg_split('/\r\n|\r|\n/', $text));
}

/**
 * Count how many of $words (compared in lower case) are keys of $word_set.
 *
 * @param array $words
 * @param array $word_set
 * @return int
 */
function getLanguageCountKnownWords($words, $word_set) {
	$known = 0;
	foreach($words as $word) {
		if(isset($word_set[strtolower($word)])) {
			$known++;
		}
	}
	return $known;
}

/**
 * Method 5: sample words from the tag-stripped body and look them up in the
 * English and French word lists.
 *
 * More than 90% known words in one list names that language; between 45% and
 * 55% (exclusive) in both lists means "english_and_french". Percentages are
 * truncated to whole numbers before comparing. The counts are printed.
 *
 * @param string $contents
 * @return string
 */
function getLanguageFromDictionaries($contents) {
	$body_code = ReTidy::getBodyCode($contents);
	if($body_code === false) {
		return "unknown";
	}
	$body_code = ReTidy::tagless($body_code); // (2009-08-24)
	// choose a number of words to check...
	$number_of_words = 200;
	$minimum_number_of_characters = 100;
	$body_length = strlen($body_code);
	// below $number_of_words bytes the step size would be 0 and the sampling loop would never end
	if($body_length < max($minimum_number_of_characters, $number_of_words)) {
		return "unknown";
	}
	$step_size = (int) ($body_length / $number_of_words);
	$sampled_words = array();
	for($position = 0; $position < $body_length; $position += $step_size) {
		if(preg_match('/ ([a-z]{1,}) /i', $body_code, $matches, 0, $position) !== 1) {
			break; // no word after this offset, so none after any later offset either
		}
		$sampled_words[] = $matches[1];
	}
	$sampled_words = array_unique($sampled_words);
	$sample_size = sizeof($sampled_words);
	if($sample_size === 0) {
		return "unknown";
	}

	$count_of_english_matches = getLanguageCountKnownWords($sampled_words, getLanguageLoadWordSet("abbr/eng/words.txt"));
	print("Number of words found: " . $sample_size . "<br>\r\n");
	print("Number of english words found: " . $count_of_english_matches . "<br>\r\n");
	$english_percent = (int) floor($count_of_english_matches * 100 / $sample_size);
	if($english_percent > 90) {
		return "english";
	}

	$count_of_french_matches = getLanguageCountKnownWords($sampled_words, getLanguageLoadWordSet("abbr/fra/mots.txt"));
	print("Number of french words found: " . $count_of_french_matches . "<br>\r\n");
	$french_percent = (int) floor($count_of_french_matches * 100 / $sample_size);
	if($french_percent > 90) {
		return "french";
	}

	if(
	$english_percent > 45 && $english_percent < 55 &&
	$french_percent > 45 && $french_percent < 55
	) {
		return "english_and_french";
	}
	return "unknown";
}
?>