Language.cs
1 //
2 // Copyright (c) Microsoft. All rights reserved.
3 // Licensed under the MIT license.
4 //
5 // Microsoft Bot Framework: http://botframework.com
6 //
7 // Bot Builder SDK GitHub:
8 // https://github.com/Microsoft/BotBuilder
9 //
10 // Copyright (c) Microsoft Corporation
11 // All rights reserved.
12 //
13 // MIT License:
14 // Permission is hereby granted, free of charge, to any person obtaining
15 // a copy of this software and associated documentation files (the
16 // "Software"), to deal in the Software without restriction, including
17 // without limitation the rights to use, copy, modify, merge, publish,
18 // distribute, sublicense, and/or sell copies of the Software, and to
19 // permit persons to whom the Software is furnished to do so, subject to
20 // the following conditions:
21 //
22 // The above copyright notice and this permission notice shall be
23 // included in all copies or substantial portions of the Software.
24 //
25 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
29 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
30 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
31 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
32 //
33 
35 using System;
36 using System.Collections.Generic;
37 using System.Linq;
38 using System.Text;
39 using System.Text.RegularExpressions;
40 
41 namespace Microsoft.Bot.Builder.FormFlow.Advanced
42 {
46  public class Language
47  {
/// <summary>
/// Set of stop words, built from the entries of the localized
/// <c>Resources.LanguageStopWords</c> string via <c>SplitList()</c>.
/// Consulted by <see cref="NoiseWord"/> and <see cref="NoiseResponse"/>.
/// </summary>
51  public static HashSet<string> StopWords = new HashSet<string>(Resources.LanguageStopWords.SplitList());
52 
/// <summary>
/// Set of articles, built from the entries of the localized
/// <c>Resources.LanguageArticles</c> string via <c>SplitList()</c>.
/// Consulted by <see cref="ArticleOrNone"/>.
/// </summary>
56  public static HashSet<string> Articles = new HashSet<string>(Resources.LanguageArticles.SplitList());
57 
/// <summary>
/// Test to see if a word is made up entirely of control, punctuation or white space characters.
/// </summary>
/// <param name="word">Word to check.</param>
/// <returns>
/// False as soon as any character is something other than control, punctuation or
/// white space; true otherwise (including for the empty string).
/// </returns>
public static bool NonWord(string word)
{
    foreach (var current in word)
    {
        var ignorable = char.IsControl(current)
            || char.IsPunctuation(current)
            || char.IsWhiteSpace(current);
        if (!ignorable)
        {
            // One meaningful character is enough to make this a real word.
            return false;
        }
    }
    return true;
}
76 
/// <summary>
/// Test to see if a word is all noise.
/// </summary>
/// <param name="word">Word to check.</param>
/// <returns>
/// True if the word parses as a number, is a non-word according to <see cref="NonWord"/>,
/// or its lower-cased form is found in <see cref="StopWords"/>.
/// </returns>
public static bool NoiseWord(string word)
{
    double parsed;
    // Checks run in this order and stop at the first one that succeeds.
    return double.TryParse(word, out parsed)
        || NonWord(word)
        || StopWords.Contains(word.ToLower());
}
90 
/// <summary>
/// Test to see if a word can be ignored in a response.
/// </summary>
/// <param name="word">Word to check.</param>
/// <returns>
/// True if the word is a non-word according to <see cref="NonWord"/> or its lower-cased
/// form is found in <see cref="StopWords"/>. Unlike <see cref="NoiseWord"/>, numbers are
/// not treated as noise here.
/// </returns>
public static bool NoiseResponse(string word)
{
    if (NonWord(word))
    {
        return true;
    }
    return StopWords.Contains(word.ToLower());
}
102 
/// <summary>
/// Test a word for articles or noise.
/// </summary>
/// <param name="word">Word to check.</param>
/// <returns>
/// True if the word is a non-word according to <see cref="NonWord"/> or its lower-cased
/// form is found in <see cref="Articles"/>.
/// </returns>
/// <remarks>
/// The word is lower-cased before the lookup, the same way <see cref="NoiseWord"/> and
/// <see cref="NoiseResponse"/> treat <see cref="StopWords"/>, so that capitalized
/// articles such as "The" are recognized as well.
/// </remarks>
public static bool ArticleOrNone(string word)
{
    return NonWord(word) || Articles.Contains(word.ToLower());
}
112 
/// <summary>
/// Filter a sequence of words down to those that are not noise in a response.
/// </summary>
/// <param name="words">Words to filter.</param>
/// <returns>
/// The words for which <see cref="NoiseResponse"/> is false, in their original order.
/// The result is evaluated lazily.
/// </returns>
public static IEnumerable<string> NonNoiseWords(IEnumerable<string> words)
{
    return words.Where(candidate => !NoiseResponse(candidate));
}
122 
/// <summary>
/// Regular expression used to break input into words: each match is one run of word characters.
/// </summary>
public static Regex WordBreaker = new Regex(@"\w+", RegexOptions.Compiled);

/// <summary>
/// Break input into words.
/// </summary>
/// <param name="input">String to be broken.</param>
/// <returns>
/// Each successive match of <see cref="WordBreaker"/> in <paramref name="input"/>,
/// produced lazily as the result is enumerated.
/// </returns>
public static IEnumerable<string> WordBreak(string input)
{
    // Walk the matches one at a time instead of enumerating a match collection.
    var current = WordBreaker.Match(input);
    while (current.Success)
    {
        yield return current.Value;
        current = current.NextMatch();
    }
}
140 
/// <summary>
/// Break a string into words based on _ and case changes.
/// </summary>
/// <param name="original">Original string, for example an identifier such as "fooBar".</param>
/// <returns>
/// The trimmed string with a single space inserted at lower-to-upper transitions, at
/// letter/non-letter boundaries, and before an upper-case letter that is followed by a
/// lower-case letter. Every '_' or ' ' after the first character is emitted as a space.
/// Returns the empty string when <paramref name="original"/> is null, empty or only white space.
/// </returns>
public static string CamelCase(string original)
{
    // Guard: indexing name[0] below would throw on an empty (or all white space) name.
    if (string.IsNullOrWhiteSpace(original))
    {
        return string.Empty;
    }

    var name = original.Trim();
    var builder = new StringBuilder(name.Length + 8);
    var previousUpper = char.IsUpper(name[0]);
    var previousLetter = char.IsLetter(name[0]);
    for (int i = 0; i < name.Length; ++i)
    {
        var ch = name[i];
        if (i > 0 && (ch == '_' || ch == ' '))
        {
            // A non-leading _ is treated as a space.
            builder.Append(' ');
            continue;
        }

        var isUpper = char.IsUpper(ch);
        var isLetter = char.IsLetter(ch);

        // Break on lower to upper, letter/non-letter boundaries and upper followed by lower.
        var boundary = (!previousUpper && isUpper)
            || (isLetter != previousLetter)
            || (i > 0 && isUpper && (i + 1) < name.Length && char.IsLower(name[i + 1]));

        // Do not double up when an explicit separator already produced the space,
        // e.g. "foo_Bar" should become "foo Bar" rather than "foo  Bar".
        if (boundary && builder.Length > 0 && builder[builder.Length - 1] != ' ')
        {
            builder.Append(' ');
        }

        previousUpper = isUpper;
        previousLetter = isLetter;
        builder.Append(ch);
    }
    return builder.ToString();
}
183 
/// <summary>
/// Make sure all words end with an optional s.
/// </summary>
/// <param name="words">Words to pluralize.</param>
/// <returns>
/// Each word lower-cased. When the current UI culture's two-letter language is "en",
/// words that are not noise (see <see cref="NoiseWord"/>) and are longer than one
/// character also get a regular-expression optional plural: "?" is appended to words
/// already ending in "s", and "s?" to all others. The result is evaluated lazily.
/// </returns>
public static IEnumerable<string> OptionalPlurals(IEnumerable<string> words)
{
    var isEnglish = System.Threading.Thread.CurrentThread.CurrentUICulture.TwoLetterISOLanguageName == "en";
    foreach (var original in words)
    {
        var lowered = original.ToLower();
        if (!isEnglish || NoiseWord(lowered) || lowered.Length <= 1)
        {
            // Not eligible for a plural: pass the lower-cased word through unchanged.
            yield return lowered;
        }
        else if (lowered.EndsWith("s"))
        {
            yield return lowered + "?";
        }
        else
        {
            yield return lowered + "s?";
        }
    }
}
203 
/// <summary>
/// Generate regular expressions to match word sequences in original string.
/// </summary>
/// <param name="phrase">Phrase to generate terms from; it is split on spaces and lower-cased.</param>
/// <param name="maxLength">Maximum number of words in a generated sequence.</param>
/// <returns>
/// One term for every run of 1 to <paramref name="maxLength"/> consecutive words whose
/// first and last words are not rejected by <see cref="ArticleOrNone"/>, with the words
/// passed through <see cref="OptionalPlurals"/> and joined by a single space. Shorter
/// runs come first. If the phrase has more than <paramref name="maxLength"/> words, the
/// whole lower-cased phrase is appended as a final term.
/// </returns>
public static string[] GenerateTerms(string phrase, int maxLength)
{
    var tokens = phrase.Split(' ')
        .Where(token => token.Length > 0)
        .Select(token => token.ToLower())
        .ToArray();
    var results = new List<string>();
    var longest = Math.Min(tokens.Length, maxLength);
    for (var size = 1; size <= longest; ++size)
    {
        var lastStart = tokens.Length - size;
        for (var offset = 0; offset <= lastStart; ++offset)
        {
            // Skip sequences that begin or end with an article or non-word.
            if (ArticleOrNone(tokens[offset]) || ArticleOrNone(tokens[offset + size - 1]))
            {
                continue;
            }
            var window = new ArraySegment<string>(tokens, offset, size);
            results.Add(string.Join(" ", OptionalPlurals(window)));
        }
    }
    if (tokens.Length > maxLength)
    {
        // The phrase is longer than any generated sequence, so add it in full.
        results.Add(string.Join(" ", tokens));
    }
    return results.ToArray();
}
236 
/// <summary>
/// Matches the article "a" or "an" (group 1, any case) followed by white space and,
/// optionally, a vowel that starts the next word (group 2).
/// </summary>
private static readonly Regex _aOrAn = new Regex(@"\b(a|an)(?:\s+)([aeiou])?", RegexOptions.Compiled | RegexOptions.IgnoreCase);

/// <summary>
/// Switch 'a' before consonants and 'an' before vowels.
/// </summary>
/// <param name="input">String to fix.</param>
/// <returns>
/// When the current UI culture's two-letter language is "en", the input with every
/// matched article rewritten to "an" if the next word starts with a, e, i, o or u and
/// to "a" otherwise. The case of the article's first letter is preserved, so "A apple"
/// becomes "An apple". For any other UI language the input is returned unchanged.
/// </returns>
public static string ANormalization(string input)
{
    if (System.Threading.Thread.CurrentThread.CurrentUICulture.TwoLetterISOLanguageName != "en")
    {
        return input;
    }

    return _aOrAn.Replace(input, match =>
    {
        var article = match.Groups[1];
        var replacement = match.Groups[2].Success ? "an" : "a";

        // The pattern ignores case, so keep a leading capital instead of forcing lower case.
        if (char.IsUpper(article.Value[0]))
        {
            replacement = char.ToUpperInvariant(replacement[0]) + replacement.Substring(1);
        }

        // Only the article is replaced; the white space and any vowel after it are kept as matched.
        return replacement + match.Value.Substring(article.Length);
    });
}
275 
/// <summary>
/// Given a list of string values generate a proper English list.
/// </summary>
/// <param name="values">Values to put in the list.</param>
/// <param name="separator">Separator placed between values, except before the last one.</param>
/// <param name="lastSeparator">Separator placed before the last value.</param>
/// <returns>
/// The values concatenated in order, for example "a, b and c" for separator ", " and
/// last separator " and ". An empty sequence produces the empty string, and a single
/// value is returned on its own.
/// </returns>
public static string BuildList(IEnumerable<string> values, string separator, string lastSeparator)
{
    // Materialize once: counting and then iterating would enumerate a lazy sequence twice.
    var items = values as IList<string> ?? values.ToList();
    var builder = new StringBuilder();
    var end = items.Count - 1;
    for (var pos = 0; pos <= end; ++pos)
    {
        if (pos > 0)
        {
            builder.Append(pos == end ? lastSeparator : separator);
        }
        builder.Append(items[pos]);
    }
    return builder.ToString();
}
299 
/// <summary>
/// Normalize a string.
/// </summary>
/// <param name="value">String to normalize.</param>
/// <param name="normalization">How to normalize the case of words.</param>
/// <returns>
/// For <c>InitialUpper</c>, the words found by <see cref="WordBreak"/>, each with its
/// first character upper-cased and the rest lower-cased, joined by single spaces.
/// For <c>Lower</c> and <c>Upper</c>, the whole value lower- or upper-cased.
/// For any other setting the value is returned unchanged.
/// </returns>
public static string Normalize(string value, CaseNormalization normalization)
{
    if (normalization == CaseNormalization.InitialUpper)
    {
        var capitalized = Language.WordBreak(value)
            .Select(word => char.ToUpper(word[0]) + word.Substring(1).ToLower());
        return string.Join(" ", capitalized);
    }
    if (normalization == CaseNormalization.Lower)
    {
        return value.ToLower();
    }
    if (normalization == CaseNormalization.Upper)
    {
        return value.ToUpper();
    }
    return value;
}
317  }
318 }
static bool NoiseResponse(string word)
Test to see if a word can be ignored in a response.
Definition: Language.cs:96
static IEnumerable< string > WordBreak(string input)
Break input into words.
Definition: Language.cs:133
static bool NoiseWord(string word)
Test to see if a word is all noise.
Definition: Language.cs:82
A strongly-typed resource class, for looking up localized strings, etc.
static bool ArticleOrNone(string word)
Test a word for articles or noise.
Definition: Language.cs:108
static string[] GenerateTerms(string phrase, int maxLength)
Generate regular expressions to match word sequences in original string.
Definition: Language.cs:215
static IEnumerable< string > NonNoiseWords(IEnumerable< string > words)
Test words to see if they are all ignorable in a response.
Definition: Language.cs:118
static bool NonWord(string word)
Test to see if word is all punctuation or white space.
Definition: Language.cs:63
static string CamelCase(string original)
Break a string into words based on _ and case changes.
Definition: Language.cs:146
Namespace for resources.
CaseNormalization
How to normalize the case of words.
Definition: Attributes.cs:205
static string BuildList(IEnumerable< string > values, string separator, string lastSeparator)
Given a list of string values generate a proper English list.
Definition: Language.cs:283
static string ANormalization(string input)
Switch 'a' before consonants and 'an' before vowels.
Definition: Language.cs:247
static string LanguageStopWords
Looks up a localized string similar to a;about;above;above;across;after;afterwards;again;against;all;...
static IEnumerable< string > OptionalPlurals(IEnumerable< string > words)
Make sure all words end with an optional s.
Definition: Language.cs:189
static string Normalize(string value, CaseNormalization normalization)
Normalize a string.
Definition: Language.cs:304
static string LanguageArticles
Looks up a localized string similar to a;an;the.
Root namespace for the Microsoft Bot Builder SDK.